|
|
@ -1,7 +1,9 @@ |
|
|
|
import re

import numpy as np
import scipy
import scipy.io
from nltk.stem.porter import PorterStemmer as stemmer
from sklearn.svm import SVC
|
|
|
|
|
|
|
|
|
|
|
def readfile(filename): |
|
|
@ -95,3 +97,85 @@ print('Number of non-zero entries: {0:d}\n'.format(sum(features > 0))) |
|
|
|
# Hold the output of the preceding section on screen; execution resumes
# once the user presses Enter.
print('Program paused. Press enter to continue.\n')
input()
|
|
|
|
|
|
|
## =========== Part 3: Train Linear SVM for Spam Classification ========
# Fit a linear-kernel SVM that labels an email as Spam or Not-Spam, then
# report how accurately it reproduces the labels it was trained on.

# The training set provides the feature matrix X and the label array y.
data = scipy.io.loadmat('mat/spamTrain.mat', matlab_compatible=True)
X = data['X']
y = data['y']

# The labels are flattened to 1-D once and reused for fitting and scoring.
train_labels = y.ravel()

print('\nTraining Linear SVM (Spam Classification)\n')
print('(this may take 1 to 2 minutes) ...\n')

C = 0.1
clf = SVC(C=C, kernel='linear')
clf.fit(X, train_labels)

# Predict on the training data itself to measure training accuracy.
p = clf.predict(X)
train_accuracy = np.mean((p == train_labels).astype(float)) * 100

print('Training Accuracy: {0:f}\n'.format(train_accuracy))
|
|
|
|
|
|
|
## =================== Part 4: Test Spam Classification ================
# Evaluate the classifier trained in Part 3 on the separate test set
# stored in mat/spamTest.mat.

# The test set provides the feature matrix Xtest and the label array ytest.
# loadmat lives in the scipy.io subpackage, which is imported explicitly at
# the top of the file.
data = scipy.io.loadmat('mat/spamTest.mat', matlab_compatible=True)
Xtest = data['Xtest']
ytest = data['ytest']

print('\nEvaluating the trained Linear SVM on a test set ...\n')

p = clf.predict(Xtest)

# Labels are flattened to 1-D so they compare element-wise with p.
test_accuracy = np.mean((p == ytest.ravel()).astype(float)) * 100
print('Test Accuracy: {0:f}\n'.format(test_accuracy))

# Announce the pause before blocking on stdin, as the other pauses in this
# script do; a bare input() looks like a hang.
print('Program paused. Press enter to continue.\n')
input()
|
|
|
|
|
|
|
## ================= Part 5: Top Predictors of Spam ====================
# The classifier is a linear SVM, so its learned weights can be inspected
# directly. The words with the largest weights are the ones the model
# treats, informally, as the strongest indicators of spam.

# Read the weights once, then order their column indices from the largest
# weight to the smallest.
weights = clf.coef_
idx = weights.argsort()[:, ::-1]
vocabList = np.array(getvocablist())

print('\nTop predictors of spam: \n')
for rank in range(15):
    word_idx = idx[0, rank]
    print(' {0:<15s} ({1:f}) \n'.format(vocabList[word_idx], weights[0, word_idx]))

print('\n\n')
print('\nProgram paused. Press enter to continue.\n')
input()
|
|
|
|
|
|
|
## =================== Part 6: Try Your Own Emails =====================
# Run the trained classifier on a single email read from disk. The starter
# code includes spamSample1.txt, spamSample2.txt, emailSample1.txt and
# emailSample2.txt as examples.

# Base name of the email to classify. Change it to spamSample2,
# emailSample1 or emailSample2 (or one of your own emails) to see other
# predictions.
filename = 'spamSample1'
email_path = 'text/' + filename + '.txt'
file_contents = readfile(email_path)

# Convert the raw text to vocabulary indices, then to a feature vector.
word_indices = process_email(file_contents)
feature_vector = email_features(word_indices)

# predict() expects a 2-D (n_samples, n_features) array, so the single
# feature vector is reshaped into one row.
x = feature_vector.reshape(1, -1)
p = clf.predict(x)
spam_flag = int(p[0])

print('\nProcessed {0:s}\n\nSpam Classification: {1:d}\n'.format(filename, spam_flag))
print('(1 indicates spam, 0 indicates not spam)\n\n')
|
|
|