Browse Source

finish basic exercises in ex6

add comments
wchen342 6 years ago
  1. 84


@ -1,7 +1,9 @@
import re

import numpy as np
import scipy
import scipy.io
from nltk.stem.porter import PorterStemmer as stemmer
from sklearn.svm import SVC
def readfile(filename):
@ -95,3 +97,85 @@ print('Number of non-zero entries: {0:d}\n'.format(sum(features > 0)))
print('Program paused. Press enter to continue.\n')

## =========== Part 3: Train Linear SVM for Spam Classification ========
#  Train a linear SVM classifier that decides whether an email is
#  Spam or Not-Spam.

# Load the spam training set. The .mat file is read into a dict from which
# the arrays 'X' (features) and 'y' (labels) are taken.
# matlab_compatible=True asks loadmat to return the matrices the way MATLAB
# would load them (see the scipy.io.loadmat documentation).
data = scipy.io.loadmat('mat/spamTrain.mat', matlab_compatible=True)
X = data['X']
y = data['y']

print('\nTraining Linear SVM (Spam Classification)\n')
print('(this may take 1 to 2 minutes) ...\n')

# Regularisation strength passed to the SVM.
C = 0.1
# y is flattened with ravel() because fit() takes a 1-D label vector;
# presumably the labels are stored as a column in the .mat file.
clf = SVC(C=C, kernel='linear').fit(X, y.ravel())

# Predict on the training data itself and report the percentage of
# predictions that match the labels.
p = clf.predict(X)
print('Training Accuracy: {0:f}\n'.format(np.mean((p == y.ravel()).astype(float)) * 100))
## =================== Part 4: Test Spam Classification ================
#  After training the classifier, evaluate it on a held-out test set
#  stored in spamTest.mat.

# Load the test set. The arrays 'Xtest' and 'ytest' are read from the dict
# returned by loadmat; matlab_compatible=True asks for MATLAB-style loading
# (see the scipy.io.loadmat documentation).
data = scipy.io.loadmat('mat/spamTest.mat', matlab_compatible=True)
Xtest = data['Xtest']
ytest = data['ytest']

print('\nEvaluating the trained Linear SVM on a test set ...\n')

# Predict with the classifier trained in Part 3 and report the percentage
# of predictions that match the (flattened) test labels.
p = clf.predict(Xtest)
print('Test Accuracy: {0:f}\n'.format(np.mean((p == ytest.ravel()).astype(float)) * 100))
## ================= Part 5: Top Predictors of Spam ====================
#  Because the model is a linear SVM, its learned weights can be inspected
#  to see how it decides whether an email is spam. The code below prints
#  the words with the highest weights; informally, the classifier 'thinks'
#  these words are the strongest indicators of spam.

# argsort orders each row ascending; reversing the columns yields the
# indices from largest weight to smallest.
idx = clf.coef_.argsort()[:, ::-1]
# Converted to an array so entries can be looked up by the NumPy integer
# indices in idx; the element type depends on what getvocablist() returns.
vocabList = np.array(getvocablist())

print('\nTop predictors of spam: \n')
# Only row 0 of coef_ is used; slicing the first 15 sorted indices replaces
# the manual range(0, 15) counter and repeated double indexing.
for word_idx in idx[0, :15]:
    print(' {0:<15s} ({1:f}) \n'.format(vocabList[word_idx], clf.coef_[0, word_idx]))

print('\nProgram paused. Press enter to continue.\n')
## =================== Part 6: Try Your Own Emails =====================
#  With the spam classifier trained, it can be applied to arbitrary emails.
#  The starter code ships spamSample1.txt, spamSample2.txt,
#  emailSample1.txt and emailSample2.txt as examples. The code below reads
#  one of them and asks the trained SVM whether it is Spam or Not Spam.

# Name (without extension) of the email to classify. Change it to
# spamSample2, emailSample1 or emailSample2 to see predictions for other
# email types, or point it at one of your own emails.
filename = 'spamSample1'
email_path = 'text/' + filename + '.txt'
file_contents = readfile(email_path)

# Turn the raw text into word indices, then into a feature vector.
word_indices = process_email(file_contents)
feature_vector = email_features(word_indices)
# predict() is given a single-row 2-D array; passing a 1-D array is
# deprecated, hence the reshape to (1, n_features).
x = feature_vector.reshape(1, -1)
p = clf.predict(x)

prediction = int(p[0])
print('\nProcessed {0:s}\n\nSpam Classification: {1:d}\n'.format(filename, prediction))
print('(1 indicates spam, 0 indicates not spam)\n\n')