Browse Source

Added comments to exercise1 and exercise6. Automatically choose C and sigma in final part of ex6.

master
wchen342 7 years ago
parent
commit
7cf1738c4b
  1. 1
      ex1/ex1.py
  2. 2
      ex1/ex1_multi.py
  3. 84
      ex6/ex6.py
  4. 59
      ex6/ex6_spam.py

1
ex1/ex1.py

@ -1,3 +1,4 @@
## Machine Learning Online Class - Exercise 1: Linear Regression
import numpy as np
import matplotlib.pyplot as plt

2
ex1/ex1_multi.py

@ -1,3 +1,5 @@
## Machine Learning Online Class
# Exercise 1: Linear regression with multiple variables
import numpy as np
import matplotlib.pyplot as plt

84
ex6/ex6.py

@ -1,9 +1,17 @@
## Machine Learning Online Class
# Exercise 6 | Support Vector Machines
import numpy as np
import scipy.io
import matplotlib.pyplot as plt
import sys
from sklearn.svm import SVC
# PLOTDATA Plots the data points X and y into a new figure
# PLOTDATA(x,y) plots the data points with + for the positive examples
# and o for the negative examples. X is assumed to be a Mx2 matrix.
#
# Note: This was slightly modified such that it expects y = 1 or y = 0
def plotdata(X, y):
pos = y == 1
neg = y == 0
@ -11,6 +19,10 @@ def plotdata(X, y):
plt.plot(X[:, :1][neg], X[:, 1:2][neg], 'ko', markerfacecolor='y', markersize=7)
# VISUALIZEBOUNDARYLINEAR plots a linear decision boundary learned by the
# SVM
# VISUALIZEBOUNDARYLINEAR(X, y, model) plots a linear decision boundary
# learned by the SVM and overlays the data on it
def visualize_boundary_linear(X, y, clf):
w = clf.coef_
b = clf.intercept_
@ -20,33 +32,70 @@ def visualize_boundary_linear(X, y, clf):
plt.plot(xp, yp, '-b')
# VISUALIZEBOUNDARY plots a non-linear decision boundary learned by the SVM
# VISUALIZEBOUNDARY(X, y, model) plots a non-linear decision
# boundary learned by the SVM and overlays the data on it
def visualize_boundary(X, y, clf):
    """Overlay the training data (via plotdata) with clf's decision boundary.

    X   -- (m, 2) feature matrix
    y   -- labels in {0, 1} (as expected by plotdata)
    clf -- a fitted classifier exposing .predict (e.g. sklearn SVC)
    """
    plotdata(X, y)
    # Build a 100x100 evaluation grid spanning the range of each feature.
    x1plot = np.linspace(X[:, :1].min(), X[:, :1].max(), 100).T
    x2plot = np.linspace(X[:, 1:2].min(), X[:, 1:2].max(), 100).T
    X1, X2 = np.meshgrid(x1plot, x2plot)
    vals = np.zeros(X1.shape)
    # Predict the class of every grid point, one grid column at a time
    # (0-based loop replaces the original 1-based range).
    for i in range(X1.shape[1]):
        this_X = np.hstack((X1[:, i:i + 1], X2[:, i:i + 1]))
        vals[:, i] = clf.predict(this_X)
    # The 0-level contour of the predictions is the decision boundary.
    plt.contour(X1, X2, vals, [0], colors='b')
# DATASET3PARAMS returns your choice of C and sigma for Part 3 of the exercise
# where you select the optimal (C, sigma) learning parameters to use for SVM
# with RBF kernel
# C, sigma = DATASET3PARAMS(X, y, Xval, yval) returns your choice of C and
# sigma. You should complete this function to return the optimal C and
# sigma based on a cross-validation set.
def dataset3_params(X, y, Xval, yval):
    """Grid-search (C, sigma) for an RBF-kernel SVC.

    Trains on (X, y) for every candidate pair and keeps the pair with the
    lowest misclassification rate on the cross-validation set (Xval, yval).
    Ties are broken in favour of the last pair tried (the <= comparison),
    matching the original behaviour.

    Returns (C, sigma).
    """
    c_candidates = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
    sigma_candidates = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
    best_error = float('inf')  # float sentinel: error below is a float mean
    best_c = 0
    best_sigma = 0
    clf = SVC(kernel='rbf')
    for c in c_candidates:
        for s in sigma_candidates:
            # sklearn parameterizes the RBF kernel by gamma = 1 / (2 * sigma^2).
            clf = clf.set_params(C=c, gamma=1 / (2 * s * s))
            clf.fit(X, y.ravel())
            predictions = clf.predict(Xval)
            # Fraction of cross-validation examples classified incorrectly.
            error = np.mean(predictions.reshape(-1, 1) != yval)
            if error <= best_error:
                best_error = error
                best_c = c
                best_sigma = s
    return best_c, best_sigma
## =============== Part 1: Loading and Visualizing Data ================
print('Loading and Visualizing Data ...\n')
# Load from ex6data1:
# You will have X, y in your environment
# NOTE(review): looks like a MATLAB .mat file from the Coursera exercise —
# confirm 'mat/ex6data1.mat' exists relative to the working directory.
data = scipy.io.loadmat('mat/ex6data1.mat', matlab_compatible=True)
X = data['X']
y = data['y']
# Plot training data
plotdata(X, y)
plt.show()
print('Program paused. Press enter to continue.\n')
input()
## ==================== Part 2: Training Linear SVM ====================
print('\nTraining Linear SVM ...\n')
# Regularization parameter for the linear SVM
C = 1
clf = SVC(C=C, kernel='linear')
@ -59,22 +108,30 @@ input()
print('Loading and Visualizing Data ...\n')
## =============== Part 4: Visualizing Dataset 2 ================
# Load from ex6data2:
# You will have X, y in your environment
data = scipy.io.loadmat('mat/ex6data2.mat', matlab_compatible=True)
X = data['X']
y = data['y']
# Plot training data
plotdata(X, y)
plt.show()
print('Program paused. Press enter to continue.\n')
input()
## ========== Part 5: Training SVM with RBF Kernel (Dataset 2) ==========
print('\nTraining SVM with RBF Kernel (this may take 1 to 2 minutes) ...\n')
# SVM Parameters
C = 1
sigma = 0.1
# Convert the exercise's Gaussian-kernel sigma to sklearn's gamma:
# gamma = 1 / (2 * sigma^2)
gamma = 1 / (2 * sigma * sigma)
# Train SVM Model
clf = SVC(C=C, gamma=gamma, kernel='rbf')
clf.fit(X, y.ravel())
@ -84,16 +141,31 @@ plt.show()
print('Program paused. Press enter to continue.\n')
input()
## =============== Part 6: Visualizing Dataset 3 ================
print('Loading and Visualizing Data ...\n')
# Load from ex6data3:
# You will have X, y in your environment
data = scipy.io.loadmat('mat/ex6data3.mat', matlab_compatible=True)
X = data['X']
y = data['y']
# Cross-validation set consumed by dataset3_params below
Xval = data['Xval']
yval = data['yval']
# Plot training data
plotdata(X, y)
plt.show()
print('Program paused. Press enter to continue.\n')
input()
## ========== Part 7: Training SVM with RBF Kernel (Dataset 3) ==========
# Try different SVM Parameters here
# Automatically pick (C, sigma) by grid search on the cross-validation set
C, sigma = dataset3_params(X, y, Xval, yval)
# Train the SVM
# sklearn's RBF kernel is parameterized by gamma = 1 / (2 * sigma^2)
clf = clf.set_params(C=C, gamma = 1 / (2 * sigma * sigma))
clf.fit(X, y.ravel())
visualize_boundary(X, y, clf)

59
ex6/ex6_spam.py

@ -1,3 +1,5 @@
## Machine Learning Online Class
# Exercise 6 | Spam Classification with SVMs
import re
import numpy as np
import scipy
@ -6,6 +8,9 @@ from nltk.stem.porter import PorterStemmer as stemmer
from sklearn.svm import SVC
# READFILE reads a file and returns its entire contents
# file_contents = READFILE(filename) reads a file and returns its entire
# contents in file_contents
def readfile(filename):
fid = open(filename, mode='r')
if fid is not None:
@ -17,6 +22,10 @@ def readfile(filename):
return file_content
# GETVOCABLIST reads the fixed vocabulary list in vocab.txt and returns a
# cell array of the words
# vocabList = GETVOCABLIST() reads the fixed vocabulary list in vocab.txt
# and returns a cell array of the words in vocabList.
def getvocablist():
fid = open('text/vocab.txt')
# n = 1899
@ -27,6 +36,11 @@ def getvocablist():
return vocablist
# PROCESSEMAIL preprocesses a the body of an email and
# returns a list of word_indices
# word_indices = PROCESSEMAIL(email_contents) preprocesses
# the body of an email and returns a list of indices of the
# words contained in the email.
def process_email(email_content):
vocablist = getvocablist()
word_indices = []
@ -63,7 +77,6 @@ def process_email(email_content):
l += len(word) + 1
processed_content.append(word)
# Look up the word in the dictionary and add to word_indices
try:
ind = vocablist.index(word)
@ -78,6 +91,10 @@ def process_email(email_content):
return word_indices
# EMAILFEATURES takes in a word_indices vector and produces a feature vector
# from the word indices
# x = EMAILFEATURES(word_indices) takes in a word_indices vector and
# produces a feature vector from the word indices.
def email_features(word_indices):
n = 1899
x = np.zeros(n)
@ -86,15 +103,14 @@ def email_features(word_indices):
## ==================== Part 2: Feature Extraction ====================
# Now, you will convert each email into a vector of features in R^n.
# You should complete the code in emailFeatures.m to produce a feature
# vector for a given email.
print('\nExtracting features from sample email (emailSample1.txt)\n')
# Extract Features
file_contents = readfile('text/emailSample1.txt')
# Map raw email text to vocabulary indices, then to a feature vector.
word_indices = process_email(file_contents)
features = email_features(word_indices)
# Print Stats
# NOTE(review): features appears to be a numeric vector over the 1899-word
# vocabulary (email_features starts from np.zeros(1899)) — confirm.
print('Length of feature vector: {0:d}\n'.format(len(features)))
print('Number of non-zero entries: {0:d}\n'.format(sum(features > 0)))
@ -102,8 +118,6 @@ print('Program paused. Press enter to continue.\n')
# Pause so the user can inspect the output before training begins.
input()
## =========== Part 3: Train Linear SVM for Spam Classification ========
# In this section, you will train a linear classifier to determine if an
# email is Spam or Not-Spam.
# Load the Spam Email dataset
# You will have X, y in your environment
@ -123,12 +137,9 @@ p = clf.predict(X)
# Accuracy = percentage of training examples classified correctly.
print('Training Accuracy: {0:f}\n'.format(np.mean((p == y.ravel()).astype(float)) * 100))
## =================== Part 4: Test Spam Classification ================
# After training the classifier, we can evaluate it on a test set. We have
# included a test set in spamTest.mat
# Load the test dataset
# You will have Xtest, ytest in your environment
data = scipy.io.loadmat('mat/spamTest.mat', matlab_compatible=True)
Xtest = data['Xtest']
ytest = data['ytest']
@ -141,44 +152,26 @@ print('Test Accuracy: {0:f}\n'.format(np.mean((p == ytest.ravel()).astype(float)
input()
## ================= Part 5: Top Predictors of Spam ====================
# Since the model we are training is a linear SVM, we can inspect the
# weights learned by the model to understand better how it is determining
# whether an email is spam or not. The following code finds the words with
# the highest weights in the classifier. Informally, the classifier
# 'thinks' that these words are the most likely indicators of spam.
#
# Sort the weights and obtain the vocabulary list
# argsort is ascending; reversing each row puts the largest weights first.
idx = clf.coef_.argsort()[:, ::-1]
vocabList = np.array(getvocablist())
print('\nTop predictors of spam: \n')
# Show the 15 words with the largest positive weights.
for i in range(0, 15):
    print(' {0:<15s} ({1:f}) \n'.format(vocabList[idx[0, i]], clf.coef_[0, idx[0, i]]))
print('\n\n')
print('\nProgram paused. Press enter to continue.\n')
input()
## =================== Part 6: Try Your Own Emails =====================
# Now that you've trained the spam classifier, you can use it on your own
# emails! In the starter code, we have included spamSample1.txt,
# spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
# The following code reads in one of these emails and then uses your
# learned SVM classifier to determine whether the email is Spam or
# Not Spam
# Set the file to be read in (change this to spamSample2.txt,
# emailSample1.txt or emailSample2.txt to see different predictions on
# different emails types). Try your own emails as well!
filename = 'spamSample3'
file_contents = readfile('text/' + filename + '.txt')
# Read and predict
word_indices = process_email(file_contents)
x = email_features(word_indices).reshape(1, -1)  # 1d arrays are deprecated
p = clf.predict(x)
print('\nProcessed {0:s}\n\nSpam Classification: {1:d}\n'.format(filename, int(p[0])))

Loading…
Cancel
Save