### Added comments to exercise1 and exercise6. Automatically choose C and sigma in final part of ex6.

master wchen342 6 years ago
parent
commit 7cf1738c4b
4 changed files with 107 additions and 39 deletions
1. ex1/ex1.py — 1 line changed
2. ex1/ex1_multi.py — 2 lines changed
3. ex6/ex6.py — 84 lines changed
4. ex6/ex6_spam.py — 59 lines changed

#### 1 ex1/ex1.py View File

 `@ -1,3 +1,4 @@` `## Machine Learning Online Class - Exercise 1: Linear Regression` `import numpy as np` `import matplotlib.pyplot as plt` ``` ```

#### 2 ex1/ex1_multi.py View File

 `@ -1,3 +1,5 @@` `## Machine Learning Online Class` `# Exercise 1: Linear regression with multiple variables` `import numpy as np` `import matplotlib.pyplot as plt` ``` ```

#### 84 ex6/ex6.py View File

 `@ -1,9 +1,17 @@` `## Machine Learning Online Class` `# Exercise 6 | Support Vector Machines` `import numpy as np` `import scipy.io` `import matplotlib.pyplot as plt` `import sys` `from sklearn.svm import SVC` ``` ``` ``` ``` `# PLOTDATA Plots the data points X and y into a new figure` `# PLOTDATA(x,y) plots the data points with + for the positive examples` `# and o for the negative examples. X is assumed to be a Mx2 matrix.` `#` `# Note: This was slightly modified such that it expects y = 1 or y = 0` `def plotdata(X, y):` ` pos = y == 1` ` neg = y == 0` `@ -11,6 +19,10 @@ def plotdata(X, y):` ` plt.plot(X[:, :1][neg], X[:, 1:2][neg], 'ko', markerfacecolor='y', markersize=7)` ``` ``` ``` ``` `# VISUALIZEBOUNDARYLINEAR plots a linear decision boundary learned by the` `# SVM` `# VISUALIZEBOUNDARYLINEAR(X, y, model) plots a linear decision boundary` `# learned by the SVM and overlays the data on it` `def visualize_boundary_linear(X, y, clf):` ` w = clf.coef_` ` b = clf.intercept_` `@ -20,33 +32,70 @@ def visualize_boundary_linear(X, y, clf):` ` plt.plot(xp, yp, '-b')` ``` ``` ``` ``` `# VISUALIZEBOUNDARY plots a non-linear decision boundary learned by the SVM` `# VISUALIZEBOUNDARYLINEAR(X, y, model) plots a non-linear decision` `# boundary learned by the SVM and overlays the data on it` `def visualize_boundary(X, y, clf):` ` plotdata(X, y)` ` x1plot = np.linspace(X[:, :1].min(), X[:, :1].max(), 100).T` ` x2plot = np.linspace(X[:, 1:2].min(), X[:, 1:2].max(), 100).T` ` X1, X2 = np.meshgrid(x1plot, x2plot)` ` vals = np.zeros(X1.shape)` ` for i in range(1, X1.shape+1):` ` this_X = np.hstack((X1[:, i-1:i], X2[:, i-1:i]))` ` vals[:, i-1] = clf.predict(this_X)` ``` ``` ` for i in range(1, X1.shape + 1):` ` this_X = np.hstack((X1[:, i - 1:i], X2[:, i - 1:i]))` ` vals[:, i - 1] = clf.predict(this_X)` ` plt.contour(X1, X2, vals, , colors='b')` ` # plt.contour(X1, X2, vals, colors='b')` ``` ``` ``` ``` `# EX6PARAMS returns your choice of C and sigma for Part 3 of the exercise` 
`# where you select the optimal (C, sigma) learning parameters to use for SVM` `# with RBF kernel` `# C, sigma = EX6PARAMS(X, y, Xval, yval) returns your choice of C and` `# sigma. You should complete this function to return the optimal C and` `# sigma based on a cross-validation set.` `def dataset3_params(X, y, Xval, yval):` ` # You need to return the following variables correctly.` ` C = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]` ` sigma = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]` ``` ``` ` minError = sys.maxsize` ` finalC = 0` ` finalSigma = 0` ``` ``` ` clf = SVC(kernel='rbf')` ``` ``` ` for i in C:` ` for j in sigma:` ` clf = clf.set_params(C=i, gamma=1 / (2 * j * j))` ` clf.fit(X, y.ravel())` ` predictions = clf.predict(Xval)` ` error = np.mean(predictions.reshape(-1, 1) != yval)` ` if error <= minError:` ` minError = error` ` finalC = i` ` finalSigma = j` ` return finalC, finalSigma` ``` ``` ``` ``` `## =============== Part 1: Loading and Visualizing Data ================` `print('Loading and Visualizing Data ...\n')` ``` ``` `# Load from ex6data1:` `# You will have X, y in your environment` `data = scipy.io.loadmat('mat/ex6data1.mat', matlab_compatible=True)` ``` ``` `X = data['X']` `y = data['y']` ``` ``` `# Plot training data` `plotdata(X, y)` `plt.show()` ``` ``` `print('Program paused. Press enter to continue.\n')` `input()` ``` ``` `## ==================== Part 2: Training Linear SVM ====================` `print('\nTraining Linear SVM ...\n')` ``` ``` `C = 1` `clf = SVC(C=C, kernel='linear')` ``` ``` `@ -59,22 +108,30 @@ input()` ``` ``` `print('Loading and Visualizing Data ...\n')` ``` ``` `## =============== Part 4: Visualizing Dataset 2 ================` `# Load from ex6data2:` `# You will have X, y in your environment` `data = scipy.io.loadmat('mat/ex6data2.mat', matlab_compatible=True)` ``` ``` `X = data['X']` `y = data['y']` ``` ``` `# Plot training data` `plotdata(X, y)` `plt.show()` ``` ``` `print('Program paused. 
Press enter to continue.\n')` `input()` ``` ``` `## ========== Part 5: Training SVM with RBF Kernel (Dataset 2) ==========` `print('\nTraining SVM with RBF Kernel (this may take 1 to 2 minutes) ...\n')` ``` ``` `# SVM Parameters` `C = 1` `sigma = 0.1` `gamma = 1 / (2 * sigma * sigma)` ``` ``` `# Train SVM Model` `clf = SVC(C=C, gamma=gamma, kernel='rbf')` `clf.fit(X, y.ravel())` ``` ``` `@ -84,16 +141,31 @@ plt.show()` `print('Program paused. Press enter to continue.\n')` `input()` ``` ``` `## =============== Part 6: Visualizing Dataset 3 ================` `print('Loading and Visualizing Data ...\n')` ``` ``` `# Load from ex6data3:` `# You will have X, y in your environment` `data = scipy.io.loadmat('mat/ex6data3.mat', matlab_compatible=True)` ``` ``` `X = data['X']` `y = data['y']` `Xval = data['Xval']` `yval = data['yval']` ``` ``` `# Plot training data` `plotdata(X, y)` `plt.show()` ``` ``` `print('Program paused. Press enter to continue.\n')` `input()` ``` ``` `## ========== Part 7: Training SVM with RBF Kernel (Dataset 3) ==========` `# Try different SVM Parameters here` `C, sigma = dataset3_params(X, y, Xval, yval)` ``` ``` `# Train the SVM` `clf = clf.set_params(C=C, gamma = 1 / (2 * sigma * sigma))` `clf.fit(X, y.ravel())` ``` ``` `visualize_boundary(X, y, clf)`

#### 59 ex6/ex6_spam.py View File

 `@ -1,3 +1,5 @@` `## Machine Learning Online Class` `# Exercise 6 | Spam Classification with SVMs` `import re` `import numpy as np` `import scipy` `@ -6,6 +8,9 @@ from nltk.stem.porter import PorterStemmer as stemmer` `from sklearn.svm import SVC` ``` ``` ``` ``` `# READFILE reads a file and returns its entire contents` `# file_contents = READFILE(filename) reads a file and returns its entire` `# contents in file_contents` `def readfile(filename):` ` fid = open(filename, mode='r')` ` if fid is not None:` `@ -17,6 +22,10 @@ def readfile(filename):` ` return file_content` ``` ``` ``` ``` `# GETVOCABLIST reads the fixed vocabulary list in vocab.txt and returns a` `# cell array of the words` `# vocabList = GETVOCABLIST() reads the fixed vocabulary list in vocab.txt` `# and returns a cell array of the words in vocabList.` `def getvocablist():` ` fid = open('text/vocab.txt')` ` # n = 1899` `@ -27,6 +36,11 @@ def getvocablist():` ` return vocablist` ``` ``` ``` ``` `# PROCESSEMAIL preprocesses a the body of an email and` `# returns a list of word_indices` `# word_indices = PROCESSEMAIL(email_contents) preprocesses` `# the body of an email and returns a list of indices of the` `# words contained in the email.` `def process_email(email_content):` ` vocablist = getvocablist()` ` word_indices = []` `@ -63,7 +77,6 @@ def process_email(email_content):` ` l += len(word) + 1` ` processed_content.append(word)` ``` ``` ``` ``` ` # Look up the word in the dictionary and add to word_indices` ` try:` ` ind = vocablist.index(word)` `@ -78,6 +91,10 @@ def process_email(email_content):` ` return word_indices` ``` ``` ``` ``` `# EMAILFEATURES takes in a word_indices vector and produces a feature vector` `# from the word indices` `# x = EMAILFEATURES(word_indices) takes in a word_indices vector and` `# produces a feature vector from the word indices.` `def email_features(word_indices):` ` n = 1899` ` x = np.zeros(n)` `@ -86,15 +103,14 @@ def email_features(word_indices):` ``` ``` ``` ``` 
`## ==================== Part 2: Feature Extraction ====================` `# Now, you will convert each email into a vector of features in R^n.` `# You should complete the code in emailFeatures.m to produce a feature` `# vector for a given email.` `print('\nExtracting features from sample email (emailSample1.txt)\n')` ``` ``` `# Extract Features` `file_contents = readfile('text/emailSample1.txt')` `word_indices = process_email(file_contents)` `features = email_features(word_indices)` ``` ``` `# Print Stats` `print('Length of feature vector: {0:d}\n'.format(len(features)))` `print('Number of non-zero entries: {0:d}\n'.format(sum(features > 0)))` ``` ``` `@ -102,8 +118,6 @@ print('Program paused. Press enter to continue.\n')` `input()` ``` ``` `## =========== Part 3: Train Linear SVM for Spam Classification ========` `# In this section, you will train a linear classifier to determine if an` `# email is Spam or Not-Spam.` ``` ``` `# Load the Spam Email dataset` `# You will have X, y in your environment` `@ -123,12 +137,9 @@ p = clf.predict(X)` `print('Training Accuracy: {0:f}\n'.format(np.mean((p == y.ravel()).astype(float)) * 100))` ``` ``` `## =================== Part 4: Test Spam Classification ================` `# After training the classifier, we can evaluate it on a test set. We have` `# included a test set in spamTest.mat` ``` ``` `# Load the test dataset` `# You will have Xtest, ytest in your environment` ``` ``` `data = scipy.io.loadmat('mat/spamTest.mat', matlab_compatible=True)` `Xtest = data['Xtest']` `ytest = data['ytest']` `@ -141,44 +152,26 @@ print('Test Accuracy: {0:f}\n'.format(np.mean((p == ytest.ravel()).astype(float)` `input()` ``` ``` `## ================= Part 5: Top Predictors of Spam ====================` `# Since the model we are training is a linear SVM, we can inspect the` `# weights learned by the model to understand better how it is determining` `# whether an email is spam or not. 
The following code finds the words with` `# the highest weights in the classifier. Informally, the classifier` `# 'thinks' that these words are the most likely indicators of spam.` `#` ``` ``` `# Sort the weights and obtain the vocabulary list` ``` ``` `idx = clf.coef_.argsort()[:,::-1]` `idx = clf.coef_.argsort()[:, ::-1]` `vocabList = np.array(getvocablist())` ``` ``` `print('\nTop predictors of spam: \n')` `for i in range(0,15):` ` print(' {0:<15s} ({1:f}) \n'.format(vocabList[idx[0,i]], clf.coef_[0,idx[0,i]]))` `for i in range(0, 15):` ` print(' {0:<15s} ({1:f}) \n'.format(vocabList[idx[0, i]], clf.coef_[0, idx[0, i]]))` ``` ``` `print('\n\n')` `print('\nProgram paused. Press enter to continue.\n')` `input()` ``` ``` `## =================== Part 6: Try Your Own Emails =====================` `# Now that you've trained the spam classifier, you can use it on your own` `# emails! In the starter code, we have included spamSample1.txt,` `# spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.` `# The following code reads in one of these emails and then uses your` `# learned SVM classifier to determine whether the email is Spam or` `# Not Spam` ``` ``` `# Set the file to be read in (change this to spamSample2.txt,` `# emailSample1.txt or emailSample2.txt to see different predictions on` `# different emails types). Try your own emails as well!` ``` ``` `filename = 'spamSample1'` `filename = 'spamSample3'` `file_contents = readfile('text/' + filename + '.txt')` ``` ``` `# Read and predict` `word_indices = process_email(file_contents)` `x = email_features(word_indices).reshape(1, -1) # 1d arrays are deprecated` `word_indices = process_email(file_contents)` `x = email_features(word_indices).reshape(1, -1) # 1d arrays are deprecated` `p = clf.predict(x)` ``` ``` `print('\nProcessed {0:s}\n\nSpam Classification: {1:d}\n'.format(filename, int(p)))`