Browse Source

Initial commit of ex6

master
wchen342 7 years ago
commit
225fed031d
  1. 9
      cs229_hw.iml
  2. 103
      ex6/ex6.py
  3. 97
      ex6/ex6_spam.py
  4. BIN
      ex6/mat/ex6data1.mat
  5. BIN
      ex6/mat/ex6data2.mat
  6. BIN
      ex6/mat/ex6data3.mat
  7. BIN
      ex6/mat/spamTest.mat
  8. BIN
      ex6/mat/spamTrain.mat
  9. 10
      ex6/text/emailSample1.txt
  10. 34
      ex6/text/emailSample2.txt
  11. 42
      ex6/text/spamSample1.txt
  12. 8
      ex6/text/spamSample2.txt
  13. 1899
      ex6/text/vocab.txt

9
cs229_hw.iml

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

103
ex6/ex6.py

@ -0,0 +1,103 @@
import numpy as np
import scipy.io
import matplotlib.pyplot as plt
from sklearn.svm import SVC
def plotdata(X, y):
    """Scatter-plot a two-feature data set with one marker style per class.

    Rows of ``X`` whose label in ``y`` equals 1 are drawn as black plus
    signs; rows whose label equals 0 are drawn as yellow-filled circles.
    The first column of ``X`` is used as the horizontal coordinate and the
    second column as the vertical one. Points are added to the current
    matplotlib axes; the figure is not shown and nothing is returned.
    """
    horizontal = X[:, :1]
    vertical = X[:, 1:2]
    # One (mask, format string, extra style) triple per class, positives first.
    class_styles = (
        (y == 1, 'k+', {'linewidth': 1}),
        (y == 0, 'ko', {'markerfacecolor': 'y'}),
    )
    for mask, fmt, extra in class_styles:
        plt.plot(horizontal[mask], vertical[mask], fmt, markersize=7, **extra)
def visualize_boundary_linear(X, y, clf):
    """Plot the data and the straight-line decision boundary of ``clf``.

    ``clf`` must expose ``coef_`` (indexed as ``[0, 0]`` and ``[0, 1]``) and
    ``intercept_``. The line is evaluated at 100 evenly spaced points
    spanning the range of the first column of ``X`` and drawn in blue on
    top of the scatter plot produced by ``plotdata``.
    """
    weights, bias = clf.coef_, clf.intercept_
    first_feature = X[:, :1]
    xs = np.linspace(first_feature.min(), first_feature.max(), 100)
    # Solve w0 * x + w1 * y + b = 0 for y to get the boundary line.
    ys = -(weights[0, 0] * xs + bias) / weights[0, 1]
    plotdata(X, y)
    plt.plot(xs, ys, '-b')
def visualize_boundary(X, y, clf):
    """Plot the data and the (possibly non-linear) decision boundary of ``clf``.

    A 100 x 100 grid spanning the range of the two columns of ``X`` is
    classified with ``clf.predict`` and the boundary between the predicted
    classes is drawn as a blue contour over the scatter plot produced by
    ``plotdata``.

    The contour is drawn at level 0.5, the midpoint between the 0 and 1
    labels that ``plotdata`` distinguishes. Level 0 would coincide with the
    minimum of the prediction grid, which is not a level strictly inside
    the data range.
    """
    plotdata(X, y)
    x1plot = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
    x2plot = np.linspace(X[:, 1].min(), X[:, 1].max(), 100)
    X1, X2 = np.meshgrid(x1plot, x2plot)
    # Classify every grid point in a single call instead of column by column.
    grid_points = np.column_stack((X1.ravel(), X2.ravel()))
    vals = np.asarray(clf.predict(grid_points), dtype=float).reshape(X1.shape)
    plt.contour(X1, X2, vals, [0.5], colors='b')
# ---- Example data set 1: load and plot ----
print('Loading and Visualizing Data ...\n')
data = scipy.io.loadmat('mat/ex6data1.mat', matlab_compatible=True)
X = data['X']
y = data['y']
plotdata(X, y)
plt.show()
print('Program paused. Press enter to continue.\n')
input()

# ---- Example data set 1: linear SVM and its decision boundary ----
C = 1
clf = SVC(C=C, kernel='linear')
clf.fit(X, y.ravel())  # ravel() hands fit() a flat label vector
visualize_boundary_linear(X, y, clf)
plt.show()
print('Program paused. Press enter to continue.\n')
input()

# ---- Example data set 2: load and plot ----
print('Loading and Visualizing Data ...\n')
data = scipy.io.loadmat('mat/ex6data2.mat', matlab_compatible=True)
X = data['X']
y = data['y']
plotdata(X, y)
plt.show()
print('Program paused. Press enter to continue.\n')
input()

# ---- Example data set 2: SVM with RBF kernel ----
print('\nTraining SVM with RBF Kernel (this may take 1 to 2 minutes) ...\n')
C = 1
sigma = 0.1
# Express the kernel width sigma as the gamma value passed to SVC.
gamma = 1 / (2 * sigma * sigma)
clf = SVC(C=C, gamma=gamma, kernel='rbf')
clf.fit(X, y.ravel())
visualize_boundary(X, y, clf)
plt.show()
print('Program paused. Press enter to continue.\n')
input()

# ---- Example data set 3: load and plot ----
print('Loading and Visualizing Data ...\n')
data = scipy.io.loadmat('mat/ex6data3.mat', matlab_compatible=True)
X = data['X']
y = data['y']
plotdata(X, y)
# Show the raw data before pausing, as the first two data sets do.
plt.show()
print('Program paused. Press enter to continue.\n')
input()

# ---- Example data set 3: refit the RBF classifier ----
# NOTE(review): this reuses the classifier configured above (C = 1,
# sigma = 0.1); no parameter search is performed for this data set.
clf.fit(X, y.ravel())
visualize_boundary(X, y, clf)
plt.show()
print('Program paused. Press enter to continue.\n')
input()

97
ex6/ex6_spam.py

@ -0,0 +1,97 @@
import re
import numpy as np
from nltk.stem.porter import PorterStemmer as stemmer
def readfile(filename):
    """Return the entire text content of ``filename``.

    If the file cannot be opened, a message is printed and an empty string
    is returned instead of raising. ``open()`` signals failure by raising
    ``OSError`` rather than returning ``None``, so the failure path has to
    be an exception handler.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file content as a string, or ``''`` when the file cannot be
        opened.
    """
    try:
        fid = open(filename, mode='r')
    except OSError:
        print('Unable to open {0:s}\n'.format(filename))
        return ''
    # The context manager closes the file even if reading fails.
    with fid:
        return fid.read()
def getvocablist(filename='text/vocab.txt'):
    """Load the vocabulary list from a tab-separated file.

    Each non-blank line is expected to have the form ``<number>\\t<word>``;
    only the word (the second tab-separated field) is kept. Blank lines are
    skipped. The position of a word in the returned list is the index used
    by ``process_email`` and ``email_features``.

    Args:
        filename: Path of the vocabulary file. Defaults to the file shipped
            with the exercise.

    Returns:
        A list of vocabulary words in file order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank line contains no tab separator.
    """
    # The context manager closes the file even if parsing below raises.
    with open(filename) as fid:
        lines = fid.read().splitlines()
    return [line.split("\t")[1] for line in lines if line.strip()]
def process_email(email_content):
    """Normalize an email body and map its words to vocabulary indices.

    The text is lower-cased, HTML tags are removed, and numbers, URLs,
    email addresses and dollar signs are replaced by the placeholder tokens
    ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. The result is
    split on whitespace and punctuation, stripped of any remaining
    non-alphanumeric characters, and stemmed. The processed words are
    printed, wrapped at 78 columns.

    Args:
        email_content: The raw email text (converted with ``str()``).

    Returns:
        A list with, for every processed word found in the vocabulary, its
        0-based position in the list returned by ``getvocablist()``. Words
        are reported in order of appearance, duplicates included.
    """
    import textwrap  # local import keeps this block self-contained

    # Map each word to the position of its first occurrence, so a lookup
    # returns what list.index() would, but in constant time.
    vocab_index = {}
    for position, vocab_word in enumerate(getvocablist()):
        vocab_index.setdefault(vocab_word, position)

    # Preprocess
    email_content = str(email_content).lower()  # Lower case
    email_content = re.sub(r'<[^<>]+>', '', email_content)  # Strip all HTML
    email_content = re.sub(r'[0-9]+', 'number', email_content)  # Handle Numbers
    email_content = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_content)  # Handle URLS
    # Handle Email Addresses: a run of non-space characters around an '@'.
    email_content = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_content)
    email_content = re.sub(r'[$]+', 'dollar', email_content)  # Handle $ sign

    print('\n==== Processed Email ====\n\n')

    # Tokenize and get rid of punctuation
    tokens = re.split(r'[ @\$/#\.\-:&\*\+=\[\]\?!\(\)\{\},\'\">_<;%\n\r]',
                      email_content)
    non_alnum = re.compile(r'[^a-zA-Z0-9]')
    stm = stemmer()

    processed_words = []
    word_indices = []
    for token in tokens:
        word = non_alnum.sub('', token)  # Remove any non alphanumeric characters
        if not word:  # Nothing left of this token
            continue
        word = stm.stem(word)  # Stem the word
        if not word:
            continue
        processed_words.append(word)
        # Look up the word in the vocabulary; unknown words are ignored.
        ind = vocab_index.get(word)
        if ind is not None:
            word_indices.append(ind)

    # Print to screen, breaking lines between words only.
    print(textwrap.fill(' '.join(processed_words), width=78,
                        break_long_words=False, break_on_hyphens=False))
    return word_indices
def email_features(word_indices, n=1899):
    """Build a binary bag-of-words feature vector from vocabulary indices.

    Args:
        word_indices: Iterable of 0-based vocabulary positions, as returned
            by ``process_email``. Duplicates are allowed; an empty sequence
            yields an all-zero vector.
        n: Length of the feature vector (the vocabulary size). Defaults to
            1899.

    Returns:
        A 1-D float array of length ``n`` whose entry ``i`` is 1 if ``i``
        occurs in ``word_indices`` and 0 otherwise.

    Raises:
        IndexError: If an index falls outside ``[-n, n)``.
    """
    x = np.zeros(n)
    # An explicit integer array keeps empty and tuple inputs well-defined.
    x[np.asarray(word_indices, dtype=int)] = 1
    return x
## ==================== Feature Extraction ====================
# Read a sample email, turn it into vocabulary indices with process_email,
# and convert those indices into a binary feature vector with
# email_features. The vector length and its number of set entries are
# printed for inspection.
print('\nExtracting features from sample email (emailSample1.txt)\n')

file_contents = readfile('text/emailSample1.txt')
word_indices = process_email(file_contents)
features = email_features(word_indices)

vector_length = features.shape[0]
nonzero_count = int(np.count_nonzero(features > 0))
print('Length of feature vector: {0:d}\n'.format(vector_length))
print('Number of non-zero entries: {0:d}\n'.format(nonzero_count))

print('Program paused. Press enter to continue.\n')
input()

BIN
ex6/mat/ex6data1.mat

Binary file not shown.

BIN
ex6/mat/ex6data2.mat

Binary file not shown.

BIN
ex6/mat/ex6data3.mat

Binary file not shown.

BIN
ex6/mat/spamTest.mat

Binary file not shown.

BIN
ex6/mat/spamTrain.mat

Binary file not shown.

10
ex6/text/emailSample1.txt

@ -0,0 +1,10 @@
> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100.
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2
if youre running something big..
To unsubscribe yourself from this mailing list, send an email to:
[email protected]

34
ex6/text/emailSample2.txt

@ -0,0 +1,34 @@
Folks,
my first time posting - have a bit of Unix experience, but am new to Linux.
Just got a new PC at home - Dell box with Windows XP. Added a second hard disk
for Linux. Partitioned the disk and have installed Suse 7.2 from CD, which went
fine except it didn't pick up my monitor.
I have a Dell branded E151FPp 15" LCD flat panel monitor and a nVidia GeForce4
Ti4200 video card, both of which are probably too new to feature in Suse's default
set. I downloaded a driver from the nVidia website and installed it using RPM.
Then I ran Sax2 (as was recommended in some postings I found on the net), but
it still doesn't feature my video card in the available list. What next?
Another problem. I have a Dell branded keyboard and if I hit Caps-Lock twice,
the whole machine crashes (in Linux, not Windows) - even the on/off switch is
inactive, leaving me to reach for the power cable instead.
If anyone can help me in any way with these probs., I'd be really grateful -
I've searched the 'net but have run out of ideas.
Or should I be going for a different version of Linux such as RedHat? Opinions
welcome.
Thanks a lot,
Peter
--
Irish Linux Users' Group: [email protected]
http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.
List maintainer: [email protected]

42
ex6/text/spamSample1.txt

@ -0,0 +1,42 @@
Do You Want To Make $1000 Or More Per Week?
If you are a motivated and qualified individual - I
will personally demonstrate to you a system that will
make you $1,000 per week or more! This is NOT mlm.
Call our 24 hour pre-recorded number to get the
details.
000-456-789
I need people who want to make serious money. Make
the call and get the facts.
Invest 2 minutes in yourself now!
000-456-789
Looking forward to your call and I will introduce you
to people like yourself who
are currently making $10,000 plus per week!
000-456-789
3484lJGv6-241lEaN9080lRmS6-271WxHo7524qiyT5-438rjUv5615hQcf0-662eiDB9057dMtVl72

8
ex6/text/spamSample2.txt

@ -0,0 +1,8 @@
Best Buy Viagra Generic Online
Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed!
We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers!
http://medphysitcstech.ru

1899
ex6/text/vocab.txt

File diff suppressed because it is too large
Loading…
Cancel
Save