
Finish debugging ex5. Add numpy version of fmincg.

master · wchen342 · 6 years ago · commit 46de43741a
  1. ex5/ex5.py (58 lines changed)
  2. extra/fmincg.py (191 lines added)

ex5/ex5.py (58 lines changed)

@@ -2,6 +2,7 @@
import numpy as np
import matplotlib.pyplot as plt
import scipy, scipy.io, scipy.optimize
from extra.fmincg import fmincg
# LINEARREGCOSTFUNCTION Compute cost and gradient for regularized linear
@@ -18,7 +19,7 @@ def linear_reg_cost_function(X, y, theta, ld):
theta = theta.reshape((d1, d2))
J = 0.5 / m * (np.sum((X.dot(theta) - y) ** 2) + ld * np.sum(theta[1:, :] ** 2))
grad = 1 / m * X.T.dot((X.dot(theta) - y)) + ld / m * np.vstack((np.zeros((1, theta.shape[1])), theta[1:, :]))
grad = 1 / m * X.T.dot(X.dot(theta) - y) + ld / m * np.vstack((np.zeros((1, theta.shape[1])), theta[1:, :]))
return J, grad.flatten()
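A quick way to sanity-check the analytic gradient returned above is a central-difference comparison. This is a minimal sketch, assuming a small hand-made design matrix and that linear_reg_cost_function accepts a flattened theta (it reshapes internally and returns a flattened gradient):

# Hedged sketch: finite-difference check of the analytic gradient on toy data.
import numpy as np

X_toy = np.hstack((np.ones((5, 1)), np.arange(5.0).reshape(5, 1)))  # bias column plus one feature
y_toy = np.array([[1.0], [2.0], [2.5], [4.0], [5.0]])
theta0 = np.array([1.0, 1.0])
ld = 1.0

J, grad = linear_reg_cost_function(X_toy, y_toy, theta0, ld)

eps = 1e-4
num_grad = np.zeros_like(theta0)
for j in range(theta0.size):
    step = np.zeros_like(theta0)
    step[j] = eps
    Jp, _ = linear_reg_cost_function(X_toy, y_toy, theta0 + step, ld)
    Jm, _ = linear_reg_cost_function(X_toy, y_toy, theta0 - step, ld)
    num_grad[j] = (Jp - Jm) / (2 * eps)

print(np.max(np.abs(num_grad - grad)))  # expected to be on the order of 1e-9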
@@ -35,12 +36,16 @@ def train_linear_reg(X, y, ld):
cost_function = lambda t: linear_reg_cost_function(X, y, t, ld)
# Now, costFunction is a function that takes in only one argument
options = {'maxiter': 2000, 'norm': 3, 'disp': True}
options = {'maxiter': 200, 'disp': True, 'gtol': 0.05}
# Minimize using minimize
# Because the implementation of Scipy, the result looks somehow different
theta = scipy.optimize.minimize(cost_function, initial_theta, options=options, jac=True, method='BFGS')['x']
return theta
# Because of how SciPy's minimize function is implemented, the result looks a little different
ret = scipy.optimize.minimize(cost_function, initial_theta, options=options, jac=True, method='BFGS')
return ret['x']
# Used the fmincg routine provided with the MATLAB version of the exercise (ported to numpy in extra/fmincg.py) to verify my implementation
# theta = fmincg(cost_function, initial_theta.flatten(), {'MaxIter': 200}, ())
# return theta
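For reference, a minimal sketch of how the two solvers are meant to be interchangeable here (cost_function and initial_theta as defined earlier in this function; the exact shapes are assumptions). jac=True tells scipy.optimize.minimize that the callable returns the pair (cost, gradient), and the numpy fmincg port consumes the same callable, with extra arguments passed as a tuple:

# Hedged sketch: the same (cost, gradient) callable drives either optimizer.
res = scipy.optimize.minimize(cost_function, initial_theta.flatten(),
                              jac=True, method='BFGS',
                              options={'maxiter': 200, 'disp': True, 'gtol': 0.05})
theta_bfgs = res['x']

theta_cg = fmincg(cost_function, initial_theta.flatten(), {'MaxIter': 200}, ())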
# LEARNINGCURVE Generates the train and cross validation set errors needed
@@ -116,7 +121,30 @@ def plotfit(min_x, max_x, mu, sigma, theta, p):
plt.plot(x, X_poly.dot(theta), '--', linewidth=2)
np.set_printoptions(formatter={'float': '{: 0.5f}'.format}, edgeitems=50, linewidth=150)
# VALIDATIONCURVE Generate the train and validation errors needed to
# plot a validation curve that we can use to select lambda
# [lambda_vec, error_train, error_val] = ...
# VALIDATIONCURVE(X, y, Xval, yval) returns the train
# and validation errors (in error_train, error_val)
# for different values of lambda. You are given the training set (X,
# y) and validation set (Xval, yval).
def validation_curve(X, y, Xval, yval):
# Selected values of lambda (you should not change this)
lambda_vec = np.array([0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]).T
# Initialize return values
error_train = np.zeros((lambda_vec.size, 1))
error_val = np.zeros((lambda_vec.size, 1))
for i in range(0, lambda_vec.size):
ld = lambda_vec[i]
theta = train_linear_reg(X, y, ld).reshape(X.shape[1], y.shape[1])
error_train[i] = 0.5 / X.shape[0] * np.sum((X.dot(theta) - y) ** 2)
error_val[i] = 0.5 / yval.shape[0] * np.sum((Xval.dot(theta) - yval) ** 2)
return lambda_vec, error_train, error_val
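Note that error_train and error_val above are computed without the regularization term, even though theta itself is trained with the current lambda. Equivalently, a minimal sketch of the loop body obtains the same numbers from linear_reg_cost_function with the regularization parameter set to zero:

# Hedged sketch: evaluation errors exclude the penalty, so pass ld = 0.
error_train[i], _ = linear_reg_cost_function(X, y, theta, 0)
error_val[i], _ = linear_reg_cost_function(Xval, yval, theta, 0)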
np.set_printoptions(formatter={'float': '{: 0.64f}'.format, 'longfloat': '{: 0.64f}'.format}, edgeitems=50, linewidth=150)
## =========== Part 1: Loading and Visualizing Data =============
# Load Training Data
print('Loading and Visualizing Data ...\n')
@@ -201,7 +229,6 @@ p = 8
# Map X onto Polynomial Features and Normalize
X_poly = poly_features(X, p)
X_poly2 = np.copy(X_poly)
X_poly, mu, sigma = feature_normalize(X_poly) # Normalize
X_poly = np.hstack((np.ones((m, 1)), X_poly)) # Add Ones
@@ -213,7 +240,6 @@ X_poly_test = np.hstack((np.ones((X_poly_test.shape[0], 1)), X_poly_test)) # Ad
# Map X_poly_val and normalize (using mu and sigma)
X_poly_val = poly_features(Xval, p)
X_poly_val2 = np.copy(X_poly_val)
X_poly_val = X_poly_val - mu
X_poly_val = X_poly_val / sigma
X_poly_val = np.hstack((np.ones((X_poly_val.shape[0], 1)), X_poly_val)) # Add Ones
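The pattern here is worth making explicit: mu and sigma are fit once on the polynomial-mapped training features and then reused verbatim for the validation and test features, so all three sets share a single scaling. A minimal sketch of that contract, assuming the poly_features and feature_normalize helpers and the Xtest array loaded earlier in this file:

# Hedged sketch: normalization statistics come from the training set only.
X_poly_train, mu, sigma = feature_normalize(poly_features(X, p))
X_poly_val_n = (poly_features(Xval, p) - mu) / sigma    # reuse mu/sigma, do not refit
X_poly_test_n = (poly_features(Xtest, p) - mu) / sigma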
@@ -253,3 +279,19 @@ for i in range(0, m):
print(' \t{0:d}\t\t{1:f}\t{2:f}\n'.format(i + 1, error_train[i, 0], error_val[i, 0]))
print('Program paused. Press enter to continue.\n')
input()
## =========== Part 8: Validation for Selecting Lambda =============
lambda_vec, error_train, error_val = validation_curve(X_poly, y, X_poly_val, yval)
plt.plot(lambda_vec, error_train, lambda_vec, error_val)
plt.legend(['Train', 'Cross Validation'])
plt.xlabel('lambda')
plt.ylabel('Error')
plt.show()
print('lambda\t\tTrain Error\tValidation Error\n')
for i in range(0, lambda_vec.size):
print(' {0:f}\t{1:f}\t{2:f}\n'.format(lambda_vec[i], error_train[i, 0], error_val[i, 0]))
print('Program paused. Press enter to continue.\n')
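To turn the printed table into a single choice of lambda, the usual rule is to take the value with the smallest cross-validation error; a minimal sketch:

# Hedged sketch: select lambda by minimum cross-validation error.
best_lambda = lambda_vec[np.argmin(error_val)]
print('Best lambda by validation error: {0:f}'.format(best_lambda))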

extra/fmincg.py (191 lines added)

@@ -0,0 +1,191 @@
import numpy as np
import math
# Minimize a continuous differentiable multivariate function. Starting point
# is given by "X" (D by 1), and the function named in the string "f", must
# return a function value and a vector of partial derivatives. The Polack-
# Ribiere flavour of conjugate gradients is used to compute search directions,
# and a line search using quadratic and cubic polynomial approximations and the
# Wolfe-Powell stopping criteria is used together with the slope ratio method
# for guessing initial step sizes. Additionally a bunch of checks are made to
# make sure that exploration is taking place and that extrapolation will not
# be unboundedly large. The "length" gives the length of the run: if it is
# positive, it gives the maximum number of line searches, if negative its
# absolute gives the maximum allowed number of function evaluations. You can
# (optionally) give "length" a second component, which will indicate the
# reduction in function value to be expected in the first line-search (defaults
# to 1.0). The function returns when either its length is up, or if no further
# progress can be made (ie, we are at a minimum, or so close that due to
# numerical problems, we cannot get any closer). If the function terminates
# within a few iterations, it could be an indication that the function value
# and derivatives are not consistent (ie, there may be a bug in the
# implementation of your "f" function). The function returns the found
# solution "X", a vector of function values "fX" indicating the progress made
# and "i" the number of iterations (line searches or function evaluations,
# depending on the sign of "length") used.
#
# Usage: X = fmincg(f, X, options, args)   (this numpy port takes an args tuple and returns only X)
#
# See also: checkgrad
#
# Copyright (C) 2001 and 2002 by Carl Edward Rasmussen. Date 2002-02-13
#
#
# (C) Copyright 1999, 2000 & 2001, Carl Edward Rasmussen
#
# Permission is granted for anyone to copy, use, or modify these
# programs and accompanying documents for purposes of research or
# education, provided this copyright notice is retained, and note is
# made of any changes that have been made.
#
# These programs and documents are distributed without any warranty,
# express or implied. As the programs were written for research
# purposes only, they have not been tested to the degree that would be
# advisable in any important application. All use of these programs is
# entirely at the user's own risk.
#
# [ml-class] Changes Made:
# 1) Function name and argument specifications
# 2) Output display
#
# Read options
def fmincg(f, X, options, args):
if isinstance(options, dict) and options.get('MaxIter') is not None:
length = options['MaxIter']
else:
length = 100
RHO = 0.01 # a bunch of constants for line searches
SIG = 0.5 # RHO and SIG are the constants in the Wolfe-Powell conditions
INT = 0.1 # don't reevaluate within 0.1 of the limit of the current bracket
EXT = 3.0 # extrapolate maximum 3 times the current bracket
MAX = 20 # max 20 function evaluations per line search
RATIO = 100 # maximum allowed slope ratio
red = 1
S = 'Iteration '
i = 0 # zero the run length counter
ls_failed = 0 # no previous line search has failed
fX = np.array([])
f1, df1 = f(*((X,) + args)) # get function value and gradient
i += length < 0 # count epochs?!
s = -df1 # search direction is steepest
d1 = -s.T.dot(s) # this is the slope
z1 = red / (1 - d1) # initial step is red/(|s|+1)
while i < abs(length): # while not finished
i += length > 0 # count iterations?!
X0 = X
f0 = f1
df0 = df1 # make a copy of current values
X = X + z1 * s # begin line search
f2, df2 = f(*((X,) + args))
i += length < 0 # count epochs?!
d2 = df2.T.dot(s)
f3 = f1
d3 = d1
z3 = -z1 # initialize point 3 equal to point 1
if length > 0:
M = MAX
else:
M = min(MAX, -length - i)
success = 0
limit = -1 # initialize quantities
while 1:
while ((f2 > f1 + z1 * RHO * d1) or (d2 > -SIG * d1)) and (M > 0):
limit = z1 # tighten the bracket
if f2 > f1:
z2 = z3 - (0.5 * d3 * z3 * z3) / (d3 * z3 + f2 - f3) # quadratic fit
else:
A = 6 * (f2 - f3) / z3 + 3 * (d2 + d3) # cubic fit
B = 3 * (f3 - f2) - z3 * (d3 + 2 * d2)
z2 = (np.sqrt(B * B - A * d2 * z3 * z3) - B) / A # numerical error possible - ok!
if math.isnan(z2) or math.isinf(z2):
z2 = z3 / 2 # if we had a numerical problem then bisect
z2 = np.amax(
np.array([np.amin(np.array([z2, INT * z3])), (1 - INT) * z3])) # don't accept too close to limits
z1 = z1 + z2 # update the step
X = X + z2 * s
f2, df2 = f(*((X,) + args))
M = M - 1
i = i + (length < 0) # count epochs?!
d2 = df2.T.dot(s)
z3 = z3 - z2 # z3 is now relative to the location of z2
if f2 > f1 + z1 * RHO * d1 or d2 > -SIG * d1:
break # this is a failure
elif d2 > SIG * d1:
success = 1
break # success
elif M == 0:
break # failure
A = 6 * (f2 - f3) / z3 + 3 * (d2 + d3) # make cubic extrapolation
B = 3 * (f3 - f2) - z3 * (d3 + 2 * d2)
z2 = -d2 * z3 * z3 / (B + np.sqrt(B * B - A * d2 * z3 * z3)) # num. error possible - ok!
if not np.isreal(z2) or np.isnan(z2) or np.isinf(z2) or z2 < 0: # num prob or wrong sign?
if limit < -0.5: # if we have no upper limit
z2 = z1 * (EXT - 1) # then extrapolate the maximum amount
else:
z2 = (limit - z1) / 2 # otherwise bisect
elif (limit > -0.5) and (z2 + z1 > limit): # extrapolation beyond max?
z2 = (limit - z1) / 2 # bisect
elif (limit < -0.5) and (z2 + z1 > z1 * EXT): # extrapolation beyond limit
z2 = z1 * (EXT - 1.0) # set to extrapolation limit
elif z2 < -z3 * INT:
z2 = -z3 * INT
elif (limit > -0.5) and (z2 < (limit - z1) * (1.0 - INT)): # too close to limit?
z2 = (limit - z1) * (1.0 - INT)
f3 = f2
d3 = d2
z3 = -z2 # set point 3 equal to point 2
z1 = z1 + z2
X = X + z2 * s # update current estimates
f2, df2 = f(*((X,) + args))
M = M - 1
i = i + (length < 0) # count epochs?!
d2 = df2.T.dot(s)
# end of line search
if success: # if line search succeeded
f1 = f2
fX = np.hstack((fX, f1))
print('{0:s} {1:4d} | Cost: {2:4.6e}\r'.format(S, i, f1))
s = (df2.dot(df2) - df1.dot(df2)) / (df1.dot(df1)) * s - df2 # Polack-Ribiere direction
tmp = df1
df1 = df2
df2 = tmp # swap derivatives
d2 = df1.dot(s)
if d2 > 0: # new slope must be negative
s = -df1 # otherwise use steepest direction
d2 = -s.dot(s)
z1 = z1 * min(RATIO, d1 / (d2 - np.finfo(np.double).tiny)) # slope ratio but max RATIO
d1 = d2
ls_failed = 0 # this line search did not fail
else:
X = X0
f1 = f0
df1 = df0 # restore point from before failed line search
if ls_failed or i > np.abs(length): # line search failed twice in a row
break # or we ran out of time, so we give up
tmp = df1
df1 = df2
df2 = tmp # swap derivatives
s = -df1 # try steepest
d1 = -s.dot(s)
z1 = 1 / (1 - d1)
ls_failed = 1 # this line search failed
print('\n')
return X
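A quick way to exercise this port in isolation is a small convex problem with a known minimizer; a minimal sketch (the quadratic below is purely illustrative and not part of the exercise):

# Hedged sketch: minimize f(x) = ||x - c||^2, whose gradient is 2 * (x - c).
import numpy as np
from extra.fmincg import fmincg

c = np.array([1.0, -2.0, 3.0])

def quad(x):
    return np.sum((x - c) ** 2), 2.0 * (x - c)

x_opt = fmincg(quad, np.zeros(3), {'MaxIter': 50}, ())
print(x_opt)  # expected to end up close to [1, -2, 3]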