本期主要讲解如何利用逻辑回归实现多分类,内容包括数据可视化、假设函数、损失函数、参数最优化、一对多分类训练器以及模型准确率评估等,对应吴恩达机器学习课程第四周的编程练习(ex3),并融入了自己的一些想法。
%matplotlib inline
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import cv2 as cv
#import scipy.misc #Used to show matrix as an image
import matplotlib.cm as cm #Used to display images in a specific colormap
from scipy.special import expit #导入logistic函数
import scipy.io as sio #调用scipy模块读取matlab的文件
# Load the MATLAB-format training set used by the exercise.
data = sio.loadmat("CourseraML/ex3/data/ex3data1.mat")
X = data["X"]  # features
y = data["y"]  # labels
# Prepend a constant column of ones so theta[0] acts as the intercept.
X = np.insert(X, obj=0, values=1, axis=1)
print(X.shape, y.shape)  # author's run printed: (5000, 401) (5000, 1)
print(np.unique(y))  # author's run printed: [ 1 2 3 4 5 6 7 8 9 10]
def getDataImg(row):
    """Turn one row of the design matrix into a 20x20 pixel matrix.

    Args:
        row: 1-D array whose first entry is the intercept term and whose
            remaining 400 entries are pixel values.

    Returns:
        The 20x20 array obtained by reshaping the 400 pixels row by row
        and then transposing.
    """
    side = 20
    pixels = row[1:]  # drop the leading intercept entry
    return pixels.reshape(side, side).transpose()
def dataDisplay(indices_to_display=None):
    """Draw a 10x10 mosaic of digit images taken from rows of the global ``X``.

    Args:
        indices_to_display: Iterable of row indices into ``X``; the images
            are placed left to right, top to bottom. ``None`` or an empty
            iterable draws 100 rows sampled at random without replacement.
            Lists, tuples and NumPy index arrays are all accepted.
    """
    width, height = 20, 20
    nrows, ncols = 10, 10
    # Materialize to a list first: the truth value of a multi-element NumPy
    # array is ambiguous and would raise ValueError in the emptiness check.
    if indices_to_display is not None:
        indices_to_display = list(indices_to_display)
    if not indices_to_display:
        # Pick nrows * ncols distinct rows of X at random.
        indices_to_display = random.sample(range(X.shape[0]), nrows * ncols)
    # Blank canvas large enough for nrows x ncols tiles of height x width.
    big_picture = np.zeros((height * nrows, width * ncols))
    irow, icol = 0, 0
    for idx in indices_to_display:
        if icol == ncols:
            # Current tile row is full: wrap to the start of the next one.
            irow += 1
            icol = 0
        iimg = getDataImg(X[idx])
        top, left = irow * height, icol * width
        big_picture[top:top + iimg.shape[0], left:left + iimg.shape[1]] = iimg
        icol += 1
    plt.figure(figsize=(6, 6))
    # The factor 5 is the author's contrast tweak: casting the unscaled
    # values straight to uint8 was noted to give a blurry picture.
    img = Image.fromarray((big_picture * 5).astype('uint8'), mode="L")
    plt.imshow(img, cmap=cm.Greys_r)
# Called without indices, so dataDisplay picks 100 random rows of X to show.
dataDisplay()
def h(mytheta, myX):
    """Hypothesis function: logistic sigmoid of the linear score.

    Args:
        mytheta: Parameter vector (or matrix with one column per model).
        myX: Design matrix with one sample per row.

    Returns:
        ``expit(myX . mytheta)``, shaped like the dot product.
    """
    linear_score = np.dot(myX, mytheta)
    return expit(linear_score)
def costFunction(mytheta, myX, myy, mylambda=0):
    """Regularized logistic-regression cost (mean cross-entropy + L2 penalty).

    Args:
        mytheta: Parameters, shape (n,) or (n, 1); flattened internally.
        myX: Design matrix of shape (m, n) whose first column is the
            intercept column.
        myy: Binary labels, shape (m,), (m, 1) or (1, m); flattened internally.
        mylambda: L2 regularization strength.

    Returns:
        The scalar cost. The intercept ``mytheta[0]`` is left out of the
        penalty so that the value matches the gradient in ``costGradient``,
        which also leaves the intercept unregularized.
    """
    # Flatten so column-vector inputs cannot broadcast into an (m, m) matrix.
    theta = np.asarray(mytheta, dtype=float).ravel()
    labels = np.asarray(myy, dtype=float).ravel()
    features = np.asarray(myX, dtype=float)
    m = features.shape[0]
    z = np.dot(features, theta)
    # -log(sigmoid(z)) == logaddexp(0, -z) and -log(1 - sigmoid(z)) ==
    # logaddexp(0, z); this form stays finite when the sigmoid saturates,
    # where log(0) would otherwise yield inf/nan.
    data_loss = (labels.dot(np.logaddexp(0, -z))
                 + (1 - labels).dot(np.logaddexp(0, z)))
    penalty = theta[1:].dot(theta[1:]) * mylambda / (2 * m)
    return data_loss / m + penalty
# Column-vector starting point for theta: one zero per column of X.
initial_theta = np.zeros((X.shape[1], 1))
# print(costFunction(initial_theta, X, y))  # author's note: shape (5000, 5000)
from scipy import optimize
def costGradient(mytheta, myX, myy, mylambda=0.):
    """Gradient of the regularized logistic-regression cost w.r.t. theta.

    Args:
        mytheta: Parameters, shape (n,) or (n, 1); flattened internally.
        myX: Design matrix of shape (m, n) whose first column is the
            intercept column.
        myy: Binary labels, shape (m,), (m, 1) or (1, m); flattened internally.
        mylambda: L2 regularization strength.

    Returns:
        1-D array of length n. The intercept component carries no
        regularization term.
    """
    # Flatten so that (m, 1) predictions minus (1, m) labels cannot
    # broadcast into an (m, m) residual matrix.
    theta = np.asarray(mytheta, dtype=float).ravel()
    labels = np.asarray(myy, dtype=float).ravel()
    features = np.asarray(myX, dtype=float)
    m = features.shape[0]
    residual = expit(np.dot(features, theta)) - labels
    grad = np.dot(features.T, residual) / m
    # Regularize every component except the intercept.
    grad[1:] += theta[1:] * (mylambda / m)
    return grad
def optimizeTheta(mytheta, myX, myy, mylambda=0.):
    """Minimize ``costFunction`` with conjugate gradient, starting at ``mytheta``.

    Runs ``scipy.optimize.fmin_cg`` for at most 50 iterations, using
    ``costGradient`` as the analytic gradient and passing
    ``(myX, myy, mylambda)`` through to both callbacks.

    Returns:
        Tuple ``(theta, cost)``: the first two entries of the optimizer's
        full output, i.e. the parameters found and the cost at that point.
    """
    best_theta, min_cost, *_ = optimize.fmin_cg(
        costFunction,
        x0=mytheta,
        fprime=costGradient,
        args=(myX, myy, mylambda),
        maxiter=50,
        disp=False,
        full_output=True,
    )
    return best_theta, min_cost
def oneVsAll(myX, myy, labels_num, mylambda):
    """Train one binary logistic-regression classifier per class.

    Args:
        myX: Design matrix of shape (m, n), intercept column included.
        myy: Class labels, shape (m,) or (m, 1). The digit zero is stored
            under label 10, so row 0 of the result is trained against 10.
        labels_num: Number of classifiers to train.
        mylambda: L2 regularization strength forwarded to the optimizer.

    Returns:
        Array of shape (labels_num, n); row ``i`` holds the optimized
        parameters of the classifier for digit ``i``.
    """
    # Use the arguments, not the module-level X / y, so the trainer works
    # on whatever data the caller passes in.
    labels = np.asarray(myy).ravel()
    n_features = myX.shape[1]
    all_theta = np.zeros((labels_num, n_features))
    initial_theta = np.zeros(n_features)
    for i in range(labels_num):
        print('Optimizing for handwritten number {}...'.format(i))
        iclass = i if i else 10  # digit 0 is labelled as class 10
        # Binary target for this round: 1 where the sample is iclass, else 0.
        logic_Y = (labels == iclass).astype(int)
        itheta, _ = optimizeTheta(initial_theta, myX, logic_Y, mylambda)
        all_theta[i, :] = itheta
    return all_theta
def predict_one_vs_all(myX, all_theta):
    """Predict, for each sample, the classifier with the highest hypothesis.

    Args:
        myX: Design matrix with one sample per row.
        all_theta: Parameter matrix with one classifier per row.

    Returns:
        1-D array holding, for every row of ``myX``, the index of the row of
        ``all_theta`` whose hypothesis value is largest.
    """
    # One column of hypothesis values per classifier.
    class_scores = h(all_theta.T, myX)
    return np.argmax(class_scores, axis=1)
# Train the ten one-vs-all classifiers with lambda = 1; all_theta holds one
# row of parameters per classifier.
all_theta = oneVsAll(X, y, 10, 1)
y_pred = predict_one_vs_all(X, all_theta)
# Classifier index 0 stands for the digit zero, which the data labels as 10.
y_pred = [x if x else 10 for x in y_pred]
# Count matches in one vectorized comparison against the flattened labels.
n_total = len(y_pred)
n_correct = int(np.sum(np.asarray(y_pred) == y.ravel()))
accuarcy = np.round(n_correct / n_total, 2)
# Round again after scaling so products such as 0.57 * 100 do not print
# as 56.99999999999999.
print("The accuarcy is {}%".format(np.round(accuarcy * 100, 2)))
最后输出大约是这样子的,正确率在94%左右。
Optimizing for handwritten number 0...
Optimizing for handwritten number 1...
Optimizing for handwritten number 2...
Optimizing for handwritten number 3...
Optimizing for handwritten number 4...
Optimizing for handwritten number 5...
Optimizing for handwritten number 6...
Optimizing for handwritten number 7...
Optimizing for handwritten number 8...
Optimizing for handwritten number 9...
The accuarcy is 94.0%
本文地址:https://blog.csdn.net/zengbowengood/article/details/107442085
如对本文有疑问, 点击进行留言回复!!
从零开始的PYTHON3摸鱼(二)windows如何运行python,编辑器选择
网友评论