|
10 | 10 |
|
11 | 11 |
|
def loadDataSet(fileName):
    """Load a tab-separated data file.

    Each line holds two feature columns followed by a class label.

    Args:
        fileName: path to the tab-separated data file
    Returns:
        (dataMat, labelMat): list of [x0, x1] feature rows and the
        matching list of float labels
    """
    dataMat = []
    labelMat = []
    # with-block guarantees the file is closed even on error
    # (the original opened the file and never closed it).
    with open(fileName) as fr:
        for line in fr.readlines():
            lineArr = line.strip().split('\t')
            dataMat.append([float(lineArr[0]), float(lineArr[1])])
            labelMat.append(float(lineArr[2]))
    return dataMat, labelMat
21 | 22 |
|
22 | 23 |
|
def seqPegasos(dataSet, labels, lam, T):
    """Sequential Pegasos: one randomly chosen sample per SGD step.

    Args:
        dataSet: feature matrix (numpy matrix, one sample per row)
        labels: class labels, one per sample row
        lam: regularization constant
        T: number of iterations
    Returns:
        w: learned weight vector
    """
    m, n = shape(dataSet)
    w = zeros(n)
    for t in range(1, T+1):
        i = random.randint(m)  # numpy.random.randint: one random sample index
        eta = 1.0/(lam*t)      # decaying learning rate (SGD step size)
        p = predict(w, dataSet[i, :])
        # Margin violation (y*p < 1): take a gradient step toward the sample;
        # otherwise only shrink the weights.
        # 1.0/t forces true division — the original's 1/t is integer
        # division under Python 2 and evaluates to 0 (t=1) or 1 (t>=2),
        # destroying the intended (1 - 1/t) shrink factor.
        if labels[i]*p < 1:
            w = (1.0 - 1.0/t)*w + eta*labels[i]*dataSet[i, :]
        else:
            w = (1.0 - 1.0/t)*w
        print(w)
    return w
35 | 37 |
|
36 | 38 |
|
def predict(w, x):
    """Return the raw SVM score w . x^T for sample x (the predicted y value)."""
    score = w * x.T
    return score
39 | 41 |
|
40 | 42 |
|
def batchPegasos(dataSet, labels, lam, T, k):
    """Batch Pegasos: train a linear SVM with mini-batch stochastic gradients.

    Args:
        dataSet: feature matrix (numpy matrix, one sample per row)
        labels: class labels, one per sample row
        lam: regularization constant (how strongly the weights shrink)
        T: number of outer iterations
        k: mini-batch size processed per iteration
    Returns:
        w: learned weight vector
    """
    m, n = shape(dataSet)
    w = zeros(n)
    # list() so the index sequence can be shuffled in place —
    # numpy.random.shuffle needs a mutable sequence, and a bare range()
    # is not one on Python 3.
    dataIndex = list(range(m))
    for t in range(1, T+1):
        wDelta = mat(zeros(n))  # reset the accumulated change
        # eta is the learning rate (step size of the stochastic gradient);
        # it is recomputed every iteration and decays as 1/(lam*t).
        eta = 1.0/(lam*t)
        random.shuffle(dataIndex)
        # Inner loop is the batch ("mapper") work: accumulate the
        # contribution of every misclassified sample in the mini-batch.
        for j in range(k):
            i = dataIndex[j]
            p = predict(w, dataSet[i, :])  # mapper code
            # A sample is fine when it sits on the correct side with
            # margin >= 1; otherwise it adds to the pending update of w.
            if labels[i]*p < 1:  # mapper code
                wDelta += labels[i]*dataSet[i, :].A  # accumulate the change
        # One weight update per outer iteration t. 1.0/t forces true
        # division — the original's 1/t is integer division under
        # Python 2 and degenerates the shrink factor to 0 or 1.
        w = (1.0 - 1.0/t)*w + (eta/k)*wDelta
    return w
56 | 78 |
|
57 | 79 |
|
# Script entry: train a Pegasos SVM on the sample data set and print the
# learned weight vector.
datArr, labelList = loadDataSet('input/15.BigData_MapReduce/testSet.txt')
datMat = mat(datArr)
# finalWs = seqPegasos(datMat, labelList, 2, 5000)
finalWs = batchPegasos(datMat, labelList, 2, 50, 100)
# print(...) call form: identical output for a single object under
# Python 2, and valid syntax under Python 3 (the original bare
# `print finalWs` statement is Python-2-only).
print(finalWs)
63 | 85 |
|
import matplotlib
import matplotlib.pyplot as plt

# Plot the two classes and the separating lines learned by Pegasos.
fig = plt.figure()
ax = fig.add_subplot(111)

# Split sample coordinates by class label.
pos_x, pos_y, neg_x, neg_y = [], [], [], []
for i, label in enumerate(labelList):
    if label == 1.0:
        pos_x.append(datMat[i, 0])
        pos_y.append(datMat[i, 1])
    else:
        neg_x.append(datMat[i, 0])
        neg_y.append(datMat[i, 1])
ax.scatter(pos_x, pos_y, marker='s', s=90)
ax.scatter(neg_x, neg_y, marker='o', s=50, c='red')

# Decision boundary w0*x + w1*y = 0 from the 50-iteration run above,
# plus a hard-coded boundary recorded after 2 iterations for comparison.
x = arange(-6.0, 8.0, 0.1)
y = (-finalWs[0, 0]*x - 0)/finalWs[0, 1]
y2 = (0.498442*x)/0.092387  # 2 iterations
ax.plot(x, y)
ax.plot(x, y2, 'g-.')
ax.axis([-6, 8, -4, 5])
ax.legend(('50 Iterations', '2 Iterations'))
plt.show()
0 commit comments