Skip to content

Commit 36127ba

Browse files
更新 15章 代码新格式
1 parent fc272b1 commit 36127ba

2 files changed

Lines changed: 30 additions & 25 deletions

File tree

src/python/15.BigData_MapReduce/mrSVM.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
# coding:utf8
33
'''
44
Created on 2017-04-07
5+
Update on 2017-06-20
56
MapReduce version of Pegasos SVM
67
Using mrjob to automate job flow
7-
@author: Peter/ApacheCN-xy
8+
@author: Peter/ApacheCN-xy/片刻
9+
《机器学习实战》更新地址:https://github.com/apachecn/MachineLearning
810
'''
911
from mrjob.job import MRJob
1012

@@ -17,14 +19,14 @@ class MRsvm(MRJob):
1719

1820
def __init__(self, *args, **kwargs):
1921
super(MRsvm, self).__init__(*args, **kwargs)
20-
self.data = pickle.load(open('input/15.BigData_MapReduce/svmDat27'))
22+
self.data = pickle.load(open('/opt/git/MachineLearning/input/15.BigData_MapReduce/svmDat27'))
2123
self.w = 0
2224
self.eta = 0.69
2325
self.dataList = []
2426
self.k = self.options.batchsize
2527
self.numMappers = 1
2628
self.t = 1 # iteration number
27-
29+
2830
def configure_options(self):
2931
super(MRsvm, self).configure_options()
3032
self.add_passthrough_option(
@@ -42,20 +44,20 @@ def map(self, mapperId, inVals): # 需要 2 个参数
4244
self.w = inVals[1]
4345
elif inVals[0] == 'x':
4446
self.dataList.append(inVals[1]) # 累积数据点计算
45-
elif inVals[0] == 't':
47+
elif inVals[0] == 't': # 迭代次数
4648
self.t = inVals[1]
4749
else:
48-
self.eta = inVals # 这用于 debug, eta未在map中使用
50+
self.eta = inVals # 这用于 debug, eta未在map中使用
4951

5052
def map_fin(self):
51-
labels = self.data[:,-1]
52-
X = self.data[:, 0:-1] # 将数据重新形成 X 和 Y
53-
if self.w == 0:
53+
labels = self.data[:, -1]
54+
X = self.data[:, :-1] # 将数据重新形成 X 和 Y
55+
if self.w == 0:
5456
self.w = [0.001] * shape(X)[1] # 在第一次迭代时,初始化 w
5557
for index in self.dataList:
56-
p = mat(self.w)*X[index, :].T # calc p=w*dataSet[key].T
58+
p = mat(self.w)*X[index, :].T # calc p=w*dataSet[key].T
5759
if labels[index]*p < 1.0:
58-
yield (1, ['u', index]) # 确保一切数据包含相同的key
60+
yield (1, ['u', index]) # 确保一切数据包含相同的key
5961
yield (1, ['w', self.w]) # 它们将在同一个 reducer
6062
yield (1, ['t', self.t])
6163

@@ -66,7 +68,7 @@ def reduce(self, _, packedVals):
6668
elif valArr[0] == 'w':
6769
self.w = valArr[1]
6870
elif valArr[0] == 't':
69-
self.t = valArr[1]
71+
self.t = valArr[1]
7072

7173
labels = self.data[:, -1]
7274
X = self.data[:, 0:-1]

src/python/15.BigData_MapReduce/proximalSVM.py

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,39 @@
1+
#!/usr/bin/python
2+
# coding:utf8
13
'''
2-
Created on Feb 25, 2011
3-
4-
@author: Peter
4+
Created on 2011-02-25
5+
Update on 2017-06-20
6+
@author: Peter/ApacheCN-xy/片刻
7+
《机器学习实战》更新地址:https://github.com/apachecn/MachineLearning
58
'''
69
import numpy
710

811
def map(key, value):
912
# input key= class for one training example, e.g. "-1.0"
1013
classes = [float(item) for item in key.split(",")] # e.g. [-1.0]
1114
D = numpy.diag(classes)
12-
15+
1316
# input value = feature vector for one training example, e.g. "3.0, 7.0, 2.0"
1417
featurematrix = [float(item) for item in value.split(",")]
1518
A = numpy.matrix(featurematrix)
16-
19+
1720
# create matrix E and vector e
18-
e = numpy.matrix(numpy.ones(len(A)).reshape(len(A),1))
19-
E = numpy.matrix(numpy.append(A,-e,axis=1))
20-
21+
e = numpy.matrix(numpy.ones(len(A)).reshape(len(A), 1))
22+
E = numpy.matrix(numpy.append(A, -e, axis=1))
23+
2124
# create a tuple with the values to be used by reducer
2225
# and encode it with base64 to avoid potential trouble with '\t' and '\n' used
2326
# as default separators in Hadoop Streaming
24-
producedvalue = base64.b64encode(pickle.dumps( (E.T*E, E.T*D*e) )
25-
27+
producedvalue = base64.b64encode(pickle.dumps( (E.T*E, E.T*D*e))
28+
2629
# note: a single constant key "producedkey" sends to only one reducer
2730
# somewhat "atypical" due to low degree of parallelism on reducer side
2831
print "producedkey\t%s" % (producedvalue)
29-
32+
3033
def reduce(key, values, mu=0.1):
3134
sumETE = None
3235
sumETDe = None
33-
36+
3437
# key isn't used, so ignoring it with _ (underscore).
3538
for _, value in values:
3639
# unpickle values
@@ -39,13 +42,13 @@ def reduce(key, values, mu=0.1):
3942
# create the I/mu with correct dimensions
4043
sumETE = numpy.matrix(numpy.eye(ETE.shape[1])/mu)
4144
sumETE += ETE
42-
45+
4346
if sumETDe == None:
4447
# create sumETDe with correct dimensions
4548
sumETDe = ETDe
4649
else:
4750
sumETDe += ETDe
48-
51+
4952
# note: omega = result[:-1] and gamma = result[-1]
5053
# but printing entire vector as output
5154
result = sumETE.I*sumETDe

0 commit comments

Comments
 (0)