Skip to content

Commit 05f383e

Browse files
committed
添加logistic回归源代码中的预测病马死亡率的代码,修改相应的md文件
1 parent 1322ca9 commit 05f383e

2 files changed

Lines changed: 105 additions & 8 deletions

File tree

docs/5.Logistic回归.md

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ def stocGradAscent1(dataMatrix, classLabels, numIter=150):
321321

322322
[完整代码地址](https://github.com/apachecn/MachineLearning/blob/master/src/python/5.Logistic/logistic.py): <https://github.com/apachecn/MachineLearning/blob/master/src/python/5.Logistic/logistic.py>
323323

324-
### 项目案例3: 从疝气病症预测病马的死亡率
324+
### 项目案例2: 从疝气病症预测病马的死亡率
325325

326326
#### 项目概述
327327

@@ -473,40 +473,64 @@ def stocGradAscent1(dataMatrix, classLabels, numIter=150):
473473
Logistic 回归分类函数
474474

475475
```python
476+
# 分类函数,根据回归系数和特征向量来计算 Sigmoid的值
476477
def classifyVector(inX, weights):
478+
'''
479+
Desc:
480+
最终的分类函数,根据回归系数和特征向量来计算 Sigmoid 的值,大于0.5函数返回1,否则返回0
481+
Args:
482+
inX -- 特征向量,features
483+
weights -- 根据梯度下降/随机梯度下降 计算得到的回归系数
484+
Returns:
485+
如果 prob 计算大于 0.5 函数返回 1
486+
否则返回 0
487+
'''
477488
prob = sigmoid(sum(inX*weights))
478489
if prob > 0.5: return 1.0
479490
else: return 0.0
480491

481-
492+
# 打开测试集和训练集,并对数据进行格式化处理
482493
def colicTest():
483-
frTrain = open('horseColicTraining.txt')
484-
frTest = open('horseColicTest.txt')
494+
'''
495+
Desc:
496+
打开测试集和训练集,并对数据进行格式化处理
497+
Args:
498+
None
499+
Returns:
500+
errorRate -- 分类错误率
501+
'''
502+
frTrain = open('input/5.Logistic/horseColicTraining.txt')
503+
frTest = open('input/5.Logistic/horseColicTest.txt')
485504
trainingSet = []
486505
trainingLabels = []
506+
# 解析训练数据集中的数据特征和Labels
507+
# trainingSet 中存储训练数据集的特征,trainingLabels 存储训练数据集的样本对应的分类标签
487508
for line in frTrain.readlines():
488509
currLine = line.strip().split('\t')
489510
lineArr = []
490511
for i in range(21):
491512
lineArr.append(float(currLine[i]))
492513
trainingSet.append(lineArr)
493-
trainLabels.append(float(currLine[21]))
514+
trainingLabels.append(float(currLine[21]))
515+
# 使用 改进后的 随机梯度下降算法 求得在此数据集上的最佳回归系数 trainWeights
494516
trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
495517
errorCount = 0
496518
numTestVec = 0.0
519+
# 读取 测试数据集 进行测试,计算分类错误的样本条数和最终的错误率
497520
for line in frTest.readlines():
498521
numTestVec += 1.0
499522
currLine = line.strip().split('\t')
500523
lineArr = []
501524
for i in range(21):
502525
lineArr.append(float(currLine[i]))
503526
if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
504-
errorCount += 1
527+
errorCount += 1
505528
errorRate = (float(errorCount) / numTestVec)
506529
print "the error rate of this test is: %f" % errorRate
507530
return errorRate
508531

509532

533+
# 调用 colicTest() 10次并求结果的平均值
510534
def multiTest():
511535
numTests = 10
512536
errorSum = 0.0

src/python/5.Logistic/logistic.py

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
import matplotlib.pyplot as plt
1313

1414

15+
# ---------------------------------------------------------------------------
16+
# 使用 Logistic 回归在简单数据集上的分类
17+
1518
# 解析数据
1619
def loadDataSet(file_name):
1720
# dataMat为原始数据, labelMat为原始数据的标签
@@ -149,7 +152,7 @@ def plotBestFit(dataArr, labelMat, weights):
149152
plt.show()
150153

151154

152-
def main():
155+
def simpleTest():
153156
# 1.收集并准备数据
154157
dataMat, labelMat = loadDataSet("input/5.Logistic/TestSet.txt")
155158

@@ -167,5 +170,75 @@ def main():
167170
plotBestFit(dataArr, labelMat, weights)
168171

169172

173+
#--------------------------------------------------------------------------------
174+
# 从疝气病症预测病马的死亡率
175+
176+
# Final classifier: map a feature vector to class 1.0 or 0.0 via the Sigmoid.
def classifyVector(inX, weights):
    '''
    Desc:
        Final classification function. Computes the Sigmoid of the weighted
        sum of the features; a probability above 0.5 means class 1, else 0.
    Args:
        inX -- feature vector (the sample's features)
        weights -- regression coefficients from (stochastic) gradient ascent
    Returns:
        1.0 when the computed probability is greater than 0.5,
        0.0 otherwise
    '''
    return 1.0 if sigmoid(sum(inX * weights)) > 0.5 else 0.0
191+
192+
# Open the training and test sets, train, then evaluate on the test set.
def colicTest():
    '''
    Desc:
        Opens the horse-colic training and test sets, trains regression
        coefficients with the improved stochastic gradient ascent, then
        classifies every test sample and reports the classification error rate.
    Args:
        None
    Returns:
        errorRate -- fraction of misclassified test samples
    '''
    def _parseLine(line):
        # Split one tab-separated record into 21 float features and its label.
        currLine = line.strip().split('\t')
        return [float(currLine[i]) for i in range(21)], float(currLine[21])

    trainingSet = []
    trainingLabels = []
    # trainingSet holds the feature rows of the training data,
    # trainingLabels the matching class label of each sample.
    # `with` guarantees the file handle is closed even if parsing raises
    # (the original left both files open).
    with open('input/5.Logistic/horseColicTraining.txt') as frTrain:
        for line in frTrain.readlines():
            features, label = _parseLine(line)
            trainingSet.append(features)
            trainingLabels.append(label)
    # Fit the best regression coefficients trainWeights on this data set
    # using the improved stochastic gradient ascent, 500 iterations.
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
    errorCount = 0
    numTestVec = 0.0
    # Walk the test set, counting misclassified samples to get the error rate.
    with open('input/5.Logistic/horseColicTest.txt') as frTest:
        for line in frTest.readlines():
            numTestVec += 1.0
            features, label = _parseLine(line)
            if int(classifyVector(array(features), trainWeights)) != int(label):
                errorCount += 1
    errorRate = float(errorCount) / numTestVec
    # Parenthesized print is valid under both Python 2 and Python 3
    # (the original bare `print "..."` is a syntax error on Python 3).
    print("the error rate of this test is: %f" % errorRate)
    return errorRate
231+
232+
233+
# Run colicTest() 10 times and report the average error rate.
def multiTest():
    '''
    Desc:
        Calls colicTest() 10 times and averages the resulting error rates,
        smoothing out the randomness of stochastic gradient ascent.
    Args:
        None
    Returns:
        None -- the average error rate is printed to stdout
    '''
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    # Parenthesized print is valid under both Python 2 and Python 3
    # (the original bare `print "..."` is a syntax error on Python 3).
    print("after %d iterations the average error rate is: %f"
          % (numTests, errorSum / float(numTests)))
240+
241+
170242
if __name__ == "__main__":
    # Demo entry point: run the simple-dataset Logistic regression example.
    simpleTest()
    # Uncomment to run the horse-colic mortality estimation instead.
    # multiTest()

0 commit comments

Comments
 (0)