1212import matplotlib .pyplot as plt
1313
1414
15+ # ---------------------------------------------------------------------------
16+ # 使用 Logistic 回归在简单数据集上的分类
17+
1518# 解析数据
1619def loadDataSet (file_name ):
1720 # dataMat为原始数据, labelMat为原始数据的标签
@@ -149,7 +152,7 @@ def plotBestFit(dataArr, labelMat, weights):
149152 plt .show ()
150153
151154
152- def main ():
155+ def simpleTest ():
153156 # 1.收集并准备数据
154157 dataMat , labelMat = loadDataSet ("input/5.Logistic/TestSet.txt" )
155158
@@ -167,5 +170,75 @@ def main():
167170 plotBestFit (dataArr , labelMat , weights )
168171
169172
173+ #--------------------------------------------------------------------------------
174+ # 从疝气病症预测病马的死亡率
175+
176+ # 分类函数,根据回归系数和特征向量来计算 Sigmoid的值
def classifyVector(inX, weights):
    """Classify one feature vector with a trained logistic-regression model.

    The product ``inX * weights`` is summed, passed through ``sigmoid``
    (expected to be defined elsewhere in this module), and the result is
    thresholded at 0.5.

    Args:
        inX: feature vector of the sample to classify.
        weights: regression coefficients obtained from (stochastic)
            gradient ascent.
            NOTE(review): presumably both are numpy arrays of equal length
            so that ``*`` is element-wise -- confirm against callers.

    Returns:
        1.0 if the sigmoid value is strictly greater than 0.5, else 0.0.
    """
    activation = sum(inX * weights)
    return 1.0 if sigmoid(activation) > 0.5 else 0.0
191+
192+ # 打开测试集和训练集,并对数据进行格式化处理
def _parseColicLine(line, numFeatures=21):
    """Split one tab-separated record into (list of float features, raw label string)."""
    # float() ignores surrounding whitespace, so splitting on a bare tab also
    # copes with records whose fields are separated by a tab plus spaces.
    fields = line.strip().split('\t')
    features = [float(value) for value in fields[:numFeatures]]
    # Indexing (not slicing) so a record with too few columns still raises IndexError.
    return features, fields[numFeatures]


def colicTest():
    """Train on the horse-colic training file and report the test error rate.

    Reads 'input/5.Logistic/horseColicTraining.txt', fits weights with
    ``stocGradAscent1`` (500 iterations), then classifies every record of
    'input/5.Logistic/horseColicTest.txt' with ``classifyVector`` and prints
    the fraction of misclassified records.

    Each record is expected to hold 21 feature columns followed by the class
    label in column 22. Blank lines are ignored.

    Args:
        None

    Returns:
        errorRate -- misclassified test records / total test records (float).

    Raises:
        IOError/OSError: if either data file cannot be opened.
        IndexError: if a record has fewer than 22 columns.
        ValueError: if a field is not numeric.
        ZeroDivisionError: if the test file contains no records.
    """
    trainingSet = []      # feature vectors of the training samples
    trainingLabels = []   # class label of each training sample
    errorCount = 0
    numTestVec = 0.0

    # Open both files up front (fail fast if one is missing) and make sure
    # they are closed again, even when parsing or training raises.
    with open('input/5.Logistic/horseColicTraining.txt') as frTrain, \
            open('input/5.Logistic/horseColicTest.txt') as frTest:
        for line in frTrain:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            features, label = _parseColicLine(line)
            trainingSet.append(features)
            trainingLabels.append(float(label))

        # Best regression coefficients via the improved stochastic gradient ascent.
        trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)

        for line in frTest:
            if not line.strip():
                continue
            numTestVec += 1.0
            features, label = _parseColicLine(line)
            predicted = classifyVector(array(features), trainWeights)
            # Go through float() first so labels written as "1" and as
            # "1.000000" are both accepted.
            if int(predicted) != int(float(label)):
                errorCount += 1

    errorRate = float(errorCount) / numTestVec
    # Single parenthesised argument: prints identically on Python 2 and 3.
    print("the error rate of this test is: %f" % errorRate)
    return errorRate
231+
232+
233+ # 调用 colicTest() 10次并求结果的平均值
def multiTest():
    """Run colicTest() 10 times and print the average of the returned error rates."""
    runs = 10
    total = 0.0
    for _ in range(runs):
        total = total + colicTest()
    average = total / float(runs)
    print("after %d iterations the average error rate is: %f" % (runs, average))
240+
241+
if __name__ == "__main__":
    # Entry point: run the simple-dataset demo. The former main() was renamed
    # to simpleTest(), so only the new name is called here.
    simpleTest()
    # Uncomment to run the horse-colic experiment (10 train/test rounds) instead.
    # multiTest()
0 commit comments