Skip to content

Commit 73fa4db

Browse files
修改 树回归的原理和后剪枝的描述
1 parent 627ec87 commit 73fa4db

2 files changed

Lines changed: 33 additions & 35 deletions

File tree

docs/9.树回归.md

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ CART 和 C4.5 之间主要差异在于分类结果上,CART 可以回归分析
5656
```
5757
对每个特征:
5858
对每个特征值:
59-
将数据集切分成两份
59+
将数据集切分成两份(小于该特征值的数据样本放在左子树,否则放在右子树)
6060
计算切分的误差
6161
如果当前误差小于当前最小误差,那么将当前切分设定为最佳切分并更新最小误差
6262
返回最佳切分的特征和阈值
@@ -272,8 +272,6 @@ def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
272272

273273
决策树构造完成后进行剪枝。剪枝的过程是对拥有同样父节点的一组节点进行检查,判断如果将其合并,误差的增加量是否小于某一阈值(回归树以平方误差衡量,而非分类树中的熵)。如果确实小,则这一组节点可以合并为一个节点,其中包含了所有可能的结果。合并也被称作 `塌陷处理` ,在回归树中一般采用取需要合并的所有子树的平均值。后剪枝是目前最普遍的做法。
274274

275-
后剪枝的剪枝过程是删除一些子树,然后用其叶子节点代替,这个叶子节点所标识的类别通过大多数原则(majority class criterion)确定。所谓大多数原则,是指剪枝过程中,将一些子树删除而用叶节点代替,这个叶节点所标识的类别用这棵子树中大多数训练样本所属的类别来标识,所标识的类称为 majority class(该术语在很多英文文献中也多次出现)。
276-
277275
后剪枝 prune() 的伪代码如下:
278276

279277
```

src/python/9.RegTrees/regTrees.py

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -395,12 +395,12 @@ def createForeCast(tree, testData, modelEval=regTreeEval):
395395

396396

397397
if __name__ == "__main__":
398-
# 测试数据集
399-
testMat = mat(eye(4))
400-
print testMat
401-
print type(testMat)
402-
mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
403-
print mat0, '\n-----------\n', mat1
398+
# # 测试数据集
399+
# testMat = mat(eye(4))
400+
# print testMat
401+
# print type(testMat)
402+
# mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
403+
# print mat0, '\n-----------\n', mat1
404404

405405
# # 回归树
406406
# myDat = loadDataSet('input/9.RegTrees/data1.txt')
@@ -431,29 +431,29 @@ def createForeCast(tree, testData, modelEval=regTreeEval):
431431
# myTree = createTree(myMat, modelLeaf, modelErr)
432432
# print myTree
433433

434-
# # # 回归树 VS 模型树 VS 线性回归
435-
# trainMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_train.txt'))
436-
# testMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_test.txt'))
437-
# # # 回归树
438-
# myTree1 = createTree(trainMat, ops=(1, 20))
439-
# print myTree1
440-
# yHat1 = createForeCast(myTree1, testMat[:, 0])
441-
# print "--------------\n"
442-
# # print yHat1
443-
# # print "ssss==>", testMat[:, 1]
444-
# print "回归树:", corrcoef(yHat1, testMat[:, 1],rowvar=0)[0, 1]
445-
446-
# # 模型树
447-
# myTree2 = createTree(trainMat, modelLeaf, modelErr, ops=(1, 20))
448-
# yHat2 = createForeCast(myTree2, testMat[:, 0], modelTreeEval)
449-
# print myTree2
450-
# print "模型树:", corrcoef(yHat2, testMat[:, 1],rowvar=0)[0, 1]
451-
452-
# # 线性回归
453-
# ws, X, Y = linearSolve(trainMat)
454-
# print ws
455-
# m = len(testMat[:, 0])
456-
# yHat3 = mat(zeros((m, 1)))
457-
# for i in range(shape(testMat)[0]):
458-
# yHat3[i] = testMat[i, 0]*ws[1, 0] + ws[0, 0]
459-
# print "线性回归:", corrcoef(yHat3, testMat[:, 1],rowvar=0)[0, 1]
434+
# # 回归树 VS 模型树 VS 线性回归
435+
trainMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_train.txt'))
436+
testMat = mat(loadDataSet('input/9.RegTrees/bikeSpeedVsIq_test.txt'))
437+
# # 回归树
438+
myTree1 = createTree(trainMat, ops=(1, 20))
439+
print myTree1
440+
yHat1 = createForeCast(myTree1, testMat[:, 0])
441+
print "--------------\n"
442+
# print yHat1
443+
# print "ssss==>", testMat[:, 1]
444+
print "回归树:", corrcoef(yHat1, testMat[:, 1],rowvar=0)[0, 1]
445+
446+
# 模型树
447+
myTree2 = createTree(trainMat, modelLeaf, modelErr, ops=(1, 20))
448+
yHat2 = createForeCast(myTree2, testMat[:, 0], modelTreeEval)
449+
print myTree2
450+
print "模型树:", corrcoef(yHat2, testMat[:, 1],rowvar=0)[0, 1]
451+
452+
# 线性回归
453+
ws, X, Y = linearSolve(trainMat)
454+
print ws
455+
m = len(testMat[:, 0])
456+
yHat3 = mat(zeros((m, 1)))
457+
for i in range(shape(testMat)[0]):
458+
yHat3[i] = testMat[i, 0]*ws[1, 0] + ws[0, 0]
459+
print "线性回归:", corrcoef(yHat3, testMat[:, 1],rowvar=0)[0, 1]

0 commit comments

Comments
 (0)