使用决策树预测隐形眼镜类型
说明:
将数据集文件 'lenses.txt' 放在当前文件夹
import operator
from math import log
熵的定义
"""
这部分是在用代码计算香农熵公式,即用代码写公式并计算结果
"""
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of a data set.

    The class label is taken from the last element of each record.
    Entropy = -sum(p * log2(p)) over the label distribution.
    """
    total = len(dataSet)
    # Tally how many times each class label occurs.
    counts = {}
    for record in dataSet:
        label = record[-1]
        counts[label] = counts.get(label, 0) + 1
    entropy = 0.0
    for cnt in counts.values():
        p = cnt / float(total)
        entropy -= p * log(p, 2)
    return entropy
划分数据集: 按照给定特征划分数据集
def splitDataSet(dataSet, axis, value):
    """Partition helper: keep records whose feature at `axis` equals
    `value`, returning them with that feature column removed."""
    return [record[:axis] + record[axis + 1:]
            for record in dataSet
            if record[axis] == value]
选择最好的数据集划分方式
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    Returns -1 when no feature yields a strictly positive gain.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain, bestFeature = 0.0, -1
    for feat in range(numFeatures):
        # Weighted entropy of the partitions induced by this feature.
        distinctValues = {record[feat] for record in dataSet}
        newEntropy = 0.0
        for value in distinctValues:
            subset = splitDataSet(dataSet, feat, value)
            weight = len(subset) / float(len(dataSet))
            newEntropy += weight * calcShannonEnt(subset)
        gain = baseEntropy - newEntropy
        if gain > bestInfoGain:
            bestInfoGain, bestFeature = gain, feat
    return bestFeature
多数表决法决定该叶子节点的分类
"""
代码与第2章classify0部分的投票表决代码非常类似
"""
def majorityCnt(classList):
    """Majority vote: return the most frequent label in classList.

    Ties are resolved in favor of the label that appears first,
    matching the stable reverse sort of the original implementation.
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max() returns the first key (insertion order) with the top count.
    return max(tally, key=tally.get)
递归构建决策树
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    Each internal node is {featureLabel: {featureValue: subtree_or_leaf}}.
    NOTE: `labels` is mutated in place (the chosen feature's label is
    deleted) — callers relying on this behavior are unaffected.
    """
    classList = [record[-1] for record in dataSet]
    # Base case 1: every record carries the same class -> leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Base case 2: only the label column remains -> majority-vote leaf.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    # Recurse on each partition of the best feature; each child gets its
    # own copy of the remaining labels.
    for value in {record[bestFeat] for record in dataSet}:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
使用决策树执行分类
def classify(inputTree, featLabels, testVec):
    """Classify a feature vector by walking the decision tree.

    Args:
        inputTree: nested-dict tree produced by createTree().
        featLabels: feature-name list used to map tree nodes to indices
            in testVec.
        testVec: feature values (no class label) for one record.

    Returns:
        The predicted class label, or None when testVec's value for a
        node's feature was never seen during training. (The original
        code left classLabel unassigned in that case and raised
        UnboundLocalError.)
    """
    firstStr = list(inputTree)[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None  # fix: defined even when no branch matches
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                # Internal node: descend into the matching subtree.
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                # Leaf node: the stored value is the class label.
                classLabel = secondDict[key]
    return classLabel
使用决策树预测隐形眼镜类型
# Load the lenses data set (tab-separated, class label in the last
# column), train a decision tree on it, and print the tree.
# Fix: use a context manager so the file handle is always closed
# (the original open() was never closed).
with open('lenses.txt') as fr:
    lenses = [line.strip().split('\t') for line in fr]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lensesTree = createTree(lenses, lensesLabels)
print(lensesTree)
{'tearRate': {'reduced': 'no lenses', 'normal': {'astigmatic': {'yes': {'prescript': {'myope': 'hard', 'hyper': {'age': {'pre': 'no lenses', 'presbyopic': 'no lenses', 'young': 'hard'}}}}, 'no': {'age': {'pre': 'soft', 'presbyopic': {'prescript': {'myope': 'no lenses', 'hyper': 'soft'}}, 'young': 'soft'}}}}}}
使用决策树模型进行预测
# Re-create the label list (createTree mutated the previous one) and
# classify the first training record with the fitted tree.
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
classify(lensesTree, lensesLabels, lenses[0][:-1])
'no lenses'
对 lenses 数据集所有数据进行决策树分类预测
# Predict a lens type for every record in the training set.
# Fix: replace the `for i in range(len(...))` + append loop with a
# list comprehension over the records themselves.
preds = [classify(lensesTree, lensesLabels, record[:-1]) for record in lenses]
print(preds)
['no lenses', 'soft', 'no lenses', 'hard', 'no lenses', 'soft', 'no lenses', 'hard', 'no lenses', 'soft', 'no lenses', 'hard', 'no lenses', 'soft', 'no lenses', 'no lenses', 'no lenses', 'no lenses', 'no lenses', 'hard', 'no lenses', 'soft', 'no lenses', 'no lenses']