全局建模的不足: 线性回归算法,在全局上拟合所有的样本,使得整体上误差最小。 但是当数据有众多的特征并且特征之间的关系十分复杂时,建立全局模型的想法就显得十分的困难,并且效果也不会太好。 在实际生活中,很多问题都是非线性的,不可能使用全局线性模型来拟合任意的数据。
局部性建模: 对于一些复杂的非线性的数据,我们可以将数据切分成很多份易于建模的数据,然后在各个子段分别使用模型进行建模。
基于决策树的想法: 决策树是按照信息增益的大小顺序将属性进行排序,并按各个属性的所有可能值将数据进行划分。这使得一旦按照某种特征进行划分后,该特征在之后的算法执行过程中不再起作用,切分方式过于迅速,并且划分得过于细致。
因此,我们可以使用二元切分法,即每次把数据集(按照某一属性的某一属性值)切成两份,大于该属性值的数据进入左子树,其他进入右子树。类似于形成了一个决策二叉树。 当数据满足某些条件后,就不再进行划分,形成叶子节点,叶子节点代表当前的决策结果。
两类不同的叶子节点: 根据叶子节点信息的不同,可以将算法分为两类: 1:回归树: 叶子节点为当前子数据集的目标变量的均值(单个数) 2:模型树 :叶子节点为由当前子数据集,训练得到的一条预测直线(预测直线的参数向量w)
代码:
"""Regression trees and model trees built by recursive binary splitting.

A tree is a nested dict with the keys ``'spInd'`` (index of the feature
used for the split), ``'spVal'`` (threshold value), ``'left'`` (subtree or
leaf for rows whose feature value is strictly greater than the threshold)
and ``'right'`` (subtree or leaf for all other rows).

Two kinds of leaves are supported:

* regression tree -- the leaf is the mean of the target column;
* model tree      -- the leaf is the weight vector of a linear fit.

The last column of every data set is treated as the target variable.
"""
# The star import is kept so that any code relying on the star-imported
# NumPy names keeps working; the functions below use the explicit ``np``.
from numpy import *  # noqa: F401,F403
import numpy as np


def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats.

    Args:
        fileName: path of the file to read.

    Returns:
        A list of rows, each row being a list of floats.  Blank lines are
        skipped.

    Raises:
        ValueError: if a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager guarantees the handle is closed, even on error.
    with open(fileName) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                continue
            # Build a real list: ``map`` is a lazy iterator on Python 3.
            dataMat.append([float(field) for field in line.split('\t')])
    return dataMat


def binSplitDataSet(dataSet, feature, value):
    """Split a data set in two on one feature and one threshold.

    Args:
        dataSet: 2-D NumPy matrix/array, one sample per row.
        feature: column index to test.
        value: threshold to compare against.

    Returns:
        ``(mat0, mat1)`` where ``mat0`` holds every row with
        ``row[feature] > value`` and ``mat1`` every row with
        ``row[feature] <= value``.  Either side may have zero rows.
    """
    # Flatten the column so the masks are 1-D for both matrix and ndarray.
    column = np.asarray(dataSet[:, feature]).ravel()
    # All matching rows are kept (no trailing ``[0]`` that would keep only
    # the first row or fail on an empty selection).
    mat0 = dataSet[column > value, :]
    mat1 = dataSet[column <= value, :]
    return mat0, mat1


def regLeaf(dataSet):
    """Regression-tree leaf: the mean of the target (last) column."""
    return np.mean(dataSet[:, -1])


def regErr(dataSet):
    """Regression-tree error: total squared deviation of the targets.

    Computed as the variance of the last column times the number of rows.
    """
    return np.var(dataSet[:, -1]) * np.shape(dataSet)[0]


def linearSolve(dataSet):
    """Fit ordinary least squares on a data set (helper for model trees).

    Args:
        dataSet: 2-D data, features in all columns but the last, target in
            the last column.

    Returns:
        ``(ws, X, Y)``: the weight column vector, the design matrix (a
        column of ones followed by the features) and the target column.

    Raises:
        NameError: if ``X.T * X`` is singular (determinant exactly 0.0).
    """
    data = np.asmatrix(dataSet)
    m, n = np.shape(data)
    # Column 0 stays all ones and acts as the intercept term.
    X = np.asmatrix(np.ones((m, n)))
    X[:, 1:n] = data[:, 0:n - 1]
    Y = data[:, -1]
    xTx = X.T * X
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n'
                        'try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y


def modelLeaf(dataSet):
    """Model-tree leaf: the weight vector of a linear fit of the data."""
    ws, _, _ = linearSolve(dataSet)
    return ws


def modelErr(dataSet):
    """Model-tree error: sum of squared residuals of the linear fit."""
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    return np.sum(np.power(Y - yHat, 2))


def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Find the feature/threshold pair that minimises the total error.

    Args:
        dataSet: 2-D NumPy matrix/array, target in the last column.
        leafType: callable building a leaf from a data set.
        errType: callable returning the error of a data set.
        ops: ``(tolS, tolN)`` -- minimum error reduction required to split
            and minimum number of rows allowed on each side of a split.

    Returns:
        ``(featureIndex, splitValue)`` for the best split, or
        ``(None, leaf)`` when a stop condition is met: all targets are
        identical, no split leaves at least ``tolN`` rows on both sides,
        or the best split reduces the error by less than ``tolS``.
    """
    tolS, tolN = ops[0], ops[1]

    # Stop condition 1: every target has the same value.
    targets = np.asarray(dataSet[:, -1]).ravel().tolist()
    if len(set(targets)) == 1:
        return None, leafType(dataSet)

    n = np.shape(dataSet)[1]
    S = errType(dataSet)
    bestS, bestIndex, bestValue = np.inf, 0, 0
    for featIndex in range(n - 1):
        # Plain Python floats in sorted order: hashable, and ties between
        # equally good splits are resolved deterministically.
        column = np.asarray(dataSet[:, featIndex]).ravel()
        for splitVal in np.unique(column).tolist():
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if np.shape(mat0)[0] < tolN or np.shape(mat1)[0] < tolN:
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex, bestValue, bestS = featIndex, splitVal, newS

    # Stop condition 2: the improvement is too small.  This also covers the
    # case where no admissible split was found, because bestS is still inf;
    # any split recorded above already satisfied the tolN size check.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    return bestIndex, bestValue


def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Recursively build a binary regression/model tree.

    Args:
        dataSet: 2-D data (anything ``numpy.asmatrix`` accepts), target in
            the last column.
        leafType: callable building a leaf from a data set.
        errType: callable returning the error of a data set.
        ops: ``(tolS, tolN)`` forwarded to ``chooseBestSplit``.

    Returns:
        A nested dict with keys ``'spInd'``, ``'spVal'``, ``'left'`` and
        ``'right'``, or a bare leaf value when no split is made.
    """
    # No copy is made when the input already is a matrix.
    dataSet = np.asmatrix(dataSet)
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None:
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }


def isTree(obj):
    """Return True when ``obj`` is an internal node (a dict), not a leaf."""
    return isinstance(obj, dict)


def getMean(tree):
    """Collapse a tree to the recursive average of its two branches.

    The tree passed in is not modified.
    """
    left, right = tree['left'], tree['right']
    if isTree(right):
        right = getMean(right)
    if isTree(left):
        left = getMean(left)
    return (left + right) / 2.0


def prune(tree, testData):
    """Post-prune a regression tree against held-out data.

    Two sibling leaves are merged into their average whenever doing so
    lowers the squared error on ``testData``.  The arithmetic assumes
    numeric leaves, i.e. a regression tree rather than a model tree.

    Args:
        tree: tree produced by ``createTree`` (modified in place).
        testData: 2-D NumPy matrix/array, target in the last column.

    Returns:
        The pruned tree, or a single leaf value if everything was merged.
    """
    if not isTree(tree):
        # A bare leaf has nothing to prune.
        return tree
    if np.shape(testData)[0] == 0:
        # No test rows reach this node: collapse it.
        return getMean(tree)

    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)

    # Merging is only considered once both children are leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    errorNoMerge = (np.sum(np.power(lSet[:, -1] - tree['left'], 2)) +
                    np.sum(np.power(rSet[:, -1] - tree['right'], 2)))
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
    if errorMerge < errorNoMerge:
        print("merging")
        return treeMean
    return tree


def regTreeEval(model, inDat):
    """Prediction of a regression-tree leaf: the leaf value itself."""
    return float(model)


def modelTreeEval(model, inDat):
    """Prediction of a model-tree leaf: apply the linear model to ``inDat``.

    Args:
        model: weight column vector as returned by ``modelLeaf``.
        inDat: one sample as a 1 x n row of feature values (no target).

    Returns:
        The predicted value as a float.
    """
    inDat = np.asmatrix(inDat)
    n = np.shape(inDat)[1]
    # Prepend the constant 1 that multiplies the intercept weight.
    X = np.asmatrix(np.ones((1, n + 1)))
    X[:, 1:n + 1] = inDat
    # ``.item()`` extracts the single element explicitly instead of relying
    # on implicit conversion of a 1x1 matrix to a scalar.
    return float((X * model).item())


def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Predict the target of one sample by walking down the tree.

    Args:
        tree: tree (or bare leaf) produced by ``createTree``.
        inData: feature values of one sample (row matrix or sequence).
        modelEval: callable ``(leaf, inData) -> float`` used at the leaf.

    Returns:
        The prediction returned by ``modelEval`` for the reached leaf.
    """
    # Flatten once so that ``features[k]`` is the k-th feature value even
    # when the sample is a 1 x k matrix (where ``inData[k]`` selects a row).
    features = np.asarray(inData).ravel()
    node = tree
    while isTree(node):
        goes_left = features[node['spInd']] > node['spVal']
        node = node['left'] if goes_left else node['right']
    return modelEval(node, inData)


def createForeCast(tree, testData, modelEval=regTreeEval):
    """Predict the target of every sample in ``testData``.

    Args:
        tree: tree produced by ``createTree``.
        testData: samples, one per row (features only).
        modelEval: leaf evaluator forwarded to ``treeForeCast``.

    Returns:
        An m x 1 matrix of predictions, m being ``len(testData)``.
    """
    m = len(testData)
    yHat = np.asmatrix(np.zeros((m, 1)))
    for i, sample in enumerate(testData):
        yHat[i, 0] = treeForeCast(tree, np.asmatrix(sample), modelEval)
    return yHat