Random Forest Algorithm Implemented in MATLAB
Some rambling
1. The goal is to predict, from the sample data (viewing distance distance, longest continuous near-work duration duration, total near-work time total_time, outdoor activity time outdoor, viewing angle angle, and proportion of near work under healthy lighting proportion), whether a myopia warning should be issued.
2. This is MATLAB code for the random forest algorithm written for a freshman-year school mathematical-modelling contest; I was still pretty green back then.
3. The code still has a bug or two: the decision tree is built with a fixed branching structure, which can break when there are too few samples.
4. If you are not required to use MATLAB, the sklearn library in Python is an easier way to build a random forest (a MATLAB built-in alternative is also sketched right below). Details: random forest in Python, https://blog.csdn.net/CYBERLIFERK800/article/details/90552735
5. To improve: build the tree with binary-tree recursion instead of the fixed branches (a rough sketch of this appears after the decision-tree code below), judge trees by recall instead of the current score, and add cross-validation. I will revise it when I have time.
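As a MATLAB-side counterpart to point 4: if the Statistics and Machine Learning Toolbox is available, its built-in TreeBagger class already implements bagged classification trees. A minimal sketch, assuming a numeric feature matrix X, a 0/1 label vector Y and a new sample Xnew (none of which appear in the code below, which works directly on the xlsread matrix):
model=TreeBagger(40,X,Y,'Method','classification');  %40 bagged classification trees
pred=str2double(predict(model,Xnew));                 %predicted class label(s) for the new sample(s)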
Code
Compute the Gini index of the current node
%Compute the Gini index of the current node's data
function gini_now=gini_self(data)
sample_select=size(data,1)-1;   %number of samples (the last row of data is the attribute-ID row, not a sample)
decision_select=size(data,2)-1; %number of attribute columns (the last column is the warning label)
time=0;                         %count of samples that need a warning
for i=1:sample_select
if data(i,decision_select+1)
time=time+1;
end
end
gini_now=1-(time/sample_select)^2-((sample_select-time)/sample_select)^2;
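A quick sanity check of gini_self on a hypothetical toy matrix (one attribute column, the warning-label column, and the attribute-ID row that the code expects as the last row):
toy=[30 1;120 0;45 1;1 7];   %three samples; the last row [1 7] only carries the attribute IDs
g=gini_self(toy)             %two of three samples need a warning: 1-(2/3)^2-(1/3)^2 = 0.4444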
Find the best split point and its Gini index
%Find the best split point and its Gini index: input the data and the attribute to
%split on, output the best split point and the corresponding weighted Gini index
function [boundary_best,gini_best]=gini(data_new,decision)
sample_select=size(data_new,1)-1;   %number of samples (the last row holds the attribute IDs)
decision_select=size(data_new,2)-1; %number of attribute columns
%initialisation
range=[min(data_new(1:sample_select,decision)) max(data_new(1:sample_select,decision))];%value range of the attribute
gini_best=1;            %best (lowest) weighted Gini index so far
boundary_best=range(1); %best boundary so far
%try every integer boundary in the attribute's range and count time_lt sum_lt time_ge sum_ge
for j=range(1)+1:range(2)
result_temp=[0 0];
time_lt=0; %number of samples with value < boundary
sum_lt=0;  %number of warning samples with value < boundary
time_ge=0; %number of samples with value >= boundary
sum_ge=0;  %number of warning samples with value >= boundary
boundary=j;
for i=1:sample_select
if(data_new(i,decision)<boundary)
time_lt=time_lt+1;
sum_lt=sum_lt+data_new(i,decision_select+1);
else
time_ge=time_ge+1;
sum_ge=sum_ge+data_new(i,decision_select+1);
end
end
%Gini index of each branch, then the size-weighted average
time=[time_lt time_lt time_ge time_ge];
total=[sum_lt time_lt-sum_lt sum_ge time_ge-sum_ge]; %warning / non-warning counts in each branch
rate=total./time;
result_temp(1)=1-rate(1)^2-rate(2)^2;
result_temp(2)=1-rate(3)^2-rate(4)^2;
result=time_lt/sample_select*result_temp(1)+time_ge/sample_select*result_temp(2);
if result<gini_best
gini_best=result;
boundary_best=boundary;
end
end
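Note that the loop only tries integer boundaries between the minimum and maximum of the attribute, so the attribute values are assumed to be integers. Continuing the toy matrix from the gini_self example (illustrative data only):
[b,g]=gini(toy,1)   %b=46, g=0: samples 30 and 45 (both warnings) fall below 46, sample 120 falls at or above it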
Sort data by an attribute in ascending order
%Sort the sample rows of data by the decision attribute in ascending order,
%and return the sorted data together with the split position
function [data_new,index]=BubbleSort(data,decision,boundary)
sample_select=size(data,1)-1; %the attribute-ID row at the bottom is left in place
for i=1:sample_select-1
for j=1:sample_select-i
if data(j,decision)>data(j+1,decision)
temp=data(j,:);
data(j,:)=data(j+1,:);
data(j+1,:)=temp;
end
end
end
for i=1:sample_select
if data(i,decision)>=boundary %">=" keeps the split consistent with the "< boundary / >= boundary" convention used in gini() and decide()
break
end
end
index=i-1;   %number of samples strictly below the boundary
data_new=data;
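The hand-written bubble sort works, but MATLAB's built-in sortrows produces the same ordering in one line. A sketch of a drop-in alternative (my own, with a hypothetical function name) that also keeps the attribute-ID row fixed at the bottom:
%hypothetical one-line replacement for BubbleSort using sortrows
function [data_new,index]=sort_by_attribute(data,decision,boundary)
sample_select=size(data,1)-1;
data(1:sample_select,:)=sortrows(data(1:sample_select,:),decision);  %sort only the sample rows
index=sum(data(1:sample_select,decision)<boundary);                  %samples strictly below the boundary
data_new=data;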
Generate a node
%Generate a node: input the data; output the best attribute (column index), the best split
%boundary, the two child data sets with their Gini indices, and the weighted Gini index
function [decision_global_best,boundary_global_best,data_new1,gini_now1,data_new2,gini_now2,gini_new]=generate_node(data_new)
decision_select=size(data_new,2)-1;
gini_global_best=1;
decision_global_best=1;
boundary_global_best=0;
for i=1:decision_select   %try every available attribute and keep the best split
decision=i;
[boundary_best,gini_best]=gini(data_new,decision);
if gini_best<gini_global_best
gini_global_best=gini_best;
decision_global_best=decision;
boundary_global_best=boundary_best;
end
end
%sort by the chosen attribute in ascending order and locate the split position
[data_nnew,index]=BubbleSort(data_new,decision_global_best,boundary_global_best);
%left child: samples below the boundary, plus the attribute-ID row
data_new1=data_nnew(1:index,:);
data_new1(index+1,:)=data_nnew(end,:);
gini_now1=gini_self(data_new1);
data_new1(:,decision_global_best)=[];   %drop the attribute column that has just been used
%right child: samples at or above the boundary (the attribute-ID row is already its last row)
data_new2=data_nnew(index+1:end,:);
gini_now2=gini_self(data_new2);
data_new2(:,decision_global_best)=[];   %drop the used attribute column here as well
size1=size(data_new1,1)-1;
size2=size(data_new2,1)-1;
gini_new=gini_now1*size1/(size1+size2)+gini_now2*size2/(size1+size2);
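A usage illustration on a hypothetical two-attribute matrix (again with the attribute-ID row last); decision_global_best is a column index into the input, so the original attribute ID has to be read from the ID row:
toy2=[30 200 1;120 150 0;45 300 1;60 180 0;1 2 7];   %four samples, attributes 1 and 2, warning label
[dec,bnd,left,g1,right,g2,gw]=generate_node(toy2);
attr_id=toy2(end,dec);                                %original attribute ID used for the split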
Generate a random training sample
%Generate a random training sample: draw m samples with replacement
%and n decision attributes without replacement
function data_new=select_sample_decision(data,m,n)
sample_num=size(data,1)-1;   %number of samples
decision_num=size(data,2)-1; %number of decision attributes
%draw m samples with replacement for building one tree
data_new1=ones(m+1,decision_num+1);
data_new1(m+1,:)=data(sample_num+1,:);   %keep the attribute-ID row as the last row
for i=1:m
k1=randi(sample_num);                    %uniform random index in 1..sample_num (sampling with replacement)
data_new1(i,:)=data(k1,:);
end
%randomly pick n attributes (without replacement) for this tree
k2=randperm(decision_num);
data_new=ones(m+1,n+1);
for i=1:n
data_new(:,i)=data_new1(:,k2(i));
end
data_new(:,n+1)=data_new1(:,decision_num+1);   %append the warning-label column
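For example, with data loaded as in the main script below, one bootstrap draw of 20 samples restricted to 3 randomly chosen attributes would be (illustrative call only):
data_new=select_sample_decision(data,20,3);   %21x4 matrix: 20 resampled rows plus the ID row, 3 attribute columns plus the label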
Build a decision tree
%Build a decision tree: input the original data, the number of samples to draw,
%the number of attributes to draw, and the pre-pruning sample limit
function [path,boundary,gini,result]=decision_tree(data,sample_select,decision_select,sample_limit)
score=100;                        %score of the evaluation function
while(score>(sample_select*0.05)) %keep regrowing until a good tree is found
data_new=select_sample_decision(data,sample_select,decision_select);
%Gini index of the bootstrap sample before any split
gini_now=gini_self(data_new);
%main routine
layer=1;                            %current layer of the tree
leaf_sample=zeros(1,sample_select); %number of samples in each leaf
leaf_gini=zeros(1,sample_select);   %Gini index of each leaf
leaf_num=0;                         %number of leaves
path=zeros(decision_select,2^(decision_select-1));     %attribute ID used at each internal node
gini=ones(decision_select,2^(decision_select-1));      %Gini index at each node
boundary=zeros(decision_select,2^(decision_select-1)); %split boundary at each internal node
result=ones(decision_select,2^(decision_select-1));    %class vote (0/1) stored at each leaf
path(:)=inf;
gini(:)=inf;
boundary(:)=inf;
result(:)=nan;   %nan marks "not a leaf"; decide() relies on isnan to tell leaves from internal nodes
%position=[1 0 0 0;1 2 0 0;1 2 3 4];
%layer 1
[decision_global_best,boundary_global_best,data_new1,gini_now1,data_new2,gini_now2,~]=generate_node(data_new);
path(layer,1)=data_new(size(data_new,1),decision_global_best);
boundary(layer,1)=boundary_global_best;
gini(layer,1)=gini_now;
layer=layer+1;
gini(layer,1)=gini_now1;
gini(layer,2)=gini_now2;
%layer 2
if ((size(data_new1,1)-1)>=sample_limit)&&(gini(layer,1)>0)
[decision_global_best,boundary_global_best,data_new1_1,gini_now1_1,data_new1_2,gini_now1_2,~]=generate_node(data_new1);
path(layer,1)=data_new1(size(data_new1,1),decision_global_best);
boundary(layer,1)=boundary_global_best;
layer=layer+1;
gini(layer,1)=gini_now1_1;
gini(layer,2)=gini_now1_2;
%layer 3
if (size(data_new1_1,1)-1)>=sample_limit&&(gini(layer,1)>0)
[decision_global_best,boundary_global_best,data_new1_1_1,gini_now1_1_1,data_new1_1_2,gini_now1_1_2,~]=generate_node(data_new1_1);
path(layer,1)=data_new1_1(size(data_new1_1,1),decision_global_best);
boundary(layer,1)=boundary_global_best;
layer=layer+1;
gini(layer,1)=gini_now1_1_1;
temp1=sum(data_new1_1_1);      %column sums; the last entry is (number of warning samples + 7)
temp2=temp1(1,end)-7;          %subtract 7, the ID of the warning column in the attribute-ID row
temp3=size(data_new1_1_1,1)-1; %number of samples in this leaf
temp=temp2>(temp3/2);          %majority vote: warn if more than half the samples need a warning
result(layer,1)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now1_1_1;
leaf_sample(leaf_num)=size(data_new1_1_1,1)-1;
gini(layer,2)=gini_now1_1_2;
temp1=sum(data_new1_1_2);
temp2=temp1(1,end)-7;
temp3=size(data_new1_1_2,1)-1;
temp=temp2>(temp3/2);
result(layer,2)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now1_1_2;
leaf_sample(leaf_num)=size(data_new1_1_2,1)-1;
else
temp1=sum(data_new1_1);
temp2=temp1(1,end)-7;
temp3=size(data_new1_1,1)-1;
temp=temp2>(temp3/2);
result(layer,1)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now1_1;
leaf_sample(leaf_num)=size(data_new1_1,1)-1;
path(layer,1)=nan;
boundary(layer,1)=nan;
gini(layer+1,1:2)=nan;
end
layer=3;
if (size(data_new1_2,1)-1)>=sample_limit&&(gini(layer,2)>0)
[decision_global_best,boundary_global_best,data_new1_2_1,gini_now1_2_1,data_new1_2_2,gini_now1_2_2,~]=generate_node(data_new1_2);
path(layer,2)=data_new1_2(size(data_new1_2,1),decision_global_best);
boundary(layer,2)=boundary_global_best;
layer=layer+1;
gini(layer,3)=gini_now1_2_1;
temp1=sum(data_new1_2_1);
temp2=temp1(1,end)-7;
temp3=size(data_new1_2_1,1)-1;
temp=temp2>(temp3/2);
result(layer,3)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now1_2_1;
leaf_sample(leaf_num)=size(data_new1_2_1,1)-1;
gini(layer,4)=gini_now1_2_2;
temp1=sum(data_new1_2_2);
temp2=temp1(1,end)-7;
temp3=size(data_new1_2_2,1)-1; %count rows (samples), not columns
temp=temp2>(temp3/2);
result(layer,4)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now1_2_2;
leaf_sample(leaf_num)=size(data_new1_2_2,1)-1;
else
temp1=sum(data_new1_2);
temp2=temp1(1,end)-7;
temp3=size(data_new1_2,1)-1;
temp=temp2>(temp3/2);
result(layer,2)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now1_2;
leaf_sample(leaf_num)=size(data_new1_2,1)-1;
path(layer,2)=nan;
boundary(layer,2)=nan;
gini(layer+1,3:4)=nan;
end
else
temp1=sum(data_new1);
temp2=temp1(1,end)-7;
temp3=size(data_new1,1)-1;
temp=temp2>(temp3/2);
result(layer,1)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now1;
leaf_sample(leaf_num)=size(data_new1,1)-1;
path(layer,1)=nan;
boundary(layer,1)=nan;
layer=layer+1;
gini(layer,1:2)=nan;
%layer 3
path(layer,1:2)=nan;
boundary(layer,1:2)=nan;
%layer-4 leaf positions in gini
layer=layer+1;
gini(layer,1:4)=nan;
end
layer=2;
if (size(data_new2,1)-1)>=sample_limit&&(gini(layer,2)>0)
[decision_global_best,boundary_global_best,data_new2_1,gini_now2_1,data_new2_2,gini_now2_2,~]=generate_node(data_new2);
path(layer,2)=data_new2(size(data_new2,1),decision_global_best);
boundary(layer,2)=boundary_global_best;
layer=layer+1;
gini(layer,3)=gini_now2_1;
gini(layer,4)=gini_now2_2;
%layer 3
if (size(data_new2_1,1)-1)>=sample_limit&&(gini(layer,3)>0)
[decision_global_best,boundary_global_best,data_new2_1_1,gini_now2_1_1,data_new2_1_2,gini_now2_1_2,~]=generate_node(data_new2_1);
path(layer,3)=data_new2_1(size(data_new2_1,1),decision_global_best);
boundary(layer,3)=boundary_global_best;
layer=layer+1;
gini(layer,5)=gini_now2_1_1;
temp1=sum(data_new2_1_1);
temp2=temp1(1,end)-7;
temp3=size(data_new2_1_1,1)-1;
temp=temp2>(temp3/2);
result(layer,5)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now2_1_1;
leaf_sample(leaf_num)=size(data_new2_1_1,1)-1;
gini(layer,6)=gini_now2_1_2;
temp1=sum(data_new2_1_2);
temp2=temp1(1,end)-7;
temp3=size(data_new2_1_2,1)-1;
temp=temp2>(temp3/2);
result(layer,6)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now2_1_2;
leaf_sample(leaf_num)=size(data_new2_1_2,1)-1;
else
temp1=sum(data_new2_1);
temp2=temp1(1,end)-7;
temp3=size(data_new2_1,1)-1;
temp=temp2>(temp3/2);
result(layer,3)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now2_1;
leaf_sample(leaf_num)=size(data_new2_1,1)-1;
path(layer,3)=nan;
boundary(layer,3)=nan;
gini(layer+1,5:6)=nan;
end
layer=3;
if (size(data_new2_2,1)-1)>=sample_limit&&(gini(layer,4)>0)
[decision_global_best,boundary_global_best,data_new2_2_1,gini_now2_2_1,data_new2_2_2,gini_now2_2_2,~]=generate_node(data_new2_2);
path(layer,4)=data_new2_2(size(data_new2_2,1),decision_global_best);
boundary(layer,4)=boundary_global_best;
layer=layer+1;
gini(layer,7)=gini_now2_2_1;
temp1=sum(data_new2_2_1);
temp2=temp1(1,end)-7;
temp3=size(data_new2_2_1,1)-1;
temp=temp2>(temp3/2);
result(layer,7)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now2_2_1;
leaf_sample(leaf_num)=size(data_new2_2_1,1)-1;
gini(layer,8)=gini_now2_2_2;
temp1=sum(data_new2_2_2);
temp2=temp1(1,end)-7;
temp3=size(data_new2_2_2,1)-1;
temp=temp2>(temp3/2);
result(layer,8)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now2_2_2;
leaf_sample(leaf_num)=size(data_new2_2_2,1)-1;
else
temp1=sum(data_new2_2);
temp2=temp1(1,end)-7;
temp3=size(data_new2_2,1)-1;
temp=temp2>(temp3/2);
result(layer,4)=temp;
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now2_2;
leaf_sample(leaf_num)=size(data_new2_2,1)-1;
path(layer,4)=nan;
boundary(layer,4)=nan;
gini(layer+1,7:8)=nan;
end
else
temp1=sum(data_new2);
temp2=temp1(1,end)-7;
temp3=size(data_new2,1)-1;
temp=temp2>(temp3/2);
result(layer,2)=temp;   %this leaf is the right child of the root, so its vote belongs in column 2
leaf_num=leaf_num+1;
leaf_gini(leaf_num)=gini_now2;
leaf_sample(leaf_num)=size(data_new2,1)-1;
path(layer,2)=nan;
boundary(layer,2)=nan;
layer=layer+1;
gini(layer,3:4)=nan;
%layer 3
path(layer,3:4)=nan;
boundary(layer,3:4)=nan;
%layer-4 leaf positions in gini
layer=layer+1;
gini(layer,5:8)=nan;
end
score=evaluation(leaf_num,leaf_sample,leaf_gini);
end
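As noted at the top of the post, the main limitation of decision_tree is the hard-coded three-layer if/else structure. A minimal recursive sketch of that suggested improvement (my own rough version, reusing gini_self and generate_node above and assuming sample_limit>=2) would return a nested struct instead of the path/boundary/result matrices, together with a matching classifier:
%rough recursive sketch, not part of the original code
function node=build_tree(data_new,sample_limit)
labels=data_new(1:end-1,end);            %warning labels, excluding the attribute-ID row
node.result=sum(labels)>numel(labels)/2; %majority vote stored at every node
if (size(data_new,1)-1)<sample_limit || gini_self(data_new)==0 || size(data_new,2)<2
node.leaf=true;                          %too few samples, pure node, or no attributes left
return
end
node.leaf=false;
[dec,bnd,left,~,right,~,~]=generate_node(data_new);
node.attribute=data_new(end,dec);        %original attribute ID taken from the ID row
node.boundary=bnd;
node.left=build_tree(left,sample_limit);   %samples with attribute value < boundary
node.right=build_tree(right,sample_limit); %samples with attribute value >= boundary

function y=tree_decide(node,sample)      %classify a 1x6 attribute vector with the struct tree
if node.leaf
y=node.result;
elseif sample(node.attribute)<node.boundary
y=tree_decide(node.left,sample);
else
y=tree_decide(node.right,sample);
end
In practice each function would live in its own .m file like the rest of the code here, and tree_decide would take the place of decide().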
Evaluation function
%Evaluation function: sample-weighted sum of the leaf Gini indices (lower is better)
function result=evaluation(leaf_num,sample_num,leaf_gini)
result=0;
for t=1:leaf_num
result=result+sample_num(t)*leaf_gini(t);
end
Random forest
%Random forest with trees_num trees; each tree votes and the majority decides
function result=random_forest(sample,trees_num,data,sample_select,decision_select,sample_limit)
warning_times=0;
nowarning_times=0;
conclusion=zeros(1,trees_num);
for i=1:trees_num
[path,boundary,~,result]=decision_tree(data,sample_select,decision_select,sample_limit);
conclusion(i)=decide(sample,path,boundary,result);
if conclusion(i)==1
warning_times=warning_times+1;
else
if conclusion(i)==0
nowarning_times=nowarning_times+1;
end
end
end
if 2*warning_times>(warning_times+nowarning_times)
result=1;
else
result=0;
end
Sample decision function
%Sample decision function: input a sample and a decision tree (path, boundary, result), output the predicted class
function conclusion=decide(sample,path,boundary,result)
if sample(path(1,1))<boundary(1,1)
if ~isnan(result(2,1))
conclusion=result(2,1);
else
if sample(path(2,1))<boundary(2,1)
if ~isnan(result(3,1))
conclusion=result(3,1);
else
if sample(path(3,1))<boundary(3,1)
conclusion=result(4,1);
else
conclusion=result(4,2);
end
end
else
if ~isnan(result(3,2))
conclusion=result(3,2);
else
if sample(path(3,2))<boundary(3,2)
conclusion=result(4,3);
else
conclusion=result(4,4);
end
end
end
end
else
if ~isnan(result(2,2))
conclusion=result(2,2);   %right child of the root
else
if sample(path(2,2))<boundary(2,2)
if ~isnan(result(3,3))
conclusion=result(3,3);
else
if sample(path(3,3))<boundary(3,3)
conclusion=result(4,5);
else
conclusion=result(4,6);
end
end
else
if ~isnan(result(3,4))
conclusion=result(3,4);
else
if sample(path(3,4))<boundary(3,4)
conclusion=result(4,7);
else
conclusion=result(4,8);
end
end
end
end
end
Accuracy calculation function
%Accuracy calculation function
function rate_average=check_accuracy(data,sample_select,decision_select,trees_num,sample_limit)
sample_num=size(data,1)-1;
rate=zeros(1,sample_num);   %accuracy of each repetition
for j=1:sample_num          %repeat the whole evaluation sample_num times and average
result=zeros(1,sample_num); %predictions for this repetition
for i=1:sample_num          %classify every sample with a freshly grown forest
sample=data(i,:);
result(i)=random_forest(sample,trees_num,data,sample_select,decision_select,sample_limit);
end
result=result';
t=0;
data1=data(1:end-1,end);    %expert labels
m=result-data1;             %compare machine and expert labels; 0 means they agree
for i=1:sample_num
if m(i)==0
t=t+1;
end
end
rate(j)=t/sample_num;       %accuracy of this repetition
end
rate_average=mean(rate);    %average accuracy
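One caveat: check_accuracy classifies the same rows the forest was trained on, so it reports training accuracy rather than generalisation, which is why the preamble lists cross-validation as an improvement. A rough leave-one-out variant (my sketch, same calling convention) would hold each sample out of the training data before classifying it:
%rough leave-one-out sketch, not part of the original code
function rate=check_accuracy_loo(data,sample_select,decision_select,trees_num,sample_limit)
sample_num=size(data,1)-1;
correct=0;
for i=1:sample_num
train=data([1:i-1 i+1:sample_num sample_num+1],:);   %drop sample i, keep the attribute-ID row
pred=random_forest(data(i,1:end-1),trees_num,train,sample_select,decision_select,sample_limit);
correct=correct+(pred==data(i,end));
end
rate=correct/sample_num;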
Main script
%To use the code, just adjust the parameters below and mark the decision attributes in the raw data
clc,clear
%initialisation
data=xlsread('data.xlsx');  %read the data; the last row holds the attribute IDs
%1 viewing distance  2 longest continuous near-work duration  3 total near-work time
%4 outdoor activity time  5 viewing angle  6 proportion of near work under healthy lighting
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% parameter settings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
sample_select=20;    %number of samples drawn per tree
decision_select=3;   %number of attributes drawn per tree
trees_num=40;        %forest size
%pruning
%pre-pruning parameters
deep=3;              %depth limit of the tree (not used explicitly; the branch structure fixes the depth at 3)
sample_limit=2;      %minimum number of samples required to split a node
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%compute the average accuracy
tic
rate_average=check_accuracy(data,sample_select,decision_select,trees_num,sample_limit);
toc
%classify two test samples
sample1=[37 33 100 151 5 80];
sample2=[10 130 290 30 37 20];
conclusion1=random_forest(sample1,trees_num,data,sample_select,decision_select,sample_limit);
conclusion2=random_forest(sample2,trees_num,data,sample_select,decision_select,sample_limit);
Sample data
Columns: sample, distance, duration, total_time, outdoor, angle, proportion, warning (1 = yes, 0 = no); the individual data rows did not survive extraction.
120723441481181123468263135750132598357321264143765291157889053415116216918631630178259146325017203513437236808391111698745209224426513614761103915121914025501121179184641860112254124171167211318171286131358911432332361022950115201332261241781116171482366632751173411121457588018248516315514321193216527614633521202512435917133700213151167472547022316335258224412316581644513730242937326104336812534471975956602636123185165267002725126171452333128318498373051129309215311414480302917827814627451