决策树―西瓜书课后题4.3―MATLAB代码

匿名 (未验证) 提交于 2019-12-03 00:34:01

题目:编程实现基于信息熵进行划分选择的决策树算法,并为西瓜数据集3.0上(P84表4.3)中的数据生成一棵决策树;

代码:

%% Decision tree (ID3, information-gain splitting) for watermelon data set 3.0.
% Exercise 4.3: builds the tree over discrete attributes (columns 1-6) and
% continuous attributes (density, sugar content) via binary splits at
% candidate midpoints. The tree is printed node by node, not stored.
clc; clear all;
[num, txt] = xlsread('D:\机器学习\WaterMelon_3.0.xlsx');
data = txt(2:end, [2:7, 10]);          % six discrete attribute columns + class label
[rows, cols] = size(data);
for i = 1:rows
    for j = 1:cols
        % Encode each discrete attribute string as a small integer
        D_value(i, j) = string2num(data(i, j));
    end
end
% Continuous attributes (density, sugar content) plus the class label
C_value = [num(:, [8, 9]), D_value(:, 7)];
make_tree(D_value, C_value);

%% Recursively grow the tree, printing the chosen split at each node.
% data  : discrete attribute matrix, last column = class label
% data1 : continuous attribute matrix, last column = class label
function make_tree(data, data1)
    [m, n]  = size(data);
    [~, n1] = size(data1);
    disp('/////////////////////////////////////////////');
    disp('待分数据集');
    disp(data);
    disp(data1);
    label = data(:, n);
    same_class_num = length(find(data(:, n) == label(1, 1)));
    % Recursion stops when the node is pure or no attributes remain
    if same_class_num == m || (n == 1 && n1 == 1)
        disp('划分后的数据子集');
        disp(data);
        disp(data1);
        return;
    end
    [best_feature, midle_data] = choose_bestfeature(data, data1);
    if best_feature <= 6
        disp("待分数据集的最佳特征序号为");
        disp(best_feature);
        disp("属于待分离散数据集");
    else
        disp("待分数据集的最佳特征序号为");
        disp(best_feature - 6);
        disp("属于待分连续数据集");
        disp("最佳增益时的二分点值");
        disp(midle_data);
    end
    % Recurse on a discrete attribute: one child per attribute value
    if best_feature <= 6
        C2D_value = ones(size(data1, 1), 1);   % placeholder, unused for j<=6
        featvalue = unique(data(:, best_feature));
        for i = 1:length(featvalue)
            [subdata, subdata1] = splitData1(data, data1, best_feature, ...
                                             featvalue(i, 1), C2D_value);
            make_tree(subdata, subdata1);
        end
    end
    % Recurse on a continuous attribute: binary split at the best midpoint
    % (the original comment here wrongly said "discrete")
    if best_feature > 6
        C2D_value = C2Dtranlate(data1, best_feature - 6, midle_data);
        featvalue = unique(C2D_value);
        for i = 1:length(featvalue)
            [subdata, subdata1] = splitData1(data, data1, best_feature, ...
                                             featvalue(i, 1), C2D_value);
            make_tree(subdata, subdata1);
        end
    end
end

%% Pick the attribute (discrete 1..6, continuous 7..8) with maximum info gain.
% Returns best_feature (continuous attribute indices are offset by +6) and,
% for a continuous winner, its bi-partition midpoint midle_data.
% BUG FIX: midle_data is now updated only when a continuous attribute actually
% wins; the original overwrote it with every candidate's midpoint, so the
% returned threshold could belong to a losing attribute.
function [best_feature, midle_data] = choose_bestfeature(data, data1)
    [m, n] = size(data);
    Root_entropy = calc_entropy(data);
    midle_data = 0;
    best_gain = 0;
    best_feature = 1;   % fall back to attribute 1 if no split has positive gain
                        % (the original left 0, which crashed downstream)
    % Information gain of each discrete attribute
    for j = 1:n-1
        feature_value = unique(data(:, j));
        new_entropy = 0;
        for i = 1:length(feature_value)
            subdata = splitData(data, j, feature_value(i, 1));
            prob = size(subdata, 1) ./ m;
            new_entropy = new_entropy + prob * calc_entropy(subdata);
        end
        inf_gain = Root_entropy - new_entropy;
        if inf_gain > best_gain
            best_gain = inf_gain;
            best_feature = j;
        end
    end
    % Information gain of each continuous attribute (its best bi-partition)
    if size(data1, 2) >= 2
        for i = 1:size(data1, 2) - 1
            [C_best_gain, C_midle] = C_value_bestgain(data1, i);
            if C_best_gain > best_gain
                best_gain = C_best_gain;
                best_feature = i + 6;
                midle_data = C_midle;
            end
        end
    end
end

%% Best bi-partition information gain for continuous attribute column j.
% Candidate thresholds are midpoints of adjacent values in sorted order.
function [best_gain, midle_data] = C_value_bestgain(C_value, j)
    [m, n] = size(C_value);
    C_value_sort = sortrows(C_value, j);
    best_gain = 0;
    midle_data = 0;                        % defined even when no gain is found
                                           % (original could return it unset)
    baseentropy = calc_entropy(C_value);   % hoisted: loop-invariant
    for i = 1:m-1
        midpoint = (C_value_sort(i, j) + C_value_sort(i+1, j)) / 2;
        % Column vector of 0/1 labels for the two sides of the threshold
        C2D_value  = C2Dtranlate(C_value, j, midpoint);
        C2D_value1 = [C2D_value, C_value(:, n)];
        feature_value = unique(C2D_value1(:, 1));
        new_entropy = 0;
        for t = 1:length(feature_value)
            subdata = splitData(C2D_value1, 1, feature_value(t, 1));
            new_entropy = new_entropy + size(subdata, 1) ./ m * calc_entropy(subdata);
        end
        inf_gain = baseentropy - new_entropy;
        if inf_gain > best_gain
            best_gain = inf_gain;
            midle_data = midpoint;
        end
    end
end

%% Shannon entropy of the class label (last column of data).
function entropy = calc_entropy(data)
    [m, n] = size(data);
    label_value = data(:, n);
    label = unique(label_value);
    p = zeros(length(label), 1);
    for i = 1:length(label)
        p(i) = sum(label_value == label(i)) / m;   % class proportion, never 0
    end
    entropy = sum(-p .* log2(p));
end

%% Split both data sets on one attribute value, keeping row correspondence;
%% used by the recursion in make_tree.
% j <= 6 : discrete attribute j; its column is removed from the discrete set.
% j > 6  : continuous attribute j-6; rows are selected by the 0/1 vector
%          c2d_value and the column is removed from the continuous set.
function [subdata, subdata1] = splitData1(data, data1, j, value, c2d_value)
    if j <= 6
        keep = data(:, j) == value;
        subdata = data(keep, :);
        subdata(:, j) = [];
        subdata1 = data1(keep, :);
    else
        keep = c2d_value(:) == value;
        subdata = data(keep, :);
        subdata1 = data1(keep, :);
        subdata1(:, j - 6) = [];
    end
end

%% Rows of data whose attribute j equals value, with column j removed.
function subdata = splitData(data, j, value)
    subdata = data(data(:, j) == value, :);
    subdata(:, j) = [];
end

%% Binarize continuous attribute column j at threshold midle_data.
% Returns a COLUMN VECTOR, one entry per sample: 0 if value < threshold,
% 1 otherwise. BUG FIX: the original returned an m-by-j matrix with
% zero-filled leading columns, so splitData1's linear indexing read the
% wrong column for the second continuous attribute (sugar content), and
% values exactly equal to the threshold were assigned by neither branch.
function C2D_value = C2Dtranlate(C_value, j, midle_data)
    C2D_value = double(C_value(:, j) >= midle_data);
end

%% Map a watermelon attribute string (cell) to a small integer code.
% 0 / 1 / 2 follow the encoding of the original blog post.
function num = string2num(string)
    if strcmp(string,'浅白') || strcmp(string,'硬挺') || strcmp(string,'清脆') || strcmp(string,'模糊') || strcmp(string,'平坦') || strcmp(string,'软粘') || strcmp(string,'否')
        num = 0;
    elseif strcmp(string,'青绿') || strcmp(string,'稍蜷') || strcmp(string,'沉闷') || strcmp(string,'稍糊') || strcmp(string,'稍凹') || strcmp(string,'硬滑') || strcmp(string,'是')
        num = 1;
    else
        num = 2;
    end
end

生成决策树:



西瓜数据集Excel文件到这里去找:

https://blog.csdn.net/macunshi/article/details/80756016

我的代码是在这位博主(https://www.cnblogs.com/Kermit-Li/p/4503427.html)博客中代码的基础上改进的。原ID3算法只能处理色泽、根蒂、敲声、纹理等离散值属性,不能处理密度、含糖率这样的连续值属性;我在原代码的基础上增加了对连续值属性的处理。

该博客中给出了离散值属性信息增益的计算过程;连续值属性信息增益的详细计算过程可参见以下博客:

https://blog.csdn.net/leafage_m/article/details/80137305


易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!