# 例题4.4的结果和书上画的图不一样,但是没问题,原因是GINI指数取最小值的时候,色泽和脐部值是一样的。
from collections import Counter

import numpy as np
# Watermelon dataset 2.0 (coded): 17 samples; columns 0-5 are the features
# color/root/sound/textile/belly/feel, column 6 is the label (1 = good melon).
dataset = np.array([[0,0,0,0,0,0,1],
                    [1,0,1,0,0,0,1],
                    [1,0,0,0,0,0,1],
                    [0,0,1,0,0,0,1],
                    [2,0,0,0,0,0,1],
                    [0,1,0,0,1,1,1],
                    [1,1,0,1,1,1,1],
                    [1,1,0,0,1,0,1],
                    [1,1,1,1,1,0,0],
                    [0,2,2,0,2,1,0],
                    [2,2,2,2,2,0,0],
                    [2,0,0,2,2,1,0],
                    [0,1,0,1,0,0,0],
                    [2,1,1,1,0,0,0],
                    [1,1,0,0,1,1,0],
                    [2,0,0,2,2,0,0],
                    [0,0,1,1,1,0,0]])
# Train/test split used by the book's example: rows 3,4,7,8,10,11,12 are the
# test set.  sorted() makes the row order deterministic — the original relied
# on CPython's set iteration order, which the language does not guarantee.
test_index = [3, 4, 7, 8, 10, 11, 12]
train_index = sorted(set(range(17)) - set(test_index))
# numpy fancy indexing replaces the original append loops.
train_dataset = dataset[train_index]
test_dataset = dataset[test_index]
#4.3 Gini-index decision tree 2.0 (CART)
# Gini impurity of one subset of samples (one value of one feature).
def calGini(dataset):
    """Return the Gini impurity of `dataset`.

    Each row's last element is its class label.  Generalized to any number
    of classes (identical results to the original for the binary 0/1 labels
    used here); an empty dataset yields 0.0 instead of a ZeroDivisionError.
    """
    n = len(dataset)
    if n == 0:
        return 0.0
    label_counts = Counter(row[-1] for row in dataset)
    # Gini = 1 - sum(p_k^2) over all classes k.
    return 1.0 - sum((count / n) ** 2 for count in label_counts.values())
# Gini index of a feature: weighted average of the Gini impurity of each
# subset of samples sharing one value of that feature.
def calAGain(dataset, i):
    """Return the Gini index of splitting `dataset` on feature column `i`."""
    dataset = np.array(dataset)
    column = dataset[:, i]
    total = len(dataset)
    weighted_gini = 0.0
    for value in set(column):
        # Boolean-mask indexing replaces the original O(N) inner index loop
        # with manual appends.
        subset = dataset[column == value]
        weighted_gini += (len(subset) / total) * calGini(subset)
    return weighted_gini
# Pick the feature whose Gini index is smallest.
def choosenbestGini(dataset):
    """Return (feature_index, gini_value) for the feature with minimal Gini.

    Ties go to the lowest feature index, matching the original's stable sort.
    """
    num_features = len(dataset[0]) - 1  # last column is the label
    gini_per_feature = [(i, calAGain(dataset, i)) for i in range(num_features)]
    # min() with a key replaces building a dict and fully sorting it just to
    # take the first element.
    return min(gini_per_feature, key=lambda pair: pair[1])
# Drop the parent node's feature column from the matching samples.
def splitdataset(dataset, axis, value):
    """Return the rows whose feature `axis` equals `value`, with that
    column removed from each returned row."""
    return [
        np.hstack((row[:axis], row[axis + 1:]))
        for row in dataset
        if row[axis] == value
    ]
# Feature names, in dataset column order (columns 0-5; column 6 is the label).
labels = ['color','root','sound','textile','belly','feel']
# Recursively build the CART decision tree.
def TREE(dataset, labels):
    """Build a decision tree as nested dicts {feature_name: {value: subtree}}.

    Leaves are class labels.  Unlike the original, `labels` is NOT mutated
    (the original del'd the chosen feature from the caller's list), and when
    no features remain a majority-vote leaf is returned instead of crashing.
    """
    classlist = [example[-1] for example in dataset]
    # All samples share one class: pure leaf.
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # No features left to split on (only the label column remains):
    # majority-vote leaf.  The original would raise in choosenbestGini here.
    if len(dataset[0]) == 1:
        return max(set(classlist), key=classlist.count)
    bestA, bestGini = choosenbestGini(dataset)
    bestfeatlabel = labels[bestA]
    # Copy instead of del: the caller's labels list stays intact.
    sublabels = labels[:bestA] + labels[bestA + 1:]
    mytree = {bestfeatlabel: {}}
    for value in set(example[bestA] for example in dataset):
        mytree[bestfeatlabel][value] = TREE(splitdataset(dataset, bestA, value), sublabels)
    return mytree
# Build the decision tree from the training data.
a = TREE(train_dataset, labels)
#print(a)
#{'color': {0: {'sound':
#               {0: 1,
#                1: 0,
#                2: 0}},
#           1: {'root':
#               {0: 1,
#                1: {
#                 'textile':
#                     {0: 0,
#                      1: 1}}}},
#           2: 0}}
# Ground-truth labels of the test samples: the last COLUMN of test_dataset.
# (The original took test_dataset[-1] — the last ROW — which only looked
# plausible because there happen to be 7 test rows and 7 columns.)
# .copy() so that later pruning code writing into it cannot mutate
# test_dataset through a view.
test_label = test_dataset[:, -1].copy()
# Hand-coded classifier below implements the tree printed above; depth is
# small enough to spell out by hand.
# Map each feature name to its column index.
labels = ['color','root','sound','textile','belly','feel']
labels_dict = {name: idx for idx, name in enumerate(labels)}
def train_test_label(dataset):
    """Classify each sample of `dataset` with the hand-coded tree above.

    Returns a list with one predicted label (1 good / 0 bad) per row;
    feature values not covered by the tree yield the sentinel 5.
    """
    predictions = []
    for vector in dataset:
        color = vector[labels_dict['color']]
        if color == 0:
            # color 0 -> split on sound: 0 is good, anything else bad.
            predictions.append(1 if vector[labels_dict['sound']] == 0 else 0)
        elif color == 1:
            root = vector[labels_dict['root']]
            if root == 0:
                predictions.append(1)
            elif root == 1:
                textile = vector[labels_dict['textile']]
                if textile == 0:
                    predictions.append(0)
                elif textile == 1:
                    # Fixed: the generated tree maps textile==1 to class 1;
                    # the original appended 0 here, contradicting the tree.
                    predictions.append(1)
                else:
                    predictions.append(5)
            else:
                predictions.append(5)
        else:
            # color 2 -> bad melon.
            predictions.append(0)
    return predictions
# Predictions of the hand-coded tree classifier on the test set.
test_result0 = train_test_label(test_dataset)
# Accuracy: fraction of predictions that match the ground truth.
def calrightrate(test_result, real_result):
    """Return the accuracy of `test_result` against `real_result`.

    Returns 0.0 for empty input (the original divided by zero).
    """
    total = len(test_result)
    if total == 0:
        return 0.0
    correct = sum(1 for pred, real in zip(test_result, real_result) if pred == real)
    return correct / total
#预剪枝 (pre-pruning)
# Accuracy on the test set if the root were a leaf predicting "good" (1) for
# every sample.  Uses the label COLUMN — the original counted 1s in the last
# ROW (test_dataset[-1]), which is wrong.
num = len(test_dataset)
num0floor = list(test_dataset[:, -1]).count(1)
rightrate = float(num0floor) / num
#color 0,1,2 : 1,1,0(好 好 坏)
# The class assigned to each color value should be the majority class of the
# training samples with that color; it is hard-coded here rather than
# computed from the training set.
# Accuracy on the test set after splitting once on color.
num1floor = 0
for vector in test_dataset:
    if vector[labels_dict['color']] == 0 and vector[-1] == 1:
        num1floor += 1
    elif vector[labels_dict['color']] == 1 and vector[-1] == 1:
        num1floor += 1
    elif vector[labels_dict['color']] == 2 and vector[-1] == 0:
        num1floor += 1
right1rate = float(num1floor) / num
def cnt(right0, right1):
    """Pre-pruning decision: '保留' if the split accuracy `right1` does not
    beat the leaf accuracy `right0`, otherwise '剪去'.

    Fixed to use its parameters — the original ignored them and read the
    globals rightrate/right1rate directly (same result for the one call
    site, but the function was not reusable).
    """
    if right1 <= right0:
        return '保留'
    return '剪去'
# Report whether the color split should be kept or cut.
print(cnt(rightrate,right1rate))
#后剪枝 (post-pruning)
# Slow brute-force post-pruning of a single branch; with this tiny dataset a
# single cut already reaches very high accuracy.
def latejudge(dataset):
    """Try pruning the (color==0, root==1) branch and report accuracies.

    Prints the unpruned accuracy, then the pruned accuracy if it improves.
    """
    # Tree predictions and ground-truth labels.  real_result is the label
    # COLUMN — the original compared against dataset[-1], the last ROW.
    all_test_result = np.array(train_test_label(dataset))
    real_result = dataset[:, -1]
    right_rate = calrightrate(all_test_result, real_result)
    print(right_rate)
    # Collect the samples that fall into the branch considered for pruning.
    judge_dataset = []
    label_num = []
    for idx, vector in enumerate(dataset):
        if vector[labels_dict['color']] == 0 and vector[labels_dict['root']] == 1:
            judge_dataset.append(vector)
            label_num.append(idx)
    # Majority class of the branch's samples.  The original counted values
    # inside the LAST matched row (judge_dataset[-1]) instead of counting
    # the matched samples' labels.
    branch_labels = [v[-1] for v in judge_dataset]
    count0 = branch_labels.count(0)
    count1 = branch_labels.count(1)
    if count0 > count1:
        a = 0
    elif count0 == count1:
        a = int(np.random.randint(0, 2))  # tie: pick a class at random
    else:
        a = 1
    # Pruned predictions: branch samples get the majority class instead of
    # the subtree's output.  The original overwrote the global test_label —
    # the ground truth — which made the subsequent comparison meaningless.
    pruned_result = all_test_result.copy()
    for n in label_num:
        pruned_result[n] = a
    cnt1tree_rate = calrightrate(pruned_result, real_result)  # pruned accuracy
    if cnt1tree_rate > right_rate:
        print(cnt1tree_rate)
latejudge(test_dataset)
#对比预剪枝和后剪枝,后剪枝保留了更多的节点,并且正确率要高于预剪枝,泛化性更好
#但是预剪枝时间成本更低
# 来源:CSDN
# 作者:lzaya0000
# 链接:https://blog.csdn.net/lzaya0000/article/details/104631182