I am using Scikit-learn for text classification. I want to calculate the Information Gain for each attribute with respect to a class in a (sparse) document-term matrix.
Using pure Python, I currently compute it like this:
def ig(class_, feature):
    """Return the information gain of ``feature`` with respect to ``class_``.

    The gain is ``H(C) - H(C | F)``, measured in bits (log base 2), where
    ``H(C)`` is the entropy of the class labels and ``H(C | F)`` is the
    entropy of the labels conditioned on the feature value.

    Args:
        class_: Iterable of hashable class labels, one per sample.
        feature: Iterable of hashable feature values, one per sample,
            aligned position-by-position with ``class_``.

    Returns:
        The information gain. Two empty inputs yield ``0``.

    Raises:
        ValueError: If ``class_`` and ``feature`` differ in length.

    Side effects:
        Prints the overall class entropy to standard output.
    """
    # Function-scope imports keep the block self-contained.
    import math
    from collections import Counter

    # Materialise once so any iterable (list, tuple, iterator, array) works
    # and the lengths can be compared.
    labels = list(class_)
    values = list(feature)
    if len(labels) != len(values):
        raise ValueError(
            "class_ and feature must have the same length "
            f"(got {len(labels)} and {len(values)})"
        )
    total = len(labels)

    # H(C): entropy of the class distribution, from one counting pass.
    class_entropy = 0
    for count in Counter(labels).values():
        p_class = count / total
        class_entropy -= p_class * math.log2(p_class)
    print('Overall Entropy:', class_entropy)

    # H(C | F): only (value, label) pairs that actually occur are visited,
    # so zero-probability terms (where log would be undefined) never arise.
    value_counts = Counter(values)
    joint_counts = Counter(zip(values, labels))
    conditional_entropy = 0
    for (value, _label), joint in joint_counts.items():
        with_value = value_counts[value]
        p_value = with_value / total
        p_label_given_value = joint / with_value
        conditional_entropy -= (
            p_value * p_label_given_value * math.log2(p_label_given_value)
        )

    return class_entropy - conditional_entropy