I am using Scikit-learn for text classification. I want to calculate the Information Gain for each attribute with respect to a class in a (sparse) document-term matrix.
Here is my proposed approach for calculating the information gain using pandas:
from scipy.stats import entropy
import pandas as pd
def information_gain(members, split):
    """
    Measure the reduction in entropy of ``members`` after partitioning by ``split``.

    :param members: pandas Series of class labels, one entry per sample
    :param split: pandas Series assigning each sample to a partition; it is
        matched to ``members`` by index label (pandas groupby alignment)
    :return: entropy of ``members`` minus the size-weighted sum of the
        per-partition entropies, in nats (scipy's default natural logarithm)
    """
    entropy_before = entropy(members.value_counts(normalize=True))

    # Rows are partitions, columns are labels, cells are the share of each
    # label inside the partition. unstack() does not rely on Series names, so
    # the caller's ``members`` and ``split`` objects are left unmodified.
    grouped_distrib = (
        members.groupby(split)
        .value_counts(normalize=True)
        .unstack(fill_value=0)
    )

    # Keep the partition labels attached to the per-partition entropies so the
    # weighting below is matched by label instead of by position.
    entropy_per_group = pd.Series(
        entropy(grouped_distrib.to_numpy(), axis=1),
        index=grouped_distrib.index,
    )

    # value_counts() does not promise the same ordering as the grouped table;
    # reindex to pair every partition with its own relative size.
    weights = split.value_counts(normalize=True).reindex(grouped_distrib.index)

    return entropy_before - (entropy_per_group * weights).sum()
# Worked example: five colour labels divided into two partitions (0 and 1).
members = pd.Series(['yellow', 'yellow', 'green', 'green', 'blue'])
split = pd.Series([0, 0, 1, 1, 0])
gain = information_gain(members, split)
print(gain)