Let's suppose that I have a pandas dataframe with two columns that resembles the following one:
text label
0
The following code will do the work (thanks to Mariia Havrylovych).
Assume we have an input dataframe, df, aligned with your structure.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# override scikit's tfidf-vectorizer in order to return dataframe with feature names as columns
class DenseTfIdf(TfidfVectorizer):
    """TfidfVectorizer whose transform methods return a dense DataFrame.

    The returned frame has one row per input document and one column per
    vocabulary term. When the input documents come as a pandas object
    (e.g. a Series), its index is reused for the rows so the result stays
    aligned with the original data; otherwise a default index is used.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments to ``TfidfVectorizer``."""
        super().__init__(**kwargs)
        # Re-assert every passed option as an instance attribute.
        # NOTE(review): a ``**kwargs``-only ``__init__`` may not play well with
        # scikit-learn's parameter introspection (get_params / clone) - confirm
        # before using this class inside a Pipeline or a grid search.
        for k, v in kwargs.items():
            setattr(self, k, v)

    def _feature_names(self):
        """Return the vocabulary terms in column order.

        ``get_feature_names`` was replaced by ``get_feature_names_out`` in
        newer scikit-learn releases; prefer the new name and fall back to
        the old one so that both old and new versions work.
        """
        getter = getattr(self, "get_feature_names_out", None)
        if getter is None:
            getter = self.get_feature_names
        return getter()

    def _to_frame(self, matrix, x) -> pd.DataFrame:
        """Convert a sparse documents-terms matrix into a dense DataFrame.

        ``x`` is the original documents container; its index is reused only
        when it is a real pandas index (a plain list also has an ``index``
        attribute, but that is a method, not row labels).
        """
        index = getattr(x, "index", None)
        if not isinstance(index, pd.Index):
            index = None
        return pd.DataFrame(matrix.toarray(), columns=self._feature_names(), index=index)

    def transform(self, x, y=None) -> pd.DataFrame:
        """Transform documents ``x`` with the fitted vocabulary.

        ``y`` is accepted for API compatibility and is not used.
        """
        res = super().transform(x)
        return self._to_frame(res, x)

    def fit_transform(self, x, y=None) -> pd.DataFrame:
        """Learn vocabulary and idf from ``x`` and return the dense tf-idf frame."""
        # run sklearn's fit_transform
        res = super().fit_transform(x, y=y)
        # convert the returned sparse documents-terms matrix into a dataframe to further manipulations
        return self._to_frame(res, x)
# assume texts are stored in column 'text' within a dataframe
texts = df['text']
df_docs_terms_corpus = DenseTfIdf(
    sublinear_tf=True,
    max_df=0.5,
    min_df=2,
    encoding='ascii',
    ngram_range=(1, 2),
    lowercase=True,
    max_features=1000,
    stop_words='english',
).fit_transform(texts)

# Rows of the documents-terms dataframe are in the same order as the rows of
# df, so select the class rows with a positional boolean mask. Passing index
# labels to .iloc (as before) is only correct when df has a default
# 0..n-1 index; the mask works for any index.
class_mask = df["label"] == "Class XX"
df_class = df[class_mask]
df_docs_terms_class = df_docs_terms_corpus.loc[class_mask.to_numpy()]

# sum by columns and get the top n keywords
df_docs_terms_class.sum(axis=0).nlargest(n=50)