Find top n terms with highest TF-IDF score per class

前端 未结 3 1536
南笙
南笙 2021-01-23 05:59

Let's suppose that I have a pandas dataframe with two columns, which resembles the following one:

    text                                label
0         


        
3条回答
  •  死守一世寂寞
    2021-01-23 06:31

    The following code will do the work (thanks to Mariia Havrylovych).

    Assume we have an input dataframe, df, aligned with your structure.

    from sklearn.feature_extraction.text import TfidfVectorizer
    import pandas as pd
    
    # override scikit's tfidf-vectorizer in order to return dataframe with feature names as columns
    class DenseTfIdf(TfidfVectorizer):
        """TF-IDF vectorizer whose outputs are dense, labelled DataFrames.

        ``transform`` and ``fit_transform`` behave like the parent methods but
        convert the sparse documents-terms matrix into a ``pandas.DataFrame``
        with one column per vocabulary term. When the input is a pandas
        object, its index is reused for the rows so the result stays aligned
        with the source data.

        The constructor is intentionally inherited unchanged: it accepts the
        same keyword arguments as ``TfidfVectorizer``, and keeping the explicit
        signature lets ``get_params()`` / ``clone()`` see the configuration
        (a bare ``**kwargs`` signature hides every parameter from them).
        """

        def _to_frame(self, matrix, x) -> pd.DataFrame:
            """Convert a sparse documents-terms matrix into a dense DataFrame.

            Args:
                matrix: Sparse matrix returned by the parent vectorizer.
                x: The raw documents the matrix was computed from.

            Returns:
                A DataFrame with the feature names as columns; rows carry
                ``x.index`` when ``x`` is a Series/DataFrame, otherwise a
                default positional index.
            """
            # ``get_feature_names`` was removed in scikit-learn 1.2; prefer the
            # replacement and fall back only on releases that predate it.
            names_getter = getattr(self, "get_feature_names_out", None)
            if names_getter is None:
                names_getter = self.get_feature_names

            # Plain lists also expose an ``index`` attribute (a method), so
            # check the type instead of probing for the attribute.
            if isinstance(x, (pd.Series, pd.DataFrame)):
                row_index = x.index
            else:
                row_index = None

            return pd.DataFrame(
                matrix.toarray(), columns=names_getter(), index=row_index
            )

        def transform(self, x, y=None) -> pd.DataFrame:
            """Vectorize ``x`` with the fitted vocabulary; return a DataFrame.

            ``y`` is accepted for API compatibility and ignored.
            """
            return self._to_frame(super().transform(x), x)

        def fit_transform(self, x, y=None) -> pd.DataFrame:
            """Learn vocabulary and IDF from ``x``; return its TF-IDF DataFrame."""
            # run sklearn's fit_transform, then densify for further manipulation
            return self._to_frame(super().fit_transform(x, y=y), x)
    

    Usage:

    # assume texts are stored in column 'text' within a dataframe
    texts = df['text']
    vectorizer = DenseTfIdf(
        sublinear_tf=True,
        max_df=0.5,
        min_df=2,
        encoding='ascii',
        ngram_range=(1, 2),
        lowercase=True,
        max_features=1000,
        stop_words='english',
    )
    df_docs_terms_corpus = vectorizer.fit_transform(texts)

    # fit_transform labels the rows with the index of `texts`, so the rows of a
    # class must be selected by label (.loc). Positional selection (.iloc) is
    # only correct when df happens to have a default 0..n-1 index.
    df_class = df[df["label"] == "Class XX"]
    df_docs_terms_class = df_docs_terms_corpus.loc[df_class.index]
    # sum the TF-IDF scores per term (column) and keep the top n keywords
    df_docs_terms_class.sum(axis=0).nlargest(n=50)
    

提交回复
热议问题