Label encoding across multiple columns in scikit-learn

后端 未结 22 2302
礼貌的吻别
礼貌的吻别 2020-11-22 09:02

I'm trying to use scikit-learn's LabelEncoder to encode a pandas DataFrame of string labels. As the dataframe has many (50+) columns, I want to a…

22条回答
  •  滥情空心
    2020-11-22 09:48

    This is a year and a half after the fact, but I, too, needed to be able to .transform() multiple pandas dataframe columns at once (and be able to .inverse_transform() them as well). This expands upon the excellent suggestion of @PriceHardman above:

    class MultiColumnLabelEncoder(LabelEncoder):
        """
        Wraps sklearn LabelEncoder functionality for use on multiple columns of a
        pandas dataframe.

        One independent `LabelEncoder` is fitted per column. `fit_transform`,
        `transform` and `inverse_transform` overwrite the selected columns of
        the dataframe they are given (the caller's object is modified) and
        return the values of those columns as an ndarray.

        Parameters
        ----------
        columns : sequence of column labels or None
            Columns to encode (e.g. a pandas Index or a list). When None, all
            columns of the dataframe passed to `fit`/`fit_transform` are used
            and stored on `self.columns`.

        Attributes
        ----------
        all_classes_ : ndarray of object
            One `(column, classes)` tuple per encoded column.
        all_encoders_ : ndarray of object
            The fitted `LabelEncoder` of each encoded column.
        all_labels_ : ndarray of object
            Encoded labels of each column; set by `fit_transform` only.
        """
        def __init__(self, columns=None):
            self.columns = columns

        def _fit_columns(self, dframe, encode):
            """
            Fit one LabelEncoder per column and fill the `all_*_` containers.

            When `encode` is true, each column of `dframe` is also replaced by
            its encoded labels and those labels are kept in `all_labels_`.
            """
            if self.columns is None:
                # no columns specified; assume all are to be encoded
                self.columns = dframe.columns
            # `len` (rather than `.shape`) also accepts a plain list of names
            n_columns = len(self.columns)
            # allocate every container up front so that both the "columns
            # given" and the "all columns" case have them available
            self.all_classes_ = np.empty(n_columns, dtype=object)
            self.all_encoders_ = np.empty(n_columns, dtype=object)
            if encode:
                self.all_labels_ = np.empty(n_columns, dtype=object)
            for idx, column in enumerate(self.columns):
                le = LabelEncoder()
                values = dframe.loc[:, column].values
                if encode:
                    labels = le.fit_transform(values)
                    # Replace the whole column instead of writing through
                    # `.loc[:, column]`: newer pandas versions set `.loc`
                    # values in place and would keep the object dtype.
                    dframe[column] = labels
                    self.all_labels_[idx] = labels
                else:
                    le.fit(values)
                self.all_classes_[idx] = (column,
                                          np.array(le.classes_.tolist(),
                                                   dtype=object))
                self.all_encoders_[idx] = le

        def _apply_encoders(self, dframe, method_name):
            """
            Call `method_name` of every fitted encoder on its column of
            `dframe`, overwrite that column with the result, and return the
            values of the encoded columns.
            """
            if self.columns is None or not hasattr(self, 'all_encoders_'):
                # same exception type the unfitted object raised before,
                # but with an explicit message
                raise AttributeError(
                    "MultiColumnLabelEncoder is not fitted yet; call `fit` "
                    "or `fit_transform` first")
            for column, le in zip(self.columns, self.all_encoders_):
                convert = getattr(le, method_name)
                dframe[column] = convert(dframe.loc[:, column].values)
            return dframe.loc[:, self.columns].values

        def fit(self, dframe):
            """
            Fit label encoder to pandas columns.

            Access individual column classes via indexing `self.all_classes_`

            Access individual column encoders via indexing
            `self.all_encoders_`

            Returns `self`; `dframe` is not modified.
            """
            self._fit_columns(dframe, encode=False)
            return self

        def fit_transform(self, dframe):
            """
            Fit label encoder and return encoded labels.

            The encoded columns of `dframe` are overwritten with their labels.

            Access individual column classes via indexing
            `self.all_classes_`

            Access individual column encoders via indexing
            `self.all_encoders_`

            Access individual column encoded labels via indexing
            `self.all_labels_`
            """
            self._fit_columns(dframe, encode=True)
            return dframe.loc[:, self.columns].values

        def transform(self, dframe):
            """
            Transform labels to normalized encoding.

            The encoded columns of `dframe` are overwritten; their values are
            returned as an ndarray. Raises AttributeError if not fitted.
            """
            return self._apply_encoders(dframe, 'transform')

        def inverse_transform(self, dframe):
            """
            Transform labels back to original encoding.

            The encoded columns of `dframe` are overwritten; their values are
            returned as an ndarray. Raises AttributeError if not fitted.
            """
            return self._apply_encoders(dframe, 'inverse_transform')

    

    Example:

    If df and df_copy are mixed-type pandas dataframes, you can apply the MultiColumnLabelEncoder() to the dtype=object columns in the following way:

    # get `object` columns (select_dtypes already returns a new frame, so
    # no extra `.iloc[:, :]` slice is needed)
    df_object_columns = df.select_dtypes(include=['object']).columns
    df_copy_object_columns = df_copy.select_dtypes(include=['object']).columns
    
    # instantiate `MultiColumnLabelEncoder` with the `object` columns of `df`
    mcle = MultiColumnLabelEncoder(columns=df_object_columns)
    
    # fit to `df` data
    mcle.fit(df)
    
    # transform the `df` data (the encoded columns of `df` are overwritten)
    mcle.transform(df)
    
    # returns output like below
    array([[1, 0, 0, ..., 1, 1, 0],
           [0, 5, 1, ..., 1, 1, 2],
           [1, 1, 1, ..., 1, 1, 2],
           ..., 
           [3, 5, 1, ..., 1, 1, 2]])
    
    # transform `df_copy` data
    mcle.transform(df_copy)
    
    # returns output like below (assuming the respective columns 
    # of `df_copy` contain the same unique values as that particular 
    # column in `df`)
    array([[1, 0, 0, ..., 1, 1, 0],
           [0, 5, 1, ..., 1, 1, 2],
           [1, 1, 1, ..., 1, 1, 2],
           ..., 
           [3, 5, 1, ..., 1, 1, 2]])
    
    # inverse `df` data
    mcle.inverse_transform(df)
    
    # outputs data like below
    array([['August', 'Friday', '2013', ..., 'N', 'N', 'CA'],
           ['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
           ['August', 'Monday', '2014', ..., 'N', 'N', 'NJ'],
           ..., 
           ['February', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
           ['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
           ['March', 'Tuesday', '2013', ..., 'N', 'N', 'NJ']], dtype=object)
    
    # inverse `df_copy` data
    mcle.inverse_transform(df_copy)
    
    # outputs data like below
    array([['August', 'Friday', '2013', ..., 'N', 'N', 'CA'],
           ['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
           ['August', 'Monday', '2014', ..., 'N', 'N', 'NJ'],
           ..., 
           ['February', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
           ['April', 'Tuesday', '2014', ..., 'N', 'N', 'NJ'],
           ['March', 'Tuesday', '2013', ..., 'N', 'N', 'NJ']], dtype=object)
    

    You can access individual column classes, column labels, and column encoders used to fit each column via indexing:

    mcle.all_classes_
    mcle.all_encoders_
    mcle.all_labels_

提交回复
热议问题