How to keep column names when converting from pandas to numpy

前端 未结 4 1179
挽巷
挽巷 2021-02-20 08:03

According to this post, I should be able to access the names of columns in an ndarray as a.dtype.names

However, if I convert a pandas DataFrame to an ndarray with df.a

4条回答
  •  醉酒成梦
    2021-02-20 08:25

    OK, here is where I'm leaning:

    class NDArrayWithColumns(np.ndarray):
        """An ``ndarray`` view that carries a tuple of column labels.

        The labels live in the ``columns`` attribute (``None`` when there are
        none) and are propagated to views and slices via
        ``__array_finalize__``.
        """

        def __new__(cls, obj, columns=None):
            """Wrap *obj* as an instance of this class without copying ndarrays.

            Args:
                obj: An ndarray, or anything ``np.asarray`` can convert.
                columns: Optional iterable of column labels.
            """
            # np.asarray lets lists and other array-likes through; an existing
            # ndarray is passed along untouched, so the view shares its data.
            wrapped = np.asarray(obj).view(cls)
            # Store labels as a tuple so truth-testing them in __str__ is
            # well defined even when an array-like of labels is passed in.
            wrapped.columns = None if columns is None else tuple(columns)
            return wrapped

        def __array_finalize__(self, obj):
            # Called for views, slices and templates; inherit the labels from
            # the source array.  getattr on None falls back to the default, so
            # ``columns`` is always defined, even for explicit construction.
            self.columns = getattr(obj, 'columns', None)

        @classmethod
        def from_dataframe(cls, df):
            """Build a labelled array from a pandas DataFrame.

            Args:
                df: The DataFrame whose values and column labels are taken.
            """
            labels = tuple(df.columns)
            # DataFrame.as_matrix was removed in pandas 1.0; to_numpy() returns
            # all columns in their existing order.
            return cls.from_array(df.to_numpy(), labels)

        @classmethod
        def from_array(cls, array, columns):
            """Return *array* as a labelled array.

            An array that is already an instance of this class is returned
            unchanged (its own labels are kept and *columns* is ignored).

            Args:
                array: An ndarray or array-like.
                columns: Iterable of column labels, or ``None`` for no labels.
            """
            if isinstance(array, cls):
                return array
            return cls(array, columns)

        def __str__(self):
            """Render the array, preceded by a ``# col, col`` header if labelled."""
            body = np.ndarray.__str__(self)
            if not self.columns:
                return body
            # str() each label so non-string labels (e.g. ints) do not break join.
            header = "# " + ", ".join(str(label) for label in self.columns) + "\n"
            return header + body
    
    # Demo: build a small frame containing a missing value, wrap it, and show
    # that the column labels travel with the array while the dtype stays numeric.
    NAN = float("nan")
    X = pd.DataFrame(dict(age=[40., NAN, 60.], sys_blood_pressure=[140.,150.,160.]))
    arr = NDArrayWithColumns.from_dataframe(X)
    # Call-style print with a single argument behaves the same on Python 2 and 3.
    print(arr)
    print(arr.columns)
    print(arr.dtype)
    

    Gives:

    # age, sys_blood_pressure
    [[  40.  140.]
     [  nan  150.]
     [  60.  160.]]
    ('age', 'sys_blood_pressure')
    float64
    

    and can also be passed to a typed Cython function expecting an ndarray[2,double_t].

    UPDATE: this works pretty well, except for some oddness when passing the type to ufuncs.

提交回复
热议问题