How to recover original indices for a flattened Numpy array?

后端 未结 6 503
逝去的感伤
逝去的感伤 2021-01-02 19:49

I've got a multidimensional numpy array that I'm trying to stick into a pandas data frame. I'd like to flatten the array, and create a pandas index that reflects the pre-flattened array indices.

6条回答
  •  情深已故
    2021-01-02 20:44

    You could use pd.MultiIndex.from_product:

    import numpy as np
    import pandas as pd
    import string
    
    def using_multiindex(A, columns):
        """Flatten ``A`` into a DataFrame that keeps each element's original indices.

        A ``MultiIndex`` is built from the product of ``range(extent)`` for every
        axis of ``A`` and named with ``columns``; it is then reset so the
        per-axis indices become ordinary columns, followed by the flattened
        values in column ``'A'``.

        Parameters
        ----------
        A : numpy.ndarray
            Array to flatten.
        columns : list of str
            One name per axis of ``A``, in axis order.

        Returns
        -------
        pandas.DataFrame
            One row per element: the index columns, then ``'A'``.
        """
        axis_ranges = [range(extent) for extent in A.shape]
        coords = pd.MultiIndex.from_product(axis_ranges, names=columns)
        flat = pd.DataFrame({'A': A.flatten()}, index=coords)
        return flat.reset_index()
    
    # Sample 3-D input of shape (2, 3, 4): two blocks, each with 3 rows of 4 values.
    A = np.array([[[ 0.43793885,  0.40078139,  0.48078691,  0.05334248],
        [ 0.76331509,  0.82514441,  0.86169078,  0.86496111],
        [ 0.75572665,  0.80860943,  0.79995337,  0.63123724]],
    
       [[ 0.20648946,  0.57042315,  0.71777265,  0.34155005],
        [ 0.30843717,  0.39381407,  0.12623462,  0.93481552],
        [ 0.3267771 ,  0.64097038,  0.30405215,  0.57726629]]])
    
    # Name the axes Z, Y, X (outermost to innermost) and build the flattened frame.
    df = using_multiindex(A, list('ZYX'))
    

    yields

        Z  Y  X         A
    0   0  0  0  0.437939
    1   0  0  1  0.400781
    2   0  0  2  0.480787
    3   0  0  3  0.053342
    ...
    21  1  2  1  0.640970
    22  1  2  2  0.304052
    23  1  2  3  0.577266
    

    Or, if performance is a top priority, consider using senderle's cartesian_product (named cartesian_product_broadcasted in the code below).


    Here is a benchmark for A with shape (100, 100, 100):

    In [321]: %timeit  using_cartesian_product(A, columns)
    100 loops, best of 3: 13.8 ms per loop
    
    In [318]: %timeit using_multiindex(A, columns)
    10 loops, best of 3: 35.6 ms per loop
    
    In [320]: %timeit indices_merged_arr_generic(A, columns)
    10 loops, best of 3: 29.1 ms per loop
    
    In [319]: %timeit using_product(A)
    1 loop, best of 3: 461 ms per loop
    

    This is the setup I used for the benchmark:

    import numpy as np
    import pandas as pd
    import functools
    import itertools as IT
    import string
    product = IT.product
    
    def cartesian_product_broadcasted(*arrays):
        """
        Return the Cartesian product of the given 1-D arrays as a 2-D array.

        Each row of the result is one combination and column ``k`` holds the
        values drawn from ``arrays[k]``; the last input varies fastest.

        http://stackoverflow.com/a/11146645/190597 (senderle)
        """
        # Open-mesh views of the inputs, broadcast to the full product shape.
        grids = np.broadcast_arrays(*np.ix_(*arrays))
        common_dtype = np.result_type(*arrays)
        n_rows = grids[0].size
        n_cols = len(grids)
        flat = np.empty(n_rows * n_cols, dtype=common_dtype)
        # Lay each broadcast grid out as one contiguous run of the flat buffer.
        for k, grid in enumerate(grids):
            flat[k * n_rows:(k + 1) * n_rows] = grid.reshape(-1)
        # The runs are stored one after another, so transpose to get one
        # combination per row.
        return flat.reshape(n_cols, n_rows).T
    
    def using_cartesian_product(A, columns):
        """Flatten ``A`` into a DataFrame via an explicit Cartesian product of indices.

        The per-axis index ranges of ``A`` are combined with
        ``cartesian_product_broadcasted`` into a coordinate table labelled with
        ``columns``; the flattened values are then attached as column ``'A'``.

        Parameters
        ----------
        A : numpy.ndarray
            Array to flatten.
        columns : list of str
            One name per axis of ``A``, in axis order.

        Returns
        -------
        pandas.DataFrame
            One row per element: the index columns, then ``'A'``.
        """
        axis_indices = [np.arange(extent, dtype='int') for extent in A.shape]
        coord_table = cartesian_product_broadcasted(*axis_indices)
        result = pd.DataFrame(coord_table, columns=columns)
        result['A'] = A.flatten()
        return result
    
    def using_multiindex(A, columns):
        """Return a DataFrame of ``A``'s flattened values with their source indices.

        The product of the per-axis index ranges forms a ``MultiIndex`` named
        by ``columns`` (one name per axis, in axis order).  Resetting that index
        turns the levels into regular columns placed before the value column
        ``'A'``.
        """
        per_axis = list(map(range, A.shape))
        values = {'A': A.flatten()}
        frame = pd.DataFrame(
            values,
            index=pd.MultiIndex.from_product(per_axis, names=columns),
        )
        return frame.reset_index()
    
    def indices_merged_arr_generic(arr, columns):
        """Flatten ``arr`` into a DataFrame of per-axis indices plus the values.

        A buffer of shape ``arr.shape + (ndim + 1,)`` is filled so that, for
        every element, slots ``0 .. ndim-1`` hold its index along each axis and
        the last slot holds the element itself.  The buffer is then reshaped to
        two dimensions, one row per element.

        Parameters
        ----------
        arr : numpy.ndarray
            Array to flatten.
        columns : sequence of str
            One name per axis of ``arr``, in axis order.

        Returns
        -------
        pandas.DataFrame
            Columns are ``columns`` followed by ``'A'`` (the values), the same
            layout the other ``using_*`` helpers in this file produce.
        """
        n = arr.ndim
        # Open grid: grid[i] holds the indices of axis i, shaped to broadcast
        # against the other axes.
        grid = np.ogrid[tuple(map(slice, arr.shape))]
        # Indices and values share one buffer, so its dtype must represent
        # both; using arr.dtype alone would mangle the indices of bool or
        # small-integer arrays.
        out_dtype = np.result_type(arr.dtype, np.intp)
        out = np.empty(arr.shape + (n + 1,), dtype=out_dtype)
        for i in range(n):
            out[..., i] = grid[i]
        out[..., -1] = arr
        # The values sit in the LAST slot, so 'A' must be the last label; the
        # index names come first, in axis order.
        labels = list(columns) + ['A']
        return pd.DataFrame(out.reshape(-1, n + 1), columns=labels)
    
    def using_product(A):
        """Flatten a 3-D array into a DataFrame using ``itertools.product``.

        The flattened values form the first column (labelled ``0`` by pandas);
        columns ``x``, ``y`` and ``z`` hold each element's index along the
        first, second and third axis respectively.  ``A`` must have exactly
        three dimensions, because its shape is unpacked into three extents.
        """
        nx, ny, nz = A.shape
        # Every (x, y, z) triple, with the last axis varying fastest.
        triples = product(range(nx), range(ny), range(nz))
        # Transpose the triples into three parallel index sequences.
        xs, ys, zs = zip(*triples)
        values = pd.DataFrame(A.flatten())
        return values.assign(x=xs, y=ys, z=zs)
    
    # Benchmark input: a 100 x 100 x 100 cube of uniform random floats.
    A = np.random.random(size=(100, 100, 100))
    shape = A.shape
    # One label per axis, taken from the end of the alphabet and reversed:
    # ['Z', 'Y', 'X'] for the 3-D array above.
    trailing_letters = string.ascii_uppercase[-len(shape):]
    columns = list(reversed(trailing_letters))
    

提交回复
热议问题