How to merge CSV files into a single file based on a condition, adding the file name as a column?

前端 未结 1 1176
小鲜肉
小鲜肉 2020-12-22 03:00

I have multiple CSV files in a folder. The column headers are different, but the column data are the same.

The number inside the brackets is the actual column name. For example, in Item(67), 67 is the actual column name.

相关标签:
1条回答
  • 2020-12-22 03:53

    Use:

    # Collect every CSV in the folder. glob returns paths in arbitrary order,
    # so sort them to make the row order of the combined frame reproducible.
    files = sorted(glob.glob('shelldemo/*.csv'))

    dfs = []
    for fp in files:
        # If several identifier columns carry no "(number)" suffix, put all of
        # them into the index instead, e.g.
        # index_col=['S.no', 'id', 'number'].

        # 'ID' goes into the index so that only the "Name(number)" value
        # columns remain as regular columns and are renamed below.
        df = pd.read_csv(fp, index_col=['ID'])
        # Tag every row with its source file name (text before the first dot)
        # and move it into the index as well, out of the way of the renaming.
        df['file'] = os.path.basename(fp).split('.')[0]
        df = df.set_index('file', append=True)
        # Rename each column to the integer inside its parentheses,
        # e.g. "Item(67)" -> 67. A raw string is used so that \( and \d reach
        # the regex engine without invalid-escape-sequence warnings.
        df.columns = df.columns.str.extract(r'\((\d+)\)', expand=False).astype(int)
        dfs.append(df)


    # Stack all files; columns missing from a file are filled with NaN, and
    # 'ID' / 'file' become ordinary columns again.
    df1 = pd.concat(dfs, sort=False).reset_index()
    print(df1)
       ID   file     58   67     89     91     96    100
    0   1  file1    NaN   56   78.0   98.0    NaN  101.0
    1   2  file1    NaN   91  100.0  121.0    NaN    NaN
    2   3  file2  102.0  103    NaN    NaN  101.0  104.0
    3   4  file2  113.0  117    NaN    NaN  112.0  119.0
    

    # df2 is not created in this snippet; its contents are shown in the
    # output printed below.
    print (df2)
        File  Price1  Price2  Price3  Price4
    0  File1      67      89      91     100
    1  File2      96      58     105      99
    

    # Lower-case the headers of df2 and the values of its 'file' column so
    # both line up with the 'file' column of df1.
    lowered_headers = df2.columns.str.lower()
    df2.columns = lowered_headers
    lowered_files = df2['file'].str.lower()
    df2['file'] = lowered_files

    # Left join: keep every row of df1 and attach the df2 columns of the
    # row with the same 'file' value.
    df = pd.merge(df1, df2, how='left', on='file')
    print(df)
       ID   file     58   67     89     91     96    100  price1  price2  price3  \
    0   1  file1    NaN   56   78.0   98.0    NaN  101.0      67      89      91   
    1   2  file1    NaN   91  100.0  121.0    NaN    NaN      67      89      91   
    2   3  file2  102.0  103    NaN    NaN  101.0  104.0      96      58     105   
    3   4  file2  113.0  117    NaN    NaN  112.0  119.0      96      58     105   
    
       price4  
    0     100  
    1     100  
    2      99  
    3      99  
    

    # Keep only the columns whose labels are not strings, i.e. the integer
    # codes taken from between "(...)": .str.isnumeric() yields NaN for
    # non-string labels, so .isnull() is True exactly for those columns.
    df1 = df.loc[:, df.columns.str.isnumeric().isnull()].copy()
    # All columns whose name contains "price"; each cell holds a code that
    # names a column of df1.
    df2 = df.filter(regex='price').copy()

    # Codes referenced in df2 that have no column in df1 get an all-NaN
    # placeholder column, so looking them up yields NaN instead of failing.
    # The extra 'a' column is the lookup target for missing codes (NaN).
    uniq_vals_df2 = df2.stack().dropna().drop_duplicates()
    not_matched_vals = np.setdiff1d(uniq_vals_df2, df1.columns)
    df1 = df1.join(pd.DataFrame(columns=not_matched_vals.tolist() + ['a']))

    # Replace every code in df2 by the value found in the same row of df1.
    # DataFrame.lookup was deprecated in pandas 1.2 and removed in 2.0, so
    # the labels are translated to positions with get_indexer and the cells
    # are picked with NumPy integer indexing instead.
    cell_values = df1.to_numpy()
    row_pos = np.arange(len(df1))
    for c in df2.columns:
        col_pos = df1.columns.get_indexer(df2[c].fillna('a'))
        if (col_pos == -1).any():
            # Fail loudly like lookup() did; -1 would otherwise silently
            # select the last column.
            raise KeyError('One or more column labels was not found')
        picked = cell_values[row_pos, col_pos]
        # The placeholder columns make df1 object-typed; infer_objects()
        # restores a numeric dtype, mirroring what lookup() returned.
        df2[c] = pd.Series(picked, index=df2.index).infer_objects()
    # Join the looked-up values back to the identifying columns.
    df = df[['file','ID']].join(df2)


    print(df)
    
        file  ID  price1  price2  price3  price4
    0  file1   1    56.0    78.0    98.0   101.0
    1  file1   2    91.0   100.0   121.0     NaN
    2  file2   3   101.0   102.0     NaN     NaN
    3  file2   4   112.0   113.0     NaN     NaN
    
    0 讨论(0)
提交回复
热议问题