Here is my question.
With a bunch of .csv files (or other files), pandas makes it easy to read them into DataFrames. But when the amount of files grows, reading them one by one gets slow, so I want to read them in parallel, e.g. with multiprocessing.
I could not get map/map_async to work for this, but I managed to get it working with apply_async (a map variant that does work is sketched at the end).
Two possible ways to collect the results (I have no idea which one is better): A) gather everything in a list and concatenate at the end, or B) concatenate into the DataFrame as each result arrives. Both variants are marked with # A) / # B) comments in the code below.
I find glob an easy way to list and filter files in a directory:
from glob import glob
import pandas as pd
from multiprocessing import Pool

folder = "./task_1/"  # note the "/" at the end
file_list = glob(folder + "*.csv")

def my_read(filename):
    f = pd.read_csv(filename)
    # the reshape is specific to my data: each file holds a 75x90 grid of VALUEs;
    # wrap in a DataFrame so pd.concat works, and use to_numpy()
    # (as_matrix() was removed from pandas)
    return pd.DataFrame(f["VALUE"].to_numpy().reshape(75, 90))

#DF_LIST = []         # A) collect results, concat at the end
DF = pd.DataFrame()   # B) concat while results arrive

def DF_LIST_append(result):
    #DF_LIST.append(result)                          # A) end
    global DF                                        # B) during
    DF = pd.concat([DF, result], ignore_index=True)  # B) during

if __name__ == "__main__":  # guard needed on platforms that spawn workers
    pool = Pool(processes=8)
    for file in file_list:
        pool.apply_async(my_read, args=(file,), callback=DF_LIST_append)
    pool.close()
    pool.join()
    #DF = pd.concat(DF_LIST, ignore_index=True)  # A) end
    print(DF.shape)
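For what it's worth, Pool.map should also work here, as long as the worker function lives at module top level so it can be pickled; the usual culprits when map fails are lambdas or functions defined interactively. A minimal sketch, assuming the same my_read and file_list as above:

from multiprocessing import Pool
import pandas as pd

# assumes my_read and file_list are defined at module top level, as above
if __name__ == "__main__":
    with Pool(processes=8) as pool:
        frames = pool.map(my_read, file_list)  # blocks until every file is read
    DF = pd.concat(frames, ignore_index=True)  # results come back in file order
    print(DF.shape)

Unlike the callback approach, map returns the results in the same order as file_list, so variant A) falls out for free.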