Question
** Note: I have added, below my original code, an amended version that works for what I need.
Good afternoon, all.
There are many questions around combining CSV data, but so far I haven't found anything that covers my requirements.
I have large fixed-header CSVs that:
1) are produced over a 12-hour period; I need to look up a week's worth of CSVs to merge,
2) need to be filtered on two columns' values (too many rows otherwise),
3) are appended vertically into a single CSV 'master sheet' named after the date of the last shift.
** Files are currently coming out as individual CSVs; I need them appended into one.
** FYI - a sample of the data set follows the code (the real files have 16 columns of data; I cut some out for this post).
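For context, the whole job can be sketched in a few lines of pandas (my script below imports pandas but never actually uses it). This is only a sketch, not what I am running: the filter column names opreason and model are assumptions standing in for the two real filter columns, and it assumes the second header row holds the column names and that file names start with a YYYY-MM-DD date.

import glob, os
import pandas as pd

path = r'E:\Rs'                                    # same folder my script scans
files = sorted( glob.glob( os.path.join( path, '*.csv' ) ) )

frames = []
for f in files:
    df = pd.read_csv( f, skiprows=1 )              # skip the first header row; adjust to the real header block
    # hypothetical names for the two filter columns
    df = df[ (df['opreason'] == 'OPERATING') & (df['model'] == 'truck') ]
    frames.append( df )

master = pd.concat( frames, ignore_index=True )    # append vertically into one master sheet
last_date = os.path.basename( files[-1] ).split('_')[0]   # assumes YYYY-MM-DD_... file names
master.to_csv( os.path.join( path, last_date + '_master.csv' ), index=False )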
Below is what I have so far. Apologies for the mess!
import os, csv
import pandas as pd
import io
import glob
from datetime import date
import time
import collections

# Process data and filter #
def ProcessData( data ):
    processedData = []
    for row in data:
        if row[ 15 ] == 'OPERATING':
            outputRow = row[ 0:3 ] + row[ 15:17 ]
            processedData.append( outputRow )
    return processedData

# Process and write #
def ProcessAndWrite( data, filename ):
    processedData = ProcessData( data )
    name, ext = os.path.splitext( filename )
    outputfilename = name + '_week_combined.csv'
    print( "writing data to " + str( outputfilename ) )
    with open( outputfilename, 'w', newline='' ) as csvfile:
        writer = csv.writer( csvfile )
        for row in processedData:
            writer.writerow( row )
# select the correct week's worth of files #
def filedate( filenames ):
    date_name_list = []
    for file in filenames:
        folder, file_name = os.path.split( file )
        # group files by their modification date
        file_date = time.strftime( "%y-%m-%d", time.localtime( os.path.getmtime( file ) ) )
        date_name_list.append( ( file_date, file_name ) )
    date_count_dict = collections.defaultdict( int )
    date_name_dict = {}
    for date, name in date_name_list:
        date_count_dict[ date ] += 1
        date_name_dict.setdefault( date, [] ).append( name )
    import pprint
    print( "Files with the same date:" )
    pprint.pprint( date_name_dict )
    print( '-' * 60 )
    print( "Same dates count:" )
    pprint.pprint( date_count_dict )
# Main #
if __name__ == "__main__":
    path = r'E:\Rs'
    filenames = glob.glob( os.path.join( path, '*.csv' ) )
    filenames.sort()
    filedate( filenames )                    # report which files share a date
    data = []
    for filename in filenames:
        with open( filename, 'r' ) as csvfile:
            reader = csv.reader( csvfile, delimiter=',' )
            header = []
            for headerCount in range( 2 ):   # skip the two header rows
                header.append( next( reader ) )
            data.extend( [ row for row in reader ] )
    if len( data ) > 0:
        ProcessAndWrite( data, filename )
Data set (16 columns in the real files; trimmed here):

position_x, position_y, position_z, start_time, opreason, stage, header 2, header 2, header 2, header 2, header 2, header 2
649794, 4764274, 1147, 2/11/2016 00:00, OPERATING, sound,
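The key addition in the amended script below is a per-row duration: the start and end timestamps are parsed and differenced. A self-contained illustration of that calculation, using made-up timestamps in the %Y-%m-%d %H:%M:%S format the script expects:

import datetime

# hypothetical start/end values standing in for row[3] and row[4]
n1 = datetime.datetime.strptime( '2016-02-11 00:00:00', '%Y-%m-%d %H:%M:%S' )
n2 = datetime.datetime.strptime( '2016-02-11 00:12:30', '%Y-%m-%d %H:%M:%S' )
diff = n2 - n1
print( diff.total_seconds() )   # 750.0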
Amended Script that works for my purpose
import os, csv                       # csv library for reading/writing
import glob
import datetime

def ProcessData( data ):             # Function definition: filter data
    processedData = []               # empty processed-data list
    for row in data:
        if row[ 15 ] == 'OPERATING' and row[ 6 ] == 'truck':   # keep only rows for operating trucks
            n1 = datetime.datetime.strptime( row[ 3 ], '%Y-%m-%d %H:%M:%S' )   # parse start time for duration calc
            n2 = datetime.datetime.strptime( row[ 4 ], '%Y-%m-%d %H:%M:%S' )   # parse end time for duration calc
            diff = n2 - n1           # duration calc
            outputRow = row[ 0:3 ] + row[ 3:5 ] + [ diff.total_seconds() ]
            processedData.append( outputRow )   # append the trimmed row plus its duration
    return processedData             # final processed data

def ProcessAndWrite( data, filename ):   # Function definition: write data
    processedData = ProcessData( data )
    name, ext = os.path.splitext( filename )   # reuse the source file's name for the weekly master sheet
    outputfilename = name + '_week_combined.csv'
    print( "writing data to " + str( outputfilename ) )   # screen output describing the file to look for
    with open( outputfilename, 'w', newline='' ) as csvfile:   # newline='' so the csv module controls line endings
        writer = csv.writer( csvfile )
        # write the header row into the csv
        writer.writerow( ['position_x', 'position_y', 'position_z', 'start_time', 'end_time', 'model', 'number', 'speed', 'o', 'stage', 'duration', 'cumulative_duration'] )
        for row in processedData:
            writer.writerow( row )
if __name__ == "__main__":              # run the script directly (not imported)
    path = r'E:\\'                      # folder containing the CSVs to merge
    filenames = glob.glob( os.path.join( path, '*.csv' ) )   # select the files to merge
    filenames.sort()                    # sort so the files are processed in date order
    data = []                           # blank data list

    def dateFromFilename( name ):       # pull the shift date out of a file name
        path, filename = os.path.split( name )
        splitName = filename.split( '_' )
        dateStr = splitName[ 0 ]        # file names start with a YYYY-MM-DD date
        date = datetime.datetime.strptime( dateStr, '%Y-%m-%d' )
        return date                     # must return an actual value!

    firstFileDate = None
    lastFilename = None
    for filename in filenames:
        currentFileDate = dateFromFilename( filename )
        if firstFileDate:
            diff = currentFileDate - firstFileDate
            if diff.days >= 1:          # a new day: flush the previous 24 hrs worth of data
                ProcessAndWrite( data, lastFilename )   # write the accumulated data
                data = []
        firstFileDate = currentFileDate
        lastFilename = filename
        with open( filename, 'r' ) as csvfile:
            reader = csv.reader( csvfile, delimiter=',' )
            header = []
            for headerCount in range( 3 ):   # skip the three header rows
                header.append( next( reader ) )
            data.extend( [ row for row in reader ] )   # stack this csv's rows onto the accumulated data
    if len( data ) > 0:                 # write whatever is left after the last file
        ProcessAndWrite( data, filename )
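As written, the loop flushes whenever the file date advances by a day, so each master sheet covers roughly 24 hours. To batch a full week instead, firstFileDate has to mark the start of the week rather than the previous file. A minimal sketch of that change, reusing dateFromFilename and ProcessAndWrite from the script above and the same assumed YYYY-MM-DD_... file names:

firstFileDate = None
lastFilename = None
for filename in filenames:
    currentFileDate = dateFromFilename( filename )
    if firstFileDate is None:
        firstFileDate = currentFileDate            # start of the current week's batch
    elif ( currentFileDate - firstFileDate ).days >= 7:
        ProcessAndWrite( data, lastFilename )      # flush a full week, named after its last shift
        data = []
        firstFileDate = currentFileDate            # restart the week from this file
    lastFilename = filename
    # ...read the file's rows into data as above, then flush the remainder after the loop...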
Source: https://stackoverflow.com/questions/42382696/python-batch-combine-multiple-large-csv-filter-data-skip-header-appending-v