Python - Batch combine Multiple large CSV, filter data, skip header, appending vertically into a single CSV

Submitted by 萝らか妹 on 2019-12-08 10:32:11

Question


** Note: I have modified my original code below to show code that works for what I need

Good afternoon, all.

There are many questions around combining CSV data, but so far I haven't found anything that helps with my code requirements.

I have large fixed-header CSVs that:

1) are produced over a 12-hour period; I need to look up a week's worth of CSVs to merge
2) need filtering on two columns' information per CSV (too many rows otherwise)
3) need appending vertically into a single CSV 'master sheet' named after the date of the last shift

** Files are currently coming out as individual CSVs; I need them appended into one.

** FYI: a sample of the data set follows the code (there are 16 columns of data; I cut some out for this purpose).
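Roughly what I'm after, sketched with pandas (untested; I filter on columns 15 and 6 as in the amended script at the bottom, and assume every file has two fixed header rows):

import glob, os
import pandas as pd

path = r'E:\Rs'
filenames = sorted(glob.glob(os.path.join(path, '*.csv')))

frames = []
for filename in filenames:
    df = pd.read_csv(filename, skiprows=2, header=None)          # skip the two fixed header rows
    frames.append(df)

combined = pd.concat(frames, ignore_index=True)                  # append all files vertically
mask = (combined[15] == 'OPERATING') & (combined[6] == 'truck')  # filter on the two columns
combined[mask].to_csv(os.path.join(path, 'week_combined.csv'), index=False, header=False)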

Below is what I have so far. Apologies for the mess!

import os, csv                                                                 
import pandas as pd
import io
import glob
from datetime import date                                                      
import time
import collections

# Process data and filter #

def ProcessData( data ):                                                        
    processedData = []                                                          

    for row in data:
        if row[ 15 ] == ( 'OPERATING' ):                                        
            outputRow = row[ 0:3 ] + row[ 15:17 ]                               
            processedData.append( outputRow )                           

    return processedData                                                             

# Process and write #

def ProcessAndWrite( data, filename ):                                               
    processedData = ProcessData( data ) 

    name, ext = os.path.splitext( filename )                                         
    outputfilename = name + '_week_combined.csv'                                

    print "writing data to " + str( outputfilename )                                

    with open(outputfilename, 'wb') as csvfile:                                      
        writer = csv.writer(csvfile)
        for row in processedData:
            writer.writerow(row)

# select the correct weeks worth of files #   

def filedate( data, datetime ):                                                    
    root = 'E:\\Rs\\'                                                           

    date_outputfilename_list = []
    date_name_list = []
    for file in date_outputfilename_list:                       # note: this list is never populated yet
        folder, file_name = os.path.split(file[1])
        file_date = time.strftime("%y-%m-%d", file[0])
        date_name_list.append((file_date, file_name))

    date_count_dict = collections.defaultdict( int )
    date_name_dict = {}
    for date, name in date_name_list:
        date_count_dict[date] += 1
        date_name_dict.setdefault(date, []).append(name)

    import pprint
    print("Files with the same date:")
    pprint.pprint(date_name_dict)
    print('-'*60)
    print("Same dates count:")
    pprint.pprint(date_count_dict)

# Function #

if __name__ == "__main__":                                                          

    import sys
    path = r'E:\Rs'                                                             
    filenames = glob.glob(os.path.join(path, '*.csv'))                              
    filenames.sort()                                                               

    data = []                                                                       

    for filename in filenames:                                                      


        with open(filename, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter = ',')                       
            header = []                                                         
            for headerCount in range( 2 ):                                      
                header.append(next(reader))
            data.extend( [ row for row in reader ] )                          


        if( filedate ):                                                         #   always True: this tests the function object, not a call to it
            ProcessAndWrite( data, filename )
            data = [ProcessData]                                                #   puts the ProcessData function object in the list rather than clearing it


    if ( len( data ) > 0 ):
        ProcessAndWrite( data, filename )        
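(What filedate above is trying to do, sketched more directly: group the CSV filenames by the date embedded in them. This assumes the names carry a date prefix like '2016-02-11_...csv', which is also what the amended script below relies on.)

import collections, glob, os

date_name_dict = collections.defaultdict(list)
date_count_dict = collections.defaultdict(int)

for filepath in glob.glob(os.path.join(r'E:\Rs', '*.csv')):
    file_name = os.path.basename(filepath)
    file_date = file_name.split('_')[0]              # assumes a 'YYYY-MM-DD_' prefix on each file name
    date_name_dict[file_date].append(file_name)
    date_count_dict[file_date] += 1

print("Files with the same date:")
print(dict(date_name_dict))
print("Same dates count:")
print(dict(date_count_dict))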

Data set (abridged; header row plus the first row of data):

position_x, position_y, position_z, start_time, opreason, stage, header 2, header 2, header 2, header 2, header 2, header 2
649794, 4764274, 1147, 2/11/2016 00:00, OPERATING, sound,


Amended script that works for my purpose:

    import os, csv                                                                           # Import os and csv libraries
    import io
    import glob
    import datetime                                                       
    import time
    import collections

    def ProcessData( data ):                                                                #   Function definition: filter data
        processedData = []                                                                  #   empty process data list

        for row in data:
            if (row[ 15 ] == 'OPERATING' and row[ 6 ] == 'truck'):                          #   Filter on the two columns of interest
                n1=datetime.datetime.strptime(row[3], '%Y-%m-%d %H:%M:%S')                  #   Parse the start timestamp for the duration calc
                n2=datetime.datetime.strptime(row[4], '%Y-%m-%d %H:%M:%S')                  #   Parse the end timestamp for the duration calc
                diff = n2 - n1                                                              #   duration calc   
                outputRow = row[ 0:3 ] + row[ 3:5 ] + [diff.total_seconds()]            
                processedData.append( outputRow )                                           #   process the last of the list information from the csv and append new file

        return processedData                                                                #   Final Processed data


    def ProcessAndWrite( data, filename ):                                                  #   Function Definition: Write data
        processedData = ProcessData( data ) 

        name, ext = os.path.splitext( filename )                                            #   Split the file name from the original to define the output as weeks mastersheet
        outputfilename = name + '_week_combined.csv'                                

        print "writing data to " + str( outputfilename )                                    #   Screen output describing file to look for

        with open(outputfilename, 'wb') as csvfile:                                         #   'wb' = binary mode; on Python 2/Windows this stops csv writing blank rows

            writer = csv.writer(csvfile)                                                    #   Next line is a hack to put headers in the csv
            writer.writerow(['position_x','position_y','position_z','start_time','end_time','model','number','speed','o','stage','duration', 'cummulative_duration'])
            for row in processedData:
                writer.writerow(row)


    if __name__ == "__main__":                                                              #   Run script directly through python (not imported)

        import sys
        path = 'E:\\'                                                                       #   Set correct folder location for the file merge
        filenames = glob.glob(os.path.join(path, '*.csv'))                                  #   Select correct files for merge    
        filenames.sort()                                                                    #   Sort the filenames so the files are processed in date order

        data = []                                                                           #   Blank data list

        def dateFromFilename( name ):                                                       #   Function to select the correct files from truck speed folder
            path,filename = os.path.split(name)                                             
            splitName = filename.split('_')
            dateStr = splitName[0]
            date = datetime.datetime.strptime(dateStr,'%Y-%m-%d')                           #   Parse the date portion of the file name
            return date                                                                     #   Return the parsed date

        firstFileDate = None
        lastFilename = None

        for filename in filenames:                                                          #   Select file
            currentFileDate = dateFromFilename( filename )

            if firstFileDate:
                diff = currentFileDate - firstFileDate
                if ( diff.days >= 1 ):                                                      #   diff.days gives whole days; a new day means write out the previous 24 hrs' worth of data
                    ProcessAndWrite( data, lastFilename )                                   #   Call function to write data
                    data = []
            firstFileDate = currentFileDate
            lastFilename = filename

            with open(filename, 'r') as csvfile:                                            #   For new CSV files
                reader = csv.reader(csvfile, delimiter = ',')                               #   read the csv
                header = []                                                                 #   Blank header list (do this to skip the header rows for merge)
                for headerCount in range( 3 ):                                              #   Skip the first three header rows
                    header.append(next(reader))
                data.extend( [ row for row in reader ] )                                    #   extend is to continue the data stacking with the next csv data

        if ( len( data ) > 0 ):                                                             #   If the list of data has data then continue to process and write
            ProcessAndWrite( data, filename )
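One note for anyone adapting this: the script above is Python 2 (print statement, 'wb' file mode). Under Python 3 the equivalent lines in ProcessAndWrite would be roughly:

    print("writing data to " + str(outputfilename))

    with open(outputfilename, 'w', newline='') as csvfile:                                  #   Python 3: text mode, with newline='' so the csv module controls line endings
        writer = csv.writer(csvfile)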

Source: https://stackoverflow.com/questions/42382696/python-batch-combine-multiple-large-csv-filter-data-skip-header-appending-v
