splitting one csv into multiple files in python

前端未结

关注

 10  2110

I have a CSV file of about 5000 rows, and I want to split it into five files using Python.

I wrote some code for it, but it is not working.

import codecs
import c


                      
              相关标签:


      
      
        
          10条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  北荒        
                
              
                            
                2020-12-05 07:45
              
            
            
                                                                       
I have modified the accepted answer a little bit to make it simpler

Edited: Added the import statements and modified the print statement that reports the exception. @Alex F's code snippet was written for Python 2; for Python 3 you also need to use headers_row = rows.__next__() (or, more idiomatically, next(rows)) instead of headers_row = rows.next(). Thanks for pointing that out.

import os
import csv
def split_csv_into_chunks(file_location, out_dir, file_size=2):
    """Split a CSV file into several smaller CSV files.

    The first row of the source file is treated as the header and is
    repeated at the top of every output file. Output files are named
    ``<source name up to its first dot>__<n>.csv`` with ``n`` starting at 1.

    Args:
        file_location: Path of the CSV file to split.
        out_dir: Directory receiving the pieces; created if it is missing.
        file_size: Maximum number of data rows (not bytes) per output file.

    Returns:
        A ``(success, paths)`` tuple. ``success`` is ``False`` when any
        exception occurred while reading or writing; ``paths`` lists the
        output files that were opened up to that point.
    """
    current_piece = 1

    # basename() copes with the platform's path separators; keeping only the
    # text before the first dot preserves the original naming scheme.
    file_name = os.path.basename(file_location).split(".")[0]
    split_file_name_template = file_name + "__%s.csv"
    splited_files_path = []

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    current_out_file = None
    try:
        # The csv module needs text-mode files opened with newline="" on
        # Python 3 so that it can handle line endings itself.
        with open(file_location, "r", newline="") as csv_file:
            rows = csv.reader(csv_file, delimiter=",")
            headers_row = next(rows)
            for count, row in enumerate(rows):
                if count % file_size == 0:
                    # Finish the previous piece before starting a new one so
                    # that at most one output handle is open at a time.
                    if current_out_file is not None:
                        current_out_file.close()
                    current_out_path = os.path.join(
                        out_dir, split_file_name_template % str(current_piece))
                    current_out_file = open(current_out_path, "w", newline="")
                    current_out_writer = csv.writer(current_out_file,
                                                    delimiter=",")
                    current_out_writer.writerow(headers_row)
                    splited_files_path.append(current_out_path)
                    current_piece += 1

                current_out_writer.writerow(row)
        return True, splited_files_path
    except Exception as e:
        # Deliberately broad: the contract is to report failure through the
        # return value instead of propagating the error.
        print("Exception occurred as {}".format(e))
        return False, splited_files_path
    finally:
        # Flush and release the last piece on both success and failure.
        if current_out_file is not None:
            current_out_file.close()

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  情话喂你        
                
              
                            
                2020-12-05 07:46
              
            
            
                                                                       
A python3-friendly solution:
def split_csv(source_filepath, dest_folder, split_file_prefix,
              records_per_file):
    """
    Split a source csv into multiple csvs of equal numbers of records,
    except the last file, which holds whatever records remain.

    The initial header row is repeated at the top of each split file.

    Split files follow a zero-index sequential naming convention like so:

        `{split_file_prefix}_0.csv`

    Args:
        source_filepath: Path of the csv file to split.
        dest_folder: Existing directory in which the split files are written.
        split_file_prefix: Prefix used when naming each split file.
        records_per_file: Maximum number of data rows per split file.

    Raises:
        ValueError: If ``records_per_file`` is not greater than zero.
        StopIteration: If the source file is empty (it has no header row).
    """
    from itertools import islice

    if records_per_file <= 0:
        raise ValueError('records_per_file must be > 0')

    # newline='' lets the csv module manage line endings itself, which avoids
    # blank lines between rows on Windows.
    with open(source_filepath, 'r', newline='') as source:
        reader = csv.reader(source)
        headers = next(reader)

        file_idx = 0
        # Look one record ahead so that a target file is only created when
        # there is at least one record to put in it.
        first_record = next(reader, None)
        while first_record is not None:
            target_filename = f'{split_file_prefix}_{file_idx}.csv'
            target_filepath = os.path.join(dest_folder, target_filename)

            with open(target_filepath, 'w', newline='') as target:
                writer = csv.writer(target)
                writer.writerow(headers)
                writer.writerow(first_record)
                # Stream the rest of this file's quota straight from the
                # reader without buffering the rows in memory.
                writer.writerows(islice(reader, records_per_file - 1))

            first_record = next(reader, None)
            file_idx += 1

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  臣服心动        
                
              
                            
                2020-12-05 07:48
              
            
            
                                                                       
Another pandas solution (each 1000 rows), similar to Aziz Alto solution:
# Write df out in consecutive 1000-row slices; output files are numbered
# from 1 via `suffix`. `df` and `filename` must already be defined.
suffix = 1
for start in range(0, len(df), 1000):
    chunk = df[start:start + 1000]
    chunk.to_csv(f"processed/{filename}_{suffix}.csv", sep='|', index=False, index_label=False)
    suffix += 1

where df is the CSV loaded as a pandas.DataFrame, filename is the original file name, and the pipe is the separator; index=False and index_label=False skip the auto-incremented index column.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  难免孤独        
                
              
                            
                2020-12-05 07:54
              
            
            
                                                                       
# NOTE: `count` is compared with itself here, so the condition is always
# true and the no-op `pass` branch is taken every time.
if count <= count:
   pass


This condition is always true, so you hit the pass statement every time.

Otherwise you can look at this post: Splitting a CSV file into equal parts?
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  南笙        
                
              
                            
                2020-12-05 07:54
              
            
            
                                                                       
A simple Python 3 solution with Pandas that doesn't cut off the last batch
def to_csv_batch(src_csv, dst_dir, size=30000, index=False):
    """Split a CSV file into numbered batch files of at most ``size`` rows.

    The source is loaded with pandas and written out as ``Batch_1.csv``,
    ``Batch_2.csv``, ... inside ``dst_dir``. The final batch holds whatever
    rows remain, so no rows are cut off.

    Args:
        src_csv: Path of the CSV file to split.
        dst_dir: Directory receiving the batch files; created if missing.
        size: Maximum number of rows per batch file.
        index: Passed to ``DataFrame.to_csv`` as its ``index`` argument.

    Raises:
        ValueError: If ``size`` is not greater than zero.
    """
    import os

    import pandas as pd

    if size <= 0:
        raise ValueError('size must be > 0')

    # Read source csv
    df = pd.read_csv(src_csv)

    # Make sure the destination exists so the first write cannot fail on a
    # missing directory (an empty dst_dir means the current directory).
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)

    # Walk over the start offset of each batch; slicing past the end of the
    # frame is safe, so the last batch simply comes out shorter.
    for batch_no, low in enumerate(range(0, len(df), size), start=1):
        fname = os.path.join(dst_dir, 'Batch_' + str(batch_no) + '.csv')
        df[low:low + size].to_csv(fname, index=index)

Usage example
to_csv_batch('Batch_All.csv', 'Batches')

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  爱一瞬间的悲伤        
                
              
                            
                2020-12-05 07:57
              
            
            
                                                                       
@Ryan, Python3 code worked for me. I used newline='' as below to avoid the blank line issue:

with open(target_filepath, 'w', newline='') as target:

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
   
          
     1
2
下一页
           
           
        
                                  
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复