I have a csv file of about 5000 rows in python i want to split it into five files.
I wrote a code for it but it is not working
import codecs
import c
I have modified the accepted answer a little bit to make it simpler
Edited: Added the import statement, modified the print statement for printing the exception. @Alex F code snippet was written for python2, for python3 you also need to use header_row = rows.__next__()
instead header_row = rows.next()
. Thanks for pointing out.
import os
import csv
def split_csv_into_chunks(file_location, out_dir, file_size=2):
count = 0
current_piece = 1
# file_to_split_name.csv
file_name = file_location.split("/")[-1].split(".")[0]
split_file_name_template = file_name + "__%s.csv"
splited_files_path = []
if not os.path.exists(out_dir):
os.makedirs(out_dir)
try:
with open(file_location, "rb") as csv_file:
rows = csv.reader(csv_file, delimiter=",")
headers_row = rows.next()
for row in rows:
if count % file_size == 0:
current_out_path = os.path.join(out_dir,
split_file_name_template%str(current_piece))
current_out_writer = None
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=",")
current_out_writer.writerow(headers_row)
splited_files_path.append(current_out_path)
current_piece += 1
current_out_writer.writerow(row)
count += 1
return True, splited_files_path
except Exception as e:
print("Exception occurred as {}".format(e))
return False, splited_files_path
A python3-friendly solution:
def split_csv(source_filepath, dest_folder, split_file_prefix,
records_per_file):
"""
Split a source csv into multiple csvs of equal numbers of records,
except the last file.
Includes the initial header row in each split file.
Split files follow a zero-index sequential naming convention like so:
`{split_file_prefix}_0.csv`
"""
if records_per_file <= 0:
raise Exception('records_per_file must be > 0')
with open(source_filepath, 'r') as source:
reader = csv.reader(source)
headers = next(reader)
file_idx = 0
records_exist = True
while records_exist:
i = 0
target_filename = f'{split_file_prefix}_{file_idx}.csv'
target_filepath = os.path.join(dest_folder, target_filename)
with open(target_filepath, 'w') as target:
writer = csv.writer(target)
while i < records_per_file:
if i == 0:
writer.writerow(headers)
try:
writer.writerow(next(reader))
i += 1
except StopIteration:
records_exist = False
break
if i == 0:
# we only wrote the header, so delete that file
os.remove(target_filepath)
file_idx += 1
Another pandas solution (each 1000 rows), similar to Aziz Alto solution:
suffix = 1
for i in range(len(df)):
if i % 1000 == 0:
df[i:i+1000].to_csv(f"processed/{filename}_{suffix}.csv", sep ='|', index=False, index_label=False)
suffix += 1
where df
is the csv loaded as pandas.DataFrame; filename
is the original filename, the pipe is a separator; index
and index_label
false is to skip the autoincremented index columns
if count <= count:
pass
This condition is always true so you pass everytime
Otherwise you can look at this post: Splitting a CSV file into equal parts?
A simple Python 3 solution with Pandas that doesn't cut off the last batch
def to_csv_batch(src_csv, dst_dir, size=30000, index=False):
import pandas as pd
import math
# Read source csv
df = pd.read_csv(src_csv)
# Initial values
low = 0
high = size
# Loop through batches
for i in range(math.ceil(len(df) / size)):
fname = dst_dir+'/Batch_' + str(i+1) + '.csv'
df[low:high].to_csv(fname, index=index)
# Update selection
low = high
if (high + size < len(df)):
high = high + size
else:
high = len(df)
Usage example
to_csv_batch('Batch_All.csv', 'Batches')
@Ryan, Python3 code worked for me. I used newline=''
as below to avoid the blank line issue:
with open(target_filepath, 'w', newline='') as target: