I have a CSV file of about 5000 rows, and I want to split it into five files using Python.
I wrote some code for it, but it is not working:
import codecs
import c
In Python
Use readlines()
and writelines()
to do that, here is an example:
>>> csvfile = open('import_1458922827.csv', 'r').readlines()
>>> filename = 1
>>> for i in range(len(csvfile)):
... if i % 1000 == 0:
... open(str(filename) + '.csv', 'w+').writelines(csvfile[i:i+1000])
... filename += 1
the output file names will be numbered 1.csv
, 2.csv
, ... etc.
From terminal
FYI, you can do this from the command line using split
as follows:
$ split -l 1000 import_1458922827.csv
I suggest you not reinvent the wheel; there is an existing solution. Source here
import os
def split(filehandler, delimiter=',', row_limit=1000,
          output_name_template='output_%s.csv', output_path='.', keep_headers=True):
    """
    Split the CSV read from ``filehandler`` into numbered files of at most
    ``row_limit`` data rows each.

    :param filehandler: open text file (or any iterable of lines) holding the CSV.
    :param delimiter: field delimiter used for both reading and writing.
    :param row_limit: maximum number of data rows written to each output file.
    :param output_name_template: ``%``-format template for the output file names;
        it receives the piece number, starting at 1.
    :param output_path: directory in which the output files are created.
    :param keep_headers: when true, the first input row is treated as a header
        and repeated at the top of every output file.
    """
    import csv

    reader = csv.reader(filehandler, delimiter=delimiter)
    # next(reader, None) works on Python 3 (reader.next() does not) and
    # tolerates a completely empty input instead of raising StopIteration.
    headers = next(reader, None) if keep_headers else None

    def open_piece(piece_number):
        # Open output file number `piece_number` and write the header, if any.
        piece_path = os.path.join(output_path, output_name_template % piece_number)
        # newline='' lets the csv module control line endings itself.
        handle = open(piece_path, 'w', newline='')
        writer = csv.writer(handle, delimiter=delimiter)
        if headers is not None:
            writer.writerow(headers)
        return handle, writer

    current_piece = 1
    current_file, current_out_writer = open_piece(current_piece)
    try:
        for i, row in enumerate(reader):
            # Row i (0-based) is the first row past the current piece once
            # i reaches row_limit * current_piece.
            if i >= row_limit * current_piece:
                current_file.close()
                current_piece += 1
                current_file, current_out_writer = open_piece(current_piece)
            current_out_writer.writerow(row)
    finally:
        # Always close the last piece so its buffered rows reach the disk.
        current_file.close()
Use it like:
# Open the input inside a with-block so it is closed once the split is done.
with open('/your/pat/input.csv', 'r') as source:
    split(source)
I suggest you leverage the possibilities offered by pandas. Here are functions you could use to do that :
def csv_count_rows(file):
    """
    Return how many lines the given file contains.
    :param file: path to the file.
    :return: total number of lines read from the file (0 when it is empty).
    """
    line_total = 0
    with open(file) as handle:
        # Iterate line by line so the whole file is never held in memory.
        for _ in handle:
            line_total += 1
    return line_total
def split_csv(file, sep=",", output_path=".", nrows=None, chunksize=None, low_memory=True, usecols=None):
    """
    Split a csv into several files of at most ``nrows`` data rows each.

    The parts are written to ``output_path`` as ``<name>_part_<i>.csv``, with
    ``i`` starting at 0. Each part repeats the header row of the original file
    and is written without the DataFrame index, so it has the same columns as
    the original (restricted to ``usecols`` when given).

    :param file: path to the original csv.
    :param sep: View pandas.read_csv doc.
    :param output_path: path in which to output the resulting parts of the splitting.
    :param nrows: Number of rows to split the original csv by, also view pandas.read_csv doc.
        Required; must be a positive integer.
    :param chunksize: View pandas.read_csv doc.
    :param low_memory: View pandas.read_csv doc.
    :param usecols: View pandas.read_csv doc.
    :raises ValueError: if ``nrows`` is missing or not positive.
    """
    # Imported locally so the function does not depend on imports made elsewhere.
    import logging
    import math
    import os

    import pandas as pd

    # Validate before dividing by nrows: the default (None) used to fail with
    # an unhelpful TypeError.
    if not nrows or nrows < 1:
        raise ValueError("nrows must be a positive integer")

    nb_of_rows = csv_count_rows(file)
    # The first line is the header, not data.
    # NOTE(review): assumes one physical line per record (no quoted newlines) - confirm for your data.
    nb_of_data_rows = max(nb_of_rows - 1, 0)

    # Parsing file elements : name and name without extension
    file_name = os.path.basename(file)
    file_name_trunk = file_name.split(".")[0]
    split_files_name_trunk = file_name_trunk + "_part_"

    # Number of chunks to partition the original file into
    nb_of_chunks = math.ceil(nb_of_data_rows / nrows)

    log_debug_process_start = f"The file '{file_name}' contains {nb_of_rows} ROWS. " \
                              f"\nIt will be split into {nb_of_chunks} chunks of a max number of rows : {nrows}." \
                              f"\nThe resulting files will be output in '{output_path}' as '{split_files_name_trunk}0 to {nb_of_chunks - 1}'"
    logging.debug(log_debug_process_start)

    for i in range(nb_of_chunks):
        # Line 0 is the header and must be kept. Chunk i starts after the
        # i * nrows data rows already written, i.e. lines 1 .. i * nrows are
        # skipped (the range end is exclusive, hence the + 1).
        rows_to_skip = range(1, i * nrows + 1) if i else None
        output_file = os.path.join(output_path, f"{split_files_name_trunk}{i}.csv")

        log_debug_chunk_processing = f"Processing chunk {i} of the file '{file_name}'"
        logging.debug(log_debug_chunk_processing)

        # Fetching the original csv file and handling it with skiprows and nrows to process its data
        df_chunk = pd.read_csv(filepath_or_buffer=file, sep=sep, nrows=nrows, skiprows=rows_to_skip,
                               chunksize=chunksize, low_memory=low_memory, usecols=usecols)
        if chunksize is not None:
            # With chunksize, read_csv returns an iterator of DataFrames
            # rather than a DataFrame; stitch the pieces back together.
            df_chunk = pd.concat(df_chunk)

        # index=False keeps the part's columns identical to the original file.
        df_chunk.to_csv(path_or_buf=output_file, sep=sep, index=False)

        log_info_file_output = f"Chunk {i} of file '{file_name}' created in '{output_file}'"
        logging.info(log_info_file_output)
And then in your main or jupyter notebook you put :
# This is how you initiate logging in the most basic way.
# NOTE(review): requires `import logging`; no such import is visible in this snippet.
logging.basicConfig(level=logging.DEBUG)
# The {#...} markers below are placeholders, not valid Python: replace each one with a real path string.
file = {#Path to your file}
# sep=";" is forwarded to pandas.read_csv as the delimiter; nrows is the maximum number of rows per part.
split_csv(file,sep=";" ,output_path={#Path where you'd like to output it},nrows = 4000000, low_memory = False)
P.S.1 : I put nrows = 4000000
because it's a personal preference. You can change that number if you wish.
P.S.2 : I used the logging library to display messages. When you apply such a function to big files that live on a remote server, you really want to avoid 'simple printing' and incorporate logging capabilities. You can replace logging.info
or logging.debug
with print
P.S.3 : Of course, you need to replace the {# Blablabla}
parts of the code with your own parameters.
A simpler script works for me.
import pandas as pd
path = "path to file"  # path to the input CSV file
# One output file per chunk; {} is replaced by the chunk number so that
# later chunks do not overwrite earlier ones.
output_template = "Path to output file_{}.csv"
chunk_rows = 1000  # maximum number of data rows per output file

df = pd.read_csv(path)  # reading file

# Step through the frame in chunk_rows-sized slices. Using range() instead of
# a `high < len(df)` test guarantees the final, possibly shorter, slice is
# written too.
for part_number, low in enumerate(range(0, len(df), chunk_rows), start=1):
    high = low + chunk_rows
    df_new = df[low:high]  # subsetting DataFrame based on row position
    # index=False keeps the output columns identical to the input file.
    df_new.to_csv(output_template.format(part_number), index=False)