With Node.js I want to parse a .csv file of 10,000 records and do some operation on each row. I tried using http://www.adaltas.com/projects/node-csv. I couldn't get this to pause.
OK, so there are many answers here, and I don't think they address your question, which I think is similar to mine.
You need to do an operation, like contacting a database or a third-party API, that takes time and is asynchronous. You do not want to load the entire document into memory because it is too large (or for some other reason), so you need to read and process it line by line.
I have read through the fs documentation: a stream can pause reading, but the .on('data') callback makes the flow continuous, and that is what most of these answers use and what causes the problem.
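For reference, the .on('data') pattern those answers rely on looks roughly like this (a sketch; doAsyncWork is a hypothetical per-row handler). pause() and resume() are real stream methods, but managing the flow by hand like this is easy to get wrong, which is the problem described above:
const fs = require('fs');
const csv = require('csv-parser');

const rows = fs.createReadStream(fileNameFull).pipe(csv());
rows.on('data', (row) => {
  rows.pause(); // try to stop 'data' events while we work
  doAsyncWork(row) // hypothetical async operation on the row
    .then(() => rows.resume()); // pull the next row when done
});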
UPDATE: I now know more about Streams than I ever wanted to.
The best way to do this is to create a writable stream. You pipe the CSV data into your writable stream, where you can manage your asynchronous calls. The pipe manages the backpressure in the buffer all the way back to the reader, so you will not wind up with heavy memory usage.
Simple Version
const fs = require('fs');
const parser = require('csv-parser');
const stripBom = require('strip-bom-stream');
const stream = require('stream');

const mySimpleWritable = new stream.Writable({
  objectMode: true, // Because the input is an object from csv-parser
  write(chunk, encoding, done) { // Required
    // chunk is an object with the data from one line of the csv
    console.log('chunk', chunk);
    done();
  },
  final(done) { // Optional
    // Last place to clean up when done
    done();
  }
});

// fileNameFull is the path to your csv file
fs.createReadStream(fileNameFull).pipe(stripBom()).pipe(parser()).pipe(mySimpleWritable);
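If each row needs an asynchronous operation, the simple version handles that as well: just call done() once the work has finished, and the pipe will not send the next line until you do. A sketch, where saveRow is a hypothetical async function (for example a database insert):
const myAsyncWritable = new stream.Writable({
  objectMode: true,
  write(chunk, encoding, done) {
    // saveRow is a hypothetical async operation per row
    saveRow(chunk)
      .then(() => done())       // signals the pipe to send the next line
      .catch(err => done(err)); // passing an error destroys the stream
  }
});

fs.createReadStream(fileNameFull).pipe(stripBom()).pipe(parser()).pipe(myAsyncWritable);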
Class Version
const fs = require('fs');
const parser = require('csv-parser');
const stripBom = require('strip-bom-stream');
const stream = require('stream');

// Create the writable class
class MyWritable extends stream.Writable {
  // Sets object mode because we get an object piped in from csv-parser
  constructor(another_variable, options) {
    // Calls the stream.Writable() constructor
    super({ ...options, objectMode: true });
    // Additional information if you want it
    this.another_variable = another_variable;
  }
  // The write method
  // Called over and over, once for each line in the csv
  async _write(chunk, encoding, done) {
    // chunk will be a line of your csv as an object
    console.log('Chunk Data', this.another_variable, chunk);
    // Demonstrate an await call
    // This pauses the stream until the promise resolves
    await new Promise(resolve => setTimeout(resolve, 2000));
    // Very important to call. Keeps the pipe buffers correct and loads the next line of data
    done();
  }
  // Gets called after all lines have been read
  async _final(done) {
    // Can make more calls here with leftover information in the class
    console.log('clean up');
    // Lets the pipe know it is done; the 'finish' event fires after this
    done();
  }
}

// Instantiate the new writable class
const myWritable = new MyWritable(somevariable);

// Pipe the read stream to csv-parser, then to your writable class
// stripBom is needed because Excel saves csv files in UTF-8 with BOM format
fs.createReadStream(fileNameFull).pipe(stripBom()).pipe(parser()).pipe(myWritable)
  // optional
  .on('finish', () => {
    // Called after the writable's internal _final
    console.log('Called very last');
  });
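One caveat with a chain of .pipe() calls is that an error in one stream does not propagate to the others, so each stream really needs its own 'error' handler. If you are on Node 15 or later, the promise version of stream.pipeline wires up error forwarding and completion for you; a minimal sketch using the same MyWritable class:
const { pipeline } = require('stream/promises');

async function processFile(fileNameFull) {
  // pipeline connects the streams, forwards errors from any of them,
  // and resolves once the writable has finished
  await pipeline(
    fs.createReadStream(fileNameFull),
    stripBom(),
    parser(),
    new MyWritable(somevariable)
  );
  console.log('Called very last');
}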
OLD METHOD:
PROBLEM WITH readable
const csv = require('csv-parser');
const fs = require('fs');

const processFileByLine = async (fileNameFull) => {
  let reading = false;
  const rr = fs.createReadStream(fileNameFull)
    .pipe(csv());

  // Magic happens here
  rr.on('readable', async function () {
    // Called once when data starts flowing
    console.log('starting readable');
    // Found this might be called a second time for some reason
    // This stops that event from running twice
    if (reading) {
      console.log('ignoring reading');
      return;
    }
    reading = true;
    let data;
    while ((data = rr.read()) !== null) {
      // The data variable will be an object with the information from the line it read
      // PROCESS DATA HERE
      console.log('new line of data', data);
    }
    // All lines have been read and the file is done
    // The end event will be emitted about now, so that code runs before the code below
    console.log('Finished readable');
  });

  rr.on('end', function () {
    // File has finished being read
    console.log('closing file');
  });

  rr.on('error', err => {
    // Some basic error handling for fs error events
    console.log('error', err);
  });
};
You will notice a reading flag. I have noticed that, for some reason, right near the end of the file .on('readable') gets called a second time, on both small and large files. I am unsure why, but the flag blocks a second pass from reading the same line items.
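Side note: on recent Node versions, readable streams are async iterable, which sidesteps the double 'readable' problem entirely; a sketch of the same loop with for await:
const csv = require('csv-parser');
const fs = require('fs');

const processFileByLine = async (fileNameFull) => {
  const rr = fs.createReadStream(fileNameFull).pipe(csv());
  // Readable streams implement Symbol.asyncIterator, so this loop
  // waits for each await in the body before pulling the next line
  for await (const data of rr) {
    // PROCESS DATA HERE
    console.log('new line of data', data);
  }
  // All lines have been read
  console.log('closing file');
};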