I have 3 terabyte .gz file and want to read its uncompressed content line-by-line in a C++ program. As the file is quite huge, I want to avoid loading it completely in memor
Using zlib, I'm doing something along these lines:
// return a line in a std::vector< char >
std::vector< char > readline( gzFile f ) {
std::vector< char > v( 256 );
unsigned pos = 0;
for ( ;; ) {
if ( gzgets( f, &v[ pos ], v.size() - pos ) == 0 ) {
// end-of-file or error
int err;
const char *msg = gzerror( f, &err );
if ( err != Z_OK ) {
// handle error
}
break;
}
unsigned read = strlen( &v[ pos ] );
if ( v[ pos + read - 1 ] == '\n' ) {
if ( pos + read >= 2 && v[ pos + read - 2 ] == '\r' ) {
pos = pos + read - 2;
} else {
pos = pos + read - 1;
}
break;
}
if ( read == 0 || pos + read < v.size() - 1 ) {
pos = read + pos;
break;
}
pos = v.size() - 1;
v.resize( v.size() * 2 );
}
v.resize( pos );
return v;
}
EDIT: Removed two mis-copied *
in the example above.
EDIT: Corrected out of bounds read on v[pos + read - 2]