I have a 3-terabyte .gz file and want to read its uncompressed content line by line in a C++ program. As the file is quite huge, I want to avoid loading it completely into memory.
Using zlib, I'm doing something along these lines:
// Reads the next line of the decompressed stream `f` and returns it as a
// std::vector< char > with the line terminator ("\n" or "\r\n") removed.
//
// The buffer starts at 256 bytes and is doubled whenever the current line does
// not fit, so only one line is held in memory at a time.
//
// An empty vector is returned for an empty line and also when gzgets() reports
// end-of-file or an error before any character was read; the caller has to
// query the stream itself to tell those cases apart.
std::vector< char > readline( gzFile f ) {
    std::vector< char > v( 256 );
    unsigned pos = 0;  // number of characters of the line collected so far
    for ( ;; ) {
        // gzgets() takes an int length; make the narrowing explicit.
        if ( gzgets( f, &v[ pos ], static_cast< int >( v.size() - pos ) ) == 0 ) {
            // end-of-file or error
            int err;
            const char *msg = gzerror( f, &err );
            (void) msg;  // only needed once real error handling is added below
            if ( err != Z_OK ) {
                // handle error
            }
            break;
        }
        unsigned read = strlen( &v[ pos ] );
        // A chunk that starts with a NUL byte has length 0. This must be
        // tested before indexing with `pos + read - 1`, which would wrap
        // around (unsigned) and read out of bounds when pos is 0 as well.
        if ( read == 0 ) {
            break;
        }
        if ( v[ pos + read - 1 ] == '\n' ) {
            // Complete line: drop "\n", and a preceding "\r" if present.
            if ( pos + read >= 2 && v[ pos + read - 2 ] == '\r' ) {
                pos = pos + read - 2;
            } else {
                pos = pos + read - 1;
            }
            break;
        }
        if ( pos + read < v.size() - 1 ) {
            // gzgets() stopped before filling the buffer without seeing a
            // newline: nothing more belongs to this line.
            pos = read + pos;
            break;
        }
        // Buffer full and no newline yet: continue writing over the
        // terminating NUL after doubling the buffer.
        pos = v.size() - 1;
        v.resize( v.size() * 2 );
    }
    v.resize( pos );
    return v;
}
EDIT: Removed two mis-copied * in the example above.
EDIT: Corrected out of bounds read on v[pos + read - 2]
Here is some code with which you can read normal and zipped files line by line:
// Example driver: prints every non-empty line of `file`, which may be plain
// text or gzip-compressed (decided by the ".gz" suffix).
char line[0x10000];
FILE *infile=open_file(file);
bool gzipped=endsWith(file, ".gz");
if(gzipped)
    init_gzip_stream(infile,&line[0]);
while (readLine(infile,line,gzipped)) {
    if(line[0]==0)continue;// skip empty results handed back by readLine
    // The line comes straight from the file, so it must never be used as a
    // printf format string: a '%' in the data would be undefined behaviour.
    fputs(line,stdout);
}
#include <zlib.h>
// Size in bytes of one block of compressed input.
#define CHUNK 0x100
// Size in bytes of the decompressed output window. Parenthesised so that the
// macro behaves as a single value inside larger expressions
// (e.g. `x / OUT_CHUNK`).
#define OUT_CHUNK (CHUNK*100)
// Compressed bytes most recently read from the file.
unsigned char gzip_in[CHUNK];
// Decompressed output. readLine scans this buffer with C string functions, so
// it carries one byte beyond the OUT_CHUNK output window; that byte has static
// storage (starts as 0) and gives the scan a terminator even when a full
// window of output is produced.
unsigned char gzip_out[OUT_CHUNK + 1];
// Arguments for inflateInit2(); see http://zlib.net/manual.html for their
// exact meanings.
#define windowBits 15
// OR-ed into windowBits when the stream is initialised.
#define ENABLE_ZLIB_GZIP 32
// The decompression stream shared by the functions below, zero-initialised.
z_stream strm = {};
// Prepares the global `strm` for decompression: default allocators, no pending
// input, input/output pointers aimed at gzip_in / gzip_out, then
// inflateInit2() with windowBits | ENABLE_ZLIB_GZIP.
//
// file, out: currently unused; kept so existing callers keep compiling.
// Returns a copy of the global stream. NOTE(review): recent zlib versions
// reject a z_stream whose address differs from the one it was initialised at,
// so the returned copy should be treated as informational only - confirm
// against the zlib version in use.
// A failing inflateInit2() is reported on stderr; the return type offers no
// way to signal it to the caller.
z_stream init_gzip_stream(FILE* file,char* out){// unsigned
    (void)file;
    (void)out;
    strm.zalloc = Z_NULL;
    strm.zfree = Z_NULL;
    strm.opaque = Z_NULL;
    strm.next_in = gzip_in;
    strm.avail_in = 0;
    strm.next_out = gzip_out;
    const int rc = inflateInit2 (& strm, windowBits | ENABLE_ZLIB_GZIP);
    if (rc != Z_OK)
        fprintf(stderr, "init_gzip_stream: inflateInit2 failed (%d): %s\n",
                rc, strm.msg ? strm.msg : "no message");
    return strm;
}
bool inflate_gzip(FILE* file, z_stream strm,size_t bytes_read){
strm.avail_in = (int)bytes_read;
do {
strm.avail_out = OUT_CHUNK;
inflate (& strm, Z_NO_FLUSH);
// printf ("%s",gzip_out);
}while (strm.avail_out == 0);
if (feof (file)) {
inflateEnd (& strm);
return false;
}
return true;// all OK
}
char* first_line=(char*)&gzip_out[0];
char* current_line=first_line;
char* next_line=first_line;
char hangover[1000];
bool readLine(FILE* infile,char* line,bool gzipped){
if(!gzipped)
return fgets(line, sizeof(line), infile) != NULL;
else{
bool ok=true;
current_line=next_line;
if(!current_line || strlen(current_line)==0 || next_line-current_line>OUT_CHUNK){
current_line=first_line;
size_t bytes_read = fread (gzip_in, sizeof (char), CHUNK, infile);
ok=inflate_gzip(infile,strm,bytes_read);
strcpy(line,hangover);
}
if(ok){
next_line=strstr(current_line,"\n");
if(next_line){
next_line[0]=0;
next_line++;
strcpy(line+strlen(hangover),current_line);
hangover[0]=0;
}else{
strcpy(hangover,current_line);
line[0]=0;// skip that one!!
}
}
return ok;
}
}
The zlib library supports decompressing files in memory in blocks, so you don't have to decompress the entire file in order to process it.
Chilkat (http://www.chilkatsoft.com/) has libraries to read compressed files from a C++, .Net, VB, ... application.
For something that is going to be used regularly, you probably want to use one of the previous suggestions. Alternatively, you can do
gzcat file.gz | yourprogram
and have yourprogram read from cin. This will decompress parts of the file in memory as it is needed, and send the uncompressed output to yourprogram.
You will most probably have to use zlib's inflate (the decompression counterpart of deflate); an example is available on their site.
Alternatively, you may have a look at the Boost C++ wrapper (Boost.Iostreams).
The example from BOOST page (decompresses data from a file and writes it to standard output)
#include <fstream>
#include <iostream>
#include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/filter/zlib.hpp>
int main()
{
using namespace std;
ifstream file("hello.z", ios_base::in | ios_base::binary);
filtering_streambuf<input> in;
in.push(zlib_decompressor());
in.push(file);
boost::iostreams::copy(in, cout);
}