Read a plain-text or bz2-compressed file line by line, detecting whether it is compressed (the file is large)

巧了我就是萌 提交于 2019-12-11 02:59:12

问题


I wrote some code to read a file that is either plain text or bz2-compressed. I use the magic bytes at the start of a bz2 file to detect whether the file is compressed.

NOTE: the user may or may not provide the file with a proper extension.

my code

#include <iostream>
#include <sstream>
#include <vector>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/filter/bzip2.hpp>

// compile using
// g++ -std=c++11 code.cpp -lboost_iostreams
// run using
// ./a.out < compressed_file
// ./a.out < simple_file
// cat file_name | ./a.out

// Type tags returned by get_ext(): BZ2 marks bzip2 input, NO_EXT means no
// known magic prefix was found.
std::string BZ2{".bzip2"};
std::string NO_EXT{};

/**
 * @brief Decompresses the bytes in `line` in place when the input is bzip2.
 *
 * Always reports the incoming byte count on stdout. For any `file_ext` other
 * than BZ2 the buffer is left untouched.
 *
 * @param line     Raw bytes read from the input; replaced by the decompressed
 *                 bytes when `file_ext` is BZ2.
 * @param file_ext Type tag as produced by get_ext() (BZ2 or NO_EXT).
 *
 * NOTE(review): a bzip2 stream is not newline-delimited, so this presumably
 * only succeeds when `line` happens to hold a complete compressed stream --
 * confirm how callers chunk the input.
 */
void uncompress(std::vector<char> & line, const std::string & file_ext){
    std::cout << "size of line is " << line.size() <<std::endl;

    // Nothing to do unless the data is bzip2; skip building the streams.
    if (file_ext != BZ2) {return;}

    const std::string compressed(line.begin(), line.end());
    std::stringstream input(compressed);
    std::stringstream decompressed;

    boost::iostreams::filtering_istream in;
    in.push(boost::iostreams::bzip2_decompressor());
    in.push(input);
    boost::iostreams::copy(in, decompressed);

    // Take the whole decompressed buffer. Extracting with `>>` would stop at
    // the first whitespace and silently drop the rest of the text.
    const std::string text = decompressed.str();
    line.assign(text.begin(), text.end());
}

/**
 * @brief Reads one line, including its trailing '\n' if present, into `container`.
 *
 * @param stream    Source stream to read characters from.
 * @param container Output buffer; cleared first, then filled with the line.
 * @return Reference to `container`; it is empty when nothing could be read.
 */
std::vector<char>&readline(std::istream & stream, std::vector<char> & container) {
    container.clear();
    for (char ch = '\0'; stream && stream.get(ch); ) {
        container.push_back(ch);
        // The newline is kept as the last character of the line.
        if (ch == '\n') {
            return container;
        }
    }
    return container;
}

std::string get_ext(const std::vector<char> &line) { // working fine
    std::vector<std::pair<std::vector<char>, std::string>> types = { { {66, 90, 104}, BZ2} };// magic char of bzip file
    for (auto & type : types) if (std::equal(type.first.begin(), type.first.end(), line.begin())) return type.second;
    return NO_EXT;
}

/**
 * @brief Writes the characters of `line` to stdout, followed by std::endl.
 *
 * @param line Characters to print; not modified.
 */
void print_line(std::vector<char> &line) {
    const std::string text(line.cbegin(), line.cend());
    std::cout << text << std::endl;
}

/**
 * Reads stdin line by line, runs each line through uncompress() and prints it.
 * The type tag is derived once, from the first line read.
 */
int main () {
    std::vector<char> buffer;

    // Obtain the file type from the first line.
    const std::string file_ext = get_ext(readline(std::cin, buffer));

    // The first line is processed unconditionally; the loop then continues
    // until readline() yields an empty buffer.
    do {
        uncompress(buffer, file_ext);
        print_line(buffer);
    } while (!readline(std::cin, buffer).empty());
}

There is a problem with this code: when reading a compressed file, it reads the whole compressed file. I don't want to load the whole file into memory just to detect the file type.

The file size may be greater than 4 GB.

If I could somehow figure out the file type beforehand, it would be pretty easy for me to do this:

// Type tags: BZ2 selects bzip2 decompression, NO_EXT is the empty tag that
// get_ext() returns when no magic prefix matches.
std::string BZ2(".bzip2");
std::string NO_EXT;

/**
 * @brief Builds the read chain in `in`: an optional bzip2 decompressor,
 *        followed by `input` as the source.
 *
 * @param input    Stream to read from.
 * @param file_ext Type tag; the decompressor is added only when it equals BZ2.
 * @param in       Filtering stream that receives the filter (if any) and `input`.
 */
void uncompress(std::istream & input,
                const std::string & file_ext,
                boost::iostreams::filtering_istream & in)
{
    const bool is_bzip2 = (file_ext == BZ2);
    if (is_bzip2) {
        // The filter has to be pushed before the source.
        in.push(boost::iostreams::bzip2_decompressor());
    }
    in.push(input);
}

/**
 * @brief Reads one line, including its trailing '\n' if present, into `container`.
 *
 * Accepts any std::istream. boost::iostreams::filtering_istream is a
 * std::istream, so callers passing one are unaffected.
 *
 * @param stream    Source stream to read characters from.
 * @param container Output buffer; cleared first, then filled with the line.
 * @return Reference to `container`; it is empty when nothing could be read.
 */
std::vector<char>&readline(std::istream & stream, std::vector<char> & container) {
    char c;
    container.clear();
    while (stream && stream.get(c)) {
        container.push_back(c);
        // The newline is kept as the last character of the line.
        if (c == '\n') break;
    }
    return container;
}

std::string get_ext(const std::vector<char> &line) { // working fine
    std::vector<std::pair<std::vector<char>, std::string>> types = { { { 66, 90, 104 }, BZ2 } };

    for (auto & type : types) if (std::equal(type.first.begin(), type.first.end(), line.begin())) return type.second;

    return NO_EXT;
}

/**
 * @brief Prints the characters of `line` to stdout and ends with std::endl.
 *
 * @param line Characters to print; not modified.
 */
void print_line(std::vector<char> &line) {
    std::cout << std::string(line.begin(), line.end()) << std::endl;
}

/**
 * Streams stdin through the chain set up by uncompress() and prints it line
 * by line. The type tag is hard-coded to BZ2 here.
 */
int main () {
    std::vector<char> buffer;
    boost::iostreams::filtering_istream in;
    std::string file_ext = BZ2; // suppose the type is already known beforehand

    uncompress(std::cin, file_ext, in);

    // readline() hands back an empty buffer once nothing more can be read.
    for (;;) {
        if (readline(in, buffer).empty()) {
            break;
        }
        print_line(buffer);
    }
}

I have no idea how to determine that beforehand, or whether there is any other approach.

来源:https://stackoverflow.com/questions/49182320/read-simple-bz2-compressed-fileline-by-line-by-detecting-it-is-compressed-or-n

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!