How to divide a big dataset into multiple small files in Hadoop in an efficient way

Submitted by 六月ゝ 毕业季﹏ on 2019-12-23 21:10:04

Question


I have a big data set consisting of files with 1M records each, and I'd like to divide it into files of 1000 records each in Hadoop. I'm investigating different ways to achieve this. One is to make the split size small so that each mapper takes only a few records (~1000) and then outputs them; this requires running many mappers, which is not efficient. The other is to use a single reducer, send all the records to it, and do the split there; this also runs counter to the spirit of MapReduce, since all the work is done by one node. What is an efficient way to split this data set into small files?


Answer 1:


You can use NLineInputFormat to specify how many records are given as input to each mapper.

Set the property mapreduce.input.lineinputformat.linespermap to a multiple of 1000 so that a reasonable number of mappers is spawned. In the mapper, use MultipleOutputs to write each batch of 1000 records to a separate file, incrementing a counter for the file name.
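
For reference, here is a minimal driver-side sketch of that wiring (not part of the original answer; the class name NLineDriverSketch and the value 5000 are illustrative, pick a multiple of 1000 that gives a reasonable number of mappers):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

public class NLineDriverSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Each input split, and therefore each mapper, receives ~5000 lines.
        conf.setInt("mapreduce.input.lineinputformat.linespermap", 5000);

        Job job = Job.getInstance(conf, "DataSplitter");
        job.setInputFormatClass(NLineInputFormat.class);
        NLineInputFormat.addInputPath(job, new Path(args[0]));
        // ... set mapper, output classes and output path as in the full example below.
    }
}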

Sample code that uses multiple outputs to split the data into files of 1000 records each (for text files):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class DataSplitter {

    public static class Map extends Mapper<LongWritable, Text, NullWritable, Text> {

        private Text outputValue = new Text();

        @SuppressWarnings("rawtypes")
        private MultipleOutputs multipleOutputs;

        private int fileCounter = 1;

        private List<String> recordList = new ArrayList<String>();

        @SuppressWarnings({ "rawtypes", "unchecked" })
        @Override
        protected void setup(Mapper<LongWritable, Text, NullWritable, Text>.Context context) throws IOException, InterruptedException {

            multipleOutputs = new MultipleOutputs(context);

        }

        @SuppressWarnings("unchecked")
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            recordList.add(line);

            // Once 1000 records are buffered, write them to their own named output file.
            if (recordList.size() == 1000) {

                for (int i = 0; i < recordList.size(); i++) {

                    outputValue.set(recordList.get(i));

                    multipleOutputs.write("mos", NullWritable.get(), outputValue, "output-" + fileCounter);

                }

                fileCounter++;

                recordList.clear();
            }

        }

        @Override
        protected void cleanup(Mapper<LongWritable, Text, NullWritable, Text>.Context context) throws IOException, InterruptedException {

            multipleOutputs.close();

            // Leftover records (fewer than 1000) go to the regular map output and are re-batched by the reducer.
            if (!recordList.isEmpty()) {

                for (int i = 0; i < recordList.size(); i++) {

                    outputValue.set(recordList.get(i));

                    context.write(NullWritable.get(), outputValue);

                }
                recordList.clear();

            }
        }

    }

    public static class Reduce extends Reducer<NullWritable, Text, NullWritable, Text> {

        private Text outputValue = new Text();

        @SuppressWarnings("rawtypes")
        private MultipleOutputs multipleOutputs;

        private int fileCounter = 1;

        private List<String> recordList = new ArrayList<String>();

        @SuppressWarnings({ "unchecked", "rawtypes" })
        @Override
        protected void setup(Reducer<NullWritable, Text, NullWritable, Text>.Context context) throws IOException, InterruptedException {
            multipleOutputs = new MultipleOutputs(context);
        }

        @SuppressWarnings("unchecked")
        public void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

            for (Text value : values) {

                String line = value.toString();

                recordList.add(line);

                if (recordList.size() == 1000) {

                    for (int i = 0; i < recordList.size(); i++) {

                        outputValue.set(recordList.get(i));

                        multipleOutputs.write("mos", NullWritable.get(), outputValue, "output-" + fileCounter);

                    }
                    fileCounter++;
                    recordList.clear();
                }

                if (!recordList.isEmpty()) {

                    for (int i = 0; i < recordList.size(); i++) {

                        outputValue.set(recordList.get(i));

                        context.write(NullWritable.get(), outputValue);

                    }
                }
            }

        }

        @Override
        protected void cleanup(Reducer<NullWritable, Text, NullWritable, Text>.Context context) throws IOException, InterruptedException {
            multipleOutputs.close();
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        @SuppressWarnings("deprecation")
        Job job = new Job(conf, "DataSplitter");
        job.setJarByClass(DataSplitter.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class); // or NLineInputFormat, to control records per mapper as described above
        job.setOutputFormatClass(TextOutputFormat.class);
        FileSystem.get(conf).delete(new Path(args[1]), true); // remove the output directory if it already exists

        MultipleOutputs.addNamedOutput(job, "mos", TextOutputFormat.class, NullWritable.class, Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
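
In this setup, every full batch of 1000 records is written directly from the map side as an output-N file, while each mapper's partial batch is passed on to the reducer (a single one by default), which re-batches those leftovers into further output-N files; only the reducer's final remainder of fewer than 1000 records lands in the regular part-r output file.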



Answer 2:


If you don't particularly care which record goes where, calculate the number of files you want beforehand and put it into the configuration. Then you can have a random number generator in the mapper that generates a random number between 0 and (numFiles - 1). Take num % numReducers as the key for the mapper output, where numReducers is the number of reducers you want to have.

For the value, use a MapWritable that maps an IntWritable to a RecordClass, replacing RecordClass with whatever is convenient for storing the record itself. Put the original random number, which denotes the file the record should go into, under the IntWritable, and put the rest of the record into the RecordClass slot.

In the reducer, extract the random number from the map and write the record to a file based on that number (e.g. to FileName1 if the number is 1, FileName2 if it is 2, and so on).
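
A minimal sketch of that idea follows; the class names, the "num.output.files" property, and the "mos" named output are illustrative choices rather than part of the answer, and the driver is assumed to register the named output with MultipleOutputs.addNamedOutput as in the first answer.

import java.io.IOException;
import java.util.Map;
import java.util.Random;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

class RandomSplitMapper extends Mapper<LongWritable, Text, IntWritable, MapWritable> {

    private final Random random = new Random();
    private int numFiles;
    private int numReducers;

    @Override
    protected void setup(Context context) {
        // "num.output.files" is a made-up property name; set it in the driver beforehand.
        numFiles = context.getConfiguration().getInt("num.output.files", 10);
        numReducers = context.getNumReduceTasks();
    }

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        int fileNo = random.nextInt(numFiles); // which small file this record lands in
        MapWritable wrapper = new MapWritable();
        wrapper.put(new IntWritable(fileNo), new Text(value));
        // Key the record by fileNo % numReducers so each reducer owns a subset of the files.
        context.write(new IntWritable(fileNo % numReducers), wrapper);
    }
}

class RandomSplitReducer extends Reducer<IntWritable, MapWritable, NullWritable, Text> {

    private MultipleOutputs<NullWritable, Text> multipleOutputs;

    @Override
    protected void setup(Context context) {
        // Assumes MultipleOutputs.addNamedOutput(job, "mos", ...) was called in the driver.
        multipleOutputs = new MultipleOutputs<>(context);
    }

    @Override
    public void reduce(IntWritable key, Iterable<MapWritable> values, Context context)
            throws IOException, InterruptedException {
        for (MapWritable wrapper : values) {
            for (Map.Entry<Writable, Writable> entry : wrapper.entrySet()) {
                int fileNo = ((IntWritable) entry.getKey()).get();
                // Write the record to the file named after its random number.
                multipleOutputs.write("mos", NullWritable.get(), (Text) entry.getValue(),
                        "FileName" + fileNo);
            }
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        multipleOutputs.close();
    }
}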




Answer 3:


Split a big file into multiple small files using Spark.

The example below splits input-file into 2 output files:

     scala> sc.textFile("/xyz-path/input-file",2).saveAsTextFile("/xyz-path/output-file")

The 2nd parameter of textFile is minPartitions, and the default partitioner is used. You can also use a custom partitioner for a better partitioning strategy. Read more about custom partitioning here.
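
If you want an exact number of output files rather than a minimum, a repartition-based variant might look like the sketch below (Java API; the paths and the value 1000 are illustrative assumptions):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class SparkSplitSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("DataSplitter");
        JavaSparkContext sc = new JavaSparkContext(conf);

        sc.textFile("/xyz-path/input-file")
          .repartition(1000)                       // one output part file per partition
          .saveAsTextFile("/xyz-path/output-file");

        sc.stop();
    }
}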



Source: https://stackoverflow.com/questions/29567139/how-to-divide-a-big-dataset-into-multiple-small-files-in-hadoop-in-an-efficient
