MultipleOutputFormat in Hadoop

春和景丽 2020-11-29 06:52

I'm a newbie in Hadoop. I'm trying out the WordCount program.

Now, to try out multiple output files, I am using MultipleOutputFormat. This link helped me.
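
From what I understand, the old mapred API's MultipleTextOutputFormat lets you override generateFileNameForKeyValue to pick the output file per record. A rough sketch of what I'm trying (the WordDirOutputFormat name is just mine for illustration):

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;

    // Hypothetical subclass: routes each (word, count) record into a
    // subdirectory named after the word itself, e.g. output/hello/part-00000.
    public class WordDirOutputFormat extends MultipleTextOutputFormat<Text, IntWritable> {
        @Override
        protected String generateFileNameForKeyValue(Text key, IntWritable value, String name) {
            // "name" is the default leaf file name (e.g. "part-00000");
            // prefixing it with the key turns it into a per-key subdirectory.
            return key.toString() + "/" + name;
        }
    }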

3 Answers
  •  死守一世寂寞
    2020-11-29 07:30

I wrote a class for doing this. Just use it in your job:

    job.setOutputFormatClass(m_customOutputFormatClass);
    

This is my class:

    import java.io.IOException;
    import java.util.HashMap;
    import java.util.Map;
    import java.util.Map.Entry;
    
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    
    /**
     * TextOutputFormat extension which enables writing the mapper/reducer's output in multiple files.
     *
     * WARNING: The number of different folders shouldn't be large for one mapper, since we keep a
     * {@link RecordWriter} instance per folder name.
     *
     * In this class the folder name is defined by the written entry's key.
     * To change this behavior, simply extend this class, override the
     * {@link HdMultipleFileOutputFormat#getFolderNameExtractor()} method, and create your own
     * {@link FolderNameExtractor} implementation.
     *
     * @author ykesten
     *
     * @param <K> - Keys type
     * @param <V> - Values type
     */
    public class HdMultipleFileOutputFormat<K, V> extends TextOutputFormat<K, V> {

        private String folderName;

        private class MultipleFilesRecordWriter extends RecordWriter<K, V> {

            private Map<String, RecordWriter<K, V>> fileNameToWriter;
            private FolderNameExtractor<K, V> fileNameExtractor;
            private TaskAttemptContext job;

            public MultipleFilesRecordWriter(FolderNameExtractor<K, V> fileNameExtractor, TaskAttemptContext job) {
                fileNameToWriter = new HashMap<String, RecordWriter<K, V>>();
                this.fileNameExtractor = fileNameExtractor;
                this.job = job;
            }

            @Override
            public void write(K key, V value) throws IOException, InterruptedException {
                String fileName = fileNameExtractor.extractFolderName(key, value);
                RecordWriter<K, V> writer = fileNameToWriter.get(fileName);
                if (writer == null) {
                    writer = createNewWriter(fileName, fileNameToWriter, job);
                    if (writer == null) {
                        throw new IOException("Unable to create writer for path: " + fileName);
                    }
                }
                writer.write(key, value);
            }

            @Override
            public void close(TaskAttemptContext context) throws IOException, InterruptedException {
                // Close every per-folder writer opened during this task.
                for (Entry<String, RecordWriter<K, V>> entry : fileNameToWriter.entrySet()) {
                    entry.getValue().close(context);
                }
            }
        }

        private synchronized RecordWriter<K, V> createNewWriter(String folderName,
                Map<String, RecordWriter<K, V>> fileNameToWriter, TaskAttemptContext job) {
            try {
                // Temporarily expose the folder name so that getDefaultWorkFile
                // (called by super.getRecordWriter) redirects the new file into it.
                this.folderName = folderName;
                RecordWriter<K, V> writer = super.getRecordWriter(job);
                this.folderName = null;
                fileNameToWriter.put(folderName, writer);
                return writer;
            } catch (Exception e) {
                e.printStackTrace();
                return null;
            }
        }

        @Override
        public Path getDefaultWorkFile(TaskAttemptContext context, String extension) throws IOException {
            Path path = super.getDefaultWorkFile(context, extension);
            if (folderName != null) {
                // Insert the folder name between the parent directory and the leaf file name.
                String newPath = path.getParent().toString() + "/" + folderName + "/" + path.getName();
                path = new Path(newPath);
            }
            return path;
        }

        @Override
        public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
            return new MultipleFilesRecordWriter(getFolderNameExtractor(), job);
        }

        public FolderNameExtractor<K, V> getFolderNameExtractor() {
            return new KeyFolderNameExtractor<K, V>();
        }

        public interface FolderNameExtractor<K, V> {
            public String extractFolderName(K key, V value);
        }

        private static class KeyFolderNameExtractor<K, V> implements FolderNameExtractor<K, V> {
            public String extractFolderName(K key, V value) {
                return key.toString();
            }
        }
    }
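
    For completeness, here is a minimal sketch of a WordCount driver wired to this output format. The mapper/reducer bodies below just follow the stock WordCount example and are only illustrative; substitute your own classes.

    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class WordCountDriver {

        public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
            private final static IntWritable ONE = new IntWritable(1);
            private final Text word = new Text();

            @Override
            protected void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
                StringTokenizer itr = new StringTokenizer(value.toString());
                while (itr.hasMoreTokens()) {
                    word.set(itr.nextToken());
                    context.write(word, ONE);
                }
            }
        }

        public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                context.write(key, new IntWritable(sum));
            }
        }

        public static void main(String[] args) throws Exception {
            Job job = Job.getInstance(new Configuration(), "word count");
            job.setJarByClass(WordCountDriver.class);
            job.setMapperClass(TokenizerMapper.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Route each reducer output record into a folder named after its key
            // (one RecordWriter per distinct key -- see the javadoc warning above).
            job.setOutputFormatClass(HdMultipleFileOutputFormat.class);
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }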
