I'm a newbie in Hadoop. I'm trying out the WordCount program.
Now, to try out multiple output files, I use MultipleOutputFormat; this link helped me.
I wrote a class for doing this. Just set it as your job's output format:
job.setOutputFormatClass(HdMultipleFileOutputFormat.class);
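For context, here is a minimal sketch of how the whole driver might be wired up. The mapper and reducer are the standard WordCount ones; WordCountMapper, WordCountReducer, and the argument paths are placeholder names, not part of the original code:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "wordcount");
        job.setJarByClass(WordCountDriver.class);
        // WordCountMapper and WordCountReducer are the usual WordCount classes (placeholders here).
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Use the custom format so each key's records land in their own subfolder.
        job.setOutputFormatClass(HdMultipleFileOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}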
Here is my class:
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
/**
 * TextOutputFormat extension which enables writing the mapper/reducer's output to multiple files.
 *
 * WARNING: The number of different folders shouldn't be large for one mapper, since we keep a
 * {@link RecordWriter} instance per folder name.
 *
 * In this class the folder name is defined by the written entry's key.
 * To change this behavior, simply extend this class, override the
 * {@link HdMultipleFileOutputFormat#getFolderNameExtractor()} method and create your own
 * {@link FolderNameExtractor} implementation (see the sketch after the class).
 *
 * @author ykesten
 *
 * @param <K> - Keys type
 * @param <V> - Values type
 */
public class HdMultipleFileOutputFormat<K, V> extends TextOutputFormat<K, V> {

    // Folder name of the writer currently being created; consulted by getDefaultWorkFile().
    private String folderName;

    private class MultipleFilesRecordWriter extends RecordWriter<K, V> {

        // One lazily created writer per folder name.
        private Map<String, RecordWriter<K, V>> fileNameToWriter;
        private FolderNameExtractor<K, V> fileNameExtractor;
        private TaskAttemptContext job;

        public MultipleFilesRecordWriter(FolderNameExtractor<K, V> fileNameExtractor, TaskAttemptContext job) {
            fileNameToWriter = new HashMap<String, RecordWriter<K, V>>();
            this.fileNameExtractor = fileNameExtractor;
            this.job = job;
        }

        @Override
        public void write(K key, V value) throws IOException, InterruptedException {
            String fileName = fileNameExtractor.extractFolderName(key, value);
            RecordWriter<K, V> writer = fileNameToWriter.get(fileName);
            if (writer == null) {
                writer = createNewWriter(fileName, fileNameToWriter, job);
                if (writer == null) {
                    throw new IOException("Unable to create writer for path: " + fileName);
                }
            }
            writer.write(key, value);
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            // Close every per-folder writer, not just one.
            for (Entry<String, RecordWriter<K, V>> entry : fileNameToWriter.entrySet()) {
                entry.getValue().close(context);
            }
        }
    }

    private synchronized RecordWriter<K, V> createNewWriter(String folderName,
            Map<String, RecordWriter<K, V>> fileNameToWriter, TaskAttemptContext job) {
        try {
            // Expose the folder name to getDefaultWorkFile() for the duration of this call.
            this.folderName = folderName;
            RecordWriter<K, V> writer = super.getRecordWriter(job);
            this.folderName = null;
            fileNameToWriter.put(folderName, writer);
            return writer;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    @Override
    public Path getDefaultWorkFile(TaskAttemptContext context, String extension) throws IOException {
        Path path = super.getDefaultWorkFile(context, extension);
        if (folderName != null) {
            // Redirect the default work file into a subfolder named after the extracted folder name.
            String newPath = path.getParent().toString() + "/" + folderName + "/" + path.getName();
            path = new Path(newPath);
        }
        return path;
    }

    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        return new MultipleFilesRecordWriter(getFolderNameExtractor(), job);
    }

    public FolderNameExtractor<K, V> getFolderNameExtractor() {
        return new KeyFolderNameExtractor<K, V>();
    }

    public interface FolderNameExtractor<K, V> {
        public String extractFolderName(K key, V value);
    }

    private static class KeyFolderNameExtractor<K, V> implements FolderNameExtractor<K, V> {
        public String extractFolderName(K key, V value) {
            return key.toString();
        }
    }
}
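As the Javadoc says, the folder name defaults to the written entry's key. To group output by something else, extend the class and override getFolderNameExtractor(). A minimal sketch, assuming you want to group by the value's string form instead (ValueBasedOutputFormat is a hypothetical name, not part of the original code):

public class ValueBasedOutputFormat<K, V> extends HdMultipleFileOutputFormat<K, V> {
    @Override
    public FolderNameExtractor<K, V> getFolderNameExtractor() {
        // Group records by the value instead of the key.
        return new FolderNameExtractor<K, V>() {
            public String extractFolderName(K key, V value) {
                return value.toString();
            }
        };
    }
}

With the default key-based extractor, each distinct key gets its own subfolder under the job's output directory (e.g. a file like output/somekey/part-r-00000), which is why the warning above advises keeping the number of distinct folder names per task small: every one of them holds an open RecordWriter until close() runs.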