Introduction
Both HDFS and MapReduce are very inefficient when processing small files, yet scenarios with large numbers of small files are hard to avoid, so a corresponding solution is needed. One option is to implement a custom InputFormat that merges the small files.
Requirements
Merge multiple small files into a single SequenceFile (SequenceFile is Hadoop's file format for storing key-value pairs in binary form). The SequenceFile then holds all of the original files: each record's key is the file's path plus its name, and its value is the file's contents.
1. Input data
Three .txt files
2. Expected output file format
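To make the target layout concrete, here is a minimal standalone sketch (not part of the original project; the class name, output path, key string and sample bytes are all hypothetical) that writes one path-plus-contents record into a SequenceFile with the plain SequenceFile.Writer API. The MapReduce job below produces a file of exactly this shape, with one record per input file.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileLayoutDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dst = new Path("E:/demo.seq"); // hypothetical output location

        // Key: path + name of a (hypothetical) small file; value: that file's raw bytes
        Text key = new Text("E:/input/demo.txt");
        BytesWritable value = new BytesWritable("demo file contents".getBytes(StandardCharsets.UTF_8));

        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(dst),
                SequenceFile.Writer.keyClass(Text.class),
                SequenceFile.Writer.valueClass(BytesWritable.class))) {
            writer.append(key, value);
        }
    }
}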
Project Code
Project structure
MyFIleDriver.java:
package com.zhenghui.TestInputFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;

public class MyFIleDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(MyFIleDriver.class);

        // Both the map output and the final output are (file path, file bytes) pairs
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        // Read each small file as a single record, then write all records into one SequenceFile
        job.setInputFormatClass(MyFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path("E:/input"));
        FileOutputFormat.setOutputPath(job, new Path("e:/output"));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
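Note that the driver registers no Mapper or Reducer class, so Hadoop falls back to its built-in identity Mapper and Reducer, which simply forward each (Text, BytesWritable) pair produced by MyFileRecordReader on to the SequenceFileOutputFormat. An explicit Mapper with the same behavior would look roughly like this (a sketch only; the class name PassThroughMapper is hypothetical and not part of the project):

import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class PassThroughMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // Forward the (file path, file bytes) record unchanged
        context.write(key, value);
    }
}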
MyFileInputFormat.java:
package com.zhenghui.TestInputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;
public class MyFileInputFormat extends FileInputFormat<Text, BytesWritable> {

    /**
     * Decide whether a file may be split.
     * @param context job context
     * @param filename the file being considered
     * @return false, so each small file is always read as a single split
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        return new MyFileRecordReader();
    }
}
MyFileRecordReader.java:
package com.zhenghui.TestInputFormat;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
 * Custom RecordReader that handles one whole file and reads it directly into a single KV pair.
 */
public class MyFileRecordReader extends RecordReader<Text, BytesWritable> {

    private boolean notRead = true; // true while the single record has not been read yet

    // Reusable key/value objects
    private Text key = new Text();
    private BytesWritable value = new BytesWritable();

    private FSDataInputStream inputStream; // input stream over the current file
    private FileSplit fs;

    /**
     * Initialization; the framework calls this once before reading starts.
     * @param split the split to read
     * @param context task attempt context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        // 1. Cast the generic split to a file split
        fs = (FileSplit) split;
        // 2. Get the file path from the split
        Path path = fs.getPath();
        // 3. Get the file system for that path
        FileSystem fileSystem = path.getFileSystem(context.getConfiguration());
        // 4. Open the stream
        inputStream = fileSystem.open(path);
    }

    /**
     * Read the next KV pair.
     * @return true if a pair was read, false once everything has been consumed
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (notRead) {
            // Read the whole file as one record
            // 1. Key: the file's path
            key.set(fs.getPath().toString());
            // 2. Value: the file's contents; the buffer must match the file's length exactly
            byte[] buf = new byte[(int) fs.getLength()];
            inputStream.readFully(buf);
            value.set(buf, 0, buf.length);
            System.out.println("key=" + key.toString() + "---" + new String(buf)); // debug output
            notRead = false; // the single record has now been read
            return true;
        } else {
            // Already read
            return false;
        }
    }

    /**
     * Get the key that was just read.
     * @return the current key
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /**
     * Get the value that was just read.
     * @return the current value
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * Report the current reading progress.
     * @return the current progress
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        // 0 before the record has been read, 1 after
        return notRead ? 0 : 1;
    }

    /**
     * Release resources.
     * @throws IOException
     */
    @Override
    public void close() throws IOException {
        // 5. Close the stream
        IOUtils.closeStream(inputStream);
    }
}
Test Results
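One way to spot-check the result is to open the generated SequenceFile and print every key together with the size of its value. A minimal sketch, assuming the job above wrote its part file to e:/output/part-r-00000 (the class name is hypothetical and the actual part file name may differ):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("e:/output/part-r-00000"); // assumed part file name

        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                SequenceFile.Reader.file(path))) {
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            // Each record is one original small file: key = its path, value = its bytes
            while (reader.next(key, value)) {
                System.out.println(key + " -> " + value.getLength() + " bytes");
            }
        }
    }
}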
Source: CSDN
Author: 郑晖同学
Link: https://blog.csdn.net/qq_17623363/article/details/104130822