Requirement:
We have Gn data covering the whole province. Parse the Gn data and write it out under the structure /OutputData/city-name/date(YYYY-MM-dd)/type(fixed "Gn")/imsi.txt (one file per IMSI), so that all records of type Gn with the same city, the same date, and the same IMSI (International Mobile Subscriber Identity) are aggregated into one file.
From each record, parse out the fields IMSI, VOLUME, CELLID, TAC, city, and time.
Data:
1|460002452699237|8655890276520178|8613786401241|21.176.70.136|29588|255|56042|221.177.173.83|221.177.173.64|221.177.173.35|221.177.173.35|2|cmnet|101|a788057f91cf3a89|1480752079784|1480752079788|18|26|0|33931|8.8.8.8|53|460|0|73|366|1|1|0|0|0|0|0|0|183.232.72.164|0|1|4|6|6|2260069379|||||||||||||||
Data notes: the column separator is "|". Take the sixth and the eighth fields and join them with "_" to form the city name code.
The date is the seventeenth field (a millisecond timestamp).
The IMSI is the second field.
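To sanity-check these rules against the sample record, here is a small standalone sketch (the class name GnFieldDemo is ours, not part of the job) that prints the target path the mapper below derives. Formatting the millisecond timestamp assumes the JVM's default time zone.

import java.text.SimpleDateFormat;
import java.util.Date;

public class GnFieldDemo {
    public static void main(String[] args) {
        String line = "1|460002452699237|8655890276520178|8613786401241|21.176.70.136|29588|255|56042|"
                + "221.177.173.83|221.177.173.64|221.177.173.35|221.177.173.35|2|cmnet|101|a788057f91cf3a89|"
                + "1480752079784|1480752079788|18|26|0|33931|8.8.8.8|53|460|0|73|366|1|1|0|0|0|0|0|0|"
                + "183.232.72.164|0|1|4|6|6|2260069379|||||||||||||||";
        String[] f = line.split("\\|");                   // "|" must be escaped in the regex
        String city = f[5] + "_" + f[7];                  // 6th and 8th fields -> "29588_56042"
        String imsi = f[1];                               // 2nd field -> "460002452699237"
        long ts = Long.parseLong(f[16]);                  // 17th field, epoch milliseconds
        String date = new SimpleDateFormat("yyyy-MM-dd").format(new Date(ts)); // "2016-12-03"
        System.out.println("/OutputData/" + city + "/" + date + "/Gn/" + imsi + ".txt");
    }
}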
GnMapper
package GN.demo01;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class GnMapper extends Mapper<LongWritable, Text, Text, Text> {

    // Use "yyyy" (calendar year), not "YYYY" (week-based year), which gives wrong
    // results for dates around the turn of the year.
    private SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().split("\\|");
        String city = split[5] + "_" + split[7];                      // 6th and 8th fields
        String date = df.format(new Date(Long.parseLong(split[16]))); // 17th field, epoch ms
        // Key = /OutputData/<city code>/<date>/Gn/<imsi>; the reducer appends ".txt".
        context.write(new Text("/OutputData/" + city + "/" + date + "/Gn/" + split[1]), value);
    }
}
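For the sample record above, this mapper emits the key /OutputData/29588_56042/2016-12-03/Gn/460002452699237 with the full record as the value, so the shuffle delivers all records for one city/date/IMSI combination to a single reduce() call.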
GnReducer
package GN.demo01;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;

public class GnReducer extends Reducer<Text, Text, Text, NullWritable> {

    private FileSystem fileSystem;

    @Override
    protected void setup(Context context) throws IOException {
        // Open the FileSystem once per task instead of once per key; if the URI is
        // malformed, fail the task instead of continuing with a null FileSystem.
        try {
            fileSystem = FileSystem.get(new URI("hdfs://192.168.100.201:8020"), context.getConfiguration());
        } catch (URISyntaxException e) {
            throw new IOException(e);
        }
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // The key is already the full target path; one file per city/date/imsi.
        FSDataOutputStream out = fileSystem.create(new Path(key.toString() + ".txt"));
        try {
            for (Text value : values) {
                // Stream each record directly instead of concatenating one big String.
                out.write((value.toString() + "\r\n").getBytes(StandardCharsets.UTF_8));
            }
        } finally {
            out.close();
        }
    }
}
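Creating files by hand from inside reduce() works, but it sidesteps Hadoop's output committer, so failed or speculatively re-executed tasks can leave partial files behind. As an alternative, here is a sketch using the standard MultipleOutputs API (GnMultiOutReducer is a hypothetical name). Note two differences from the spec: MultipleOutputs resolves the base path under the job's output directory, so the leading "/" is dropped, and it appends a part suffix such as -r-00000, so the files are not named exactly imsi.txt.

package GN.demo01;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

import java.io.IOException;

public class GnMultiOutReducer extends Reducer<Text, Text, Text, NullWritable> {

    private MultipleOutputs<Text, NullWritable> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Drop the leading "/" so the path stays relative to the job output directory.
        String base = key.toString().substring(1);
        for (Text value : values) {
            mos.write(value, NullWritable.get(), base);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close(); // flushes and closes all underlying record writers
    }
}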
GnDriver
package GN.demo01;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class GnDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // Use the Configuration injected by ToolRunner instead of creating a new one,
        // so -D options passed on the command line take effect.
        Configuration conf = getConf();
        // conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        Job job = Job.getInstance(conf);
        // job.setNumReduceTasks(30);
        job.setJarByClass(GnDriver.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("E:\\2019-传智项目\\企业需求实战\\01_湖南移动项目需求\\数据\\gn"));

        job.setMapperClass(GnMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(GnReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("E:\\2019-传智项目\\企业需求实战\\01_湖南移动项目需求\\数据\\Out"));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new GnDriver(), args));
    }
}
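One design note on this driver: because GnReducer writes its files through the FileSystem API and never calls context.write(), the TextOutputFormat directory set here ends up holding only empty part files and the _SUCCESS marker, while the real output lands under /OutputData on HDFS. If those empty part files are unwanted, wrapping the output format with LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class) (from org.apache.hadoop.mapreduce.lib.output) defers file creation until the first record is actually written.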
Source: CSDN
Author: 沙漠一只雕?
Link: https://blog.csdn.net/bbvjx1314/article/details/103626949