Today I finished the second stage of data cleaning. Having read up on the subject online and gained a better understanding of how MapReduce actually works, I found this stage much easier than the earlier ones.
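The cleaning is done as a two-job pipeline. The first job below counts how many times each value of the sixth field appears in the input, writing output lines of the form "<count> <field>":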
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class text_2_1 {

    // Mapper: take the sixth space-separated field of each record as the
    // key and emit a constant "1" so the reducer can count occurrences.
    public static class Map extends Mapper<Object, Text, Text, Text> {
        private static final Text newKey = new Text();
        private static final Text newValue = new Text("1");

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] arr = line.split(" ");
            newKey.set(arr[5]); // the field being counted
            context.write(newKey, newValue);
        }
    }

    // Reducer: count how many "1"s arrived for each key, then swap key and
    // value so each output line starts with the count.
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        private static final Text newKey = new Text();
        private static final Text newValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int num = 0;
            for (Text ignored : values) {
                num++;
            }
            newKey.set(String.valueOf(num));
            newValue.set(key);
            context.write(newKey, newValue); // output line: "<count> <field>"
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Separate the output key and value with a space instead of the
        // default tab. "mapred.textoutputformat.separator" is the old,
        // deprecated property name; on Hadoop 2.x+ use the one below.
        conf.set("mapreduce.output.textoutputformat.separator", " ");
        System.out.println("start");
        Job job = Job.getInstance(conf);
        job.setJarByClass(text_2_1.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        Path in = new Path("hdfs://localhost:9000/text/in/data");
        Path out = new Path("hdfs://localhost:9000/text/out1");
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);
        boolean flag = job.waitForCompletion(true);
        System.out.println(flag);
        System.exit(flag ? 0 : 1);
    }
}
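The reducer above counts by iterating over the Text "1" values and then swaps key and value so the count comes first. As an aside, a more common counting idiom (a sketch of my own, not what the program above uses) is to have the mapper emit IntWritable ones and sum them in the reducer, which keeps the count numeric until it is written out:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical alternative reducer: the mapper would need to emit
// new IntWritable(1) as the value instead of the Text "1".
public class SumReduce extends Reducer<Text, IntWritable, IntWritable, Text> {
    private final IntWritable count = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();        // each value is a 1 emitted by the mapper
        }
        count.set(sum);
        context.write(count, key); // count first, so the next job can sort on it
    }
}

Either way, the first job's output becomes the input of the second job, which re-sorts the lines by count in descending order using a custom sort comparator: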
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class m {

    // Mapper: parse each "<count> <field>" line from the first job into an
    // IntWritable key (the count) and a Text value (the field).
    public static class Map extends Mapper<Object, Text, IntWritable, Text> {
        private static final IntWritable newKey = new IntWritable();
        private static final Text newValue = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] arr = line.split(" ");
            newKey.set(Integer.parseInt(arr[0]));
            newValue.set(arr[1]);
            context.write(newKey, newValue);
        }
    }

    // Reducer: the shuffle has already sorted the keys, so simply write
    // each value back out under its (now ordered) count.
    public static class Reduce extends Reducer<IntWritable, Text, IntWritable, Text> {
        @Override
        protected void reduce(IntWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text text : values) {
                context.write(key, text);
            }
        }
    }

    // Sort comparator that negates IntWritable's raw-byte comparison, so the
    // framework sorts keys in descending instead of ascending order.
    public static class IntWritableDecreasingComparator extends IntWritable.Comparator {
        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return -super.compare(b1, s1, l1, b2, s2, l2);
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("mapreduce.output.textoutputformat.separator", " ");
        System.out.println("start");
        Job job = Job.getInstance(conf);
        job.setJarByClass(m.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        // Replace the default ascending comparator used in the sort phase.
        job.setSortComparatorClass(IntWritableDecreasingComparator.class);
        Path in = new Path("hdfs://localhost:9000/text/out1/part-r-00000");
        Path out = new Path("hdfs://localhost:9000/text/out2");
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);
        boolean flag = job.waitForCompletion(true);
        System.out.println(flag);
        System.exit(flag ? 0 : 1);
    }
}
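Because the second job reads the first job's part-r-00000 directly, the two drivers could also be chained in a single main method, so the sort job only launches if the count job succeeded. Below is a minimal sketch of that idea, assuming both classes above are on the classpath; the Driver class name and the job names are my own, not from the programs above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical combined driver: run the counting job, then the sorting job.
public class Driver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("mapreduce.output.textoutputformat.separator", " ");

        // Job 1: count occurrences of the field.
        Job count = Job.getInstance(conf, "count");
        count.setJarByClass(text_2_1.class);
        count.setMapperClass(text_2_1.Map.class);
        count.setReducerClass(text_2_1.Reduce.class);
        count.setOutputKeyClass(Text.class);
        count.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(count, new Path("hdfs://localhost:9000/text/in/data"));
        FileOutputFormat.setOutputPath(count, new Path("hdfs://localhost:9000/text/out1"));
        if (!count.waitForCompletion(true)) {
            System.exit(1); // stop the pipeline if the first job failed
        }

        // Job 2: sort the counts in descending order.
        Job sort = Job.getInstance(conf, "sort");
        sort.setJarByClass(m.class);
        sort.setMapperClass(m.Map.class);
        sort.setReducerClass(m.Reduce.class);
        sort.setOutputKeyClass(IntWritable.class);
        sort.setOutputValueClass(Text.class);
        sort.setSortComparatorClass(m.IntWritableDecreasingComparator.class);
        FileInputFormat.addInputPath(sort, new Path("hdfs://localhost:9000/text/out1/part-r-00000"));
        FileOutputFormat.setOutputPath(sort, new Path("hdfs://localhost:9000/text/out2"));
        System.exit(sort.waitForCompletion(true) ? 0 : 1);
    }
}

Note that with the single reducer used here, the descending sort gives a total order over all counts; with multiple reducers, each output file would only be sorted internally.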
The results of running the two jobs are as follows:

[Result screenshots]