Result file field description:
Ip: 106.39.41.166 (IP address, later mapped to a city)
Date: 10/Nov/2016:00:01:02 +0800 (date)
Day: 10 (day of month)
Traffic: 54 (traffic)
Type: video (type: video or article)
Id: 8701 (id of the video or article)
Test requirements:
1. Data cleaning: clean the raw log data as specified below and load the cleaned data into a Hive table.
Two-stage data cleaning:
(1) Stage one: extract the required fields from the raw log:
ip: 199.30.25.88
time: 10/Nov/2016:00:01:03 +0800
traffic: 62
article: article/11325
video: video/3235
(2) Stage two: refine the extracted fields:
ip ---> city (map the IP address to its city)
date ---> time: 2016-11-10 00:01:03
day: 10
traffic: 62
type: article/video
id: 11325
(3) Hive table schema (a fuller sketch with the row format follows the DDL):
create table data( ip string, time string , day string, traffic bigint,
type string, id string )
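The DDL above leaves the row format unspecified. Since the cleaned data loaded later is the tab-separated text produced by the MapReduce job, the table usually needs a delimited row format; a minimal sketch, assuming tab-separated fields:

create table data(
ip string,
time string,
day string,
traffic bigint,
type string,
id string
)
row format delimited fields terminated by '\t'
stored as textfile;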
2. Data processing (hedged HiveQL sketches follow this list):
· Top 10 most-visited videos/articles by access count (video/article)
· Top 10 most popular courses by city (ip)
· Top 10 most popular courses by traffic (traffic)
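Once the cleaned data is in the Hive table, the three statistics reduce to GROUP BY queries. A hedged sketch, assuming the table and column names from the DDL above (data: ip, time, day, traffic, type, id) and that the ip column already holds the mapped city:

-- Top 10 videos/articles by number of visits
select type, id, count(*) as visits
from data
group by type, id
order by visits desc
limit 10;

-- Top 10 courses by city (one reading of the requirement: most-visited id per city)
select ip as city, id, count(*) as visits
from data
group by ip, id
order by visits desc
limit 10;

-- Top 10 courses by total traffic
select id, sum(traffic) as total_traffic
from data
group by id
order by total_traffic desc
limit 10;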
3. Data visualization: export the statistics into a MySQL database and present them as charts.
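Before charting, the query results need a home in MySQL. A minimal sketch of an illustrative results table (the table and column names are assumptions, not part of the original spec):

-- MySQL: holds the Top 10 visit-count result before charting
create table top10_visits (
type varchar(16),
id varchar(32),
visits bigint
);

The Hive result set can then be copied into this table with Sqoop or a small JDBC program, and the charting layer reads from MySQL.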
Stage one:
/*
* Clean the raw log file with MapReduce first, then load the result into Hive.
*/
package classtest3;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class Result{
public static final SimpleDateFormat FORMAT = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); // source time format in the log
public static final SimpleDateFormat dateformat1 = new SimpleDateFormat("yyyy-MM-dd"); // target date format for the cleaned record
private Date parseDateFormat(String string) { // convert the raw log timestamp into a Date
Date parse = null;
try {
parse = FORMAT.parse(string);
} catch (Exception e) {
e.printStackTrace();
}
return parse;
}
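// Assumed raw log line layout (the original document does not show it). The helpers below
// slice the line like a standard access-log entry, e.g. (hypothetical example):
//   199.30.25.88 - - [10/Nov/2016:00:01:03 +0800] "GET /some/path/video/3235 HTTP/1.1" 200 62
// parseIP takes the text before "- -", parseTime the bracketed timestamp, parseURL the quoted
// request, and parseStatus/parseTraffic the two numbers after the closing quote. Note that the
// mapper later keeps only requests whose path has four segments ("some/path" is a placeholder).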
// Parse one log line into an array of fields: ip, time, url, status, traffic
public String[] parse(String line) {
String ip = parseIP(line); // ip
String time = parseTime(line); // time
String url = parseURL(line); // requested url
String status = parseStatus(line); // http status
String traffic = parseTraffic(line); // traffic
return new String[] { ip, time, url, status, traffic };
}
private String parseTraffic(String line) { // traffic: second token after the last quote
final String trim = line.substring(line.lastIndexOf("\"") + 1)
.trim();
String traffic = trim.split(" ")[1];
return traffic;
}
private String parseStatus(String line) { // http status: first token after the last quote
final String trim = line.substring(line.lastIndexOf("\"") + 1)
.trim();
String status = trim.split(" ")[0];
return status;
}
private String parseURL(String line) { // requested url: the quoted request string
final int first = line.indexOf("\"");
final int last = line.lastIndexOf("\"");
String url = line.substring(first + 1, last);
return url;
}
// Parse the timestamp
private String parseTime(String line) {
final int first = line.indexOf("["); // index of the opening bracket
final int last = line.indexOf("+0800]");
String time = line.substring(first + 1, last).trim(); // text between '[' and ' +0800]'
Date date = parseDateFormat(time);
return dateformat1.format(date);
}
private String parseIP(String line) { // ip: the text before "- -"
String ip = line.split("- -")[0].trim();
return ip;
}
public static class Map extends
Mapper<LongWritable, Text, Text, IntWritable> {
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// Convert the input Text value to a plain String
Text outputValue = new Text();
String line = value.toString();
Result parser = new Result();
StringTokenizer tokenizerArticle = new StringTokenizer(line, "\n");
// Process the input line by line
while (tokenizerArticle.hasMoreElements()) {
String stra = tokenizerArticle.nextToken();
String[] fields = parser.parse(stra);
// Strip the leading HTTP method from the request field
if (fields[2].startsWith("GET /")) {
fields[2] = fields[2].substring("GET /".length());
}
else if (fields[2].startsWith("POST /")) {
fields[2] = fields[2].substring("POST /".length());
}
// Strip the trailing protocol version
if (fields[2].endsWith(" HTTP/1.1")) {
fields[2] = fields[2].substring(0, fields[2].length()
- " HTTP/1.1".length());
}
// Keep only requests whose path has exactly four segments
String[] words = fields[2].split("/");
if (words.length == 4) {
// Emit the cleaned, tab-separated record as the key with a count of 1
outputValue.set(fields[0] + "\t" + fields[1] + "\t" + words[0] + "\t" + words[1] + "\t" + words[2] + "\t" + words[3] + "\t" + "0");
context.write(outputValue, new IntWritable(1));
}
}
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
// The reduce function sums the counts for each distinct cleaned record
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
Iterator<IntWritable> iterator = values.iterator();
while (iterator.hasNext()) {
sum += iterator.next().get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// Input and output paths could instead be taken from the command line
// via GenericOptionsParser; here they are hard-coded further below.
Job job = Job.getInstance(conf);
job.setJarByClass(Result.class);
// Set the Mapper, Combiner and Reducer classes
job.setMapperClass(Map.class);
job.setCombinerClass(Reduce.class);
job.setReducerClass(Reduce.class);
// Set the output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// TextInputFormat splits the input data set into input splits and provides a RecordReader
job.setInputFormatClass(TextInputFormat.class);
// TextOutputFormat provides the RecordWriter that writes the output
job.setOutputFormatClass(TextOutputFormat.class);
// Set the input and output paths
FileInputFormat.addInputPath(job, new Path("hdfs://192.168.57.128:9000/MyMapReduce/classtest3/name.txt"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.57.128:9000/MyMapReduce/classtest3/test1result"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Hive load statement:
load data inpath '/MyMapReduce/classtest3/test1result/part-r-00000' into table acc_log;
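After loading, a quick sanity check can confirm that the rows landed as expected; a minimal sketch, assuming the target table acc_log was created with the schema shown in the DDL above:

select * from acc_log limit 10;
select count(*) from acc_log;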