Spark_飞机项目
- 首先将csv文件变成UTF-8
// Load the raw flight file as an RDD of text lines and peek at the first three.
val flights = sc.textFile("/data/USA_Flight")
flights.take(3)

// Read the same file as a DataFrame; the first row supplies the column names.
// col, count and desc come from org.apache.spark.sql.functions (assumed to be in scope, as in spark-shell).
val df = spark.read.format("csv").option("header", true).load("/data/USA_Flight")

// Add an English-named copy of the origin-airport-id column.
val df1 = df.withColumn("origin_id", col("起飞机场编号"))

// Number of rows per origin airport id (show one row).
df1.groupBy("origin_id").agg(count("*")).show(1)

// Same count with the count column aliased to "cnt" (show ten rows).
// The alias must be applied to the aggregate column itself: calling .as("cnt")
// on the result of agg(...) aliases the whole Dataset and does not rename the column.
df1.groupBy("origin_id").agg(count("*").as("cnt")).show(10)

// Origin airports ranked by row count, largest first.
df1.groupBy("origin_id").agg(count("*").as("cnt")).sort(desc("cnt")).show(10)
rdd
- 起始机场编号排名
// Rank origin airport ids by number of rows, this time through the RDD API.
val df1 = df.withColumn("origin_id", col("起飞机场编号"))
val rdd = df1.select("origin_id").rdd
// Pair every id with 1 and add the ones up per id.
val originCounts = rdd.map(row => row.get(0) -> 1).reduceByKey((left, right) => left + right)
// Largest count first; collect brings the whole ranking to the driver.
originCounts.sortBy(pair => pair._2, ascending = false).collect
// RDD version: ratio of delayed flights to all flights, per key.
val rdd3 = sc.textFile("/data/USA_Flight")
// Skip the first line of partition 0, i.e. the CSV header.
val rdd3_1 = rdd3.mapPartitionsWithIndex((idx, it) => if (idx == 0) it.drop(1) else it)
// Keep column 2 as the key and column 11 as the delay value.
// NOTE(review): presumably the carrier code and the delay in minutes - confirm against the CSV header.
val rdd3_2 = rdd3_1.map(line => line.split(",")).map(arr => (arr(2), arr(11)))

// delayed flights / total flights per key.
// Each record becomes a (delayed, total) counter pair that reduceByKey sums up,
// so partial sums are combined before the shuffle instead of groupByKey
// materialising every delay value of a key just to count them.
val delayCounts = rdd3_2.mapValues(delay => (if (delay.toInt > 0) 1L else 0L, 1L)).reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
delayCounts.mapValues { case (delayed, total) => delayed.toDouble / total }.take(3)
import org.apache.spark.graphx._

// Reload the flight file and drop the CSV header (first line of partition 0).
val rdd3 = sc.textFile("/data/USA_Flight")
val rdd3_1 = rdd3.mapPartitionsWithIndex((idx, it) => if (idx == 0) it.drop(1) else it)

// Vertices: (airport id, airport label) taken from both ends of every flight
// (columns 5/6 and 7/8), de-duplicated. Both ids are converted to Long so the
// RDD has the (VertexId, String) shape that Graph expects.
val airports = rdd3_1.map(line => line.split(",")).flatMap(arr => Array((arr(5).toLong, arr(6)), (arr(7).toLong, arr(8)))).distinct

// Edges: origin id -> destination id, carrying column 16 as the edge attribute, de-duplicated.
// NOTE(review): column 16 is presumably the flight distance - confirm against the CSV header.
val airlines = rdd3_1.map(line => line.split(",")).map(arr => Edge(arr(5).toLong, arr(7).toLong, arr(16).toLong)).distinct

// Build the graph from the vertex RDD and the Edge RDD.
val graph = Graph(airports, airlines)
graph.vertices.collect
机场数量/航线数量
// Number of vertices (airports) in the graph.
graph.numVertices
// Number of edges (routes) in the graph.
graph.numEdges
计算最长的飞行航线
- 最大的边属性
// Method 1: sort the edges by attribute, largest first, and keep the top one.
graph.edges.sortBy(_.attr, ascending = false).take(1)
// Method 2: same ordering on the triplets, which also carry the source and destination vertex attributes.
graph.triplets.sortBy(_.attr, ascending = false).take(1)
找出最繁忙的机场
- 哪个机场到达航线最多(图中的边已去重,入度统计的是不同航线数,而不是航班数)
// In-degree of every vertex: a raw peek at ten entries, then the ten largest.
graph.inDegrees.take(10)
graph.inDegrees.sortBy(pair => pair._2, ascending = false).take(10)
// Out-degree, largest first; keep only the top vertex.
graph.outDegrees.sortBy(pair => pair._2, ascending = false).take(1)
找出最重要的机场
- PageRank
// Run PageRank with tolerance 0.001 and list the three vertices with the highest rank.
val ranks = graph.pageRank(0.001).vertices
ranks.sortBy(entry => entry._2, ascending = false).take(3)
// Recorded REPL output:
// res19: Array[(org.apache.spark.graphx.VertexId, Double)] = Array((10397,11.804830496200681), (13930,11.559339731504148), (11298,11.415597402337278))
Pregel
sampleRDD
// Small demo of RDD.sample: no replacement, 10% fraction, fixed seed 10.
val sampleRDD = sc.makeRDD(1 to 10)
sampleRDD.sample(withReplacement = false, fraction = 0.1, seed = 10)
找出最便宜的飞行航线
- 创建顶点
// Sample the vertices (no replacement, fraction 0.4, seed 100) and take the
// first sampled vertex; it is used as the source vertex further down.
val sample = graph.vertices.sample(withReplacement = false, fraction = 0.4, seed = 100)
val first = sample.first
// Recorded REPL output:
// first: (org.apache.spark.graphx.VertexId, String) = (10397,ATL)
- 初始化源点(0)
// Cheapest-price search from the sampled source vertex `first`.
// The source starts at price 0, every other vertex at +infinity.
val initVertices = graph.mapVertices((vid, _) => if (vid == first._1) 0.0 else Double.PositiveInfinity)
initVertices.vertices.take(3)

// Turn every edge attribute into a ticket price: 180 + 0.5 * attr.
val initGraph = initVertices.mapEdges(edge => 180 + edge.attr * 0.5)
initGraph.edges.take(3)

// Pregel iteration:
//  - vprog:    a vertex keeps the cheaper of its current price and the incoming one;
//  - sendMsg:  an edge offers its destination a new price only when going through
//              the source endpoint is strictly cheaper than the destination's current price;
//  - mergeMsg: several offers to the same vertex collapse to the cheapest.
val pregel = initGraph.pregel(Double.PositiveInfinity)(
  vprog = (vid, price, newPrice) => math.min(price, newPrice),
  sendMsg = triplet => {
    val offer = triplet.srcAttr + triplet.attr
    if (offer < triplet.dstAttr) Iterator((triplet.dstId, offer)) else Iterator.empty
  },
  mergeMsg = (a, b) => math.min(a, b)
)
pregel.vertices.take(3)

// Collect before printing: RDD.foreach(println) runs on the executors, so on a
// cluster the lines would end up in executor stdout instead of the driver console.
pregel.vertices.collect.foreach(x => println(s"${first._1} -> ${x._1} price is : ${x._2}"))