I have the below data set, where the first column is the department and the second is the salary. I want to calculate the average salary by department.
IT 2000000
HR 2000000
IT 1950000
HR 2200000
Admin 1900000
IT 1900000
IT 2200000
I performed the below operation:
JavaPairRDD<String, Iterable<Long>> rddY = employees.groupByKey();
System.out.println("<=========================RDDY collect==================>" + rddY.collect());
and got the below output:
<=========================RDDY collect==================>[(IT,[2000000, 1950000, 1900000, 2200000]), (HR,[2000000, 2200000]), (Admin,[1900000])]
What I need is to calculate the total average and the department-wise average using Spark RDDs.
How can I use the groupBy functions in Spark to calculate the average?
Below is the code to calculate the average by key using a Spark JavaPairRDD. Hope this helps.
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

public class SparkAverageCalculation {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Average Calculation").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // input list
        List<Tuple2<String, Integer>> inputList = new ArrayList<Tuple2<String, Integer>>();
        inputList.add(new Tuple2<String, Integer>("a1", 30));
        inputList.add(new Tuple2<String, Integer>("b1", 30));
        inputList.add(new Tuple2<String, Integer>("a1", 40));
        inputList.add(new Tuple2<String, Integer>("a1", 20));
        inputList.add(new Tuple2<String, Integer>("b1", 50));

        // parallelizePairs
        JavaPairRDD<String, Integer> pairRDD = sc.parallelizePairs(inputList);

        // pair each value with a count of 1: (key, (value, 1))
        JavaPairRDD<String, Tuple2<Integer, Integer>> valueCount = pairRDD.mapValues(value -> new Tuple2<Integer, Integer>(value, 1));

        // sum the values and the counts per key with reduceByKey
        JavaPairRDD<String, Tuple2<Integer, Integer>> reducedCount = valueCount.reduceByKey((tuple1, tuple2) -> new Tuple2<Integer, Integer>(tuple1._1 + tuple2._1, tuple1._2 + tuple2._2));

        // calculate average
        JavaPairRDD<String, Integer> averagePair = reducedCount.mapToPair(getAverageByKey);

        // print average by key
        averagePair.foreach(data -> {
            System.out.println("Key=" + data._1() + " Average=" + data._2());
        });

        // stop sc
        sc.stop();
        sc.close();
    }

    private static PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer> getAverageByKey = (tuple) -> {
        Tuple2<Integer, Integer> val = tuple._2;
        int total = val._1;
        int count = val._2;
        Tuple2<String, Integer> averagePair = new Tuple2<String, Integer>(tuple._1, total / count);
        return averagePair;
    };
}
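To apply the same pattern to the department/salary data in the question, and to also get the total average asked for, a minimal sketch could look like the following. It assumes the same JavaSparkContext sc and imports as the class above (plus org.apache.spark.api.java.JavaRDD), and the file name salaries.txt is only a placeholder for wherever the "IT 2000000" lines live.

// Minimal sketch: department-wise and overall average from lines like "IT 2000000".
// Assumes the same JavaSparkContext sc as above; "salaries.txt" is a hypothetical path.
JavaRDD<String> lines = sc.textFile("salaries.txt");

// (department, (salary, 1)) so reduceByKey can sum salaries and counts in one pass
JavaPairRDD<String, Tuple2<Long, Integer>> sumCount = lines
        .mapToPair(line -> {
            String[] parts = line.split(" ");
            return new Tuple2<String, Tuple2<Long, Integer>>(parts[0],
                    new Tuple2<Long, Integer>(Long.parseLong(parts[1]), 1));
        })
        .reduceByKey((t1, t2) -> new Tuple2<Long, Integer>(t1._1 + t2._1, t1._2 + t2._2));

// department-wise average
JavaPairRDD<String, Double> deptAvg = sumCount.mapValues(t -> t._1.doubleValue() / t._2);
deptAvg.collect().forEach(t -> System.out.println(t._1() + " average=" + t._2()));

// total average: reduce all (salarySum, count) pairs once more and divide
Tuple2<Long, Integer> totals = sumCount.values()
        .reduce((t1, t2) -> new Tuple2<Long, Integer>(t1._1 + t2._1, t1._2 + t2._2));
System.out.println("Total average=" + (totals._1.doubleValue() / totals._2));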
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
import scala.Tuple2;
import java.util.Map;

public class ElasticsearchMetricProcessor {

    private static final String ES_HOST_PORT = "localhost:9200";

    private static PairFunction<Tuple2<String, Tuple2<Long, Integer>>, String, Long> getAverageByKey = (tuple) -> {
        Tuple2<Long, Integer> val = tuple._2;
        long total = val._1;
        int count = val._2;
        Tuple2<String, Long> averagePair = new Tuple2<String, Long>(tuple._1, total / count);
        return averagePair;
    };

    public static void main(String args[]) throws InterruptedException {
        System.setProperty("hadoop.home.dir", "C:\\Users\\anki\\metering\\winutils");

        SparkConf sparkConf = new SparkConf().setAppName("StreamingApp").setMaster("local[2]");
        sparkConf.set("es.nodes.wan.only", "false");
        sparkConf.set("es.nodes", ES_HOST_PORT);
        JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, Durations.seconds(10));

        // read the report-execution documents from Elasticsearch as a JavaRDD of field maps
        JavaRDD<Map<String, Object>> esRDD = JavaEsSpark.esRDD(jsc.sparkContext(), "portal_analytics/report-execution").values();

        // (id, (duration, 1)) so durations and counts can be summed per key
        JavaPairRDD<String, Tuple2<Long, Integer>> valueCount = esRDD
                .mapToPair(x -> new Tuple2<String, Long>(x.get("id").toString(), Long.valueOf(x.get("duration").toString())))
                .mapValues(value -> new Tuple2<Long, Integer>(value, 1));

        // sum durations and counts per key with reduceByKey
        JavaPairRDD<String, Tuple2<Long, Integer>> reducedCount = valueCount.reduceByKey((tuple1, tuple2) -> new Tuple2<Long, Integer>(tuple1._1 + tuple2._1, tuple1._2 + tuple2._2));

        // calculate average
        JavaPairRDD<String, Long> averagePair = reducedCount.mapToPair(getAverageByKey);

        // print average by key
        averagePair.foreach(data -> {
            System.out.println("Key=" + data._1() + " Average=" + data._2());
        });

        // stop the context
        jsc.stop();
        jsc.close();
    }
}
--------------------------------------------------------
Elasticsearch Test Data
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"failed": 0
},
"hits": {
"total": 16,
"max_score": 1,
"hits": [
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8aPGm2uMcgoWFwdx",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8eOcm2uMcgoWFwd3",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvTL5ACm2uMcgoWFweC",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "b37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 70
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvTL96Xm2uMcgoWFweD",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "b37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvTNrKPm2uMcgoWFweF",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "b37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8dWFm2uMcgoWFwdy",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8dlim2uMcgoWFwdz",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8d7am2uMcgoWFwd1",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8eX0m2uMcgoWFwd4",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8nplm2uMcgoWFwd7",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 50
}
}
]
}
}
Output
Key=b37cacc3-71d5-40f0-a329-a051a3949ced Average=50
Key=a37cacc3-71d5-40f0-a329-a051a3949ced Average=37
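One note on both Java answers above: getAverageByKey divides the long total by the int count, so the printed averages are truncated to whole numbers. If fractional precision matters, a variant along these lines (same types as above, just returning a Double) should work:

// Variant of getAverageByKey that keeps fractional precision by returning a Double.
private static PairFunction<Tuple2<String, Tuple2<Long, Integer>>, String, Double> getAverageByKeyAsDouble = (tuple) -> {
    Tuple2<Long, Integer> val = tuple._2;
    return new Tuple2<String, Double>(tuple._1, val._1.doubleValue() / val._2);
};

Using it, averagePair becomes a JavaPairRDD<String, Double>; the rest of the pipeline stays the same.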
The above solutions are in Java; anyone looking for a Scala version can try the solution below.
val mapp = data.map(x => x.split(" "))
// (department, (salary, 1))
val dept = mapp.map(x => (x(0), (x(1).toInt, 1)))
// sum salaries and counts per department
val sumCount = dept.reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2))
// divide the total by the count to get the department-wise average
val avg = sumCount.mapValues { case (total, count) => total / count }
avg.foreach(println)
Output
(Admin,1900000)
(HR,2100000)
(IT,2012500)
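For reference, with the question's data these averages work out as IT = 8050000 / 4 = 2012500, HR = 4200000 / 2 = 2100000 and Admin = 1900000 / 1 = 1900000.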
Source: https://stackoverflow.com/questions/38847188/find-average-by-department-in-spark-groupby-in-java-1-8