I have the below data set, where the first column is the department and the second is the salary. I want to calculate the average salary by department.
IT 2000000
HR 2000000
IT 1950000
HR 2200000
Admin 1900000
IT 1900000
IT 2200000
I performed the below operation:
JavaPairRDD<String, Iterable<Long>> rddY = employees.groupByKey();
System.out.println("<=========================RDDY collect==================>" + rddY.collect());
and got the below output:
<=========================RDDY collect==================>[(IT,[2000000, 1950000, 1900000, 2200000]), (HR,[2000000, 2200000]), (Admin,[1900000])]
What I need is to calculate the total average and the department-wise average using Spark RDDs.
How can I use the groupBy functions in Spark to calculate the average?
Below is the code to calculate the average by key using a Spark JavaPairRDD. Hope this helps.
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

public class SparkAverageCalculation {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Average Calculation").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // input list
        List<Tuple2<String, Integer>> inputList = new ArrayList<Tuple2<String, Integer>>();
        inputList.add(new Tuple2<String, Integer>("a1", 30));
        inputList.add(new Tuple2<String, Integer>("b1", 30));
        inputList.add(new Tuple2<String, Integer>("a1", 40));
        inputList.add(new Tuple2<String, Integer>("a1", 20));
        inputList.add(new Tuple2<String, Integer>("b1", 50));

        // parallelizePairs
        JavaPairRDD<String, Integer> pairRDD = sc.parallelizePairs(inputList);

        // pair each value with a count of 1: (key, (value, 1))
        JavaPairRDD<String, Tuple2<Integer, Integer>> valueCount = pairRDD.mapValues(value -> new Tuple2<Integer, Integer>(value, 1));

        // sum the values and the counts per key with reduceByKey
        JavaPairRDD<String, Tuple2<Integer, Integer>> reducedCount = valueCount.reduceByKey((tuple1, tuple2) -> new Tuple2<Integer, Integer>(tuple1._1 + tuple2._1, tuple1._2 + tuple2._2));

        // calculate average
        JavaPairRDD<String, Integer> averagePair = reducedCount.mapToPair(getAverageByKey);

        // print average by key
        averagePair.foreach(data -> {
            System.out.println("Key=" + data._1() + " Average=" + data._2());
        });

        // stop sc
        sc.stop();
        sc.close();
    }

    private static PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer> getAverageByKey = (tuple) -> {
        Tuple2<Integer, Integer> val = tuple._2;
        int total = val._1;
        int count = val._2;
        Tuple2<String, Integer> averagePair = new Tuple2<String, Integer>(tuple._1, total / count);
        return averagePair;
    };
}
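To apply the same pattern to the department/salary data in the question, and to also get the total average asked for, a minimal sketch could look like the following. It assumes the same JavaSparkContext sc and imports as the class above (plus org.apache.spark.api.java.JavaRDD), and the file name salaries.txt is only a placeholder for wherever the "IT 2000000" lines live.

// Minimal sketch: department-wise and overall average from lines like "IT 2000000".
// Assumes the same JavaSparkContext sc as above; "salaries.txt" is a hypothetical path.
JavaRDD<String> lines = sc.textFile("salaries.txt");

// (department, (salary, 1)) so reduceByKey can sum salaries and counts in one pass
JavaPairRDD<String, Tuple2<Long, Integer>> sumCount = lines
        .mapToPair(line -> {
            String[] parts = line.split(" ");
            return new Tuple2<String, Tuple2<Long, Integer>>(parts[0],
                    new Tuple2<Long, Integer>(Long.parseLong(parts[1]), 1));
        })
        .reduceByKey((t1, t2) -> new Tuple2<Long, Integer>(t1._1 + t2._1, t1._2 + t2._2));

// department-wise average
JavaPairRDD<String, Double> deptAvg = sumCount.mapValues(t -> t._1.doubleValue() / t._2);
deptAvg.collect().forEach(t -> System.out.println(t._1() + " average=" + t._2()));

// total average: reduce all (salarySum, count) pairs once more and divide
Tuple2<Long, Integer> totals = sumCount.values()
        .reduce((t1, t2) -> new Tuple2<Long, Integer>(t1._1 + t2._1, t1._2 + t2._2));
System.out.println("Total average=" + (totals._1.doubleValue() / totals._2));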
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
import scala.Tuple2;
import java.util.Map;

public class ElasticsearchMetricProcessor {

    private static final String ES_HOST_PORT = "localhost:9200";

    private static PairFunction<Tuple2<String, Tuple2<Long, Integer>>, String, Long> getAverageByKey = (tuple) -> {
        Tuple2<Long, Integer> val = tuple._2;
        long total = val._1;
        int count = val._2;
        Tuple2<String, Long> averagePair = new Tuple2<String, Long>(tuple._1, total / count);
        return averagePair;
    };

    public static void main(String args[]) throws InterruptedException {
        System.setProperty("hadoop.home.dir", "C:\\Users\\anki\\metering\\winutils");

        SparkConf sparkConf = new SparkConf().setAppName("StreamingApp").setMaster("local[2]");
        sparkConf.set("es.nodes.wan.only", "false");
        sparkConf.set("es.nodes", ES_HOST_PORT);
        JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, Durations.seconds(10));

        // read the report-execution documents from Elasticsearch as a JavaRDD of field maps
        JavaRDD<Map<String, Object>> esRDD = JavaEsSpark.esRDD(jsc.sparkContext(), "portal_analytics/report-execution").values();

        // (id, (duration, 1)) so durations and counts can be summed per key
        JavaPairRDD<String, Tuple2<Long, Integer>> valueCount = esRDD
                .mapToPair(x -> new Tuple2<String, Long>(x.get("id").toString(), Long.valueOf(x.get("duration").toString())))
                .mapValues(value -> new Tuple2<Long, Integer>(value, 1));

        // sum durations and counts per key with reduceByKey
        JavaPairRDD<String, Tuple2<Long, Integer>> reducedCount = valueCount.reduceByKey((tuple1, tuple2) -> new Tuple2<Long, Integer>(tuple1._1 + tuple2._1, tuple1._2 + tuple2._2));

        // calculate average
        JavaPairRDD<String, Long> averagePair = reducedCount.mapToPair(getAverageByKey);

        // print average by key
        averagePair.foreach(data -> {
            System.out.println("Key=" + data._1() + " Average=" + data._2());
        });

        // stop the context
        jsc.stop();
        jsc.close();
    }
}
--------------------------------------------------------
Elasticsearch Test Data
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"failed": 0
},
"hits": {
"total": 16,
"max_score": 1,
"hits": [
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8aPGm2uMcgoWFwdx",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8eOcm2uMcgoWFwd3",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvTL5ACm2uMcgoWFweC",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "b37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 70
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvTL96Xm2uMcgoWFweD",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "b37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvTNrKPm2uMcgoWFweF",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "b37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8dWFm2uMcgoWFwdy",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8dlim2uMcgoWFwdz",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8d7am2uMcgoWFwd1",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8eX0m2uMcgoWFwd4",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8nplm2uMcgoWFwd7",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 50
}
}
]
}
}
Output
Key=b37cacc3-71d5-40f0-a329-a051a3949ced Average=50
Key=a37cacc3-71d5-40f0-a329-a051a3949ced Average=37
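One note on both Java answers above: getAverageByKey divides the long total by the int count, so the printed averages are truncated to whole numbers. If fractional precision matters, a variant along these lines (same types as above, just returning a Double) should work:

// Variant of getAverageByKey that keeps fractional precision by returning a Double.
private static PairFunction<Tuple2<String, Tuple2<Long, Integer>>, String, Double> getAverageByKeyAsDouble = (tuple) -> {
    Tuple2<Long, Integer> val = tuple._2;
    return new Tuple2<String, Double>(tuple._1, val._1.doubleValue() / val._2);
};

Using it, averagePair becomes a JavaPairRDD<String, Double>; the rest of the pipeline stays the same.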
The above solutions are in Java; anyone looking for a Scala version can try the solution below.
val mapp = data.map(x => x.split(" "))
// (department, (salary, 1))
val dept = mapp.map(x => (x(0), (x(1).toInt, 1)))
// sum salaries and counts per department
val sumCount = dept.reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2))
// divide the total by the count to get the department-wise average
val avg = sumCount.mapValues { case (total, count) => total / count }
avg.foreach(println)
Output
(Admin,1900000)
(HR,2100000)
(IT,2012500)
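For reference, with the question's data these averages work out as IT = 8050000 / 4 = 2012500, HR = 4200000 / 2 = 2100000 and Admin = 1900000 / 1 = 1900000.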
Source: https://stackoverflow.com/questions/38847188/find-average-by-department-in-spark-groupby-in-java-1-8