Spark, Scala, DataFrame: create feature vectors


I have a DataFrame that looks like the following:

userID, category, frequency
1,cat1,1
1,cat2,3
1,cat9,5
2,cat4,6
2,cat9,2
2,cat10,1
3,cat1,5
3,cat7,16
3,cat8,2

How can I turn this into one feature vector per userID, where each position corresponds to a category and holds that category's frequency (0 for categories the user never touched)?
3 Answers

    A slightly more DataFrame-centric solution:

    // (assumes spark-shell, so sc and the $ column syntax are in scope)
    import org.apache.spark.ml.feature.VectorAssembler
    import org.apache.spark.sql.functions.{lit, sum, when}
    
    val df = sc.parallelize(Seq(
      (1, "cat1", 1), (1, "cat2", 3), (1, "cat9", 5), (2, "cat4", 6),
      (2, "cat9", 2), (2, "cat10", 1), (3, "cat1", 5), (3, "cat7", 16),
      (3, "cat8", 2))).toDF("userID", "category", "frequency")
    
    // Create a sorted array of the distinct categories
    val categories = df
      .select($"category")
      .distinct
      .collect
      .map(_.getString(0))
      .sorted
    
    // Prepare the vector assembler; one input column per category
    val assembler = new VectorAssembler()
      .setInputCols(categories)
      .setOutputCol("features")
    
    // One aggregation expression per category: the summed frequency
    // for that category, or 0 when the user never touched it
    val exprs = categories.map(
       c => sum(when($"category" === c, $"frequency").otherwise(lit(0))).alias(c))
    
    val transformed = assembler.transform(
        df.groupBy($"userID").agg(exprs.head, exprs.tail: _*))
      .select($"userID", $"features")
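
    For reference, the sorted category order here is cat1, cat10, cat2, cat4, cat7, cat8, cat9, so user 1 should come out as [1.0,0.0,3.0,0.0,0.0,0.0,5.0] (VectorAssembler may emit it in sparse form).

    On Spark 1.6+ the hand-rolled aggregation expressions can also be replaced with pivot; a minimal sketch, assuming the same df, categories and assembler as above (pivoted and transformedPivot are names of my choosing):

    // Build the same wide table with pivot and fill the gaps with 0
    val pivoted = df
      .groupBy($"userID")
      .pivot("category", categories.toSeq)
      .sum("frequency")
      .na.fill(0)
    
    val transformedPivot = assembler
      .transform(pivoted)
      .select($"userID", $"features")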
    

    And here is a UDAF alternative:

    import org.apache.spark.sql.Row
    import org.apache.spark.sql.expressions.{
      MutableAggregationBuffer, UserDefinedAggregateFunction}
    import org.apache.spark.mllib.linalg.{Vectors, VectorUDT}
    import org.apache.spark.sql.types.{
      StructType, ArrayType, DoubleType, IntegerType}
    import scala.collection.mutable.WrappedArray
    
    // Sums (index, value) pairs into a dense vector of fixed length n
    class VectorAggregate (n: Int) extends UserDefinedAggregateFunction {
        def inputSchema = new StructType()
          .add("i", IntegerType)
          .add("v", DoubleType)
        def bufferSchema = new StructType().add("buff", ArrayType(DoubleType))
        def dataType = new VectorUDT()
        def deterministic = true
    
        // Start every group from a zero-filled buffer
        def initialize(buffer: MutableAggregationBuffer) = {
          buffer.update(0, Array.fill(n)(0.0))
        }
    
        // Add the incoming value at its index, skipping null indices
        def update(buffer: MutableAggregationBuffer, input: Row) = {
          if (!input.isNullAt(0)) {
            val i = input.getInt(0)
            val v = input.getDouble(1)
            val buff = buffer.getAs[WrappedArray[Double]](0)
            buff(i) += v
            buffer.update(0, buff)
          }
        }
    
        // Combine two partial buffers element-wise
        def merge(buffer1: MutableAggregationBuffer, buffer2: Row) = {
          val buff1 = buffer1.getAs[WrappedArray[Double]](0)
          val buff2 = buffer2.getAs[WrappedArray[Double]](0)
          for ((x, i) <- buff2.zipWithIndex) {
            buff1(i) += x
          }
          buffer1.update(0, buff1)
        }
    
        def evaluate(buffer: Row) = Vectors.dense(
          buffer.getAs[Seq[Double]](0).toArray)
    }
    

    with example usage:

    import org.apache.spark.ml.feature.StringIndexer
    
    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("category_idx")
      .fit(df)
    
    val indexed = indexer.transform(df)
      .withColumn("category_idx", $"category_idx".cast("integer"))
      .withColumn("frequency", $"frequency".cast("double"))
    
    // StringIndexer produces indices 0 to labels.size - 1,
    // so one slot per label is enough
    val n = indexer.labels.size
    
    val transformed = indexed
      .groupBy($"userID")
      .agg(new VectorAggregate(n)($"category_idx", $"frequency").as("vec"))
    
    transformed.show
    
    // +------+--------------------+
    // |userID|                 vec|
    // +------+--------------------+
    // |     1|[1.0,5.0,0.0,3.0,...|
    // |     2|[0.0,2.0,0.0,0.0,...|
    // |     3|[5.0,0.0,16.0,0.0...|
    // +------+--------------------+
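
    One caveat, assuming Spark 2.x: VectorAggregate returns org.apache.spark.mllib.linalg vectors, while spark.ml estimators expect org.apache.spark.ml.linalg ones, so the column may need converting (mlReady is a hypothetical name):

    import org.apache.spark.mllib.util.MLUtils
    
    // Convert the mllib vector column to its spark.ml counterpart
    val mlReady = MLUtils.convertVectorColumnsToML(transformed, "vec")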
    

    In this case the order of values is defined by indexer.labels (by default StringIndexer orders labels by descending frequency):

    indexer.labels
    // Array[String] = Array(cat1, cat9, cat7, cat2, cat8, cat4, cat10)
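
    On Spark 3.x, UserDefinedAggregateFunction is deprecated in favor of Aggregator registered through functions.udaf. A rough sketch of the same aggregation under that assumption (VectorAggregator and vector_agg are names I made up; the encoder choices in particular may need adjusting):

    import org.apache.spark.ml.linalg.{Vector, Vectors}
    import org.apache.spark.sql.Encoder
    import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
    import org.apache.spark.sql.expressions.Aggregator
    import org.apache.spark.sql.functions.udaf
    
    class VectorAggregator(n: Int)
        extends Aggregator[(Int, Double), Array[Double], Vector] {
      // Fresh zero-filled buffer for each group
      def zero: Array[Double] = Array.fill(n)(0.0)
      // Add one (index, value) pair; mutating the buffer in place is allowed
      def reduce(buff: Array[Double], in: (Int, Double)): Array[Double] = {
        buff(in._1) += in._2
        buff
      }
      // Combine two partial buffers element-wise
      def merge(b1: Array[Double], b2: Array[Double]): Array[Double] = {
        for (i <- b2.indices) b1(i) += b2(i)
        b1
      }
      def finish(buff: Array[Double]): Vector = Vectors.dense(buff)
      def bufferEncoder: Encoder[Array[Double]] = ExpressionEncoder()
      def outputEncoder: Encoder[Vector] = ExpressionEncoder()
    }
    
    val vector_agg = udaf(new VectorAggregator(n))
    
    val transformed = indexed
      .groupBy($"userID")
      .agg(vector_agg($"category_idx", $"frequency").as("vec"))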
    

    In practice I would prefer the solution by Odomontois, so these are provided mostly for reference.
