Dropping a nested column from Spark DataFrame

故事扮演 提交于 2019-12-17 04:31:56

问题


I have a DataFrame with the schema

root
 |-- label: string (nullable = true)
 |-- features: struct (nullable = true)
 |    |-- feat1: string (nullable = true)
 |    |-- feat2: string (nullable = true)
 |    |-- feat3: string (nullable = true)

While, I am able to filter the data frame using

  val data = rawData
     .filter( !(rawData("features.feat1") <=> "100") )

I am unable to drop the columns using

  val data = rawData
       .drop("features.feat1")

Is it something that I am doing wrong here? I also tried (unsuccessfully) doing drop(rawData("features.feat1")), though it does not make much sense to do so.

Thanks in advance,

Nikhil


回答1:


It is just a programming exercise but you can try something like this:

import org.apache.spark.sql.{DataFrame, Column}
import org.apache.spark.sql.types.{StructType, StructField}
import org.apache.spark.sql.{functions => f}
import scala.util.Try

case class DFWithDropFrom(df: DataFrame) {
  def getSourceField(source: String): Try[StructField] = {
    Try(df.schema.fields.filter(_.name == source).head)
  }

  def getType(sourceField: StructField): Try[StructType] = {
    Try(sourceField.dataType.asInstanceOf[StructType])
  }

  def genOutputCol(names: Array[String], source: String): Column = {
    f.struct(names.map(x => f.col(source).getItem(x).alias(x)): _*)
  }

  def dropFrom(source: String, toDrop: Array[String]): DataFrame = {
    getSourceField(source)
      .flatMap(getType)
      .map(_.fieldNames.diff(toDrop))
      .map(genOutputCol(_, source))
      .map(df.withColumn(source, _))
      .getOrElse(df)
  }
}

Example usage:

scala> case class features(feat1: String, feat2: String, feat3: String)
defined class features

scala> case class record(label: String, features: features)
defined class record

scala> val df = sc.parallelize(Seq(record("a_label",  features("f1", "f2", "f3")))).toDF
df: org.apache.spark.sql.DataFrame = [label: string, features: struct<feat1:string,feat2:string,feat3:string>]

scala> DFWithDropFrom(df).dropFrom("features", Array("feat1")).show
+-------+--------+
|  label|features|
+-------+--------+
|a_label| [f2,f3]|
+-------+--------+


scala> DFWithDropFrom(df).dropFrom("foobar", Array("feat1")).show
+-------+----------+
|  label|  features|
+-------+----------+
|a_label|[f1,f2,f3]|
+-------+----------+


scala> DFWithDropFrom(df).dropFrom("features", Array("foobar")).show
+-------+----------+
|  label|  features|
+-------+----------+
|a_label|[f1,f2,f3]|
+-------+----------+

Add an implicit conversion and you're good to go.




回答2:


This version allows you to remove nested columns at any level:

import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructType, DataType}

/**
  * Various Spark utilities and extensions of DataFrame
  */
object DataFrameUtils {

  private def dropSubColumn(col: Column, colType: DataType, fullColName: String, dropColName: String): Option[Column] = {
    if (fullColName.equals(dropColName)) {
      None
    } else {
      colType match {
        case colType: StructType =>
          if (dropColName.startsWith(s"${fullColName}.")) {
            Some(struct(
              colType.fields
                .flatMap(f =>
                  dropSubColumn(col.getField(f.name), f.dataType, s"${fullColName}.${f.name}", dropColName) match {
                    case Some(x) => Some(x.alias(f.name))
                    case None => None
                  })
                : _*))
          } else {
            Some(col)
          }
        case other => Some(col)
      }
    }
  }

  protected def dropColumn(df: DataFrame, colName: String): DataFrame = {
    df.schema.fields
      .flatMap(f => {
        if (colName.startsWith(s"${f.name}.")) {
          dropSubColumn(col(f.name), f.dataType, f.name, colName) match {
            case Some(x) => Some((f.name, x))
            case None => None
          }
        } else {
          None
        }
      })
      .foldLeft(df.drop(colName)) {
        case (df, (colName, column)) => df.withColumn(colName, column)
      }
  }

  /**
    * Extended version of DataFrame that allows to operate on nested fields
    */
  implicit class ExtendedDataFrame(df: DataFrame) extends Serializable {
    /**
      * Drops nested field from DataFrame
      *
      * @param colName Dot-separated nested field name
      */
    def dropNestedColumn(colName: String): DataFrame = {
      DataFrameUtils.dropColumn(df, colName)
    }
  }
}

Usage:

import DataFrameUtils._
df.dropNestedColumn("a.b.c.d")



回答3:


Expanding on spektom answer. With support for array types:

object DataFrameUtils {

  private def dropSubColumn(col: Column, colType: DataType, fullColName: String, dropColName: String): Option[Column] = {
    if (fullColName.equals(dropColName)) {
      None
    } else if (dropColName.startsWith(s"$fullColName.")) {
      colType match {
        case colType: StructType =>
          Some(struct(
            colType.fields
              .flatMap(f =>
                dropSubColumn(col.getField(f.name), f.dataType, s"$fullColName.${f.name}", dropColName) match {
                  case Some(x) => Some(x.alias(f.name))
                  case None => None
                })
              : _*))
        case colType: ArrayType =>
          colType.elementType match {
            case innerType: StructType =>
              Some(struct(innerType.fields
                .flatMap(f =>
                  dropSubColumn(col.getField(f.name), f.dataType, s"$fullColName.${f.name}", dropColName) match {
                    case Some(x) => Some(x.alias(f.name))
                    case None => None
                  })
                : _*))
          }

        case other => Some(col)
      }
    } else {
      Some(col)
    }
  }

  protected def dropColumn(df: DataFrame, colName: String): DataFrame = {
    df.schema.fields
      .flatMap(f => {
        if (colName.startsWith(s"${f.name}.")) {
          dropSubColumn(col(f.name), f.dataType, f.name, colName) match {
            case Some(x) => Some((f.name, x))
            case None => None
          }
        } else {
          None
        }
      })
      .foldLeft(df.drop(colName)) {
        case (df, (colName, column)) => df.withColumn(colName, column)
      }
  }

  /**
    * Extended version of DataFrame that allows to operate on nested fields
    */
  implicit class ExtendedDataFrame(df: DataFrame) extends Serializable {
    /**
      * Drops nested field from DataFrame
      *
      * @param colName Dot-separated nested field name
      */
    def dropNestedColumn(colName: String): DataFrame = {
      DataFrameUtils.dropColumn(df, colName)
    }
  }

}



回答4:


Following spektom's code snippet for scala, I've created a similar code in Java. Since java 8 doesn't have foldLeft, I used forEachOrdered. This code is suitable for spark 2.x (I'm using 2.1) Also I noted that dropping a column and adding it using withColumn with the same name doesn't work, so I'm just replacing the column, and it seem to work.

Code is not fully tested, hope it works :-)

public class DataFrameUtils {

public static Dataset<Row> dropNestedColumn(Dataset<Row> dataFrame, String columnName) {
    final DataFrameFolder dataFrameFolder = new DataFrameFolder(dataFrame);
    Arrays.stream(dataFrame.schema().fields())
        .flatMap( f -> {
           if (columnName.startsWith(f.name() + ".")) {
               final Optional<Column> column = dropSubColumn(col(f.name()), f.dataType(), f.name(), columnName);
               if (column.isPresent()) {
                   return Stream.of(new Tuple2<>(f.name(), column));
               } else {
                   return Stream.empty();
               }
           } else {
               return Stream.empty();
           }
        }).forEachOrdered(colTuple -> dataFrameFolder.accept(colTuple));

    return dataFrameFolder.getDF();
}

private static Optional<Column> dropSubColumn(Column col, DataType colType, String fullColumnName, String dropColumnName) {
    Optional<Column> column = Optional.empty();
    if (!fullColumnName.equals(dropColumnName)) {
        if (colType instanceof StructType) {
            if (dropColumnName.startsWith(fullColumnName + ".")) {
                column = Optional.of(struct(getColumns(col, (StructType)colType, fullColumnName, dropColumnName)));
            }
        } else {
            column = Optional.of(col);
        }
    }

    return column;
}

private static Column[] getColumns(Column col, StructType colType, String fullColumnName, String dropColumnName) {
    return Arrays.stream(colType.fields())
        .flatMap(f -> {
                    final Optional<Column> column = dropSubColumn(col.getField(f.name()), f.dataType(),
                            fullColumnName + "." + f.name(), dropColumnName);
                    if (column.isPresent()) {
                        return Stream.of(column.get().alias(f.name()));
                    } else {
                        return Stream.empty();
                    }
                }
        ).toArray(Column[]::new);

}

private static class DataFrameFolder implements Consumer<Tuple2<String, Optional<Column>>> {
    private Dataset<Row> df;

    public DataFrameFolder(Dataset<Row> df) {
        this.df = df;
    }

    public Dataset<Row> getDF() {
        return df;
    }

    @Override
    public void accept(Tuple2<String, Optional<Column>> colTuple) {
        if (!colTuple._2().isPresent()) {
            df = df.drop(colTuple._1());
        } else {
            df = df.withColumn(colTuple._1(), colTuple._2().get());
        }
    }
}

Usage example:

private class Pojo {
    private String str;
    private Integer number;
    private List<String> strList;
    private Pojo2 pojo2;

    public String getStr() {
        return str;
    }

    public Integer getNumber() {
        return number;
    }

    public List<String> getStrList() {
        return strList;
    }

    public Pojo2 getPojo2() {
        return pojo2;
    }

}

private class Pojo2 {
    private String str;
    private Integer number;
    private List<String> strList;

    public String getStr() {
        return str;
    }

    public Integer getNumber() {
        return number;
    }

    public List<String> getStrList() {
        return strList;
    }

}

SQLContext context = new SQLContext(new SparkContext("local[1]", "test"));
Dataset<Row> df = context.createDataFrame(Collections.emptyList(), Pojo.class);
Dataset<Row> dfRes = DataFrameUtils.dropNestedColumn(df, "pojo2.str");

Original struct:

root
 |-- number: integer (nullable = true)
 |-- pojo2: struct (nullable = true)
 |    |-- number: integer (nullable = true)
 |    |-- str: string (nullable = true)
 |    |-- strList: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- str: string (nullable = true)
 |-- strList: array (nullable = true)
 |    |-- element: string (containsNull = true)

After drop:

root
 |-- number: integer (nullable = true)
 |-- pojo2: struct (nullable = false)
 |    |-- number: integer (nullable = true)
 |    |-- strList: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- str: string (nullable = true)
 |-- strList: array (nullable = true)
 |    |-- element: string (containsNull = true)


来源:https://stackoverflow.com/questions/32727279/dropping-a-nested-column-from-spark-dataframe

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!