I\'m using Spark 2.0.2. I have a DataFrame that has an alias on it, and I\'d like to be able to retrieve that. A simplified example of why I\'d want that is below.
Disclaimer: as stated above, this code relies on undocumented APIs subject to change. It works as of Spark 2.3.
After much digging into mostly undocumented Spark methods, here is the full code to pull the list of fields, along with the table alias for a dataframe in PySpark:
def schema_from_plan(df):
plan = df._jdf.queryExecution().analyzed()
all_fields = _schema_from_plan(plan)
iterator = plan.output().iterator()
output_fields = {}
while iterator.hasNext():
field = iterator.next()
queryfield = all_fields.get(field.exprId().id(),{})
if not queryfield=={}:
tablealias = queryfield["tablealias"]
else:
tablealias = ""
output_fields[field.exprId().id()] = {
"tablealias": tablealias,
"dataType": field.dataType().typeName(),
"name": field.name()
}
return list(output_fields.values())
def _schema_from_plan(root,tablealias=None,fields={}):
iterator = root.children().iterator()
while iterator.hasNext():
node = iterator.next()
nodeClass = node.getClass().getSimpleName()
if (nodeClass=="SubqueryAlias"):
# get the alias and process the subnodes with this alias
_schema_from_plan(node,node.alias(),fields)
else:
if tablealias:
# add all the fields, along with the unique IDs, and a new tablealias field
iterator = node.output().iterator()
while iterator.hasNext():
field = iterator.next()
fields[field.exprId().id()] = {
"tablealias": tablealias,
"dataType": field.dataType().typeName(),
"name": field.name()
}
_schema_from_plan(node,tablealias,fields)
return fields
# example: fields = schema_from_plan(df)