The following Scala method works in local or client mode and writes the DataFrame to a single CSV file with the chosen name. It requires that the whole DataFrame fit into driver memory; otherwise `collect()` will fail with an out-of-memory error.
import org.apache.hadoop.fs.{FileSystem, Path}
// Staging directory for the CSV produced by saveResults. saveResults deletes this
// directory recursively on every call, so it must not hold anything else.
// `some_directory` is a placeholder: substitute the real directory path.
val SPARK_WRITE_LOCATION = some_directory
// The session saveResults reads `sparkContext` and `conf` from. Those are members of a
// SparkSession *instance*, not of the `SparkSession` companion object, so an actual
// session is obtained here; getOrCreate() reuses the already-running session if any.
val SPARKSESSION: org.apache.spark.sql.SparkSession =
  org.apache.spark.sql.SparkSession.builder().getOrCreate()
/**
  * Collects `results` on the driver and writes it as one CSV file called `filename`.
  *
  * The file is first written to `SPARK_WRITE_LOCATION/filename` on the filesystem Spark is
  * using (the local filesystem when the master is a `local*` master, otherwise the one from
  * the Hadoop configuration) and then copied to `WRITE_DIRECTORY/filename` on local disk.
  * `SPARK_WRITE_LOCATION` is deleted recursively before writing.
  *
  * The whole DataFrame is materialised with `collect()`, so it must fit in driver memory.
  * No header line is written. Null values are written as the text `null`.
  *
  * NOTE: when `results` is empty the JVM is terminated with exit status -1.
  * NOTE(review): `WRITE_DIRECTORY` is not defined in the visible part of the file; it is
  * assumed to be declared elsewhere.
  *
  * @param results  the DataFrame to export
  * @param filename name of the CSV file to create (used for both staging and final copy)
  */
def saveResults(results: DataFrame, filename: String): Unit = {
  // Renders one cell. Fields containing a comma, quote or line break are quoted (with
  // embedded quotes doubled) so that such values cannot corrupt the row structure.
  def csvField(value: Any): String = {
    val text = Option(value).fold("null")(_.toString)
    if (text.exists(c => c == ',' || c == '"' || c == '\n' || c == '\r')) {
      "\"" + text.replace("\"", "\"\"") + "\""
    } else {
      text
    }
  }

  // Prefix match, not substring match: "spark://localhost:7077" is not a local master.
  val runsLocally = SPARKSESSION.conf.get("spark.master").startsWith("local")
  val localFs = FileSystem.getLocal(new org.apache.hadoop.conf.Configuration())
  val fs: FileSystem =
    if (runsLocally) localFs
    else FileSystem.get(SPARKSESSION.sparkContext.hadoopConfiguration)

  // Start from an empty staging directory.
  val stagingDir = new Path(SPARK_WRITE_LOCATION)
  if (fs.exists(stagingDir)) {
    val deleted = fs.delete(stagingDir, true) // recursive
    assert(deleted, s"could not delete staging directory $stagingDir")
  }

  // Collect once: a separate count() would run a second Spark job and could disagree
  // with the rows that are actually written.
  val rows = results.collect()
  if (rows.nonEmpty) {
    val stagedFile = new Path(SPARK_WRITE_LOCATION, filename)
    val writer = new BufferedWriter(
      new OutputStreamWriter(fs.create(stagedFile, true), "UTF-8") // overwrite = true
    )
    // Closing the writer also closes the underlying stream; do it even if a write fails.
    try {
      rows.foreach { row =>
        writer.write(row.toSeq.map(csvField).mkString("", ",", "\n"))
      }
    } finally {
      writer.close()
    }

    // copyToLocalFile always writes to local disk, so a stale destination has to be
    // looked up on the local filesystem, not on `fs` (which may be remote).
    val resultsWritePath = new Path(WRITE_DIRECTORY, filename)
    if (localFs.exists(resultsWritePath)) {
      localFs.delete(resultsWritePath, true)
    }
    // delSrc = false, useRawLocalFileSystem = true (no .crc side file)
    fs.copyToLocalFile(false, stagedFile, resultsWritePath, true)
  } else {
    System.exit(-1)
  }
}