// Sample input for the first data set: one (c1, c2, c3, c4) tuple per row, all ACTIVE.
// Column names match the header of the result table and the join keys used further down.
// NOTE(review): `toDF` needs `spark.implicits._` in scope; spark-shell imports it
// automatically — confirm if this is run outside the shell.
val df1 = sc.parallelize(Seq(
  ("a1", 10, "ACTIVE", "ds1"),
  ("a1", 20, "ACTIVE", "ds1"),
  ("a2", 50, "ACTIVE", "ds1"),
  ("a3", 60, "ACTIVE", "ds1")
)).toDF("c1", "c2", "c3", "c4")
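I enjoyed the challenge; here is my solution. It keeps every row of df1 and appends the df2 rows whose c1 key exists in df1 but whose (c1, c2) pair does not, with c3 set to INACTIVE.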
// De-duplicated list of c1 keys present in df1; used to restrict df2 to known keys.
val c1keys = df1.select("c1").distinct()

// df2 rows whose c1 also occurs in df1 (inner join against the distinct key list,
// so df2 rows are not multiplied by repeated keys in df1).
val df2_in_df1 = df2.join(c1keys, Seq("c1"), "inner")

// Of those rows, keep only the (c1, c2) pairs that df1 does not contain,
// then overwrite c3 so they are reported as INACTIVE.
val df2inactive = {
  val missingFromDf1 = df2_in_df1.join(df1, Seq("c1", "c2"), "leftanti")
  missingFromDf1.withColumn("c3", lit("INACTIVE"))
}
scala> df1.union(df2inactive).show
+---+---+--------+---+
| c1| c2|      c3| c4|
+---+---+--------+---+
| a1| 10|  ACTIVE|ds1|
| a1| 20|  ACTIVE|ds1|
| a2| 50|  ACTIVE|ds1|
| a3| 60|  ACTIVE|ds1|
| a1| 30|INACTIVE|ds2|
| a1| 40|INACTIVE|ds2|
+---+---+--------+---+