We are streaming Kafka data that is being collected from MySQL. Now, once all the analytics has been done, I want to save my data directly to HBase. I have gone through the …
This method worked for me, even from PySpark: https://github.com/hortonworks-spark/shc/issues/205. The idea is to implement Spark's StreamSinkProvider so that each streaming micro-batch is written to HBase through SHC's batch writer:
package HBase

import org.apache.spark.internal.Logging
import org.apache.spark.sql.execution.datasources.hbase._
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.{DataFrame, SQLContext}

// A minimal streaming Sink that delegates each micro-batch to SHC's batch writer.
class HBaseSink(options: Map[String, String]) extends Sink with Logging {
  // The "hbasecat" option carries the SHC table catalog JSON (HBaseTableCatalog.tableCatalog).
  private val hBaseCatalog = options.getOrElse("hbasecat", "")

  override def addBatch(batchId: Long, data: DataFrame): Unit = synchronized {
    // Rebuild the DataFrame from the micro-batch's RDD so it becomes a plain
    // (non-streaming) DataFrame that the batch writer will accept.
    val df = data.sparkSession.createDataFrame(data.rdd, data.schema)
    df.write
      .options(Map(
        HBaseTableCatalog.tableCatalog -> hBaseCatalog,
        HBaseTableCatalog.newTable -> "5")) // create the table with 5 regions if it does not exist yet
      .format("org.apache.spark.sql.execution.datasources.hbase")
      .save()
  }
}

class HBaseSinkProvider extends StreamSinkProvider with DataSourceRegister {
  override def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink =
    new HBaseSink(parameters)

  // Short name for the source; using it requires a META-INF/services
  // registration, which is why the examples below use the full class name.
  override def shortName(): String = "hbase"
}
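For reference, the value passed through the "hbasecat" option is an SHC table catalog: a JSON document that maps the DataFrame's columns to an HBase row key and column families. A minimal sketch (the namespace, table name, and column mappings here are placeholders you would adapt to your own schema):

// Hypothetical catalog: maps a DataFrame with columns "key" and "value"
// to HBase table "mytable" with row key "key" and column family "cf1".
val catalog = s"""{
    |"table":{"namespace":"default", "name":"mytable"},
    |"rowkey":"key",
    |"columns":{
      |"key":{"cf":"rowkey", "col":"key", "type":"string"},
      |"value":{"cf":"cf1", "col":"value", "type":"string"}
    |}
  |}""".stripMargin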
I added the file, named HBaseSinkProvider.scala, to shc/core/src/main/scala/org/apache/spark/sql/execution/datasources/hbase and rebuilt the project; the example works perfectly.
This is an example of how to use it (Scala):
import scala.concurrent.duration._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

inputDF
  .writeStream
  .queryName("hbase writer")
  .format("HBase.HBaseSinkProvider")
  .option("checkpointLocation", checkPointProdPath)
  .option("hbasecat", catalog)
  .outputMode(OutputMode.Update())
  .trigger(Trigger.ProcessingTime(30.seconds))
  .start()
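Note that start() returns a StreamingQuery; in a standalone application you would typically capture it and block so the driver keeps running:

val query = ... // the start() call above
query.awaitTermination()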
And an example of how I use it in Python:
inputDF \
.writeStream \
.outputMode("append") \
.format('HBase.HBaseSinkProvider') \
.option('hbasecat', catalog_kafka) \
.option("checkpointLocation", '/tmp/checkpoint') \
.start()
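There is no Python-side package involved here: PySpark resolves HBase.HBaseSinkProvider by its JVM class name, so the jar you built above has to be on the Spark classpath for both the driver and the executors (for example via spark-submit --jars with the shc jar you built).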