We are doing streaming on kafka data which being collected from MySQL. Now once all the analytics has been done i want to save my data directly to Hbase. I have through the
1- add these libraries to your project :
"org.apache.hbase" % "hbase-client" % "2.0.1"
"org.apache.hbase" % "hbase-common" % "2.0.1"
2- add this trait to your code :
import java.util.concurrent.ExecutorService
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put, Table}
import org.apache.hadoop.hbase.security.User
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.sql.ForeachWriter
trait HBaseForeachWriter[RECORD] extends ForeachWriter[RECORD] {
val tableName: String
val hbaseConfResources: Seq[String]
def pool: Option[ExecutorService] = None
def user: Option[User] = None
private var hTable: Table = _
private var connection: Connection = _
override def open(partitionId: Long, version: Long): Boolean = {
connection = createConnection()
hTable = getHTable(connection)
true
}
def createConnection(): Connection = {
val hbaseConfig = HBaseConfiguration.create()
hbaseConfResources.foreach(hbaseConfig.addResource)
ConnectionFactory.createConnection(hbaseConfig, pool.orNull, user.orNull)
}
def getHTable(connection: Connection): Table = {
connection.getTable(TableName.valueOf(tableName))
}
override def process(record: RECORD): Unit = {
val put = toPut(record)
hTable.put(put)
}
override def close(errorOrNull: Throwable): Unit = {
hTable.close()
connection.close()
}
def toPut(record: RECORD): Put
}
3- use it for your logic :
val ds = .... //anyDataset[WhatEverYourDataType]
val query = ds.writeStream
.foreach(new HBaseForeachWriter[WhatEverYourDataType] {
override val tableName: String = "hbase-table-name"
//your cluster files, i assume here it is in resources
override val hbaseConfResources: Seq[String] = Seq("core-site.xml", "hbase-site.xml")
override def toPut(record: WhatEverYourDataType): Put = {
val key = .....
val columnFamaliyName : String = ....
val columnName : String = ....
val columnValue = ....
val p = new Put(Bytes.toBytes(key))
//Add columns ...
p.addColumn(Bytes.toBytes(columnFamaliyName),
Bytes.toBytes(columnName),
Bytes.toBytes(columnValue))
p
}
}
).start()
query.awaitTermination()