1. Redis transaction (pipeline) test
In Redis, every single command is atomic, but a transaction does not guarantee atomicity and has no rollback: if a command inside a transaction fails, the remaining commands are still executed. By grouping multiple Redis operations into one batch so that they either all succeed or, on failure, the whole batch is discarded, we can achieve transaction-like behavior. A Redis transaction has three phases: begin the transaction (MULTI), queue the commands, and execute the transaction (EXEC). A sharded Redis cluster does not support pipelines; Redis only supports pipelines (transactions) on a single instance, though master-replica replication also works with pipelines (some companies run it this way in production). If you need a cluster with transactions, consider MongoDB: MongoDB clusters support transactions, and it is a NoSQL document database that can store massive amounts of data, is secure, and scales out.
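Before the pipeline-based test below, here is a minimal sketch of those three phases using the plain Jedis Transaction API (the host and port are placeholder assumptions):

import redis.clients.jedis.{Jedis, Transaction}

object RedisMultiExecDemo {
  def main(args: Array[String]): Unit = {
    val jedis = new Jedis("localhost", 6379) // placeholder host/port
    val tx: Transaction = jedis.multi() // 1. begin the transaction (MULTI)
    try {
      tx.set("k1", "v1") // 2. queue commands; nothing executes yet
      tx.incr("counter")
      tx.exec() // 3. execute the transaction (EXEC)
    } catch {
      case e: Exception =>
        tx.discard() // drop all queued commands on failure
        e.printStackTrace()
    } finally {
      jedis.close()
    }
  }
}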
RedisPipeLineTest

package com._51doit.spark14

import com._51doit.utils.JedisConnectionPool
import redis.clients.jedis.{Jedis, Pipeline}

object RedisPipeLineTest {

  def main(args: Array[String]): Unit = {
    val jedis: Jedis = JedisConnectionPool.getConnection
    jedis.select(1)
    // get the pipeline of this jedis connection
    val pipeline: Pipeline = jedis.pipelined()
    // start a transaction so multiple operations execute in one batch
    pipeline.multi()
    try {
      pipeline.hincrBy("AAA", "a", 200)
      // deliberately throw an ArithmeticException to simulate a failure
      val i = 1 / 0
      pipeline.hincrBy("BBB", "b", 20)
      // commit the transaction
      pipeline.exec()
      pipeline.sync()
    } catch {
      case e: Exception =>
        // discard the dirty (queued but uncommitted) writes
        pipeline.discard()
        e.printStackTrace()
    } finally {
      pipeline.close()
      jedis.close()
    }
  }
}
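Because 1 / 0 throws before exec() is reached, the catch block calls discard() and neither hincrBy takes effect; remove the faulty line and both writes commit together.

The JedisConnectionPool helper used above is not shown in the original post. A minimal sketch of such a helper, built on Jedis's own JedisPool (the host, port, and pool sizes here are assumptions):

package com._51doit.utils

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

object JedisConnectionPool {

  private val config = new JedisPoolConfig()
  config.setMaxTotal(20) // assumed maximum number of connections
  config.setMaxIdle(10)  // assumed maximum number of idle connections

  // host and port are placeholder assumptions
  private val pool = new JedisPool(config, "localhost", 6379)

  // declared with empty parens so both getConnection and getConnection() compile
  def getConnection(): Jedis = pool.getResource
}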
2. Implementing exactly-once statistics with a Redis pipeline
ExactlyOnceWordCountOffsetStoreInRedis

package cn._51doit.spark.day14

import cn._51doit.spark.utils.{JedisConnectionPool, OffsetUtils}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
import redis.clients.jedis.{Jedis, Pipeline}

/**
 * Read data from Kafka with exactly-once semantics, storing offsets in Redis:
 * 1. collect the aggregated results to the Driver,
 * 2. save the results and the offsets to Redis in one pipeline,
 * 3. on success, commit the transaction,
 * 4. on failure, discard the pending writes and let the job restart.
 */
object ExactlyOnceWordCountOffsetStoreInRedis {

  def main(args: Array[String]): Unit = {
    // e.g. true a1 g1 ta,tb
    val Array(isLocal, appName, groupId, allTopics) = args
    val conf = new SparkConf()
      .setAppName(appName)
    if (isLocal.toBoolean) {
      conf.setMaster("local[*]")
    }
    // create the StreamingContext and specify the batch interval
    val ssc = new StreamingContext(conf, Milliseconds(5000))
    // set the log level
    ssc.sparkContext.setLogLevel("WARN")

    // integrate Spark Streaming with Kafka:
    // 1. add the Kafka integration dependency
    // 2. create a direct DStream (uses the low-level consumer API, which is more efficient)
    val topics = allTopics.split(",")
    // parameters for the Kafka integration
    // (by default the Kafka consumer auto-commits offsets every 5 seconds
    //  to the special topic __consumer_offsets)
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "node-1.51doit.cn:9092,node-2.51doit.cn:9092,node-3.51doit.cn:9092",
      "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "group.id" -> groupId,
      // if no offsets were recorded, read from the beginning; otherwise resume from them
      "auto.offset.reset" -> "earliest",
      // do not let the consumer auto-commit offsets
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // before creating the Kafka DStream, query Redis for the historical offsets:
    // if there are none, read from the beginning, otherwise resume from them
    val offsets: Map[TopicPartition, Long] = OffsetUtils.queryHistoryOffsetFromRedis(appName, groupId)

    // createDirectStream is more efficient: it uses Kafka's low-level consumer API and
    // consumers connect directly to the leader of each Kafka partition;
    // with the direct approach, RDD partitions map one-to-one to Kafka partitions
    val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent, // schedule tasks onto the nodes where Kafka lives
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsets) // topic subscription rule
    )

    kafkaDStream.foreachRDD(rdd => {
      // only process non-empty batches
      if (!rdd.isEmpty()) {
        // get the offset ranges of all partitions of this RDD
        val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        // the WordCount business logic
        val words: RDD[String] = rdd.flatMap(_.value().split(" "))
        val wordsAndOne: RDD[(String, Int)] = words.map((_, 1))
        val reduced: RDD[(String, Int)] = wordsAndOne.reduceByKey(_ + _)
        // collect the results to the Driver and write them to Redis,
        // so that the data and the offsets are written in one transaction
        val res: Array[(String, Int)] = reduced.collect()
        var jedis: Jedis = null
        var pipeline: Pipeline = null
        // create a Redis connection (on the Driver)
        try {
          jedis = JedisConnectionPool.getConnection()
          // use a pipeline
          pipeline = jedis.pipelined()
          pipeline.select(1)
          // start a transaction so multiple operations execute together
          pipeline.multi()
          // write the computed results
          for (tp <- res) {
            pipeline.hincrBy("WORD_COUNT", tp._1, tp._2)
          }
          // write the offsets
          for (offsetRange <- offsetRanges) {
            val topic = offsetRange.topic
            val partition = offsetRange.partition
            val untilOffset = offsetRange.untilOffset
            // overwrite the previous offset
            pipeline.hset(appName + "_" + groupId, topic + "_" + partition, untilOffset.toString)
          }
          // commit the transaction
          pipeline.exec()
          pipeline.sync()
        } catch {
          case e: Exception =>
            // null checks guard against a failed getConnection()
            if (pipeline != null) pipeline.discard() // discard the pending writes
            e.printStackTrace()
            ssc.stop()
        } finally {
          if (pipeline != null) pipeline.close()
          if (jedis != null) jedis.close()
        }
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
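To sanity-check what the job wrote, read the two hashes back from the same Redis DB. A small sketch, assuming the example arguments true a1 g1 ta,tb shown above (so the offset hash key is a1_g1):

import cn._51doit.spark.utils.JedisConnectionPool

object RedisStateCheck {
  def main(args: Array[String]): Unit = {
    val jedis = JedisConnectionPool.getConnection()
    jedis.select(1)
    println(jedis.hgetAll("WORD_COUNT")) // accumulated word counts
    println(jedis.hgetAll("a1_g1"))      // one field per "topic_partition", value = untilOffset
    jedis.close()
  }
}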
Querying the historical offsets in Redis: OffsetUtils (queryHistoryOffsetFromRedis)

package cn._51doit.spark.utils

import java.sql.{Connection, DriverManager, ResultSet}
import java.util

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange

import scala.collection.mutable

object OffsetUtils {

  def queryHistoryOffsetFromMySQL(appName: String, groupId: String): Map[TopicPartition, Long] = {
    val offsets = new mutable.HashMap[TopicPartition, Long]()
    val connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata", "root", "123456")
    val ps = connection.prepareStatement("SELECT topic_partition, offset FROM t_kafka_offset WHERE" +
      " app_gid = ?")
    ps.setString(1, appName + "_" + groupId)
    val rs = ps.executeQuery()
    while (rs.next()) {
      val topicAndPartition = rs.getString(1)
      val offset = rs.getLong(2)
      val fields = topicAndPartition.split("_")
      val topic = fields(0)
      val partition = fields(1).toInt
      val topicPartition = new TopicPartition(topic, partition)
      // put the reconstructed TopicPartition into the map
      offsets(topicPartition) = offset
    }
    offsets.toMap
  }

  /**
   * Update the offsets in MySQL
   * @param offsetRanges
   * @param connection
   */
  def updateOffsetToMySQL(appNameAndGroupId: String, offsetRanges: Array[OffsetRange], connection: Connection) = {
    val ps = connection.prepareStatement("INSERT INTO t_kafka_offset (app_gid, topic_partition, offset) VALUES (?, ?, ?) ON DUPLICATE KEY UPDATE offset = ?")
    for (offsetRange <- offsetRanges) {
      // topic name
      val topic = offsetRange.topic
      // topic partition number
      val partition = offsetRange.partition
      // end offset of the range
      val untilOffset = offsetRange.untilOffset
      // write the result into MySQL
      ps.setString(1, appNameAndGroupId)
      ps.setString(2, topic + "_" + partition)
      ps.setLong(3, untilOffset)
      ps.setLong(4, untilOffset)
      ps.executeUpdate()
    }
    ps.close()
  }

  /**
   * Query the historical offsets from Redis
   * @param appName
   * @param groupId
   * @return
   */
  def queryHistoryOffsetFromRedis(appName: String, groupId: String): Map[TopicPartition, Long] = {
    val offsets = new mutable.HashMap[TopicPartition, Long]()
    val jedis = JedisConnectionPool.getConnection()
    jedis.select(1)
    val topicPartitionAndOffsets: util.Map[String, String] = jedis.hgetAll(appName + "_" + groupId)
    // import the implicit Java-to-Scala collection conversions
    import scala.collection.JavaConversions._
    for ((topicAndPartition, offset) <- topicPartitionAndOffsets) {
      val fields = topicAndPartition.split("_")
      val topic = fields(0)
      val partition = fields(1).toInt
      val topicPartition = new TopicPartition(topic, partition)
      offsets(topicPartition) = offset.toLong
    }
    offsets.toMap
  }

  // on every start of the program, query the historical offsets from HBase (via Phoenix)
  def queryHistoryOffsetFromHbase(view: String, groupid: String): Map[TopicPartition, Long] = {
    val offsets = new mutable.HashMap[TopicPartition, Long]()
    val connection = DriverManager.getConnection("jdbc:phoenix:node-1.51doit.cn,node-2.51doit.cn,node-3.51doit.cn:2181")
    val ps = connection.prepareStatement("select \"topic_partition\", max(\"offset\") from \"myorder\" where \"groupid\" = ? group by \"topic_partition\"")
    ps.setString(1, groupid)
    // execute the query
    val rs: ResultSet = ps.executeQuery()
    while (rs.next()) {
      val topicAndPartition = rs.getString(1)
      val fields = topicAndPartition.split("_")
      val topic = fields(0)
      val partition = fields(1).toInt
      val offset = rs.getLong(2)
      offsets.put(new TopicPartition(topic, partition), offset)
    }
    offsets.toMap
  }
}
The offset-querying and offset-saving code above can all be collected into one utility class as methods; OffsetUtils already does this for the MySQL case (updateOffsetToMySQL), and the Redis offset write can be encapsulated the same way, as sketched below.
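A minimal sketch of that Redis counterpart (the name updateOffsetToRedis and the wrapper object are assumptions, mirroring updateOffsetToMySQL; it takes the pipeline as a parameter so the offset write stays inside the same transaction as the data):

import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.Pipeline

object RedisOffsetSink { // hypothetical; in practice this method would live in OffsetUtils
  def updateOffsetToRedis(appNameAndGroupId: String, offsetRanges: Array[OffsetRange], pipeline: Pipeline): Unit = {
    for (offsetRange <- offsetRanges) {
      // same hash and field layout that queryHistoryOffsetFromRedis reads back
      pipeline.hset(appNameAndGroupId,
        offsetRange.topic + "_" + offsetRange.partition,
        offsetRange.untilOffset.toString)
    }
  }
}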
Note: the results above can be collected to the driver only because the computation is an aggregation, so the result set is necessarily small. For non-aggregating operations the results cannot be collected to the driver, and the data and the offsets cannot be written to the database together this way; section 3 covers the workaround.
3. Writing data to HBase in Spark Streaming to implement exactly-once
Source: https://www.cnblogs.com/jj1106/p/12383885.html
