1. Versions
JDK 1.8
Spark 2.3
Elasticsearch 7.4
Scala 2.11
Kafka 0.10
2. POM Dependencies
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka_2.11</artifactId>
    <version>1.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_${scala.version}</artifactId>
    <version>${spark.version}</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_${scala.version}</artifactId>
    <version>${spark.version}</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_${scala.version}</artifactId>
    <version>${spark.version}</version>
</dependency>
<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-hadoop</artifactId>
    <version>7.4.0</version>
</dependency>
<dependency>
    <groupId>org.scalikejdbc</groupId>
    <artifactId>scalikejdbc_2.11</artifactId>
    <version>2.2.1</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.38</version>
</dependency>
<dependency>
    <groupId>org.elasticsearch.client</groupId>
    <artifactId>elasticsearch-rest-high-level-client</artifactId>
    <version>7.4.0</version>
</dependency>
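The dependencies above reference ${scala.version} and ${spark.version}, but the post does not show the <properties> block that defines them. A minimal sketch consistent with the versions in section 1 (the exact Spark patch version is an assumption):

<properties>
    <!-- Assumed values; adjust to the actual build -->
    <scala.version>2.11</scala.version>
    <spark.version>2.3.0</spark.version>
</properties>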
3. Code
(This code reads data from Kafka, writes it to Elasticsearch, and saves the Kafka offsets to MySQL.)
import java.time.LocalDate

import com.typesafe.config.ConfigFactory
import org.apache.kafka.common.TopicPartition
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}
import org.elasticsearch.spark.rdd.EsSpark
import org.slf4j.LoggerFactory

object DataToElasticsearch {

  private val log = LoggerFactory.getLogger(DataToElasticsearch.getClass)

  // Helper methods (OffsetDetails, get_offset, get_kafkaParams, create_kafkaStream,
  // save_offset, insert_offset) are not shown in the original post; a possible
  // implementation is sketched after this listing.

  def main(args: Array[String]): Unit = {
    val ssc = run_task()
    ssc.start()
    ssc.awaitTermination()
  }

  def run_task(): StreamingContext = {
    val conf = ConfigFactory.load("application.conf")
    val dt: String = LocalDate.now.toString

    val spark = SparkSession.builder().appName("epson-data-es")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.streaming.stopGracefullyOnShutdown", "true")
      .config("spark.streaming.kafka.maxRatePerPartition", conf.getString("maxRatePerPartition"))
      .config("es.index.auto.create", "true")
      .config("es.nodes", "xx.xx.xx.xx")
      .config("es.port", "9200")
      .getOrCreate()
    val sc = spark.sparkContext

    val batchDuration = conf.getInt("batchDuration")
    val ssc = new StreamingContext(sc, Seconds(batchDuration))

    // Kafka configuration
    val topic = conf.getString("kafka.topic")
    val brokers = conf.getString("kafka.brokers")
    val group = conf.getString("kafka.group")
    val topics = Array(topic)

    // JDBC settings for the MySQL offset store
    val jdbcDriver = conf.getString("jdbc.driver")
    val jdbcUrl = conf.getString("jdbc.url")
    val jdbcUser = conf.getString("jdbc.user")
    val jdbcPassword = conf.getString("jdbc.password")
    val jdbcTable = conf.getString("jdbc.table")
    OffsetDetails(jdbcDriver, jdbcUrl, jdbcUser, jdbcPassword)

    // Load previously saved offsets from MySQL
    val fromOffsets: java.util.Map[TopicPartition, java.lang.Long] = get_offset(topic, jdbcTable)
    var flag = false

    // Kafka consumer parameters
    val kafkaParams = get_kafkaParams(brokers, group)

    // Create the DStream
    val kafkaStream = create_kafkaStream(ssc, kafkaParams, fromOffsets, topics, fromOffsets.keySet())

    // Process each micro-batch
    kafkaStream.foreachRDD(rdd => {
      try {
        if (!rdd.isEmpty()) {
          val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
          val lines = rdd.map(_.value)
          // Debug output: print record keys (runs on the executors)
          rdd.map(_.key()).foreach(key => println("+key++++++++++++++" + key))
          EsSpark.saveJsonToEs(lines, "spark/docs")
          if (!fromOffsets.isEmpty || flag) {
            // Update the stored offsets
            save_offset(offsetRanges, jdbcTable)
          } else {
            // Initialize the offsets on the first batch
            insert_offset(offsetRanges, jdbcTable)
            flag = true
          }
        }
      } catch {
        case e: Throwable => log.error("Error writing batch to Elasticsearch", e)
      }
    })
    ssc
  }
}
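The post does not include the helper methods the driver calls. Below is a minimal sketch of what they might look like, assuming String keys and values in Kafka, scalikejdbc (already in the POM) for the MySQL access, and an offset table with columns (topic, part, `offset`); every name, column, and setting here is an assumption, not the original author's code. The methods are wrapped in an OffsetHelpers object, so DataToElasticsearch would need `import OffsetHelpers._` (or the methods can be pasted into the object directly).

import scala.collection.JavaConverters._
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies, OffsetRange}
import scalikejdbc._

object OffsetHelpers {

  // Register the JDBC driver and initialize the scalikejdbc connection pool once per JVM.
  def OffsetDetails(driver: String, url: String, user: String, password: String): Unit = {
    Class.forName(driver)
    ConnectionPool.singleton(url, user, password)
  }

  // Standard consumer settings; offsets are committed to MySQL, not to Kafka.
  def get_kafkaParams(brokers: String, group: String): Map[String, Object] = Map(
    "bootstrap.servers" -> brokers,
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> group,
    "auto.offset.reset" -> "earliest",
    "enable.auto.commit" -> (false: java.lang.Boolean)
  )

  // Resume from the saved offsets when any exist, otherwise subscribe from scratch.
  def create_kafkaStream(ssc: StreamingContext,
                         kafkaParams: Map[String, Object],
                         fromOffsets: java.util.Map[TopicPartition, java.lang.Long],
                         topics: Array[String],
                         partitions: java.util.Set[TopicPartition]): InputDStream[ConsumerRecord[String, String]] = {
    val strategy =
      if (fromOffsets.isEmpty)
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      else
        ConsumerStrategies.Assign[String, String](
          partitions.asScala,
          kafkaParams,
          fromOffsets.asScala.map { case (tp, off) => tp -> off.longValue }.toMap)
    KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent, strategy)
  }

  // Read the last saved offset for each partition of the topic.
  def get_offset(topic: String, table: String): java.util.Map[TopicPartition, java.lang.Long] =
    DB.readOnly { implicit session =>
      SQL(s"SELECT part, `offset` FROM $table WHERE topic = ?")
        .bind(topic)
        .map(rs => new TopicPartition(topic, rs.int("part")) -> java.lang.Long.valueOf(rs.long("offset")))
        .list.apply()
    }.toMap.asJava

  // First run: insert one row per partition.
  def insert_offset(offsetRanges: Array[OffsetRange], table: String): Unit =
    DB.localTx { implicit session =>
      offsetRanges.foreach { o =>
        SQL(s"INSERT INTO $table (topic, part, `offset`) VALUES (?, ?, ?)")
          .bind(o.topic, o.partition, o.untilOffset).update.apply()
      }
    }

  // Subsequent batches: overwrite the stored offset for each partition.
  def save_offset(offsetRanges: Array[OffsetRange], table: String): Unit =
    DB.localTx { implicit session =>
      offsetRanges.foreach { o =>
        SQL(s"UPDATE $table SET `offset` = ? WHERE topic = ? AND part = ?")
          .bind(o.untilOffset, o.topic, o.partition).update.apply()
      }
    }
}

Note that the offsets are persisted with each range's untilOffset only after the Elasticsearch write, which gives at-least-once delivery: if the job fails between the write and the offset update, the batch is re-read and re-indexed on restart.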
Source: CSDN
Author: 米兰昆德拉的幽默
Link: https://blog.csdn.net/mi_lan_kun_de_la/article/details/103926976