1. Requirement
Consume Kafka data starting from a specified timestamp (for example, two hours ago).
2. Approach
We know that Kafka's consumer API can look up, for a given timestamp, the offset at which data for that timestamp begins: offsetsForTimes returns, per partition, the earliest offset whose message timestamp is at or after the requested timestamp. We can use this capability to implement the requirement, at least approximately.
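A minimal sketch of that lookup for a single partition follows (the broker address, topic name, and group id are the ones used in the full program below; partition 0 and the two-hour window are assumptions for illustration):

import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.TopicPartition

object OffsetsForTimesSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "192.168.174.120:9092")
    props.put("group.id", "test")
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    val consumer = new KafkaConsumer[String, String](props)

    // Partition 0 of topic "test" is assumed; the full program iterates over partitionsFor(topic) instead.
    val tp = new TopicPartition("test", 0)
    val twoHoursAgo: java.lang.Long = System.currentTimeMillis() - 2 * 60 * 60 * 1000L

    // offsetsForTimes returns, per partition, the earliest offset whose message timestamp
    // is >= the requested timestamp, or null if no such message exists.
    val result = consumer.offsetsForTimes(Collections.singletonMap(tp, twoHoursAgo))
    val oat = result.get(tp)
    if (oat != null) println(s"partition 0 starts at offset ${oat.offset()}")
    consumer.close()
  }
}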
3. Implementation
We know that KafkaUtils.createDirectStream can take starting offsets, so the work boils down to the following three steps:
- Get the topic's TopicPartitions, i.e. find out how many partitions the topic has
- For each partition, ask Kafka for the starting offset corresponding to the specified timestamp
- Pass the offsets from step 2 into createDirectStream
package com.ruozedata.bigdata.spark.streaming01
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, KafkaConsumer}
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.joda.time.format.DateTimeFormat
import scala.collection.JavaConverters._
import scala.collection.mutable
object SparkStreamingWithTimestamp {
def main(args: Array[String]): Unit = {
if (args.length > 1) {
System.err.println(
s"""
|Usage: SparkStreamingWithTimestamp [datetime]
         | [datetime] is the Kafka offset datetime. The format is yyyy-MM-dd HH:mm:ss
|
""".stripMargin)
System.exit(1)
}
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    // Register the Kryo classes before the StreamingContext (and its SparkContext) is created;
    // registering them afterwards has no effect.
    conf.registerKryoClasses(Array(classOf[ConsumerRecord[String, String]]))
    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.sparkContext.setLogLevel("WARN")
val topicsSet = "test".split(",").toSet
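    // Consumer configuration; auto-commit is disabled so Kafka does not commit offsets on its own.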
val kafkaParams = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.174.120:9092",
ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
ConsumerConfig.GROUP_ID_CONFIG -> "test",
ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest",
ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
)
    // If a datetime argument was given, start from the offsets resolved for that timestamp;
    // otherwise subscribe normally and let committed offsets / auto.offset.reset decide.
    val messages: InputDStream[ConsumerRecord[String, String]] = if (args.length == 1) {
      KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topicsSet, kafkaParams, getOffsetByTimestamp(kafkaParams, args(0))))
    } else {
      KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topicsSet, kafkaParams))
    }
messages.print()
ssc.start()
ssc.awaitTermination()
}
  /**
   * Look up, for every partition of the topic, the Kafka offset corresponding to the given time.
   *
   * @param kafkaParams Kafka consumer configuration
   * @param time        a datetime string in the format yyyy-MM-dd HH:mm:ss
   * @return a map from TopicPartition to its starting offset
   */
def getOffsetByTimestamp(kafkaParams: collection.Map[String, Object], time: String): mutable.HashMap[TopicPartition, Long] = {
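    // A plain KafkaConsumer is created only to query partition metadata and offsets; it never polls records.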
val consumer = new KafkaConsumer[String, String](new java.util.HashMap[String, Object](kafkaParams.asJava))
    val fetchTime = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").parseMillis(time)
    // (topic, partition) -> timestamp to search for
    val timestampToSearch: java.util.Map[TopicPartition, java.lang.Long] = new java.util.HashMap[TopicPartition, java.lang.Long]()
    // partition -> resolved starting offset
    val partitionOffset = new mutable.HashMap[TopicPartition, Long]
    // partitionsFor returns a List[PartitionInfo] describing every partition of the topic
    // (the topic name is hard-coded to "test", matching the topic consumed above)
    val partitionInfos = consumer.partitionsFor("test")
    for (partitionInfo <- partitionInfos.asScala) {
      val tp = new TopicPartition(partitionInfo.topic(), partitionInfo.partition())
      timestampToSearch.put(tp, fetchTime)
    }
    // offsetsForTimes returns, per partition, the earliest offset whose message timestamp is >= fetchTime,
    // or null when no such message exists (e.g. the timestamp is newer than the latest record).
    val topicPartitionToOffsetAndTimestamp = consumer.offsetsForTimes(timestampToSearch)
    for ((tp, offsetAndTimestamp) <- topicPartitionToOffsetAndTimestamp.asScala if offsetAndTimestamp != null) {
      partitionOffset += tp -> offsetAndTimestamp.offset()
    }
consumer.close()
partitionOffset
}
}
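With that in place, satisfying the original "start two hours ago" requirement only means formatting that instant as yyyy-MM-dd HH:mm:ss and passing it as the single program argument. One possible way to build the string, reusing the joda-time library the program already depends on (this helper is illustrative and not part of the program above):

import org.joda.time.DateTime

// Format "two hours ago" in the pattern expected by getOffsetByTimestamp.
val twoHoursAgo = DateTime.now().minusHours(2).toString("yyyy-MM-dd HH:mm:ss")
// Pass twoHoursAgo as args(0) when launching SparkStreamingWithTimestamp.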