Apache Flink: Count window with timeout

前端 未结 3 2167
日久生厌
日久生厌 2020-12-19 13:18

Here is a simple code example to illustrate my question:

case class Record( key: String, value: Int )

object Job extends App
{
  val env = StreamExecutionEn         


        
相关标签:
3条回答
  • 2020-12-19 13:42

    You could also do this with a custom window Trigger that fires either when the count has been reached or when the timeout expires -- effectively blending the built-in CountTrigger and EventTimeTrigger.

    0 讨论(0)
  • 2020-12-19 13:44

    I have followed both David's and NIrav's approaches and here are the results.

    1) Using a custom trigger:

    Here I have reversed my initial logic. Instead of using a 'count window', I use a 'time window' with a duration corresponding to the timeout and followed by a trigger that fires when all the elements have been processed.

    case class Record( key: String, value: Int )
    
    object Job extends App
    {
      val env = StreamExecutionEnvironment.getExecutionEnvironment
      val data = env.fromElements( Record("01",1), Record("02",2), Record("03",3), Record("04",4), Record("05",5) )
      val step1 = data.filter( record => record.value % 3 != 0  ) // introduces some data loss
      val step2 = data.map( r => Record( r.key, r.value * 2 ) )
      val step3 = data.map( r => Record( r.key, r.value * 3 ) )
      val merged = step1.union( step2, step3 )
      val keyed = merged.keyBy(0)
      val windowed = keyed.timeWindow( Time.milliseconds( 50 ) )
      val triggered = windowed.trigger( new CountTriggerWithTimeout( 3, env.getStreamTimeCharacteristic ) )
      val summed = triggered.sum( 1 )
      summed.print()
      env.execute("test")
    }
    

    And here is the trigger code:

    import org.apache.flink.annotation.PublicEvolving
    import org.apache.flink.api.common.functions.ReduceFunction
    import org.apache.flink.api.common.functions.RuntimeContext
    import org.apache.flink.api.common.state.ReducingState
    import org.apache.flink.api.common.state.ReducingStateDescriptor
    import org.apache.flink.api.common.typeutils.base.LongSerializer
    import org.apache.flink.streaming.api.TimeCharacteristic
    import org.apache.flink.streaming.api.windowing.triggers._
    import org.apache.flink.streaming.api.windowing.triggers.Trigger.TriggerContext
    import org.apache.flink.streaming.api.windowing.windows.TimeWindow
    
    /**
     * A trigger that fires when the count of elements in a pane reaches the given count or a 
     * timeout is reached whatever happens first.
     */
    class CountTriggerWithTimeout[W <: TimeWindow](maxCount: Long, timeCharacteristic: TimeCharacteristic) extends Trigger[Object,W] 
    {
      private val countState: ReducingStateDescriptor[java.lang.Long] = new ReducingStateDescriptor[java.lang.Long]( "count", new Sum(), LongSerializer.INSTANCE)
    
      override def onElement(element: Object, timestamp: Long, window: W, ctx: TriggerContext): TriggerResult = 
      {
          val count: ReducingState[java.lang.Long] = ctx.getPartitionedState(countState)
          count.add( 1L )
          if ( count.get >= maxCount || timestamp >= window.getEnd ) TriggerResult.FIRE_AND_PURGE else TriggerResult.CONTINUE
      }
    
      override def onProcessingTime(time: Long, window: W, ctx: TriggerContext): TriggerResult = 
      {
          if (timeCharacteristic == TimeCharacteristic.EventTime) TriggerResult.CONTINUE else
          {
              if ( time >= window.getEnd ) TriggerResult.CONTINUE else TriggerResult.FIRE_AND_PURGE
          }
      }
    
      override def onEventTime(time: Long, window: W, ctx: TriggerContext): TriggerResult = 
      {
          if (timeCharacteristic == TimeCharacteristic.ProcessingTime) TriggerResult.CONTINUE else
          {
              if ( time >= window.getEnd ) TriggerResult.CONTINUE else TriggerResult.FIRE_AND_PURGE
          }
      }
    
      override def clear(window: W, ctx: TriggerContext): Unit = 
      {
              ctx.getPartitionedState( countState ).clear
        }
    
        class Sum extends ReduceFunction[java.lang.Long] 
      {
            def reduce(value1: java.lang.Long, value2: java.lang.Long): java.lang.Long = value1 + value2
      }
    }
    

    2) Using a process function:

    case class Record( key: String, value: Int )
    
    object Job extends App
    {
      val env = StreamExecutionEnvironment.getExecutionEnvironment
      env.setStreamTimeCharacteristic( TimeCharacteristic.IngestionTime )
      val data = env.fromElements( Record("01",1), Record("02",2), Record("03",3), Record("04",4), Record("05",5) )
      val step1 = data.filter( record => record.value % 3 != 0  ) // introduces some data loss
      val step2 = data.map( r => Record( r.key, r.value * 2 ) )
      val step3 = data.map( r => Record( r.key, r.value * 3 ) )
      val merged = step1.union( step2, step3 )
      val keyed = merged.keyBy(0)
      val processed = keyed.process( new TimeCountWindowProcessFunction( 3, 100 ) )
      processed.print()
      env.execute("test")
    }
    

    With all the logic (i.e., windowing, triggering, and summing) going into the function:

    import org.apache.flink.streaming.api.functions._
    import org.apache.flink.util._
    import org.apache.flink.api.common.state._
    
    case class Status( count: Int, key: String, value: Long )
    
    class TimeCountWindowProcessFunction( count: Long, windowSize: Long ) extends ProcessFunction[Record,Record] 
    {
        lazy val state: ValueState[Status] = getRuntimeContext
          .getState(new ValueStateDescriptor[Status]("state", classOf[Status]))
    
        override def processElement( input: Record, ctx: ProcessFunction[Record,Record]#Context, out: Collector[Record] ): Unit =
        {
            val updated: Status = Option( state.value ) match {
                case None => {
                    ctx.timerService().registerEventTimeTimer( ctx.timestamp + windowSize )
                    Status( 1, input.key, input.value )
                }
                case Some( current ) => Status( current.count + 1, input.key, input.value + current.value )    
            }
            if ( updated.count == count ) 
            {
                out.collect( Record( input.key, updated.value ) )
                state.clear
            }
            else
            {
                state.update( updated )  
            }        
        }
    
        override def onTimer( timestamp: Long, ctx: ProcessFunction[Record,Record]#OnTimerContext, out: Collector[Record] ): Unit =
        {
            Option( state.value ) match {
                case None => // ignore
                case Some( status ) => {
                    out.collect( Record( status.key, status.value ) )
                    state.clear
                }
            }
        }
    }
    
    0 讨论(0)
  • 2020-12-19 13:46

    I think you can implement this use case using ProcessFunction

    In which you have count property and windowEnd property. Using that you can decide when to collect the data.

    public class TimeCountWindowProcessFunction extends ProcessFunction {
    
        protected long windowStart;
        protected long windowEnd;
        protected long count;
        private ValueState<CountPojo> state;
    
        public TimeCountWindowProcessFunction(long windowSize, long count) {
    
        this.windowSize = windowSize;
        this.count = count;
    
        }
    
    @Override
        public void open(Configuration parameters) {
    
        TypeInformation<CountPojo> typeInformation = TypeInformation.of(new TypeHint<CountPojo>() {
        });
        ValueStateDescriptor<CountPojo> descriptor = new ValueStateDescriptor("test", typeInformation);
    
        state = getRuntimeContext().getState(descriptor);
    }
    
    
        @Override
        public void processElement(CountPojo input, Context ctx, Collector<CountPojo> out)
                throws Exception {
    
        long timestamp = ctx.timestamp();
            windowStart = timestamp - (timestamp % windowSize);
            windowEnd = windowStart + windowSize;
    
            // retrieve the current count
            CountPojo current = (CountPojo) state.value();
    
            if (current == null) {
    
                current = new CountPojo();
            current.count = 1;
    
                ctx.timerService().registerEventTimeTimer(windowEnd);
            } else {
    
                current.count += 1;
            }
    
            if(current.count >= count) {
            out.collect(current);
        }
    
            // set the state's timestamp to the record's assigned event time timestamp
            current.setLastModified(ctx.timestamp());
    
            // write the state back
            state.update(current);
        }
    
    
        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<CountPojo> out)
                throws Exception {
    
    
            if (windowEnd == timestamp) {
    
                out.collect(state.value());
            }
    
            state.clear();
        }
    }
    

    I hope this will helpful to you.

    0 讨论(0)
提交回复
热议问题