Apache Flink 0.10: how to get the first occurrence of a composite key from an unbounded input DataStream?

生来不讨喜 2021-01-01 01:20

I am a newbie with Apache Flink. I have an unbounded data stream in my input (fed into Flink 0.10 via Kafka).

I want to get the first occurrence of each primary key (the

2 Answers
  •  灰色年华
    2021-01-01 01:44

    Here's another way to do this that I happen to have just written. It has the disadvantage of being a bit more custom code, since it doesn't use Flink's built-in windowing functions, but it avoids the latency penalty that Till mentioned. Full example on GitHub.

    package com.dataartisans.filters;
    
    import com.google.common.cache.CacheBuilder;
    import com.google.common.cache.CacheLoader;
    import com.google.common.cache.LoadingCache;
    import org.apache.flink.api.common.functions.RichFilterFunction;
    import org.apache.flink.api.java.functions.KeySelector;
    import org.apache.flink.configuration.Configuration;
    import org.apache.flink.streaming.api.checkpoint.CheckpointedAsynchronously;
    
    import java.io.Serializable;
    import java.util.HashSet;
    import java.util.concurrent.TimeUnit;
    
    
    /**
     * This class filters duplicates that occur within a configurable time of each other in a data stream.
     */
    public class DedupeFilterFunction<T, K extends Serializable> extends RichFilterFunction<T> implements CheckpointedAsynchronously<HashSet<K>> {
    
      private LoadingCache<K, Boolean> dedupeCache;
      private final KeySelector<T, K> keySelector;
      private final long cacheExpirationTimeMs;
    
      /**
       * @param keySelector Extracts the deduplication key from each element
       * @param cacheExpirationTimeMs The expiration time for elements in the cache
       */
      public DedupeFilterFunction(KeySelector<T, K> keySelector, long cacheExpirationTimeMs) {
        this.keySelector = keySelector;
        this.cacheExpirationTimeMs = cacheExpirationTimeMs;
      }
    
      @Override
      public void open(Configuration parameters) throws Exception {
        createDedupeCache();
      }
    
    
      @Override
      public boolean filter(T value) throws Exception {
        K key = keySelector.getKey(value);
        // The cache loader returns false for keys that have not been seen yet.
        boolean seen = dedupeCache.get(key);
        if (!seen) {
          dedupeCache.put(key, true);
          return true;
        } else {
          return false;
        }
      }
    
      // Called when Flink takes a checkpoint: snapshot the set of keys currently in the cache.
      @Override
      public HashSet<K> snapshotState(long checkpointId, long checkpointTimestamp) throws Exception {
        return new HashSet<>(dedupeCache.asMap().keySet());
      }
    
      // Called on recovery: rebuild the cache from the checkpointed key set.
      @Override
      public void restoreState(HashSet<K> state) throws Exception {
        createDedupeCache();
        for (K key : state) {
          dedupeCache.put(key, true);
        }
      }
    
      private void createDedupeCache() {
        dedupeCache = CacheBuilder.newBuilder()
          .expireAfterWrite(cacheExpirationTimeMs, TimeUnit.MILLISECONDS)
          .build(new CacheLoader<K, Boolean>() {
            // Keys that are not in the cache yet load as "not seen".
            @Override
            public Boolean load(K k) throws Exception {
              return false;
            }
          });
      }
    }
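
    For reference, here is a minimal usage sketch (not part of the original answer; the Tuple2 event type, the in-memory source, and the one-hour expiration are assumptions standing in for the Kafka stream from the question). It keeps only the first element seen per key and drops later duplicates until the corresponding cache entry expires:

    package com.dataartisans.filters;

    import org.apache.flink.api.java.functions.KeySelector;
    import org.apache.flink.api.java.tuple.Tuple2;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

    import java.util.concurrent.TimeUnit;

    public class DedupeExampleJob {

      public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Stand-in for the Kafka source from the question.
        DataStream<Tuple2<String, Long>> events = env.fromElements(
            Tuple2.of("key-1", 1L),
            Tuple2.of("key-1", 2L),   // duplicate key, filtered out below
            Tuple2.of("key-2", 3L));

        // Keep only the first occurrence of each key; cache entries expire after one hour.
        DataStream<Tuple2<String, Long>> firstOccurrences = events.filter(
            new DedupeFilterFunction<Tuple2<String, Long>, String>(
                new KeySelector<Tuple2<String, Long>, String>() {
                  @Override
                  public String getKey(Tuple2<String, Long> value) {
                    return value.f0;
                  }
                },
                TimeUnit.HOURS.toMillis(1)));

        firstOccurrences.print();
        env.execute("Dedupe example");
      }
    }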
    
