I am a newbie with Apache Flink. I have an unbounded data stream as my input (fed into Flink 0.10 via Kafka).
I want to get the first occurrence of each primary key (the …
Here's another way to do this that I happen to have just written. It has the disadvantage that it's a bit more custom code, since it doesn't use the built-in Flink windowing functions, but it doesn't have the latency penalty that Till mentioned. The full example is on GitHub.
package com.dataartisans.filters;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.checkpoint.CheckpointedAsynchronously;
import java.io.Serializable;
import java.util.HashSet;
import java.util.concurrent.TimeUnit;
/**
* This class filters duplicates that occur within a configurable time of each other in a data stream.
*/
public class DedupeFilterFunction extends RichFilterFunction implements CheckpointedAsynchronously> {
private LoadingCache dedupeCache;
private final KeySelector keySelector;
private final long cacheExpirationTimeMs;
/**
* @param cacheExpirationTimeMs The expiration time for elements in the cache
*/
public DedupeFilterFunction(KeySelector keySelector, long cacheExpirationTimeMs){
this.keySelector = keySelector;
this.cacheExpirationTimeMs = cacheExpirationTimeMs;
}
@Override
public void open(Configuration parameters) throws Exception {
createDedupeCache();
}
@Override
public boolean filter(T value) throws Exception {
K key = keySelector.getKey(value);
boolean seen = dedupeCache.get(key);
if (!seen) {
dedupeCache.put(key, true);
return true;
} else {
return false;
}
}
@Override
public HashSet snapshotState(long checkpointId, long checkpointTimestamp) throws Exception {
return new HashSet<>(dedupeCache.asMap().keySet());
}
@Override
public void restoreState(HashSet state) throws Exception {
createDedupeCache();
for (K key : state) {
dedupeCache.put(key, true);
}
}
private void createDedupeCache() {
dedupeCache = CacheBuilder.newBuilder()
.expireAfterWrite(cacheExpirationTimeMs, TimeUnit.MILLISECONDS)
.build(new CacheLoader() {
@Override
public Boolean load(K k) throws Exception {
return false;
}
});
}
}