/**
 * FP-tree (frequent-pattern tree) with an FP-growth style miner.
 *
 * <p>Each input line is a list of words with an occurrence weight. Lines are inserted into a
 * prefix tree rooted at {@link #root}; nodes holding the same word are chained through
 * node-links whose heads and tails are kept in {@link #firstNodeTable} and
 * {@link #lastNodeTable}. {@link #growth} walks those links to build conditional trees and
 * collects the patterns it finds as {@code LogTemplate}s.
 *
 * <p>Mined templates are stored in a {@code static} list, so they are shared by every
 * {@code FPTree} instance and are never cleared by this class.
 */
public class FPTree {
    // Root of the FP-tree; a sentinel node that does not represent a real word.
    FPNode root = new FPNode("Root", -1);
    // Head of the node-link chain for each word (first tree node created for that word).
    Map<String, FPNode> firstNodeTable = new HashMap<>();
    // Tail of the node-link chain for each word, so a new node can be appended without walking the chain.
    Map<String, FPNode> lastNodeTable = new HashMap<>();
    // Minimum support: words/patterns with a smaller total count are discarded.
    private int support = 1;
    // Header table: one entry per frequent word with its total count, most frequent first.
    public List<FPNode> table = new ArrayList<>();
    // Templates found by growth(). Static, hence shared by all trees (including the conditional
    // trees created while mining) and accumulated across runs.
    private static List<LogTemplate> templates = new ArrayList<>();

    /**
     * Builds a tree from unweighted lines; every line counts once.
     *
     * @param data    lines of words
     * @param support minimum support
     */
    public FPTree(List<List<String>> data, int support) {
        List<Integer> count = new ArrayList<>(Collections.nCopies(data.size(), 1));
        buildTree(data, count, support, false);
    }

    /**
     * Builds a tree from weighted templates.
     *
     * @param templates source templates; their words become lines and their counts the line weights
     * @param support   minimum support
     * @param sorted    {@code true} if the words of every template are already filtered and ordered
     *                  the way the tree expects, so no preprocessing is done
     */
    public FPTree(List<LogTemplate> templates, int support, boolean sorted) {
        List<List<String>> data = new ArrayList<>();
        List<Integer> count = new ArrayList<>();
        for (LogTemplate template : templates) {
            data.add(template.getWords());
            count.add(template.getCount());
        }
        buildTree(data, count, support, sorted);
    }

    /**
     * Inserts all lines into the tree and threads the node-links.
     *
     * @param data    lines of words
     * @param count   weight of each line; {@code count.get(i)} belongs to {@code data.get(i)}
     * @param support minimum support, stored on this tree
     * @param sorted  when {@code false} the lines are first filtered by support and reordered by
     *                descending word frequency (which also fills {@link #table})
     */
    public void buildTree(List<List<String>> data, List<Integer> count, int support, boolean sorted) {
        this.support = support;
        List<List<String>> lines = data;
        List<Integer> weights = count;
        if (!sorted) {
            // sort() may drop lines, so it also returns the weights of the lines it kept;
            // using the original count list here would pair lines with the wrong weights.
            weights = new ArrayList<>();
            lines = sort(data, count, weights);
        }
        int i = 0;
        for (List<String> line : lines) {
            FPNode curNode = root;
            for (String word : line) {
                FPNode child = curNode.getChildren().get(word);
                if (null != child) {
                    // Existing child: add this line's weight to it.
                    child.increase(weights.get(i));
                } else {
                    // No such child yet: create it below the current node.
                    child = new FPNode(word, weights.get(i));
                    curNode.getChildren().put(word, child);
                    child.setFather(curNode);
                }
                curNode = child;
                // A node that is already part of a node-link chain must not be linked again.
                if (curNode.isVisited()) {
                    continue;
                }
                // Append the node to the chain of its word (or start the chain).
                if (firstNodeTable.containsKey(word)) {
                    lastNodeTable.get(word).setNext(curNode);
                } else {
                    firstNodeTable.put(word, curNode);
                }
                lastNodeTable.put(word, curNode);
                curNode.setVisited(true);
            }
            i++;
        }
    }

    /**
     * Counts the words, fills {@link #table} with the frequent ones and returns the lines with
     * infrequent words removed and the remaining words ordered by descending total count.
     * Words with equal counts are ordered by their natural order so that every line uses the
     * same global order; otherwise identical word sets could end up on different tree paths.
     *
     * @param data      lines of words
     * @param count     weight of each line, aligned with {@code data}
     * @param keptCount output list; receives the weight of every returned line, in order
     * @return the non-empty preprocessed lines
     */
    private List<List<String>> sort(List<List<String>> data, List<Integer> count, List<Integer> keptCount) {
        Map<String, Integer> wordCount = new HashMap<>();
        // Total weighted occurrences of each word.
        int i = 0;
        for (List<String> line : data) {
            for (String word : line) {
                wordCount.merge(word, count.get(i), Integer::sum);
            }
            i++;
        }
        for (Map.Entry<String, Integer> entry : wordCount.entrySet()) {
            if (entry.getValue() >= this.support) {
                table.add(new FPNode(entry.getKey(), entry.getValue()));
            }
        }
        if (!table.isEmpty()) {
            table = table.stream().sorted(Comparator.comparing(FPNode::getCount).reversed())
                    .collect(Collectors.toList());
        }
        Comparator<String> tieBreak = Comparator.nullsFirst(Comparator.naturalOrder());
        Comparator<String> byFrequency = (left, right) -> {
            int byCount = wordCount.get(right).compareTo(wordCount.get(left));
            return byCount != 0 ? byCount : tieBreak.compare(left, right);
        };
        List<List<String>> result = new ArrayList<>();
        int index = 0;
        for (List<String> line : data) {
            List<String> newLine = line.stream()
                    .filter(word -> wordCount.get(word) >= support)
                    .sorted(byFrequency)
                    .collect(Collectors.toList());
            if (!newLine.isEmpty()) {
                result.add(newLine);
                // Keep the weight next to the line it belongs to.
                keptCount.add(count.get(index));
            }
            index++;
        }
        return result;
    }

    /** Prints the tree starting at the root with indentation level 0. */
    public void print() {
        root.print(0);
    }

    /**
     * Mines patterns from {@code fpTree} and appends them to the shared template list.
     *
     * <p>If the tree is a single path, every non-empty subset of the path whose count reaches the
     * minimum support is emitted with {@code last} appended. Otherwise each header-table entry is
     * emitted as a pattern (its word followed by {@code last}), its prefix paths are collected
     * through the node-links, and a conditional tree built from those paths is mined recursively.
     *
     * <p>Root, node-links and minimum support are all taken from {@code fpTree}. Neither
     * {@code last} nor {@code table} is modified.
     *
     * @param fpTree tree to mine
     * @param last   words already fixed by the enclosing recursion levels (pattern suffix)
     * @param table  header table of {@code fpTree}, most frequent word first
     */
    public void growth(FPTree fpTree, List<String> last, List<FPNode> table) {
        FPNode tree = fpTree.getRoot();
        int minSupport = fpTree.support;
        if (isSingleTree(tree)) {
            // Collect the nodes of the single path, top to bottom.
            List<FPNode> path = new ArrayList<>();
            for (FPNode node = getFirstChild(tree); null != node; node = getFirstChild(node)) {
                path.add(node);
            }
            for (LogTemplate template : getSonSet(path)) {
                if (template.getCount() >= minSupport) {
                    template.getWords().addAll(last);
                    FPTree.templates.add(template);
                }
            }
            return;
        }
        // Process the least frequent word first, on a copy so the caller's table keeps its order.
        List<FPNode> items = new ArrayList<>(table);
        Collections.reverse(items);
        for (FPNode node : items) {
            List<String> pre = new ArrayList<>();
            pre.add(node.getWord());
            pre.addAll(last);
            // The word together with the current suffix is itself a pattern.
            LogTemplate template = new LogTemplate();
            template.setCount(node.getCount());
            List<String> words = new ArrayList<>(pre);
            template.setWords(words);
            FPTree.templates.add(template);
            // Every node on the word's node-link chain contributes one prefix path
            // (its ancestors, root excluded), weighted with that node's count.
            List<LogTemplate> linkTemplates = new ArrayList<>();
            for (FPNode link = fpTree.firstNodeTable.get(node.getWord()); null != link; link = link.getNext()) {
                List<String> prefix = new ArrayList<>();
                for (FPNode me = link.getFather(); null != me.getFather(); me = me.getFather()) {
                    prefix.add(me.getWord());
                }
                if (prefix.isEmpty()) {
                    // Node hangs directly below the root: nothing to add to the conditional tree.
                    continue;
                }
                Collections.reverse(prefix);
                LogTemplate prefixTemplate = new LogTemplate();
                prefixTemplate.setWords(prefix);
                prefixTemplate.setCount(link.getCount());
                linkTemplates.add(prefixTemplate);
            }
            // Conditional tree: built with the same minimum support so infrequent words are
            // pruned; its header table is computed by the constructor.
            FPTree newTree = new FPTree(linkTemplates, minSupport, false);
            if (!newTree.table.isEmpty()) {
                newTree.growth(newTree, pre, newTree.table);
            }
        }
    }

    /**
     * Enumerates all non-empty subsets of a single tree path as templates.
     *
     * <p>The count of a subset is the count of its deepest selected node: {@code buildTree}
     * adds a line's weight to every node along its path, so counts never grow with depth.
     * Assumes a new {@code LogTemplate} exposes a mutable word list via {@code getWords()}.
     *
     * @param wordCount nodes of the path, top to bottom
     * @return one template per non-empty subset
     * @throws IllegalArgumentException if the path has more than 30 nodes, because the subset
     *                                  bit mask would overflow an {@code int}
     */
    private List<LogTemplate> getSonSet(List<FPNode> wordCount) {
        int length = wordCount.size();
        if (length > 30) {
            throw new IllegalArgumentException(
                    "Single path too long to enumerate subsets: " + length + " nodes");
        }
        List<LogTemplate> result = new ArrayList<>();
        int nEnd = 1 << length;
        // Each bit pattern selects one subset; start at 1 to skip the empty set.
        for (int mark = 1; mark < nEnd; mark++) {
            LogTemplate template = new LogTemplate();
            for (int i = 0; i < length; i++) {
                if (((1 << i) & mark) != 0) {
                    template.getWords().add(wordCount.get(i).getWord());
                    // Overwritten by deeper nodes, so the deepest selected node wins.
                    template.setCount(wordCount.get(i).getCount());
                }
            }
            result.add(template);
        }
        return result;
    }

    /**
     * Tells whether the subtree below {@code tree} is a single path (no node has more than one
     * child). A {@code null} node or a node without children counts as a single path.
     */
    private boolean isSingleTree(FPNode tree) {
        FPNode node = tree;
        while (null != node && null != node.getChildren() && !node.getChildren().isEmpty()) {
            if (node.getChildren().size() > 1) {
                return false;
            }
            node = getFirstChild(node);
        }
        return true;
    }

    /** Returns the first child in the iteration order of the child map, or {@code null} if there is none. */
    private FPNode getFirstChild(FPNode tree) {
        if (null == tree || null == tree.getChildren() || tree.getChildren().isEmpty()) {
            return null;
        }
        return tree.getChildren().values().iterator().next();
    }

    /** Small demo: builds a tree from four lines, prints it, mines it and prints the templates. */
    public static void main(String[] args) {
        List<String> line1 = new ArrayList<>();
        line1.add("C");
        line1.add("A");
        line1.add("B");
        List<String> line2 = new ArrayList<>();
        line2.add("A");
        line2.add("B");
        line2.add("D");
        List<String> line3 = new ArrayList<>();
        line3.add("A");
        line3.add("B");
        List<String> line4 = new ArrayList<>();
        line4.add("C");
        line4.add("E");
        List<List<String>> data = new ArrayList<>();
        data.add(line1);
        data.add(line2);
        data.add(line3);
        data.add(line4);
        FPTree tree = new FPTree(data, 1);
        tree.print();
        tree.growth(tree, new ArrayList<>(), tree.table);
        for (LogTemplate template : templates) {
            template.print();
        }
    }

    public FPNode getRoot() {
        return root;
    }

    public void setRoot(FPNode root) {
        this.root = root;
    }
}
// Source: https://www.cnblogs.com/coshaho/p/12163496.html