#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author:ShidongDu time:2020/6/3
import time
import pandas as pd
import re
# Trie node used by the Aho-Corasick automaton.
class node:
    """Single node of the pattern trie.

    Children are kept in two parallel lists: ``child[k]`` is the node reached
    through the character ``childvalue[k]``.
    """

    def __init__(self, ch):
        """Create a childless node holding the character *ch*."""
        self.ch = ch  # character stored on this node
        self.fail = None  # failure link; None until it is assigned
        self.tail = 0  # i > 0: end of the i-th pattern; 0: not a pattern end
        self.child = []  # child nodes
        self.childvalue = []  # characters of the children, parallel to `child`

    def __repr__(self):
        return "node(ch={!r}, tail={!r}, childvalue={!r})".format(
            self.ch, self.tail, self.childvalue
        )
# Aho-Corasick automaton.
class Aho_Corasick:
    """Multi-pattern matcher built on a trie with failure links.

    Usage: call ``insert`` once per pattern, then ``runkmp`` on each text.
    ``ac_automation`` (re)computes the failure links; ``runkmp`` runs it
    automatically when patterns were inserted since the last build.
    """

    def __init__(self):
        self.root = node("")  # root of the trie
        self.count = 0  # number of patterns inserted so far
        self._needs_build = True  # failure links are stale or missing

    # Step 1: build the trie from the patterns.
    def insert(self, strkey):
        """Insert the pattern *strkey* and mark its last node.

        The last node's ``tail`` is set to the 1-based insertion number of
        the pattern (the value of ``self.count`` after this insertion).
        """
        self.count += 1
        self._needs_build = True
        p = self.root
        for ch in strkey:
            if ch in p.childvalue:
                # Character already present: descend into the existing child.
                p = p.child[p.childvalue.index(ch)]
            else:
                # Character missing: create the child, then descend.
                child = node(ch)
                p.child.append(child)
                p.childvalue.append(ch)
                p = child
        p.tail = self.count

    # Step 2: compute the failure links.
    def ac_automation(self):
        """Assign the ``fail`` link of every node with a breadth-first walk."""
        from collections import deque

        queue = deque([self.root])
        while queue:
            current = queue.popleft()  # O(1) dequeue
            for child in current.child:
                if current is self.root:
                    # Children of the root always fall back to the root.
                    child.fail = self.root
                else:
                    # Follow the parent's failure chain until some node has a
                    # child carrying the same character.
                    p = current.fail
                    while p is not None:
                        if child.ch in p.childvalue:
                            child.fail = p.child[p.childvalue.index(child.ch)]
                            break
                        p = p.fail
                    if p is None:
                        # Chain exhausted without a match: fall back to root.
                        child.fail = self.root
                queue.append(child)
        self._needs_build = False

    # Step 3: match the patterns against a text.
    def runkmp(self, strmode):
        """Scan *strmode* and count the occurrences of every pattern.

        Returns a dict mapping a pattern's 1-based insertion number (its
        ``tail`` value) to the number of times it occurs in *strmode*.
        Use ``bool(result)`` to know whether anything matched and
        ``len(result)`` for the number of distinct patterns matched.
        """
        if getattr(self, "_needs_build", True):
            # Without failure links the walk below would dereference None.
            self.ac_automation()
        p = self.root
        cnt = {}
        for ch in strmode:
            # Fall back along failure links until ch can be consumed.
            while ch not in p.childvalue and p is not self.root:
                p = p.fail
            if ch in p.childvalue:
                p = p.child[p.childvalue.index(ch)]
            else:
                p = self.root
            # Every node on the failure chain that ends a pattern is a match
            # ending at the current character.
            temp = p
            while temp is not self.root:
                if temp.tail:
                    cnt[temp.tail] = cnt.get(temp.tail, 0) + 1
                temp = temp.fail
        return cnt
class Trie:
    """Prefix tree stored as nested dicts.

    Each node is a dict mapping a character to its child dict. A node that
    ends a word additionally carries the key ``self.word_end`` (``-1``).
    """

    def __init__(self):
        """Initialize an empty trie."""
        self.name = 'Tire'
        self.root = {}
        self.word_end = -1  # marker key stored on nodes that end a word

    def _walk(self, chars):
        """Return the node reached by following *chars*, or None if absent."""
        cur = self.root
        for c in chars:
            if c not in cur:
                return None
            cur = cur[c]
        return cur

    def insert(self, word):
        """Insert *word* into the trie.

        :type word: str
        :rtype: None
        """
        cur = self.root
        for c in word:
            cur = cur.setdefault(c, {})
        cur[self.word_end] = True

    def search(self, word):
        """Return True if exactly *word* was inserted into the trie.

        :type word: str
        :rtype: bool
        """
        found = self._walk(word)
        # The path may exist only as a prefix of a longer word.
        return found is not None and self.word_end in found

    def startsWith(self, prefix):
        """Return True if the path spelled by *prefix* exists in the trie.

        :type prefix: str
        :rtype: bool
        """
        return self._walk(prefix) is not None
class Solution:
    """Keyword matcher driven by an Excel keyword dictionary.

    The spreadsheet is read column by column: every column whose header does
    not contain ``'Unnamed'`` is a category and its string cells are the
    keywords of that category. ``besides`` is an optional helper structure
    (an object with ``insert``; a trie or an Aho-Corasick automaton) that
    receives every keyword while the dictionary is loaded.
    """

    def __init__(self, dict_file: str, besides=None):
        self.besides = besides
        self.key_word_list = []  # all keywords, in insertion order
        self.a_dict = self.read_xls(dict_file)  # category -> list of keywords

    def read_xls(self, file) -> dict:
        """Load the keyword dictionary from the Excel file *file*.

        Returns a dict mapping each category (column header) to the list of
        its string cells. Every keyword is also appended to
        ``self.key_word_list`` and, when ``self.besides`` is set, inserted
        into it. Non-string cells (for example empty cells) are skipped.
        """
        a_dict = {}
        sheet = pd.read_excel(file)
        for key in sheet.keys():
            if 'Unnamed' in str(key):
                continue
            words = a_dict[key] = []
            for word in sheet[key]:
                if not isinstance(word, str):
                    # Skip the cell instead of abandoning the rest of the
                    # column, so keywords below a blank cell are kept.
                    continue
                self.key_word_list.append(word)
                words.append(word)
                if self.besides is not None:
                    self.besides.insert(word)
        return a_dict

    ####################################################################
    def BF(self, word, doc):
        """Brute-force search: count the occurrences of *word* in *doc*.

        Overlapping occurrences are counted. Returns ``(word, count)`` or
        None when *word* is empty or does not occur.
        """
        if not word:
            return None
        length = len(word)
        # +1 so that a match ending at the last character is also tested.
        hits = sum(
            1 for i in range(len(doc) - length + 1) if doc.startswith(word, i)
        )
        return (word, hits) if hits else None

    ####################################################################
    def KMP(self, word: str, doc: str):
        """Knuth-Morris-Pratt search: count the occurrences of *word* in *doc*.

        Overlapping occurrences are counted. Returns ``(word, count)`` or
        None when *word* is empty or does not occur.
        """
        if not word:
            return None
        n = len(word)

        # fail[i] is the index of the last character of the longest proper
        # prefix of word[:i + 1] that is also its suffix, or -1 if none.
        fail = [-1] * n
        j = -1
        for i in range(1, n):
            while j != -1 and word[i] != word[j + 1]:
                j = fail[j]
            if word[i] == word[j + 1]:
                j += 1
            fail[i] = j

        hits = 0
        j = -1
        for ch in doc:
            while j != -1 and ch != word[j + 1]:
                j = fail[j]
            if ch == word[j + 1]:
                j += 1
            if j == n - 1:
                hits += 1
                j = fail[j]  # keep going to find overlapping matches
        return (word, hits) if hits else None

    ####################################################################
    def Re(self, word: str, doc: str):
        """Regex search: report whether *word* is found in *doc*.

        *word* is handed to ``re.search`` as the pattern, so regex
        metacharacters in it are interpreted rather than matched literally.
        Returns ``(word, 1)`` on a match (the count is always 1) or None.
        """
        return (word, 1) if re.search(word, doc) else None

    ####################################################################
    def Tire_Tree(self, doc: str):
        """Trie search: collect keywords found in *doc* using ``self.besides``.

        ``self.besides`` must expose ``root`` as nested dicts in which a node
        ending a word carries the end marker key (``word_end``, ``-1`` when
        the attribute is missing). For every start position the scan stops at
        the first (shortest) keyword found. Returns the list of matched
        substrings, or None when nothing matched.

        NOTE: the end-of-word check starts at the second character, so
        keywords of length 1 are never reported.
        """
        root = self.besides.root
        word_end = getattr(self.besides, 'word_end', -1)
        res = []
        for i, first in enumerate(doc):
            if first not in root:
                continue
            cur = root[first]
            for j in range(i + 1, len(doc)):
                nxt = cur.get(doc[j])
                if nxt is None:
                    break
                if word_end in nxt:
                    res.append(doc[i:j + 1])
                    break
                cur = nxt
        return res if res else None

    ####################################################################
    def Aho_Corasick(self, doc: str):
        """Aho-Corasick search: count every keyword occurring in *doc*.

        Returns a list of ``(keyword, count)`` tuples built from the result
        of ``self.besides.runkmp(doc)``.
        """
        counts = self.besides.runkmp(doc)
        # runkmp keys are 1-based insertion numbers; key_word_list is 0-based.
        return [(self.key_word_list[num - 1], hits) for num, hits in counts.items()]

    ####################################################################
    def operation(self, algorithm: str, file_name: str):
        """Run *algorithm* on every line of the UTF-8 text file *file_name*.

        *algorithm* is one of ``'BF'``, ``'KMP'``, ``'Re'``, ``'Tire'`` or
        ``'Aho_Corasick'``. Returns one ``(matches, line)`` tuple per line:

        * ``'BF'``/``'KMP'``/``'Re'``: ``matches`` is a list of
          ``(result, category)`` for every keyword that matched the line.
        * ``'Tire'``/``'Aho_Corasick'``: ``matches`` is a one-element list
          ``[(result, line)]`` with the result of the whole-line search.

        Raises ValueError for an unknown algorithm, or when a trie/automaton
        algorithm is requested but no ``besides`` structure was supplied.
        """
        per_word = {'BF': self.BF, 'KMP': self.KMP, 'Re': self.Re}
        whole_line = {'Tire': self.Tire_Tree, 'Aho_Corasick': self.Aho_Corasick}
        if algorithm not in per_word and algorithm not in whole_line:
            raise ValueError(
                "unknown algorithm {!r}; expected one of {}".format(
                    algorithm, sorted(list(per_word) + list(whole_line))
                )
            )
        if algorithm in whole_line and self.besides is None:
            raise ValueError(
                "algorithm {!r} requires a 'besides' structure".format(algorithm)
            )
        if algorithm == 'Aho_Corasick':
            # Failure links must exist before the automaton is used.
            self.besides.ac_automation()

        with open(file_name, 'r', encoding='utf-8') as f:
            textlines = f.readlines()

        res = []
        if algorithm in whole_line:
            algo = whole_line[algorithm]
            for text in textlines:
                res.append(([(algo(text), text)], text))
            return res

        algo = per_word[algorithm]
        for text in textlines:
            word_pos = []
            for key, words in self.a_dict.items():
                for word in words:
                    tmp = algo(word, text)
                    if tmp:
                        word_pos.append((tmp, key))
            res.append((word_pos, text))
        return res
if __name__ == '__main__':
    # Match the keyword dictionary against every line of all.txt with the
    # Aho-Corasick automaton, write one result per line to res.txt, and
    # print the elapsed time in seconds.
    start = time.perf_counter()  # monotonic clock, suited to intervals
    aho_corasick = Aho_Corasick()
    solution = Solution('key_word-update.xlsx', aho_corasick)
    res = solution.operation('Aho_Corasick', 'all.txt')
    # Alternative without a helper structure:
    #   solution = Solution('key_word-update.xlsx')
    #   res = solution.operation('Re', 'all.txt')
    with open('res.txt', 'w', encoding='utf-8') as f:
        f.writelines(str(item) + '\n' for item in res)
    print(time.perf_counter() - start)
# Source: oschina
# Link: https://my.oschina.net/u/4258318/blog/4309075