Partitioning a large dictionary (400k+) for finding words in words in Python 3

Submitted by 依然范特西╮ on 2021-01-29 16:19:38

Question


In my neural network, I want to normalize the words of a text: each word is converted to its infinitive form and made gender-independent. Example: "Unternehmensgruppenführer" (German) is split into 3 words: unternehmen gruppe fahren. This requires a lot of "startsWith" processing.

I built a table that maps word => infinitive and loaded it into a sorted(!) dictionary of length 400k. However, finding words inside words with that dictionary became very slow, so I optimized it with an implementation of binary search and index partitioning.

I am very thankful for suggestions to optimize this further with other / better approaches! Please note that I know about Redis clusters and further parallelization (async/threads), but before taking that step I would like to optimize the code itself and get the basics right.

The following function takes the dictionary dic of word => infinitive and builds the partitioning variables:

from bisect import bisect_left

indice = []         # partition prefixes, e.g. aa, ab, ..., zz
indice_vals = []    # start position of each partition within keys
indice_len = 0      # length of the partition prefixes (2 = aa..zz | 3 = aal..zwa)
keys = []           # dictionary keys that get partitioned

def build_indice(dic, leng=2):
    chars = [0] * leng
    kc = [0] * leng
    global indice_len
    indice_len = leng

    dict_pos = -1

    for x in dic.keys():
        keys.append(x)
        dict_pos += 1
        if len(x) < leng:   # skip words shorter than the partition prefix
            continue

        for c in range(0, leng):
            kc[c] = x[c]    # copy the first leng characters: x = "unternehmen" -> kc = "un"

        changed = False

        for c in range(0, leng):   # check position by position whether the prefix changed
            if chars[c] != kc[c]:
                chars[c] = kc[c]
                changed = True

        if changed:
            indice.append("".join(chars))   # new prefix: add it to the partition table
            indice_vals.append(dict_pos)
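
For illustration, running build_indice with leng=3 on the small sample dictionary from the test section at the end yields one partition entry per distinct 3-character prefix. This is only a sketch (run in a fresh session, since build_indice appends to the module-level lists); the expected values in the comments follow from the code above:

# illustrative only: the sample dictionary from the test section below
dic = {'fuhr': 'fahren', 'gruppen': 'gruppe', 'nehmen': 'nehmen',
       'unter': 'unter', 'unternehmen': 'unternehmen'}
build_indice(dic, 3)

print(indice)       # ['fuh', 'gru', 'neh', 'unt']
print(indice_vals)  # [0, 1, 2, 3]
print(keys)         # ['fuhr', 'gruppen', 'nehmen', 'unter', 'unternehmen']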

The two functions contains and binary_search are used to find dictionary words that begin with a given string:

def contains(dic, word):
    # prefix lookup: is there any dictionary key that starts with word?
    if len(word) < indice_len:   # ignore words shorter than the partition prefix
        return False, -1, -1

    lookup = word[0:indice_len]
    index = binary_search(indice, lookup)   # check for word in partition index

    if index == -1:
        return False, -1, -1   # word not found in the partition index

    index_dict = indice_vals[index]

    if index + 1 < len(indice_vals):   # upper bound: start of the next partition ...
        next_index_dict = indice_vals[index + 1]
    else:
        next_index_dict = len(dic)     # ... or the end of the dictionary for the last partition

    for x in range(index_dict, next_index_dict):   # only search in narrowed range
        if keys[x].startswith(word):
            return True, index_dict, next_index_dict
    return False, -1, -1

def binary_search(a, x, lo=0, hi=None):   # a = sorted list, x = word, lo/hi = search range
    if hi is None:
        hi = len(a)

    i = bisect_left(a, x, lo=lo, hi=hi)
    if i != len(a) and a[i] == x:
        return i
    return -1
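
As an illustration with the sample data from above, contains narrows the scan to the partition of the word's 3-character prefix and returns the match flag together with the narrowed key range (a sketch; the values follow from the sample dictionary):

# illustrative only, using the sample dictionary built above
print(contains(dic, "unter"))   # (True, 3, 5): scans keys[3:5] = ['unter', 'unternehmen']
print(contains(dic, "xyz"))     # (False, -1, -1): prefix 'xyz' is not in the partition index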

Finally, get_words returns all recognized words. It makes use of get_longest_word, which iteratively splits off the longest detected word and returns it together with the remainder:

def get_longest_word(dic, word):   # word = unternehmensgruppenführer
    longest = ""
    last = ""

    if len(word) < indice_len:   # ignore words smaller than partitioned index
        return "", word

    longest = word[0:indice_len - 1]   # longest = un

    for i in range(indice_len - 1, len(word)):
        longest += word[i]   # longest = unt
        hasit, start, end = contains(dic, longest)   # check for 'unt' in dict
        if hasit:
            index = binary_search(keys, longest, start)
            if index > -1:
                last = longest   # word 'unter' exists, so keep it
        else:
            word = word[len(last):]
            return last, word   # last = unter, word = nehmensgruppenführer

    word = word[len(last):]
    return last, word
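
For example, on the sample data get_longest_word peels off the longest matching prefix and returns the rest (a sketch; the result follows from tracing the code above):

# illustrative only, using the sample dictionary built above
print(get_longest_word(dic, "unternehmensgruppenfuhrer"))
# ('unternehmen', 'sgruppenfuhrer'): 'unter' matches first, but the longer
# 'unternehmen' wins; the leading 's' of the remainder is the German
# binding 's' that get_words strips below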


def get_words(dic, word):
    tmp = word
    xx = " "
    resolved = ""
    while xx != "":   # crawl for recognized words until no words are left
        xx, word = get_longest_word(dic, word)
        # German: compound parts are often joined with a binding 's' (Fugen-s)
        if len(word) > 0 and word[0] == 's' and len(xx) == 0:
            word = word[1:]
            xx, word = get_longest_word(dic, word)
        if len(xx) > 0:
            inf, success = get_infinitive(dic, xx)  # convert to infinitive
            resolved += " " + inf  # build a string of recognized infinitives

    if len(word) == 0:
        resolved = resolved.strip()  # trim
        return resolved
    else:
        return resolved + " " + tmp   # not all words detected, add the original word

You can test with:

def get_infinitive(dic, word):
    if word in dic:
        return dic[word], True
    else:
        return word, False


dic = {'fuhr': 'fahren', 'gruppen': 'gruppe', 'nehmen': 'nehmen', 'unter': 'unter', 'unternehmen': 'unternehmen'}
build_indice(dic, 3)
print("indice_len " + str(indice_len))
print(get_words(dic, "unternehmensgruppenfuhrer"))

Output:

indice_len 3
 unternehmen gruppe fahren unternehmensgruppenfuhrer
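
Note that the original compound is appended at the end: after 'fuhr' (=> fahren) is split off, the trailing 'er' is shorter than the partition prefix and stays unresolved, so get_words takes its else branch and adds the unmodified input word.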

Source: https://stackoverflow.com/questions/60901265/partitioning-large-dictionary-400k-for-finding-words-in-words-in-python-3
