Simplifying the French POS Tag Set with NLTK

前端 未结 1 400
栀梦
栀梦 2021-01-02 23:10

How can one simplify the part of speech tags returned by Stanford's French POS tagger? It is fairly easy to read an English sentence into NLTK, find each word's part of sp

相关标签:
1条回答
  • 2021-01-02 23:48

    I ended up just manually mapping Stanford's POS tags to the universal tag set. For what it's worth, the snippet above was part of a slightly larger workflow aimed at measuring syntactic similarity between French and English sentences. Here's the full code, in case it helps others:

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    
    '''NLTK 3.0 offers map_tag, which maps the Penn Treebank Tag Set to the Universal Tagset, a coarse tag set with the following 12 tags:
    
    VERB - verbs (all tenses and modes)
    NOUN - nouns (common and proper)
    PRON - pronouns
    ADJ - adjectives
    ADV - adverbs
    ADP - adpositions (prepositions and postpositions)
    CONJ - conjunctions
    DET - determiners
    NUM - cardinal numbers
    PRT - particles or other function words
    X - other: foreign words, typos, abbreviations
    . - punctuation
    
    We'll map Stanford's tag set to this tag set then compare the similarity between subregions of French and English sentences.'''
    
    from __future__ import division
    import os, math
    from nltk.tag.stanford import POSTagger
    from nltk.tokenize import word_tokenize
    from nltk.tag import map_tag
    from collections import Counter
    
    #########################
    # Create Tagset Mapping #
    #########################
    
    def create_french_to_universal_dict():
        '''Build and return a fresh dict mapping French POS tags to universal tags.

        Each call returns a new dict, so callers may mutate the result freely.
        Keys are the French tags listed below; values are universal tagset labels.
        '''
        tag_pairs = (
            # adjectives and adverbs
            (u"ADJ", u"ADJ"),
            (u"ADJWH", u"ADJ"),
            (u"ADV", u"ADV"),
            (u"ADVWH", u"ADV"),
            # conjunctions and clitic pronouns
            (u"CC", u"CONJ"),
            (u"CLO", u"PRON"),
            (u"CLR", u"PRON"),
            (u"CLS", u"PRON"),
            (u"CS", u"CONJ"),
            # determiners
            (u"DET", u"DET"),
            (u"DETWH", u"DET"),
            # ET is mapped to the catch-all X tag
            (u"ET", u"X"),
            # nouns
            (u"NC", u"NOUN"),
            (u"NPP", u"NOUN"),
            # adpositions and punctuation
            (u"P", u"ADP"),
            (u"PUNC", u"."),
            # pronouns
            (u"PRO", u"PRON"),
            (u"PROREL", u"PRON"),
            (u"PROWH", u"PRON"),
            # verbs
            (u"V", u"VERB"),
            (u"VIMP", u"VERB"),
            (u"VINF", u"VERB"),
            (u"VPP", u"VERB"),
            (u"VPR", u"VERB"),
            (u"VS", u"VERB"),
            # nb, I is not part of the universal tagset--interjections get mapped to X
            (u"I", u"X"),
        )
        return dict(tag_pairs)
    
    # Build the lookup table once at import time; the mapping function below reads it.
    french_to_universal_dict = create_french_to_universal_dict()

    def map_french_tag_to_universal(list_of_french_tag_tuples):
        '''Return a new list of (word, universal_pos) tuples.

        Takes a list of (word, french_pos) tuples and swaps each French tag for
        its universal equivalent via french_to_universal_dict. A tag missing
        from that dict raises KeyError.
        '''
        universal_tag_tuples = []
        for pair in list_of_french_tag_tuples:
            universal_tag_tuples.append((pair[0], french_to_universal_dict[pair[1]]))
        return universal_tag_tuples
    
    ###############################
    # Define Similarity Functions #
    ###############################
    
    def counter_cosine_similarity(c1, c2):
        '''Return the cosine similarity of two count mappings.

        c1, c2: Counter (or any dict-like with .get) mapping term -> count.
        Returns dot(c1, c2) / (|c1| * |c2|) as a float. If either mapping has
        zero magnitude (empty or all-zero counts) the similarity is undefined,
        so 0.0 is returned instead of raising ZeroDivisionError.
        '''
        terms = set(c1).union(c2)
        dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
        magA = math.sqrt(sum(c1.get(k, 0) ** 2 for k in terms))
        magB = math.sqrt(sum(c2.get(k, 0) ** 2 for k in terms))
        denominator = magA * magB
        # Guard the division: an empty tag list yields a zero-length vector.
        if denominator == 0:
            return 0.0
        return dotprod / denominator
    
    def longest_common_subsequence_length(a, b):
        '''Return the length of the longest common subsequence of a and b.

        a, b: sequences supporting len() and iteration (e.g. lists of POS tags).
        A subsequence keeps element order but need not be contiguous.
        Uses a (len(a) + 1) x (len(b) + 1) dynamic-programming table; returns 0
        when either sequence is empty.
        '''
        # range (not the Python 2-only xrange) keeps this runnable on Python 2 and 3.
        table = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
        for i, ca in enumerate(a, 1):
            for j, cb in enumerate(b, 1):
                if ca == cb:
                    # Matching elements extend the best result of both prefixes.
                    table[i][j] = table[i - 1][j - 1] + 1
                else:
                    # Otherwise carry over the better result from dropping one element.
                    table[i][j] = max(table[i][j - 1], table[i - 1][j])
        return table[-1][-1]
    
    def longest_contiguous_subsequence_length(a, b):
        '''Return the length of the longest contiguous run shared by a and b.

        a, b: sequences supporting len() and iteration (e.g. lists of POS tags).
        table[i][j] holds the length of the common run ending at a[i - 1] and
        b[j - 1]; cells for non-matching elements stay 0. Returns 0 when the
        sequences share no element or either is empty.
        '''
        # range (not the Python 2-only xrange) keeps this runnable on Python 2 and 3.
        table = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
        longest = 0
        for i, ca in enumerate(a, 1):
            for j, cb in enumerate(b, 1):
                if ca == cb:
                    # Extend the run that ended on the previous pair of elements.
                    table[i][j] = table[i - 1][j - 1] + 1
                    if table[i][j] > longest:
                        longest = table[i][j]
        return longest
    
    def calculate_syntactic_similarity(french_pos_tuples, english_pos_tuples):
        '''Return three similarity scores for two POS-tagged sentences.

        french_pos_tuples, english_pos_tuples: lists of (word, pos) tuples.
        Returns a tuple (cosine_similarity, lc_subsequence, lc_contiguous_subsequence):
          - cosine similarity of the two tag-count vectors,
          - longest common subsequence length of the tag sequences,
          - longest common contiguous run length of the tag sequences,
        the last two divided by the length of the longer tag sequence.
        If either input is empty, (0.0, 0.0, 0.0) is returned.
        '''
        french_pos_list = [tup[1] for tup in french_pos_tuples]
        english_pos_list = [tup[1] for tup in english_pos_tuples]
        # Nothing to compare: avoid dividing by a zero length or zero magnitude.
        if not french_pos_list or not english_pos_list:
            return 0.0, 0.0, 0.0
        cosine_similarity = counter_cosine_similarity(Counter(french_pos_list), Counter(english_pos_list))
        # float() keeps the ratios fractional even without true division.
        longer_length = float(max(len(french_pos_list), len(english_pos_list)))
        # The sequence measures need the ordered tag lists; iterating a Counter
        # yields only the distinct tags and loses order and repetition.
        lc_subsequence = longest_common_subsequence_length(french_pos_list, english_pos_list) / longer_length
        lc_contiguous_subsequence = longest_contiguous_subsequence_length(french_pos_list, english_pos_list) / longer_length
        return cosine_similarity, lc_subsequence, lc_contiguous_subsequence
    
    ########################### 
    # Parse POS with Stanford #
    ###########################
    
    # Set JAVA_HOME from within the script so the Stanford tagger's Java process can be found.
    # Run os.getenv("JAVA_HOME") to check the value.
    os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk1.7.0_25\\bin"
    
    # Example sentence pair to compare.
    english = u"the whole earth swarms with living beings, every plant, every grain and leaf, supports the life of thousands."
    french = u"Chaque plante, chaque graine, chaque particule de matière organique contient des milliers d'atomes animés."
    
    # Machine-specific paths to the Stanford tagger models and jar; adjust for your install.
    path_to_english_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\english-bidirectional-distsim.tagger"
    path_to_french_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\french.tagger"
    path_to_jar = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar"
    
    # Define the English and French taggers.
    english_tagger = POSTagger(path_to_english_model, path_to_jar, encoding="utf-8")
    french_tagger = POSTagger(path_to_french_model, path_to_jar, encoding="utf-8")
    
    # Each tuple in the tagged lists is (word, pos).
    list_of_english_pos_tuples = english_tagger.tag(word_tokenize(english))
    list_of_french_pos_tuples = french_tagger.tag(word_tokenize(french))
    
    # Simplify each tagset: English via NLTK's map_tag, French via the manual mapping.
    simplified_pos_tags_english = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in list_of_english_pos_tuples]
    simplified_pos_tags_french = map_french_tag_to_universal( list_of_french_pos_tuples )
    
    # print(...) with a single argument prints the same tuple on Python 2 and Python 3.
    print(calculate_syntactic_similarity(simplified_pos_tags_french, simplified_pos_tags_english))
    
    0 讨论(0)
提交回复
热议问题