NLTK Tagging spanish words using a corpus

前端 未结 4 848
無奈伤痛
無奈伤痛 2020-11-27 17:13

I am trying to learn how to tag spanish words using NLTK.

From the nltk book, It is quite easy to tag english words using their example. Because I am new to nltk an

4条回答
  •  星月不相逢
    2020-11-27 18:01

    Given the tutorial in the previous answer, here's a more object-oriented approach from spaghetti tagger: https://github.com/alvations/spaghetti-tagger

    #-*- coding: utf8 -*-
    
    from nltk import UnigramTagger as ut
    from nltk import BigramTagger as bt
    from cPickle import dump,load
    
    def loadtagger(taggerfilename):
        infile = open(taggerfilename,'rb')
        tagger = load(infile); infile.close()
        return tagger
    
    def traintag(corpusname, corpus):
        # Function to save tagger.
        def savetagger(tagfilename,tagger):
            outfile = open(tagfilename, 'wb')
            dump(tagger,outfile,-1); outfile.close()
            return
        # Training UnigramTagger.
        uni_tag = ut(corpus)
        savetagger(corpusname+'_unigram.tagger',uni_tag)
        # Training BigramTagger.
        bi_tag = bt(corpus)
        savetagger(corpusname+'_bigram.tagger',bi_tag)
        print "Tagger trained with",corpusname,"using" +\
                    "UnigramTagger and BigramTagger."
        return
    
    # Function to unchunk corpus.
    def unchunk(corpus):
        nomwe_corpus = []
        for i in corpus:
            nomwe = " ".join([j[0].replace("_"," ") for j in i])
            nomwe_corpus.append(nomwe.split())
        return nomwe_corpus
    
    class cesstag():
        def __init__(self,mwe=True):
            self.mwe = mwe
            # Train tagger if it's used for the first time.
            try:
                loadtagger('cess_unigram.tagger').tag(['estoy'])
                loadtagger('cess_bigram.tagger').tag(['estoy'])
            except IOError:
                print "*** First-time use of cess tagger ***"
                print "Training tagger ..."
                from nltk.corpus import cess_esp as cess
                cess_sents = cess.tagged_sents()
                traintag('cess',cess_sents)
                # Trains the tagger with no MWE.
                cess_nomwe = unchunk(cess.tagged_sents())
                tagged_cess_nomwe = batch_pos_tag(cess_nomwe)
                traintag('cess_nomwe',tagged_cess_nomwe)
                print
            # Load tagger.
            if self.mwe == True:
                self.uni = loadtagger('cess_unigram.tagger')
                self.bi = loadtagger('cess_bigram.tagger')
            elif self.mwe == False:
                self.uni = loadtagger('cess_nomwe_unigram.tagger')
                self.bi = loadtagger('cess_nomwe_bigram.tagger')
    
    def pos_tag(tokens, mmwe=True):
        tagger = cesstag(mmwe)
        return tagger.uni.tag(tokens)
    
    def batch_pos_tag(sentences, mmwe=True):
        tagger = cesstag(mmwe)
        return tagger.uni.batch_tag(sentences)
    
    tagger = cesstag()
    print tagger.uni.tag('Mi colega me ayuda a programar cosas .'.split())
    

提交回复
热议问题