Is there a way to convert number words to Integers?

前端 未结 16 2261
北恋
北恋 2020-11-22 06:14

I need to convert one into 1, two into 2 and so on.

Is there a way to do this with a library or a class or anything?

16条回答
  •  余生分开走
    2020-11-22 06:48

    I needed something a bit different since my input is from a speech-to-text conversion and the solution is not always to sum the numbers. For example, "my zipcode is one two three four five" should not convert to "my zipcode is 15".

    I took Andrew's answer and tweaked it to handle a few other cases people highlighted as errors, and also added support for examples like the zipcode one I mentioned above. Some basic test cases are shown below, but I'm sure there is still room for improvement.

    def is_number(x):
        """Return True if ``x`` can be converted by ``float()``, else False.

        Commas are removed from string input first, so a value written with
        thousands separators such as ``"1,000"`` counts as a number.

        Note that this accepts everything ``float()`` accepts, which includes
        non-integer text such as ``"1.5"``, ``"1e5"``, ``"nan"`` and ``"inf"``.

        Args:
            x: The value to test; typically a string token, but any object is
                allowed.

        Returns:
            bool: True when ``float(x)`` succeeds (after comma removal for
            strings), False otherwise.
        """
        # isinstance() also covers str subclasses, which `type(x) == str` missed.
        if isinstance(x, str):
            x = x.replace(',', '')
        try:
            float(x)
        # Only the conversion failures float() raises are treated as "not a
        # number"; a bare except would also swallow KeyboardInterrupt/SystemExit.
        except (TypeError, ValueError, OverflowError):
            return False
        return True
    
    def text2int(textnum, numwords={}):
        """Replace spelled-out numbers in ``textnum`` with digits.

        The text is split on whitespace (hyphens are treated as spaces) and
        scanned word by word. Runs of number words are combined into one
        integer ("three hundred and forty five" -> "345"), while a unit word
        that directly follows another unit word starts a new number whose
        digits are appended without a space ("one two three" -> "123"), so
        digit-by-digit sequences such as zip codes survive. Ordinals
        ("first", "nineteenth", "twentieth") are converted to their cardinal
        value. Integer literals, optionally with thousands commas, are
        accepted as number tokens ("1 million" -> "1000000"). Every other
        word is copied to the output followed by a single space, so the
        result ends with a space when the last word is not a number.

        Matching is case-sensitive: only lowercase number words are known.

        Args:
            textnum: The input text.
            numwords: Mapping of number word -> ``(scale, increment)``. The
                mutable default is intentional: it is filled on the first
                call and reused as a cache by later calls. A non-empty
                mapping passed by the caller is used as-is.

        Returns:
            str: The text with number words replaced by digits.
        """
        units = [
            'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
            'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
            'sixteen', 'seventeen', 'eighteen', 'nineteen',
        ]
        # The two leading '' entries only pad the list so that index * 10 is
        # the value of each word.
        tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
        scales = ['hundred', 'thousand', 'million', 'billion', 'trillion']
        # Irregular ordinals that cannot be derived by stripping a suffix.
        ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
        ordinal_endings = [('ieth', 'y'), ('th', '')]

        if not numwords:
            numwords['and'] = (1, 0)
            for idx, name in enumerate(units):
                numwords[name] = (1, idx)
            for idx, name in enumerate(tens):
                # Skip the '' padding entries: registering them made the bare
                # token "th" (stripped to '') parse as the number 10.
                if name:
                    numwords[name] = (1, idx * 10)
            for idx, name in enumerate(scales):
                # 'hundred' is 10**2; the remaining scales step by 10**3.
                numwords[name] = (10 ** (idx * 3 or 2), 0)

        def parse_int(token):
            # Integer value of a digit literal (commas allowed), else None.
            # Using int() here instead of a float() check keeps tokens such
            # as "1.5" or "nan" from being classified as numbers and then
            # crashing the integer conversion.
            try:
                return int(token.replace(',', ''))
            except ValueError:
                return None

        def is_numword(token):
            # True for integer literals and for known number words.
            return parse_int(token) is not None or token in numwords

        def from_numword(token):
            # (scale, increment) for a token accepted by is_numword().
            value = parse_int(token)
            if value is not None:
                # Scale 0 makes a literal replace whatever is in `current`.
                return 0, value
            return numwords[token]

        textnum = textnum.replace('-', ' ')

        current = result = 0
        curstring = ''
        onnumber = False    # a number is being accumulated
        lastunit = False    # previous word was a unit word (zero..nineteen)
        lastscale = False   # previous word was a scale word (hundred, ...)

        for word in textnum.split():
            if word in ordinal_words:
                current += ordinal_words[word]
                onnumber = True
                lastunit = False
                lastscale = False
                continue

            # Turn a regular ordinal into its cardinal ("sixth" -> "six",
            # "twentieth" -> "twenty"), but only when the stem really is a
            # number; otherwise ordinary words such as "with" or "month"
            # would be emitted with their ending chopped off.
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    stem = word[:-len(ending)] + replacement
                    if stem and is_numword(stem):
                        word = stem
                        break

            # 'and' only belongs to a number directly after a scale word
            # ("three hundred and five"); elsewhere it is ordinary text.
            if not is_numword(word) or (word == 'and' and not lastscale):
                if onnumber:
                    # Flush the number built so far before the plain word.
                    curstring += str(result + current) + " "
                curstring += word + " "
                result = current = 0
                onnumber = False
                lastunit = False
                lastscale = False
                continue

            scale, increment = from_numword(word)
            onnumber = True

            if lastunit and word not in scales:
                # Assume this is part of a string of individual numbers to
                # be flushed, such as a zipcode "one two three four five".
                curstring += str(result + current)
                result = current = 0

            if scale > 1:
                # A bare scale word counts as one of it ("hundred" -> 100).
                current = max(1, current)

            current = current * scale + increment
            if scale > 100:
                # Thousands and above close the current group.
                result += current
                current = 0

            lastscale = word in scales
            lastunit = (not lastscale) and word in units

        if onnumber:
            curstring += str(result + current)

        return curstring
    

    Some tests...

    one two three -> 123
    three forty five -> 345
    three and forty five -> 3 and 45
    three hundred and forty five -> 345
    three hundred -> 300
    twenty five hundred -> 2500
    three thousand and six -> 3006
    three thousand six -> 3006
    nineteenth -> 19
    twentieth -> 20
    first -> 1
    my zip is one two three four five -> my zip is 12345
    nineteen ninety six -> 1996
    fifty-seventh -> 57
    one million -> 1000000
    first hundred -> 100
    I will buy the first thousand -> I will buy the 1000  # probably should leave ordinal in the string
    thousand -> 1000
    hundred and six -> 106
    1 million -> 1000000
    

提交回复
热议问题