Extracting text from HTML file using Python

后端 未结 30 2695
一生所求
一生所求 2020-11-22 04:05

I'd like to extract the text from an HTML file using Python. I want essentially the same output I would get if I copied the text from a browser and pasted it into Notepad.

30条回答
  •  庸人自扰
    2020-11-22 04:31

    I found myself facing just the same problem today. I wrote a very simple HTML parser to strip incoming content of all markup, returning the remaining text with only a minimum of formatting.

    from HTMLParser import HTMLParser
    from re import sub
    from sys import stderr
    from traceback import print_exc
    
    class _DeHTMLParser(HTMLParser):
        """HTML parser that drops markup and collects the remaining text.

        Text nodes are whitespace-normalised and accumulated in document
        order. A ``<p>`` start tag contributes a blank line and a ``<br>``
        (written either as ``<br>`` or ``<br/>``) contributes a single line
        break. Feed the parser, close it, then call :meth:`text`.
        """

        def __init__(self):
            # Explicit base-class call, as in the original, rather than super().
            HTMLParser.__init__(self)
            # Ordered fragments (text pieces and newlines) joined by text().
            self.__text = []

        def handle_data(self, data):
            """Record a text node, collapsing whitespace runs to one space."""
            text = data.strip()
            if text:
                text = sub(r'[ \t\r\n]+', ' ', text)
                # The trailing space keeps adjacent text nodes from fusing.
                self.__text.append(text + ' ')

        def handle_starttag(self, tag, attrs):
            """Emit paragraph and line breaks; all other tags are ignored."""
            if tag == 'p':
                self.__text.append('\n\n')
            elif tag == 'br':
                self.__text.append('\n')

        def handle_startendtag(self, tag, attrs):
            """Treat a self-closing tag exactly like its plain start tag.

            Previously ``<br/>`` produced two newlines while ``<br>`` produced
            one; delegating keeps both spellings consistent.
            """
            self.handle_starttag(tag, attrs)

        def text(self):
            """Return the collected text with outer whitespace stripped."""
            return ''.join(self.__text).strip()
    
    
    def dehtml(text):
        """Return *text* with its HTML markup removed.

        The input is run through ``_DeHTMLParser`` and the collected plain
        text is returned. This is deliberately best-effort: if parsing raises,
        the traceback is printed to stderr and the original *text* is returned
        unchanged.
        """
        try:
            parser = _DeHTMLParser()
            parser.feed(text)
            parser.close()
            return parser.text()
        except Exception:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt and SystemExit still propagate to the caller.
            print_exc(file=stderr)
            return text
    
    
    def main():
        """Run ``dehtml`` on a small built-in sample and print the result."""
        # NOTE(review): the sample below contains no tags; its HTML markup
        # appears to have been stripped when this snippet was copied - confirm
        # against the original answer before relying on it as a demo.
        text = r'''
            
                
                    Project: DeHTML
    Description:
    This small script is intended to allow conversion from HTML markup to plain text. '''
        print(dehtml(text))


    if __name__ == '__main__':
        main()

提交回复
热议问题