Extracting text from HTML file using Python

后端 未结 30 2815
一生所求
一生所求 2020-11-22 04:05

I'd like to extract the text from an HTML file using Python. I want essentially the same output I would get if I copied the text from a browser and pasted it into notepad.

30条回答
  •  生来不讨喜
    2020-11-22 04:14

    Here is a version of xperroni's answer which is a bit more complete. It skips script and style sections and translates character references (e.g., `&#39;`) and HTML entities (e.g., `&amp;`).

    It also includes a trivial plain-text-to-html inverse converter.

    """
    HTML <-> text conversions.
    """
    try:  # Python 2 module names
        from HTMLParser import HTMLParser, HTMLParseError
        from htmlentitydefs import name2codepoint
    except ImportError:  # Python 3: modules renamed, HTMLParseError removed in 3.5
        from html.parser import HTMLParser
        from html.entities import name2codepoint
    import re
    
    class _HTMLToText(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self._buf = []
            self.hide_output = False
    
        def handle_starttag(self, tag, attrs):
            if tag in ('p', 'br') and not self.hide_output:
                self._buf.append('\n')
            elif tag in ('script', 'style'):
                self.hide_output = True
    
        def handle_startendtag(self, tag, attrs):
            if tag == 'br':
                self._buf.append('\n')
    
        def handle_endtag(self, tag):
            if tag == 'p':
                self._buf.append('\n')
            elif tag in ('script', 'style'):
                self.hide_output = False
    
        def handle_data(self, text):
            if text and not self.hide_output:
                self._buf.append(re.sub(r'\s+', ' ', text))
    
        def handle_entityref(self, name):
            if name in name2codepoint and not self.hide_output:
                c = unichr(name2codepoint[name])
                self._buf.append(c)
    
        def handle_charref(self, name):
            if not self.hide_output:
                n = int(name[1:], 16) if name.startswith('x') else int(name)
                self._buf.append(unichr(n))
    
        def get_text(self):
            return re.sub(r' +', ' ', ''.join(self._buf))
    
    def html_to_text(html):
        """
        Given a piece of HTML, return the plain text it contains.

        Entities and character references are decoded; the contents of
        script and style elements are left out of the result. Parsing is
        best effort: if the parser reports a parse error, the text collected
        up to that point is returned.
        """
        parser = _HTMLToText()
        # HTMLParseError was removed from the standard library in Python 3.5.
        # When the name is unavailable there is nothing to catch, and an empty
        # tuple makes the except clause below match no exception.
        try:
            parse_error = HTMLParseError
        except NameError:
            parse_error = ()
        try:
            parser.feed(html)
            parser.close()
        except parse_error:
            pass  # deliberate: return whatever was extracted before the error
        return parser.get_text()
    
    def text_to_html(text):
        """
        Convert the given plain text to HTML.

        Anything that looks like an http(s) URL is wrapped in an <a> tag, and
        the characters & ' " < > are replaced by HTML entities. Newlines are
        left unchanged.
        """
        entities = {'&': '&amp;', "'": '&#39;', '"': '&quot;',
                    '<': '&lt;', '>': '&gt;'}

        def escape(s):
            # Entity-encode every special character in s.
            return re.sub(r'[&\'"<>]', lambda m: entities[m.group()], s)

        def f(mo):
            t = mo.group()
            if len(t) == 1:
                # A lone special character (a URL match is always longer).
                return entities[t]
            # The URL pattern admits '&', '<' and '>', so escape the URL
            # before placing it in the attribute and the link text.
            url = escape(t)
            return '<a href="%s">%s</a>' % (url, url)

        return re.sub(r'https?://[^] ()"\';]+|[&\'"<>]', f, text)

提交回复
热议问题