Strip HTML from strings in Python

前端 未结 26 2169
难免孤独
难免孤独 2020-11-22 02:50
from mechanize import Browser

# Open the page with a mechanize Browser and print the response line by line.
br = Browser()
# The quotes here were backslash-escaped in the scraped text, which is not
# valid Python outside a string literal; plain quotes restore the call.
br.open('http://somewebpage')
html = br.response().readlines()
for line in html:
    # Parenthesised form prints the same single value on Python 2 and 3.
    print(line)

When p

相关标签:
26条回答
  • 2020-11-22 02:55

    I needed a way to strip tags and decode HTML entities to plain text. The following solution is based on Eloff's answer (which I couldn't use because it strips entities).

    from HTMLParser import HTMLParser
    import htmlentitydefs
    
    class HTMLTextExtractor(HTMLParser):
        """Parser that collects the text of a document and decodes references.

        Text runs, numeric character references and named entity references
        are appended to ``self.result`` in the order the parser reports them.
        """

        def __init__(self):
            HTMLParser.__init__(self)
            # Decoded fragments, joined on demand by get_text().
            self.result = []

        def handle_data(self, d):
            """Keep a run of plain text unchanged."""
            self.result.append(d)

        def handle_charref(self, number):
            """Decode a numeric character reference.

            A leading ``x`` or ``X`` marks the rest of *number* as
            hexadecimal; otherwise the whole value is read as decimal.
            """
            if number[0] in (u'x', u'X'):
                codepoint = int(number[1:], 16)
            else:
                codepoint = int(number)
            self.result.append(unichr(codepoint))

        def handle_entityref(self, name):
            """Decode a named entity via ``htmlentitydefs.name2codepoint``.

            A name missing from that table makes the lookup raise KeyError.
            """
            self.result.append(unichr(htmlentitydefs.name2codepoint[name]))

        def get_text(self):
            """Return everything collected so far as one unicode string."""
            return u''.join(self.result)
    
    def html_to_text(html):
        """Return the plain text of *html*: tags removed, references decoded.

        The markup is fed to a fresh HTMLTextExtractor and the text it
        collected is returned.
        """
        extractor = HTMLTextExtractor()
        extractor.feed(html)
        return extractor.get_text()
    

    A quick test:

    # Sample input: markup plus a named entity, a literal non-ASCII character
    # and hexadecimal/decimal character references.
    html = u'<a href="#">Demo <em>(&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>'
    # repr() shows the non-ASCII characters of the result as escapes.
    print repr(html_to_text(html))
    

    Result:

    u'Demo (\xac \u0394\u03b7\u03bc\u03ce)'
    

    Error handling:

    • Invalid HTML structure may cause an HTMLParseError.
    • Invalid named HTML entities (such as &apos;, which is valid in XML and XHTML, but not plain HTML) will cause a KeyError exception.
    • Numeric HTML entities specifying code points outside the Unicode range acceptable by Python (such as, on some systems, characters outside the Basic Multilingual Plane) will cause a ValueError exception.

    Security note: Do not confuse HTML stripping (converting HTML into plain text) with HTML sanitizing (converting plain text into HTML). This answer will remove HTML and decode entities into plain text – that does not make the result safe to use in a HTML context.

    Example: &lt;script&gt;alert("Hello");&lt;/script&gt; will be converted to <script>alert("Hello");</script>, which is 100% correct behavior, but obviously not sufficient if the resulting plain text is inserted as-is into a HTML page.

    The rule is not hard: Any time you insert a plain-text string into HTML output, you should always HTML escape it (using cgi.escape(s, True)), even if you "know" that it doesn't contain HTML (e.g. because you stripped HTML content).

    (However, the OP asked about printing the result to the console, in which case no HTML escaping is needed.)

    Python 3.4+ version: (with doctest!)

    import html.parser
    
    class HTMLTextExtractor(html.parser.HTMLParser):
        """HTML parser that keeps only the text it is handed by the parser."""

        def __init__(self):
            super().__init__()
            # Text fragments in document order; joined on demand by get_text().
            self.result = []

        def handle_data(self, d):
            """Record one run of text reported by the parser."""
            self.result.append(d)

        def get_text(self):
            """Return all text collected so far as a single string."""
            return ''.join(self.result)
    
    def html_to_text(html):
        """Converts HTML to plain text (stripping tags and converting entities).
        >>> html_to_text('<a href="#">Demo<!--...--> <em>(&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>')
        'Demo (\xac \u0394\u03b7\u03bc\u03ce)'

        "Plain text" doesn't mean result can safely be used as-is in HTML.
        >>> html_to_text('&lt;script&gt;alert("Hello");&lt;/script&gt;')
        '<script>alert("Hello");</script>'

        Always use html.escape to sanitize text before using in an HTML context!

        HTMLParser will do its best to make sense of invalid HTML.
        >>> html_to_text('x < y &lt z')
        'x < y < z'

        Unrecognized named entities are included as-is. '&apos;' is recognized,
        despite being XML only.
        >>> html_to_text('&nosuchentity; &apos; ')
        "&nosuchentity; ' "

        Text that ends in an unterminated '&' is kept rather than dropped.
        >>> html_to_text('AT&T')
        'AT&T'

        How markup that is still unterminated at the very end of the input
        (for example a dangling '<!--') is rendered may differ between Python
        versions.

        :param html: the HTML source as a string.
        :return: the text content with tags removed and references decoded.
        """
        s = HTMLTextExtractor()
        s.feed(html)
        # feed() may hold back trailing text (e.g. after a bare '&') because
        # more input could still follow; close() tells the parser the input is
        # complete so that the buffered remainder is processed as well.
        s.close()
        return s.get_text()
    

    Note that HTMLParser has improved in Python 3 (meaning less code and better error handling).

    0 讨论(0)
  • 2020-11-22 02:55

    I'm parsing Github readmes and I find that the following really works well:

    import re
    import lxml.html
    
    def strip_markdown(x):
        """Remove basic Markdown link, bold and emphasis markup from *x*.

        ``[label](target)`` becomes ``label``, ``**text**`` and ``*text*``
        become ``text``. Other Markdown constructs are left untouched.

        :param x: text that may contain Markdown markup.
        :return: the text with the markup above replaced by its inner text.
        """
        # Non-greedy label: a greedy ``.+`` ran from the first "[" to the last
        # "](...)" on the line and swallowed everything between two links.
        links_sub = re.sub(r'\[(.+?)\]\([^\)]+\)', r'\1', x)
        # Bold must be handled before emphasis, otherwise the single-asterisk
        # pattern would consume part of the double-asterisk markers.
        bold_sub = re.sub(r'\*\*([^*]+)\*\*', r'\1', links_sub)
        emph_sub = re.sub(r'\*([^*]+)\*', r'\1', bold_sub)
        return emph_sub
    
    def strip_html(x):
        """Return the text content of the HTML fragment *x*.

        Falsy input (empty string, None) yields an empty string without
        invoking the parser.
        """
        if not x:
            return ''
        return lxml.html.fromstring(x).text_content()
    

    And then

    readme = """<img src="https://raw.githubusercontent.com/kootenpv/sky/master/resources/skylogo.png" />
    
                sky is a web scraping framework, implemented with the latest python versions in mind (3.4+). 
                It uses the asynchronous `asyncio` framework, as well as many popular modules 
                and extensions.
    
                Most importantly, it aims for **next generation** web crawling where machine intelligence 
                is used to speed up the development/maintainance/reliability of crawling.
    
                It mainly does this by considering the user to be interested in content 
                from *domains*, not just a collection of *single pages*
                ([templating approach](#templating-approach))."""
    
    # Strip the HTML from the sample README first, then the Markdown markup.
    strip_markdown(strip_html(readme))
    

    Removes all markdown and html correctly.

    0 讨论(0)
  • 2020-11-22 02:57

    For one project, I needed to strip HTML, but also CSS and JS. Thus, I made a variation of Eloff's answer:

    class MLStripper(HTMLParser):
        """Collect document text while skipping <style> and <script> content."""

        def __init__(self):
            # The parser state is set up by hand (reset + attributes) instead
            # of going through the base-class constructor.
            self.reset()
            self.strict = False
            self.convert_charrefs = True
            self.fed = []
            # True while inside a <style> or <script> element.
            self.css = False

        def handle_starttag(self, tag, attrs):
            """Start suppressing text when a style/script element opens."""
            if tag in ("style", "script"):
                self.css = True

        def handle_endtag(self, tag):
            """Resume collecting text when a style/script element closes."""
            if tag in ("style", "script"):
                self.css = False

        def handle_data(self, d):
            """Keep text unless it belongs to a style/script element."""
            if self.css:
                return
            self.fed.append(d)

        def get_data(self):
            """Return the collected text as one string."""
            return ''.join(self.fed)
    
    def strip_tags(html):
        """Return the text of *html* without tags, CSS or JavaScript.

        :param html: the HTML source as a string.
        :return: the text collected by an MLStripper fed with *html*.
        """
        s = MLStripper()
        s.feed(html)
        # feed() may hold back trailing text (e.g. after a bare '&') because
        # more input could still follow; close() tells the parser the input is
        # complete so that the buffered remainder is processed as well.
        s.close()
        return s.get_data()
    
    0 讨论(0)
  • 2020-11-22 02:57

    2020 Update

    Use the Mozilla Bleach library. It lets you customize which tags and which attributes to keep, and it can also filter out attributes based on their values.

    Here are 2 cases to illustrate

    1) Do not allow any HTML tags or attributes

    Take sample raw text

    raw_text = """
    <p><img width="696" height="392" src="https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-768x432.jpg" class="attachment-medium_large size-medium_large wp-post-image" alt="Ethereum Classic 51% Attack: Okex Crypto Exchange Suffers $5.6 Million Loss, Contemplates Delisting ETC" style="float:left; margin:0 15px 15px 0;" srcset="https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-768x432.jpg 768w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-300x169.jpg 300w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-1024x576.jpg 1024w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-696x392.jpg 696w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-1068x601.jpg 1068w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-747x420.jpg 747w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-190x107.jpg 190w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-380x214.jpg 380w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-760x428.jpg 760w, 
https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc.jpg 1280w" sizes="(max-width: 696px) 100vw, 696px" />Cryptocurrency exchange Okex reveals it suffered the $5.6 million loss as a result of the double-spend carried out by the attacker(s) in Ethereum Classic 51% attack. Okex says it fully absorbed the loss as per its user-protection policy while insisting that the attack did not cause any loss to the platform&#8217;s users. Also as part [&#8230;]</p>
    <p>The post <a rel="nofollow" href="https://news.bitcoin.com/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc/">Ethereum Classic 51% Attack: Okex Crypto Exchange Suffers $5.6 Million Loss, Contemplates Delisting ETC</a> appeared first on <a rel="nofollow" href="https://news.bitcoin.com">Bitcoin News</a>.</p> 
    """
    

    2) Remove all HTML tags and attributes from raw text

    # Whitelist nothing: no tags and no attributes are allowed.
    from bleach.sanitizer import Cleaner
    # strip=True / strip_comments=True are passed so that disallowed markup and
    # comments are removed from the output (presumably instead of being kept
    # in escaped form -- confirm against the bleach documentation).
    cleaner = Cleaner(tags=[], attributes={}, styles=[], protocols=[], strip=True, strip_comments=True, filters=None)
    print(cleaner.clean(raw_text))
    

    Output

    Cryptocurrency exchange Okex reveals it suffered the $5.6 million loss as a result of the double-spend carried out by the attacker(s) in Ethereum Classic 51% attack. Okex says it fully absorbed the loss as per its user-protection policy while insisting that the attack did not cause any loss to the platform&#8217;s users. Also as part [&#8230;]
    The post Ethereum Classic 51% Attack: Okex Crypto Exchange Suffers $5.6 Million Loss, Contemplates Delisting ETC appeared first on Bitcoin News. 
    

    3) Allow only the img tag with the srcset attribute

    from bleach.sanitizer import Cleaner
    # ALLOW ONLY img tags, and on them only the srcset attribute
    cleaner = Cleaner(tags=['img'], attributes={'img': ['srcset']}, styles=[], protocols=[], strip=True, strip_comments=True, filters=None)
    print(cleaner.clean(raw_text))
    

    Output

    <img srcset="https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-768x432.jpg 768w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-300x169.jpg 300w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-1024x576.jpg 1024w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-696x392.jpg 696w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-1068x601.jpg 1068w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-747x420.jpg 747w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-190x107.jpg 190w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-380x214.jpg 380w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-760x428.jpg 760w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc.jpg 1280w">Cryptocurrency exchange Okex reveals it suffered the $5.6 million loss as a result of the double-spend carried out by the attacker(s) in Ethereum Classic 51% attack. 
Okex says it fully absorbed the loss as per its user-protection policy while insisting that the attack did not cause any loss to the platform&#8217;s users. Also as part [&#8230;]
    The post Ethereum Classic 51% Attack: Okex Crypto Exchange Suffers $5.6 Million Loss, Contemplates Delisting ETC appeared first on Bitcoin News. 
    
    0 讨论(0)
  • 2020-11-22 03:02

    The solutions based on HTMLParser can all be defeated if they are run only once:

    # Nested markup: removing the <b>...</b> tags leaves a <script> tag behind.
    html_to_text('<<b>script>alert("hacked")<</b>/script>')
    

    results in:

    <script>alert("hacked")</script>
    

    which is exactly what you intended to prevent. If you use an HTML parser, repeat the stripping until no tags are left:

    from HTMLParser import HTMLParser
    
    class MLStripper(HTMLParser):
        """Collect document text and remember whether a start tag was seen."""

        def __init__(self):
            self.reset()
            # Text fragments in the order the parser reported them.
            self.fed = []
            # Set to True as soon as any start tag is encountered.
            self.containstags = False

        def handle_starttag(self, tag, attrs):
            """Note that the input contained at least one start tag."""
            self.containstags = True

        def handle_data(self, d):
            """Keep a run of text."""
            self.fed.append(d)

        def has_tags(self):
            """Return True if a start tag was seen since construction."""
            return self.containstags

        def get_data(self):
            """Return the collected text as one string."""
            return ''.join(self.fed)
    
    def strip_tags(html):
        """Strip tags from *html* repeatedly until a pass finds no start tag.

        Each pass feeds the current text to a fresh MLStripper and continues
        with the text it collected; the loop ends after the first pass in
        which the stripper reports no tags.
        """
        while True:
            stripper = MLStripper()
            stripper.feed(html)
            html = stripper.get_data()
            if not stripper.has_tags():
                return html
    
    0 讨论(0)
  • 2020-11-22 03:02

    With BeautifulSoup, html2text or the code from @Eloff, some HTML elements, JavaScript code, etc. remain in the output most of the time.

    So you can use a combination of these libraries and delete markdown formatting (Python 3):

    import re
    import html2text
    from bs4 import BeautifulSoup
    def html2Text(html):
        """Convert *html* to plain text using html2text, BeautifulSoup and regexes.

        The input is first rendered by html2text, the result is passed through
        BeautifulSoup's ``html.parser`` to take its text, and finally
        template-style delimiters and leading Markdown markers are replaced by
        a single space.

        :param html: the HTML source as a string.
        :return: the cleaned-up text.
        """
        def removeMarkdown(text):
            """Replace Markdown markers at the start of lines with a space."""
            # Raw strings hold exactly the pattern text the former plain
            # literals evaluated to, without relying on invalid escape
            # sequences such as "\d" or "\." in a normal string.
            patterns = (
                r"^[ #*]{2,30}",      # run of spaces, '#' and '*' (2-30 chars)
                r"^[ ]{0,30}\d\\.",   # digit, a backslash, then any character
                r"^[ ]{0,30}\d\.",    # digit followed by a literal dot
            )
            for current in patterns:
                markdown = re.compile(current, flags=re.MULTILINE)
                text = markdown.sub(" ", text)
            return text

        def removeAngular(text):
            """Replace {|..|}, {*..*}, {{..}} and [[..]] spans with a space."""
            angular = re.compile(r"[{][|].{2,40}[|][}]|[{][*].{2,40}[*][}]|[{][{].{2,40}[}][}]|\[\[.{2,40}\]\]")
            return angular.sub(" ", text)

        # html2text rendering options, set as in the original snippet.
        h = html2text.HTML2Text()
        h.images_to_alt = True
        h.ignore_links = True
        h.ignore_emphasis = False
        h.skip_internal_links = True
        text = h.handle(html)
        # Second pass: take only the text BeautifulSoup finds in the rendering.
        soup = BeautifulSoup(text, "html.parser")
        text = soup.text
        text = removeAngular(text)
        text = removeMarkdown(text)
        return text
    

    It works well for me but it can be enhanced, of course...

    0 讨论(0)
提交回复
热议问题