Strip HTML from strings in Python

前端 未结 26 2663
难免孤独
难免孤独 2020-11-22 02:50
# Open a page with mechanize, read the response as a list of lines,
# and print each line (the output still contains the raw HTML markup).
from mechanize import Browser

br = Browser()
br.open('http://somewebpage')
html = br.response().readlines()
for line in html:
    # Parenthesized form prints a single argument identically on Python 2 and 3.
    print(line)

When p

26条回答
  •  陌清茗
    陌清茗 (楼主)
    2020-11-22 03:15

    I always used this function to strip HTML tags, as it requires only the Python stdlib:

    For Python 3:

    from io import StringIO
    from html.parser import HTMLParser
    
    class MLStripper(HTMLParser):
        """HTML parser that discards markup and keeps only the text content.

        Text fragments reported by the parser are appended to an in-memory
        buffer; call get_data() to retrieve the accumulated text.
        """

        def __init__(self):
            # HTMLParser.__init__ already calls reset(), so no explicit reset is
            # needed. convert_charrefs=True makes the parser decode character
            # references (e.g. "&amp;" -> "&") before calling handle_data.
            super().__init__(convert_charrefs=True)
            self.text = StringIO()

        def handle_data(self, d):
            """Append the text fragment *d* to the buffer."""
            self.text.write(d)

        def get_data(self):
            """Return all text collected so far as a single string."""
            return self.text.getvalue()


    def strip_tags(html):
        """Return *html* with all tags removed and character references decoded.

        Args:
            html: The markup to strip, as a str.

        Returns:
            The concatenated text content of *html*.
        """
        s = MLStripper()
        s.feed(html)
        # feed() may hold back trailing text (for example text ending in an
        # unterminated "&..." sequence) while it waits for more input; close()
        # signals end-of-input so that remaining text is not silently dropped.
        s.close()
        return s.get_data()
    

    For Python 2:

    from HTMLParser import HTMLParser
    from StringIO import StringIO
    
    class MLStripper(HTMLParser):
        """HTML parser that collects only the text passed to handle_data."""
        def __init__(self):
            # Calls reset() directly instead of the base-class constructor.
            # NOTE(review): presumably sufficient to initialise Python 2's
            # HTMLParser -- confirm before changing.
            self.reset()
            # In-memory buffer that accumulates every text fragment.
            self.text = StringIO()
        def handle_data(self, d):
            """Append the text fragment *d* to the buffer."""
            self.text.write(d)
        def get_data(self):
            """Return everything written to the buffer so far as one string."""
            return self.text.getvalue()
    
    def strip_tags(html):
        """Feed *html* to a fresh MLStripper and return the text it collected."""
        stripper = MLStripper()
        stripper.feed(html)
        # NOTE(review): feed() is not followed by close(); confirm whether input
        # the parser is still buffering at end-of-input matters to callers.
        collected = stripper.get_data()
        return collected
    

提交回复
热议问题