strip tags python

后端 未结 9 1615
深忆病人
深忆病人 2020-12-17 23:07

I want the following functionality:

input: this is test <b> bold text </b> normal text
expected output: this is test normal text
相关标签:
9条回答
  • 2020-12-17 23:17

    I would use http://code.google.com/p/html5lib/ if you want to include some safe tags.

    See the "Sanitizing Tokenizer" section at http://code.google.com/p/html5lib/wiki/UserDocumentation.

    Remember to test for vulnerabilities if it's an important service: http://ha.ckers.org/xss.html.

    0 讨论(0)
  • 2020-12-17 23:20

    Looks like you want HTMLParser. (html.parser in Python 3.)

    from HTMLParser import HTMLParser
    from sys import stdout
    class Filter(HTMLParser):
        """Echo HTML to stdout, dropping selected elements and all of their content.

        Markup outside the ignored elements is written back out as it is parsed.
        An ignored element is removed completely: its start tag, its end tag and
        everything nested between them.

        ignored_tags -- container of tag names to drop (HTMLParser reports tag
                        names in lower case, so list them that way).

        NOTE(review): nesting is tracked by counting start and end tags, so an
        unclosed start tag (e.g. a bare <br>) inside an ignored element leaves
        the counter one too high -- confirm this is acceptable for your input.
        """

        def __init__(self, ignored_tags):
            # HTMLParser is an old-style class on Python 2, where super() raises
            # TypeError; calling the base initialiser directly works everywhere.
            HTMLParser.__init__(self)
            # Nesting depth inside an ignored element; 0 means "currently echoing".
            self.ignorelevel = 0
            self.ignored_tags = ignored_tags

        def _emit(self, text):
            # Write text only while we are outside every ignored element.
            if self.ignorelevel == 0:
                stdout.write(text)

        def handle_starttag(self, tag, attrs):
            if self.ignorelevel > 0:
                # Already inside an ignored element: just track the nesting.
                self.ignorelevel += 1
            elif tag in self.ignored_tags:
                self.ignorelevel = 1
            else:
                # get_starttag_text() already includes the angle brackets.
                stdout.write(self.get_starttag_text())

        def handle_startendtag(self, tag, attrs):
            # Self-closing tags never change the nesting depth.
            if tag not in self.ignored_tags:
                self._emit(self.get_starttag_text())

        def handle_endtag(self, tag):
            if self.ignorelevel > 0:
                # Closes something inside (or the end of) an ignored element,
                # so the end tag itself must not be written either.
                self.ignorelevel -= 1
                return
            if tag not in self.ignored_tags:
                stdout.write('</' + tag + '>')

        def handle_data(self, data):
            self._emit(data)

        def handle_charref(self, name):
            self._emit('&#' + name + ';')

        def handle_entityref(self, name):
            self._emit('&' + name + ';')

        def handle_comment(self, data):
            # data is the exact text between '<!--' and '-->'; add no padding.
            self._emit('<!--' + data + '-->')

        def handle_decl(self, data):
            self._emit('<!' + data + '>')

        def handle_pi(self, data):
            self._emit('<?' + data + '>')
    
    0 讨论(0)
  • 2020-12-17 23:21

    With BeautifulSoup:

    from BeautifulSoup import BeautifulSoup    
    # Join the pieces returned by findAll(text=True) into one string; per this
    # answer that is the document with its tags stripped.
    # `page` is the HTML to strip -- it is not defined in this snippet.
    ''.join(BeautifulSoup(page).findAll(text=True))
    

    Found at http://www.ghastlyfop.com/blog/2008/12/strip-html-tags-from-string-python.html

    0 讨论(0)
  • 2020-12-17 23:23

    Solution using BeautifulSoup:

    from BeautifulSoup import BeautifulSoup
    def removeTag(soup, tagname):
        """Remove every ``tagname`` element from ``soup``, contents included.

        soup    -- parsed document to modify in place; must provide ``findAll``.
        tagname -- name of the elements to remove.

        Returns None; the change is made to ``soup`` itself.
        """
        for tag in soup.findAll(tagname):
            # extract() takes the element, and whatever it contains, out of the tree.
            tag.extract()
    
    # Sample document with a <b> element and a made-up <d> element.
    s = BeautifulSoup("abcd <b> btag </b> hello <d>dtag</d>")
    
    # Remove one kind of element at a time, printing the document after each step.
    removeTag(s,"b")
    print s
    removeTag(s, "d")
    print s
    

    returns:

    >>>
    abcd  hello <d>dtag</d>
    abcd  hello
    
    0 讨论(0)
  • 2020-12-17 23:34

    Use the webob.exc module:

    from webob.exc import strip_tags
    

    And then use it:

    # The <br/> tag is dropped and the text on either side of it is kept.
    print strip_tags('a<br/>b')
    >> ab
    
    0 讨论(0)
  • 2020-12-17 23:36

    Sam's answer should do what's wanted fairly well as far as I can tell, but it may pay to make sure that any left over <> characters are replaced with &lt; and &gt; respectively to prevent misuse/invalid HTML.

    This approach has the advantage that it can accept incredibly malformed HTML references/tags. BeautifulSoup also handles malformed tags fairly well but html5lib, sgmllib and htmllib can choke on invalid code, some more than others if I remember correctly.

    The following code also validates HTML character references (`&...;`):

    import re
    from htmlentitydefs import name2codepoint, codepoint2name
    
    # Lookup table holding every hexadecimal digit in both letter cases.
    # Only the keys are used (for membership tests); every value is None.
    S = '1234567890ABCDEF'
    DHex = dict.fromkeys(S.lower() + S.upper())
    
    def IsHex(S):
        """Return True when S is non-empty and every character is a key of DHex.

        An empty (or otherwise falsy) S yields False.
        """
        return bool(S) and all(ch in DHex for ch in S)
    
    def _DecodeReference(ref):
        """Return the character named by ``ref``, or None if it is not valid.

        ``ref`` is the text between '&' and ';', for example 'amp', '#52' or
        '#x34'.
        """
        try:
            if ref in name2codepoint:
                # Exact match first: entity names are case-sensitive
                # ('Aacute' and 'aacute' are different characters).
                code = name2codepoint[ref]
            elif ref.lower() in name2codepoint:
                # Lenient fallback so that e.g. '&AMP;' is still accepted.
                code = name2codepoint[ref.lower()]
            elif ref[:1] == '#' and ref[1:].isdigit():
                # Decimal character number, e.g. '&#52;'
                code = int(ref[1:])
            elif ref[:1] == '#' and ref[1:2].lower() == 'x' and IsHex(ref[2:]):
                # Hexadecimal character number, e.g. '&#x34;'
                code = int(ref[2:], 16)
            else:
                return None
            return unichr(code)
        except (ValueError, OverflowError):
            # A number that is not a usable code point is not a valid
            # reference; the caller keeps the original text.
            return None

    def UnEscape(S, LReEscape=None):
        """Convert the HTML character references in S into characters.

        Named ('&amp;'), decimal ('&#52;') and hexadecimal ('&#x34;') references
        are decoded.  The closing ';' may be missing when the reference runs up
        to the next '&' or to the end of the string.  Any other text that
        follows an '&' is copied to the result unchanged, '&' included.

        S         -- the text to decode.
        LReEscape -- optional list.  When given, an (offset, char) tuple is
                     appended for every decoded reference, where offset is the
                     position of char in the returned string.  ReEscape uses
                     these to turn the result back into HTML, validating the
                     references and escaping all the rest.  This is needed to
                     prevent browsers from stripping out e.g. &#32; (spaces).

        Returns the decoded string.
        """
        track = LReEscape is not None

        LRtn = []
        yy = 0  # Length of the output so far, i.e. the offset of the next piece
        for xx, iS in enumerate(S.split('&')):
            piece = iS
            if xx:
                # Every piece after the first one was preceded by an '&'.
                head, _, tail = iS.partition(';')
                a = _DecodeReference(head)
                if a is None:
                    # Not a reference: put the '&' back and keep the text as is.
                    piece = '&' + iS
                else:
                    piece = a + tail
                    if track:
                        LReEscape.append((yy, a))
            LRtn.append(piece)
            yy += len(piece)
        return ''.join(LRtn)
    
    def ReEscape(LReEscape, S, EscFn):
        """Rebuild HTML from the output of UnEscape.

        LReEscape -- (offset, char) tuples as collected by UnEscape; offset is
                     the position of char in S.
        S         -- the decoded string returned by UnEscape.
        EscFn     -- function applied to each stretch of text between the
                     recorded characters (and to the text after the last one).

        Each recorded character is written back as a reference, so that e.g.
        &#32; is restored and isn't stripped at a browser level.  A named
        reference is used when codepoint2name has one, otherwise a decimal one.
        """
        pieces = []
        cursor = 0
        for offset, char in LReEscape:
            # Text between the previous recorded character and this one.
            if offset != cursor:
                pieces.append(EscFn(S[cursor:offset]))

            code = ord(char)
            name = codepoint2name.get(code)
            if name is None:
                pieces.append('&#%s;' % code)
            else:
                pieces.append('&%s;' % name)
            cursor = offset + len(char)

        # Whatever follows the last recorded character.
        pieces.append(EscFn(S[cursor:]))
        return ''.join(pieces)
    
    def escape(value):
        """Return value with the characters &, > and < replaced by HTML entities."""
        # '&' is handled first so that the ampersands introduced by the other
        # two replacements are not escaped a second time.
        for raw, entity in (('&', '&amp;'), ('>', '&gt;'), ('<', '&lt;')):
            value = value.replace(raw, entity)
        return value
    
    def strip_tags(value):
        # Strip HTML tags
        value = re.sub(r'<[^>]*?>', '', value)
        print 'No Tags:', value
    
        # Validate & references
        LReEscape = []
        value = UnEscape(value, LReEscape)
        value = ReEscape(LReEscape, value, EscFn=escape)
        print 'References Validated:', value
        return value
    
    if __name__ == '__main__':
        # Demo run; strip_tags prints both intermediate results itself.
        # Outputs:
        #  No Tags: this is test  bold text  normal text >< &blah &amp; &amp
        #  References Validated: this is test  bold text  normal text &gt;&lt; &amp;blah &amp; &amp;
        strip_tags('this is test <b> bold text </b> normal text >< &blah &amp; &amp')
    
    0 讨论(0)
提交回复
热议问题