strip tags python

后端 未结 9 1622
深忆病人
深忆病人 2020-12-17 23:07

i want the following functionality.

input : this is test <b> bold text </b> normal text
expected output: this is test normal text
9条回答
  •  Happy的楠姐
    2020-12-17 23:36

    Sam's answer should do what's wanted fairly well as far as I can tell, but it may pay to make sure that any leftover < and > characters are replaced with &lt; and &gt; respectively to prevent misuse/invalid HTML.

    This approach has the advantage that it can accept incredibly malformed HTML references/tags. BeautifulSoup also handles malformed tags fairly well but html5lib, sgmllib and htmllib can choke on invalid code, some more than others if I remember correctly.

    The following code also validates HTML character references (the `&...;` sequences):

    import re
    from htmlentitydefs import name2codepoint, codepoint2name
    
    # Lookup table holding every hexadecimal digit in both lower and upper case.
    # Only the keys matter (IsHex does membership tests); the values are all None.
    S = '1234567890ABCDEF'
    DHex = dict.fromkeys(S.lower() + S.upper())
    
    def IsHex(S):
        # True only when S is non-empty and every one of its characters is a
        # hexadecimal digit, i.e. a key of the DHex table.
        if not S:
            return False
        return all(Char in DHex for Char in S)
    
    def _DecodeReference(Ref):
        # Returns the single character denoted by Ref -- the text found between
        # the '&' and the ';' of a character reference -- or None when Ref is
        # not a valid reference.
        try:
            if Ref in name2codepoint:
                # Exact match first: entity names are case sensitive, e.g.
                # 'Eacute' and 'eacute' name two different characters
                return unichr(name2codepoint[Ref])
            if Ref.lower() in name2codepoint:
                # Lenient fallback for wrongly-cased names such as 'AMP'
                return unichr(name2codepoint[Ref.lower()])
            if Ref[:1] == '#':
                if Ref[1:].isdigit():
                    # A decimal character number, e.g. '#52'
                    return unichr(int(Ref[1:]))
                if Ref[1:2].lower() == 'x' and IsHex(Ref[2:]):
                    # A hexadecimal character number, e.g. '#x34'
                    return unichr(int(Ref[2:], 16)) # Hex -> base 16
        except (ValueError, OverflowError):
            # The number cannot be converted to a character (e.g. it is out
            # of range), so treat the text as an invalid reference instead of
            # crashing on hostile input
            return None
        return None

    def UnEscape(S, LReEscape=None):
        # Converts HTML character references into a unicode string to allow manipulation
        #
        # If LReEscape (a list) is provided, a (position, character) tuple is
        # appended to it for every reference that was decoded, where position is
        # the index of the character in the returned string. This allows turning
        # the result back into HTML with ReEscape below, validating the
        # references and escaping all the rest
        #
        # This is needed to prevent browsers from stripping out e.g. '&nbsp;'
        # (spaces) etc
        #
        # Text that looks like a reference but is not valid is left untouched,
        # including its leading '&'.
        Record = LReEscape is not None

        LRtn = []
        yy = 0 # Length of the output so far == position of the next character
        for xx, iS in enumerate(S.split('&')):
            if xx:
                # Every piece after the first one was preceded by an '&'
                LSplit = iS.split(';')
                a = _DecodeReference(LSplit[0])
                if a is not None:
                    LRtn.append(a+';'.join(LSplit[1:]))
                    if Record: LReEscape.append((yy, a))
                else:
                    # Not a reference: put the '&' back unchanged
                    LRtn.append('&'+iS)
            else:
                LRtn.append(iS)
            yy += len(LRtn[-1])
        return ''.join(LRtn)
    
    def ReEscape(LReEscape, S, EscFn):
        # Turns S (the text returned by UnEscape) back into HTML.
        #
        # LReEscape holds the (position, character) tuples collected by
        # UnEscape. Each listed character is written out as a character
        # reference again -- by name when codepoint2name knows it, by number
        # otherwise -- so that it isn't stripped at a browser level. All the
        # text in between is passed through EscFn.
        def Pieces():
            Cursor = 0
            for Pos, Char in LReEscape:
                # Plain text lying between the previous reference and this one
                if Pos != Cursor:
                    yield EscFn(S[Cursor:Pos])

                Code = ord(Char)
                if Code in codepoint2name:
                    yield '&%s;' % codepoint2name[Code]
                else:
                    yield '&#%s;' % Code
                Cursor = Pos+len(Char)
            # Whatever follows the last reference (always emitted, even if empty)
            yield EscFn(S[Cursor:])

        return ''.join(Pieces())
    
    def escape(value):
        # Escape left over <>& characters so that they are displayed literally
        # instead of being interpreted as HTML markup. Returns the escaped string.
        #
        # '&' has to be replaced first: otherwise the ampersands introduced by
        # the '&gt;' and '&lt;' replacements would be escaped a second time.
        value = value.replace('&', '&amp;')
        value = value.replace('>', '&gt;')
        value = value.replace('<', '&lt;')
        return value
    
    def strip_tags(value):
        # Strip HTML tags
        value = re.sub(r'<[^>]*?>', '', value)
        print 'No Tags:', value
    
        # Validate & references
        LReEscape = []
        value = UnEscape(value, LReEscape)
        value = ReEscape(LReEscape, value, EscFn=escape)
        print 'References Validated:', value
        return value
    
    if __name__ == '__main__':
        # Demo: run strip_tags on a sample string. strip_tags itself prints a
        # 'No Tags:' line followed by a 'References Validated:' line.
        Sample = 'this is test  bold text  normal text >< &blah & &'
        strip_tags(Sample)
    

提交回复
热议问题