Strip HTML from strings in Python

前端未结

关注

 26  2788

难免孤独 2020-11-22 02:50

from mechanize import Browser
br = Browser()
br.open(\'http://somewebpage\')
html = br.response().readlines()
for line in html:
  print line

When p

26条回答

余生分开走 (楼主)

2020-11-22 03:08

This method works flawlessly for me and requires no additional installations:

import re
import htmlentitydefs

def convertentity(m):
    if m.group(1)=='#':
        try:
            return unichr(int(m.group(2)))
        except ValueError:
            return '&#%s;' % m.group(2)
        try:
            return htmlentitydefs.entitydefs[m.group(2)]
        except KeyError:
            return '&%s;' % m.group(2)

def converthtml(s):
    return re.sub(r'&(#?)(.+?);',convertentity,s)

html =  converthtml(html)
html.replace(" ", " ") ## Get rid of the remnants of certain formatting(subscript,superscript,etc).

0 讨论(0)

查看其它26个回答