from mechanize import Browser
br = Browser()
br.open(\'http://somewebpage\')
html = br.response().readlines()
for line in html:
print line
When p
This method works flawlessly for me and requires no additional installations:
import re
import htmlentitydefs
def convertentity(m):
if m.group(1)=='#':
try:
return unichr(int(m.group(2)))
except ValueError:
return '%s;' % m.group(2)
try:
return htmlentitydefs.entitydefs[m.group(2)]
except KeyError:
return '&%s;' % m.group(2)
def converthtml(s):
return re.sub(r'&(#?)(.+?);',convertentity,s)
html = converthtml(html)
html.replace(" ", " ") ## Get rid of the remnants of certain formatting(subscript,superscript,etc).