from mechanize import Browser
br = Browser()
br.open(\'http://somewebpage\')
html = br.response().readlines()
for line in html:
print line
When p
An lxml.html-based solution (lxml is a native library and can be more performant than a pure python solution).
from lxml import html
## from file-like object or URL
tree = html.parse(file_like_object_or_url)
## from string
tree = html.fromstring('safe safe')
print(tree.text_content().strip())
### OUTPUT: 'safe unsafe safe'
from lxml import html
from lxml.html.clean import clean_html
tree = html.fromstring("""
Detailed answers to any questions you might have
""")
## text only
print(clean_html(tree).text_content().strip())
### OUTPUT: 'Detailed answers to any questions you might have'
Also see http://lxml.de/lxmlhtml.html#cleaning-up-html for what exactly the lxml.cleaner does.
If you need more control over what exactly is sanitized before converting to text then you might want to use the lxml Cleaner explicitly by passing the options you want in the constructor, e.g:
cleaner = Cleaner(page_structure=True,
meta=True,
embedded=True,
links=True,
style=True,
processing_instructions=True,
inline_style=True,
scripts=True,
javascript=True,
comments=True,
frames=True,
forms=True,
annoying_tags=True,
remove_unknown_tags=True,
safe_attrs_only=True,
safe_attrs=frozenset(['src','color', 'href', 'title', 'class', 'name', 'id']),
remove_tags=('span', 'font', 'div')
)
sanitized_html = cleaner.clean_html(unsafe_html)
If you need more control over how plain text is generated then instead of text_content()
you can use lxml.etree.tostring:
plain_bytes = tostring(tree, method='text', encoding='utf-8')
print(plain.decode('utf-8'))