i want the following functionality.
input : this is test bold text normal text
expected output: this is test normal text
This is working code taken from my project Supybot, so it's fairly well tested:
class HtmlToText(sgmllib.SGMLParser):
"""Taken from some eff-bot code on c.l.p."""
entitydefs = htmlentitydefs.entitydefs.copy()
entitydefs['nbsp'] = ' '
def __init__(self, tagReplace=' '):
self.data = []
self.tagReplace = tagReplace
sgmllib.SGMLParser.__init__(self)
def unknown_starttag(self, tag, attr):
self.data.append(self.tagReplace)
def unknown_endtag(self, tag):
self.data.append(self.tagReplace)
def handle_data(self, data):
self.data.append(data)
def getText(self):
text = ''.join(self.data).strip()
return normalizeWhitespace(text)
def htmlToText(s, tagReplace=' '):
"""Turns HTML into text. tagReplace is a string to replace HTML tags with.
"""
x = HtmlToText(tagReplace)
x.feed(s)
return x.getText()
As the docstring notes, it originated with Fredrik Lundh, not me. As they say, great authors steal :)