I want to get the whole HTML under a tag using HTMLParser. I am currently able to get only the text data between the tags; the following is my code:
class LinksParser(HT
Here's something that gets the job done based on the test data you provided with minimal changes to your existing code (assuming it's basically doing what you want already). You'd probably want to expand it to deal with self-closing tags in a more robust way:
from HTMLParser import HTMLParser
class LinksParser(HTMLParser):
    """Collect the inner HTML of ``<span itemprop="description">`` elements.

    Feed markup with ``feed()``; afterwards ``data`` holds the tags and text
    found inside the matching span(s). The matching span's own start and end
    tags are not part of ``data``; spans nested inside it are.

    Attributes:
        recording: span nesting depth inside the target span; 0 means the
            parser is currently outside of it and nothing is collected.
        data: the markup collected so far.
        self_closing_tags: tag names that are written as ``<tag/>`` and
            never get a separate end tag.

    Attribute values are re-emitted as the parser delivers them, without
    re-escaping.
    """

    def __init__(self):
        # Explicit base-class call (instead of super()) so the class also
        # works where HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        self.recording = 0
        self.data = ''
        self.self_closing_tags = ("br",)

    def _format_starttag(self, tag, attributes, closing=">"):
        """Rebuild a start tag from its name and (name, value) pairs."""
        parts = [tag]
        for name, value in attributes:
            if value is None:
                # Valueless attribute such as ``disabled``.
                parts.append(name)
            else:
                parts.append('%s="%s"' % (name, value))
        return "<" + " ".join(parts) + closing

    def handle_starttag(self, tag, attributes):
        """Start recording on the target span, or record a tag inside it."""
        if tag == 'span':
            if self.recording:
                # Nested span: track its depth so the matching end tag does
                # not stop the recording, and keep it in the output.
                self.recording += 1
                self.data += self._format_starttag(tag, attributes)
            elif ('itemprop', 'description') in attributes:
                self.recording = 1
            return
        if not self.recording:
            # Markup outside the target span is not part of the result.
            return
        if tag in self.self_closing_tags:
            # Emitted here in full so that both ``<br>`` and ``<br/>`` yield
            # exactly one tag; handle_endtag skips these names.
            self.data += self._format_starttag(tag, attributes, "/>")
        else:
            self.data += self._format_starttag(tag, attributes)

    def handle_endtag(self, tag):
        """Record an end tag inside the target span, or stop recording."""
        if not self.recording:
            return
        if tag == 'span':
            self.recording -= 1
            if self.recording:
                # Still inside the target: this closed a nested span.
                self.data += "</span>"
        elif tag not in self.self_closing_tags:
            self.data += "</%s>" % (tag,)

    def handle_data(self, data):
        """Record text found inside the target span."""
        if self.recording:
            self.data += data
Given this as input:
<span itemprop="description">
<h1>My First Heading</h1>
<p>My first <br/><br/>paragraph.</p>
</span>
the output is:
<h1>My First Heading</h1>
<p>My first <br/><br/>paragraph.</p>