I want to get the whole HTML under a tag using HTMLParser. I am currently able to get the data between the tags, and the following is my code:
class LinksParser(HT
One could use xml.etree.ElementTree.TreeBuilder to exploit the etree API for finding/manipulating the element:
import sys
from HTMLParser import HTMLParser
from xml.etree import cElementTree as etree
class LinksParser(HTMLParser):
    """Build an ElementTree from HTML by forwarding parser events to a TreeBuilder.

    Feed markup with ``feed()`` and then call ``close()``, which returns the
    root element of the tree that was built.
    """

    # HTML void elements never have content or a closing tag, so the parser
    # reports only a start tag for markup such as ``<br>`` or ``<img ...>``.
    _VOID_ELEMENTS = frozenset([
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ])

    def __init__(self):
        HTMLParser.__init__(self)
        self.tb = etree.TreeBuilder()

    def handle_starttag(self, tag, attributes):
        """Open an element for *tag*; void elements are closed immediately."""
        # A valueless attribute (``<input disabled>``) is reported with a
        # value of None, which ElementTree cannot serialize; store it as "".
        attrib = {
            name: "" if value is None else value
            for name, value in attributes
        }
        self.tb.start(tag, attrib)
        if tag in self._VOID_ELEMENTS:
            # No end-tag event will follow for ``<br>``; without closing it
            # here, all later content would be nested inside the void element.
            self.tb.end(tag)

    def handle_endtag(self, tag):
        """Close the element for *tag*, unless it is a void element."""
        if tag in self._VOID_ELEMENTS:
            # Already closed in handle_starttag (covers ``<br/>`` and a stray
            # ``</br>``); closing again would pop the wrong element.
            return
        self.tb.end(tag)

    def handle_data(self, data):
        """Append character data to the element currently being built."""
        self.tb.data(data)

    def close(self):
        """Finish parsing and return the root element of the built tree."""
        HTMLParser.close(self)
        return self.tb.close()
# Parse the HTML document supplied on standard input into an element tree.
parser = LinksParser()
html_text = sys.stdin.read()
parser.feed(html_text)
root = parser.close()

# Locate the first <span itemprop="description"> below the root and write it,
# including the <span> tag itself, to standard output.
span = root.find(".//span[@itemprop='description']")
span_tree = etree.ElementTree(span)
span_tree.write(sys.stdout)
My First Heading
My first
paragraph.
To print without the parent (root) tag:
# Write only the contents of the span, not the <span> tag itself.
# span.text is None when the span starts directly with a child element, and
# writing None raises TypeError, so fall back to an empty string.
sys.stdout.write(span.text or "")
for child in span:
    sys.stdout.write(etree.tostring(child))  # add encoding="unicode" on Python 3