How can I retrieve the page title of a webpage (title html tag) using Python?
Here is a fault-tolerant `HTMLParser` implementation.
You can throw pretty much anything at `get_title()` without it breaking. If anything unexpected happens, `get_title()` will return `None`.
When `Parser()` downloads the page, it converts it to ASCII regardless of the charset used in the page, ignoring any errors.
It would be trivial to change `to_ascii()` to convert the data into UTF-8 or any other encoding. Just add an encoding argument and rename the function to something like `to_encoding()`.
By default, `HTMLParser()` will break on broken HTML; it will even break on trivial things like mismatched tags. To prevent this behavior, I replaced `HTMLParser()`'s error method with a function that ignores the errors.
#-*-coding:utf8;-*-
#qpy:3
#qpy:console
'''
Extract the title from a web page using
the standard lib.
'''
from html.parser import HTMLParser
from urllib.request import urlopen
import urllib
def error_callback(*_, **__):
    """Accept any positional or keyword arguments, do nothing, return None.

    Intended as a drop-in replacement for an error hook whose default
    behavior should be suppressed.
    """
    return None
def is_string(data):
    """Tell whether *data* is a ``str`` instance (subclasses included)."""
    text_type = str
    return isinstance(data, text_type)
def is_bytes(data):
    """Tell whether *data* is a ``bytes`` instance (subclasses included)."""
    binary_type = bytes
    return isinstance(data, binary_type)
def to_ascii(data):
    """Return *data* as a ``str`` that contains only ASCII characters.

    The result is always text, whatever the input type, so it can be passed
    straight to ``HTMLParser.feed()``:

    * ``bytes`` / ``bytearray`` are decoded as ASCII;
    * ``str`` is kept as text;
    * any other object is first converted with ``str()``.

    Bytes or characters outside the ASCII range are silently dropped.
    """
    if isinstance(data, (bytes, bytearray)):
        return data.decode('ascii', errors='ignore')
    text = data if isinstance(data, str) else str(data)
    # Round-trip through bytes: encoding with errors='ignore' discards every
    # non-ASCII character, and decoding brings the remainder back to str.
    return text.encode('ascii', errors='ignore').decode('ascii')
class Parser(HTMLParser):
    """Download *url* and record the text of its first ``<title>`` element.

    All the work happens in the constructor. Afterwards ``title`` holds the
    title text, or ``None`` when the page could not be fetched or contains
    no title text.
    """

    def __init__(self, url):
        """Fetch *url*, parse the response, and populate ``self.title``.

        ``urllib.error.URLError`` (which includes ``HTTPError``) and
        ``ValueError`` raised while fetching or parsing are swallowed; in
        that case ``title`` keeps whatever was captured so far (``None`` if
        nothing was).
        """
        HTMLParser.__init__(self)
        self.title = None
        self.rec = False          # True while inside the <title> being captured
        self._title_done = False  # True once the first <title> has been closed
        # Install the no-op error hook before any parsing happens; assigning
        # it after feed() would leave it out of effect during the parse.
        self.error = error_callback
        try:
            # The context manager closes the HTTP response even when read()
            # or feed() raises.
            with urlopen(url) as response:
                self.feed(to_ascii(response.read()))
        except (urllib.error.URLError, ValueError):
            # HTTPError is a subclass of URLError, so one clause covers both.
            return
        # An unterminated <title> would otherwise leave recording switched on.
        self.rec = False

    def handle_starttag(self, tag, attrs):
        """Start recording when the first ``<title>`` tag opens."""
        # Later <title> elements (e.g. inside inline SVG) are ignored so they
        # cannot overwrite the document title.
        if tag == 'title' and not self._title_done:
            self.rec = True

    def handle_data(self, data):
        """Append *data* to ``title`` while recording is active."""
        if self.rec:
            # Title text can arrive in several chunks (for instance when it is
            # interrupted by a comment), so accumulate instead of overwriting.
            self.title = data if self.title is None else self.title + data

    def handle_endtag(self, tag):
        """Stop recording when the ``<title>`` being captured closes."""
        if tag == 'title' and self.rec:
            self.rec = False
            self._title_done = True
def get_title(url):
    """Return the title text that ``Parser`` extracts from *url*.

    The value is whatever ``Parser(url).title`` holds, which starts out as
    ``None`` and stays that way when no title is captured.
    """
    parser = Parser(url)
    return parser.title
# Demo: fetch Google's home page and print whatever title get_title() returns.
print(
    get_title('http://www.google.com')
)