Question
I ran into a UnicodeEncodeError while crawling Wikipedia and dumping pages as JSON files. My code snippet and the error message are below. It seems the character 'é' causes the problem, but I don't know how to solve it.
import urllib2
import json

# List of philosophers' names: mergel
# print mergel
i = 0
for name in mergel:
    # Use the API to get the page content in a format that we like.
    # https://en.wikipedia.org/w/api.php?action=query&titles=Spider-Man&prop=revisions&rvprop=content&format=json
    # Set the parameters (https://www.mediawiki.org/wiki/API:Tutorial)
    i = i + 1
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    titlename = name.replace(" ", "_")
    print titlename
    title = "titles=" + titlename
    content = "prop=revisions&rvprop=content"
    dataformat = "format=json"
    # Construct the query.
    query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
    print query
    wikiresponse = urllib2.urlopen(query)
    wikisource = wikiresponse.read()
    # print wikisource
    wikijson = json.loads(wikisource)
    jsonfilename = './json/' + titlename + '.json'
    with open(jsonfilename, 'w') as outfile:
        json.dump(wikijson, outfile)
Error message:
Tenzin_Gyatso
https://en.wikipedia.org/w/api.php?action=query&titles=Tenzin_Gyatso&prop=revisions&rvprop=content&format=json
Claude_Lévi-Strauss
https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-203-8430fc805550> in <module>()
21 query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
22 print query
---> 23 wikiresponse = urllib2.urlopen(query)
24 wikisource = wikiresponse.read()
25 # print wikisource
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
152 else:
153 opener = _opener
--> 154 return opener.open(url, data, timeout)
155
156 def install_opener(opener):
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
429 req = meth(req)
430
--> 431 response = self._open(req, data)
432
433 # post-process response
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _open(self, req, data)
447 protocol = req.get_type()
448 result = self._call_chain(self.handle_open, protocol, protocol +
--> 449 '_open', req)
450 if result:
451 return result
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
407 func = getattr(handler, meth_name)
408
--> 409 result = func(*args)
410 if result is not None:
411 return result
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in https_open(self, req)
1238 def https_open(self, req):
1239 return self.do_open(httplib.HTTPSConnection, req,
-> 1240 context=self._context)
1241
1242 https_request = AbstractHTTPHandler.do_request_
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in do_open(self, http_class, req, **http_conn_args)
1192
1193 try:
-> 1194 h.request(req.get_method(), req.get_selector(), req.data, headers)
1195 except socket.error, err: # XXX what error?
1196 h.close()
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in request(self, method, url, body, headers)
1051 def request(self, method, url, body=None, headers={}):
1052 """Send a complete request to the server."""
-> 1053 self._send_request(method, url, body, headers)
1054
1055 def _set_content_length(self, body, method):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_request(self, method, url, body, headers)
1091 for hdr, value in headers.iteritems():
1092 self.putheader(hdr, value)
-> 1093 self.endheaders(body)
1094
1095 def getresponse(self, buffering=False):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in endheaders(self, message_body)
1047 else:
1048 raise CannotSendHeader()
-> 1049 self._send_output(message_body)
1050
1051 def request(self, method, url, body=None, headers={}):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_output(self, message_body)
891 msg += message_body
892 message_body = None
--> 893 self.send(msg)
894 if message_body is not None:
895 #message_body was not a string (i.e. it is a file) and
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in send(self, data)
867 datablock = data.read(blocksize)
868 else:
--> 869 self.sock.sendall(data)
870
871 def _output(self, s):
/Users/sundong/anaconda/lib/python2.7/ssl.pyc in sendall(self, data, flags)
719 count = 0
720 while (count < amount):
--> 721 v = self.send(data[count:])
722 count += v
723 return amount
/Users/sundong/anaconda/lib/python2.7/ssl.pyc in send(self, data, flags)
685 self.__class__)
686 try:
--> 687 v = self._sslobj.write(data)
688 except SSLError as x:
689 if x.args[0] == SSL_ERROR_WANT_READ:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 43: ordinal not in range(128)
However, the simple, direct code below, which hard-codes the title instead of taking it from a list, works without any issues.
import urllib2
import json
query = 'https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json'
wikiresponse = urllib2.urlopen(query)
wikisource = wikiresponse.read()
wikijson = json.loads(wikisource)
jsonfilename = './json/'+'Claude_Lévi-Strauss'+'.json'
with open(jsonfilename, 'w') as outfile:
    json.dump(wikijson, outfile)
Answer 1:
- Don't mix Unicode and bytestrings: use Unicode strings to work with text in Python.
- Don't build urls by hand; use urllib functions such as quote() and urlencode() (a short sketch follows this list). Also, consider functions from the urlparse module such as urljoin() and urlunsplit().
- You've already requested json format, so there is no need to parse it only to dump it back immediately in the same format; you could use shutil.copyfileobj() to copy the file-like objects instead. You can check the result file later, to make sure it was downloaded correctly.
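For example, a minimal sketch (Python 2) of building the same API query with urlencode() instead of string interpolation; the parameter names are the ones already used in the question:

# -*- coding: utf-8 -*-
from urllib import urlencode

# urlencode() percent-encodes each value for you; pass UTF-8 bytes rather
# than a unicode object, which would otherwise be coerced using ascii.
params = urlencode({
    'action': 'query',
    'prop': 'revisions',
    'rvprop': 'content',
    'format': 'json',
    'titles': u"Claude Lévi-Strauss".encode('utf-8'),
})
print "https://en.wikipedia.org/w/api.php?" + params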
Putting it all together, here's how to save a wiki-page with a given title to a file in JSON format:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
from contextlib import closing
from urllib import quote
from urllib2 import urlopen
from shutil import copyfileobj

def urlretrieve(url, filename, chunksize=8096):
    with closing(urlopen(url)) as response, open(filename, 'wb') as file:
        copyfileobj(response, file, chunksize)

#XXX for name in mergel:
name = u"Claude Lévi-Strauss"  # NOTE: Unicode string
urlretrieve("https://en.wikipedia.org/w/api.php?"
            "action=query&prop=revisions&rvprop=content&format=json&"
            "titles=" + quote(name.encode('utf-8')),
            os.path.join('json', name + '.json'))
Note:

- you don't need to .replace(' ', '_') in this case
- the os.path.join('json', name + '.json') line mixes bytestrings ('json', '.json') and Unicode (type(name) == unicode). It is ok here, because both 'json' and '.json' are ascii-only literals in the source code (a short demonstration follows this list)
- the # -*- coding: utf-8 -*- encoding declaration affects only characters that appear literally in your Python source code; it is accidental that the query string also uses the same encoding in this particular case. The encoding of your source code has no relation to the character encoding that might be used for filenames, to transfer data over http, or to write Unicode text to a terminal (all these encodings may differ from one another)
- in principle, you could have used urllib.urlretrieve(url, filename) here instead of urlopen + copyfileobj, but urllib.urlretrieve() behavior differs from urllib2.urlopen() on Python 2
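To illustrate the mixing note above, a tiny sketch (Python 2) using the same name as in the code:

# -*- coding: utf-8 -*-
import os

# The ascii-only bytestring literals 'json' and '.json' are implicitly
# decoded when combined with a unicode value, so the resulting path is unicode.
name = u"Claude Lévi-Strauss"
print type(os.path.join('json', name + '.json'))  # <type 'unicode'>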
Here's the same code using requests:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
from urllib import quote

import requests  # $ pip install requests

def urlretrieve(url, filename, chunksize=8096):
    r = requests.get(url, stream=True)
    r.raise_for_status()  # raise on http error
    with open(filename, 'wb') as f:
        for chunk in r.iter_content(chunksize):
            f.write(chunk)

#XXX for name in mergel:
name = u"Claude Lévi-Strauss"  # NOTE: Unicode string
urlretrieve("https://en.wikipedia.org/w/api.php?"
            "action=query&prop=revisions&rvprop=content&format=json&"
            "titles=" + quote(name.encode('utf-8')),
            os.path.join('json', name + '.json'))
Regarding "the simple & direct code without getting a title from a list just works without any issues":
Your code uses non-ascii bytestring literals (which are illegal in Python 3). There is no encoding error because all the data is bytes already. The issue with using bytestrings is that it breaks if different environments use different character encodings, and they do (you can't expect everything to use utf-8, however desirable that might be). Also, the query part should be properly percent-encoded, e.g., é should be sent as '%C3%A9'.
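For instance, a minimal sketch (Python 2) showing how the title from the question is percent-encoded:

# -*- coding: utf-8 -*-
from urllib import quote

# Encode the unicode title to UTF-8 bytes first, then percent-encode it;
# 'é' (U+00E9) is the two UTF-8 bytes 0xC3 0xA9, i.e. '%C3%A9' on the wire.
print quote(u"Claude Lévi-Strauss".encode('utf-8'))
# Claude%20L%C3%A9vi-Strauss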
Unrelated: to download several web pages at once, you could use a thread pool:
from multiprocessing.dummy import Pool  # use threads

def download(name):
    urlretrieve("https://en.wikipedia.org/w/api.php?"
                "action=query&prop=revisions&rvprop=content&format=json&"
                "titles=" + quote(name.encode('utf-8')),
                os.path.join('json', name + '.json'))

pool = Pool(4)  # download 4 titles concurrently
for _ in pool.imap_unordered(download, mergel, chunksize=100):
    pass
It is polite to set the maxlag query parameter and to respect the Retry-After http header. There are several wrappers around the Wikipedia API that may do this for you.
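A hedged sketch (Python 2, using requests as above) of what honouring maxlag and Retry-After could look like; the retry count and the 5-second fallback are arbitrary choices, not part of the API contract:

# -*- coding: utf-8 -*-
import time
import requests

def api_get(params, retries=3):
    params = dict(params, maxlag=5)  # ask the API to refuse work when replication lag > 5s
    for _ in range(retries):
        r = requests.get("https://en.wikipedia.org/w/api.php", params=params)
        r.raise_for_status()
        data = r.json()
        if data.get('error', {}).get('code') == 'maxlag':
            # The servers are lagged: wait as instructed, then retry.
            time.sleep(float(r.headers.get('Retry-After', 5)))
            continue
        return data
    raise RuntimeError('giving up after repeated maxlag errors')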
Source: https://stackoverflow.com/questions/32809287/unicodeencodeerror-in-urllib2