Here is a small snippet to help you guess the encoding of a byte string. It distinguishes between latin1 and utf8 quite well, and it converts the byte string to a unicode string.
# Attention: the order of encoding_guess_list is important. Candidates are
# tried from first to last, and "latin1" always succeeds, so anything listed
# after it would never be reached.
encoding_guess_list = ["utf8", "latin1"]
def try_unicode(string, errors='strict', encodings=None):
    """Decode a byte string to unicode by trying candidate encodings in order.

    :param string: Byte string to decode. A value that is already unicode
        text is returned unchanged.
    :param errors: Error handler passed to ``decode``. With anything other
        than ``'strict'`` the first candidate normally never fails, so no
        real guessing takes place.
    :param encodings: Optional iterable of encoding names to try, in order.
        Defaults to the module-level ``encoding_guess_list``.
    :returns: The decoded unicode string.
    :raises TypeError: If ``string`` is neither unicode text nor a byte
        string.
    :raises UnicodeError: If none of the candidate encodings can decode
        ``string``.
    """
    # Resolve the text type at call time so the function works on both
    # Python 2 (``unicode``) and Python 3 (``str``).
    try:
        text_type = unicode  # noqa: F821 -- only defined on Python 2
    except NameError:
        text_type = str

    if isinstance(string, text_type):
        return string
    # ``bytes`` is an alias of ``str`` on Python 2.6+, so this is the same
    # check as before there. An explicit raise is used instead of ``assert``
    # so the validation survives ``python -O``.
    if not isinstance(string, bytes):
        raise TypeError('Expected a byte string, got %s' % repr(string))

    if encodings is None:
        encodings = encoding_guess_list

    for enc in encodings:
        try:
            return string.decode(enc, errors)
        except UnicodeError:
            # This candidate does not fit; fall through to the next one.
            continue
    raise UnicodeError('Failed to convert %r' % (string,))
def test_try_unicode():
    """Check that try_unicode decodes latin1 and utf8 byte strings correctly.

    Raises ``Exception`` describing the first case whose decoded result
    differs from the expected unicode string.
    """
    # Expected values use escapes rather than literal non-ASCII characters so
    # the test does not depend on the source file's encoding declaration.
    # Inputs carry an explicit ``b`` prefix so they are byte strings on every
    # interpreter.
    cases = [
        (b'\xfc', u'\xfc'),      # latin1-encoded u-umlaut
        (b'\xc3\xbc', u'\xfc'),  # utf8-encoded u-umlaut
        # postgres/psycopg2 latin1: RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        (b'\xbb', u'\xbb'),
    ]
    for start, should in cases:
        result = try_unicode(start, errors='strict')
        if result != should:
            raise Exception(u'Error: start=%r should=%r result=%r' % (
                start, should, result))