Ubuntu 11.10:
$ python
Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
[GCC 4.6.1] on linux2
Type "help", "copyright", "credits" or "license" for more information.
Great question! I fell down this rabbit hole recently myself.
@dan04's answer inspired me to expand it into a unicode subclass that provides consistent indexing, slicing, and len() on both narrow and wide Python 2 builds:
import operator

try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str  # Python 3: str is already the Unicode text type


class WideUnicode(_text_type):
    """String class with consistent indexing, slicing, len() on both narrow and wide Python.

    The text is additionally kept as UTF-32LE bytes, where every code point
    occupies exactly four bytes.  ``len()``, indexing, and slicing are
    answered from that buffer, so a character outside the Basic Multilingual
    Plane counts as one item even on a narrow build that stores it as a
    surrogate pair.

    Indexing and slicing both return ``WideUnicode`` instances.
    """

    def __init__(self, *args, **kwargs):
        """Cache the UTF-32LE encoding of the freshly constructed string.

        The arguments are the ones accepted by the underlying text type; they
        have already been consumed by ``__new__`` by the time this runs.
        """
        # Nothing is forwarded: object.__init__ accepts no extra arguments
        # (DeprecationWarning on Python 2.6+, TypeError on Python 3).
        super(WideUnicode, self).__init__()
        # use UTF-32LE to avoid a byte order marker at the beginning of the string
        self.__utf32le = _text_type(self).encode('utf-32le')

    def __len__(self):
        """Return the number of code points in the string."""
        # Floor division keeps the result an int on Python 3 as well.
        return len(self.__utf32le) // 4

    def __getitem__(self, key):
        """Return the code point(s) selected by ``key`` as a ``WideUnicode``.

        Args:
            key: an integer index (negative values count from the end) or a
                slice, with any step, expressed in code points.

        Raises:
            IndexError: if an integer index is out of range.
            TypeError: if ``key`` is neither a slice nor an integer.
        """
        raw = self.__utf32le
        length = len(raw) // 4
        if isinstance(key, slice):
            # slice.indices() resolves None bounds, negative bounds, and
            # out-of-range bounds exactly like the built-in sequences do.
            start, stop, step = key.indices(length)
            if step == 1:
                chunk = raw[start * 4:stop * 4]
            else:
                chunk = b''.join(raw[pos * 4:pos * 4 + 4]
                                 for pos in range(start, stop, step))
        else:
            index = operator.index(key)
            if index < 0:
                index += length
            if not 0 <= index < length:
                raise IndexError('WideUnicode index out of range')
            chunk = raw[index * 4:index * 4 + 4]
        return WideUnicode(chunk.decode('utf-32le'))

    def __getslice__(self, i, j):
        """Handle Python 2 simple slices ``s[i:j]`` via ``__getitem__``."""
        return self.__getitem__(slice(i, j))
open sourced here, public domain. example usage:
# Wrap once so that indexing, slicing, and len() count code points.
text = WideUnicode(obj.text)
for tag in obj.tags:
    # NOTE: `start` and `end` are not defined in this simplified snippet;
    # presumably they are per-tag code point offsets into `text` -- confirm
    # against the full usage.
    # Concatenation yields a plain unicode object (WideUnicode does not
    # override `+`), so the result is wrapped again before the next pass.
    text = WideUnicode(text[:start] + tag.text + text[end:])
(simplified from this usage.)
thanks @dan04!