Python UTF-16 CSV reader

前端 未结 4 766
温柔的废话
温柔的废话 2020-11-27 21:49

I have a UTF-16 CSV file that I have to read. The Python csv module does not seem to support UTF-16.

I am using Python 2.7.2. The CSV files I need to parse are huge in size, ru…

4条回答
  •  -上瘾入骨i
    2020-11-27 22:12

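    At the moment, the csv module itself does not decode UTF-16; the input has to be decoded (Python 3.x) or recoded (Python 2.x) before it reaches the reader.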

    In Python 3.x, csv expects a text-mode file and you can simply use the encoding parameter of open to force another encoding:

    # Python 3.x only
    import csv
    # Open in text mode and let open() decode UTF-16. newline='' disables
    # universal-newline translation so the csv module sees the original line
    # endings; without it, newlines embedded in quoted fields can be mangled.
    with open('utf16.csv', 'r', encoding='utf16', newline='') as csvf:
        for line in csv.reader(csvf):
            print(line) # do something with the line
    

    In Python 2.x, you can recode the input:

    # Python 2.x only
    import codecs
    import csv
    
    class Recoder(object):
        """Iterate over a byte stream line by line, recoding each line.

        Bytes read from ``stream`` are decoded with ``decoder`` and every
        complete line (terminated by ``eol``) is re-encoded with ``encoder``
        before being handed to the consumer, e.g. ``csv.reader``.

        Parameters:
            stream:  object with ``read([size])`` returning bytes and ``close()``.
            decoder: codec name, or a ready ``codecs.IncrementalDecoder``.
            encoder: codec name, or a ready ``codecs.IncrementalEncoder``.
            eol:     line terminator to split the decoded text on.
        """

        # Number of bytes requested from the underlying stream per refill.
        _CHUNK_SIZE = 1024

        def __init__(self, stream, decoder, encoder, eol='\r\n'):
            self._stream = stream
            if isinstance(decoder, codecs.IncrementalDecoder):
                self._decoder = decoder
            else:
                self._decoder = codecs.getincrementaldecoder(decoder)()
            if isinstance(encoder, codecs.IncrementalEncoder):
                self._encoder = encoder
            else:
                self._encoder = codecs.getincrementalencoder(encoder)()
            # Decoded text that has been read but not yet returned.
            self._buf = u''
            self._eol = eol
            self._reachedEof = False

        def read(self, size=None):
            """Read up to ``size`` bytes from the stream and return them recoded.

            ``size`` counts bytes of the *source* stream; ``None`` or a negative
            value reads everything that is left. Text already buffered by a
            previous iteration step is returned first so no data is lost when
            ``read()`` and iteration are mixed.
            """
            read_all = size is None or size < 0
            # Call read() without an argument for "everything": file objects
            # on Python 2 reject an explicit None.
            chunk = self._stream.read() if read_all else self._stream.read(size)
            # An empty chunk means EOF, so let the decoder flush/validate.
            text = self._buf + self._decoder.decode(chunk, read_all or not chunk)
            self._buf = u''
            return self._encoder.encode(text)

        def __iter__(self):
            return self

        def __next__(self):
            """Return the next recoded line, including its terminator.

            Raises StopIteration once the stream is exhausted. The last line is
            returned even without a trailing terminator, but no empty line is
            produced when the data ends exactly on a terminator.
            """
            if self._reachedEof:
                raise StopIteration()
            while True:
                line, eol, rest = self._buf.partition(self._eol)
                if eol:
                    self._buf = rest
                    return self._encoder.encode(line + eol)
                raw = self._stream.read(self._CHUNK_SIZE)
                # `not raw` matches both '' (Python 2) and b'' (Python 3).
                if not raw:
                    # Keep whatever the decoder still holds instead of
                    # discarding the result of the final flush.
                    self._buf += self._decoder.decode(b'', True)
                    self._reachedEof = True
                    tail, self._buf = self._buf, u''
                    if not tail:
                        # Nothing left: stop instead of yielding an empty
                        # line, which csv.reader would turn into a bogus row.
                        raise StopIteration()
                    return self._encoder.encode(tail)
                self._buf += self._decoder.decode(raw)
        next = __next__  # Python 2 iterator protocol

        def close(self):
            """Close the underlying stream."""
            return self._stream.close()
    
    # Feed csv.reader with UTF-8 lines recoded on the fly from the UTF-16
    # bytes of test.csv; the file is opened in binary mode so that Recoder
    # receives the raw bytes.
    with open('test.csv','rb') as utf16_file:
        utf8_lines = Recoder(utf16_file, 'utf-16', 'utf-8')
        reader = csv.reader(utf8_lines)
        for record in reader:
            print (record)
    

    open and codecs.open, when used with the utf-16 encoding, require the file to start with a BOM. If it doesn't (or you're on Python 2.x), you can still convert it in memory, like this:

    try:
        from io import BytesIO
    except ImportError: # Python < 2.6
        from StringIO import StringIO as BytesIO
    import csv
    # Load the whole file into memory, then recode it from UTF-16 to UTF-8
    # in one step. The complete content is held in memory, so this suits
    # files that fit comfortably in RAM.
    with open('utf16.csv', 'rb') as raw_file:
        utf16_bytes = raw_file.read()
    utf8_bytes = utf16_bytes.decode('utf-16').encode('utf-8')
    # csv.reader then iterates over the UTF-8 lines of the in-memory buffer.
    for row in csv.reader(BytesIO(utf8_bytes)):
        print(row) # do something with the row
    

提交回复
热议问题