Efficient cython file reading, string parsing, and array building

后端 未结 2 472
醉话见心
醉话见心 2020-12-28 10:54

So I have some data files that look like this:

      47
   425   425  -3 15000 15000 900   385   315   3   370   330   2   340   330   2
   325   315   2   3         


        
2条回答
  •  醉话见心
    2020-12-28 11:08

    Here is a faster example. It uses fast_atoi() to convert strings to ints, and it's 2x faster than get_points_cython() on my PC. If the lines holding the number of points all have the same width (8 chars), then I think I can speed it up further (about 12x faster than get_points_cython()).

    %%cython
    import numpy as np
    cimport numpy as np
    import cython
    
    cdef int fast_atoi(char *buff):
        """Parse a NUL-terminated ASCII buffer into an int.

        Every decimal digit found is appended to the value, a '-' anywhere
        in the buffer makes the result negative, and all other bytes (for
        example padding spaces) are skipped.
        """
        cdef int ch, negative = 0, value = 0
        cdef char *cursor = buff
        ch = cursor[0]
        while ch != 0:
            if ch == 45:  # ASCII '-'
                negative = 1
            elif 48 <= ch <= 57:  # ASCII '0'..'9'
                value = value * 10 + ch - 48
            cursor += 1
            ch = cursor[0]
        if negative:
            return -value
        return value
    
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def get_points_cython_numpy(filename):
        """Read fixed-width point records from ``filename`` into an int32 array.

        Each line is cut into 16-byte records; within a record, bytes 0-5,
        6-11 and 12-15 are parsed with ``fast_atoi`` as x, y and z. Lines
        shorter than 16 bytes (such as a lone count line) contribute no
        records.

        Returns the filled ``(n, 3)`` leading part of the backing NumPy array.
        """
        cdef int i, j, n_chunks
        cdef int capacity = 500000
        cdef bytes line, chunk
        cdef int[:, ::1] points = np.zeros([capacity, 3], np.int32)
        j = 0
        # ``with`` guarantees the file is closed even if parsing raises.
        with open(filename, 'rb') as f:
            for line in f:
                n_chunks = len(line) // 16
                # Bounds checking is disabled, so the buffer must be grown
                # explicitly before writing past its current capacity.
                if j + n_chunks > capacity:
                    while j + n_chunks > capacity:
                        capacity *= 2
                    grown = np.zeros([capacity, 3], np.int32)
                    grown[:j] = points.base[:j]
                    points = grown
                for i in range(n_chunks):
                    chunk = line[16*i:16*(i+1)]
                    points[j, 0] = fast_atoi(chunk[0:6])
                    points[j, 1] = fast_atoi(chunk[6:12])
                    points[j, 2] = fast_atoi(chunk[12:16])
                    j = j + 1

        return points.base[:j]
    

    Here is the fastest method. The idea is to read the whole file content into a bytes object and extract the point data from it.

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef inline int fast_atoi(char *buf, int size):
        """Parse at most ``size`` bytes of ``buf`` into an int.

        Scanning stops early at a NUL byte. Decimal digits accumulate into
        the value, a '-' anywhere in the scanned range makes the result
        negative, and every other byte is skipped.
        """
        cdef int pos = 0, ch = 0, negative = 0, value = 0
        while pos < size:
            ch = buf[pos]
            if ch == 0:
                break
            if ch == 45:  # ASCII '-'
                negative = 1
            elif 48 <= ch <= 57:  # ASCII '0'..'9'
                value = value * 10 + ch - 48
            pos += 1
        if negative:
            return -value
        return value
    
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def fastest_read_points(fn):
        """Read every point in file ``fn`` into an ``(n, 3)`` int32 array.

        The whole file is loaded into one bytes object with line endings
        removed. The data is then read as repeated blocks: a 10-byte count
        field followed by that many 16-byte records, each holding three
        fixed-width integers of 6, 6 and 4 bytes parsed with ``fast_atoi``.

        Parsing of a block stops at the end of the data if the file is
        truncated or a count field claims more records than are present.

        Returns the filled leading part of the backing NumPy array.
        """
        cdef bytes buf
        with open(fn, "rb") as f:
            # Remove "\r" as well as "\n" so CRLF files keep field alignment.
            buf = f.read().translate(None, b"\r\n")

        cdef char * p = buf
        # Py_ssize_t avoids overflowing a C int on very large files.
        cdef Py_ssize_t length = len(buf)
        cdef char * buf_end = p + length
        # Every record consumes 16 bytes, so this is an upper bound on rows.
        cdef Py_ssize_t count = length // 16
        cdef int[:, ::1] res = np.zeros((count, 3), np.int32)
        cdef Py_ssize_t i = 0
        cdef int j, block_count
        while p < buf_end:
            block_count = fast_atoi(p, 10)
            p += 10
            for j in range(block_count):
                # Bounds checks are off: never read a record that would
                # extend past the end of the buffer.
                if buf_end - p < 16:
                    break
                res[i, 0] = fast_atoi(p, 6)
                res[i, 1] = fast_atoi(p+6, 6)
                res[i, 2] = fast_atoi(p+12, 4)
                p += 16
                i += 1
        return res.base[:i]
    

提交回复
热议问题