How can I slice each element of a numpy array of strings?

前端 未结 4 2015
面向向阳花
面向向阳花 2020-12-01 16:25

Numpy has some very useful string operations, which vectorize the usual Python string operations.

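Compared to these operations and to pandas.str, the numpy strings module seems to be missing a very important one: the ability to slice each string in the array.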

4条回答
  •  北海茫月
    2020-12-01 17:15

    Interesting omission... I guess you can always write your own:

    import numpy as np
    
    def slicer(start=None, stop=None, step=1):
        """Return a vectorized callable that slices every string of an array.

        The returned object is an ``np.vectorize`` wrapper: calling it on an
        array of strings applies ``s[start:stop:step]`` to each element and
        yields a string-typed array of the results.

        Args:
            start: First index of the slice (``None`` means from the beginning).
            stop: End index of the slice, exclusive (``None`` means to the end).
            step: Stride of the slice.
        """
        # Build the slice object once; indexing with it is equivalent to
        # writing ``s[start:stop:step]`` inline.
        window = slice(start, stop, step)
        return np.vectorize(lambda text: text[window], otypes=[str])
    
    a = np.array(['hello', 'how', 'are', 'you'])
    print(slicer(1, 3)(a))    # => ['el' 'ow' 're' 'ou']
    

    EDIT: Here are some benchmarks using the text of Ulysses by James Joyce. Among the original strategies, the clear winner is @hpaulj's last one; @Divakar then joins the race by improving on that strategy.

    import numpy as np
    import requests
    
    ulysses = requests.get('http://www.gutenberg.org/files/4300/4300-0.txt').text
    a = np.array(ulysses.split())
    
    # Ufunc
    def slicer(start=None, stop=None, step=1):
        """Create an ``np.vectorize`` wrapper that slices each string element.

        Calling the result on an array applies ``s[start:stop:step]`` to every
        element through a Python-level function and collects the results into
        a string-typed array.

        Args:
            start: Slice start (``None`` for the beginning of each string).
            stop: Slice stop, exclusive (``None`` for the end of each string).
            step: Slice stride.
        """
        cut = slice(start, stop, step)  # same as the literal start:stop:step
        return np.vectorize(lambda word: word[cut], otypes=[str])
    
    %timeit slicer(1, 3)(a)
    # => 1 loop, best of 3: 221 ms per loop
    
    # Non-mutating loop
    def loop1(a):
        """Return a new object array holding ``word[1:3]`` for each word in ``a``.

        The input is left untouched; results are written into a freshly
        allocated array with ``dtype=object`` so that each slot stores an
        ordinary Python string of any length.

        Args:
            a: Sized iterable of strings (e.g. a 1-D NumPy string array).

        Returns:
            A 1-D ``object`` array of the same length as ``a``.
        """
        out = np.empty(len(a), dtype=object)
        for i, word in enumerate(a):
            out[i] = word[1:3]
        # The result must be handed back; without this the function computes
        # the slices and then discards them, returning None.
        return out
    
    %timeit loop1(a)
    # => 1 loop, best of 3: 262 ms per loop
    
    # Mutating loop
    def loop2(a):
        """Replace every element of ``a`` in place with its ``[1:3]`` slice.

        The container itself is modified and nothing is returned, so pass a
        copy if the original contents are still needed afterwards.

        Args:
            a: Mutable indexable sequence of strings (e.g. a NumPy string
                array or a list).
        """
        for pos, word in enumerate(a):
            # Each element is read before its slot is overwritten, so the
            # in-place assignment does not disturb the iteration.
            a[pos] = word[1:3]
    
    b = a.copy()
    %timeit -n 1 -r 1 loop2(b)
    # 1 loop, best of 1: 285 ms per loop
    
    # From @hpaulj's answer
    %timeit np.frompyfunc(lambda x:x[1:3],1,1)(a)
    # => 10 loops, best of 3: 141 ms per loop
    
    %timeit np.frompyfunc(lambda x:x[1:3],1,1)(a).astype('U2')
    # => 1 loop, best of 3: 170 ms per loop
    
    %timeit a.view('U1').reshape(len(a),-1)[:,1:3].astype(object).sum(axis=1)
    # => 10 loops, best of 3: 60.7 ms per loop
    
    def slicer_vectorized(a, start, end):
        """Slice every string of a 1-D fixed-width NumPy string array at once.

        The array is reinterpreted as a 2-D grid of single characters (one row
        per string), the requested columns are selected, and each row of the
        selection is reinterpreted as one string again. No Python-level
        function is called per element.

        Both byte-string (``S``) and unicode (``U``) arrays are supported; the
        character dtype is taken from the input rather than hard-coded, since
        a unicode character occupies 4 bytes and cannot be viewed as ``S1``.

        Args:
            a: 1-D NumPy array with a fixed-width ``S`` or ``U`` dtype.
            start: First character column to keep.
            end: Character column to stop at (exclusive).

        Returns:
            A new array of the same kind as ``a`` whose item width equals the
            number of columns actually selected (at least 1). The input is
            not modified and the result does not share memory with it.

        Raises:
            TypeError: If ``a`` is not an ``S`` or ``U`` string array.
        """
        a = np.ascontiguousarray(a)
        kind = a.dtype.kind
        if kind not in ('S', 'U'):
            raise TypeError(
                'slicer_vectorized expects a fixed-width string array, got dtype %s'
                % a.dtype
            )

        count = len(a)
        char_dtype = np.dtype(kind + '1')
        in_width = a.dtype.itemsize // char_dtype.itemsize
        if count == 0 or in_width == 0:
            # Nothing to reinterpret; reshape(count, -1) would fail here.
            return np.zeros(count, dtype=char_dtype)

        chars = a.view(char_dtype).reshape(count, in_width)[:, start:end]

        # Use the width that was actually selected: the slice may be clipped
        # by the array's item width, or given with negative indices, in which
        # case ``end - start`` would not match the selected data.
        out_width = chars.shape[1]
        if out_width == 0:
            return np.zeros(count, dtype=char_dtype)

        # copy() produces a fresh C-contiguous buffer, which the view below
        # requires; it replaces the deprecated fromstring/tostring round trip.
        return chars.copy().view(kind + str(out_width)).reshape(count)
    
    %timeit slicer_vectorized(a,1,3)
    # => The slowest run took 5.34 times longer than the fastest.
    #    This could mean that an intermediate result is being cached.
    #    10 loops, best of 3: 16.8 ms per loop
    

提交回复
热议问题