Numpy has some very useful string operations, which vectorize the usual Python string operations.
Compared to these operations and to pandas.str, numpy's string routines seem to be missing the ability to slice the strings themselves.
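For instance, the routines in np.char cover things like case changes and concatenation, but as far as I can tell there is nothing that maps x[start:stop] over an array:

import numpy as np
a = np.array(['hello', 'how', 'are', 'you'])
print(np.char.upper(a))     # => ['HELLO' 'HOW' 'ARE' 'YOU']
print(np.char.add(a, '!'))  # => ['hello!' 'how!' 'are!' 'you!']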
Interesting omission... I guess you can always write your own:
import numpy as np
def slicer(start=None, stop=None, step=1):
    return np.vectorize(lambda x: x[start:stop:step], otypes=[str])
a = np.array(['hello', 'how', 'are', 'you'])
print(slicer(1, 3)(a)) # => ['el' 'ow' 're' 'ou']
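The step argument works too; for example, reversing every string (output assuming the same array a as above):

print(slicer(None, None, -1)(a)) # => ['olleh' 'woh' 'era' 'uoy']

Keep in mind that np.vectorize is essentially a Python-level loop under the hood, which is why it doesn't pull far ahead of the plain loops in the benchmarks below.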
EDIT: Here are some benchmarks using the text of Ulysses by James Joyce. It seems the clear winner is @hpaulj's last strategy, with @Divakar getting into the race and improving on it even further.
import numpy as np
import requests
ulysses = requests.get('http://www.gutenberg.org/files/4300/4300-0.txt').text
a = np.array(ulysses.split())  # one element per whitespace-separated word
# Ufunc
def slicer(start=None, stop=None, step=1):
    return np.vectorize(lambda x: x[start:stop:step], otypes=[str])
%timeit slicer(1, 3)(a)
# => 1 loop, best of 3: 221 ms per loop
# Non-mutating loop
def loop1(a):
    out = np.empty(len(a), dtype=object)
    for i, word in enumerate(a):
        out[i] = word[1:3]
    return out
%timeit loop1(a)
# => 1 loop, best of 3: 262 ms per loop
# Mutating loop
def loop2(a):
    for i in range(len(a)):
        a[i] = a[i][1:3]
b = a.copy()  # loop2 mutates its argument, so time a copy (and only run it once)
%timeit -n 1 -r 1 loop2(b)
# 1 loop, best of 1: 285 ms per loop
# From @hpaulj's answer
%timeit np.frompyfunc(lambda x:x[1:3],1,1)(a)
# => 10 loops, best of 3: 141 ms per loop
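# frompyfunc returns an object-dtype array; tacking .astype('U2') on the end
# converts back to a fixed-width unicode dtype, which is where the extra cost
# in the next timing comes from: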
%timeit np.frompyfunc(lambda x:x[1:3],1,1)(a).astype('U2')
# => 1 loop, best of 3: 170 ms per loop
%timeit a.view('U1').reshape(len(a),-1)[:,1:3].astype(object).sum(axis=1)
# => 10 loops, best of 3: 60.7 ms per loop
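# For intuition, here is the view trick on the small example from above (a
# sketch, assuming a fixed-width unicode array): 'U1' exposes one character
# per column, so columns 1:3 are characters 1 and 2 of every word, and
# summing the object strings along each row concatenates them back together.
small = np.array(['hello', 'how', 'are', 'you'])
print(small.view('U1').reshape(len(small), -1)[:, 1:3].astype(object).sum(axis=1))
# => ['el' 'ow' 're' 'ou']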
# @Divakar's improvement on the view-based strategy
def slicer_vectorized(a,start,end):
    b = a.view('S1').reshape(len(a),-1)[:,start:end]
    return np.fromstring(b.tostring(),dtype='S'+str(end-start))
%timeit slicer_vectorized(a,1,3)
# => The slowest run took 5.34 times longer than the fastest.
# This could mean that an intermediate result is being cached.
# 10 loops, best of 3: 16.8 ms per loop
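Note that the last snippet lines characters up one byte apiece via view('S1'), so it assumes a bytes ('S') array; numpy's unicode ('U') dtype stores four bytes per character, so on a 'U' array the byte columns no longer correspond to characters. A quick sanity check on a bytes version of the small array (Python 3 output shown):

small_b = np.array([b'hello', b'how', b'are', b'you'])
print(slicer_vectorized(small_b, 1, 3)) # => [b'el' b'ow' b're' b'ou']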