numpy faster than numba and cython , how to improve numba code

后端 未结 3 1044
陌清茗
陌清茗 2020-12-10 07:18

I have a simple example here to help me understand using numba and cython. I am new to both numba and cython. I've tried my best to incorporate all the tricks to make

3条回答
  •  天命终不由人
    2020-12-10 07:48

    It depends on the exp implementation and parallelization

    If you use Intel SVML in Numpy, use it in other packages like Numba, Numexpr or Cython too (see the Numba performance tips documentation).

    If the Numpy commands are parallelized also try to parallelize it in Numba or Cython.

    Code

    import os
    # Must be set BEFORE importing numpy: MKL reads the thread count at import time.
    # Test with 1 thread against the single-threaded Numba/Cython versions, and
    # with at least the number of physical cores against the parallel versions.
    os.environ["MKL_NUM_THREADS"] = "1" 
    
    import numpy as np
    
    # From numba 0.43 until 0.47 this has to be set before importing numba,
    # otherwise SVML is not picked up. Bug: https://github.com/numba/numba/issues/4689
    from llvmlite import binding
    binding.set_option('SVML', '-vector-library=SVML')
    import numba as nb
    
    def py_expsum(x):
        """NumPy reference: sum of elementwise exp over the array x."""
        return np.exp(x).sum()
    
    @nb.njit(parallel=False, fastmath=True)  # set parallel=True for the threaded version
    def nb_expsum(x):
        """Loop-based exp-sum compiled with numba (float32 accumulator)."""
        acc = nb.float32(0.)  # change to nb.float64 for the float64 benchmark
        for i in nb.prange(x.shape[0]):
            for j in range(x.shape[1]):
                acc += np.exp(x[i, j])
        return acc
    
    N,M=2000, 1000
    # Uncomment the .astype(np.float32) line for the float32 benchmark;
    # the default below benchmarks float64 input.
    #a=np.random.rand(N*M).reshape((N,M)).astype(np.float32)
    a=np.random.rand(N*M).reshape((N,M))
    

    Benchmarks

    #float64
    %timeit py_expsum(a) #os.environ["MKL_NUM_THREADS"] = "1" 
    #7.44 ms ± 86.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    %timeit py_expsum(a) #os.environ["MKL_NUM_THREADS"] = "6" 
    #4.83 ms ± 139 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    %timeit nb_expsum(a) #parallel=false
    #2.49 ms ± 25.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    %timeit nb_expsum(a) #parallel=true
    #568 µs ± 45.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
    
    #float32
    %timeit py_expsum(a) #os.environ["MKL_NUM_THREADS"] = "1" 
    #3.44 ms ± 66.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    %timeit py_expsum(a) #os.environ["MKL_NUM_THREADS"] = "6" 
    #2.59 ms ± 35.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    %timeit nb_expsum(a) #parallel=false
    #1 ms ± 12.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
    %timeit nb_expsum(a) #parallel=true
    #252 µs ± 19.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
    

    Perfplot with SVML

    import numpy as np
    
    # From numba 0.43 until 0.47 this has to be set BEFORE importing numba,
    # otherwise SVML is not picked up. Bug: https://github.com/numba/numba/issues/4689
    from llvmlite import binding
    binding.set_option('SVML', '-vector-library=SVML')
    import numba as nb
    
    def py_expsum(x):
        """NumPy baseline: elementwise exp followed by a full reduction."""
        exps = np.exp(x)
        return np.sum(exps)
    
    @nb.jit(nopython=True, parallel=False, fastmath=False)
    def nb_expsum_single_thread(x):
        """Single-threaded scalar loop, strict IEEE math (no SIMD/SVML)."""
        n_rows, n_cols = x.shape
        total = 0.0
        for i in range(n_rows):
            for j in range(n_cols):
                total += np.exp(x[i, j])
        return total
    
    # fastmath=True permits reassociation of the val += ... reduction, which
    # makes SIMD vectorization possible (a scalar dependence chain otherwise
    # blocks it and thereby also prevents the use of SVML).
    @nb.jit(nopython=True, parallel=False, fastmath=True)
    def nb_expsum_single_thread_vec(x):
        """Single-threaded loop with fastmath: SIMD-vectorizable, can use SVML."""
        n_rows, n_cols = x.shape
        total = 0.0
        for i in range(n_rows):
            for j in range(n_cols):
                total += np.exp(x[i, j])
        return total
    
    @nb.jit(nopython=True, parallel=True, fastmath=False)
    def nb_expsum_parallel(x):
        """Threaded scalar loop, strict IEEE math (no SIMD)."""
        n_rows, n_cols = x.shape
        total = 0.0
        # Parallelizing the outer loop is almost always faster, except for
        # rare shapes like this benchmark's (1, n) input — hence the inner prange.
        for i in range(n_rows):
            for j in nb.prange(n_cols):
                total += np.exp(x[i, j])
        return total
    
    # fastmath=True permits reassociation of the val += ... reduction, which
    # makes SIMD vectorization possible (a scalar dependence chain otherwise
    # blocks it and thereby also prevents the use of SVML).
    @nb.jit(nopython=True, parallel=True, fastmath=True)
    def nb_expsum_parallel_vec(x):
        """Threaded loop with fastmath: SIMD-vectorizable, can use SVML."""
        n_rows, n_cols = x.shape
        total = 0.0
        # Parallelizing the outer loop is almost always faster, except for
        # rare shapes like this benchmark's (1, n) input — hence the inner prange.
        for i in range(n_rows):
            for j in nb.prange(n_cols):
                total += np.exp(x[i, j])
        return total
    
    import perfplot
    # Scale factor for the random input; try 0.0 or 1e4 to probe other exp ranges.
    factor = 1.0 # 0.0 or 1e4
    # NOTE(review): cy_expsum is the Cython version defined elsewhere in the
    # answer — this snippet does not run standalone without it.
    perfplot.show(
        setup=lambda n: factor*np.random.rand(1,n),
        n_range=[2**k for k in range(0,27)],
        kernels=[
            py_expsum,
            nb_expsum_single_thread,
            nb_expsum_single_thread_vec,
            nb_expsum_parallel,
            nb_expsum_parallel_vec,
            cy_expsum
            ],
        logx=True,
        logy=True,
        xlabel='len(x)'
        )
    

    Check if SVML has been used

    Can be useful to check if everything is working as expected.

    def check_SVML(func):
        """Report whether Intel SVML calls appear in func's generated LLVM IR.

        Parameters
        ----------
        func : compiled numba dispatcher. It must have been called (compiled)
            at least once so that func.signatures is non-empty.

        Returns True if SVML intrinsics were found, False otherwise; also
        prints "found"/"not found" for interactive use.

        Raises ValueError if the function has not been compiled yet (the
        original code raised a bare IndexError in that case).
        """
        if not func.signatures:
            raise ValueError("function has no compiled signatures yet; call it once first")
        # SVML-lowered calls are tagged with the intel_svmlcc calling convention.
        found = 'intel_svmlcc' in func.inspect_llvm(func.signatures[0])
        print("found" if found else "not found")
        return found
    
    # Verify that the parallel fastmath kernel actually lowered exp to SVML.
    check_SVML(nb_expsum_parallel_vec)
    #found
    

提交回复
热议问题