I have experience writing OpenMP code for shared-memory machines (in both C and Fortran) to carry out simple tasks like matrix addition, multiplication, etc. (Just to see how it
The EuroSciPy talk at http://archive.euroscipy.org/talk/6857 "introduces Cython's OpenMP abilities focussing on parallel loops over NumPy arrays. Source code examples demonstrate how to use OpenMP from Python. Results for parallel algorithms with OpenMP show what speed-ups can be achieved for different data sizes compared to other parallelizing strategies."
import numpy
import cython
from cython cimport parallel
@cython.boundscheck(False)
@cython.wraparound(False)
def func(object[double, ndim=2] buf1 not None,
         object[double, ndim=2] buf2 not None,
         object[double, ndim=2] output=None,
         int num_threads=2):
    """Compute ``(buf1 + buf2) * 2 + buf1 * buf2`` element-wise in parallel.

    The outer (row) dimension is split across OpenMP threads with a static
    schedule; each thread walks its rows' columns sequentially.

    Parameters
    ----------
    buf1, buf2 : 2-D buffers of C ``double``
        Input arrays; their ``shape`` attributes must be equal.
    output : 2-D buffer of C ``double``, optional
        Destination array. When omitted, a new array is allocated with
        ``numpy.empty_like(buf1)``. When given, it must have the same shape
        as the inputs.
    num_threads : int, optional
        Number of OpenMP threads handed to ``prange`` (default 2).

    Returns
    -------
    The ``output`` array (the one passed in, or the newly allocated one).

    Raises
    ------
    TypeError
        If the input shapes differ, or ``output`` does not match them.
    ValueError
        If ``num_threads`` is smaller than 1.
    """
    # Signed, pointer-sized indices: shape entries are Py_ssize_t, and some
    # OpenMP implementations only accept signed loop counters.
    cdef Py_ssize_t x, y, inner, outer

    if buf1.shape != buf2.shape:
        raise TypeError('Arrays have different shapes: %s, %s' % (buf1.shape,
                                                                  buf2.shape))
    # OpenMP requires a positive thread count; reject anything else up front
    # instead of passing a non-conforming value to the runtime.
    if num_threads < 1:
        raise ValueError('num_threads must be at least 1, got %d'
                         % num_threads)
    if output is None:
        output = numpy.empty_like(buf1)
    elif output.shape != buf1.shape:
        # Bounds checking is disabled for this function, so a mismatched
        # output would be written out of bounds without any error.
        raise TypeError('Output array has wrong shape: %s, expected %s'
                        % (output.shape, buf1.shape))

    outer = buf1.shape[0]
    inner = buf1.shape[1]
    # The boundscheck/wraparound decorators already cover this loop nest.
    with nogil:
        for x in parallel.prange(outer, schedule='static',
                                 num_threads=num_threads):
            for y in range(inner):
                output[x, y] = ((buf1[x, y] + buf2[x, y]) * 2 +
                                buf1[x, y] * buf2[x, y])
    return output