I've gotten stuck writing some parallel C code using OpenMP for a concurrency course.
Here's a snippet:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <omp.h>

#define FALSE 0
#define TRUE 1

int count_primes_0(int);
int count_primes_1(int);
int count_primes_2(int);
int is_prime(int);
void time_it(int (*)(int), int, char *);

int main(int argc, char *argv[]){
    int n;

    if (argc != 2){
        printf("Incorrect Invocation, use: \nq1 N");
        return 0;
    } else {
        n = atoi(argv[1]);
    }
    if (n < 0){
        printf("N cannot be negative");
        return 0;
    }

    printf("N = %d\n", n);

    //omp_set_num_threads(1);
    time_it(count_primes_0, n, "Method 0");
    time_it(count_primes_1, n, "Method 1");
    time_it(count_primes_2, n, "Method 2");

    return 0;
}

int is_prime(int n){
    for(int i = 2; i <= (int)(sqrt((double) n)); i++){
        if ((n % i) == 0){
            return FALSE;
        }
    }
    return n > 1;
}

void time_it(int (*f)(int), int n, char *string){
    clock_t start_clock;
    clock_t end_clock;
    double calc_time;
    int nprimes;

    start_clock = clock();
    nprimes = (*f)(n);
    end_clock = clock();

    calc_time = ((double)end_clock - (double)start_clock) / CLOCKS_PER_SEC;
    printf("%s\n\tNumber of primes: %d \t Time taken: %fs\n\n",
           string, nprimes, calc_time);
}

// METHOD 0
// Base case: no parallelization
int count_primes_0(int n){
    int nprimes = 0;

    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
            nprimes++;
        }
    }
    return nprimes;
}

// METHOD 1
// Use only for and critical constructs
int count_primes_1(int n){
    int nprimes = 0;

    #pragma omp parallel for
    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
            #pragma omp critical
            nprimes++;
        }
    }
    return nprimes;
}

// METHOD 2
// Use reduction
int count_primes_2(int n){
    int nprimes = 0;

    #pragma omp parallel for reduction(+:nprimes)
    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
            nprimes++;
        }
    }
    return nprimes;
}
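For reference, I'm building it with something like gcc -fopenmp q1.c -o q1 -lm, assuming the source file is named q1.c (the usage message calls the binary q1). The -fopenmp flag is needed, otherwise the pragmas are silently ignored, and -lm links the math library for sqrt().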
The problem I'm facing is that when I use omp_set_num_threads(), the fewer threads I use, the faster my functions run -- or the closer they get to the runtime of the unparallelized base case.
Time results (run on an 8-core machine):
8 threads: Method 0: 0.07s; Method 1: 1.63s; Method 2: 1.4s
4 threads: Method 0: 0.07s; Method 1: 0.16s; Method 2: 0.16s
2 threads: Method 0: 0.07s; Method 1: 0.10s; Method 2: 0.09s
1 thread: Method 0: 0.07s; Method 1: 0.08s; Method 2: 0.07s
I've tried disabling optimization and using a different gcc version, but it makes no difference.
Any help is appreciated.
EDIT: clock() on Linux returns the 'incorrect' time here because it measures CPU time summed across all threads of the process, not elapsed time. Wall-clock time is what I needed, so using either omp_get_wtime() or a Linux wall-clock timer (e.g. gettimeofday()) produces the proper results.
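For anyone who runs into the same thing, here's a minimal sketch of the timing wrapper rewritten around omp_get_wtime(). The name time_it_wall is just mine; it assumes the same signature and includes as the snippet above (omp.h is already included there) and only changes how elapsed time is measured:

// Same interface as time_it() above, but measures wall-clock time
// instead of per-process CPU time.
void time_it_wall(int (*f)(int), int n, char *string){
    double start = omp_get_wtime();   // wall-clock seconds
    int nprimes = (*f)(n);
    double end = omp_get_wtime();

    printf("%s\n\tNumber of primes: %d \t Time taken: %fs\n\n",
           string, nprimes, end - start);
}

omp_get_wtime() is the natural choice here since OpenMP is already linked in, whereas clock() grows with the thread count because every worker thread's CPU time gets added to the same total.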