Hi, I am writing a library that does some heavy computation, and so I have been trying to speed things up with some low level parallelism. My machine runs Ubuntu 16.04, and is an old i7, but has four virtual cores.
I tried to use openMP, and while it runs error free, and appears to be creating the threads, I don't ever see any speedup. I cut my code down drastically, to this test code:
#include "timer.h"
#include <iostream>
#include <math.h>
void testPlusEQ(double* summand1,double* summand2,unsigned int size_)
{// omp simd
for (unsigned int i=0;i<size_;i++) { summand1[i] += summand2[i]; } //sqrt(summand1[i]/summand2[i]);
}
void testPlusEQ_OMP(double* summand1,double* summand2,unsigned int size_)
{// omp simd
#pragma omp simd
for (unsigned int i=0;i<size_;i++) { summand1[i] += summand2[i]; } //sqrt(summand1[i]/summand2[i]);
}
int main()
{
unsigned int size(10000000);
double* x(new double[size]), *y(new double[size]);
for(unsigned int i=0; i < size; ++i) { x[i] = 1 + ((double)i)/((double)size); y[i] = 2 + ((double)i)/((double)size); }
uint64_t t1(MathLib::GetTimeStamp());
testPlusEQ(x,y,size);
uint64_t t2(MathLib::GetTimeStamp());
testPlusEQ_OMP(x,y,size);
uint64_t t3(MathLib::GetTimeStamp());
std::cout << " ST " << t2 - t1 << " OMP " << t3 - t2 << " OMP - ST " << ((double)(t3 - t2)) - ((double)(t2 - t1)) << " OMP/ST " << ((double) (t3 - t2))/((double) (t2 - t1)) << "\n";
return 1;
}
The timer is this code, btw:
// returns time in microseconds
uint64_t MathLib::GetTimeStamp() {
struct timeval tv;
gettimeofday(&tv,0);
return tv.tv_sec*(uint64_t)1000000+tv.tv_usec;
}
int timeval_subtract (timeval* result,timeval* x,timeval* y)
// struct *result, *x, *y;
{
/* Perform the carry for the later subtraction by updating y. */
if (x->tv_usec < y->tv_usec) {
int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1;
y->tv_usec -= 1000000 * nsec;
y->tv_sec += nsec;
}
if (x->tv_usec - y->tv_usec > 1000000) {
int nsec = (x->tv_usec - y->tv_usec) / 1000000;
y->tv_usec += 1000000 * nsec;
y->tv_sec -= nsec;
}
/* Compute the time remaining to wait.
tv_usec is certainly positive. */
result->tv_sec = x->tv_sec - y->tv_sec;
result->tv_usec = x->tv_usec - y->tv_usec;
/* Return 1 if result is negative. */
return x->tv_sec < y->tv_sec;
}
I have tried many different openMP pragmas inside the openMP version of the function. I have tried heavy computations in the test code and light. I have tried a small vector
length, and very large, but the openMP version and the single threaded version give essentially the same performance. What am I missing, here?