Hi, I'm having trouble with getting my cuda program to work, apparently it is something to do with the way I call the kernel and what I'm telling it to do compare to what I want it to do.
OK, what I want it to do is (for now):
*input x*
*maths calculation using x*
*set answer to maths calculation as x*
*do this 100 times*
*output x*
Here is my code:
#include <string>
#include <cstdlib>
#include <iostream>
#include <cmath>
__global__ void runge_4(float *x){
*x = *x + 0.01;
}
using namespace std;
float x = 1;
float *gpu;
LARGE_INTEGER numTicksPerSecond;
LARGE_INTEGER startTime;
LARGE_INTEGER endTime;
int main(){
QueryPerformanceFrequency(&numTicksPerSecond);
QueryPerformanceCounter(&startTime);
cudaMalloc((void**)&gpu, sizeof(float));
cudaMemcpy(gpu,&x,sizeof(float),cudaMemcpyHostToDevice);
runge_4<<<1, 100>>>(gpu);
cudaMemcpy(&x, gpu,sizeof(float),cudaMemcpyDeviceToHost);
QueryPerformanceCounter(&endTime);
LONGLONG numTicks = endTime.QuadPart - startTime.QuadPart;
double numSeconds = (((double) numTicks) / (double) numTicksPerSecond.QuadPart);
cout << "Num Ticks Per Second : " << numTicksPerSecond.QuadPart << endl;
cout << "Start " << startTime.QuadPart << endl;
cout << "End : " << endTime.QuadPart << endl;
cout << "Num Ticks : " << numTicks << endl;
cout << "Num seconds : " << numSeconds << endl;
cout<<x<<endl;
cudaFree(gpu);
return 0;
}
The output from this code is 1.01, athough I tell the kernel to run 100 times, and changing the numbers in line 30 produce some weird results:
e.g.
runge_4<<<100, 100>>>(gpu); gives an output of 1.07
runge_4<<<1, 1000>>>(gpu); gives an output of 1