// Counting kernel: every work-item atomically bumps the single shared
// counter that `sum` points to by one, so concurrent increments are not lost.
kernel void AtomicSum(global int* sum)
{
    // atomic_inc(p) is the built-in shorthand for atomic_add(p, 1).
    atomic_inc(sum);
}
Let's test this kernel by launching 1024 x 1024 x 128 (= 134,217,728) work-items:
// Host-side driver for the AtomicSum kernel: zero a one-element device
// counter, launch the kernel over 1024*1024*128 work-items, then read the
// counter back and print it.
// NOTE(review): `context`, `program` and `queue` are defined outside this
// snippet; return codes are not checked here, as in the original.
int sum = 0;

// The counter is an int on both host and device, so size the buffer with
// sizeof(int) (the original used sizeof(float), which only happens to match).
cl::Buffer bufferSum(context, CL_MEM_READ_WRITE, sizeof(int));

// Blocking write (CL_TRUE): the device counter holds 0 before the kernel runs.
queue.enqueueWriteBuffer(bufferSum, CL_TRUE, 0, sizeof(int), &sum);

cl::Kernel kernel(program, "AtomicSum");
kernel.setArg(0, bufferSum);

// 1-D launch with no offset; cl::NullRange as the local size leaves the
// work-group size up to the OpenCL implementation.
queue.enqueueNDRangeKernel(kernel,
                           cl::NullRange,
                           cl::NDRange(1024 * 1024 * 128),
                           cl::NullRange);
queue.finish();

// Blocking read: `sum` is valid as soon as this call returns.
queue.enqueueReadBuffer(bufferSum, CL_TRUE, 0, sizeof(int), &sum);
std::cout << "Sum: " << sum << "\n";