alpaka: Race condition in CUDA blocking queue
@krzikalla provided me with a test case where we get data races between a synchronous memcopy and a kernel.

The problem is that we create the streams with `cudaStreamCreateWithFlags(..., cudaStreamNonBlocking)`. The CUDA documentation states that `cudaStreamNonBlocking` disables the blocking behavior towards the default stream 0. In alpaka we use blocking memcopy operations, e.g. `cudaMemcpy`, when the `QueueCudaRtBlocking` is used. The result is that our memcopy operations run non-blocking with respect to all enqueued kernels.
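To illustrate the difference, this is roughly what the two creation modes mean for a synchronous `cudaMemcpy` (a condensed sketch of the CUDA runtime behavior, not the alpaka code itself):

```c++
#include <cuda_runtime.h>

int main()
{
    // A stream created without flags takes part in the implicit synchronization
    // with the legacy default stream 0, so a synchronous cudaMemcpy (issued on
    // stream 0) cannot overlap with kernels enqueued into this stream.
    cudaStream_t blockingStream;
    cudaStreamCreate(&blockingStream);

    // A stream created with cudaStreamNonBlocking does not synchronize with the
    // default stream, so a synchronous cudaMemcpy can overlap with a kernel that
    // was enqueued into this stream, which is exactly the observed race.
    cudaStream_t nonBlockingStream;
    cudaStreamCreateWithFlags(&nonBlockingStream, cudaStreamNonBlocking);

    cudaStreamDestroy(nonBlockingStream);
    cudaStreamDestroy(blockingStream);
    return 0;
}
```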
This bug is also present in the latest release, 0.3.5. This means we need to do a bugfix release even though 0.4.0 will be released soon.
We have different possibilities to solve it:
- create blocking queues with `cudaStreamCreate()`
  - this will introduce the side effect that a memcopy will block all streams
- use asynchronous memcopy and do an explicit synchronization after each call (see the sketch below)
  - this will increase the host-side overhead, since an API call has a latency of ~10-16 us
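A minimal sketch of what the second option could look like for the blocking queue's copy path (plain CUDA runtime calls; the function name and signature are placeholders, not the actual alpaka implementation):

```c++
#include <cstddef>
#include <cuda_runtime.h>

// Sketch: instead of a synchronous cudaMemcpy, enqueue an asynchronous copy into
// the queue's own stream and synchronize that stream explicitly. The copy is then
// ordered with respect to all kernels in that stream without relying on the
// implicit synchronization with the default stream 0, at the cost of one extra
// API call per copy (~10-16 us host-side latency).
inline cudaError_t blockingCopyHostToDevice(
    cudaStream_t stream, void* dst, void const* src, std::size_t numBytes)
{
    cudaError_t rc = cudaMemcpyAsync(dst, src, numBytes, cudaMemcpyHostToDevice, stream);
    if(rc != cudaSuccess)
        return rc;
    return cudaStreamSynchronize(stream);
}
```

The full test case that triggers the race follows; it contains a native CUDA variant and an alpaka variant, selected via the `NATIVE_CUDA` define: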
```c++
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <utility>
#include <vector>

#include "alpaka/alpaka.hpp"
//#define NATIVE_CUDA 0
#ifdef NATIVE_CUDA
#define CUDA_ASSERT(x) if ((x) != cudaSuccess) { printf("cuda fail on line %d", __LINE__); exit(1); }
__global__ void emptyCudaKernel(int threadElementExtent)
{
assert(threadElementExtent == 1);
}
__global__ void myCudaKernel(const double* sourceData)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
// note: here we are supposed to check that i is in the array range
// but this is not what is causing the issue
if (sourceData[i] != 1.0)
printf("%u %u %u: %f\n",blockDim.x, blockIdx.x, threadIdx.x, sourceData[i]);
}
int main(int argc, char* argv[])
{
cudaStream_t cudaStream;
void * memPtr;
const size_t size = 900 * 5 * 5;
CUDA_ASSERT(cudaSetDevice(0));
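// the stream is created non-blocking, like alpaka does for its queues;
// this removes the implicit synchronization with the default stream 0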
CUDA_ASSERT(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking));
CUDA_ASSERT(cudaMalloc(&memPtr, size * sizeof(double)));
// note: here we assume size is not a multiple of 64, this is unrelated to the issue
dim3 gridDim(std::size_t(((size - 1) / 64) + 1), 1u, 1u);
dim3 blockDim(64u, 1u, 1u);
emptyCudaKernel<<<gridDim, blockDim, 0, cudaStream>>>(argc);
CUDA_ASSERT(cudaStreamSynchronize(cudaStream));
std::vector<double> sourceMemHost(size, 1.0);
CUDA_ASSERT(cudaMemcpy(memPtr, sourceMemHost.data(), size * sizeof(double), cudaMemcpyHostToDevice));
cudaStreamSynchronize(cudaStream);
myCudaKernel<<<gridDim, blockDim, 0, cudaStream>>>((double*)memPtr);
CUDA_ASSERT(cudaStreamSynchronize(cudaStream));
CUDA_ASSERT(cudaStreamDestroy(cudaStream));
return 0;
}
#else
struct MyKernel
{
template<typename Acc>
ALPAKA_FN_ACC void operator()(Acc const & acc, const double* sourceData) const
{
int i = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
// note (same as for CUDA): here we are supposed to check that i is in the array range
// but this is not what is causing the issue
if(sourceData[i] != 1.0)
printf("%u %u %u %lu\n",blockDim.x, blockIdx.x, threadIdx.x, sourceData[i]);
}
};
struct EmptyKernel
{
template<typename Acc>
ALPAKA_FN_ACC void operator()(Acc const & acc, int threadElementExtent) const
{
assert(threadElementExtent == 1);
}
};
int main(int argc, char* argv[])
{
const size_t size = 900 * 5 * 5;
using ComputeAccelerator = alpaka::acc::AccGpuCudaRt<alpaka::dim::DimInt<1>, std::size_t>;
using ComputeDevice = alpaka::dev::Dev<ComputeAccelerator>;
using ComputeStream = alpaka::stream::StreamCudaRtSync;
ComputeDevice computeDevice(alpaka::pltf::getDevByIdx<alpaka::pltf::Pltf<ComputeDevice> >(0));
ComputeStream computeStream (computeDevice);
using V = alpaka::vec::Vec<alpaka::dim::DimInt<1>, std::size_t>;
using WorkDivision = alpaka::workdiv::WorkDivMembers<alpaka::dim::DimInt<1>, std::size_t>;
WorkDivision wd(V(std::size_t(((size - 1) / 64) + 1)), V(std::size_t(64)), V(std::size_t(1)));
using HostAccelerator = alpaka::acc::AccCpuOmp2Blocks<alpaka::dim::DimInt<1>, std::size_t>;
using HostDevice = alpaka::dev::Dev<HostAccelerator>;
alpaka::vec::Vec<alpaka::dim::DimInt<1>, size_t> bufferSize (size);
using HostBufferType = decltype(
alpaka::mem::buf::alloc<double, size_t>(std::declval<HostDevice>(), bufferSize));
using HostViewType = alpaka::mem::view::ViewPlainPtr<alpaka::dev::Dev<HostBufferType>,
alpaka::elem::Elem<HostBufferType>, alpaka::dim::Dim<HostBufferType>, alpaka::size::Size<HostBufferType> >;
HostDevice hostDevice(alpaka::pltf::getDevByIdx<alpaka::pltf::Pltf<HostDevice> >(0u));
auto sourceMem = alpaka::mem::buf::alloc<double, size_t>(computeDevice, size);
alpaka::stream::enqueue(computeStream, alpaka::exec::create<ComputeAccelerator>(wd, EmptyKernel(), argc));
std::vector<double> sourceMemHost(size, 1.0);
HostViewType hostBufferView(sourceMemHost.data(), hostDevice, bufferSize);
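// on StreamCudaRtSync this copy maps to a synchronous cudaMemcpy; because the
// underlying stream is non-blocking, the copy is not ordered with the kernels
// enqueued into computeStream, so MyKernel below can observe stale data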
alpaka::mem::view::copy(computeStream, sourceMem, hostBufferView, bufferSize);
alpaka::wait::wait(computeStream);
alpaka::stream::enqueue(computeStream,
alpaka::exec::create<ComputeAccelerator>(wd, MyKernel(), alpaka::mem::view::getPtrNative(sourceMem)));
alpaka::wait::wait(computeStream);
return 0;
}
#endif
```
About this issue
- Original URL
- State: closed
- Created 5 years ago
- Reactions: 1
- Comments: 15 (14 by maintainers)
Commits related to this issue
- fix cuda stream race condition fix #850 By using `cudaStreamCreateWithFlags(..., cudaStreamNonBlocking)` we disabling the blocking behavior of `cudaMemcpy` and other blocking cuda API calls. By usin... — committed to psychocoderHPC/alpaka by psychocoderHPC 5 years ago
- add check for race between queues https://github.com/ComputationalRadiationPhysics/alpaka/issues/850 — committed to psychocoderHPC/alpaka by psychocoderHPC 5 years ago
Test code for the upcoming alpaka 0.4.0 (added all renamings):