alpaka: Race condition in CUDA blocking queue

@krzikalla provided me with a test case in which we get data races between a synchronous memcopy and a kernel.

The problem is that we create the streams with cudaStreamCreateWithFlags(..., cudaStreamNonBlocking). According to the CUDA documentation, cudaStreamNonBlocking disables the blocking behavior towards the default stream 0, i.e. there is no implicit synchronization with stream 0. In alpaka we use blocking memcopy operations, e.g. cudaMemcpy, when the QueueCudaRtBlocking is used. The result is that our memcopy operations run non-blocking with respect to all enqueued kernels.
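
The interaction can be boiled down to plain CUDA (a condensed sketch of the same failure mode as the full test case below; kernel and variable names are illustrative, error checks omitted):

#include <stdio.h>
#include <vector>

__global__ void expectOnes(const double* data, size_t size)
{
  const size_t i = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
  if (i < size && data[i] != 1.0)
    printf("stale value at index %llu\n", static_cast<unsigned long long>(i));
}

int main()
{
  const size_t size = 900 * 5 * 5;
  cudaStream_t cudaStream;
  void* memPtr;

  cudaSetDevice(0);
  // The flag removes the implicit synchronization with the default stream 0.
  cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking);
  cudaMalloc(&memPtr, size * sizeof(double));

  std::vector<double> host(size, 1.0);
  // The "blocking" copy is issued via the default stream 0; it may still be in flight ...
  cudaMemcpy(memPtr, host.data(), size * sizeof(double), cudaMemcpyHostToDevice);
  // ... when this kernel on the non-blocking stream starts reading the buffer.
  expectOnes<<<dim3((unsigned)((size - 1) / 64 + 1)), dim3(64), 0, cudaStream>>>(
    (const double*)memPtr, size);

  cudaStreamSynchronize(cudaStream);
  cudaStreamDestroy(cudaStream);
  cudaFree(memPtr);
  return 0;
}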

This bug is also present in the latest release, 0.3.5. This means we need to do a bugfix release even though 0.4.0 will be released soon.

We have several possibilities to solve it:

  • create blocking queues with cudaStreamCreate()
    • this will introduce the side effect that a memcopy will block all streams
  • use asynchronous memcopy and do an explicit synchronization after each call (see the sketch below)
    • this will increase the host-side overhead, since an API call has a latency of ~10 - 16 µs
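
A minimal sketch of option 2 (this is not the actual alpaka patch; kernel and variable names are illustrative): the copy is enqueued asynchronously on the same stream as the kernels and followed by an explicit synchronization, so the blocking-queue semantics are kept without touching the stream creation flags.

#include <stdio.h>
#include <stdlib.h>
#include <vector>

#define CUDA_ASSERT(x) if ((x) != cudaSuccess) { printf("cuda fail on line %d\n", __LINE__); exit(1); }

__global__ void expectOnes(const double* data, size_t size)
{
  const size_t i = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
  if (i < size && data[i] != 1.0)
    printf("stale value at index %llu\n", static_cast<unsigned long long>(i));
}

int main()
{
  const size_t size = 900 * 5 * 5;
  cudaStream_t cudaStream;
  void* memPtr;

  CUDA_ASSERT(cudaSetDevice(0));
  // Option 1 would instead be: CUDA_ASSERT(cudaStreamCreate(&cudaStream));
  // at the price of implicit synchronization with every other blocking stream.
  CUDA_ASSERT(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking));
  CUDA_ASSERT(cudaMalloc(&memPtr, size * sizeof(double)));

  std::vector<double> host(size, 1.0);
  // Option 2: enqueue the copy on the same stream as the kernels ...
  CUDA_ASSERT(cudaMemcpyAsync(memPtr, host.data(), size * sizeof(double),
                              cudaMemcpyHostToDevice, cudaStream));
  // ... and synchronize explicitly so the enqueue keeps its blocking semantics.
  // This extra API call is the ~10 - 16 µs host-side latency mentioned above.
  CUDA_ASSERT(cudaStreamSynchronize(cudaStream));

  expectOnes<<<dim3((unsigned)((size - 1) / 64 + 1)), dim3(64), 0, cudaStream>>>(
    (const double*)memPtr, size);
  CUDA_ASSERT(cudaStreamSynchronize(cudaStream));

  CUDA_ASSERT(cudaStreamDestroy(cudaStream));
  CUDA_ASSERT(cudaFree(memPtr));
  return 0;
}

For reference, @krzikalla's full test case (native CUDA and alpaka variants, selected via NATIVE_CUDA):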
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include "alpaka/alpaka.hpp"

//#define NATIVE_CUDA 0
#ifdef NATIVE_CUDA

#define CUDA_ASSERT(x) if ((x) != cudaSuccess) { printf("cuda fail on line %d", __LINE__); exit(1); }


__global__ void emptyCudaKernel(int threadElementExtent)
{
  assert(threadElementExtent == 1);
}

__global__ void myCudaKernel(const double* sourceData)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  // note: here we are supposed to check that i is in the array range
  // but this is not what is causing the issue
  if (sourceData[i] != 1.0)
    printf("%u %u %u: %f\n",blockDim.x, blockIdx.x, threadIdx.x, sourceData[i]);
}



int main(int argc, char* argv[])
{
  cudaStream_t cudaStream;
  void * memPtr;
  const size_t size = 900 * 5 * 5;

  CUDA_ASSERT(cudaSetDevice(0));
  CUDA_ASSERT(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking));  
  CUDA_ASSERT(cudaMalloc(&memPtr, size * sizeof(double)));

  // note: here we assume size is not a multiple of 64, this is unrelated to the issue
  dim3 gridDim(std::size_t(((size - 1) / 64) + 1), 1u, 1u);
  dim3 blockDim(64u, 1u, 1u);
  emptyCudaKernel<<<gridDim, blockDim, 0, cudaStream>>>(argc);
  CUDA_ASSERT(cudaStreamSynchronize(cudaStream));
  
  std::vector<double> sourceMemHost(size, 1.0);
  CUDA_ASSERT(cudaMemcpy(memPtr, sourceMemHost.data(), size * sizeof(double), cudaMemcpyHostToDevice));
  cudaStreamSynchronize(cudaStream);
 
  myCudaKernel<<<gridDim, blockDim, 0, cudaStream>>>((double*)memPtr);

  CUDA_ASSERT(cudaStreamSynchronize(cudaStream));
  CUDA_ASSERT(cudaStreamDestroy(cudaStream));
  return 0;
}
  

#else

struct MyKernel
{
   template<typename Acc>
   ALPAKA_FN_ACC void operator()(Acc const & acc, const double* sourceData) const
   {
     int i = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
     // note (same as for CUDA): here we are supposed to check that i is in the array range
     // but this is not what is causing the issue
     if(sourceData[i] != 1.0)
       printf("%u %u %u %lu\n",blockDim.x, blockIdx.x, threadIdx.x, sourceData[i]);
   }
};

struct EmptyKernel
{
   template<typename Acc>
   ALPAKA_FN_ACC void operator()(Acc const & acc, int threadElementExtent) const
   {
     assert(threadElementExtent == 1);
   }
};



int main(int argc, char* argv[])
{
   const size_t size = 900 * 5 * 5;

   using ComputeAccelerator = alpaka::acc::AccGpuCudaRt<alpaka::dim::DimInt<1>, std::size_t>;
   using ComputeDevice = alpaka::dev::Dev<ComputeAccelerator>;
   using ComputeStream = alpaka::stream::StreamCudaRtSync;

   ComputeDevice computeDevice(alpaka::pltf::getDevByIdx<alpaka::pltf::Pltf<ComputeDevice> >(0));
   ComputeStream computeStream (computeDevice);

   using V = alpaka::vec::Vec<alpaka::dim::DimInt<1>, std::size_t>;
   using WorkDivision = alpaka::workdiv::WorkDivMembers<alpaka::dim::DimInt<1>, std::size_t>;
   WorkDivision wd(V(std::size_t(((size - 1) / 64) + 1)), V(std::size_t(64)), V(std::size_t(1)));

   using HostAccelerator = alpaka::acc::AccCpuOmp2Blocks<alpaka::dim::DimInt<1>, std::size_t>;
   using HostDevice = alpaka::dev::Dev<HostAccelerator>;
   alpaka::vec::Vec<alpaka::dim::DimInt<1>, size_t> bufferSize (size);
   using HostBufferType = decltype(
     alpaka::mem::buf::alloc<double, size_t>(std::declval<HostDevice>(), bufferSize));
   using HostViewType = alpaka::mem::view::ViewPlainPtr<alpaka::dev::Dev<HostBufferType>,
     alpaka::elem::Elem<HostBufferType>, alpaka::dim::Dim<HostBufferType>, alpaka::size::Size<HostBufferType> >;

   HostDevice hostDevice(alpaka::pltf::getDevByIdx<alpaka::pltf::Pltf<HostDevice> >(0u));

   auto sourceMem = alpaka::mem::buf::alloc<double, size_t>(computeDevice, size);

   alpaka::stream::enqueue(computeStream, alpaka::exec::create<ComputeAccelerator>(wd, EmptyKernel(), argc));

   std::vector<double> sourceMemHost(size, 1.0);
   HostViewType hostBufferView(sourceMemHost.data(), hostDevice, bufferSize);
   alpaka::mem::view::copy(computeStream, sourceMem, hostBufferView, bufferSize);
   alpaka::wait::wait(computeStream);

   alpaka::stream::enqueue(computeStream,
     alpaka::exec::create<ComputeAccelerator>(wd, MyKernel(), alpaka::mem::view::getPtrNative(sourceMem)));

   alpaka::wait::wait(computeStream);

   return 0;
}
#endif

About this issue

  • State: closed
  • Created 5 years ago
  • Reactions: 1
  • Comments: 15 (14 by maintainers)

Most upvoted comments

Test code for the upcoming alpaka 0.4.0 (with all renamings applied):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include "alpaka/alpaka.hpp"

//#define NATIVE_CUDA 0
#ifdef NATIVE_CUDA

#define CUDA_ASSERT(x) if ((x) != cudaSuccess) { printf("cuda fail on line %d", __LINE__); exit(1); }


__global__ void emptyCudaKernel(int threadElementExtent)
{
  assert(threadElementExtent == 1);
}

__global__ void myCudaKernel(const double* sourceData)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  constexpr size_t size = 900 * 5 * 5;
  // note: here we are supposed to check that i is in the array range
  // but this is not what is causing the issue
  if (i < size && sourceData[i] != 1.0)
    printf("%u %u %u: %f\n",blockDim.x, blockIdx.x, threadIdx.x, sourceData[i]);
}



int main(int argc, char* argv[])
{
  cudaStream_t cudaStream;
  void * memPtr;
  const size_t size = 900 * 5 * 5;

  CUDA_ASSERT(cudaSetDevice(0));
  CUDA_ASSERT(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking));
  CUDA_ASSERT(cudaMalloc(&memPtr, size * sizeof(double)));

  // note: here we assume size is not a multiple of 64, this is unrelated to the issue
  dim3 gridDim(std::size_t(((size - 1) / 64) + 1), 1u, 1u);
  dim3 blockDim(64u, 1u, 1u);
  emptyCudaKernel<<<gridDim, blockDim, 0, cudaStream>>>(argc);
  CUDA_ASSERT(cudaStreamSynchronize(cudaStream));

  std::vector<double> sourceMemHost(size, 1.0);
  CUDA_ASSERT(cudaMemcpy(memPtr, sourceMemHost.data(), size * sizeof(double), cudaMemcpyHostToDevice));
  cudaStreamSynchronize(cudaStream);

  myCudaKernel<<<gridDim, blockDim, 0, cudaStream>>>((double*)memPtr);

  CUDA_ASSERT(cudaStreamSynchronize(cudaStream));
  CUDA_ASSERT(cudaStreamDestroy(cudaStream));
  return 0;
}


#else

struct MyKernel
{
   template<typename Acc>
   ALPAKA_FN_ACC void operator()(Acc const & acc, const double* sourceData) const
   {
     constexpr size_t size = 900 * 5 * 5;
     int i = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
     // note (same as for CUDA): here we are supposed to check that i is in the array range
     // but this is not what is causing the issue
     if(i < size && sourceData[i] != 1.0)
       printf("%u %u %u %lu\n",blockDim.x, blockIdx.x, threadIdx.x, sourceData[i]);
   }
};

struct EmptyKernel
{
   template<typename Acc>
   ALPAKA_FN_ACC void operator()(Acc const & acc, int threadElementExtent) const
   {
     assert(threadElementExtent == 1);
   }
};



int main(int argc, char* argv[])
{
   const size_t size = 900 * 5 * 5;

   using ComputeAccelerator = alpaka::acc::AccGpuCudaRt<alpaka::dim::DimInt<1>, std::size_t>;
   using ComputeDevice = alpaka::dev::Dev<ComputeAccelerator>;
   using ComputeStream = alpaka::queue::QueueCudaRtBlocking;

   ComputeDevice computeDevice(alpaka::pltf::getDevByIdx<alpaka::pltf::Pltf<ComputeDevice> >(0));
   ComputeStream computeStream (computeDevice);

   using V = alpaka::vec::Vec<alpaka::dim::DimInt<1>, std::size_t>;
   using WorkDivision = alpaka::workdiv::WorkDivMembers<alpaka::dim::DimInt<1>, std::size_t>;
   WorkDivision wd(V(std::size_t(((size - 1) / 64) + 1)), V(std::size_t(64)), V(std::size_t(1)));

   using HostAccelerator = alpaka::acc::AccCpuOmp2Blocks<alpaka::dim::DimInt<1>, std::size_t>;
   using HostDevice = alpaka::dev::Dev<HostAccelerator>;
   alpaka::vec::Vec<alpaka::dim::DimInt<1>, size_t> bufferSize (size);
   using HostBufferType = decltype(
     alpaka::mem::buf::alloc<double, size_t>(std::declval<HostDevice>(), bufferSize));
   using HostViewType = alpaka::mem::view::ViewPlainPtr<alpaka::dev::Dev<HostBufferType>,
     alpaka::elem::Elem<HostBufferType>, alpaka::dim::Dim<HostBufferType>, alpaka::idx::Idx<HostBufferType> >;

   HostDevice hostDevice(alpaka::pltf::getDevByIdx<alpaka::pltf::Pltf<HostDevice> >(0u));

   auto sourceMem = alpaka::mem::buf::alloc<double, size_t>(computeDevice, size);

   alpaka::queue::enqueue(computeStream, alpaka::kernel::createTaskKernel<ComputeAccelerator>(wd, EmptyKernel(), argc));

   std::vector<double> sourceMemHost(size, 1.0);
   HostViewType hostBufferView(sourceMemHost.data(), hostDevice, bufferSize);
   alpaka::mem::view::copy(computeStream, sourceMem, hostBufferView, bufferSize);
   alpaka::wait::wait(computeStream);

   alpaka::queue::enqueue(computeStream,
     alpaka::kernel::createTaskKernel<ComputeAccelerator>(wd, MyKernel(), alpaka::mem::view::getPtrNative(sourceMem)));

   alpaka::wait::wait(computeStream);

   return 0;
}
#endif