TensorRT: nvinfer1::IExecutionContext* enqueue failed, exit code -11

Description

Environment

TensorRT Version:
NVIDIA GPU: GTX 1650
NVIDIA Driver Version:
CUDA Version: 10.1
CUDNN Version:
Operating System: Ubuntu 16.04
Python Version (if applicable):
Tensorflow Version (if applicable):
PyTorch Version (if applicable): 1.7
Baremetal or Container (if so, version):

LOG_INFO("2.0");
  LOG_INFO("2.0.0");
  //   cudaStreamCreate(&stream);
  LOG_INFO("2.1");

  CUDA_CHECK(cudaMemcpyAsync(
      buffers_[0], data_,
      BATCH_SIZE * KCHANNEL * KINPUT_H * KINPUT_W * sizeof(float),
      cudaMemcpyHostToDevice, stream));
  LOG_INFO("2.2");
  LOG_INFO("2.2.1");

  context_ptr_->enqueue(BATCH_SIZE, buffers_, stream, nullptr);
  LOG_INFO("2.3");
  cudaMemcpyAsync(prob_, buffers_[1], BATCH_SIZE * KCLASS_NUM * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);
  LOG_INFO("2.4");

  cudaStreamSynchronize(stream);
  LOG_INFO("2.5");

[ INFO] [1616132115.062831248]: [20210319 13:35:15:062819] [process_rgb.cc:78] 1.1
[ INFO] [1616132115.062851257]: [20210319 13:35:15:062842] [process_rgb.cc:208] 2.0
[ INFO] [1616132115.062867492]: [20210319 13:35:15:062858] [process_rgb.cc:209] 2.0.0
[ INFO] [1616132115.062882795]: [20210319 13:35:15:062874] [process_rgb.cc:211] 2.1
[ INFO] [1616132115.062983906]: [20210319 13:35:15:062972] [process_rgb.cc:217] 2.2
[ INFO] [1616132115.063002847]: [20210319 13:35:15:062995] [process_rgb.cc:218] 2.2.1
[cotek_visual_node-2] process has died [pid 11456, exit code -11, cmd /home/zhongsy/cotek/cotek_cpp_core/devel/lib/cotek_visual/cotek_visual_node __name:=cotek_visual_node __log:=/home/zhongsy/.ros/log/defad84a-8874-11eb-a468-7cb27d5e85b2/cotek_visual_node-2.log].
log file: /home/zhongsy/.ros/log/defad84a-8874-11eb-a468-7cb27d5e85b2/cotek_visual_node-2*.log
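Exit code -11 is a SIGSEGV, and the log shows the process dies after "2.2.1" and before "2.3", i.e. inside the enqueue call. A first debugging step, sketched below assuming the TensorRT 7 implicit-batch API and the same CUDA_CHECK macro used in the snippet above, is to flush any pending asynchronous CUDA error before the call and check enqueue's boolean return value:

// Sketch only: localize the fault around enqueue.
CUDA_CHECK(cudaGetLastError());  // surface any earlier asynchronous error

bool ok = context_ptr_->enqueue(BATCH_SIZE, buffers_, stream, nullptr);
if (!ok) {
  std::cerr << "enqueue returned false" << std::endl;
}
CUDA_CHECK(cudaStreamSynchronize(stream));  // async faults are reported here

If enqueue still segfaults before returning, the usual suspects are a null or dangling context pointer, stale device pointers in the bindings array, or device buffers allocated smaller than the engine's binding sizes.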

About this issue

  • Original URL
  • State: closed
  • Created 3 years ago
  • Comments: 25

Most upvoted comments

I have similar code and it runs normally @ttyio:


// Includes and placeholders assumed so the snippet compiles standalone;
// values marked "assumption" are not shown in the original comment.
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>
#include <chrono>
#include <fstream>
#include <iostream>

static const int BATCH_SIZE = 1;               // assumption
static const char* INPUT_BLOB_NAME = "data";   // assumption
static const char* OUTPUT_BLOB_NAME = "prob";  // assumption

static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int CLASS_NUM = 2;
// static const int OUTPUT_SIZE =
//     Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) +
//     1;
static const int OUTPUT_SIZE = 2;
void doInference(nvinfer1::IExecutionContext& context, cudaStream_t& stream,
                 void** buffers, float* input, float* output, int batchSize) {
  // DMA input batch data to device, infer on the batch asynchronously, and DMA
  // output back to host
  cudaMemcpyAsync(buffers[0], input,
                  batchSize * 3 * INPUT_H * INPUT_W * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  context.enqueue(batchSize, buffers, stream, nullptr);
  cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
}
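A common cause of crashes inside enqueue is a mismatch between the sizes passed to cudaMalloc and the binding dimensions baked into the engine. A small sanity check, again assuming the TensorRT 7 implicit-batch API, is to print every binding right after deserializing the engine and compare against the allocations:

// Sketch: dump each binding's dimensions so the cudaMalloc sizes can be
// checked against what the engine actually expects.
for (int b = 0; b < engine->getNbBindings(); ++b) {
  nvinfer1::Dims d = engine->getBindingDimensions(b);
  std::cout << "binding " << b << " (" << engine->getBindingName(b) << "):";
  for (int k = 0; k < d.nbDims; ++k) std::cout << " " << d.d[k];
  std::cout << std::endl;
}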

cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
  int w, h, x, y;
  float r_w = input_w / (img.cols * 1.0);
  float r_h = input_h / (img.rows * 1.0);
  if (r_h > r_w) {
    w = input_w;
    h = r_w * img.rows;
    x = 0;
    y = (input_h - h) / 2;
  } else {
    w = r_h * img.cols;
    h = input_h;
    x = (input_w - w) / 2;
    y = 0;
  }
  cv::Mat re(h, w, CV_8UC3);
  cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
  cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
  re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
  return out;
}
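As a concrete example of the letterboxing above: for a 640x480 source image and a 224x224 network input, r_w = 224/640 = 0.35 and r_h = 224/480 ≈ 0.467; since r_h > r_w, the image is resized to 224x168 and centered with 28 rows of gray (128,128,128) padding above and below.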

int main() {
  Logger gLogger;  // user-defined nvinfer1::ILogger implementation (not shown)

  std::string engine_name = "./ghost.trt";  // load engine name
  std::cout << "hello" << std::endl;
  // start load engine
  char* trtModelStream{nullptr};
  size_t size{0};

  std::ifstream engine_file(engine_name, std::ios::binary);

  if (engine_file.good()) {
    engine_file.seekg(0,
                      engine_file.end);  // seek to the end of the stream
    size = engine_file.tellg();
    engine_file.seekg(0, engine_file.beg);
    trtModelStream = new char[size];
    engine_file.read(trtModelStream, size);
    engine_file.close();
  }

  static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
  static float prob[BATCH_SIZE * OUTPUT_SIZE];

  nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);

  nvinfer1::ICudaEngine* engine =
      runtime->deserializeCudaEngine(trtModelStream, size);

  nvinfer1::IExecutionContext* context = engine->createExecutionContext();
  delete[] trtModelStream;

  void* buffers[2];
  const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
  const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);

  // Use the indices above to create an array of device pointers to the
  // input and output buffers on the GPU.

  // Create GPU buffers on device
  cudaMalloc(&buffers[inputIndex],
             BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float));
  cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float));

  // Create stream
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  while (1) {
    auto start = std::chrono::system_clock::now();

    cv::Mat img = cv::imread("../3.jpg");
    cv::Mat pre_img = preprocess_img(img, INPUT_W, INPUT_H);

    int i = 0;
    int fcount = 0;

    // Split the BGR channels and convert to planar RGB.
    for (int row = 0; row < INPUT_H; ++row) {
      uchar* uc_pixel = pre_img.data + row * pre_img.step;
      for (int col = 0; col < INPUT_W; ++col) {
        data[fcount * 3 * INPUT_H * INPUT_W + i] =
            static_cast<float>(uc_pixel[2]) / 255.0;
        data[fcount * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] =
            static_cast<float>(uc_pixel[1]) / 255.0;
        data[fcount * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] =
            static_cast<float>(uc_pixel[0]) / 255.0;
        uc_pixel += 3;
        ++i;
      }
    }
    // Run inference
    // doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
    cudaMemcpyAsync(buffers[0], data,
                    BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    context->enqueue(BATCH_SIZE, buffers, stream, nullptr);
    cudaMemcpyAsync(prob, buffers[1], BATCH_SIZE * OUTPUT_SIZE * sizeof(float),
                    cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    auto end = std::chrono::system_clock::now();
    std::cout << "prob: " << std::endl;
    std::cout << prob[0] << " " << prob[1] << std::endl;
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end -
                                                                       start)
                     .count()
              << "ms" << std::endl;
  }
}
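The loop above never exits, so the program never releases its resources. For completeness, a minimal teardown sketch for the TensorRT 7 era API (where destroy() is the documented release call) would look like this if the loop were given an exit condition:

  // Teardown sketch: release CUDA and TensorRT resources in reverse order
  // of creation. Unreachable as written behind while (1).
  cudaStreamDestroy(stream);
  cudaFree(buffers[inputIndex]);
  cudaFree(buffers[outputIndex]);
  context->destroy();
  engine->destroy();
  runtime->destroy();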