mmdetection: RuntimeError: nms_impl: implementation for device cuda:0 not found.

When I run /usr/src/app/demo/inference_demo.ipynb, the following error is reported:

/usr/src/app/mmdet/datasets/utils.py:65: UserWarning: "ImageToTensor" pipeline is replaced by "DefaultFormatBundle" for batch inference. It is recommended to manually replace it in the test data pipeline in your config file.
  warnings.warn(
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-4-4d78d4937daf> in <module>
      1 # test a single image
      2 img = 'demo.jpg'
----> 3 result = inference_detector(model, img)

/usr/src/app/mmdet/apis/inference.py in inference_detector(model, imgs)
    145     # forward the model
    146     with torch.no_grad():
--> 147         results = model(return_loss=False, rescale=True, **data)
    148 
    149     if not is_batch:

/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1013         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1014                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1015             return forward_call(*input, **kwargs)
   1016         # Do not call functions when jit is used
   1017         full_backward_hooks, non_full_backward_hooks = [], []

/opt/conda/lib/python3.8/site-packages/mmcv/runner/fp16_utils.py in new_func(*args, **kwargs)
     96                                 'method of nn.Module')
     97             if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled):
---> 98                 return old_func(*args, **kwargs)
     99 
    100             # get the arg spec of the decorated method

/usr/src/app/mmdet/models/detectors/base.py in forward(self, img, img_metas, return_loss, **kwargs)
    172             return self.forward_train(img, img_metas, **kwargs)
    173         else:
--> 174             return self.forward_test(img, img_metas, **kwargs)
    175 
    176     def _parse_losses(self, losses):

/usr/src/app/mmdet/models/detectors/base.py in forward_test(self, imgs, img_metas, **kwargs)
    145             if 'proposals' in kwargs:
    146                 kwargs['proposals'] = kwargs['proposals'][0]
--> 147             return self.simple_test(imgs[0], img_metas[0], **kwargs)
    148         else:
    149             assert imgs[0].size(0) == 1, 'aug test does not support ' \

/usr/src/app/mmdet/models/detectors/two_stage.py in simple_test(self, img, img_metas, proposals, rescale)
    177         x = self.extract_feat(img)
    178         if proposals is None:
--> 179             proposal_list = self.rpn_head.simple_test_rpn(x, img_metas)
    180         else:
    181             proposal_list = proposals

/usr/src/app/mmdet/models/dense_heads/dense_test_mixins.py in simple_test_rpn(self, x, img_metas)
    128         """
    129         rpn_outs = self(x)
--> 130         proposal_list = self.get_bboxes(*rpn_outs, img_metas=img_metas)
    131         return proposal_list
    132 

/opt/conda/lib/python3.8/site-packages/mmcv/runner/fp16_utils.py in new_func(*args, **kwargs)
    184                                 'method of nn.Module')
    185             if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled):
--> 186                 return old_func(*args, **kwargs)
    187             # get the arg spec of the decorated method
    188             args_info = getfullargspec(old_func)

/usr/src/app/mmdet/models/dense_heads/base_dense_head.py in get_bboxes(self, cls_scores, bbox_preds, score_factors, img_metas, cfg, rescale, with_nms, **kwargs)
     91                 score_factor_list = [None for _ in range(num_levels)]
     92 
---> 93             results = self._get_bboxes_single(cls_score_list, bbox_pred_list,
     94                                               score_factor_list, mlvl_priors,
     95                                               img_meta, cfg, rescale, with_nms,

/usr/src/app/mmdet/models/dense_heads/rpn_head.py in _get_bboxes_single(self, cls_score_list, bbox_pred_list, score_factor_list, mlvl_anchors, img_meta, cfg, rescale, with_nms, **kwargs)
    183                                 dtype=torch.long))
    184 
--> 185         return self._bbox_post_process(mlvl_scores, mlvl_bbox_preds,
    186                                        mlvl_valid_anchors, level_ids, cfg,
    187                                        img_shape)

/usr/src/app/mmdet/models/dense_heads/rpn_head.py in _bbox_post_process(self, mlvl_scores, mlvl_bboxes, mlvl_valid_anchors, level_ids, cfg, img_shape, **kwargs)
    230 
    231         if proposals.numel() > 0:
--> 232             dets, _ = batched_nms(proposals, scores, ids, cfg.nms)
    233         else:
    234             return proposals.new_zeros(0, 5)

/opt/conda/lib/python3.8/site-packages/mmcv/ops/nms.py in batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic)
    305     # Won't split to multiple nms nodes when exporting to onnx
    306     if boxes_for_nms.shape[0] < split_thr or torch.onnx.is_in_onnx_export():
--> 307         dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_)
    308         boxes = boxes[keep]
    309         # -1 indexing works abnormal in TensorRT

/opt/conda/lib/python3.8/site-packages/mmcv/utils/misc.py in new_func(*args, **kwargs)
    338 
    339             # apply converted arguments to the decorated method
--> 340             output = old_func(*args, **kwargs)
    341             return output
    342 

/opt/conda/lib/python3.8/site-packages/mmcv/ops/nms.py in nms(boxes, scores, iou_threshold, offset, score_threshold, max_num)
    169         inds = ext_module.nms(*indata_list, **indata_dict)
    170     else:
--> 171         inds = NMSop.apply(boxes, scores, iou_threshold, offset,
    172                            score_threshold, max_num)
    173     dets = torch.cat((boxes[inds], scores[inds].reshape(-1, 1)), dim=1)

/opt/conda/lib/python3.8/site-packages/mmcv/ops/nms.py in forward(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num)
     24                 valid_mask, as_tuple=False).squeeze(dim=1)
     25 
---> 26         inds = ext_module.nms(
     27             bboxes, scores, iou_threshold=float(iou_threshold), offset=offset)
     28 

RuntimeError: nms_impl: implementation for device cuda:0 not found.

My environment:

sys.platform: linux
Python: 3.8.8 (default, Feb 24 2021, 21:46:12) [GCC 7.3.0]
CUDA available: True
GPU 0: NVIDIA GeForce RTX 3090
CUDA_HOME: /usr/local/cuda
NVCC: Build cuda_11.3.r11.3/compiler.29745058_0
GCC: gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
PyTorch: 1.9.0a0+2ecb2c7
PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201402
  - Intel(R) Math Kernel Library Version 2019.0.4 Product Build 20190411 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v1.8.0 (Git Hash N/A)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 11.3
  - NVCC architecture flags: -gencode;arch=compute_52,code=sm_52;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_86,code=compute_86
  - CuDNN 8.2
  - Magma 2.5.2
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.3, CUDNN_VERSION=8.2.0, CXX_COMPILER=/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, FORCE_FALLBACK_CUDA_MPI=1, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.9.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=ON, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, 

TorchVision: 0.9.0a0
OpenCV: 3.4.11
MMCV: 1.4.0
MMCV Compiler: GCC 9.3
MMCV CUDA Compiler: not available
MMDetection: 2.19.0+f3817df

About this issue

  • State: closed
  • Created 3 years ago
  • Comments: 18

Most upvoted comments

Run python mmdet/utils/collect_env.py
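
Besides collect_env, a quick way to confirm whether the installed mmcv actually ships compiled CUDA ops (the "MMCV CUDA Compiler: not available" line in the report above is the telltale sign) is a small check like the one below. This is a minimal sketch that assumes mmcv-full (1.x) or an ops-enabled mmcv (2.x) is installed; the two helper functions are exposed by mmcv.ops:

import torch
from mmcv.ops import get_compiling_cuda_version, get_compiler_version

# When mmcv was built without CUDA support, the compiling CUDA version
# typically reports "not available", which is the same condition that later
# triggers "nms_impl: implementation for device cuda:0 not found".
print('PyTorch sees CUDA:', torch.cuda.is_available())
print('MMCV compiler:', get_compiler_version())
print('MMCV compiled with CUDA:', get_compiling_cuda_version())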

In Docker containers, we need to set the following environment variables before running pip install:

ENV FORCE_CUDA="1"
ENV MMCV_WITH_OPS=1

This solved the problem for me

@EkAugust Run python mmdet/utils/collect_env.py

Hello, did you solve this problem? I also ran into it, and I am sure I installed mmcv-full, but it still does not work. This is my environment:

sys.platform: linux
Python: 3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:21) [GCC 9.4.0]
CUDA available: True
GPU 0: NVIDIA GeForce RTX 2080 Ti
CUDA_HOME: None
GCC: gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-44)
PyTorch: 1.10.2
PyTorch compiling details: PyTorch built with:

  • GCC 7.3
  • C++ Version: 201402
  • Intel® oneAPI Math Kernel Library Version 2022.0-Product Build 20211112 for Intel® 64 architecture applications
  • Intel® MKL-DNN v2.2.3 (Git Hash 7336ca9f055cf1bfa13efb658fe15dc9b41f0740)
  • OpenMP 201511 (a.k.a. OpenMP 4.5)
  • LAPACK is enabled (usually provided by MKL)
  • NNPACK is enabled
  • CPU capability usage: AVX512
  • CUDA Runtime 10.2
  • NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_37,code=compute_37
  • CuDNN 7.6.5
  • Magma 2.5.2
  • Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=10.2, CUDNN_VERSION=7.6.5, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.10.2, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON,

TorchVision: 0.11.3
OpenCV: 4.5.5
MMCV: 1.4.5
MMCV Compiler: GCC 7.3
MMCV CUDA Compiler: not available
MMDetection: 2.19.0+f08548b

Hi, I followed everything from here. Here is the output of collect_env:

sys.platform: linux
Python: 3.11.8 | packaged by conda-forge | (main, Feb 16 2024, 20:53:32) [GCC 12.3.0]
CUDA available: True
MUSA available: False
numpy_random_seed: 2147483648
GPU 0: NVIDIA A100-SXM4-40GB
CUDA_HOME: None
GCC: gcc (GCC) 8.5.0 20210514 (Red Hat 8.5.0-10)
PyTorch: 2.2.1+cu121
PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.3.2 (Git Hash 2dc95a2ad0841e29db8b22fbccaf3e5da7992b01)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 12.1
  - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90
  - CuDNN 8.9.2
  - Magma 2.6.1
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=12.1, CUDNN_VERSION=8.9.2, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=pedantic -Wno-error=old-style-cast -Wno-missing-braces -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=2.2.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, 

TorchVision: 0.17.1+cu121
OpenCV: 4.9.0
MMEngine: 0.10.3
MMDetection: 3.3.0+cfd5d3a

I don’t know how to solve the issue…

mmcv-full

Thank you jshilong

  1. I installed mmcv with mim install "mmcv>=2.0.0"
  2. uninstalled mmcv with mim uninstall mmcv
  3. installed mmcv-full with mim install mmcv-full
  4. and installed mmcv again with mim install mmcv

If you get the following error:

AssertionError: MMCV==1.7.2 is used but incompatible. Please install mmcv>=2.0.0rc4, <2.2.0.

Then uninstall both mmcv and mmcv-full, then install them both without specifying the mmcv version.
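
For reference, this compatibility check runs when mmdet is imported (it asserts that the installed mmcv falls inside its supported range), so simply importing both packages shows which versions you actually ended up with. A minimal sketch, assuming both mmcv and mmdet are installed:

# Importing mmdet triggers its mmcv version assertion, so this either prints
# the installed versions or reproduces the AssertionError shown above.
import mmcv
import mmdet

print('mmcv:', mmcv.__version__)
print('mmdet:', mmdet.__version__)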

@aRibra The official MMCV v2.0.0 was released on April 6, 2023. Version 2.x removed the components related to the training process and added a data transformation module. Also, starting from 2.x, the package mmcv was renamed to mmcv-lite and mmcv-full was renamed to mmcv. For details, see the Compatibility Documentation.
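
Because of this renaming, the same pip install mmcv command can give you either the lite build or the ops-enabled build depending on which version and instructions you followed. A quick way to tell whether the installed package includes the compiled ops at all is to try importing them. This is a minimal sketch; note that it only detects a completely missing ops extension, not a CPU-only build (a CPU-only build still imports fine and then fails on CUDA tensors exactly as in the traceback above):

# The lite build (mmcv-lite in 2.x, plain mmcv in 1.x) ships no compiled ops,
# so this import only succeeds when an ops-enabled build is installed.
try:
    from mmcv.ops import nms  # noqa: F401
    print('compiled mmcv ops found')
except ImportError as err:
    print('compiled mmcv ops missing:', err)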

Any updates on that? I'm getting the same issue when running @rayryeng's snippet, with this environment:

➜  mmdetection3d git:(main) ✗ python mmdet3d/utils/collect_env.py                                                                                                                                                 
sys.platform: linux
Python: 3.8.18 (default, Sep 11 2023, 13:40:15) [GCC 11.2.0]
CUDA available: True
numpy_random_seed: 2147483648
GPU 0: NVIDIA GeForce RTX 2080 Ti
CUDA_HOME: /usr/local/cuda-12.1
NVCC: Cuda compilation tools, release 12.1, V12.1.105
GCC: gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
PyTorch: 2.1.1
PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.1.1 (Git Hash 64f6bcbcbab628e96f33a62c3e975f8535a7bde4)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 12.1
  - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90
  - CuDNN 8.9.2
  - Magma 2.6.1
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=12.1, CUDNN_VERSION=8.9.2, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=old-style-cast -Wno-invalid-partial-specialization -Wno-unused-private-field -Wno-aligned-allocation-unavailable -Wno-missing-braces -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.1.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, 

TorchVision: 0.16.1
OpenCV: 4.8.1
MMEngine: 0.10.0
MMDetection: 3.2.0
MMDetection3D: 1.3.0+5c0613b
spconv2.0: True

I have the same problem when using mmcv==2.1.0 (installed via mim) with PyTorch 2.1.0 and CUDA 12.1 in a Dockerfile. If I install the mmcv package from source (mim install "git+https://github.com/open-mmlab/mmcv.git@6299bc02bde35f96e0b57a6cc94ed0fda177c478") everything works fine.

Setting these ENV variables in Dockerfile doesn’t help:

ENV FORCE_CUDA="1"
ENV MMCV_WITH_OPS=1

Example:

from mmcv.ops import batched_nms
import torch


def check_mmcv():
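    # Build a tiny set of random boxes/scores directly on the GPU and run
    # mmcv's batched_nms; this raises "nms_impl: implementation for device
    # cuda:0 not found" when the installed mmcv has no CUDA ops.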

    device = torch.device('cuda:0')

    bboxes = torch.randn(2, 4, device=device)
    scores = torch.randn(2, device=device)
    labels = torch.zeros(2, dtype=torch.long, device=device)
    det_bboxes, keep_idxs = batched_nms(bboxes.to(torch.float32), scores.to(torch.float32), labels, {
        'type': 'nms',
        'iou_threshold': 0.6
    })

    print('OK.')


if __name__ == '__main__':
    check_mmcv()

Error:

Traceback (most recent call last):
  File "/app/test.py", line 21, in <module>
    check_mmcv()
  File "/app/test.py", line 12, in check_mmcv
    det_bboxes, keep_idxs = batched_nms(bboxes.to(torch.float32), scores.to(torch.float32), labels, {
  File "/usr/local/lib/python3.10/dist-packages/mmcv/ops/nms.py", line 303, in batched_nms
    dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_)
  File "/usr/local/lib/python3.10/dist-packages/mmengine/utils/misc.py", line 395, in new_func
    output = old_func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/mmcv/ops/nms.py", line 127, in nms
    inds = NMSop.apply(boxes, scores, iou_threshold, offset, score_threshold,
  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 539, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/usr/local/lib/python3.10/dist-packages/mmcv/ops/nms.py", line 27, in forward
    inds = ext_module.nms(
RuntimeError: nms_impl: implementation for device cuda:0 not found.

Same results for me. I am installing mmcv with the aforementioned fix and I still get this error. In my Dockerfile:

ENV FORCE_CUDA="1"
ENV MMCV_WITH_OPS=1
RUN python -m pip install -U openmim
RUN python -m pip install 'git+https://github.com/cocodataset/panopticapi.git'
RUN mim install mmengine
RUN mim install "mmcv>=2.0.0"
RUN mim install mmdet

> I have the same problem when using mmcv==2.1.0 (installed via mim) with PyTorch 2.1.0 and CUDA 12.1 in a Dockerfile. If I install the mmcv package from source (mim install "git+https://github.com/open-mmlab/mmcv.git@6299bc02bde35f96e0b57a6cc94ed0fda177c478") everything works fine. […]

How do you output the mmcv environment like yours?
