dgl: dgl._ffi.base.DGLError: [13:15:35] /opt/dgl/src/array/cuda/spmm.cu:213: Check failed: e == CUSPARSE_STATUS_SUCCESS: CUSPARSE ERROR: 1

πŸ› Bug

To Reproduce

I ran the tutorial code below, but it fails with an error.

import dgl.data
import torch.nn.functional as F
from dgl.dataloading import GraphDataLoader
from dgl.nn import GraphConv
import torch.nn as nn
import torch


class Classifier(nn.Module):
    """Graph classifier: two GraphConv layers, an average readout, a linear head."""

    def __init__(self, in_dim, hidden_dim, n_classes):
        """Build the layers.

        Args:
            in_dim: size of the input node features fed to the first GraphConv.
            hidden_dim: output size of both GraphConv layers.
            n_classes: output size of the final linear layer.
        """
        super().__init__()
        self.conv1 = GraphConv(in_dim, hidden_dim)
        self.conv2 = GraphConv(hidden_dim, hidden_dim)
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g, h):
        """Return the linear head's output for graph ``g`` with node features ``h``."""
        # Run both graph convolutions in order, each followed by a ReLU.
        for conv in (self.conv1, self.conv2):
            h = F.relu(conv(g, h))
        # The node features are written to g.ndata['h'] only inside
        # g.local_scope(), and the readout is taken there as well.
        with g.local_scope():
            g.ndata['h'] = h
            # Graph representation = average of the node features.
            pooled = dgl.mean_nodes(g, 'h')
            return self.classify(pooled)


if __name__ == '__main__':
    # Reproduction script: trains Classifier on the MUTAG dataset on cuda:0.
    dataset = dgl.data.GINDataset('MUTAG', False)
    device = torch.device('cuda:0')
    model = Classifier(7, 20, 5).to(device)
    dataloader = GraphDataLoader(dataset,
                                 batch_size=1024,
                                 drop_last=False,
                                 shuffle=True)
    opt = torch.optim.Adam(model.parameters())
    for epoch in range(20):
        for batched_graph, labels in dataloader:
            # Move the node features and the batched graph to the target device.
            feats = batched_graph.ndata['attr'].to(device)
            batched_graph = batched_graph.to(device)
            # The logits are produced on `device`, so the labels must be moved
            # there too; otherwise cross_entropy receives tensors on two
            # different devices and raises.
            labels = labels.to(device)
            logits = model(batched_graph, feats)
            loss = F.cross_entropy(logits, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()

Errors:

Using backend: pytorch
Traceback (most recent call last):
  File "/home/zhuangxiang/code/test.py", line 42, in <module>
    logits = model(batched_graph, feats)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/zhuangxiang/code/test.py", line 19, in forward
    h = F.relu(self.conv1(g, h))
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/nn/pytorch/conv/graphconv.py", line 423, in forward
    graph.update_all(aggregate_fn, fn.sum(msg='m', out='h'))
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/heterograph.py", line 4686, in update_all
    ndata = core.message_passing(g, message_func, reduce_func, apply_node_func)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/core.py", line 283, in message_passing
    ndata = invoke_gspmm(g, mfunc, rfunc)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/core.py", line 258, in invoke_gspmm
    z = op(graph, x)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/ops/spmm.py", line 170, in func
    return gspmm(g, 'copy_lhs', reduce_op, x, None)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/ops/spmm.py", line 62, in gspmm
    ret = gspmm_internal(g._graph, op,
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/backend/pytorch/sparse.py", line 307, in gspmm
    return GSpMM.apply(gidx, op, reduce_op, lhs_data, rhs_data)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py", line 213, in decorate_fwd
    return fwd(*args, **kwargs)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/backend/pytorch/sparse.py", line 87, in forward
    out, (argX, argY) = _gspmm(gidx, op, reduce_op, X, Y)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/sparse.py", line 157, in _gspmm
    _CAPI_DGLKernelSpMM(gidx, op, reduce_op,
  File "dgl/_ffi/_cython/./function.pxi", line 287, in dgl._ffi._cy3.core.FunctionBase.__call__
  File "dgl/_ffi/_cython/./function.pxi", line 232, in dgl._ffi._cy3.core.FuncCall
  File "dgl/_ffi/_cython/./base.pxi", line 155, in dgl._ffi._cy3.core.CALL
dgl._ffi.base.DGLError: [13:15:35] /opt/dgl/src/array/cuda/spmm.cu:213: Check failed: e == CUSPARSE_STATUS_SUCCESS: CUSPARSE ERROR: 1
Stack trace:
  [bt] (0) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x4f) [0x7fb378ce90ff]
  [bt] (1) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(void dgl::aten::cusparse::CusparseCsrmm2<float, long>(DLContext const&, dgl::aten::CSRMatrix const&, float const*, float const*, float*, int)+0x762) [0x7fb3798dd852]
  [bt] (2) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(void dgl::aten::SpMMCsr<2, long, 32>(std::string const&, std::string const&, dgl::BcastOff const&, dgl::aten::CSRMatrix const&, dgl::runtime::NDArray, dgl::runtime::NDArray, dgl::runtime::NDArray, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> >)+0xdc) [0x7fb3799268ac]
  [bt] (3) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(dgl::aten::SpMM(std::string const&, std::string const&, std::shared_ptr<dgl::BaseHeteroGraph>, dgl::runtime::NDArray, dgl::runtime::NDArray, dgl::runtime::NDArray, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> >)+0x2633) [0x7fb378e3af53]
  [bt] (4) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(+0x6a7e5c) [0x7fb378e45e5c]
  [bt] (5) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(+0x6a85a1) [0x7fb378e465a1]
  [bt] (6) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(DGLFuncCall+0x48) [0x7fb3793d5a98]
  [bt] (7) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/_ffi/_cy3/core.cpython-38-x86_64-linux-gnu.so(+0x15d3e) [0x7fb35e7e3d3e]
  [bt] (8) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/_ffi/_cy3/core.cpython-38-x86_64-linux-gnu.so(+0x1626b) [0x7fb35e7e426b]

The above code runs correctly on the CPU but fails on the GPU.

Environment

  • DGL Version (e.g., 1.0): 0.6.0.post1
  • Backend Library & Version (e.g., PyTorch 0.4.1, MXNet/Gluon 1.3): Pytorch 1.7.0
  • OS (e.g., Linux): Linux
  • How you installed DGL (conda, pip, source): pip
  • Python version: 3.8.5
  • CUDA/cuDNN version (if applicable): 11.0
  • GPU models and configuration (e.g. V100): GeForce RTX 3090

Additional context

About this issue

  • Original URL
  • State: closed
  • Created 3 years ago
  • Comments: 17 (1 by maintainers)

Most upvoted comments

This works for me. Thanks! Anyone who meets the errors on GPU 4090 could try this solution.

I also tried to get this example code working, this is what worked for me.

After a lot of back and forth trying to match Python, PyTorch, and CUDA versions [1], the following steps worked for me. (It's easier to start with a new environment, because existing packages can cause a lot of conflicts.)

[1] - https://www.dgl.ai/pages/start.html

## Create new environment, use arbitrary name "myenv" that you prefer
conda create -n myenv python=3.11

## Activate environment
source activate myenv

## Install pytorch 2.2 
conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=12.1 -c pytorch -c nvidia

## Install dgl which matches pytorch 2.2 and cuda 12.1 
conda install -c dglteam/label/cu121 dgl

## Add environment to jupyter kernel
conda install -c anaconda ipykernel -y
python -m ipykernel install --user --name=myenv

# install remaining things that dgl needs
pip install torchdata
pip install pandas
pip install pyyaml
pip install pydantic

I tried another example, and the same error occurred.

import torch.nn.functional as F
import dgl
from dgl.nn import GraphConv
import torch.nn as nn
import torch
class Classifier(nn.Module):
    """Single GraphConv layer followed by a ReLU."""

    def __init__(self, in_dim, out_dim):
        """Create one GraphConv mapping ``in_dim`` features to ``out_dim``."""
        super().__init__()
        self.conv1 = GraphConv(in_dim, out_dim)

    def forward(self, g, h):
        """Return ReLU(GraphConv(g, h))."""
        conv_out = self.conv1(g, h)
        return F.relu(conv_out)
# Minimal reproduction: a 3-edge graph plus self-loops, placed on cuda:0,
# pushed through one GraphConv layer.
device = torch.device('cuda:0')
src_ids = torch.tensor([2, 3, 4])
dst_ids = torch.tensor([1, 2, 3])
# Build the graph, move it to the device, then add self-loops.
g = dgl.add_self_loop(dgl.graph((src_ids, dst_ids)).to(device))
# 5 rows of 100 random features, matching Classifier(100, 20) below.
x = torch.randn((5, 100)).to(device)
model = Classifier(100, 20).to(device)
model(g, x)

I also tried to get this example code working, this is what worked for me.

After a lot of back and forth trying to match Python, PyTorch, and CUDA versions [1], the following steps worked for me. (It's easier to start with a new environment, because existing packages can cause a lot of conflicts.)

[1] - https://www.dgl.ai/pages/start.html

## Create new environment, use arbitrary name "myenv" that you prefer
conda create -n myenv python=3.11

## Activate environment
source activate myenv

## Install pytorch 2.2 
conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=12.1 -c pytorch -c nvidia

## Install dgl which matches pytorch 2.2 and cuda 12.1 
conda install -c dglteam/label/cu121 dgl

## Add environment to jupyter kernel
conda install -c anaconda ipykernel -y
python -m ipykernel install --user --name=myenv

# install remaining things that dgl needs
pip install torchdata
pip install pandas
pip install pyyaml
pip install pydantic

I tried another example, and the same error occurred.

import torch.nn.functional as F
import dgl
from dgl.nn import GraphConv
import torch.nn as nn
import torch
class Classifier(nn.Module):
    """Single GraphConv layer followed by a ReLU."""

    def __init__(self, in_dim, out_dim):
        """Create one GraphConv mapping ``in_dim`` features to ``out_dim``."""
        super().__init__()
        self.conv1 = GraphConv(in_dim, out_dim)

    def forward(self, g, h):
        """Return ReLU(GraphConv(g, h))."""
        conv_out = self.conv1(g, h)
        return F.relu(conv_out)
# Minimal reproduction: a 3-edge graph plus self-loops, placed on cuda:0,
# pushed through one GraphConv layer.
device = torch.device('cuda:0')
src_ids = torch.tensor([2, 3, 4])
dst_ids = torch.tensor([1, 2, 3])
# Build the graph, move it to the device, then add self-loops.
g = dgl.add_self_loop(dgl.graph((src_ids, dst_ids)).to(device))
# 5 rows of 100 random features, matching Classifier(100, 20) below.
x = torch.randn((5, 100)).to(device)
model = Classifier(100, 20).to(device)
model(g, x)

I tried another example, and the same error occurred.

import torch.nn.functional as F
# `dgl` itself must be imported: the script below calls dgl.graph and
# dgl.add_self_loop, which would otherwise fail with NameError.
import dgl
from dgl.nn import GraphConv
import torch.nn as nn
import torch
class Classifier(nn.Module):
    """Single GraphConv layer followed by a ReLU."""
    def __init__(self, in_dim, out_dim):
        """Create one GraphConv mapping ``in_dim`` features to ``out_dim``."""
        super(Classifier, self).__init__()
        self.conv1 = GraphConv(in_dim, out_dim,)
    def forward(self, g, h):
        """Return ReLU(GraphConv(g, h))."""
        # Apply graph convolution and activation.
        h = F.relu(self.conv1(g, h))
        return h
# Minimal reproduction: a 3-edge graph plus self-loops on cuda:0.
src_ids = torch.tensor([2, 3, 4])
dst_ids = torch.tensor([1, 2, 3])
device = torch.device('cuda:0')
g = dgl.graph((src_ids, dst_ids)).to(device)
g = dgl.add_self_loop(g)
# 5 rows of 100 random features, matching Classifier(100, 20) below.
x = torch.randn((5, 100)).to(device)
model = Classifier(100, 20).to(device)
model(g, x)