pytorch-lightning: Changing errors when running the same code
🐛 Bug
I do not know how to describe this. When I run the code below, the process crashes after 2 epochs of training, and the error is different every time. I’ve hit:
- `AttributeError: 'weakref' object has no attribute 'grad_fn'`
- `RuntimeError: cannot call get_autograd_meta() on undefined tensor`
- `AttributeError: 'FrameSummary' object has no attribute 'grad_fn'`
- `RuntimeError: Can't detach views in-place. Use detach() instead. If you are using DistributedDataParallel (DDP) for training, and gradient_as_bucket_view is set as True, gradients are views of DDP buckets, and hence detach_() cannot be called on these gradients. To fix this error, please refer to the Optimizer.zero_grad() function in torch/optim/optimizer.py as the solution.`
Every time, the process dies, so I can’t debug.
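For reference, the last error's own message points at `Optimizer.zero_grad()` as the workaround. A minimal sketch of what that would look like in a LightningModule (assuming the `optimizer_zero_grad` hook and PyTorch's `set_to_none` argument; note that in the reproduction below DDP is actually commented out):

```python
import pytorch_lightning as pl


class TrainSupervisedNetSystem(pl.LightningModule):
    # ... rest of the module as in the reproduction below ...

    def optimizer_zero_grad(self, epoch, batch_idx, optimizer, optimizer_idx):
        # set_to_none=True replaces .grad with None instead of calling
        # detach_()/zero_() on it, which is the workaround the DDP
        # gradient_as_bucket_view error message refers to.
        optimizer.zero_grad(set_to_none=True)
```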
To Reproduce
import os
from typing import Tuple

import pl_bolts
import pl_bolts.callbacks
import pytorch_lightning as pl
import pytorch_lightning.callbacks
import pytorch_lightning.loggers
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
import torchvision.datasets
import torchvision.transforms
from torchmetrics.functional import accuracy
class MLPReadout(pl.LightningModule):
    def __init__(self,
                 sizes: Tuple[int, ...],
                 use_bias: bool = True,
                 act=nn.LeakyReLU,
                 encoder: nn.Module = None,
                 train_only_readout: bool = True, ):
        super(MLPReadout, self).__init__()
        self.encoder = encoder
        self.train_only_readout = train_only_readout
        self.act_callable = act()
        if self.train_only_readout:
            if self.encoder is not None:
                # Freeze the encoder so only the readout head is trained.
                self.encoder.requires_grad_(False)
        mlp_layers = []
        for i in range(len(sizes) - 1):
            mlp_layers.append(nn.Linear(sizes[i], sizes[i + 1], bias=use_bias))
            if i < len(sizes) - 2:
                mlp_layers.append(act())
        self.readout = nn.Sequential(*mlp_layers)

    def forward(self,
                x: torch.Tensor,
                ) -> torch.Tensor:
        if self.encoder is not None:
            # Need to explicitly call forward because otherwise SwAV doesn't call its head
            x = self.encoder.forward(x)
            if isinstance(x, list):
                x = x[-1]
            x = self.act_callable(x)
        return self.readout(x)
class LinearReadout(MLPReadout):
    def __init__(self,
                 dim_out: int,
                 dim_in: int,
                 encoder: nn.Module = None,
                 train_only_readout: bool = True,
                 use_bias: bool = True,
                 ):
        super(LinearReadout, self).__init__(
            sizes=(dim_in, dim_out),
            encoder=encoder,
            train_only_readout=train_only_readout,
            use_bias=use_bias,
        )
class CIFAR10CNN(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(3, 6, 5)
        self.pool = torch.nn.MaxPool2d(2, 2)
        self.conv2 = torch.nn.Conv2d(6, 16, 5)
        self.fc1 = torch.nn.Linear(16 * 5 * 5, 120)
        self.fc2 = torch.nn.Linear(120, 84)

    def forward(self, x0):
        x1 = self.pool(F.relu(self.conv1(x0)))
        x2 = self.pool(F.relu(self.conv2(x1)))
        x2 = torch.flatten(x2, 1)  # flatten all dimensions except batch
        x3 = F.leaky_relu(self.fc1(x2))
        x4 = F.leaky_relu(self.fc2(x3))
        # Return all intermediate activations; downstream code uses the last one.
        activations = [x0, x1, x2, x3, x4]
        return activations
class TrainSupervisedNetSystem(pl.LightningModule):
    def __init__(self,
                 net: pl.LightningModule = None,
                 system_prefix: str = 'train_supervised_net'):
        super().__init__()
        if net is None:
            net = CIFAR10CNN()
        self.net = net
        self.system_prefix = system_prefix

    def training_step(self, batch, batch_idx):
        loss, acc1, acc5 = self._shared_batch_step(
            batch=batch, batch_idx=batch_idx)
        self.log(f'{self.system_prefix}/train/loss', loss)
        self.log(f'{self.system_prefix}/train/acc1', acc1)
        self.log(f'{self.system_prefix}/train/acc5', acc5)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, acc1, acc5 = self._shared_batch_step(
            batch=batch, batch_idx=batch_idx)
        self.log(f'{self.system_prefix}/val/loss', loss)
        self.log(f'{self.system_prefix}/val/acc1', acc1)
        self.log(f'{self.system_prefix}/val/acc5', acc5)

    def _shared_batch_step(self, batch, batch_idx):
        x, y = batch
        model_outputs = self.net(x)
        if isinstance(model_outputs, list):
            model_predictions = model_outputs[-1]
        else:
            model_predictions = model_outputs
        loss = F.cross_entropy(input=model_predictions, target=y)
        acc1 = accuracy(preds=model_predictions, target=y, top_k=1)
        acc5 = accuracy(preds=model_predictions, target=y, top_k=5)
        return loss, acc1, acc5

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
config = {
    'batch_size': 256,
    'learning_rate': 1e-3,
}

path_prefix_str = '/data3/rschaef'
datasets_path_str = os.path.join(path_prefix_str, 'datasets')
run_path_str = os.path.join(
    path_prefix_str,
    'CoCoLab-Pretrained-Representation-Distillation/04_all_three_stages/bs={}_lr={}'.format(
        config['batch_size'],
        config['learning_rate']
    ))

base_transforms = [
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
train_transforms = torchvision.transforms.Compose(
    [torchvision.transforms.RandomHorizontalFlip()] + base_transforms)
test_transforms = torchvision.transforms.Compose(
    base_transforms)

train_dataset = torchvision.datasets.CIFAR10(root=datasets_path_str, train=True,
                                             download=True, transform=train_transforms)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=config['batch_size'],
                                               shuffle=True, num_workers=8)
test_dataset = torchvision.datasets.CIFAR10(root=datasets_path_str, train=False,
                                            download=True, transform=test_transforms)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=config['batch_size'],
                                              shuffle=False, num_workers=8)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
# teacher_checkpoint = '/data3/rschaef/CoCoLab-Pretrained-Representation-Distillation/02_train_supervised_cnn/epoch=18-step=3723.ckpt'
# if os.path.isfile(teacher_checkpoint):
#     teacher_encoder_net = CIFAR10CNN.load_from_checkpoint(teacher_checkpoint)
# else:
teacher_encoder_net = CIFAR10CNN()
teacher_classification_net = LinearReadout(
    encoder=teacher_encoder_net,
    dim_out=10,
    dim_in=84,
    train_only_readout=False,
)
wandb_logger = pytorch_lightning.loggers.WandbLogger(
    project='prd-04_all_three_stages',
    log_model='all',
    config=config)

train_supervised_teacher_sys = TrainSupervisedNetSystem(
    net=teacher_classification_net,
    system_prefix='train_supervised_teacher')

systems = [
    train_supervised_teacher_sys,
    # distill_teacher_into_student_sys,
    # train_supervised_student_sys,
]

for system in systems:
    callbacks = [
        pytorch_lightning.callbacks.ModelCheckpoint(
            monitor=f"{system.system_prefix}/val/loss",
            mode='min',
            dirpath=run_path_str)
    ]
    trainer = pl.Trainer(
        default_root_dir=run_path_str,
        # gpus=4,
        # strategy='ddp',
        logger=wandb_logger,
        callbacks=callbacks,
        max_epochs=25,
        log_every_n_steps=25,
        check_val_every_n_epoch=5,
        # fast_dev_run=True,
        # overfit_batches=10,
        # limit_train_batches=10,
        detect_anomaly=True,
    )
    trainer.fit(system,
                train_dataloaders=train_dataloader,
                val_dataloaders=test_dataloader)
Environment
* CUDA:
- GPU:
- TITAN Xp
- TITAN Xp
- TITAN Xp
- TITAN Xp
- TITAN Xp
- TITAN Xp
- TITAN Xp
- TITAN Xp
- TITAN Xp
- TITAN Xp
- available: True
- version: 10.2
* Packages:
- numpy: 1.19.5
- pyTorch_debug: False
- pyTorch_version: 1.10.0+cu102
- pytorch-lightning: 1.5.5
- tqdm: 4.62.3
* System:
- OS: Linux
- architecture:
- 64bit
- ELF
- processor: x86_64
- python: 3.6.10
- version: #203-Ubuntu SMP Wed Jan 15 02:55:01 UTC 2020
About this issue
- State: closed
- Created 3 years ago
- Reactions: 2
- Comments: 25 (6 by maintainers)
I’m 99% confident that these errors are not actually caused by PyTorch Lightning but by something killing my processes on my cluster. I am waiting for confirmation and will update this issue tomorrow.
Now that you mention garbage collection, I think I had an intertwined problem where something you would expect to be garbage collected (e.g. `losses.append(loss.item())` within a loop) wasn’t functioning properly…

Yeah, maybe, but if that’s the case it should fail right after the first optimizer update, not after some time… but let’s see how your run goes 😃
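For context, the pattern being discussed is the usual "accumulating history" pitfall: appending the loss tensor itself keeps a reference to it (and its `grad_fn` graph) across iterations, whereas `.item()` stores a plain Python float. A minimal standalone sketch (hypothetical model and data, not taken from the reproduction above):

```python
import torch
import torch.nn.functional as F

# Tiny throwaway model and synthetic data, purely to illustrate the pattern.
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

losses = []
for _ in range(100):
    x = torch.randn(32, 10)
    y = torch.randint(0, 2, (32,))

    loss = F.cross_entropy(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # losses.append(loss)        # keeps the tensor and its autograd graph referenced
    losses.append(loss.item())   # stores a plain float, so the graph can be collected
```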