pytorch-lightning: PL 1.6.0 could not resume training with callbacks that monitor metrics

πŸ› Bug

As mentioned in the title

To Reproduce

The code (boring.py) is provided below:

from typing import List

import torch
from pytorch_lightning import LightningDataModule, LightningModule
from pytorch_lightning.utilities.cli import (LightningArgumentParser, LightningCLI)
from torch.utils.data import DataLoader, Dataset
from jsonargparse import lazy_instance


class Arch(torch.nn.Linear):
    """Minimal architecture: a single linear layer.

    Args:
        input_size: Number of input features of the linear layer.
        output_size: Number of output features of the linear layer.
    """

    def __init__(self, input_size: int = 10, output_size: int = 2) -> None:
        # Hand the two sizes to ``torch.nn.Linear`` by keyword.
        super().__init__(in_features=input_size, out_features=output_size)


class RandomDataset(Dataset):
    """In-memory dataset of ``length`` random vectors with ``size`` features each."""

    def __init__(self, size, length):
        """Draw the whole dataset once, at construction time.

        Args:
            size: Number of features per sample (second dimension of ``data``).
            length: Number of samples (first dimension of ``data``).
        """
        n_samples, n_features = length, size
        self.len = n_samples
        # One row per sample, generated by ``torch.randn``.
        self.data = torch.randn(n_samples, n_features)

    def __len__(self):
        """Return the number of samples fixed at construction."""
        return self.len

    def __getitem__(self, index):
        """Return the entry of ``self.data`` selected by ``index``."""
        sample = self.data[index]
        return sample


class MyDataModule(LightningDataModule):
    """Data module whose train, validation and test loaders all serve random data.

    Args:
        input_size: Feature size of every generated sample.
        train_transforms, val_transforms, test_transforms, dims: Passed
            through unchanged to ``LightningDataModule.__init__``.
    """

    def __init__(
        self,
        input_size: int = 10,
        train_transforms=None,
        val_transforms=None,
        test_transforms=None,
        dims=None,
    ):
        super().__init__(
            train_transforms=train_transforms,
            val_transforms=val_transforms,
            test_transforms=test_transforms,
            dims=dims,
        )
        self.input_size = input_size

    def _random_loader(self) -> DataLoader:
        """Build a loader (batch size 2) over a new 64-sample ``RandomDataset``."""
        dataset = RandomDataset(self.input_size, 64)
        return DataLoader(dataset, batch_size=2)

    def train_dataloader(self) -> DataLoader:
        """Return the training loader; a new random dataset is created per call."""
        return self._random_loader()

    def val_dataloader(self) -> DataLoader:
        """Return the validation loader; a new random dataset is created per call."""
        return self._random_loader()

    def test_dataloader(self) -> DataLoader:
        """Return the test loader; a new random dataset is created per call."""
        return self._random_loader()


class BoringModel(LightningModule):
    """Toy module: wraps an ``Arch`` and uses the sum of its output as the loss.

    Args:
        arch: The network to train; defaults to a lazily instantiated ``Arch``.
        channels: Accepted by the constructor but not used anywhere in this class.
    """

    def __init__(self, arch: Arch = lazy_instance(Arch), channels: List[int] = [0, 1]):
        super().__init__()
        self.arch = arch

    def forward(self, x):
        """Print the input shape, then run ``x`` through the wrapped network."""
        print(x.shape)
        output = self.arch(x)
        return output

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` and return it under the ``"loss"`` key."""
        train_loss = self(batch).sum()
        self.log("train_loss", train_loss)
        return {"loss": train_loss}

    def validation_step(self, batch, batch_idx):
        """Log ``valid_loss``, the metric the CLI's callbacks monitor."""
        valid_loss = self(batch).sum()
        self.log("valid_loss", valid_loss)

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` for the batch."""
        test_loss = self(batch).sum()
        self.log("test_loss", test_loss)

    def configure_optimizers(self):
        """Return an SGD optimizer (lr=0.1) over the wrapped network's parameters."""
        trainable = self.arch.parameters()
        return torch.optim.SGD(trainable, lr=0.1)


class MyCLI(LightningCLI):
    """CLI that registers EarlyStopping and ModelCheckpoint, both monitoring ``valid_loss``."""

    def add_arguments_to_parser(self, parser: LightningArgumentParser) -> None:
        """Add both callbacks' arguments to ``parser`` and set their defaults.

        The callbacks are registered under the ``early_stopping`` and
        ``model_checkpoint`` keys, in that order, each followed by its defaults.
        """
        from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

        early_stopping_defaults = {
            "early_stopping.monitor": "valid_loss",
            "early_stopping.min_delta": 0.01,
            "early_stopping.patience": 10,
            "early_stopping.mode": "min",
        }
        model_checkpoint_defaults = {
            "model_checkpoint.filename": "epoch{epoch}_valid_loss{valid_loss:.4f}",
            "model_checkpoint.monitor": "valid_loss",
            "model_checkpoint.mode": "min",
            "model_checkpoint.every_n_epochs": 1,
            "model_checkpoint.save_top_k": 5,
            "model_checkpoint.auto_insert_metric_name": False,
            "model_checkpoint.save_last": True,
        }

        # Register each callback, then immediately apply its defaults.
        registrations = (
            (EarlyStopping, "early_stopping", early_stopping_defaults),
            (ModelCheckpoint, "model_checkpoint", model_checkpoint_defaults),
        )
        for callback_cls, nested_key, defaults in registrations:
            parser.add_lightning_class_args(callback_cls, nested_key)
            parser.set_defaults(defaults)

        return super().add_arguments_to_parser(parser)


if __name__ == "__main__":
    # Constructing the CLI parses the command line and runs the chosen subcommand.
    cli = MyCLI(
        BoringModel,
        MyDataModule,
        seed_everything_default=None,
        save_config_overwrite=True,
        parser_kwargs={"parser_mode": "omegaconf"},
    )
  1. fit: run python boring.py fit, and kill the training program once some checkpoints have been saved
  2. resume (something like the command): python boring.py fit --config lightning_logs/version_7/config.yaml --ckpt_path=lightning_logs/version_7/checkpoints/last.ckpt

Expected behavior

Training resumes successfully.

Environment

  • PyTorch Lightning Version (e.g., 1.5.0): 1.6.0
  • PyTorch Version (e.g., 1.10): 1.10
  • Python version (e.g., 3.9): 3.9
  • OS (e.g., Linux):
  • CUDA/cuDNN version:
  • GPU models and configuration:
  • How you installed PyTorch (conda, pip, source):
  • If compiling from source, the output of torch.__config__.show():
  • Any other relevant information:

Additional context

The error:

python boring.py fit --config lightning_logs/version_7/config.yaml --ckpt_path=lightning_logs/version_7/checkpoints/last.ckpt
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:91: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
  rank_zero_warn(
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1823: PossibleUserWarning: GPU available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='gpu', devices=8)`.
  rank_zero_warn(
Restoring states from the checkpoint path at lightning_logs/version_7/checkpoints/last.ckpt
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:342: UserWarning: The dirpath has changed from '/data/home/quancs/projects/NBSS_pmt/lightning_logs/version_7/checkpoints' to '/data/home/quancs/projects/NBSS_pmt/lightning_logs/version_8/checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.
  warnings.warn(

  | Name | Type | Params
------------------------------
0 | arch | Arch | 22    
------------------------------
22        Trainable params
0         Non-trainable params
22        Total params
0.000     Total estimated model params size (MB)
Restored all states from the checkpoint file at lightning_logs/version_7/checkpoints/last.ckpt
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:240: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 128 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
  rank_zero_warn(
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1938: PossibleUserWarning: The number of training samples (32) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
  rank_zero_warn(
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:240: PossibleUserWarning: The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 128 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
  rank_zero_warn(
Epoch 19:  50%|██████████████████████████████████████████████████████████████████████████▌                                                                          | 32/64 [00:00<?, ?it/s]Traceback (most recent call last):
  File "/data/home/x/projects/NBSS_pmt/boring.py", line 100, in <module>
    cli = MyCLI(BoringModel, MyDataModule, seed_everything_default=None, save_config_overwrite=True, parser_kwargs={"parser_mode": "omegaconf"})
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/utilities/cli.py", line 564, in __init__
    self._run_subcommand(self.subcommand)
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/utilities/cli.py", line 835, in _run_subcommand
    fn(**fn_kwargs)
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 771, in fit
    self._call_and_handle_interrupt(
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 724, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 812, in _fit_impl
    results = self._run(model, ckpt_path=self.ckpt_path)
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1237, in _run
    results = self._run_stage()
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1324, in _run_stage
    return self._run_train()
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1354, in _run_train
    self.fit_loop.run()
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 205, in run
    self.on_advance_end()
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 297, in on_advance_end
    self.trainer._call_callback_hooks("on_train_epoch_end")
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1637, in _call_callback_hooks
    fn(self, self.lightning_module, *args, **kwargs)
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/early_stopping.py", line 179, in on_train_epoch_end
    self._run_early_stopping_check(trainer)
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/early_stopping.py", line 190, in _run_early_stopping_check
    if trainer.fast_dev_run or not self._validate_condition_metric(  # disable early_stopping with fast_dev_run
  File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/early_stopping.py", line 145, in _validate_condition_metric
    raise RuntimeError(error_msg)
RuntimeError: Early stopping conditioned on metric `valid_loss` which is not available. Pass in or modify your `EarlyStopping` callback to use any of the following: ``
Epoch 19:  50%|█████     | 32/64 [00:00<?, ?it/s]

cc @tchaton @rohitgr7 @carmocca @awaelchli

About this issue

  • Original URL
  • State: closed
  • Created 2 years ago
  • Reactions: 4
  • Comments: 15 (6 by maintainers)

Most upvoted comments

same here with 1.9.4! Is there a known fix for this?

prioritizing it. Will be fixed soon.

Same here! It's not only EarlyStopping that doesn't work; any callback or LR scheduler that monitors a metric fails too.

still getting this error!

Still getting this error!

Second that. I have check_val_every_n_epoch=10 in the trainer; if I leave that out, everything works. But I don't need to run validation every epoch, so a fix would be appreciated!

Same issue here with 1.8.3.post1