pytorch-lightning: PT 1.6.0 cannot resume training when callbacks monitor metrics
🐛 Bug
As mentioned in the title: resuming training from a checkpoint fails when a callback (e.g. EarlyStopping) monitors a logged metric.
To Reproduce
The code (boring.py) is provided below:
from typing import List

import torch
from pytorch_lightning import LightningDataModule, LightningModule
from pytorch_lightning.utilities.cli import LightningArgumentParser, LightningCLI
from torch.utils.data import DataLoader, Dataset
from jsonargparse import lazy_instance


class Arch(torch.nn.Linear):
    def __init__(self, input_size: int = 10, output_size: int = 2) -> None:
        super().__init__(input_size, output_size)


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class MyDataModule(LightningDataModule):
    def __init__(self, input_size: int = 10, train_transforms=None, val_transforms=None, test_transforms=None, dims=None):
        super().__init__(train_transforms=train_transforms, val_transforms=val_transforms, test_transforms=test_transforms, dims=dims)
        self.input_size = input_size

    def train_dataloader(self) -> DataLoader:
        return DataLoader(RandomDataset(self.input_size, 64), batch_size=2)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(RandomDataset(self.input_size, 64), batch_size=2)

    def test_dataloader(self) -> DataLoader:
        return DataLoader(RandomDataset(self.input_size, 64), batch_size=2)


class BoringModel(LightningModule):
    def __init__(self, arch: Arch = lazy_instance(Arch), channels: List[int] = [0, 1]):
        super().__init__()
        self.arch = arch

    def forward(self, x):
        print(x.shape)
        return self.arch(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.arch.parameters(), lr=0.1)


class MyCLI(LightningCLI):
    def add_arguments_to_parser(self, parser: LightningArgumentParser) -> None:
        from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

        parser.add_lightning_class_args(EarlyStopping, "early_stopping")
        parser.set_defaults({
            "early_stopping.monitor": "valid_loss",
            "early_stopping.min_delta": 0.01,
            "early_stopping.patience": 10,
            "early_stopping.mode": "min",
        })

        # ModelCheckpoint
        parser.add_lightning_class_args(ModelCheckpoint, "model_checkpoint")
        model_checkpoint_defaults = {
            "model_checkpoint.filename": "epoch{epoch}_valid_loss{valid_loss:.4f}",
            "model_checkpoint.monitor": "valid_loss",
            "model_checkpoint.mode": "min",
            "model_checkpoint.every_n_epochs": 1,
            "model_checkpoint.save_top_k": 5,
            "model_checkpoint.auto_insert_metric_name": False,
            "model_checkpoint.save_last": True,
        }
        parser.set_defaults(model_checkpoint_defaults)
        return super().add_arguments_to_parser(parser)


if __name__ == '__main__':
    cli = MyCLI(BoringModel, MyDataModule, seed_everything_default=None, save_config_overwrite=True, parser_kwargs={"parser_mode": "omegaconf"})
- fit: run the command below, and kill the training program once some checkpoints have been saved:
  python boring.py fit
- resume (with a command like):
  python boring.py fit --config lightning_logs/version_7/config.yaml --ckpt_path=lightning_logs/version_7/checkpoints/last.ckpt
Expected behavior
Training resumes successfully from the checkpoint.
Environment
- PyTorch Lightning Version (e.g., 1.5.0): 1.6.0
- PyTorch Version (e.g., 1.10): 1.10
- Python version (e.g., 3.9): 3.9
- OS (e.g., Linux):
- CUDA/cuDNN version:
- GPU models and configuration:
- How you installed PyTorch (conda, pip, source):
- If compiling from source, the output of torch.__config__.show():
- Any other relevant information:
Additional context
The error:
python boring.py fit --config lightning_logs/version_7/config.yaml --ckpt_path=lightning_logs/version_7/checkpoints/last.ckpt
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:91: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
rank_zero_warn(
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1823: PossibleUserWarning: GPU available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='gpu', devices=8)`.
rank_zero_warn(
Restoring states from the checkpoint path at lightning_logs/version_7/checkpoints/last.ckpt
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:342: UserWarning: The dirpath has changed from '/data/home/quancs/projects/NBSS_pmt/lightning_logs/version_7/checkpoints' to '/data/home/quancs/projects/NBSS_pmt/lightning_logs/version_8/checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.
warnings.warn(
| Name | Type | Params
------------------------------
0 | arch | Arch | 22
------------------------------
22 Trainable params
0 Non-trainable params
22 Total params
0.000 Total estimated model params size (MB)
Restored all states from the checkpoint file at lightning_logs/version_7/checkpoints/last.ckpt
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:240: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 128 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
rank_zero_warn(
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1938: PossibleUserWarning: The number of training samples (32) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
rank_zero_warn(
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:240: PossibleUserWarning: The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 128 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
rank_zero_warn(
Epoch 19:  50%|███████████████                               | 32/64 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/data/home/x/projects/NBSS_pmt/boring.py", line 100, in <module>
cli = MyCLI(BoringModel, MyDataModule, seed_everything_default=None, save_config_overwrite=True, parser_kwargs={"parser_mode": "omegaconf"})
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/utilities/cli.py", line 564, in __init__
self._run_subcommand(self.subcommand)
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/utilities/cli.py", line 835, in _run_subcommand
fn(**fn_kwargs)
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 771, in fit
self._call_and_handle_interrupt(
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 724, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 812, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1237, in _run
results = self._run_stage()
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1324, in _run_stage
return self._run_train()
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1354, in _run_train
self.fit_loop.run()
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 205, in run
self.on_advance_end()
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 297, in on_advance_end
self.trainer._call_callback_hooks("on_train_epoch_end")
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1637, in _call_callback_hooks
fn(self, self.lightning_module, *args, **kwargs)
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/early_stopping.py", line 179, in on_train_epoch_end
self._run_early_stopping_check(trainer)
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/early_stopping.py", line 190, in _run_early_stopping_check
if trainer.fast_dev_run or not self._validate_condition_metric( # disable early_stopping with fast_dev_run
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/early_stopping.py", line 145, in _validate_condition_metric
raise RuntimeError(error_msg)
RuntimeError: Early stopping conditioned on metric `valid_loss` which is not available. Pass in or modify your `EarlyStopping` callback to use any of the following: ``
Epoch 19:  50%|█████     | 32/64 [00:00<?, ?it/s]
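Until this is fixed, a possible mitigation (an untested sketch based on the public EarlyStopping options, not a confirmed fix for the underlying resume bug) is to relax the callback so a missing valid_loss only warns instead of raising:

from pytorch_lightning.callbacks import EarlyStopping

# Sketch only: strict=False makes EarlyStopping warn instead of raising when the
# monitored metric is missing from trainer.callback_metrics (e.g. on the first
# epoch end after resuming, before validation has logged valid_loss), and
# check_on_train_epoch_end=False defers the check until after validation runs.
early_stopping = EarlyStopping(
    monitor="valid_loss",
    min_delta=0.01,
    patience=10,
    mode="min",
    strict=False,
    check_on_train_epoch_end=False,
)

With the LightningCLI setup above, the same options should be settable through the early_stopping.strict and early_stopping.check_on_train_epoch_end defaults.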
About this issue
- Original URL
- State: closed
- Created 2 years ago
- Reactions: 4
- Comments: 15 (6 by maintainers)
same here with 1.9.4! Is there a known fix for this?
prioritizing it. Will be fixed soon.
Same here! Not only does EarlyStopping not work; any callback or LRScheduler with a metric monitor is affected.
still getting this error!
Second that: I have check_val_every_n_epoch=10 in the trainer, and if I leave that out everything works. But I don't need to run validation every epoch, so a fix would be appreciated!
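For reference, the trainer setting mentioned in the previous comment looks roughly like this (illustrative sketch only): with validation running every 10 epochs, valid_loss is absent from the logged metrics on the other nine epoch ends, which matches the failure mode in this issue.

from pytorch_lightning import Trainer

# Illustrative sketch: validation (and therefore the logged valid_loss) only runs
# every 10 epochs, so a callback that checks valid_loss at every train epoch end
# finds no value on the epochs in between.
trainer = Trainer(check_val_every_n_epoch=10)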
Same issue here with 1.8.3.post1