pytorch-lightning: TypeError: optimizer_step() got an unexpected keyword argument 'on_tpu'
❓ Questions and Help
I got this error:
/usr/local/lib/python3.6/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule)
468 self.call_hook('on_fit_start')
469
--> 470 results = self.accelerator_backend.train()
471 self.accelerator_backend.teardown()
472
/usr/local/lib/python3.6/dist-packages/pytorch_lightning/accelerators/gpu_accelerator.py in train(self)
66
67 # train or test
---> 68 results = self.train_or_test()
69 return results
70
/usr/local/lib/python3.6/dist-packages/pytorch_lightning/accelerators/accelerator.py in train_or_test(self)
67 results = self.trainer.run_test()
68 else:
---> 69 results = self.trainer.train()
70 return results
71
/usr/local/lib/python3.6/dist-packages/pytorch_lightning/trainer/trainer.py in train(self)
519 with self.profiler.profile("run_training_epoch"):
520 # run train epoch
--> 521 self.train_loop.run_training_epoch()
522
523 if self.max_steps and self.max_steps <= self.global_step:
/usr/local/lib/python3.6/dist-packages/pytorch_lightning/trainer/training_loop.py in run_training_epoch(self)
558 # ------------------------------------
559 with self.trainer.profiler.profile("run_training_batch"):
--> 560 batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
561
562 # when returning -1 from train_step, we end epoch early
/usr/local/lib/python3.6/dist-packages/pytorch_lightning/trainer/training_loop.py in run_training_batch(self, batch, batch_idx, dataloader_idx)
716
717 # optimizer step
--> 718 self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
719
720 else:
/usr/local/lib/python3.6/dist-packages/pytorch_lightning/trainer/training_loop.py in optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
499 on_tpu=self.trainer.use_tpu and TPU_AVAILABLE,
500 using_native_amp=using_native_amp,
--> 501 using_lbfgs=is_lbfgs,
502 )
503
TypeError: optimizer_step() got an unexpected keyword argument 'on_tpu'
How can I solve this issue? I ran into it while trying to understand how to use T5.
This is the code:
class T5FineTuner(pl.LightningModule):
def __init__(self, hparams):
super(T5FineTuner, self).__init__()
self.hparams = hparams
self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
def is_logger(self):
#AttributeError: 'Trainer' object has no attribute 'proc_rank'
#return self.trainer.proc_rank <= 0 # What is this?
print("self.trainer.global_rank : ", self.trainer.global_rank )
#https://github.com/PyTorchLightning/pytorch-lightning/issues/2267 fixes the issue
return self.trainer.global_rank <= 0
def forward(
self,
input_ids,
attention_mask=None,
decoder_input_ids=None,
decoder_attention_mask=None,
lm_labels=None
):
return self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
labels=lm_labels,  # the argument was renamed from lm_labels to labels in newer transformers versions
)
def _step(self, batch):
lm_labels = batch["target_ids"]
lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100  # why is this? (-100 is the ignore index of the loss, so pad tokens are skipped)
outputs = self(
input_ids=batch["source_ids"],
attention_mask=batch["source_mask"],
decoder_attention_mask=batch['target_mask'],
lm_labels=lm_labels
)
loss = outputs[0]
return loss
def training_step(self, batch, batch_idx):
loss = self._step(batch)
tensorboard_logs = {"train_loss": loss}
return {"loss": loss, "log": tensorboard_logs}
def training_epoch_end(self, outputs):
avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
tensorboard_logs = {"avg_train_loss": avg_train_loss}
return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}
def validation_step(self, batch, batch_idx):
loss = self._step(batch)
return {"val_loss": loss}
def validation_epoch_end(self, outputs):
avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
tensorboard_logs = {"val_loss": avg_loss}
return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}
def configure_optimizers(self):
"Prepare optimizer and schedule (linear warmup and decay)"
model = self.model
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": self.hparams.weight_decay,
},
{
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
self.opt = optimizer
return [optimizer]
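# NOTE: this override uses an older Lightning hook signature; newer versions also pass
# on_tpu, using_native_amp and using_lbfgs, which is what raises the TypeError above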
def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
if self.trainer.use_tpu:
print("optimizer is using TPU...")
xm.optimizer_step(optimizer)
else:
optimizer.step()
optimizer.zero_grad()
self.lr_scheduler.step()
def get_tqdm_dict(self):
tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
return tqdm_dict
def train_dataloader(self):
train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
t_total = (
(len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
// self.hparams.gradient_accumulation_steps
* float(self.hparams.num_train_epochs)
)
scheduler = get_linear_schedule_with_warmup(
self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
)
self.lr_scheduler = scheduler
return dataloader
def val_dataloader(self):
val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)
train_params = dict(
accumulate_grad_batches=args.gradient_accumulation_steps,
gpus=args.n_gpu,
max_epochs=args.num_train_epochs,
#callbacks=[early_stop_callback],
precision= 16 if args.fp_16 else 32,
amp_level=args.opt_level,
gradient_clip_val=args.max_grad_norm,
checkpoint_callback=checkpoint_callback,
callbacks=[LoggingCallback(), early_stop_callback],
)
trainer = pl.Trainer(**train_params)
I am working on Colab.
About this issue
- Original URL
- State: closed
- Created 3 years ago
- Comments: 15 (3 by maintainers)
Hey @sumanthd17!
I think this will solve your problem:
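The Trainer in the traceback above calls the hook with extra keyword arguments (on_tpu, using_native_amp, using_lbfgs), so an optimizer_step override has to accept them. A minimal sketch along those lines, assuming xm is torch_xla.core.xla_model as in the original code (the exact hook signature has changed across releases, so check the optimizer_step docs for the installed version):

def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx,
                   optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False):
    if on_tpu:
        # on TPU, hand the closure to xm.optimizer_step via optimizer_args
        xm.optimizer_step(optimizer, optimizer_args={"closure": optimizer_closure})
    else:
        # the closure runs the training step and backward pass, so it must be passed to step()
        optimizer.step(closure=optimizer_closure)
    optimizer.zero_grad()
    self.lr_scheduler.step()

With native AMP (precision=16) it is probably safer to drop the override entirely and let the default implementation handle the gradient scaler; the only custom behaviour here is the self.lr_scheduler.step() call, which the scheduler-dict approach sketched further below makes unnecessary.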
Hey all, I improved @Arij-Aladel's solution as below and it worked for me:
I still face the same issue when I refer to https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#optimizer-step
Hey @tchaton! Sorry for the late response. It seems the code needs refactoring; the basic code that I am trying can be found here. I am still experimenting with it because I am new to this. I need to use this code, but for a question answering system. I solved the problem by writing it like this (roughly as sketched below) and removing the optimizer_step function, where self.steps_per_epochs is calculated from my dataset.
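A minimal sketch of that kind of change, assuming self.steps_per_epochs is computed from the dataset as described above and keeping the same AdamW / linear-warmup setup: return the scheduler from configure_optimizers so Lightning steps it, and no optimizer_step override is needed.

def configure_optimizers(self):
    # the no_decay parameter groups from the original configure_optimizers can be kept here
    optimizer = AdamW(self.model.parameters(), lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.hparams.warmup_steps,
        num_training_steps=self.steps_per_epochs * self.hparams.num_train_epochs,
    )
    # "interval": "step" makes Lightning call scheduler.step() after every optimizer step
    return [optimizer], [{"scheduler": scheduler, "interval": "step"}]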
I also removed the get_tqdm_dict function, and that did not change the output.
Anyway, this is my notebook.
But I have a few more questions, please:
While trying the model, I noticed that the 'Validation sanity check' does not progress, and I did not understand why.
Also, I could not figure out how to run evaluation every 500 training steps. How can I do that?
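For the every-500-steps question, the Trainer's val_check_interval argument should cover it; a rough sketch on top of the train_params dict above:

trainer = pl.Trainer(val_check_interval=500, **train_params)  # run the validation loop every 500 training batches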
%tensorboard --logdir ./lightning_logs is not working.
It is training for just 1 epoch, not two, which is strange.
While saving the checkpoint I get this warning: