pytorch-lightning: TypeError: optimizer_step() got an unexpected keyword argument 'on_tpu'

❓ Questions and Help

I got this error:


/usr/local/lib/python3.6/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule)
    468         self.call_hook('on_fit_start')
    469 
--> 470         results = self.accelerator_backend.train()
    471         self.accelerator_backend.teardown()
    472 

/usr/local/lib/python3.6/dist-packages/pytorch_lightning/accelerators/gpu_accelerator.py in train(self)
     66 
     67         # train or test
---> 68         results = self.train_or_test()
     69         return results
     70 

/usr/local/lib/python3.6/dist-packages/pytorch_lightning/accelerators/accelerator.py in train_or_test(self)
     67             results = self.trainer.run_test()
     68         else:
---> 69             results = self.trainer.train()
     70         return results
     71 

/usr/local/lib/python3.6/dist-packages/pytorch_lightning/trainer/trainer.py in train(self)
    519                 with self.profiler.profile("run_training_epoch"):
    520                     # run train epoch
--> 521                     self.train_loop.run_training_epoch()
    522 
    523                 if self.max_steps and self.max_steps <= self.global_step:

/usr/local/lib/python3.6/dist-packages/pytorch_lightning/trainer/training_loop.py in run_training_epoch(self)
    558             # ------------------------------------
    559             with self.trainer.profiler.profile("run_training_batch"):
--> 560                 batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
    561 
    562             # when returning -1 from train_step, we end epoch early

/usr/local/lib/python3.6/dist-packages/pytorch_lightning/trainer/training_loop.py in run_training_batch(self, batch, batch_idx, dataloader_idx)
    716 
    717                         # optimizer step
--> 718                         self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
    719 
    720                     else:

/usr/local/lib/python3.6/dist-packages/pytorch_lightning/trainer/training_loop.py in optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
    499             on_tpu=self.trainer.use_tpu and TPU_AVAILABLE,
    500             using_native_amp=using_native_amp,
--> 501             using_lbfgs=is_lbfgs,
    502         )
    503 

TypeError: optimizer_step() got an unexpected keyword argument 'on_tpu'
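
Reading the last frame of the traceback, the Trainer is calling optimizer_step() with keyword arguments on_tpu, using_native_amp, and using_lbfgs, so an override written against an older signature no longer matches. Based only on the keywords visible above, the hook signature the Trainer expects appears to be roughly this (the exact parameter order is my guess):

  def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx,
                     optimizer_closure, on_tpu, using_native_amp, using_lbfgs):
      ...  # sketch only; working overrides are shown in the comments below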

How can I solve this issue? It came up while I was trying to understand how to use T5.

This is the code:


class T5FineTuner(pl.LightningModule):
  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    self.hparams = hparams
    
    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
  
  def is_logger(self):
    #AttributeError: 'Trainer' object has no attribute 'proc_rank'
    #return self.trainer.proc_rank <= 0  # What is this? 
    print("self.trainer.global_rank : ", self.trainer.global_rank )
    #https://github.com/PyTorchLightning/pytorch-lightning/issues/2267   fixes the issue
    return self.trainer.global_rank <= 0
  def forward(
       self, 
       input_ids,
       attention_mask=None,
       decoder_input_ids=None, 
       decoder_attention_mask=None, 
       lm_labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=lm_labels,  # newer transformers versions renamed lm_labels to labels
    )

  def _step(self, batch):
    lm_labels = batch["target_ids"]
    lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100  # -100 is ignored by the cross-entropy loss, so pad tokens do not count

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        decoder_attention_mask=batch['target_mask'],
        lm_labels=lm_labels
    )   

    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}
  
  def training_epoch_end(self, outputs):
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    loss = self._step(batch)
    return {"val_loss": loss}
  
  def validation_epoch_end(self, outputs):
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]
  
  def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
    if self.trainer.use_tpu:
      print("optimizer is using TPU...")
      xm.optimizer_step(optimizer)
    else:
      optimizer.step()
    optimizer.zero_grad()
    self.lr_scheduler.step()
  
  def get_tqdm_dict(self):
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)


train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    #callbacks=[early_stop_callback],
    precision=16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback(), early_stop_callback],
)

trainer = pl.Trainer(**train_params)

I am working on Colab.
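
Since the optimizer_step hook signature changed across Lightning releases, it may help to note which version the notebook installs; it can be checked with:

import pytorch_lightning
print(pytorch_lightning.__version__)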

About this issue

  • State: closed
  • Created 3 years ago
  • Comments: 15 (3 by maintainers)

Most upvoted comments

Hey @sumanthd17 !

I encountered the same error. It would be great to have an answer to this. Currently I'm commenting out the arguments to get it running.

@tchaton I have a colab notebook which can replicate this error. Could you take a look? colab notebook

I think this will solve your problem:

  def optimizer_step(self,
                     epoch=None,
                     batch_idx=None,
                     optimizer=None,
                     optimizer_idx=None,
                     optimizer_closure=None,
                     on_tpu=None,
                     using_native_amp=None,
                     using_lbfgs=None):

    optimizer.step(closure=optimizer_closure)
    optimizer.zero_grad()
    self.lr_scheduler.step()
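
For what it's worth, the closure argument seems to matter here: the traceback above names it train_step_and_backward_closure, which suggests it wraps the forward and backward pass, and passing it through optimizer.step(closure=optimizer_closure) also keeps optimizers that re-evaluate the loss (like LBFGS) working.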

Hey all, I improved @Arij-Aladel's solution as below and it worked for me:

 # Arij-Aladel's solution:
  def optimizer_step(self,
                     epoch=None,
                     batch_idx=None,
                     optimizer=None,
                     optimizer_idx=None,
                     optimizer_closure=None,
                     on_tpu=None,
                     using_native_amp=None,
                     using_lbfgs=None):

    optimizer.step(closure=optimizer_closure)
    optimizer.zero_grad()
    self.lr_scheduler.step()

 # minor adjustment:
  def optimizer_step(self,
                     epoch=None,
                     batch_idx=None,
                     optimizer=None,
                     optimizer_idx=None,
                     optimizer_closure=None,
                     on_tpu=None,
                     using_native_amp=None,
                     using_lbfgs=None):

    optimizer.step()  # removed 'closure=optimizer_closure' here
    optimizer.zero_grad()
    self.lr_scheduler.step()
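
A small caution on the minor adjustment: if the closure really does run the training step and backward pass (as its name in the traceback suggests), then calling optimizer.step() without it may skip them in some Lightning versions, so it is worth checking that the loss still decreases after this change.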

Hey @tchaton! Sorry for the late response. It seems the code needs refactoring; the basic code that I am trying can be found here. I am still experimenting with it because I am new to this. I need to use this code, but for a question answering system. I solved the problem by writing it like this:

  def configure_optimizers(self):

    "Prepare optimizer and schedule (linear warmup and decay)"
    optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)

    t_total = self.steps_per_epochs * float(self.hparams.num_train_epochs)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=self.hparams.warmup_steps,
                                                num_training_steps=t_total)
    # return both so Lightning manages the optimizer and the scheduler
    return [optimizer], [scheduler]

and by removing the optimizer_step function. Here self.steps_per_epochs is calculated from my dataset.

I also removed the get_tqdm_dict function, and that did not change the output.

Anyway, this is my notebook.
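
As a side note, since a linear warmup schedule is usually stepped every batch rather than every epoch, Lightning's scheduler-dict return format can express that without any optimizer_step override. A minimal sketch, assuming the installed version supports the "interval" key:

  def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    t_total = self.steps_per_epochs * float(self.hparams.num_train_epochs)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=self.hparams.warmup_steps,
                                                num_training_steps=t_total)
    # "interval": "step" asks Lightning to call scheduler.step() after every
    # optimizer step instead of once per epoch
    return [optimizer], [{"scheduler": scheduler, "interval": "step"}]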

But I have a few more questions, please:

  1. While trying the model I noticed that the validation sanity check does not progress; I did not understand why.

  2. I could not figure out how to run evaluation every 500 training steps. How can I do that?

  3. %tensorboard --logdir ./lightning_logs is not working.

  4. It trains for just 1 epoch instead of two, which is strange.

  5. While saving the checkpoint I get this warning:

/usr/local/lib/python3.6/dist-packages/torch/optim/lr_scheduler.py:216: UserWarning: Please also save or load the state of the optimizer when saving or loading the scheduler. state_dict = {key: value for key, value in self.__dict__.items() if key not in ('optimizer', 'lr_lambdas')}
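
If I read it correctly, that warning comes from LambdaLR.state_dict in torch (get_linear_schedule_with_warmup returns a LambdaLR) and fires whenever the scheduler state is serialized on its own; since the Lightning checkpoint also saves the optimizer state, it should be harmless here.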