tensorflow: ValueError: `handle` is not available outside the replica context or a `tf.distribute.Strategy.update()` call.

I am not able to run training using tf.distribute.Strategy. However, it works fine without distribution. Below is the code block for the training loop.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import app
import os
import tensorflow as tf # TF2
import model_timit as model
import kaldi_io
from DataLoader_timit import SequentialLoader
from warprnnt_tensorflow import rnnt_loss
assert tf.__version__.startswith('2')

class Train(object):

  def __init__(self, epochs, decoder, batch_size):
    self.epochs = epochs
    self.decoder = decoder
    self.batch_size = batch_size
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.0004)
    self.train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    self.checkpoint = tf.train.Checkpoint(
            decoder=self.decoder,
            optimizer=self.optimizer)

  def loss_function(self, pred, real, xlen, ylen):
    loss_ = rnnt_loss(pred, real, xlen, ylen, 0)
    return tf.reduce_sum(loss_) * 1. / self.batch_size

  def train_step(self, inputs):
    loss = 0
    inp, targ, xlen, ylen = inputs

    with tf.GradientTape() as tape:
      xs_1, xs, predictions = self.decoder(inp, targ)
      time_dim = tf.shape(predictions)[1]
      loss += self.loss_function(predictions, targ, xlen, ylen)

    batch_loss = (loss / int(targ.shape[1]))
    variables = self.decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    self.optimizer.apply_gradients(zip(gradients, variables))
    #self.optimizer.apply_gradients(list(zip(gradients, variables)))

    self.train_loss_metric(batch_loss)

    return self.train_loss_metric.result().numpy()

class DistributedTrain(Train):
  def __init__(self, epochs, decoder, batch_size, local_batch_size):
    Train.__init__(
        self, epochs, decoder, local_batch_size)

  def training_loop(self, train_ds, test_ds, strategy):
    def distributed_train(inp, targ, xlen, ylen):
      return strategy.experimental_run_v2(self.train_step((inp, targ, xlen, ylen)))

    distributed_train = tf.function(distributed_train)
    template = 'Epoch: {}, Train Loss: {}, Test Loss: {}'
    for epoch in range(self.epochs):
      self.train_loss_metric.reset_states()
      for i, (inp, targ, xlen, ylen) in enumerate(train_ds):
        distributed_train(inp, targ, xlen, ylen)

def main(epochs=200, batch_size=16, num_examples=70000, embedding_dim=256, enc_units=1024, dec_units=1024):

  strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0","/gpu:1"])
  num_replicas = strategy.num_replicas_in_sync

  train_ds = SequentialLoader('train', batch_size)
  test_ds = SequentialLoader('test', batch_size)

  with strategy.scope():
    decoder = model.Transducer(39, 62, 250, 3, 0.5, bidirectional=False)
    train_obj = DistributedTrain(10, decoder, batch_size, 8)

    print ('Training ...')
    return train_obj.training_loop(train_ds, test_ds, strategy)

if __name__ == '__main__':
  app.run(main)


This is the error.

train_timit_distributed.py:97 distributed_train  *
        per_example_loss = strategy.experimental_run_v2(self.train_step((inp, targ, xlen, ylen)))
    train_timit_distributed.py:61 train_step  *
        gradients = tape.gradient(loss, variables)
    /home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/backprop.py:996 gradient
        flat_sources = [_handle_or_self(x) for x in flat_sources]
    /home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/backprop.py:996 <listcomp>
        flat_sources = [_handle_or_self(x) for x in flat_sources]
    /home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/backprop.py:687 _handle_or_self
        x = x.handle
    /home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/distribute/values.py:717 handle
        raise ValueError("`handle` is not available outside the replica context"

    ValueError: `handle` is not available outside the replica context or a `tf.distribute.Strategy.update()` call.

How to fix this?
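
For reference, in the TF 2.0 API strategy.experimental_run_v2 takes the step function and its arguments separately; the loop above calls self.train_step(...) first, so the step executes in the cross-replica context and tape.gradient then touches the mirrored variables' handle there, which is what the traceback complains about. Below is a minimal sketch of that calling convention, reusing the names from training_loop above and summing the per-replica losses with strategy.reduce (it also assumes train_step returns a tensor; calling .numpy() inside tf.function would not work):

    def distributed_train(inp, targ, xlen, ylen):
      # Pass the function and its arguments separately so train_step runs
      # inside the replica context; do not call it here.
      per_replica_loss = strategy.experimental_run_v2(
          self.train_step, args=((inp, targ, xlen, ylen),))
      # Combine the per-replica losses into a single scalar.
      return strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)

    distributed_train = tf.function(distributed_train)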

Most upvoted comments

I just ran into this exact issue as well. Any answers? A BatchNormalization layer within the strategy scope seems to be the issue. Any ideas or fixes?

Same issue, +1… how to solve it?

I’m having the same issue. Perhaps there is something wrong with using BatchNormalization in the strategy scope. If you remove this normalization layer, the training goes well…
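
For what it's worth, here is a minimal standalone sketch (toy model and data, TF 2.0 API) suggesting BatchNormalization is not the culprit by itself: any variable created under strategy.scope() is mirrored, and taking gradients over mirrored variables from the cross-replica context raises this ValueError, while the same step wrapped in strategy.experimental_run_v2 runs fine with the BatchNormalization layer still in place.

import tensorflow as tf  # TF 2.0

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
  # All variables below (including BatchNormalization's moving statistics)
  # are created as mirrored variables.
  model = tf.keras.Sequential([
      tf.keras.layers.Dense(16, input_shape=(4,)),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.Dense(1),
  ])
  optimizer = tf.keras.optimizers.SGD(0.01)

def step(inputs):
  features, labels = inputs
  with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(features, training=True) - labels))
  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))
  return loss

@tf.function
def distributed_step(inputs):
  # step() runs per replica here, so tape.gradient sees per-replica
  # variable handles and the `handle` error does not occur.
  return strategy.experimental_run_v2(step, args=(inputs,))

x = tf.random.normal([8, 4])
y = tf.random.normal([8, 1])

# Works, BatchNormalization included:
distributed_step((x, y))

# Calling step((x, y)) directly from the cross-replica context (e.g. under
# strategy.scope(), as the training loop in the question does) fails instead.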

I get the same error trying to create a Resnet50 model with mirrored distribution across 2 GPUs.

I am training on:

  • Ubuntu 18
  • Anaconda 2019 Python 3.7
  • Tensorflow 2.0
  • Keras