tensorflow: ValueError: `handle` is not available outside the replica context or a `tf.distribute.Strategy.update()` call.
I am not able to run training using tf.distribute.Strategy; however, it works fine without distribution. Below is the code block for the training loop.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
import os
import tensorflow as tf # TF2
import model_timit as model
import kaldi_io
from DataLoader_timit import SequentialLoader
from warprnnt_tensorflow import rnnt_loss
assert tf.__version__.startswith('2')
class Train(object):
    def __init__(self, epochs, decoder, batch_size):
        self.epochs = epochs
        self.decoder = decoder
        self.batch_size = batch_size
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.0004)
        self.train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
        self.checkpoint = tf.train.Checkpoint(
            decoder=self.decoder,
            optimizer=self.optimizer)

    def loss_function(self, pred, real, xlen, ylen):
        loss_ = rnnt_loss(pred, real, xlen, ylen, 0)
        return tf.reduce_sum(loss_) * 1. / self.batch_size

    def train_step(self, inputs):
        loss = 0
        inp, targ, xlen, ylen = inputs
        with tf.GradientTape() as tape:
            xs_1, xs, predictions = self.decoder(inp, targ)
            time_dim = tf.shape(predictions)[1]
            loss += self.loss_function(predictions, targ, xlen, ylen)
        batch_loss = (loss / int(targ.shape[1]))
        variables = self.decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        # self.optimizer.apply_gradients(list(zip(gradients, variables)))
        self.train_loss_metric(batch_loss)
        return self.train_loss_metric.result().numpy()
class DistributedTrain(Train):
    def __init__(self, epochs, decoder, batch_size, local_batch_size):
        Train.__init__(self, epochs, decoder, local_batch_size)

    def training_loop(self, train_ds, test_ds, strategy):
        def distributed_train(inp, targ, xlen, ylen):
            return strategy.experimental_run_v2(self.train_step((inp, targ, xlen, ylen)))

        distributed_train = tf.function(distributed_train)

        template = 'Epoch: {}, Train Loss: {}, Test Loss: {}'
        for epoch in range(self.epochs):
            self.train_loss_metric.reset_states()
            for i, (inp, targ, xlen, ylen) in enumerate(train_ds):
                distributed_train(inp, targ, xlen, ylen)
def main(epochs=200, batch_size=16, num_examples=70000, embedding_dim=256, enc_units=1024, dec_units=1024):
    strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])
    num_replicas = strategy.num_replicas_in_sync
    train_ds = SequentialLoader('train', batch_size)
    test_ds = SequentialLoader('test', batch_size)
    with strategy.scope():
        decoder = model.Transducer(39, 62, 250, 3, 0.5, bidirectional=False)
        train_obj = DistributedTrain(10, decoder, batch_size, 8)
    print('Training ...')
    return train_obj.training_loop(train_ds, test_ds, strategy)


if __name__ == '__main__':
    app.run(main)
This is the error.
train_timit_distributed.py:97 distributed_train *
per_example_loss = strategy.experimental_run_v2(self.train_step((inp, targ, xlen, ylen)))
train_timit_distributed.py:61 train_step *
gradients = tape.gradient(loss, variables)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/backprop.py:996 gradient
flat_sources = [_handle_or_self(x) for x in flat_sources]
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/backprop.py:996 <listcomp>
flat_sources = [_handle_or_self(x) for x in flat_sources]
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/backprop.py:687 _handle_or_self
x = x.handle
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/distribute/values.py:717 handle
raise ValueError("`handle` is not available outside the replica context"
ValueError: `handle` is not available outside the replica context or a `tf.distribute.Strategy.update()` call.
How to fix this?
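For context, the traceback points at strategy.experimental_run_v2(self.train_step((inp, targ, xlen, ylen))): self.train_step(...) is evaluated before the strategy ever runs it, so tape.gradient touches the mirrored variables outside the replica context, which is what the ValueError reports. The documented pattern passes the step function and its arguments separately (fn plus an args tuple). Below is a minimal, self-contained sketch of that call pattern with a hypothetical toy model and the TF 2.0/2.1 API name; it illustrates the pattern only and is not a verified fix for the model in this issue.

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()  # all visible GPUs, or CPU with one replica

with strategy.scope():
    # Hypothetical toy model and optimizer, only to illustrate the call pattern.
    dense = tf.keras.layers.Dense(1)
    dense.build([None, 4])  # create the variables under the strategy scope
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0004)

def train_step(inputs):
    x, y = inputs
    with tf.GradientTape() as tape:
        pred = dense(x)
        loss = tf.reduce_mean(tf.square(pred - y))
    grads = tape.gradient(loss, dense.trainable_variables)
    optimizer.apply_gradients(zip(grads, dense.trainable_variables))
    return loss

@tf.function
def distributed_train(x, y):
    # Pass the function and its arguments separately; experimental_run_v2 then
    # invokes train_step inside the replica context on each device.
    per_replica_loss = strategy.experimental_run_v2(train_step, args=((x, y),))
    # Combine the per-replica results into a single value.
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)

# In a real loop the inputs would come from strategy.experimental_distribute_dataset.
x = tf.random.normal([16, 4])
y = tf.random.normal([16, 1])
print(distributed_train(x, y).numpy())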
About this issue
- Original URL
- State: closed
- Created 5 years ago
- Comments: 27 (6 by maintainers)
I just ran into this exact issue as well, any answers? A BatchNormalization layer within the strategy scope seems to be the issue. Any ideas or fixes?
Same issue, +1… how to solve it?
I’m having the same issue. Perhaps it has something to do with using BatchNormalization in the strategy scope. If you remove this normalization layer, the training goes well…
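For reference, a rough sketch of the setup these comments describe, a BatchNormalization layer created under a MirroredStrategy scope (hypothetical toy model, not taken from this issue):

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Hypothetical toy model, only to make the reported setup concrete.
    net = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(16,)),
        tf.keras.layers.BatchNormalization(),  # the layer the comments point at
        tf.keras.layers.Dense(1),
    ])

# Under MirroredStrategy the layer's moving_mean / moving_variance become
# distributed variables, and BatchNormalization assigns to them on every
# training-mode call.
print([(v.name, type(v).__name__) for v in net.layers[1].non_trainable_variables])

Whether those moving-statistics updates are the actual trigger isn't confirmed in this thread, but it would be consistent with the error being tied to the replica context and with the observation that removing the layer makes training run.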
I get the same error trying to create a Resnet50 model with mirrored distribution across 2 GPUs.
I am training on: