tensorflow: ValueError: `handle` is not available outside the replica context or a `tf.distribute.Strategy.update()` call.
I am not able to run training using tf.distribute.Strategy; however, it works fine without distribution. Below is the code block for the training loop.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
import os
import tensorflow as tf # TF2
import model_timit as model
import kaldi_io
from DataLoader_timit import SequentialLoader
from warprnnt_tensorflow import rnnt_loss
assert tf.__version__.startswith('2')
class Train(object):
    def __init__(self, epochs, decoder, batch_size):
        self.epochs = epochs
        self.decoder = decoder
        self.batch_size = batch_size
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.0004)
        self.train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
        self.checkpoint = tf.train.Checkpoint(
            decoder=self.decoder,
            optimizer=self.optimizer)

    def loss_function(self, pred, real, xlen, ylen):
        loss_ = rnnt_loss(pred, real, xlen, ylen, 0)
        return tf.reduce_sum(loss_) * 1. / self.batch_size

    def train_step(self, inputs):
        loss = 0
        inp, targ, xlen, ylen = inputs
        with tf.GradientTape() as tape:
            xs_1, xs, predictions = self.decoder(inp, targ)
            time_dim = tf.shape(predictions)[1]
            loss += self.loss_function(predictions, targ, xlen, ylen)
        batch_loss = (loss / int(targ.shape[1]))
        variables = self.decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        # self.optimizer.apply_gradients(list(zip(gradients, variables)))
        self.train_loss_metric(batch_loss)
        return self.train_loss_metric.result().numpy()
class DistributedTrain(Train):
    def __init__(self, epochs, decoder, batch_size, local_batch_size):
        Train.__init__(self, epochs, decoder, local_batch_size)

    def training_loop(self, train_ds, test_ds, strategy):
        def distributed_train(inp, targ, xlen, ylen):
            return strategy.experimental_run_v2(self.train_step((inp, targ, xlen, ylen)))

        distributed_train = tf.function(distributed_train)

        template = 'Epoch: {}, Train Loss: {}, Test Loss: {}'
        for epoch in range(self.epochs):
            self.train_loss_metric.reset_states()
            for i, (inp, targ, xlen, ylen) in enumerate(train_ds):
                distributed_train(inp, targ, xlen, ylen)
def main(epochs=200, batch_size=16, num_examples=70000, embedding_dim=256, enc_units=1024, dec_units=1024):
    strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])
    num_replicas = strategy.num_replicas_in_sync
    train_ds = SequentialLoader('train', batch_size)
    test_ds = SequentialLoader('test', batch_size)
    with strategy.scope():
        decoder = model.Transducer(39, 62, 250, 3, 0.5, bidirectional=False)
        train_obj = DistributedTrain(10, decoder, batch_size, 8)
    print('Training ...')
    return train_obj.training_loop(train_ds, test_ds, strategy)


if __name__ == '__main__':
    app.run(main)
This is the error.
train_timit_distributed.py:97 distributed_train *
per_example_loss = strategy.experimental_run_v2(self.train_step((inp, targ, xlen, ylen)))
train_timit_distributed.py:61 train_step *
gradients = tape.gradient(loss, variables)
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/backprop.py:996 gradient
flat_sources = [_handle_or_self(x) for x in flat_sources]
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/backprop.py:996 <listcomp>
flat_sources = [_handle_or_self(x) for x in flat_sources]
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/eager/backprop.py:687 _handle_or_self
x = x.handle
/home/ubuntu/tf2/lib/python3.6/site-packages/tensorflow_core/python/distribute/values.py:717 handle
raise ValueError("`handle` is not available outside the replica context"
ValueError: `handle` is not available outside the replica context or a `tf.distribute.Strategy.update()` call.
How to fix this?
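For context, the traceback points at strategy.experimental_run_v2(self.train_step((inp, targ, xlen, ylen))): self.train_step(...) is evaluated before the strategy ever runs it, so tape.gradient touches the mirrored variables outside the replica context, which is what the ValueError reports. The documented pattern passes the step function and its arguments separately (fn plus an args tuple). Below is a minimal, self-contained sketch of that call pattern with a hypothetical toy model and the TF 2.0/2.1 API name; it illustrates the pattern only and is not a verified fix for the model in this issue.

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()  # all visible GPUs, or CPU with one replica

with strategy.scope():
    # Hypothetical toy model and optimizer, only to illustrate the call pattern.
    dense = tf.keras.layers.Dense(1)
    dense.build([None, 4])  # create the variables under the strategy scope
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0004)

def train_step(inputs):
    x, y = inputs
    with tf.GradientTape() as tape:
        pred = dense(x)
        loss = tf.reduce_mean(tf.square(pred - y))
    grads = tape.gradient(loss, dense.trainable_variables)
    optimizer.apply_gradients(zip(grads, dense.trainable_variables))
    return loss

@tf.function
def distributed_train(x, y):
    # Pass the function and its arguments separately; experimental_run_v2 then
    # invokes train_step inside the replica context on each device.
    per_replica_loss = strategy.experimental_run_v2(train_step, args=((x, y),))
    # Combine the per-replica results into a single value.
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)

# In a real loop the inputs would come from strategy.experimental_distribute_dataset.
x = tf.random.normal([16, 4])
y = tf.random.normal([16, 1])
print(distributed_train(x, y).numpy())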
About this issue
- Original URL
- State: closed
- Created 5 years ago
- Comments: 27 (6 by maintainers)
I just ran into this exact issue as well, any answers? A BatchNormalization layer within the strategy scope seems to be the issue. Any ideas or fixes?
Same issue, +1… how to solve it?
I’m having the same issue. Perhaps it has something to do with using BatchNormalization in the strategy scope. If you remove this normalization layer, the training goes well…
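For reference, a rough sketch of the setup these comments describe, a BatchNormalization layer created under a MirroredStrategy scope (hypothetical toy model, not taken from this issue):

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Hypothetical toy model, only to make the reported setup concrete.
    net = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(16,)),
        tf.keras.layers.BatchNormalization(),  # the layer the comments point at
        tf.keras.layers.Dense(1),
    ])

# Under MirroredStrategy the layer's moving_mean / moving_variance become
# distributed variables, and BatchNormalization assigns to them on every
# training-mode call.
print([(v.name, type(v).__name__) for v in net.layers[1].non_trainable_variables])

Whether those moving-statistics updates are the actual trigger isn't confirmed in this thread, but it would be consistent with the error being tied to the replica context and with the observation that removing the layer makes training run.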
I get the same error trying to create a Resnet50 model with mirrored distribution across 2 GPUs.
I am training on: