tensorflow: Possible bug in dynamic_rnn when training on TPU for iterations_per_loop > 1
System information
- Have I written custom code (as opposed to using a stock example script provided in TensorFlow): Yes
- OS Platform and Distribution (e.g., Linux Ubuntu 16.04): Google Cloud Platform (Linux Debian)
- Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if the issue happens on mobile device: NA
- TensorFlow installed from (source or binary): Binary
- TensorFlow version (use command below): 1.9
- Python version: 2.7
- Bazel version (if compiling from source): NA
- GCC/Compiler version (if compiling from source): NA
- CUDA/cuDNN version: NA (TPU training)
- GPU model and memory: NA (TPU training)
- Exact command to reproduce:
Describe the problem
Training an RNN (constructed with dynamic_rnn) on TPU produces markedly different loss values for iterations_per_loop=1 and iterations_per_loop=100. The loss when training on TPU with iterations_per_loop=1 is very close to the loss when training on CPU, but the loss for the iterations_per_loop=100 case is orders of magnitude larger.
See below for the code to reproduce this issue. I also tested it with BasicRNNCell (instead of GRUCell) and observed the same problem. For easier debugging, I have made the runs deterministic: all the random ops are seeded, so repeated runs produce exactly the same values.
Note that if I replace my model_fn with a simple linear model containing only a matrix multiplication (instead of dynamic_rnn), the loss is identical for any value of iterations_per_loop, as expected. So I suspect there is a bug in using dynamic_rnn on TPU. A sketch of such a linear control model is shown below.
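The following is an illustrative sketch of the linear control model_fn, not the exact control code I ran; it reuses the names (SEED, USE_TPU, tpu_estimator) from the full script below, and linear_model_fn and its reshape are my own illustration:

def linear_model_fn(features, mode, params):
    # Control model: reconstruct the (batch, time, dims) input with a single
    # matrix multiplication and no recurrence. Illustrative sketch only.
    ts = features.get_shape().as_list()[1]
    flat = tf.reshape(features, [params['batch_size'], ts * params['dims']])
    winit = tf.random_normal_initializer(0.0, 0.1, dtype=tf.float32, seed=SEED)
    recon = tf.contrib.layers.fully_connected(
        inputs=flat, num_outputs=ts * params['dims'], activation_fn=None,
        weights_initializer=winit)
    loss = tf.losses.mean_squared_error(flat, recon)
    opt = tf.train.AdamOptimizer(0.01)
    if USE_TPU:
        opt = tf.contrib.tpu.CrossShardOptimizer(opt)
    train_op = opt.minimize(loss, tf.train.get_global_step())
    return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op)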
Source code / logs
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tpu.python.tpu import tpu_estimator
import subprocess
import os
SEED = 10010
USE_TPU = True
def make_data(params):
    # Make training and validation data: sinusoids with random phases.
    np.random.seed(SEED)
    num_samp_tr = 1000
    num_samp_val = 1000
    ramps_tr = np.transpose(np.broadcast_to(
        0.1 * np.arange(0, 100), (num_samp_tr, params['dims'], 100)), (0, 2, 1))
    rand_phase = np.transpose(np.tile(
        np.random.randn(num_samp_tr, params['dims']), (100, 1, 1)), (1, 0, 2))
    ramps_val = np.transpose(np.broadcast_to(
        0.1 * np.arange(0, 100), (num_samp_val, params['dims'], 100)), (0, 2, 1))
    rand_phase_val = np.transpose(np.tile(
        np.random.randn(num_samp_val, params['dims']), (100, 1, 1)), (1, 0, 2))
    data = {'train_data': np.sin(ramps_tr + rand_phase),
            'valid_data': np.sin(ramps_val + rand_phase_val)}
    return data
def input_fn(data_dict, mode):
    def data_fn(params):
        batch_size = params['batch_size']
        if mode == tf.estimator.ModeKeys.TRAIN:
            dataset = tf.data.Dataset.from_tensor_slices(
                data_dict['train_data'].astype(np.float32)).cache().repeat().shuffle(
                    buffer_size=10000, seed=SEED)
        else:
            dataset = tf.data.Dataset.from_tensor_slices(
                data_dict['valid_data'].astype(np.float32)).cache().repeat()
        dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(batch_size))
        return dataset
    return data_fn
def model_fn(features, mode, params):
    # tf.set_random_seed(SEED)  # Use with BasicRNNCell, which takes no kernel initializer.
    batch_size = params['batch_size']
    ts = features.get_shape().as_list()[1]
    seq_len = ts * np.ones([batch_size])
    with tf.variable_scope('encoder'):
        init_kern = tf.random_normal_initializer(0.0, 0.1, dtype=tf.float32, seed=SEED)
        # cell = tf.contrib.rnn.BasicRNNCell(num_units=20)
        cell = tf.contrib.rnn.GRUCell(num_units=20, kernel_initializer=init_kern)
        _, output_latent = tf.nn.dynamic_rnn(
            cell=cell, inputs=features, sequence_length=seq_len, dtype=tf.float32)
    with tf.variable_scope('decoder'):
        init_kern = tf.random_normal_initializer(0.0, 0.1, dtype=tf.float32, seed=SEED)
        # cell = tf.contrib.rnn.BasicRNNCell(num_units=20)
        cell = tf.contrib.rnn.GRUCell(num_units=20, kernel_initializer=init_kern)
        z_inps = tf.zeros([batch_size, ts, 1])
        output_recon, _ = tf.nn.dynamic_rnn(
            cell=cell, inputs=z_inps, initial_state=output_latent,
            sequence_length=seq_len, dtype=tf.float32)
    winit = tf.random_normal_initializer(0.0, 0.1, dtype=tf.float32, seed=SEED)
    output_recon = tf.contrib.layers.fully_connected(
        inputs=output_recon, num_outputs=params['dims'], activation_fn=None,
        weights_initializer=winit)
    loss = tf.losses.mean_squared_error(features, output_recon)
    global_step = tf.train.get_global_step()
    opt = tf.train.AdamOptimizer(0.01)
    if USE_TPU:
        opt = tf.contrib.tpu.CrossShardOptimizer(opt)
    train_op = opt.minimize(loss, global_step)

    def metric_fn(labels, rec):
        return {
            'MSE': tf.metrics.mean_squared_error(labels, rec),
        }

    tpu_eval_metrics = (metric_fn, [features, output_recon])
    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metrics=tpu_eval_metrics)
def train_model(num_steps, iterations_per_loop, num_shards=1):
    if USE_TPU:
        my_project = subprocess.check_output(
            ['gcloud', 'config', 'get-value', 'project'])
        my_zone = subprocess.check_output(
            ['gcloud', 'config', 'get-value', 'compute/zone'])
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            tpu=[os.environ['TPU_NAME']])
    else:
        tpu_cluster_resolver = None
    # tf.logging.set_verbosity(tf.logging.INFO)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=True)
    run_config = tf.contrib.tpu.RunConfig(
        save_checkpoints_steps=400,
        cluster=tpu_cluster_resolver,
        keep_checkpoint_max=1,
        model_dir='gs://test-bucket/runs',
        session_config=config,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop, num_shards=num_shards))
    params = {'dims': 5}
    data = make_data(params)
    train_input = input_fn(data, tf.estimator.ModeKeys.TRAIN)
    eval_input = input_fn(data, tf.estimator.ModeKeys.EVAL)
    model = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn, params=params, config=run_config,
        use_tpu=USE_TPU, train_batch_size=100, eval_batch_size=100)
    model.train(train_input, steps=num_steps)
    valid_costs = model.evaluate(eval_input, name='valid_data', steps=2)
    print('==== Evaluation:')
    print(valid_costs)
    return valid_costs
print("==================== Training with iterations_per_loop = 1")
run1 = train_model(num_steps=100, iterations_per_loop=1, num_shards=1)
# remove checkpoints so the second run starts from a fresh model_dir
subprocess.call("gsutil -m rm -r gs://test-bucket/runs/*", shell=True)
print("==================== Training with iterations_per_loop = 100")
run2 = train_model(num_steps=100, iterations_per_loop=100, num_shards=1)
print('Summary:')
print('====== iterations_per_loop = 1 :')
print(run1)
print('====== iterations_per_loop = 100 :')
print(run2)
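For reference, a sketch of how the CPU baseline below can be produced (my assumption: the same script with TPU disabled; with use_tpu=False, TPUEstimator runs the model_fn locally and the CrossShardOptimizer wrapper and cluster resolver are skipped):

# Sketch (assumption), using the script above:
USE_TPU = False  # disables CrossShardOptimizer and the TPUClusterResolver
run_cpu = train_model(num_steps=100, iterations_per_loop=1, num_shards=1)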
Output (multiple runs):
CPU Run:
{'loss': 0.2408253, 'MSE': 0.24082531, 'global_step': 100}
TPU Runs:
iterations_per_loop=1
Run1:
{'loss': 0.24119371, 'MSE': 0.2411936, 'global_step': 100}
Run2:
{'loss': 0.24119371, 'MSE': 0.2411936, 'global_step': 100}
iterations_per_loop=100
Run1:
{'loss': 29.255905, 'MSE': 29.25589, 'global_step': 100}
Run2:
{'loss': 29.255905, 'MSE': 29.25589, 'global_step': 100}
About this issue
- Original URL
- State: closed
- Created 6 years ago
- Comments: 15 (10 by maintainers)
After much delay (sorry about that!), @mrezak your GCP project should be whitelisted. Please do re-open if this is not the case. Thanks!
(For others reading this thread, TF 1.10 has encountered a couple delays that we are working on fixing. It should be out soon. Thanks for your patience!)
We’re looking into the error with static_rnn first. Once that’s fixed, you will be able to rerun and we can compare again.