tensorflow: Random error after hundreds of iterations: Aborting RingReduce with Invalid argument: Incompatible shapes: [0,64] vs. [0,256]
Issue Type
Bug
Have you reproduced the bug with TF nightly?
No
Source
source
Tensorflow Version
tensorflow-rocm 2.2
Custom Code
Yes
OS Platform and Distribution
Ubuntu 20.04
Mobile device
No response
Python version
3.8
Bazel version
No response
GCC/Compiler version
No response
CUDA/cuDNN version
ROCm v3.5
GPU model and memory
2x RX 480, 4 GB each
Current Behaviour?
Hello Tensorflow community,
I randomly encounter this bug after a few tens of iterations, for no apparent reason; sometimes it only appears after t > 400.
The reported "incompatible shapes" also differ from one execution to the next, so it does not seem to come from the input data.
I also get the following warnings during execution:
2023-01-05 10:30:46.933216: W tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc:435] error: Internal: Complete shape not known for allreduce/CollectiveReduce_5
2023-01-05 10:30:46.933250: W tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc:1117] error: Internal: Complete shape not known for allreduce/CollectiveReduce_5
2023-01-05 10:30:46.933351: E tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc:1134] ScopedAllocatorOptimizer: Internal: Complete shape not known for allreduce/CollectiveReduce_5
2023-01-05 10:30:46.933359: W tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc:907] error: Internal: Complete shape not known for allreduce/CollectiveReduce_5
2023-01-05 10:30:46.934321: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:563] scoped_allocator_optimizer failed: Internal: Complete shape not known for allreduce/CollectiveReduce_5
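For what it's worth, Grappler exposes a switch for the scoped allocator pass that emits these warnings. The sketch below is only a diagnostic idea I have not verified on this setup; it uses the documented tf.config.optimizer.set_experimental_options key and would need to run before the strategy and model are created:

import tensorflow as tf

# Unverified diagnostic: ask Grappler to skip the scoped allocator pass that
# produces the "Complete shape not known" warnings above. Must run before any
# graphs are built (i.e. before creating the strategy and the model).
tf.config.optimizer.set_experimental_options(
    {"scoped_allocator_optimization": False}
)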
Thank you for your help.
Standalone code to reproduce the issue
import os

# Set before importing TensorFlow so the C++ log level actually takes effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

import random
import time
from collections import deque

import numpy as np
import tensorflow as tf
from tqdm import tqdm

print(tf.config.experimental.list_physical_devices("GPU"))
mirrored_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.RING)
window_size = 5
episodes = 20
batch_size = 32
NAME = f"Blackstonev1-LSTM-32x64x64-{int(time.time())}"
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=os.path.join("logs", NAME))  # os.path.join avoids the backslash path, which breaks on Linux
class AIAgent:
    def __init__(self, state_size, action_space=3, model_name=NAME):  # Stay, Buy, Sell
        self.state_size = state_size
        self.action_space = action_space
        self.memory = deque(maxlen=2000)
        self.inventory = []
        self.margin_inventory = []
        self.model_name = model_name
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_final = 0.05
        self.epsilon_decay = 0.9995
        self.model = self.model_builder()

    def model_builder(self):
        with mirrored_strategy.scope():
            model = tf.keras.models.Sequential()
            model.add(tf.keras.Input(shape=(window_size, 2)))
            model.add(tf.keras.layers.LSTM(units=32, activation='relu', return_sequences=True))
            model.add(tf.keras.layers.LSTM(units=64, activation='relu', return_sequences=True))
            model.add(tf.keras.layers.LSTM(units=64, activation='relu', return_sequences=False))
            model.add(tf.keras.layers.Dense(units=self.action_space, activation='linear'))
            model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=0.001))
        return model

    def trade(self, state):
        rdm = random.random()
        if rdm <= self.epsilon:
            rdm_act = random.randrange(self.action_space)
            print(f"random: {rdm_act}")
            return rdm_act
        actions = self.model.predict(state)
        argmax = np.argmax(actions[0])
        print(f'model: {argmax}')
        return argmax

    def batch_train(self, batch_size):
        batch = []
        for i in range(len(self.memory) - batch_size + 1, len(self.memory)):
            batch.append(self.memory[i])
        for state, action, reward, next_state, done in batch:
            if not done:
                reward = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            target = self.model.predict(state)
            target[0][action] = reward
            self.model.fit(state, target, epochs=1, verbose=0, callbacks=[tensorboard])
        if self.epsilon > self.epsilon_final:
            self.epsilon *= self.epsilon_decay
def state_creator(data, timestep, window_size):
    starting_id = timestep - window_size + 1
    if starting_id >= 0:
        windowed_data = data[starting_id:timestep + 1]
    else:
        windowed_data = -starting_id * [data[0]] + list(data[0:timestep + 1])
    state = windowed_data
    return np.array([state])
def main(batch_size, window_size, episodes):
    data = load_data(stock_name)  # Replace with your own input here
    data_samples = len(data) - 1
    agent = AIAgent(window_size)
    agent.model.summary()
    for episode in range(1, episodes + 1):
        print("Episode: {}/{}".format(episode, episodes))
        state = state_creator(data, 0, window_size)
        total_profit = 0
        agent.inventory = []
        for t in tqdm(range(data_samples)):
            action = agent.trade(state)
            next_state = state_creator(data, t + 1, window_size)
            reward = 0
            if action == 1:
                pass  # Do that (buy logic elided)
            elif action == 2:
                pass  # Do that (sell logic elided)
            elif action == 0:
                pass  # Do that (hold logic elided)
            if t == data_samples - 1:
                done = True
            else:
                done = False
            agent.memory.append((state, action, reward, next_state, done))
            state = next_state
            if len(agent.memory) > batch_size:
                agent.batch_train(batch_size)
        agent.model.save(f"{agent.model_name}_{episode}.h5")
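For completeness, load_data and stock_name above are placeholders for my own data pipeline; the stand-in below is purely hypothetical and only generates a synthetic (price, feature) series of the right shape so the script can be run end to end:

# Hypothetical stand-ins for the elided data pipeline: a synthetic
# (price, feature) series shaped like the real input, plus an entry point.
def load_data(stock_name, n_points=1000):
    prices = np.cumsum(np.random.randn(n_points)) + 100.0
    features = np.random.rand(n_points)
    return list(zip(prices, features))

stock_name = "SYNTHETIC"

if __name__ == "__main__":
    main(batch_size, window_size, episodes)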
Relevant log output
Traceback (most recent call last):
File "main.py", line 207, in <module>
trader.batch_train(batch_size)
File "main.py", line 91, in batch_train
self.model.fit(state, target, epochs=1, verbose=0, callbacks=[tensorboard])
File "/home/hugo/Documents/scripts/blackstone_ai/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 66, in _method_wrapper
return method(self, *args, **kwargs)
File "/home/hugo/Documents/scripts/blackstone_ai/venv/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 848, in fit
tmp_logs = train_function(iterator)
File "/home/hugo/Documents/scripts/blackstone_ai/venv/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 580, in __call__
result = self._call(*args, **kwds)
File "/home/hugo/Documents/scripts/blackstone_ai/venv/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 611, in _call
return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
File "/home/hugo/Documents/scripts/blackstone_ai/venv/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 2420, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "/home/hugo/Documents/scripts/blackstone_ai/venv/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 1661, in _filtered_call
return self._call_flat(
File "/home/hugo/Documents/scripts/blackstone_ai/venv/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 1745, in _call_flat
return self._build_call_outputs(self._inference_function.call(
File "/home/hugo/Documents/scripts/blackstone_ai/venv/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 593, in call
outputs = execute.execute(
File "/home/hugo/Documents/scripts/blackstone_ai/venv/lib/python3.8/site-packages/tensorflow/python/eager/execute.py", line 59, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Incompatible shapes: [0,64] vs. [0,256]
[[{{node gradient_tape/replica_1/sequential/lstm_2/while/replica_1/sequential/lstm_2/while_grad/body/_1459/gradients/lstm_cell_2/mul_grad/BroadcastGradientArgs}}]]
(1) Invalid argument: Incompatible shapes: [0,64] vs. [0,256]
[[{{node gradient_tape/replica_1/sequential/lstm_2/while/replica_1/sequential/lstm_2/while_grad/body/_1459/gradients/lstm_cell_2/mul_grad/BroadcastGradientArgs}}]]
[[Adam/Adam/update_1_1/AssignAddVariableOp/_323]]
0 successful operations.
1 derived errors ignored. [Op:__inference_train_function_10543]
Function call stack:
train_function -> train_function
About this issue
- Original URL
- State: closed
- Created a year ago
- Comments: 16 (1 by maintainers)
It was indeed coming from my environment. The following versions finally worked for me: Python 3.7.16, tensorflow-rocm 2.1.6, ROCm 3.5.1, Ubuntu 20.04, kernel 5.0.42. If you hit the same error, I recommend this guide: https://github.com/boriswinner/RX580-rocM-tensorflow-ubuntu20.4-guide
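If it helps anyone landing here, a quick check that your environment matches that combination (the versions in the comments are simply the ones quoted above):

import platform
import tensorflow as tf

# Compare against the working combination reported above:
# Python 3.7.16, tensorflow-rocm 2.1.6, kernel 5.0.42.
print("python     :", platform.python_version())
print("tensorflow :", tf.__version__)
print("kernel     :", platform.release())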
Wish you good luck if you have to train with AMD’s Polaris card …
Thank you for your help @SuryanarayanaY