It seems as though distributed training with keras version 3.7 and above with tensorflow isn't supported.
https://stackoverflow.com/questions/79285532/multi-gpu-training-in-tensorflow-results-in-nans?noredirect=1#comment140127186_79285532
Comment From: dhantule
Hi @DarrenR96, thanks for reporting this.
Could you please refer to this issue and test your code with keras-nightly?
Comment From: willianck
I also have an issue with the Keras version used when trying to run the following dummy example for distributed training. I am currently using tensorflow[and-cuda] 2.18.0, which installs Keras 3.9.2:
import tensorflow as tf
import keras
def create_dataset():
    """Return a two-example dataset of ``((floats, strings), label)`` elements.

    Each example pairs a float32 feature vector of length 2 with a string
    feature vector of length 2, plus a single float32 label.
    """
    features = (
        tf.constant([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32),
        tf.constant([["foo", "bar"], ["baz", "qux"]], dtype=tf.string),
    )
    targets = tf.constant([[1], [0]], dtype=tf.float32)
    return tf.data.Dataset.from_tensor_slices((features, targets))
def create_model():
    """Build and compile a two-input (float + string) binary classifier.

    The string input is passed through a ``StringLookup`` layer, concatenated
    with the float input, and fed through a 10-unit ReLU layer followed by a
    single sigmoid output. The model is compiled with Adam, binary
    cross-entropy loss and an accuracy metric before being returned.
    """
    float_in = keras.Input(shape=(2,), dtype=tf.float32, name='float_input')
    string_in = keras.Input(shape=(2,), dtype=tf.string, name='string_input')

    lookup_layer = keras.layers.StringLookup(
        vocabulary=["foo", "bar", "baz", "qux"],
        name='string_lookup',
    )
    looked_up = lookup_layer(string_in)

    merged = keras.layers.Concatenate(name='concatenate')([float_in, looked_up])
    hidden = keras.layers.Dense(10, activation='relu', name='dense_1')(merged)
    prediction = keras.layers.Dense(1, activation='sigmoid', name='output')(hidden)

    net = keras.Model(
        inputs=[float_in, string_in],
        outputs=prediction,
        name='simple_model',
    )
    net.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return net
def main():
    """Train the dummy model under a two-GPU ``MirroredStrategy``.

    Prints the number of physical GPUs and the number of replicas in sync,
    creates the model inside the strategy scope, and fits it for 5 epochs on
    the dummy dataset batched in twos.
    """
    print("Multiple GPUs strategy")
    strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])

    n_gpu: int = len(tf.config.list_physical_devices('GPU'))
    n_replicas: int = strategy.num_replicas_in_sync
    print(f'GPU: {n_gpu}')
    print(f'Replicas: {n_replicas}')

    dataset = create_dataset()
    batches = dataset.batch(2)

    # Model creation (and the compile call inside create_model) happens under
    # the strategy scope.
    with strategy.scope():
        model = create_model()

    # NOTE(review): the original snippet lost its indentation, so whether
    # fit() was inside the scope is unknown — it is placed outside here; confirm.
    model.fit(batches, epochs=5)


if __name__ == "__main__":
    main()
I get the following error:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Value for attr 'T' of string is not in the list of allowed values: float, double, int32, uint8, int16, int8, complex64, int64, qint8, quint8, qint32, bfloat16, qint16, quint16, uint16, complex128, half, uint32, uint64, variant
; NodeDef: {{node AddN}}; Op<name=AddN; signature=inputs:N*T -> sum:T; attr=N:int,min=1; attr=T:type,allowed=[DT_FLOAT, DT_DOUBLE, DT_INT32, DT_UINT8, DT_INT16, DT_INT8, DT_COMPLEX64, DT_INT64, DT_QINT8, DT_QUINT8, DT_QINT32, DT_BFLOAT16, DT_QINT16, DT_QUINT16, DT_UINT16, DT_COMPLEX128, DT_HALF, DT_UINT32, DT_UINT64, DT_VARIANT]; is_commutative=true; is_aggregate=true> [Op:AddN] name:
I was able to make it work by downgrading tensorflow and keras to tensorflow[and-cuda]==2.17.0 and keras==3.4.1.
A similar issue was posted on the tensorflow repo, but it has been closed without seemingly having been fixed.
Comment From: coldhearti
I ran into this issue after upgrading to 3.10. Running mirrored strategy with 4 GPUs. Loss goes NaN nearly immediately.
Comment From: divyashreepathihalli
Tagging @amitsrivastava78 to take a look
Comment From: amitsrivastava78
I tried the code below on Keras 3.10 and TF 2.18:
import contextlib
import keras
import numpy as np
# import tf_keras as keras
import tensorflow as tf
#tf.debugging.enable_check_numerics()
# Reproduction script: train a weight-less identity model under
# MirroredStrategy and watch whether the reported loss becomes NaN.

# Split the first physical GPU into `n_devices` logical devices so that
# MirroredStrategy gets several replicas on a single-GPU host.
n_devices = 4
tf.config.set_logical_device_configuration(
    tf.config.list_physical_devices("GPU")[0],
    [tf.config.LogicalDeviceConfiguration(memory_limit=1000)] * n_devices,
)
keras.utils.set_random_seed(0)

input_shape = (1,)
batch_size = 128
epochs = 1

strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

# Set to False to run the same model without any distribution strategy.
do_bug = True

# Reuse `strategy` rather than constructing a second MirroredStrategy, so the
# replica count printed above describes the strategy the model is built under.
with strategy.scope() if do_bug else contextlib.nullcontext():
    model = keras.Sequential(
        [
            keras.Input(shape=input_shape),
            keras.layers.Identity(),
        ]
    )
    # NOTE(review): the original snippet lost its indentation; compile() is
    # kept inside the scope and fit() outside — confirm against the author's run.
    model.compile(loss="mse", optimizer="sgd")

# Inputs are all ones and targets all zeros; the model is an identity layer.
model.fit(
    tf.ones((batch_size * 200, 1)),
    tf.zeros((batch_size * 200, 1)),
    batch_size=batch_size,
    epochs=epochs,
)
This produces no NaN. The results are below:
Number of devices: 4
22/200 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - loss: 1.0000
/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py:83: UserWarning: The model does not have any trainable weights.
  warnings.warn("The model does not have any trainable weights.")
200/200 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0000
Comment From: github-actions[bot]
This issue is stale because it has been open for 14 days with no activity. It will be closed if no further activity occurs. Thank you.