It seems that distributed training with the TensorFlow backend is broken in Keras 3.7 and above.

https://stackoverflow.com/questions/79285532/multi-gpu-training-in-tensorflow-results-in-nans?noredirect=1#comment140127186_79285532

Comment From: dhantule

Hi @DarrenR96, thanks for reporting this. Could you please refer to this issue and test your code with keras-nightly?
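
A quick way to confirm that the nightly build is actually the one being imported (assuming it was installed with pip install keras-nightly):

import keras

# Nightly builds carry a ".dev" suffix in the version string.
print(keras.__version__)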

Comment From: willianck

I also have an issue with the Keras version when trying to run the following dummy example for distributed training. I am currently using tensorflow[and-cuda] 2.18.0, which installs keras 3.9.2:

import tensorflow as tf
import keras

def create_dataset():
    float_data = tf.constant([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32)
    string_data = tf.constant([["foo", "bar"], ["baz", "qux"]], dtype=tf.string)
    labels = tf.constant([[1], [0]], dtype=tf.float32)

    dataset = tf.data.Dataset.from_tensor_slices(((float_data, string_data), labels))
    return dataset

def create_model():
    input_float = keras.Input(shape=(2,), dtype=tf.float32, name='float_input')
    input_string = keras.Input(shape=(2,), dtype=tf.string, name='string_input')

    # Note: StringLookup outputs integer token indices, not embeddings.
    string_lookup = keras.layers.StringLookup(vocabulary=["foo", "bar", "baz", "qux"], name='string_lookup')
    string_embedding = string_lookup(input_string)

    concatenated = keras.layers.Concatenate(name='concatenate')([input_float, string_embedding])

    dense = keras.layers.Dense(10, activation='relu', name='dense_1')(concatenated)
    output = keras.layers.Dense(1, activation='sigmoid', name='output')(dense)

    model = keras.Model(inputs=[input_float, input_string], outputs=output, name='simple_model')
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def main():
    print("Multiple GPUs strategy")

    strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])

    n_gpu: int = len(tf.config.list_physical_devices('GPU'))
    n_replicas: int = strategy.num_replicas_in_sync

    print(f'GPU: {n_gpu}')
    print(f'Replicas: {n_replicas}')

    dataset = create_dataset()

    with strategy.scope():
        model = create_model()

    model.fit(dataset.batch(2), epochs=5)

if __name__ == "__main__":
    main()

I get the following error:

tensorflow.python.framework.errors_impl.InvalidArgumentError: Value for attr 'T' of string is not in the list of allowed values: float, double, int32, uint8, int16, int8, complex64, int64, qint8, quint8, qint32, bfloat16, qint16, quint16, uint16, complex128, half, uint32, uint64, variant
        ; NodeDef: {{node AddN}}; Op<name=AddN; signature=inputs:N*T -> sum:T; attr=N:int,min=1; attr=T:type,allowed=[DT_FLOAT, DT_DOUBLE, DT_INT32, DT_UINT8, DT_INT16, DT_INT8, DT_COMPLEX64, DT_INT64, DT_QINT8, DT_QUINT8, DT_QINT32, DT_BFLOAT16, DT_QINT16, DT_QUINT16, DT_UINT16, DT_COMPLEX128, DT_HALF, DT_UINT32, DT_UINT64, DT_VARIANT]; is_commutative=true; is_aggregate=true> [Op:AddN] name: 

I was able to make it work by downgrading to tensorflow[and-cuda]==2.17.0 and keras==3.4.1.
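
The AddN in the error suggests the runtime is trying to sum per-replica tensors, which is only defined for numeric dtypes, so the string input appears to be what trips it. Another possible workaround (a sketch I have not verified on a multi-GPU setup; the function names are just illustrative) is to apply the StringLookup in the tf.data pipeline, so that only numeric tensors reach the distributed model:

import tensorflow as tf
import keras

# Build the lookup once, outside the model.
lookup = keras.layers.StringLookup(vocabulary=["foo", "bar", "baz", "qux"])

def create_numeric_dataset():
    float_data = tf.constant([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float32)
    string_data = tf.constant([["foo", "bar"], ["baz", "qux"]], dtype=tf.string)
    labels = tf.constant([[1], [0]], dtype=tf.float32)
    ds = tf.data.Dataset.from_tensor_slices(((float_data, string_data), labels))
    # Map strings to integer ids in the input pipeline, before distribution.
    return ds.map(lambda feats, y: ((feats[0], lookup(feats[1])), y))

def create_numeric_model():
    input_float = keras.Input(shape=(2,), dtype=tf.float32, name='float_input')
    input_ids = keras.Input(shape=(2,), dtype=tf.int64, name='string_ids')
    ids = keras.ops.cast(input_ids, "float32")
    x = keras.layers.Concatenate()([input_float, ids])
    x = keras.layers.Dense(10, activation='relu')(x)
    output = keras.layers.Dense(1, activation='sigmoid')(x)
    model = keras.Model(inputs=[input_float, input_ids], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model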

A similar issue was posted on the tensorflow repo, but it was closed without seemingly being fixed.

Comment From: coldhearti

I ran into this issue after upgrading to 3.10. Running MirroredStrategy with 4 GPUs, the loss goes NaN almost immediately.
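
For anyone trying to localize where the NaNs first appear, TensorFlow's numeric checking can help; a minimal sketch:

import tensorflow as tf

# Raises an error at the first op that produces a NaN or Inf tensor,
# which makes it easier to pinpoint where the loss blows up.
tf.debugging.enable_check_numerics()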

Comment From: divyashreepathihalli

Tagging @amitsrivastava78 to take a look

Comment From: amitsrivastava78

I tried the code below on Keras 3.10 and TF 2.18:

import contextlib

import keras
# import tf_keras as keras  # swap in to compare against legacy Keras

import tensorflow as tf

# tf.debugging.enable_check_numerics()

# Split one physical GPU into 4 logical devices so MirroredStrategy has replicas.
n_devices = 4
tf.config.set_logical_device_configuration(
    tf.config.list_physical_devices("GPU")[0],
    [tf.config.LogicalDeviceConfiguration(memory_limit=1000)] * n_devices,
)

keras.utils.set_random_seed(0)

input_shape = (1,)

strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
do_bug = True
# Reuse the strategy created above instead of constructing a second one.
with strategy.scope() if do_bug else contextlib.nullcontext():
    model = keras.Sequential(
        [
            keras.Input(shape=input_shape),
            keras.layers.Identity(),
        ]
    )

    batch_size = 128
    epochs = 1

    model.compile(loss="mse", optimizer="sgd")

    model.fit(
        tf.ones((batch_size * 200, 1)),
        tf.zeros((batch_size * 200, 1)),
        batch_size=batch_size,
        epochs=epochs,
    )

produces no NaN; the results are below:

Number of devices: 4
 22/200 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - loss: 1.0000
/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py:83: UserWarning: The model does not have any trainable weights.
  warnings.warn("The model does not have any trainable weights.")
200/200 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 1.0000
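
Note that this repro may not exercise the failing path: as the warning says, the Identity model has no trainable weights, so no gradients are ever aggregated across replicas. A variant with a trainable layer (a sketch, not run here) would force the cross-replica gradient all-reduce to actually happen:

import tensorflow as tf
import keras

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    # A Dense layer gives the model trainable weights, so gradient
    # aggregation across replicas actually runs during fit().
    model = keras.Sequential(
        [
            keras.Input(shape=(1,)),
            keras.layers.Dense(1),
        ]
    )
    model.compile(loss="mse", optimizer="sgd")

batch_size = 128
model.fit(
    tf.ones((batch_size * 200, 1)),
    tf.zeros((batch_size * 200, 1)),
    batch_size=batch_size,
)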
