TensorFlow crashes when the factory has more than one PyKeras model booked

Hi,
Until some time ago I used a script similar to the one below to train a set of Keras models in order to tune their parameters:

def getKerasModel(inputDim, modelName, nLayers = 3, layerSize = 200, dropValue = 0.2, optLabel = 'adam'):
    """Build, compile and save a dense two-class Keras model.

    The network is a Sequential stack of ReLU Dense layers of width
    ``layerSize`` (one input layer plus ``nLayers - 1`` further layers),
    each followed by a Dropout layer when ``dropValue`` is non-zero, and
    closed by a 2-unit softmax layer.

    Args:
        inputDim: Value passed as ``input_dim`` to the first Dense layer.
        modelName: Path handed to ``model.save``.
        nLayers: Total number of hidden Dense layers (the first one is
            always added, even for ``nLayers < 1``).
        layerSize: Number of units in every hidden layer.
        dropValue: Dropout rate; ``0`` disables the Dropout layers.
        optLabel: ``'sgd'`` selects SGD with Nesterov momentum; any other
            value keeps Adam.

    Returns:
        None. The compiled model is written to ``modelName`` and its
        summary is printed.
    """
    model = Sequential()

    def addHiddenLayer(**extraArgs):
        # One hidden unit of the stack: Dense + (optional) Dropout.
        model.add(Dense(layerSize, activation='relu', kernel_initializer='normal', **extraArgs))
        if dropValue != 0:
            model.add(Dropout(dropValue))

    # Only the first layer carries the input dimension.
    addHiddenLayer(input_dim=inputDim)
    for _ in range(1, nLayers):
        addHiddenLayer()

    model.add(Dense(2, activation='softmax'))

    # Adam is the default; it is replaced only when 'sgd' is requested.
    opt = Adam(lr=0.001)
    if optLabel == 'sgd':
        opt = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    model.save(modelName)
    model.summary()

...
...
...


# Hyper-parameter grid: every combination is saved as its own .h5 model
# and booked as a separate PyKeras method.
ntu_nLayers = (2, 3, 4)
ntu_layerSize = (100, 150, 200)
ntu_dropValue = (0.0, 0.1, 0.2, 0.3)

for nLayers in ntu_nLayers:
    for layerSize in ntu_layerSize:
        for dropValue in ntu_dropValue:
            # The suffix encodes the grid point; dropout is written in tenths.
            suffix = '_{}_{}_{}'.format(nLayers, layerSize, int(dropValue*10))
            modelName = 'modelFullScan{}.h5'.format(suffix)
            dnnName = 'DNNMuonIDFullScan' + suffix

            # Write the untrained model to disk so the method can load it by file name.
            getKerasModel(nVars, modelName, nLayers, layerSize, dropValue)

            dnnOptions = '!H:!V:FilenameModel={}:NumEpochs=15:TriesEarlyStopping=5:BatchSize=1024:ValidationSize=30%'.format(modelName)
            methodOptions = dnnOptions + preprocessingOptions
            factory.BookMethod(dataloader, TMVA.Types.kPyKeras, dnnName, methodOptions)
            print(modelName)
            print(methodOptions)


# Train, test and evaluate every booked method in one go.
factory.TrainAllMethods()
factory.TestAllMethods()
factory.EvaluateAllMethods()

This suddenly stopped working and now crashes.

Let’s downscale a bit.

This works:

# Minimal case with a single booked PyKeras method.
# Two ReLU hidden layers of 100 units and a 2-unit softmax output.
model1 = Sequential()
model1.add(Dense(100, activation='relu', kernel_initializer='normal', input_dim=nVars))
model1.add(Dense(100, activation='relu', kernel_initializer='normal'))
model1.add(Dense(2, activation='softmax'))
model1.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])
# Save the compiled model; the method options below refer to it by file name.
model1.save('a1.h5')
model1.summary()

# Book the only method of this test, pointing it at a1.h5.
dnnOptions = '!H:!V:FilenameModel=a1.h5:NumEpochs=15:TriesEarlyStopping=5:BatchSize=1024:ValidationSize=30%'
factory.BookMethod(dataloader, TMVA.Types.kPyKeras, 'DNNMuonIDFullScanA1', dnnOptions + preprocessingOptions)

# Run training, test and evaluation for the single booked method
factory.TrainAllMethods()
factory.TestAllMethods()
factory.EvaluateAllMethods()

but this DOES NOT

# Minimal case with two booked PyKeras methods.
# First model: two ReLU hidden layers of 100 units, 2-unit softmax output.
model1 = Sequential()
model1.add(Dense(100, activation='relu', kernel_initializer='normal', input_dim=nVars))
model1.add(Dense(100, activation='relu', kernel_initializer='normal'))
model1.add(Dense(2, activation='softmax'))
model1.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])
model1.save('a1.h5')
model1.summary()

# Book the first method against a1.h5.
dnnOptions = '!H:!V:FilenameModel=a1.h5:NumEpochs=15:TriesEarlyStopping=5:BatchSize=1024:ValidationSize=30%'
factory.BookMethod(dataloader, TMVA.Types.kPyKeras, 'DNNMuonIDFullScanA1', dnnOptions + preprocessingOptions)
print(dnnOptions)

# Second model: same layout with 150 units per hidden layer, saved to a2.h5.
model2 = Sequential()
model2.add(Dense(150, activation='relu', kernel_initializer='normal', input_dim=nVars))
model2.add(Dense(150, activation='relu', kernel_initializer='normal'))
model2.add(Dense(2, activation='softmax'))
model2.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])
model2.save('a2.h5')
model2.summary()

# dnnOptions is rebound for the second method; the first one was already
# booked with the a1.h5 option string above.
dnnOptions = '!H:!V:FilenameModel=a2.h5:NumEpochs=15:TriesEarlyStopping=5:BatchSize=1024:ValidationSize=30%'
factory.BookMethod(dataloader, TMVA.Types.kPyKeras, 'DNNMuonIDFullScanA2', dnnOptions + preprocessingOptions)

# Run training, test and evaluation
# NOTE(review): the reported traceback ends in factory.TrainAllMethods(),
# so the failure appears to surface at the first call below.
factory.TrainAllMethods()
factory.TestAllMethods()
factory.EvaluateAllMethods()

It crashes with the following error (which I am not able to interpret) as soon as the training begins

Epoch 1/15
<WARNING>                : Failed to run python code: history = model.fit(trainX, trainY, sample_weight=trainWeights, batch_size=batchSize, epochs=numEpochs, verbose=verbose, validation_data=(valX, valY, valWeights), callbacks=callbacks)
<WARNING>                : Python error message:
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 963, in fit
    validation_steps=validation_steps)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1712, in fit
    validation_steps=validation_steps)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1235, in _fit_loop
    outs = f(ins_batch)
  File "/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py", line 2475, in __call__
    **self.session_kwargs)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 929, in run
    run_metadata_ptr)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1152, in _run
    feed_dict_tensor, options, run_metadata)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1328, in _do_run
    run_metadata)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1348, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.FailedPreconditionError: Attempting to use uninitialized value dense_2_1/kernel
	 [[node dense_2_1/kernel/read (defined at /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:392) ]]

Caused by op u'dense_2_1/kernel/read', defined at:
  File "trainingMuonIDscan.py", line 198, in <module>
    factory.BookMethod(dataloader, TMVA.Types.kPyKeras, 'DNNMuonIDFullScanA1', dnnOptions + preprocessingOptions)
  File "<string>", line 1, in <module>
  File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 243, in load_model
    model = model_from_config(model_config, custom_objects=custom_objects)
  File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 317, in model_from_config
    return layer_module.deserialize(config, custom_objects=custom_objects)
  File "/usr/local/lib/python2.7/dist-packages/keras/layers/__init__.py", line 55, in deserialize
    printable_module_name='layer')
  File "/usr/local/lib/python2.7/dist-packages/keras/utils/generic_utils.py", line 144, in deserialize_keras_object
    list(custom_objects.items())))
  File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 1350, in from_config
    model.add(layer)
  File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 492, in add
    output_tensor = layer(self.outputs[0])
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 590, in __call__
    self.build(input_shapes[0])
  File "/usr/local/lib/python2.7/dist-packages/keras/layers/core.py", line 842, in build
    constraint=self.kernel_constraint)
  File "/usr/local/lib/python2.7/dist-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 414, in add_weight
    constraint=constraint)
  File "/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py", line 392, in variable
    v = tf.Variable(value, dtype=tf.as_dtype(dtype), name=name)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 213, in __call__
    return cls._variable_v1_call(*args, **kwargs)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 176, in _variable_v1_call
    aggregation=aggregation)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 155, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 2495, in default_variable_creator
    expected_shape=expected_shape, import_scope=import_scope)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 217, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 1395, in __init__
    constraint=constraint)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 1557, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/util/dispatch.py", line 180, in wrapper
    return target(*args, **kwargs)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 81, in identity
    ret = gen_array_ops.identity(input, name=name)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 3890, in identity
    "Identity", input=input, name=name)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 3300, in create_op
    op_def=op_def)
  File "/home/alberto/.local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()

FailedPreconditionError (see above for traceback): Attempting to use uninitialized value dense_2_1/kernel
	 [[node dense_2_1/kernel/read (defined at /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:392) ]]

<FATAL>                         : Failed to train model
***> abort program execution
Traceback (most recent call last):
  File "trainingMuonIDscan.py", line 214, in <module>
    factory.TrainAllMethods()
Exception: void TMVA::Factory::TrainAllMethods() =>
    FATAL error (C++ exception of type runtime_error)

Curiously:

  • it crashes at the training of the first model (not the second)
  • it does not crash if I define the model but do not book it in the factory
  • it crashes even if I book the first model twice

In other words, as soon as I book a second PyKeras model the training fails. I’m 100% sure that this did not happen in the past (I’m simply rehashing old studies).

What am I missing?

Thanks for any help,

Alberto


ROOT: 6.17/01
Keras: 2.1.4

Hi,
I can confirm that this DOES NOT happen in ROOT 6.16/00

Alberto