Using multiple RReaders in pyROOT

apetukho · August 7, 2023, 3:38pm

Dear ROOT experts,

I want to apply multiple TMVA classifiers to the TTree stored in a .root file using pyROOT and RDataFrame (using this post as a reference). I’m trying to declare as many “lazy” snapshots as possible to run them at the same time to decrease the time needed to start the RDataFrame computations.

I’ve run into a problem trying to define multiple compute functions from the models.

When I run the following code:

#coding=utf-8
import ROOT
from collections import namedtuple

# Record describing one classifier: a short label plus the path of its
# TMVA weights file.
Classifier = namedtuple('classifier', ('name', 'weight_file_path'))

def write_bdt_response_to_file(input_file_path, tree_name, classifier_list):
    """Book one lazy Snapshot of the classifier response per classifier, then run them all.

    For each entry of ``classifier_list`` an RReader is declared in the ROOT
    interpreter under the fixed name ``model`` and a compute functor is
    assigned to the fixed name ``computeModel``. The functor defines a
    ``response<name>`` column that is snapshotted into the tree
    ``<tree_name>_<name>`` of ``input_file_path``.

    NOTE: because the interpreter-level names are the same on every
    iteration, the second ``computeModel = ...`` statement is an assignment
    to an existing object, which the interpreter rejects (the compute helper
    has a deleted copy assignment operator). This version therefore only
    works with a single classifier.

    Args:
        input_file_path: ROOT file holding the input tree; also the output file.
        tree_name: Name of the input tree.
        classifier_list: Iterable of ``Classifier`` records.
    """
    # Handles of the lazily booked snapshots, triggered together at the end.
    snapshot_handle_list = []
    for classifier in classifier_list:
        print classifier.name
        # Declare the reader as a global `model` in the ROOT interpreter.
        ROOT.gInterpreter.ProcessLine('''
            TMVA::Experimental::RReader model("{}");
            '''.format(classifier.weight_file_path))

        variables = ROOT.model.GetVariableNames()
        # The number of input variables is a template argument of Compute.
        variables_num = len(variables)
        # Fails on the second iteration: `computeModel` already exists and
        # its type cannot be copy-assigned.
        ROOT.gInterpreter.ProcessLine('''
            computeModel = TMVA::Experimental::Compute<{}, float>(model);
            '''.format(variables_num))
        
        response_branch_name = 'response' + classifier.name
        df = ROOT.ROOT.RDataFrame(tree_name, input_file_path)
        df = df.Define(response_branch_name, ROOT.computeModel, variables)
        
        # Only the response column is written to the output tree.
        branch_vector = ROOT.vector('string')()
        branch_vector.push_back(response_branch_name)

        response_tree_name = '{}_{}'.format(tree_name, classifier.name)

        opts = ROOT.ROOT.RDF.RSnapshotOptions()
        # Write into the existing input file instead of recreating it.
        opts.fMode = "update"
        opts.fOverwriteIfExists = True
        # Lazy: the snapshot is only booked here; RunGraphs below triggers it.
        opts.fLazy = True
        snapshot_handle_list.append(
            df.Snapshot(response_tree_name, input_file_path, branch_vector, opts)
        )

    ROOT.ROOT.RDF.RunGraphs(snapshot_handle_list)

def main():
    """Write the response of two example classifiers into ``ttV.root``.

    Both classifiers read the same weights file and differ only in the name
    used for their response branch and output tree.
    """
    weights_xml = 'TMVAClassification_NT400Shrinkage02MNS10_1_nospec.weights.xml'
    classifiers = [
        Classifier(name='20230721_BDTscore_nj0_rdf_1', weight_file_path=weights_xml),
        Classifier(name='20230721_BDTscore_nj0_rdf_2', weight_file_path=weights_xml),
    ]
    write_bdt_response_to_file('ttV.root', 'tree_PFLOW', classifiers)


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    main()

I get the following error:

20230721_BDTscore_nj0_rdf_1
20230721_BDTscore_nj0_rdf_2
input_line_73:3:26: error: object of type 'TMVA::Experimental::Internal::ComputeHelper<std::integer_sequence<unsigned long, 0, 1, 2, 3, 4, 5, 6, 7, 8>, float, TMVA::Experimental::RReader &>' cannot be assigned because its copy assignment operator is implicitly deleted
            computeModel = TMVA::Experimental::Compute<9, float>(model);
                         ^
/home/alex/root/include/TMVA/RInferenceUtils.hxx:19:6: note: copy assignment operator of 'ComputeHelper<std::integer_sequence<unsigned long, 0, 1, 2, 3, 4, 5, 6, 7, 8>, float, TMVA::Experimental::RReader &>' is implicitly deleted because field 'fFunc' is of reference type 'TMVA::Experimental::RReader &'
   F fFunc;
     ^

That is, the error seems to occur when I try to define the computeModel function for the second time.

I can rewrite it as follows, giving each model and compute function a different name, since from Python's point of view these names are just strings.

#coding=utf-8
import ROOT
from collections import namedtuple

# Record describing one classifier: a short label plus the path of its
# TMVA weights file.
Classifier = namedtuple('classifier', ('name', 'weight_file_path'))

def write_bdt_response_to_file(input_file_path, tree_name, classifier_list):
    """Evaluate each TMVA classifier on a tree and snapshot its response.

    For every classifier an RReader and a matching compute functor are
    declared in the ROOT interpreter under names derived from
    ``classifier.name``. The functor defines a ``response<name>`` column on
    an RDataFrame over the input tree, and a lazy Snapshot of that column
    into the tree ``<tree_name>_<name>`` is booked. All booked snapshots are
    then triggered by a single ``RunGraphs`` call.

    Args:
        input_file_path: ROOT file holding the input tree. The response
            trees are written back to this same file in "update" mode with
            ``fOverwriteIfExists`` enabled.
        tree_name: Name of the input tree.
        classifier_list: Iterable of ``Classifier`` records. Each ``name`` is
            pasted into C++ variable names, so it must consist of characters
            that are valid in a C++ identifier.
    """
    snapshot_handle_list = []
    for classifier in classifier_list:
        # Parenthesised form: same output on Python 2, valid on Python 3.
        print(classifier.name)

        # Each classifier gets its own interpreter-level names. Assigning a
        # second compute functor to an already declared variable is rejected
        # because the functor type has no copy assignment operator.
        classifier_model_name = "model_{}".format(classifier.name)
        ROOT.gInterpreter.ProcessLine('''
            TMVA::Experimental::RReader {}("{}");
            '''.format(classifier_model_name, classifier.weight_file_path))

        variables = getattr(ROOT, classifier_model_name).GetVariableNames()
        # The number of input variables is a template argument of Compute.
        variables_num = len(variables)
        classifier_compute_name = "compute_{}".format(classifier.name)
        ROOT.gInterpreter.ProcessLine('''
            {} = TMVA::Experimental::Compute<{}, float>({});
            '''.format(classifier_compute_name, variables_num, classifier_model_name))

        response_branch_name = 'response' + classifier.name
        df = ROOT.ROOT.RDataFrame(tree_name, input_file_path)
        df = df.Define(
            response_branch_name,
            getattr(ROOT, classifier_compute_name),
            variables)

        # Only the response column is written to the output tree.
        branch_vector = ROOT.vector('string')()
        branch_vector.push_back(response_branch_name)

        response_tree_name = '{}_{}'.format(tree_name, classifier.name)

        opts = ROOT.ROOT.RDF.RSnapshotOptions()
        # Write into the existing input file instead of recreating it.
        opts.fMode = "update"
        opts.fOverwriteIfExists = True
        # Lazy: the snapshot is only booked here; RunGraphs below triggers it.
        opts.fLazy = True
        snapshot_handle_list.append(
            df.Snapshot(response_tree_name, input_file_path, branch_vector, opts)
        )

    ROOT.ROOT.RDF.RunGraphs(snapshot_handle_list)

def main():
    """Write the response of two example classifiers into ``ttV.root``.

    Both classifiers read the same weights file and differ only in the name
    used for their response branch and output tree.
    """
    weights_xml = 'TMVAClassification_NT400Shrinkage02MNS10_1_nospec.weights.xml'
    classifiers = [
        Classifier(name='20230721_BDTscore_nj0_rdf_1', weight_file_path=weights_xml),
        Classifier(name='20230721_BDTscore_nj0_rdf_2', weight_file_path=weights_xml),
    ]
    write_bdt_response_to_file('ttV.root', 'tree_PFLOW', classifiers)


# Run the example only when the file is executed as a script.
if __name__ == '__main__':
    main()

It works fine, but starts to look too convoluted and hard to understand with all the formats (or f-strings) and getattrs.
Is there a more elegant and less hacky way to get the same results from the ROOT side?

Best regards,
Aleksandr

ROOT Version: 6.26/06
Platform: Ubuntu 20.04
Compiler: Precompiled

bellenot · August 8, 2023, 6:40am

Maybe @moneta can help

system · August 22, 2023, 6:41am

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.