Dear ROOT experts,
I want to apply multiple TMVA classifiers to the `TTree` stored in a `.root` file using PyROOT and `RDataFrame` (using this post as a reference). I’m trying to declare as many “lazy” snapshots as possible and run them all at the same time, to decrease the time needed to start the `RDataFrame` computations.
I’ve run into a problem when trying to define multiple compute functions from the models.
When I run the following code:
#coding=utf-8
import ROOT
from collections import namedtuple
# Lightweight record describing one classifier: a label used to build the
# output names, and the path of its TMVA weight file.
Classifier = namedtuple('classifier', ('name', 'weight_file_path'))
def write_bdt_response_to_file(input_file_path, tree_name, classifier_list):
    """Book one lazy Snapshot per classifier, then run all of them together.

    For each entry of ``classifier_list`` the weight file is loaded into a
    ``TMVA::Experimental::RReader`` declared in the ROOT interpreter, a
    compute functor is built from it, and its output is defined as the column
    ``response<name>`` on an RDataFrame over ``tree_name`` in
    ``input_file_path``.  That single column is booked to be written to the
    tree ``<tree_name>_<name>`` in the same file, opened in update mode.

    Args:
        input_file_path: ROOT file that is both read and updated.
        tree_name: name of the input tree.
        classifier_list: objects exposing ``name`` and ``weight_file_path``.
    """
    pending_snapshots = []

    for classifier in classifier_list:
        print(classifier.name)

        # The interpreter-side names ``model`` and ``computeModel`` are the
        # same on every pass of the loop.
        ROOT.gInterpreter.ProcessLine('''
TMVA::Experimental::RReader model("{}");
'''.format(classifier.weight_file_path))
        input_variables = ROOT.model.GetVariableNames()

        # NOTE: from the second pass on, ``computeModel`` already exists, so
        # this statement is an assignment to the existing object; the
        # interpreter rejects it because the copy assignment operator of the
        # ComputeHelper type is implicitly deleted.
        ROOT.gInterpreter.ProcessLine('''
computeModel = TMVA::Experimental::Compute<{}, float>(model);
'''.format(len(input_variables)))

        # Only the newly defined response column is written out.
        response_column = 'response' + classifier.name
        columns_to_write = ROOT.vector('string')()
        columns_to_write.push_back(response_column)

        snapshot_options = ROOT.ROOT.RDF.RSnapshotOptions()
        snapshot_options.fLazy = True  # booked here, executed by RunGraphs
        snapshot_options.fOverwriteIfExists = True
        snapshot_options.fMode = "update"

        frame = ROOT.ROOT.RDataFrame(tree_name, input_file_path).Define(
            response_column, ROOT.computeModel, input_variables)
        output_tree = '{}_{}'.format(tree_name, classifier.name)
        pending_snapshots.append(
            frame.Snapshot(output_tree, input_file_path, columns_to_write,
                           snapshot_options)
        )

    # Trigger every booked snapshot in a single call.
    ROOT.ROOT.RDF.RunGraphs(pending_snapshots)
def main():
    """Write the response of both configured classifiers for ``tree_PFLOW`` in ``ttV.root``."""
    # Both classifier entries point at the same weight file.
    weight_file = 'TMVAClassification_NT400Shrinkage02MNS10_1_nospec.weights.xml'
    classifiers = [
        Classifier(name='20230721_BDTscore_nj0_rdf_1', weight_file_path=weight_file),
        Classifier(name='20230721_BDTscore_nj0_rdf_2', weight_file_path=weight_file),
    ]
    write_bdt_response_to_file('ttV.root', 'tree_PFLOW', classifiers)
# Run the job only when this file is executed as a script.
if __name__ == '__main__':
    main()
I get the following error:
20230721_BDTscore_nj0_rdf_1
20230721_BDTscore_nj0_rdf_2
input_line_73:3:26: error: object of type 'TMVA::Experimental::Internal::ComputeHelper<std::integer_sequence<unsigned long, 0, 1, 2, 3, 4, 5, 6, 7, 8>, float, TMVA::Experimental::RReader &>' cannot be assigned because its copy assignment operator is implicitly deleted
computeModel = TMVA::Experimental::Compute<9, float>(model);
^
/home/alex/root/include/TMVA/RInferenceUtils.hxx:19:6: note: copy assignment operator of 'ComputeHelper<std::integer_sequence<unsigned long, 0, 1, 2, 3, 4, 5, 6, 7, 8>, float, TMVA::Experimental::RReader &>' is implicitly deleted because field 'fFunc' is of reference type 'TMVA::Experimental::RReader &'
F fFunc;
^
i.e. the error seems to occur when I try to define the `computeModel` function for the second time.
I can rewrite it as follows, giving each model and each compute function a different name, since from the Python point of view they are technically just strings:
#coding=utf-8
import ROOT
from collections import namedtuple
# One classifier to apply: ``name`` labels the generated C++ objects and
# output names, ``weight_file_path`` is the TMVA weight file to load.
Classifier = namedtuple('classifier', ('name', 'weight_file_path'))
def write_bdt_response_to_file(input_file_path, tree_name, classifier_list):
    """Book one lazy Snapshot per classifier, then run all of them together.

    Each classifier gets its own pair of interpreter-side objects,
    ``model_<name>`` (a ``TMVA::Experimental::RReader`` built from the weight
    file) and ``compute_<name>`` (the functor built from that reader).  The
    functor's output is defined as the column ``response<name>`` on an
    RDataFrame over ``tree_name`` in ``input_file_path`` and booked to be
    written to the tree ``<tree_name>_<name>`` in the same file, opened in
    update mode.

    Args:
        input_file_path: ROOT file that is both read and updated.
        tree_name: name of the input tree.
        classifier_list: objects exposing ``name`` and ``weight_file_path``;
            ``name`` is embedded verbatim in the generated C++ identifiers.
    """
    lazy_results = []

    for classifier in classifier_list:
        print(classifier.name)

        # Distinct C++ identifiers per classifier, so nothing declared on an
        # earlier pass is declared or assigned again.
        reader_id = "model_{}".format(classifier.name)
        functor_id = "compute_{}".format(classifier.name)

        ROOT.gInterpreter.ProcessLine('''
TMVA::Experimental::RReader {}("{}");
'''.format(reader_id, classifier.weight_file_path))
        feature_names = getattr(ROOT, reader_id).GetVariableNames()

        # The number of input variables is a template argument of Compute.
        ROOT.gInterpreter.ProcessLine('''
{} = TMVA::Experimental::Compute<{}, float>({});
'''.format(functor_id, len(feature_names), reader_id))

        # Only the newly defined response column is written out.
        response_column = 'response' + classifier.name
        output_columns = ROOT.vector('string')()
        output_columns.push_back(response_column)

        write_options = ROOT.ROOT.RDF.RSnapshotOptions()
        write_options.fLazy = True  # booked here, executed by RunGraphs
        write_options.fOverwriteIfExists = True
        write_options.fMode = "update"

        frame = ROOT.ROOT.RDataFrame(tree_name, input_file_path)
        frame = frame.Define(
            response_column,
            getattr(ROOT, functor_id),
            feature_names,
        )
        output_tree = '{}_{}'.format(tree_name, classifier.name)
        lazy_results.append(
            frame.Snapshot(output_tree, input_file_path, output_columns,
                           write_options)
        )

    # Trigger every booked snapshot in a single call.
    ROOT.ROOT.RDF.RunGraphs(lazy_results)
def main():
    """Write the response of both configured classifiers for ``tree_PFLOW`` in ``ttV.root``."""
    # Both classifier entries point at the same weight file.
    weight_file = 'TMVAClassification_NT400Shrinkage02MNS10_1_nospec.weights.xml'
    classifiers = [
        Classifier(name='20230721_BDTscore_nj0_rdf_1', weight_file_path=weight_file),
        Classifier(name='20230721_BDTscore_nj0_rdf_2', weight_file_path=weight_file),
    ]
    write_bdt_response_to_file('ttV.root', 'tree_PFLOW', classifiers)
# Run the job only when this file is executed as a script.
if __name__ == '__main__':
    main()
It works fine, but it starts to look too convoluted and hard to understand with all the `format`s (or f-strings) and `getattr`s.
Is there a more elegant and less hacky way to get the same results from the ROOT side?
Best regards,
Aleksandr
ROOT Version: 6.26/06
Platform: Ubuntu 20.04
Compiler: Precompiled