Dear experts,
I would like to ask for your feedback to understand how to build an efficient code with RDataFrames, as so far the performance I get on a simple example is O(10)x slower than a code with a TTree loop that implements the same functionality of histogram filling.
My use case is something I believe to be rather standard: plotting the distributions of several branches in a Tree under a few selections (example: control plots in different analysis regions).
Ideally this should be something flexible where variables, weights and selections are easily configurable (e.g. from a cfg file), so RDataFrames look very suitable for this and can use the python interface.
To create a “toy example”, I build an artificial dataset containing a Tree of 100k entries and 100 branches (x_0, … x_99). The code in case you want to reproduce it is below [1].
I use these data to fill a plot of each variable, with a weight given by x_0 * x_1, under 3 selections:
- all events
- x_99 < 0
- x_99 < 0 && x_98 < 0 && x_97 < 0
A minimal code that implements this it in python with a RDataFrame is below [2], and on my machine it takes approximately 33s to run. I compared it against an equivalent compiled C++ version, but performance is roughly the same (as expected, since the same backend code is called).
If I implement the same functionality with a loop on a TTree as in example [3], however, the execution of the code takes only 3 to 4 seconds, so it’s about 10 times faster.
Is this difference in the performance expected, or is there something that is slowing down my RDataFrame-based code that can be fixed?
Of course multithreading can help, although it looks like I may need to recompile ROOT with the support enabled, as I am using the precompiled version and enabling MT has no effect.
But in any case, I would need to run this code in parallel over different datasets, so that a “trivial” parallelisation of one execution for one dataset in a single thread may be the best thing to do anyway.
Thanks in advance for your feedback!
Cheers,
Luca
[1]
// root -l generate_dataset()
void generate_dataset()
{
TFile* fOut = new TFile("dataset.root", "recreate");
// TFile* fOut = new TFile("dataset_100k.root", "recreate");
TTree* tOut = new TTree("tree", "tree");
TRandom3* rndm = new TRandom3(0);
const uint nEntries = 10000000; // number of entries to generate
// const uint nEntries = 100000; // number of entries to generate
const int nbranches = 100;
std::vector<double> vals(nbranches); // for setbranchaddress
cout << "... will generate " << nbranches << " branches" << endl;
cout << "... will generate " << nEntries << " entries" << endl;
for (uint ib = 0; ib < nbranches; ++ib){
tOut->Branch(Form("x_%i", ib), &vals.at(ib));
}
for (uint ie = 0; ie < nEntries; ++ie){
if (ie % 100000 == 0) cout << ie << " / " << nEntries << endl;
for (uint ib = 0; ib < nbranches; ++ib){
vals.at(ib) = rndm->Gaus(0, 1);
}
tOut->Fill();
}
tOut->Write();
}
[2]
# python -u rootDataFrame_plots.py
import ROOT
ROOT.gROOT.SetBatch(True)
# ROOT.EnableImplicitMT(4)
# dtset_name = 'dataset.root'
dtset_name = 'dataset_100k.root'
print('... opening file', dtset_name)
d = ROOT.RDataFrame("tree", dtset_name)
print('... defining weights')
d = d.Define('col_w', 'x_0 * x_1')
print('... applying selections')
cut1 = "(x_99 < 0)"
cut2 = "(x_99 < 0 && x_98 < 0 && x_97 < 0)"
d_incl = d
d_cut1 = d.Filter(cut1)
d_cut2 = d.Filter(cut2)
print('... creating branchlist')
branchlist = ['x_{}'.format(i) for i in range (100)]
print('... making plots')
c1 = ROOT.TCanvas('c1', 'c1')
for ib in range(len(branchlist)):
print('... doing', branchlist[ib])
h_incl = d_incl.Histo1D( ("h_{}_incl".format(ib), "h_{}_incl".format(ib), 100, -3, 3) , 'x_{}'.format(ib), 'col_w')
h_cut1 = d_cut1.Histo1D( ("h_{}_cut1".format(ib), "h_{}_cut1".format(ib), 100, -3, 3) , 'x_{}'.format(ib), 'col_w')
h_cut2 = d_cut2.Histo1D( ("h_{}_cut2".format(ib), "h_{}_cut2".format(ib), 100, -3, 3) , 'x_{}'.format(ib), 'col_w')
h_incl.Draw('hist')
c1.Print('rdfplots/h_%i_incl.pdf' % ib)
h_cut1.Draw('hist')
c1.Print('rdfplots/h_%i_cut1.pdf' % ib)
h_cut2.Draw('hist')
c1.Print('rdfplots/h_%i_cut2.pdf' % ib)
[3]
// c++ -lm -o rootTreeLoop_plots rootTreeLoop_plots.cpp `root-config --glibs --cflags`
#include "TString.h"
#include "TH1D.h"
#include "TTree.h"
#include "TFile.h"
#include "TTreeFormula.h"
#include "TCanvas.h"
#include <iostream>
#include <vector>
using namespace std;
int main() {
TString file_in = "dataset_100k.root";
// TString file_in = "dataset.root";
cout << "... opening file: " << file_in << endl;
TFile* fIn = TFile::Open(file_in);
TTree* tIn = (TTree*) fIn->Get("tree");
cout << "Making branch list" << endl;
std::vector<TString> branchList;
for (uint i = 0; i < 100; ++i)
branchList.push_back(Form("x_%i", i));
cout << "Setting branch address" << endl;
std::vector<double> vals(branchList.size());
for (uint ib = 0; ib < branchList.size(); ++ib){
tIn->SetBranchAddress(Form("x_%i", ib), &vals.at(ib));
}
cout << "Making histograms" << endl;
std::vector<TH1D*> histos_incl;
std::vector<TH1D*> histos_cut1;
std::vector<TH1D*> histos_cut2;
for (uint ib = 0; ib < branchList.size(); ++ib){
TH1D* h_incl = new TH1D(Form("h_%s_incl", branchList.at(ib).Data()), Form("h_%s_incl", branchList.at(ib).Data()), 100, -3, 3);
TH1D* h_cut1 = new TH1D(Form("h_%s_cut1", branchList.at(ib).Data()), Form("h_%s_cut1", branchList.at(ib).Data()), 100, -3, 3);
TH1D* h_cut2 = new TH1D(Form("h_%s_cut2", branchList.at(ib).Data()), Form("h_%s_cut2", branchList.at(ib).Data()), 100, -3, 3);
histos_incl.push_back(h_incl);
histos_cut1.push_back(h_cut1);
histos_cut2.push_back(h_cut2);
}
cout << "Making formulas" << endl;
TTreeFormula* TTF_cut1 = new TTreeFormula ("TTF_cut1", "(x_99 < 0)", tIn);
TTreeFormula* TTF_cut2 = new TTreeFormula ("TTF_cut2", "(x_99 < 0 && x_98 < 0 && x_97 < 0)", tIn);
cout << "Making plots" << endl;
const uint nEvts = tIn->GetEntries();
for (uint iEv = 0; iEv < nEvts; ++iEv) {
if (iEv % 10000 == 0)
cout << iEv << " / " << nEvts << endl;
tIn->GetEntry(iEv);
bool pass_cut1 = TTF_cut1->EvalInstance();
bool pass_cut2 = TTF_cut2->EvalInstance();
for (uint ib = 0; ib < branchList.size(); ++ib){
histos_incl.at(ib)->Fill(vals.at(ib), vals.at(0)*vals.at(1));
if(pass_cut1) histos_cut1.at(ib)->Fill(vals.at(ib), vals.at(0)*vals.at(1));
if(pass_cut2) histos_cut2.at(ib)->Fill(vals.at(ib), vals.at(0)*vals.at(1));
}
}
cout << "Saving pdf of plots" << endl;
TCanvas* c1 = new TCanvas("c1", "c1");
for (uint ib = 0; ib < branchList.size(); ++ib){
histos_incl.at(ib)->Draw("hist");
c1->Print(Form("treeloopplots/h_%s_incl.pdf", branchList.at(ib).Data()));
histos_cut1.at(ib)->Draw("hist");
c1->Print(Form("treeloopplots/h_%s_cut1.pdf", branchList.at(ib).Data()));
histos_cut2.at(ib)->Draw("hist");
c1->Print(Form("treeloopplots/h_%s_cut2.pdf", branchList.at(ib).Data()));
}
}
Please read tips for efficient and successful posting and posting code
ROOT Version: 6.22/06
Platform: macOS, 10.15.7
Compiler: Apple clang version 12.0.0