I’m trying to benchmark reading a ROOT file and filling histograms from selected branches. My benchmark results are not making sense, and I would like some expert opinion on how to do it right.
I’m creating a ROOT Tree with 24 columns (named c1,…,c24), I’m reading 4 columns (c1,c2,c3,c4) and filling corresponding histograms. The TTree is filled with random numbers (0.0 to 1.0) with 50M rows.
When using a simple iterative loop through the TTree entries, I’m able to fill the histograms in 5.37 seconds; when using the RDataFrame approach, it takes 7.2 seconds. The code is provided below. Is it expected for RDataFrame to be slower?
I also tried using RNTuple; however, filling the 4 histograms from an RNTuple took 21 seconds.
I would appreciate some help in pointing out my mistakes in the code.
using ROOT/6.32
Code for reading TTree:
void benchmark_read(const char *root_file, const char *tree_name , int nread){
TFile* file = TFile::Open(root_file);
if (!file || file->IsZombie()) {
std::cerr << "Error: Could not open file " << root_file << std::endl;
return;
}
// Get the TTree
TTree* tree = (TTree*)file->Get(tree_name);
if (!tree) {
std::cerr << "Error: TTree " << tree_name << " not found in file " << root_file << std::endl;
return;
}
TH1F* hist1 = new TH1F("hist1", "Histogram for Branch 1", 100, 0, 1);
TH1F* hist2 = new TH1F("hist2", "Histogram for Branch 2", 100, 0, 1);
TH1F* hist3 = new TH1F("hist3", "Histogram for Branch 3", 100, 0, 1);
TH1F* hist4 = new TH1F("hist4", "Histogram for Branch 4", 100, 0, 1);
tree->SetBranchStatus("*",0);
char bname[128];
float branch[128];
for(int i = 0; i < nread; i++){
std::snprintf(bname,128,"c%d",i+1);
tree->SetBranchStatus(bname,1);
tree->SetBranchAddress(bname,&branch[i]);
}
auto start_time = std::chrono::high_resolution_clock::now();
Long64_t nentries = tree->GetEntries();
for (Long64_t i = 0; i < nentries; ++i) {
tree->GetEntry(i);
hist1->Fill(branch[k]);
hist2->Fill(branch[1]);
hist3->Fill(branch[2]);
hist4->Fill(branch[3]);
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
std::cout << "Execution time: " << duration.count() << " ms" << std::endl;
}
Code for plotting using RDataFrame:
void benchmark(){
std::string file_name = "../benchmarks/output.root"; // Replace with your ROOT file name
std::string tree_name = "Tree"; // Replace with your TTree name
// Initialize the RDataFrame
try {
ROOT::RDataFrame df(tree_name, file_name);
// Define the variables for which you want to create histograms
std::vector<std::string> variables_to_plot = {"c1", "c2", "c3", "c4"}; // Replace with actual variable names
// Create histograms for each variable
std::vector<ROOT::RDF::RResultPtr<TH1D>> histograms;
for (const auto &var : variables_to_plot) {
histograms.push_back(df.Histo1D(var));
}
// Create a canvas to draw the histograms
TCanvas canvas("canvas", "Histograms", 800, 600);
canvas.Divide(2, 2); // Divide the canvas into a 2x2 grid
// Draw histograms on the canvas
auto start_time = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < histograms.size(); ++i) {
canvas.cd(i + 1); // Move to the appropriate pad
histograms[i]->Draw();
//histograms[i]->Fill();
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
std::cout << "Histograms saved as histograms.png" << std::endl;
std::cout << "Execution time: " << duration.count() << " ms" << std::endl;
// Save the canvas to a file
canvas.SaveAs("histograms.pdf");
} catch (const std::exception &e) {
std::cerr << "Error initializing RDataFrame or processing histograms: " << e.what() << std::endl;
}
//return 0;
}
Code for plotting from RNTuple:
// Benchmark fragment: fill four histograms from the float fields c1..c4 of an
// RNTuple and measure the wall-clock time of the fill loop.

// Path to the RNTuple ROOT file and the RNTuple name
const std::string fileName = "output.root";
const std::string ntupleName = "TTree";
// Open the RNTuple file for reading
auto ntuple = ROOT::Experimental::RNTupleReader::Open(ntupleName, fileName);
//auto ntuple = RNTupleReader::Open(ntupleName, fileName);
// NOTE(review): with the early return commented out, a null reader is only
// reported here and is still dereferenced below — confirm how this fragment
// is meant to bail out.
if (!ntuple) {
std::cerr << "Error: Unable to open RNtuple file or RNtuple not found!" << std::endl;
//return 1;
}
std::cout << "Successfully opened RNtuple: " << ntupleName << " from file: " << fileName << std::endl;
// Column to plot
const std::string columnToPlot = "c1";
// Check if the column exists
//if (!ntuple->GetDescriptor().HasField(columnToPlot)) {
// std::cerr << "Error: Column '" << columnToPlot << "' not found in the RNtuple!" << std::endl;
//return 1;
//}
// Create one histogram per column, all with the same binning
int nBins = 100; // Number of bins
double minValue = 0.0; // Minimum value for the histogram
double maxValue = 1.0; // Maximum value for the histogram
TH1F hc1("hist_1", "histo c1", nBins, minValue, maxValue);
TH1F hc2("hist_2", "histo c2", nBins, minValue, maxValue);
TH1F hc3("hist_3", "histo c3", nBins, minValue, maxValue);
TH1F hc4("hist_4", "histo c4", nBins, minValue, maxValue);
// Views give per-field access, so only these four fields need to be read
auto fc1 = ntuple->GetView<float>("c1");
auto fc2 = ntuple->GetView<float>("c2");
auto fc3 = ntuple->GetView<float>("c3");
auto fc4 = ntuple->GetView<float>("c4");
auto start_time = std::chrono::high_resolution_clock::now();
for (auto entryId : ntuple->GetEntryRange()) {
    // A view is indexed by entry number. The previous fcN(false) converted
    // `false` to index 0 and therefore read entry 0 on every iteration.
    // LoadEntry() is dropped: it populated the reader's default entry, which
    // the views never consult.
    hc1.Fill(fc1(entryId));
    hc2.Fill(fc2(entryId));
    hc3.Fill(fc3(entryId));
    hc4.Fill(fc4(entryId));
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);