Hello all,
I wanted to ask what the ideal way is to loop over / write out two trees that have common events between them.
In principle, the use case is to compare what changed for the same events between two reconstruction versions, but for complicated reasons the underlying data do not have the same number of events.
The workflow below works but is tremendously slow for large datasets. I am posting here to see whether there are faster alternatives to achieve the same result, maybe using RDF?
import ROOT
# Run ROOT in batch mode so that no graphics windows are opened by the script.
ROOT.gROOT.SetBatch(True)
def makeOutputTreeFromList(oldTree, eventList, outputFileName="outputFile.root"):
    """Copy the entries of ``oldTree`` selected by index value into a new file.

    Parameters
    ----------
    oldTree
        Tree or chain to read from. An index must already have been built on
        it (e.g. with ``BuildIndex``), because entries are looked up by index
        value rather than by entry number.
    eventList
        Iterable of index values (event IDs) to copy. Entries are written in
        the iteration order of this argument, so pass a sorted sequence if a
        reproducible output order is wanted.
    outputFileName
        Path of the ROOT file to create; an existing file is overwritten.

    Returns
    -------
    None. The cloned tree is written to ``outputFileName``.
    """
    with ROOT.TFile.Open(outputFileName, "RECREATE"):
        # Cloning with 0 entries copies only the branch structure; the clone
        # is created after the file is opened so it is attached to that file.
        newTree = oldTree.CloneTree(0)

        nMissing = 0
        for eventID in eventList:
            # Resolve the index value to an entry number first. A negative
            # result means the ID is not in the index; calling Fill() anyway
            # would silently duplicate whichever entry was loaded last.
            entryNumber = oldTree.GetEntryNumberWithIndex(eventID)
            if entryNumber < 0:
                nMissing += 1
                continue
            oldTree.GetEntry(entryNumber)
            newTree.Fill()

        newTree.Write()

    if nMissing:
        print(f"Warning: {nMissing} requested IDs were not found in the index and were skipped")
def makeFakeData():
    """Write two small test files, each holding a tree ``nt`` with an ``eventID`` column.

    Both datasets start from 100 generated entries whose ``eventID`` is taken
    from ``rdfentry_``:

    * ``fakeData_r0022.root`` keeps only the entries with an even ``eventID``.
    * ``fakeData_r0023.root`` keeps all entries.
    """
    # r0022: subset of the events (even IDs only).
    evenEvents = (
        ROOT.RDataFrame(100)
        .Define("eventID", "rdfentry_")
        .Filter("eventID % 2 == 0")
    )
    evenEvents.Snapshot("nt", "fakeData_r0022.root")

    # r0023: the full set of events.
    allEvents = ROOT.RDataFrame(100).Define("eventID", "rdfentry_")
    allEvents.Snapshot("nt", "fakeData_r0023.root")
def _readEventIDs(chain):
    """Return the ``eventID`` of every entry in ``chain`` as a list of ints."""
    # TTree::Draw only keeps the first `estimate` values (1,000,000 by
    # default) in the buffer returned by GetV1(). Raise the limit to cover
    # the whole chain so large datasets are not read past the buffer.
    chain.SetEstimate(chain.GetEntries() + 1)
    # Draw returns the number of selected rows, i.e. how many values are
    # valid in the GetV1() buffer (negative on error -> empty list).
    nSelected = chain.Draw("eventID", "", "goff")
    values = chain.GetV1()
    return [int(values[i]) for i in range(nSelected)]


makeFakeData()

run_no = "fakeData"
# These names must match the files written by makeFakeData()
# (fakeData_r0023.root / fakeData_r0022.root).
data_r0023_dir = f"{run_no}_r0023.root"
data_r0022_dir = f"{run_no}_r0022.root"

# One chain per reconstruction version, each indexed by eventID so that
# entries can be looked up by event ID rather than by entry number.
chain_r0023 = ROOT.TChain("nt")
chain_r0023.Add(data_r0023_dir)
chain_r0023.BuildIndex("eventID")

chain_r0022 = ROOT.TChain("nt")
chain_r0022.Add(data_r0022_dir)
chain_r0022.BuildIndex("eventID")

# Was hoping to use this to loop over them together, but it does not work if
# they have different numbers of entries.
chain_combined = ROOT.TChain("nt")
chain_combined.Add(data_r0023_dir)
chain_combined.AddFriend(chain_r0022, "r22")

nEvents_r0023 = chain_r0023.GetEntries()
nEvents_r0022 = chain_r0022.GetEntries()
nEvents_combined = chain_combined.GetEntries()
print("Entries in r0023", nEvents_r0023)
print("Entries in r0022", nEvents_r0022)
print("Entries in combined", nEvents_combined)

# Select events in r0023 that are also in r0022.
eventIDs_r23 = _readEventIDs(chain_r0023)
eventIDs_r22 = _readEventIDs(chain_r0022)

# Sort the common IDs so both output trees are written in the same,
# reproducible order instead of the arbitrary iteration order of a set.
commonEventIDs = sorted(set(eventIDs_r22).intersection(eventIDs_r23))

makeOutputTreeFromList(chain_r0023, commonEventIDs, outputFileName=f"filtered_data_{run_no}_r23.root")
makeOutputTreeFromList(chain_r0022, commonEventIDs, outputFileName=f"filtered_data_{run_no}_r22.root")