Hi! I am noticing an issue with with DefinePerSample() when trying to define weights for samples in an RDataFrame made from combining multiple root trees. I have defined two functions: pt_distribution and width_fitting. These two should be totally independent functions; however, when I run width_fitting first and then pt_distribution, I get a different result than if I did it the other way around.
I’m pretty sure it has something to do with the DefinePerSample() step. Has anyone had any experience dealing with this? I am attaching my code below:
Relevant functions:
def weight_maker(df, directories, **kwargs):
cs = kwargs.get('cross_sections', None)
ed = kwargs.get('entries_dict', None)
luminosity = kwargs.get('lum',None)
counter = 0
print(cs)
if directories[0] == "qcdMC_":
weight_fxn = "float weight_val = 1.0;\n"
for name, weight in cs.items():
counter += 1
entries_value = 0
for i in list(ed.keys()):
if name in str(i):
entries_value = ed[i]
if entries_value == 0:
print("numEntries error!")
if counter == 1:
weight_fxn += 'if (rdfsampleinfo_.Contains("{}")) weight_val = {};\n'.format(name, weight*10e3*lum/entries_value)
else:
weight_fxn += 'else if (rdfsampleinfo_.Contains("{}")) weight_val = {};\n'.format(name, weight*10e3*lum/entries_value)
weight_fxn += 'return weight_val;'
print(weight_fxn)
df = df.DefinePerSample("weightbysample", weight_fxn)
return df
else:
df = df.Define("weightbysample", "1.0f")
return df
def width_fitting(df, bin_edges, fit_min, fit_max, directories, label, SAVE):
#To fit the data with Gaussians
fit_func1 = r.TF1("fit_func1", "gaus", fit_min, fit_max)
fit_func2 = r.TF1("fit_func2", "gaus", fit_min, fit_max)
sum_bin_edges = []
for j in np.arange(len(bin_edges)-1):
sum_bin_edges.append(sum(bin_edges[:j+1])+(bin_edges[j+1]-bin_edges[j])/2)
#To store the sigmas from the fits
sigmas1 = []
sigmaErrors1 = []
sigmas2 = []
sigmaErrors2 = []
means1 = []
meanErrors1 = []
means2 = []
meanErrors2 = []
slice_histograms1 = {}
slice_histograms2 = {}
for i in np.arange(num_slices):
slice = i
slice_width = bin_edges[i+1] - bin_edges[i]
slice_min = bin_edges[i]
slice_max = bin_edges[i+1]
slice_cut1 = x_var1+" >= " + str(slice_min) + " && "+x_var1+" < " + str(slice_max)
slice_cut2 = x_var2+" >= " + str(slice_min) + " && "+x_var2+" < " + str(slice_max)
slice_histo1 = df.Define("caloJet_weightedTimePtCap_good", "caloJet_weightedTimePtCap[" + standard_cuts + " && " + slice_cut1 + "]").Histo1D((y_var,";"+y_var, 100, y_min, y_max), y_var, "weightbysample")
slice_histo2 = df.Define("caloJet_weightedTimePtCap_good", "caloJet_weightedTimePtCap[" + standard_cuts + " && " + slice_cut2 + "]").Histo1D((y_var,";"+y_var, 100, y_min, y_max), y_var, "weightbysample")
slice_histo1.Fit(fit_func1,"R")
sigmas1.append(fit_func1.GetParameter(2))
sigmaErrors1.append(fit_func1.GetParError(2))
slice_histo2.Fit(fit_func2,"R")
sigmas2.append(fit_func2.GetParameter(2))
sigmaErrors2.append(fit_func2.GetParError(2))
#slice_histo1.Fit(fit_func1,"R")
means1.append(fit_func1.GetParameter(1))
meanErrors1.append(fit_func1.GetParError(1))
#slice_histo2.Fit(fit_func2,"R")
means2.append(fit_func2.GetParameter(1))
meanErrors2.append(fit_func2.GetParError(1))
oC1 = r.TCanvas()
oC1.SetLogy()
slice_histo1.SetTitle("Pt Start: " + str(int(slice_min)) + " Pt Slice Width: " + str(int(slice_width)))
slice_histo1.Draw("HIST")
fit_func1.Draw("same")
oC1.SaveAs(directories[1]+"/"+str(label)+str(i)+".png")
oC2 = r.TCanvas()
oC2.SetLogy()
slice_histo2.SetTitle("EM Start: " + str(int(slice_min)) + " EM Slice Width: " + str(int(slice_width)))
slice_histo2.Draw("HIST")
fit_func2.Draw("same")
oC2.SaveAs(directories[2]+"/"+str(label)+str(i)+".png")
slice_histograms1[label+str(i)] = slice_histo1
slice_histograms2[label+str(i)] = slice_histo2
if SAVE == True:
plt.figure()
plt.errorbar(sum_bin_edges, sigmas1, yerr=sigmaErrors1, fmt='.', label='Pt Slices', markersize=8)
plt.title("caloJet_weightedTimePtCap")
plt.xlabel("Kinematics [GeV]")
plt.ylabel("Sigma from Gaussian Fit [ns]")
plt.errorbar(sum_bin_edges, sigmas2, yerr=sigmaErrors2, fmt='.', label='EM Slices', markersize=8)
plt.legend(fontsize=12, frameon=False)
plt.savefig(directories[0]+'/caloJet_weightedTimePtCap_in_Pt_and_EM_Slices_'+directories[0]+'.png')
plt.show()
plt.figure()
plt.errorbar(sum_bin_edges, means1, yerr=meanErrors1, fmt='.', label='Pt Slices', markersize=8)
plt.title("caloJet_weightedTimePtCap")
plt.xlabel("Kinematics [GeV]")
plt.ylabel("Means from Gaussian Fit [ns]")
plt.errorbar(sum_bin_edges, means2, yerr=meanErrors2, fmt='.', label='EM Slices', markersize=8)
plt.legend(fontsize=12, frameon=False)
plt.savefig(directories[0]+'/caloJet_weightedTimePtCap_in_Pt_and_EM_Slices_'+directories[0]+'_MEANS.png')
plt.show()
return sigmas1, sigmaErrors1, sigmas2, sigmaErrors2, slice_histograms1, slice_histograms2
def pt_hist_generator(df, directories):
slice_histo10 = df.Define("caloJet_pt_good", "caloJet_pt[" + standard_cuts + "]").Histo1D(("caloJet_pt_good",";caloJet_pt_good", 100, 0, 3200), "caloJet_pt_good","weightbysample")
oC10 = r.TCanvas()
slice_histo10.SetTitle("pt distribution check")
oC10.SetLogy()
slice_histo10.Draw("HIST")
oC10.SaveAs(directories[0]+"/pt_distribution.png")
return slice_histo10
if __name__ == "__main__":
directory_maker(directories)
if directories[0] == "qcdMC":
lum = 1
file_counter = 0
cross_sections = {"QCD_PT-470to600": 626.4, "QCD_PT-1000to1400": 8.92, "QCD_PT-120to170": 445800.0, "QCD_PT-1400to1800": 0.8103, "QCD_PT-170to300": 113700.0, "QCD_PT-1800to2400": 0.1148, "QCD_PT-2400to3200": 0.007542, "QCD_PT-3200": 0.0002331, "QCD_PT-600to800": 178.6, "QCD_PT-800to1000": 30.57}
file_list = []
file_location_list = []
entries_dict = {}
for file in os.listdir(full_directory):
file_location_list.append(full_directory+"/"+str(file))
file_list.append(str(file))
ne_df, ne_tree, ne_runLeaf, ne_run_min, ne_run_max, nEntries = data_retrieval(full_directory+"/"+str(file))
entries_dict[file] = nEntries
combined_rdf = r.RDataFrame("timeTree",file_location_list)
combined_rdf = weight_maker(combined_rdf, directories, cross_sections=cross_sections, entries_dict=entries_dict, lum=lum)
width_fitting(combined_rdf, bin_edges, -2, 2, directories, directories[0],True)
pt_hist_generator(combined_rdf, directories)
#width_fitting(combined_rdf, bin_edges, -2, 2, directories, directories[0],True)
_ROOT Version: 6.26/11
Platform: Not Provided
Compiler: Not Provided