Issue with multiple executions of DefinePerSample() in pyROOT

steenisj · June 7, 2023, 7:44pm

Hi! I am noticing an issue with with DefinePerSample() when trying to define weights for samples in an RDataFrame made from combining multiple root trees. I have defined two functions: pt_distribution and width_fitting. These two should be totally independent functions; however, when I run width_fitting first and then pt_distribution, I get a different result than if I did it the other way around.

I’m pretty sure it has something to do with the DefinePerSample() step. Has anyone had any experience dealing with this? I am attaching my code below:

Relevant functions:

def weight_maker(df, directories, **kwargs):
    cs = kwargs.get('cross_sections', None)
    ed = kwargs.get('entries_dict', None)
    luminosity = kwargs.get('lum',None)
    counter = 0

    print(cs)

    if directories[0] == "qcdMC_":
        weight_fxn = "float weight_val = 1.0;\n"
        for name, weight in cs.items():
            counter += 1
            entries_value = 0
            for i in list(ed.keys()):
                if name in str(i):
                    entries_value = ed[i]
            if entries_value == 0:
                print("numEntries error!")
            if counter == 1:
                weight_fxn += 'if (rdfsampleinfo_.Contains("{}")) weight_val = {};\n'.format(name, weight*10e3*lum/entries_value)
            else:
                weight_fxn += 'else if (rdfsampleinfo_.Contains("{}")) weight_val = {};\n'.format(name, weight*10e3*lum/entries_value)
        weight_fxn += 'return weight_val;'
        print(weight_fxn)
        df = df.DefinePerSample("weightbysample", weight_fxn)
        return df

    else:
        df = df.Define("weightbysample", "1.0f")
        return df

def width_fitting(df, bin_edges, fit_min, fit_max, directories, label, SAVE):
    #To fit the data with Gaussians
    fit_func1 = r.TF1("fit_func1", "gaus", fit_min, fit_max)
    fit_func2 = r.TF1("fit_func2", "gaus", fit_min, fit_max)
    sum_bin_edges = []
    for j in np.arange(len(bin_edges)-1):
        sum_bin_edges.append(sum(bin_edges[:j+1])+(bin_edges[j+1]-bin_edges[j])/2)
    
    #To store the sigmas from the fits
    sigmas1 = []
    sigmaErrors1 = []
    sigmas2 = []
    sigmaErrors2 = []

    means1 = []
    meanErrors1 = []
    means2 = []
    meanErrors2 = []

    slice_histograms1 = {} 
    slice_histograms2 = {}

    for i in np.arange(num_slices):
        slice = i
        slice_width = bin_edges[i+1] - bin_edges[i]
        slice_min = bin_edges[i]
        slice_max = bin_edges[i+1]
        slice_cut1 = x_var1+" >= " + str(slice_min) + " && "+x_var1+" < " + str(slice_max)
        slice_cut2 = x_var2+" >= " + str(slice_min) + " && "+x_var2+" < " + str(slice_max)

        slice_histo1 = df.Define("caloJet_weightedTimePtCap_good", "caloJet_weightedTimePtCap[" + standard_cuts + " && " + slice_cut1 + "]").Histo1D((y_var,";"+y_var, 100, y_min, y_max), y_var, "weightbysample")
        slice_histo2 = df.Define("caloJet_weightedTimePtCap_good", "caloJet_weightedTimePtCap[" + standard_cuts + " && " + slice_cut2 + "]").Histo1D((y_var,";"+y_var, 100, y_min, y_max), y_var, "weightbysample")

        slice_histo1.Fit(fit_func1,"R")
        sigmas1.append(fit_func1.GetParameter(2))
        sigmaErrors1.append(fit_func1.GetParError(2))

        slice_histo2.Fit(fit_func2,"R")
        sigmas2.append(fit_func2.GetParameter(2))
        sigmaErrors2.append(fit_func2.GetParError(2))

        #slice_histo1.Fit(fit_func1,"R")
        means1.append(fit_func1.GetParameter(1))
        meanErrors1.append(fit_func1.GetParError(1))

        #slice_histo2.Fit(fit_func2,"R")
        means2.append(fit_func2.GetParameter(1))
        meanErrors2.append(fit_func2.GetParError(1))

        oC1 = r.TCanvas()
        oC1.SetLogy()
        slice_histo1.SetTitle("Pt Start: " + str(int(slice_min)) + " Pt Slice Width: " + str(int(slice_width)))
        slice_histo1.Draw("HIST")
        fit_func1.Draw("same")
        oC1.SaveAs(directories[1]+"/"+str(label)+str(i)+".png")

        oC2 = r.TCanvas()
        oC2.SetLogy()
        slice_histo2.SetTitle("EM Start: " + str(int(slice_min)) + " EM Slice Width: " + str(int(slice_width)))
        slice_histo2.Draw("HIST")
        fit_func2.Draw("same")
        oC2.SaveAs(directories[2]+"/"+str(label)+str(i)+".png")

        slice_histograms1[label+str(i)] = slice_histo1
        slice_histograms2[label+str(i)] = slice_histo2

    if SAVE == True:
        plt.figure()
        plt.errorbar(sum_bin_edges, sigmas1, yerr=sigmaErrors1, fmt='.', label='Pt Slices', markersize=8)
        plt.title("caloJet_weightedTimePtCap")
        plt.xlabel("Kinematics [GeV]")
        plt.ylabel("Sigma from Gaussian Fit [ns]")
        plt.errorbar(sum_bin_edges, sigmas2, yerr=sigmaErrors2, fmt='.', label='EM Slices', markersize=8)

        plt.legend(fontsize=12, frameon=False)
        plt.savefig(directories[0]+'/caloJet_weightedTimePtCap_in_Pt_and_EM_Slices_'+directories[0]+'.png')
        plt.show()

        plt.figure()
        plt.errorbar(sum_bin_edges, means1, yerr=meanErrors1, fmt='.', label='Pt Slices', markersize=8)
        plt.title("caloJet_weightedTimePtCap")
        plt.xlabel("Kinematics [GeV]")
        plt.ylabel("Means from Gaussian Fit [ns]")
        plt.errorbar(sum_bin_edges, means2, yerr=meanErrors2, fmt='.', label='EM Slices', markersize=8)

        plt.legend(fontsize=12, frameon=False)
        plt.savefig(directories[0]+'/caloJet_weightedTimePtCap_in_Pt_and_EM_Slices_'+directories[0]+'_MEANS.png')
        plt.show()

    return sigmas1, sigmaErrors1, sigmas2, sigmaErrors2, slice_histograms1, slice_histograms2

def pt_hist_generator(df, directories):
    slice_histo10 = df.Define("caloJet_pt_good", "caloJet_pt[" + standard_cuts + "]").Histo1D(("caloJet_pt_good",";caloJet_pt_good", 100, 0, 3200), "caloJet_pt_good","weightbysample") 
    oC10 = r.TCanvas()
    slice_histo10.SetTitle("pt distribution check")
    oC10.SetLogy()
    slice_histo10.Draw("HIST")
    oC10.SaveAs(directories[0]+"/pt_distribution.png")
    return slice_histo10

if __name__ == "__main__":
    directory_maker(directories)

    if directories[0] == "qcdMC":
        lum = 1
        file_counter = 0
        cross_sections = {"QCD_PT-470to600": 626.4, "QCD_PT-1000to1400": 8.92, "QCD_PT-120to170": 445800.0, "QCD_PT-1400to1800": 0.8103, "QCD_PT-170to300": 113700.0, "QCD_PT-1800to2400": 0.1148, "QCD_PT-2400to3200": 0.007542, "QCD_PT-3200": 0.0002331, "QCD_PT-600to800": 178.6, "QCD_PT-800to1000": 30.57}

        file_list = []
        file_location_list = [] 
        entries_dict = {}
 
        for file in os.listdir(full_directory):
            file_location_list.append(full_directory+"/"+str(file))
            file_list.append(str(file))
            ne_df, ne_tree, ne_runLeaf, ne_run_min, ne_run_max, nEntries = data_retrieval(full_directory+"/"+str(file)) 
            entries_dict[file] = nEntries

        combined_rdf = r.RDataFrame("timeTree",file_location_list)
        combined_rdf = weight_maker(combined_rdf, directories, cross_sections=cross_sections, entries_dict=entries_dict, lum=lum)
        width_fitting(combined_rdf, bin_edges, -2, 2, directories, directories[0],True)
        pt_hist_generator(combined_rdf, directories)
        #width_fitting(combined_rdf, bin_edges, -2, 2, directories, directories[0],True)

_ROOT Version: 6.26/11
Platform: Not Provided
Compiler: Not Provided

steenisj · June 7, 2023, 7:57pm

I should add that I am using pyROOT.

vpadulan · June 7, 2023, 8:54pm

Dear @steenisj ,

Thank you for your report. There is a chance you might be incurring in Wrong interaction of DefinePerSample with multiple executions · Issue #12043 · root-project/root · GitHub. To test this, could you try to write your application in a way that all the calls to the RDataFrame API that do not trigger execution (Define, Filter) happen before you start drawing any histogram or retrieving any result?

Practically, this would mean copying the code inside of pt_hist_generator into width_fitting, making sure that the order of operations is such that you trigger execution only at the very end, so that the DefinePerSample call is booked together with all other Define and Filter calls. By looking at the code of pt_hist_generator, the only line that doesn’t trigger execution is

slice_histo10 = df.Define("caloJet_pt_good", "caloJet_pt[" + standard_cuts + "]").Histo1D(("caloJet_pt_good",";caloJet_pt_good", 100, 0, 3200), "caloJet_pt_good","weightbysample")

The rest should be left at the end of the “merged” function.

Then, you can play with writing this extra Define call before other calls that happen in the width_fitting function.

Let me know if you manage to try with this modification. If that works, it would give more credit to my suspicion.

Cheers,
Vincenzo

steenisj · June 7, 2023, 10:07pm

Is this what you mean? I have put the pt_hist_generator within width_fitting

def width_fitting(df, bin_edges, fit_min, fit_max, directories, label, SAVE):
    #To fit the data with Gaussians
    fit_func1 = r.TF1("fit_func1", "gaus", fit_min, fit_max)
    fit_func2 = r.TF1("fit_func2", "gaus", fit_min, fit_max)
    sum_bin_edges = []
    for j in np.arange(len(bin_edges)-1):
        sum_bin_edges.append(sum(bin_edges[:j+1])+(bin_edges[j+1]-bin_edges[j])/2)

    #To store the sigmas from the fits
    sigmas1 = []
    sigmaErrors1 = []
    sigmas2 = []
    sigmaErrors2 = []

    means1 = []
    meanErrors1 = []
    means2 = []
    meanErrors2 = []

    slice_histograms1 = {}
    slice_histograms2 = {}

    slice_histo10 = df.Define("caloJet_pt_good", "caloJet_pt[" + standard_cuts + "]").Histo1D(("caloJet_pt_good",";caloJet_pt_good", 100, 0, 3200), "caloJet_pt_good","weightbysample")

    for i in np.arange(num_slices):
        slice = i
        slice_width = bin_edges[i+1] - bin_edges[i]
        slice_min = bin_edges[i]
        slice_max = bin_edges[i+1]
        slice_cut1 = x_var1+" >= " + str(slice_min) + " && "+x_var1+" < " + str(slice_max)
        slice_cut2 = x_var2+" >= " + str(slice_min) + " && "+x_var2+" < " + str(slice_max)

        slice_histo1 = df.Define("caloJet_weightedTimePtCap_good", "caloJet_weightedTimePtCap[" + standard_cuts + " && " + slice_cut1 + "]").Histo1D((y_var,";"+y_var, 100, y_min, y_max), y_var, "weightbysample")
        slice_histo2 = df.Define("caloJet_weightedTimePtCap_good", "caloJet_weightedTimePtCap[" + standard_cuts + " && " + slice_cut2 + "]").Histo1D((y_var,";"+y_var, 100, y_min, y_max), y_var, "weightbysample")

        slice_histo1.Fit(fit_func1,"R")
        sigmas1.append(fit_func1.GetParameter(2))
        sigmaErrors1.append(fit_func1.GetParError(2))

        slice_histo2.Fit(fit_func2,"R")
        sigmas2.append(fit_func2.GetParameter(2))
        sigmaErrors2.append(fit_func2.GetParError(2))

        #slice_histo1.Fit(fit_func1,"R")
        means1.append(fit_func1.GetParameter(1))
        meanErrors1.append(fit_func1.GetParError(1))

        #slice_histo2.Fit(fit_func2,"R")
        means2.append(fit_func2.GetParameter(1))
        meanErrors2.append(fit_func2.GetParError(1))

        oC1 = r.TCanvas()
        oC1.SetLogy()
        slice_histo1.SetTitle("Pt Start: " + str(int(slice_min)) + " Pt Slice Width: " + str(int(slice_width)))
        slice_histo1.Draw("HIST")
        fit_func1.Draw("same")
        oC1.SaveAs(directories[1]+"/"+str(label)+str(i)+".png")

        oC2 = r.TCanvas()
        oC2.SetLogy()
        slice_histo2.SetTitle("EM Start: " + str(int(slice_min)) + " EM Slice Width: " + str(int(slice_width)))
        slice_histo2.Draw("HIST")
        fit_func2.Draw("same")
        oC2.SaveAs(directories[2]+"/"+str(label)+str(i)+".png")

        slice_histograms1[label+str(i)] = slice_histo1
        slice_histograms2[label+str(i)] = slice_histo2

    oC10 = r.TCanvas()
    slice_histo10.SetTitle("pt distribution check")
    oC10.SetLogy()
    slice_histo10.Draw("HIST")
    oC10.SaveAs(directories[0]+"/pt_distribution.png")

    if SAVE == True:
        plt.figure()
        plt.errorbar(sum_bin_edges, sigmas1, yerr=sigmaErrors1, fmt='.', label='Pt Slices', markersize=8)
        plt.title("caloJet_weightedTimePtCap")
        plt.xlabel("Kinematics [GeV]")
        plt.ylabel("Sigma from Gaussian Fit [ns]")
        plt.errorbar(sum_bin_edges, sigmas2, yerr=sigmaErrors2, fmt='.', label='EM Slices', markersize=8)

        plt.legend(fontsize=12, frameon=False)
        plt.savefig(directories[0]+'/caloJet_weightedTimePtCap_in_Pt_and_EM_Slices_'+directories[0]+'.png')
        plt.show()

        plt.figure()
        plt.errorbar(sum_bin_edges, means1, yerr=meanErrors1, fmt='.', label='Pt Slices', markersize=8)
        plt.title("caloJet_weightedTimePtCap")
        plt.xlabel("Kinematics [GeV]")
        plt.ylabel("Means from Gaussian Fit [ns]")
        plt.errorbar(sum_bin_edges, means2, yerr=meanErrors2, fmt='.', label='EM Slices', markersize=8)

        plt.legend(fontsize=12, frameon=False)
        plt.savefig(directories[0]+'/caloJet_weightedTimePtCap_in_Pt_and_EM_Slices_'+directories[0]+'_MEANS.png')
        plt.show()

    return sigmas1, sigmaErrors1, sigmas2, sigmaErrors2, slice_histograms1, slice_histograms2

steenisj · June 7, 2023, 10:07pm

(I still get the wrong pt_distribution plot when I run it this way.)

eguiraud · June 16, 2023, 2:43pm

Let’s ping @vpadulan

vpadulan · June 19, 2023, 2:24pm

Yes that’s what I meant. So at this point it seems you don’t incur in the existing bug I linked to (on the surface at least). Would you be able to share with me (even in private) some of your data so that I can run your snippet and try to reproduce the issue myself?

Cheers,
Vincenzo

system · July 3, 2023, 2:24pm

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.