Simultaneous fit and upper limit with constraints

sebmnt · March 7, 2023, 11:47am

Dear Experts,

I am performing a simultaneous fit in 3 fit categories, to determine the upper limit on two signal branching fractions. Fit categories 1 and 2 contain a background component, whose Yield is expressed as follows:

N_BKG(cat) = Expexted_BKG(cat)/Expexted_BKG_total *N_BKG_total,

where N_BKG_total is a shared parameter between Fit categories 1 and 2.
Furthermore, N_BKG_total is a Gauss constraint.

For the limit calculation I use the AsymptoticCalculator, and to take the Gauss constraint into account I basically followed the discussion from this post:
root-forum.cern.ch/t/upper-limit-calculation/34559

However, since the constrained component is not present in one of the Fit categories, I am unsure if my implementation makes sense:

#Signal Yields 
   ws.factory(f'expr::N_sig1{label_fitcat}("(BF_sig1/BF_Normalisation)*Normalisation_Yield*(eps_sig1{label_fitcat}/eps_Normalisation)",BF_sig1,BF_Normalisation,  Normalisation_Yield,eps_sig1{label_fitcat},eps_Normalisation)')
   ws.factory(f'expr::N_sig2{label_fitcat}("(BF_sig2/BF_Normalisation)*Normalisation_Yield*(eps_sig2{label_fitcat}/eps_Normalisation)*(fs_fu)",BF_sig2,BF_Normalisation, Normalisation_Yield,eps_sig2{label_fitcat},eps_Normalisation,fs_fu)')


#Phys BKG Yield
    if fit_category_name =="Fit_category_1" or fit_category_name =="Fit_category_2":
        Total_expexted_Bkg = 2.4
        Expexted_Yield_Bkg= fit_category_dicc["DB_parameters"]["Expexted_Bkg"]
        WS(ws,r.RooRealVar(f"Scale_Bkg_{label_fitcat}",  f"Scale_Bkg_{label_fitcat}",Expexted_Yield_Bkg/Total_expexted_Bkg))
        ws.factory(f'expr::N_Bkg_{label_fitcat}("(Scale_Bkg_{label_fitcat})*N_Bkg_total",Scale_Bkg_{label_fitcat},N_Bkg_total)') 


#####################################################################  
#Constrains on BKG Yield
ws.factory(f"Gaussian::Constrain_BKG(Bkg_global[2.4,0,30],N_Bkg_total ,sigma_const[1.7])")
ws.var("Bkg_global").setConstant(True)
ws.defineSet("Global_Obs","Bkg_global")
# How about "N_Bkg_total"?
ws.defineSet("NuisanceParameters","Ncomb_Fitcat1,Ncomb_Fitcat2,Ncomb_Fitcat3,exp_coef_Fitcat1,exp_coef_Fitcat2,exp_coef_Fitcat3")
# #Is this right?
ws.factory(f"PROD::PDF_Bkg_Fitcat1const(PDF_Bkg_Fitcat1,Constrain_BKG)")
ws.factory(f"PROD::PDF_Bkg_Fitcat2const(PDF_Bkg_Fitcat2,Constrain_BKG)")




##################################################################### 
#Build Model
#Fitcat 1 and 2 contain the BKG:
ws.factory(f"SUM::Model_Fitcat1(N_sig1Fitcat1*PDF_sig1_Fitcat1,"
                              f"N_sig2Fitcat1*PDF_sig2_Fitcat1,"
                              f"Ncomb_Fitcat1*PDF_comb_bkg_Fitcat1,"
                            f"N_Bkg_Fitcat1*PDF_Bkg_Fitcat1const)")
ws.factory(f"SUM::Model_Fitcat2(N_sig1Fitcat2*PDF_sig1_Fitcat2,"
                              f"N_sig2Fitcat2*PDF_sig2_Fitcat2,"
                              f"Ncomb_Fitcat2*PDF_comb_bkg_Fitcat2,"
                            f"N_Bkg_Fitcat2*PDF_Bkg_Fitcat2const)")
# Fitcat3 does not contain the BKG
ws.factory(f"SUM::Model_Fitcat3(N_sig1Fitcat3*PDF_sig1_Fitcat3,"
                              f"N_sig2Fitcat3*PDF_sig2_Fitcat3,"
                              f"Ncomb_Fitcat3*PDF_comb_bkg_Fitcat3)")   

##################################################################### 
# Build  RooSimultanous
roo_category = WS(ws,r.RooCategory("category","category"))
roo_category.defineType("Fitcat1")
roo_category.defineType("Fitcat2")
roo_category.defineType("Fitcat3")
dsmap = r.std.map('string, RooDataSet*')()
dsmap["Fitcat1"] = data_set_holder["Fitcat1"]
dsmap["Fitcat2"] = data_set_holder["Fitcat2"]
dsmap["Fitcat3"] = data_set_holder["Fitcat3"]
data_incat = WS(ws,r.RooDataSet("data","data",r.RooArgSet(mass),r.RooFit.Index(roo_category),r.RooFit.Import(dsmap),))
model = WS(ws,r.RooSimultaneous("Model_Final", "Model_Final", roo_category))
model.addPdf(ws.pdf(f"Model_Fitcat1"), "Fitcat1")
model.addPdf(ws.pdf(f"Model_Fitcat2"), "Fitcat2")
model.addPdf(ws.pdf(f"Model_Fitcat3"), "Fitcat3")


##################################################################### 
# Fit
args_fit = r.RooLinkedList()
args_fit.Add( r.RooFit.Extended(True))
args_fit.Add( r.RooFit.Constrain(r.RooArgSet(ws.var("N_Bkg_total") )))
args_fit.Add( r.RooFit.Minimizer("Minuit","migrad"))
args_fit.Add( r.RooFit.RecoverFromUndefinedRegions(100))
args_fit.Add( r.RooFit.EvalErrorWall(True))
args_fit.Add( r.RooFit.Offset(True))
args_fit.Add( r.RooFit.Hesse(True))
args_fit.Add( r.RooFit.Save())
args_fit.Add( r.RooFit.Strategy(2))
_fitResults= model.fitTo(data_incat, args_fit)

I am especially insecure if I need to multiply PDF of the BKG with the constraint or the full Model and how this would look in terms of the Likelihood.

Furhtermore, I implemented the upper limit calculation for one of the two signal componets:

##################################################################### 
# Calculate Limit for sig1  only
#####################################################################

#Set sig2 to zero
ws.var("BF_sig2").setVal(0)
ws.var("BF_sig2").setConstant(r.kTRUE)

#signal + background mode
sb_model = r.RooStats.ModelConfig("S+B_model",ws)
sb_model.SetPdf(ws.pdf("Model_Final"))
sb_model.SetGlobalObservables(ws.set("Global_Obs"))
sb_model.SetNuisanceParameters(ws.set("NuisanceParameters"))
sb_model.SetParametersOfInterest("BF_sig1")
sb_model.SetObservables(r.RooArgSet(ws.var("B_DTF_M_GeV"), ws.cat("category")))
sb_model.SetSnapshot(r.RooArgSet(ws.var("BF_sig1")))
val_sig1 = ws.var("BF_sig1").getVal()
# bkg ony model
b_model = r.RooStats.ModelConfig("B_model",ws)
b_model.SetPdf(ws.pdf("Model_Final"))
b_model.SetGlobalObservables(ws.set("Global_Obs"))
b_model.SetNuisanceParameters(ws.set("NuisanceParameters"))
ws.var("BF_sig1").setVal(0)
ws.var("BF_sig1").setConstant(r.kTRUE)
b_model.SetParametersOfInterest("BF_sig1")
b_model.SetObservables(r.RooArgSet(ws.var("B_DTF_M_GeV"), ws.cat("category")))
b_model.SetSnapshot(r.RooArgSet(ws.var("BF_sig1")))
ws.var("BF_sig1").setVal(val_sig1)
ws.var("BF_sig1").setConstant(r.kFALSE)
ac = r.RooStats.AsymptoticCalculator(ws.data("data"), b_model, sb_model)
ac.SetOneSided(True)  
calc = r.RooStats.HypoTestInverter(ac)
calc.UseCLs(True)
calc.SetConfidenceLevel(0.95)
poimin = 0
poimax =4e-9
npoints=20
calc.SetFixedScan(npoints,poimin,poimax)
results = calc.GetInterval()
upperLimit = results.UpperLimit()
ulError = results.UpperLimitEstimatedError()
expectedLimit = results.GetExpectedUpperLimit(0)

So from what I understood is that for the limit calculation, it is essential to set the Global Observables
and the Nuisance Parameters, to take the constraint into account. Do I also have to include the N_BKG_total in the Nuisance Parameters? And how do I deal with the second signal BF? I assume I have to set it to zero while I am determining the limit for the first BF?

I would appreciate any advice regarding this points.

Thank you,

Seb

couet · March 7, 2023, 12:55pm

Welcome to the ROOT forum
I think @moneta can help.

RENATO_QUAGLIANI · March 18, 2023, 5:25pm

Hi, me and @sebmnt are working together on this and we got confused on how to properly tell the Asymptotic calculator to apply gauss constraint in some of the categories of the fit.
We are quite confident if we would not rely on the factory constructors and roosimultanous PDF with indexed categories on how to do It juggling the likelihoods (plain sums of the log likelihood of each category and a single term for the constrained parameter), but once the factory Is used and we introduce gauss constraint It becomes unclear if the code would add for each category the same likelihood penalty if the constrained parameter Is shared among some of the categories or It would do It Only once as It should.
If there Is anyone which can provide a feedback on the original post would be great
. Thanks again
Renato

jonas · March 28, 2023, 3:05pm

Hello @sebmnt, sorry for the very late reply!

Here is one general thing to know for RooFit: the RooSimultaneous only works to create likelihoods when it’s the top-level RooFit argument. If you call fitTo() or createNLL() on a pdf that contains a RooSimultaneous but is not a RooSimultaneous itself, you will get garbage.

That means multiply in the full model (which is a RooSimultanous) with the constraints is out of the picture.

So now you’re asking which pdf to multiply with the constraint pdf? The answer is: it doesn’t matter, even if it’s duplicate. It only needs to be part of a RooProdPdf. When the likelihood object is constructed, RooFit goes though the full model and finds all the pdfs that are factors of RooProdPdfs that don’t depend on observables. These are extracted from the model, de-duplicated by name, and used to construct the RooConstraintSum that is added to the main likelihood. You can get a feeling for this with this example:

void demo()
{
    using namespace RooFit;

    RooWorkspace ws{"ws"};

    // only the n_bkg_total parameter is shared
    ws.factory("n_bkg_total[2000, 0, 100000]");

    ws.factory("Gaussian::sig_1(x_1[0, 10], mu_1[7, 0, 10], sigma_1[1, 0.1, 10])");
    ws.factory("Uniform::bkg_1(x_1)");
    ws.factory("expr::n_bkg_1('0.4 * n_bkg_total', n_bkg_total)");
    ws.factory("SUM::model_1(n_sig_1[100, 0, 100000] * sig_1, n_bkg_1 * bkg_1)");

    ws.factory("Gaussian::sig_2(x_2[0, 10], mu_2[4, 0, 10], sigma_2[2, 0.1, 10])");
    ws.factory("Uniform::bkg_2(x_2)");
    ws.factory("expr::n_bkg_2('0.6 * n_bkg_total', n_bkg_total)");
    ws.factory("SUM::model_2(n_sig_2[100, 0, 100000] * sig_2, n_bkg_2 * bkg_2)");

    // create the constraint
    ws.factory("Gaussian::n_bkg_total_constr(n_bkg_total_glob[2000], n_bkg_total, 1000)");

    // add the constraints anywhere (you can try adding them also in both channels)
    ws.factory("PROD::model_1_constr(model_1, n_bkg_total_constr)");
    ws.factory("PROD::model_2_constr(model_2)");

    // create the simultaneous pdf
    ws.factory("SIMUL::sim_pdf(channel_cat[A=0,B=1], A=model_1_constr, B=model_2_constr)");

    RooAbsPdf& pdf = *ws.pdf("sim_pdf");
    RooAbsArg& channelCat = *ws.arg("channel_cat");
    RooArgSet observables{*ws.var("x_1"), *ws.var("x_2"), channelCat};

    // create toy data
    std::unique_ptr<RooDataSet> data{pdf.generate(observables)};

    // create nll (don't forget the global observables such that the constraint pdfs get normalized correctly!)
    RooArgSet globs{*ws.var("n_bkg_total_glob")};
    std::unique_ptr<RooAbsReal> nllPtr{pdf.createNLL(*data, GlobalObservables(globs))};

    // inspect the nll (which is actualy a RooAddition). you don't need the cast in PyROOT
    auto& nll = static_cast<RooAddition&>(*nllPtr);

    // if there were constraints, the constraint sum is the last element in the addition
    auto& constraintSum = static_cast<RooConstraintSum&>(nll.list()[1]);

    // print the constraints
    constraintSum.Print();
}

Here is the output of the demo:

[#1] INFO:Minimization -- p.d.f. provides expected number of events, including extended term in likelihood.
[#1] INFO:Minimization --  Including the following constraint terms in minimization: (n_bkg_total_constr)
[#1] INFO:Minimization -- The following global observables have been defined and their values are taken from the model: (n_bkg_total_glob)
RooConstraintSum::nll_sim_pdf_hmaster_constr[ set1=(n_bkg_total_constr) ] = 7.82669

Therefore, your implementation was correct.

Concerting your second question with the upper limit calculation: that depends on the kind of analysis that you are doing. For sure the n_bkg_total belongs to the nuisance parameters. But if the second signal parameter should be set to constant or is a nuisance parameter depends a bit on what you want to measure. That’s something I can’t tell you, maybe ask your analysis team?

I hope this helps, let me know if you have further questions!

Cheers,
Jonas

sebmnt · April 3, 2023, 1:48pm

@jonas thanks. Your answer helped us a lot.

system · April 17, 2023, 1:48pm

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.