Please read tips for efficient and successful posting and posting code
ROOT Version: 6.16/00
Platform: Ubuntu 20.04
Compiler: linuxx8664gcc
Dear co-rooters,
(Please move this to a more appropriate category if you think one fits better.)
I have a set of experimental data gGn0
that I expect them to follow a chi-square distribution with 1 degree of freedom. I want to write a code that performs a Maximum Likelihood Estimation (MLE) analysis and determines: (a) the degrees of freedom and (b) the average value of my data along with the estimated uncertainties.
My sample size is expected to be very small (i.e. 50 or 100 or 200) depending on the thresholds I apply.
I wrote a code using TMinuit
and I want to test it before I apply it to my data.
I am generating random numbers of known degrees of freedom (nu=1.2) and average (=2.5) and I am expecting the code to return values which are close enough to the known input. I also expect that the uncertainty will be larger as the sample size becomes smaller.
Here’s my code
#include "TF1.h"
#include "TF2.h"
#include "TH1D.h"
#include "TPad.h"
#include "TFile.h"
#include "TTree.h"
#include "TMath.h"
#include "TText.h"
#include "TLine.h"
#include "TGraph.h"
#include "TFrame.h"
#include "TCanvas.h"
#include "TMinuit.h"
#include "TLegend.h"
#include "TGraphQQ.h"
#include "TRandom3.h"
#include "TFitResult.h"
#include "TMultiGraph.h"
#include "TGraphErrors.h"
#include "Math/Functor.h"
#include "Math/IFunction.h"
#include "Math/Integrator.h"
#include "Math/WrappedFunction.h"
#include "Math/PdfFuncMathCore.h"
#include <cmath>
#include <fstream>
#include <iostream>
#include <vector>
using namespace std;
// Global variables for likelihood fit
std::vector<double> data;
double average = 2.5; // <gGn0>
double nu = 1.2; // Degrees of freedom
// Likelihood function
void likelihoodFcn(int& npar, double* gin, double& f, double* par, int flag) {
double gGn0 = par[0];
double nu_fit = par[1];
double logL = 0.;
for (double x : data) {
// Use ROOT's built-in chisquared_pdf function
double chi_pdf = ROOT::Math::chisquared_pdf(x / gGn0, nu_fit) / gGn0;
logL += std::log(chi_pdf);
}
f = -2 * logL; // Minimize -2 * log-likelihood
}//___likelihoodFcn(int&, double*, double&, double*, int)
std::vector<double> generateData(int N) {
TRandom3 rand(1); // Seed the random generator
std::vector<double> generatedData; // Create a vector to store the generated data
for (int i = 0; i < N; ++i) {
// Use TMath::ChisquareQuantile to generate chi-square distributed random data
double uniform_random = rand.Uniform();
double chi2_value = TMath::ChisquareQuantile(uniform_random, nu);
double x = chi2_value * average; // Scale the chi-square value
generatedData.push_back(x); // Store the result in the vector
//cout << x << endl; // Print the generated value
}
return generatedData; // Return the generated data vector
}
void fitData(const std::vector<double>& inputData, double nu_guess, double gGn0_avrg_guess) {
data = inputData; // Set data from the input vector
TMinuit minuit(2); // 2 parameters: gGn0, nu
minuit.SetFCN(likelihoodFcn);
double initial_gGn0 = gGn0_avrg_guess; // Use the average as the initial guess
double initial_nu = nu_guess; // Use the user-provided guess
double par[2] = {initial_gGn0, initial_nu}; // Initial guesses
double step[2] = {0.01, 0.01}; // Step sizes
// Set parameter limits
double min_gGn0 = 0.1; // Lower limit for gGn0
double max_gGn0 = 20.0; // Upper limit for gGn0
double min_nu = 0.05; // Lower limit for nu
double max_nu = 10.0; // Upper limit for nu
minuit.DefineParameter(0, "gGn0", par[0], step[0], min_gGn0, max_gGn0);
minuit.DefineParameter(1, "nu" , par[1], step[1], min_nu , max_nu);
minuit.Migrad(); // Perform the fit
// Get fitted parameters and uncertainties
double gGn0_fit, nu_fit, gGn0_err, nu_err;
minuit.GetParameter(0, gGn0_fit, gGn0_err);
minuit.GetParameter(1, nu_fit , nu_err);
// Reporting results
std::cout << "Fitted gGn0: " << gGn0_fit << " ± " << gGn0_err << std::endl;
std::cout << "Fitted nu : " << nu_fit << " ± " << nu_err << std::endl;
// Emphasizing uncertainties based on sample size
if (data.size() < 50) { // Example threshold
std::cout << "Warning: Sample size is small (" << data.size() << ")."
<< " Larger uncertainties are expected." << std::endl;
}
}
void MLAnalysis(TString SAMMY_TTree_filename_in) {
//fitData(gGn0_range_vector, 1.0, 2.0); // Perform the maximum likelihood fit with generated data
int N = 20; //Number of data points
std::vector<double> generatedData = generateData(N); //Generate random data
fitData(generatedData, 1.0, 2.2);
}
To run it, compile it (`.L ML.C++`) and execute the entry function (`MLAnalysis("No_File_Needed.root")`).
When the sample size is large (i.e. N=1000) I get values for nu
and <gGn0>
which are close to what I input i.e.
Fitted gGn0: 2.64453 ± 0.14852
Fitted nu : 1.13675 ± 0.0424665
When N=150
, I am still happy with what I get
Fitted gGn0: 2.74454 ± 0.400776
Fitted nu : 1.10148 ± 0.105908
For N=100 I am worried
Fitted gGn0: 3.0325 ± 0.548412
Fitted nu : 1.05137 ± 0.123265
But when N=30
Fitted gGn0: 4.55642 ± 1.60826
Fitted nu : 0.807968 ± 0.168769
I am happy with <gGn0>
but nu
isn’t quite right. Had the uncertainty been larger, I wouldn’t worry.
The question is, is there anything evident that results in such a small estimation of the uncertainty in nu?
Note: I have applied the code to my actual data (N=215) and both nu
and <gGn0>
are not as expected, so I’m worried I’m missing something important.