/// \file
/// \ingroup tutorial_tmva
/// \notebook -nodraw
/// This macro provides examples for the training and testing of the
/// TMVA classifiers.
///
/// As input it uses the ROOT file "TMVA_all.root", which contains one signal
/// tree and a set of background trees of kinematic variables (the original
/// tutorial used a toy-MC sample of four Gaussian-distributed, linearly
/// correlated input variables).
/// The methods to be used can be switched on and off by means of booleans, or
/// via the prompt command, for example:
///
///     root -l ./TMVAClassification.C\(\"Fisher,Likelihood\"\)
///
/// (note that the backslashes are mandatory)
/// If no method is given, a default set of classifiers is used.
/// The output file "TMVA.root" can be analysed with the use of dedicated
/// macros (simply say: root -l <macro.C>), which can be conveniently
/// invoked through a GUI that will appear at the end of the run of this macro.
/// Launch the GUI via the command:
///
///     root -l ./TMVAGui.C
///
/// You can also compile and run the example with the following commands
///
///     make
///     ./TMVAClassification <Methods>
///
/// where: `<Methods> = "method1 method2"` are the TMVA classifier names
/// example:
///
///     ./TMVAClassification Fisher LikelihoodPCA BDT
///
/// If no method is given, a default set of classifiers is used.
///
/// - Project    : TMVA - a ROOT-integrated toolkit for multivariate data analysis
/// - Package    : TMVA
/// - Root Macro : TMVAClassification
///
/// \macro_output
/// \macro_code
/// \author Andreas Hoecker

#include <cstdlib>
#include <iostream>
#include <map>
#include <string>

#include "TChain.h"
#include "TFile.h"
#include "TTree.h"
#include "TString.h"
#include "TObjString.h"
#include "TSystem.h"
#include "TROOT.h"

#include "TMVA/Factory.h"
#include "TMVA/DataLoader.h"
#include "TMVA/Tools.h"
#include "TMVA/TMVAGui.h"

int TMVAClassification( TString myMethodList = "" )
{
   // The explicit loading of the shared libTMVA is done in TMVAlogon.C, defined in .rootrc
   // if you use your private .rootrc, or run from a different directory, please copy the
   // corresponding lines from .rootrc

   // Methods to be processed can be given as an argument; use format:
   //
   //     mylinux~> root -l TMVAClassification.C\(\"myMethod1,myMethod2,myMethod3\"\)

   //---------------------------------------------------------------
   // This loads the library
   TMVA::Tools::Instance();

   // Use 20 bins for the MVA output distributions
   //TMVA::Config::Instance().fNbinsMVAoutput = 200;
   (TMVA::gConfig().GetVariablePlotting()).fNbinsMVAoutput = 20;

   // Default MVA methods to be trained + tested
   std::map<std::string,int> Use;

   // Boosted Decision Trees
   Use["BDT"]  = 1; // uses Adaptive Boost
   Use["BDTG"] = 0; // uses Gradient Boost
   Use["BDTB"] = 0; // uses Bagging
   Use["BDTD"] = 0; // decorrelation + Adaptive Boost
   Use["BDTF"] = 0; // allow usage of fisher discriminant for node splitting
   //
   // ---------------------------------------------------------------

   std::cout << std::endl;
   std::cout << "==> Start TMVAClassification" << std::endl;

   // Select methods (don't look at this code - not of interest)
   if (myMethodList != "") {
      for (std::map<std::string,int>::iterator it = Use.begin(); it != Use.end(); it++) it->second = 0;

      std::vector<TString> mlist = TMVA::gTools().SplitString( myMethodList, ',' );
      for (UInt_t i=0; i<mlist.size(); i++) {
         std::string regMethod(mlist[i]);

         if (Use.find(regMethod) == Use.end()) {
            std::cout << "Method \"" << regMethod
                      << "\" not known in TMVA under this name. Choose among the following:" << std::endl;
            for (std::map<std::string,int>::iterator it = Use.begin(); it != Use.end(); it++)
               std::cout << it->first << " ";
            std::cout << std::endl;
            return 1;
         }
         Use[regMethod] = 1;
      }
   }

   // --------------------------------------------------------------------------------------------------

   // const Int_t MaxnJet = 10000;

   // Create a ROOT output file where TMVA will store ntuples, histograms, etc.
   TString outfileName( "TMVA.root" );
   TFile* outputFile = TFile::Open( outfileName, "RECREATE" );
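   // Not in the original tutorial: a minimal sanity check, so that the
   // (potentially long) training run does not proceed if the output file
   // could not be created.
   if (!outputFile || outputFile->IsZombie()) {
      std::cout << "==> Could not create output file " << outfileName << std::endl;
      return 1;
   }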
   // Create the factory object. Later you can choose the methods
   // whose performance you'd like to investigate. The factory is
   // the only TMVA object you have to interact with.
   //
   // The first argument is the base of the name of all the
   // weight files written to the directory dataset/weights/
   //
   // The second argument is the output file for the training results
   // All TMVA output can be suppressed by removing the "!" (not) in
   // front of the "Silent" argument in the option string
   TMVA::Factory *factory = new TMVA::Factory( "TMVAClassification", outputFile,
      "!V:!Silent:Color:DrawProgressBar:Transformations=I:AnalysisType=Classification" );
   // "!V:!Silent:Color:DrawProgressBar:Transformations=I;D;P;G,D:AnalysisType=Classification" );

   TMVA::DataLoader *dataloader = new TMVA::DataLoader("dataset");

   // If you wish to modify default settings
   // (please check "src/Config.h" to see all available global options)
   //
   //     (TMVA::gConfig().GetVariablePlotting()).fTimesRMS = 8.0;
   //     (TMVA::gConfig().GetIONames()).fWeightFileDir = "myWeightDirectory";

   // Define the input variables that shall be used for the MVA training
   // note that you may also use variable expressions, such as: "3*var1/var2*abs(var3)"
   // [all types of expressions that can also be parsed by TTree::Draw( "expression" )]
   dataloader->AddVariable( "M_bb",  "", "GeV", 'F' );
   dataloader->AddVariable( "PT_bb", "", "GeV", 'F' );
   dataloader->AddVariable( "M_aa",  "", "GeV", 'F' );
   dataloader->AddVariable( "PT_aa", "", "GeV", 'F' );
   dataloader->AddVariable( "DR_bb", 'F' );
   dataloader->AddVariable( "DR_aa", 'F' );
   dataloader->AddVariable( "DR_ab", 'F' );

   // You can add an arbitrary number of signal or background trees
   TString fname = "./TMVA_all.root";
   TFile *input = TFile::Open( fname );
   std::cout << "--- TMVAClassification : Using input file: " << input->GetName() << std::endl;

   TTree *signal   = (TTree*)input->Get("signal");
   TTree *bbja_BG  = (TTree*)input->Get("bbja_BG");
   TTree *bbaa_BG  = (TTree*)input->Get("bbaa_BG");
   TTree *bbjj_BG  = (TTree*)input->Get("bbjj_BG");
   TTree *bbh_BG   = (TTree*)input->Get("bbh_BG");
   TTree *tth_BG   = (TTree*)input->Get("tth_BG");
   TTree *ccja_BG  = (TTree*)input->Get("ccja_BG");
   TTree *ccaa_BG  = (TTree*)input->Get("ccaa_BG");
   TTree *ggh_BG   = (TTree*)input->Get("ggh_BG");
   TTree *zh_BG    = (TTree*)input->Get("zh_BG");
   TTree *zaa_BG   = (TTree*)input->Get("zaa_BG");
   TTree *jjaa_BG  = (TTree*)input->Get("jjaa_BG");
   TTree *ttalb_BG = (TTree*)input->Get("ttalb_BG");
   TTree *ttalo_BG = (TTree*)input->Get("ttalo_BG");
   TTree *ttasb_BG = (TTree*)input->Get("ttasb_BG");
   TTree *ttaso_BG = (TTree*)input->Get("ttaso_BG");
   TTree *ttlbb_BG = (TTree*)input->Get("ttlbb_BG");
   TTree *ttloo_BG = (TTree*)input->Get("ttloo_BG");
   TTree *ttlob_BG = (TTree*)input->Get("ttlob_BG");
   TTree *ttsb_BG  = (TTree*)input->Get("ttsb_BG");
   TTree *ttso_BG  = (TTree*)input->Get("ttso_BG");

   // Global weights per process (see the note on normalisation below)
   Double_t signalWeight = 0.119;
   Double_t bbjaWeight   = 367000.0;
   Double_t bbaaWeight   = 140.0;
   Double_t bbjjWeight   = 434000000.0;
   Double_t bbhWeight    = 1.26;
   Double_t tthWeight    = 1.37;
   Double_t ccjaWeight   = 1050000.0;
   Double_t ccaaWeight   = 1140.0;
   Double_t gghWeight    = 120.0;
   Double_t zhWeight     = 2.24;
   Double_t zaaWeight    = 5.17;
   Double_t jjaaWeight   = 16200.0;
   Double_t ttalbWeight  = 6.1515936;
   Double_t ttaloWeight  = 6.1515936;
   Double_t ttasbWeight  = 25.6696128;
   Double_t ttasoWeight  = 25.6696128;
   Double_t ttlooWeight  = 40.98329356;
   Double_t ttlobWeight  = 40.98329356;
   Double_t ttlbbWeight  = 40.98329356;
   Double_t ttsoWeight   = 4.275417661;
   Double_t ttsbWeight   = 4.275417661;
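   // Register the trees with per-tree global weights. The numerical factors
   // below are not documented in the macro itself; they appear to follow the
   // pattern
   //
   //     process weight x luminosity x N(selected)/N(generated) x fake/mis-tag rates
   //
   // with 3000.0 presumably an integrated luminosity in fb^-1. Treat this
   // reading as an educated guess rather than a statement by the author.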
   dataloader->AddSignalTree    ( signal,   signalWeight*3000.0*19039.0/300000.0 );
   dataloader->AddBackgroundTree( bbja_BG,  bbjaWeight*3000.0*5.0/10000.0*73273.0/3000000.0 );
   dataloader->AddBackgroundTree( bbaa_BG,  bbaaWeight*3000.0*60253.0/3000000.0 );
   dataloader->AddBackgroundTree( bbjj_BG,  bbjjWeight*3000.0*56126.0/5000000.0*5.0/10000.0*5.0/10000.0 );
   dataloader->AddBackgroundTree( bbh_BG,   bbhWeight*3000.0*666.0/1000000.0 );
   dataloader->AddBackgroundTree( tth_BG,   tthWeight*3000.0*38478.0/1000000.0 );
   dataloader->AddBackgroundTree( ccja_BG,  ccjaWeight*3000.0*10173.0/1000000.0*5.0/10000.0*1/8.0*1/8.0 );
   dataloader->AddBackgroundTree( ccaa_BG,  ccaaWeight*3000.0*6490.0/1000000.0*1/8.0*1/8.0 );
   dataloader->AddBackgroundTree( ggh_BG,   gghWeight*3000.0*580.0/3000000.0 );
   dataloader->AddBackgroundTree( zh_BG,    zhWeight*3000.0*3582.0/1000000.0 );
   dataloader->AddBackgroundTree( zaa_BG,   zaaWeight*3000.0*37830.0/3000000.0 );
   dataloader->AddBackgroundTree( jjaa_BG,  jjaaWeight*3000.0*4628.0/3000000.0*1/8.0*1/8.0 );
   dataloader->AddBackgroundTree( ttalb_BG, ttalbWeight*3000.0*84946.0/4000000.0*0.02 );
   dataloader->AddBackgroundTree( ttalo_BG, ttaloWeight*3000.0*33154.0/4000000.0*0.05 );
   dataloader->AddBackgroundTree( ttaso_BG, ttasoWeight*3000.0*36702.0/7000000.0*0.05 );
   dataloader->AddBackgroundTree( ttasb_BG, ttasbWeight*3000.0*87521.0/7000000.0*0.02 );
   dataloader->AddBackgroundTree( ttloo_BG, ttlooWeight*3000.0*3413.0/4000000.0*0.05*0.05 );
   dataloader->AddBackgroundTree( ttlob_BG, ttlobWeight*3000.0*37621.0/4000000.0*0.02*0.05 );
   dataloader->AddBackgroundTree( ttlbb_BG, ttlbbWeight*3000.0*103455.0/4000000.0*0.02*0.02 );
   dataloader->AddBackgroundTree( ttso_BG,  ttsoWeight*3000.0*90344.0/4000000.0*5.0/10000.0*0.05 );
   dataloader->AddBackgroundTree( ttsb_BG,  ttsbWeight*3000.0*215411.0/4000000.0*5.0/10000.0*0.02 );

   // To give different trees for training and testing, do as follows:
   //
   //     dataloader->AddSignalTree( signalTrainingTree, signalTrainWeight, "Training" );
   //     dataloader->AddSignalTree( signalTestTree,     signalTestWeight,  "Test" );

   // Use the following code instead of the above two or four lines to add signal and background
   // training and test events "by hand"
   // NOTE that in this case one should not give expressions (such as "var1+var2") in the input
   // variable definition, but simply compute the expression before adding the event
   // ```cpp
   // // --- begin ----------------------------------------------------------
   // std::vector<Double_t> vars( 4 ); // vector has size of number of input variables
   // Float_t  treevars[4], weight;
   //
   // // Signal
   // for (UInt_t ivar=0; ivar<4; ivar++) signalTree->SetBranchAddress( Form( "var%i", ivar+1 ), &(treevars[ivar]) );
   // for (UInt_t i=0; i<signalTree->GetEntries(); i++) {
   //    signalTree->GetEntry(i);
   //    for (UInt_t ivar=0; ivar<4; ivar++) vars[ivar] = treevars[ivar];
   //    // add training and test events; here: first half is training, second is testing
   //    // note that the weight can also be event-wise
   //    if (i < signalTree->GetEntries()/2.0) dataloader->AddSignalTrainingEvent( vars, signalWeight );
   //    else                                  dataloader->AddSignalTestEvent    ( vars, signalWeight );
   // }
   //
   // // Background (has event weights)
   // background->SetBranchAddress( "weight", &weight );
   // for (UInt_t ivar=0; ivar<4; ivar++) background->SetBranchAddress( Form( "var%i", ivar+1 ), &(treevars[ivar]) );
   // for (UInt_t i=0; i<background->GetEntries(); i++) {
   //    background->GetEntry(i);
   //    for (UInt_t ivar=0; ivar<4; ivar++) vars[ivar] = treevars[ivar];
   //    // add training and test events; here: first half is training, second is testing
   //    // note that the weight can also be event-wise
   //    if (i < background->GetEntries()/2) dataloader->AddBackgroundTrainingEvent( vars, backgroundWeight*weight );
   //    else                                dataloader->AddBackgroundTestEvent    ( vars, backgroundWeight*weight );
   // }
   // // --- end ------------------------------------------------------------
   // ```
   // End of tree registration

   // Apply additional cuts on the signal and background samples (can be different)
   TCut mycuts = ""; //"DR_bb < 5.0 && DR_aa < 5.0 && DR_aj < 5.0"; // for example: TCut mycuts = "abs(var1)<0.5 && abs(var2-0.5)<1";
   TCut mycutb = ""; //"DR_bb < 5.0 && DR_aa < 5.0 && DR_aj < 5.0"; // for example: TCut mycutb = "abs(var1)<0.5";

   // Tell the dataloader how to use the training and testing events
   dataloader->PrepareTrainingAndTestTree( mycuts, mycutb,
      "nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=None:!V" );

   // ### Book MVA methods
   //
   // Please lookup the various method configuration options in the corresponding cxx files, eg:
   // src/MethodCuts.cxx, etc, or here: http://tmva.sourceforge.net/optionRef.html
   // it is possible to preset ranges in the option string in which the cut optimisation should be done:
   // "...:CutRangeMin[2]=-1:CutRangeMax[2]=1...", where [2] is the third input variable

   // Boosted Decision Trees
   if (Use["BDTG"]) // Gradient Boost
      factory->BookMethod( dataloader, TMVA::Types::kBDT, "BDTG",
         "!H:!V:NTrees=100:MinNodeSize=2.5%:BoostType=Grad:Shrinkage=0.10:UseBaggedBoost:BaggedSampleFraction=0.5:nCuts=20:MaxDepth=2:NegWeightTreatment=Pray" );

   if (Use["BDT"])  // Adaptive Boost
      factory->BookMethod( dataloader, TMVA::Types::kBDT, "BDT",
         "!H:!V:NTrees=800:MinNodeSize=2.5%:MaxDepth=3:BoostType=AdaBoost:AdaBoostBeta=0.5:UseBaggedBoost:BaggedSampleFraction=0.5:SeparationType=GiniIndex:nCuts=20" );

   if (Use["BDTB"]) // Bagging
      factory->BookMethod( dataloader, TMVA::Types::kBDT, "BDTB",
         "!H:!V:NTrees=400:BoostType=Bagging:SeparationType=GiniIndex:nCuts=20" );

   if (Use["BDTD"]) // Decorrelation + Adaptive Boost
      factory->BookMethod( dataloader, TMVA::Types::kBDT, "BDTD",
         "!H:!V:NTrees=400:MinNodeSize=5%:MaxDepth=3:BoostType=AdaBoost:SeparationType=GiniIndex:nCuts=20:VarTransform=Decorrelate" );

   if (Use["BDTF"]) // Allow usage of the Fisher discriminant for node splitting of (strongly) linearly correlated variables
      factory->BookMethod( dataloader, TMVA::Types::kBDT, "BDTF",
         "!H:!V:NTrees=50:MinNodeSize=2.5%:UseFisherCuts:MaxDepth=3:BoostType=AdaBoost:AdaBoostBeta=0.5:SeparationType=GiniIndex:nCuts=20" );

   // For an example of the category classifier usage, see: TMVAClassificationCategory
   //
   // --------------------------------------------------------------------------------------------------

   // Now you can optimize the setting (configuration) of the MVAs using the set of training events
   // STILL EXPERIMENTAL and only implemented for BDT's !
   //
   //     factory->OptimizeAllMethods("SigEffAt001","Scan");
   //     factory->OptimizeAllMethods("ROCIntegral","FitGA");
   //
   // --------------------------------------------------------------------------------------------------

   // Now you can tell the factory to train, test, and evaluate the MVAs
   //
   // Train MVAs using the set of training events
   factory->TrainAllMethods();

   // Evaluate all MVAs using the set of test events
   factory->TestAllMethods();

   // Evaluate and compare performance of all configured MVAs
   factory->EvaluateAllMethods();

   // --------------------------------------------------------------

   // Save the output
   outputFile->Close();

   std::cout << "==> Wrote root file: " << outputFile->GetName() << std::endl;
   std::cout << "==> TMVAClassification is done!" << std::endl;
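   // To apply the trained classifier to new data, a TMVA::Reader can be used
   // (see the TMVAClassificationApplication tutorial for the full version).
   // The sketch below is illustrative only: it assumes the default weight-file
   // location produced by this macro (dataset/weights/) and additionally
   // requires #include "TMVA/Reader.h".
   //
   //     TMVA::Reader *reader = new TMVA::Reader( "!Color:!Silent" );
   //     Float_t m_bb, pt_bb, m_aa, pt_aa, dr_bb, dr_aa, dr_ab;
   //     reader->AddVariable( "M_bb",  &m_bb  );
   //     reader->AddVariable( "PT_bb", &pt_bb );
   //     reader->AddVariable( "M_aa",  &m_aa  );
   //     reader->AddVariable( "PT_aa", &pt_aa );
   //     reader->AddVariable( "DR_bb", &dr_bb );
   //     reader->AddVariable( "DR_aa", &dr_aa );
   //     reader->AddVariable( "DR_ab", &dr_ab );
   //     reader->BookMVA( "BDT", "dataset/weights/TMVAClassification_BDT.weights.xml" );
   //     // fill the Float_t variables for each event, then:
   //     Double_t mvaValue = reader->EvaluateMVA( "BDT" );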
   delete factory;
   delete dataloader;

   // Launch the GUI for the root macros
   if (!gROOT->IsBatch()) TMVA::TMVAGui( outfileName );

   return 0;
}

int main( int argc, char** argv )
{
   // Select methods (don't look at this code - not of interest)
   TString methodList;
   for (int i=1; i<argc; i++) {
      TString regMethod(argv[i]);
      if (regMethod == "-b" || regMethod == "--batch") continue;
      if (!methodList.IsNull()) methodList += TString(",");
      methodList += regMethod;
   }
   return TMVAClassification(methodList);
}
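// Standalone build sketch (not part of the original macro): assuming a working
// ROOT installation, a command along these lines should compile this file;
// note that root-config does not pull in the TMVA library by default, hence
// the explicit -lTMVA:
//
//     g++ -o TMVAClassification TMVAClassification.C $(root-config --cflags --libs) -lTMVA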