TTree memory limit

The code I use to access the dataset files is:

python config:

process.source = cms.Source("PoolSource",
    fileNames = cms.untracked.vstring(XXX),
lumisToProcess = cms.untracked.VLuminosityBlockRange('319104:1-319104:10',
'319104:15-319104:185','319124:91-319124:277','319125:1-319125:208','319159:125-319159:618',
'319174:1-319174:77','319175:1-319175:139','319176:1-319176:1803','319177:1-319177:232',
'319190:1-319190:317','319222:108-319222:294','319223:1-319223:131','319254:115-319254:263',
'319255:1-319255:164','319256:1-319256:726','319262:10-319262:10','319262:15-319262:16',
'319262:20-319262:23','319262:29-319262:34','319262:39-319262:40','319262:46-319262:58',
'319262:61-319262:78','319262:82-319262:123','319262:129-319262:362','319263:1-319263:367',
'319264:1-319264:57','319265:1-319265:396','319266:1-319266:26','319267:1-319267:204',
'319268:1-319268:467','319270:1-319270:206','319300:1-319300:1132','319311:1-319311:1733'
)
)

process.TFileService = cms.Service("TFileService",
            fileName = cms.string("output.root"),
            closeFileFast = cms.untracked.bool(False)
)

submit-condorRECOall.csh (see the attachment):

  cat ../${template_py} | sed "s|XXX|${mylist}|"  > temp_py
  cat  temp_py       | sed "s|YYY|${i}|" > ${submit_dir}/job_${i}.py
  rm -f temp_py
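
For clarity, the same two-step substitution as a short Python sketch (the values below are placeholders standing in for the csh variables ${template_py}, ${mylist}, ${i} and ${submit_dir}; not part of the actual submission script):

from pathlib import Path

# Placeholder values for the csh variables used above (hypothetical).
template_py = "template_cfg.py"
mylist = "'file:input.root'"
i = 1
submit_dir = "submit"

# One pass instead of the two sed calls: fill in XXX and YYY.
text = Path("..", template_py).read_text()
text = text.replace("XXX", mylist).replace("YYY", str(i))
Path(submit_dir, f"job_{i}.py").write_text(text)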

shell:

./submit-condorRECOall.csh t201 t20.eos_1 

PromptAnalyzer.cc:

// ------------ method called for each event  ------------
void
PromptAnalyzer::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup)
{
...
}

// ------------ method called once each job just before starting event loop  ------------
void
PromptAnalyzer::beginJob()
{
  //...Luiz
  edm::Service<TFileService> fs;
  
  int nbins_eta = 80;
  int nbins_pt = 100;
  int nbins_phi = 64;

  histosTH1F["hpt"] = fs->make<TH1F>("hpt","p_{T}",nbins_pt,0,5);
  histosTH1F["heta"] = fs->make<TH1F>("heta","#eta",nbins_eta,-4,4);
  histosTH1F["hphi"] = fs->make<TH1F>("hphi","#varphi",nbins_phi,-3.2,3.2);
  histosTH1F["halgo"] = fs->make<TH1F>("halgo","Algo",15,0,15.);
  histosTH1F["hnhits"] = fs->make<TH1F>("hnhits","nhits pix+strip",40,0,40.);
...
  std::cout<<"booked all of Luiz' histograms."<<std::endl;
  //--------------end of my histograms
}

// ------------ method called once each job just after ending the event loop  ------------
void
PromptAnalyzer::endJob()
{
  std::cout<<"ciao ciao..."<<std::endl;
}

// ------------ method called when starting to processes a run  ------------
void 
PromptAnalyzer::beginRun(edm::Run const& run, edm::EventSetup const& es)
{
  bool changed(true);
  if (hltConfig_.init(run, es, "HLT",changed)) {
    hltConfig_.dump("Triggers");
    hltConfig_.dump("PrescaleTable"); 
  }
}

// ------------ method called when ending the processing of a run  ------------
void 
PromptAnalyzer::endRun(edm::Run const&, edm::EventSetup const&)
{
}

job_1914.py:

process.source = cms.Source("PoolSource",
    fileNames = cms.untracked.vstring(
        'root://eostotem//eos/totem/data/cmstotem/2018/90m/RECO_copy/TOTEM20/110000/2625EB46-453E-E911-8EB8-008CFA06473C.root',
        'root://eostotem//eos/totem/data/cmstotem/2018/90m/RECO_copy/TOTEM20/110000/261F8013-B83D-E911-9A99-003048F2E8C0.root'),
lumisToProcess = cms.untracked.VLuminosityBlockRange('319104:1-319104:10',
'319104:15-319104:185','319124:91-319124:277','319125:1-319125:208','319159:125-319159:618',
'319174:1-319174:77','319175:1-319175:139','319176:1-319176:1803','319177:1-319177:232',
'319190:1-319190:317','319222:108-319222:294','319223:1-319223:131','319254:115-319254:263',
'319255:1-319255:164','319256:1-319256:726','319262:10-319262:10','319262:15-319262:16',
'319262:20-319262:23','319262:29-319262:34','319262:39-319262:40','319262:46-319262:58',
'319262:61-319262:78','319262:82-319262:123','319262:129-319262:362','319263:1-319263:367',
'319264:1-319264:57','319265:1-319265:396','319266:1-319266:26','319267:1-319267:204',
'319268:1-319268:467','319270:1-319270:206','319300:1-319300:1132','319311:1-319311:1733'
)
)
process.TFileService = cms.Service("TFileService",
            fileName = cms.string("output.root"),
            closeFileFast = cms.untracked.bool(False)
)

submit-condorRECOall-csh.txt (3.0 KB)

Since the failing output file is produced by its own job (job_1914.py), does re-running that job lead to a similarly broken file? Does calling hadd on a small subset (a dozen files) that includes the failing 1914 file also fail?

dataset: t200

ulimit -S -n 4096
ulimit -S -s 32768
hadd -T -n 50 x200.root data_*.root
… (see the attachment)
hadd Opening the next 49 files
hadd Target path: x200.root:/
hadd Target path: x200.root:/demo
hadd Opening the next 49 files
hadd Target path: x200.root:/
hadd Target path: x200.root:/demo
hadd Opening the next 49 files
Warning in TFile::Init: file data_1422.root probably not closed, trying to recover
Info in TFile::Recover: data_1422.root, recovered key TDirectoryFile:demo at address 232
Warning in TFile::Init: successfully recovered 1 keys
hadd Target path: x200.root:/
hadd Target path: x200.root:/demo

dataset: t210

ulimit -S -n 4096
ulimit -S -s 32768
hadd -T -n 50 x210.root data_*.root
… (see the attachment)
hadd Target path: x210.root:/demo
hadd Opening the next 49 files
hadd Target path: x210.root:/
hadd Target path: x210.root:/demo
hadd Opening the next 49 files
hadd Target path: x210.root:/
hadd Target path: x210.root:/demo
hadd Opening the next 49 files
Warning in TFile::Init: file data_576.root probably not closed, trying to recover
Info in TFile::Recover: data_576.root, recovered key TDirectoryFile:demo at address 232
Warning in TFile::Init: successfully recovered 1 keys
hadd Target path: x210.root:/
hadd Target path: x210.root:/demo
hadd Opening the next 49 files
hadd Target path: x210.root:/
hadd Target path: x210.root:/demo
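
In both datasets the pattern is the same: a single input file per batch triggers TFile::Recover (for reference, hadd -T skips merging TTrees and -n 50 caps the number of files opened at once). A PyROOT scan along these lines, a minimal sketch assuming the same data_*.root layout, flags such files before merging:

import glob
import ROOT

# Report inputs that are unreadable or that ROOT has to recover,
# i.e. files that were never properly closed by the job.
for name in sorted(glob.glob("data_*.root")):
    f = ROOT.TFile.Open(name)
    if not f or f.IsZombie():
        print(name, "-> unreadable (zombie)")
        continue
    if f.TestBit(ROOT.TFile.kRecovered):
        print(name, "-> recovered, probably not closed")
    f.Close()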

Full outputs and ROOT files attached.

hadd-t200-2.txt (78.9 KB)
hadd-t210.txt (76.3 KB)
data_576.root (339 Bytes)
data_1422.root (339 Bytes)
data_1914.root (1.1 MB)

The data_1914.root provided above is now from the t200 dataset, which is good: no error. The previous error on 1914 came from another, unknown dataset (sorry, I do not remember which one). So forget 1914 and focus on 1422 and 576.

“576” and “1422” are completely empty (just an empty “demo” directory inside).
You need to inspect the jobs that created them; it seems they died before any histograms were written.
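
A quick PyROOT look, a minimal sketch assuming data_1422.root is in the working directory, confirms this:

import ROOT

f = ROOT.TFile.Open("data_1422.root")
for key in f.GetListOfKeys():
    print(key.GetClassName(), key.GetName())
    obj = key.ReadObj()
    # For the broken files the "demo" directory holds 0 keys.
    if obj.InheritsFrom("TDirectory"):
        print("  keys inside:", obj.GetListOfKeys().GetEntries())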

Please check the job 1422 files attached.

job_1422-err.txt (12.3 KB)
job_1422-out.txt (12.4 KB)
job_1422-py.txt (5.4 KB)
job_1422-sh.txt (564 Bytes)
submit_1422.txt (312 Bytes)
files_1422.txt (238 Bytes)

job 576:
job_576.err.txt (14.8 KB)
job_576.out.txt (12.4 KB)
job_576.py.txt (5.4 KB)
job_576.sh.txt (561 Bytes)
submit_576.txt (308 Bytes)

I am going to resubmit 1422.

In both “job_*.err.txt” files:

(...)
 from UserCode/EnergyLossPID/data/mostprob_totem.dat
A fatal system signal has occurred: segmentation violation
The following is the call stack containing the origin of the signal.
Current Modules:
Module: EnergyLossProducer:energyLossProducer (crashed)
A fatal system signal has occurred: segmentation violation
(...)

Those jobs are failing.

A fatal system signal has occurred: segmentation violation
The following is the call stack containing the origin of the signal.
...
#4  <signal handler called>
#5  0x00002b4dd6dfdfe5 in ClusterShapeHitFilter::getSizes(DetId, SiStripCluster const&, Point3DBase<float, LocalTag> const&, Vector3DBase<float, LocalTag> const&, int&, float&) const () from /cvmfs/cms.cern.ch/slc7_amd64_gcc820/cms/cmssw/CMSSW_10_6_18/lib/slc7_amd64_gcc820/libRecoPixelVertexingPixelLowPtUtilities.so
#6  0x00002b4df4ec7c96 in HadronClusters::processRec(SiStripRecHit2D const&, Vector3DBase<float, LocalTag>, TTrack&, float, float) () from /afs/cern.ch/user/l/lregisem/CMSSW_10_6_18/lib/slc7_amd64_gcc820/libUserCodeEnergyLossPID.so
#7  0x00002b4df4ecaf91 in HadronClusters::analyzeRecTrajectory(SiPixelClusterShapeCache const&, Trajectory const&, TTrack&) () from /afs/cern.ch/user/l/lregisem/CMSSW_10_6_18/lib/slc7_amd64_gcc820/libUserCodeEnergyLossPID.so
#8  0x00002b4df4e7ba88 in EnergyLossProducer::produce(edm::Event&, edm::EventSetup const&) () from /afs/cern.ch/user/l/lregisem/CMSSW_10_6_18/lib/slc7_amd64_gcc820/pluginUserCodeEnergyLossPIDPlugins.so

You will need to correct this error (inside CMSSW and/or your code) before proceeding.

Oh no, the problem is in my own code, silly me. I should have paid more attention to the error files.
Sorry for wasting your time.

Thanks a lot for your kind attention.

Luiz