Note one little gotcha so far (which bothers me) is that when I create a TChain and then run tc.CopyTree – I see two different copies, both are sort of inaccurate.
output_file.cd()
for treename in args.trees:
logger.debug('Building TChain for {0:s}'.format(treename))
tc = ROOT.TChain(treename)
for f in files:
logger.debug('\t{0:s}'.format(f))
tc.Add(f)
logger.debug('Copying TChain now')
tree = tc.CopyTree("")
if args.do_backward_compatibility:
group = group.replace('singletop', 'SingleTop')
group = group.replace('topEW', 'TopEW')
group = group.replace('_sherpa', 'jets')
group = group.replace('_5000', '')
treename = treename.replace('nominal', 'NoSys')
# names need to be format <sample>_<systematic>
tree.SetName("_".join([group, treename]))
tree.SetTitle("_".join([group, treename]))
tree.SetDirectory(output_file)
output_file.cd()
tree.Write()
When I do this… I actually see two TTrees. One named “nominal” and one named “ttbar_NoSys” (where group = “ttbar”). The really unfortunate thing is that “nominal” seems to have been created because of the TreeChain call while “ttbar_NoSys” is from the tree.Write() call. I don’t want “nominal”. The second problem is that the “ttbar_NoSys” is missing an entire file from it, which exists in “nominal” but not in “ttbar_NoSys” [they should be equivalent according to my code above anyway, yet they aren’t!] This is code I’m converting from using CloneTree to using CopyTree…
The code I’m working on that’s spawning this thread in the first place is here (gitlab.cern.ch/MultiBJets/HF_MB … geTrees.py) which I’ll copy below.
#!/usr/bin/env python
# -*- coding: utf-8 -*-,
# @file: mergeTrees.py
# @purpose: merge many files into few files and copy trees and systematics
# @author: Giordon Stark <gstark@cern.ch>
# @date: July 2016
#
# __future__ imports must occur at beginning of file
# redirect python output using the newer print function with file description
# print(string, f=fd)
from __future__ import print_function
import logging
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)
#The background is set with 40 plus the number of the color, and the foreground with 30
#These are the sequences need to get colored ouput
RESET_SEQ = "\033[0m"
COLOR_SEQ = "\033[1;%dm"
BOLD_SEQ = "\033[1m"
def formatter_message(message, use_color = True):
if use_color:
message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ)
else:
message = message.replace("$RESET", "").replace("$BOLD", "")
return message
COLORS = {
'WARNING': YELLOW,
'INFO': WHITE,
'DEBUG': BLUE,
'CRITICAL': YELLOW,
'ERROR': RED
}
class ColoredFormatter(logging.Formatter):
def __init__(self, msg, use_color = True):
logging.Formatter.__init__(self, msg)
self.use_color = use_color
def format(self, record):
levelname = record.levelname
if self.use_color and levelname in COLORS:
levelname_color = COLOR_SEQ % (30 + COLORS[levelname]) + levelname + RESET_SEQ
record.levelname = levelname_color
return logging.Formatter.format(self, record)
# Custom logger class with multiple destinations
class ColoredLogger(logging.Logger):
FORMAT = "[$BOLD%(asctime)s$RESET][%(levelname)-18s] %(message)s ($BOLD%(filename)s$RESET:%(lineno)d)"
#FORMAT = "[$BOLD%(name)-20s$RESET][%(levelname)-18s] %(message)s ($BOLD%(filename)s$RESET:%(lineno)d)"
COLOR_FORMAT = formatter_message(FORMAT, True)
def __init__(self, name):
logging.Logger.__init__(self, name, logging.DEBUG)
color_formatter = ColoredFormatter(self.COLOR_FORMAT)
console = logging.StreamHandler()
console.setFormatter(color_formatter)
self.addHandler(console)
return
root_logger = logging.getLogger()
root_logger.setLevel(logging.NOTSET)
logging.setLoggerClass(ColoredLogger)
logger = logging.getLogger("mergeTrees")
# import the rest of the stuff
import argparse
import os
import subprocess
import sys
from random import choice
import tempfile
from array import array
import re
import json
try:
import ROOT
except ImportError:
logger.exception('Please set up ROOT (and PyROOT bindings) before continuing')
sys.exit(1)
did_regex = re.compile('\.(?:00)?(\d{6})\.')
def get_did(filename):
global did_regex
global logger
m = did_regex.search(filename)
if m is None:
logger.warning('Can\'t figure out the DID! Using input filename: {0}'.format(filename))
return filename.split("/")[-1]
return m.group(1)
def get_scaleFactor(did, weights):
global logger
weight = weights[did]
logger.debug("Weights for {0:s}".format(did))
logger.debug("\t {0:s}".format(str(weight)))
scaleFactor = 1.0
cutflow = weight.get('num events', 0)
if cutflow == 0:
logger.error("Num. events = 0 for {0:s}".format(did))
scaleFactor /= cutflow
logger.debug("___________________________________________________________________")
logger.debug(" {0:8s} Type of Scaling Applied | Scale Factor ".format(did))
logger.debug("========================================|==========================")
logger.debug("Cutflow: {0:20.10f} | {1:0.10f}".format(cutflow, scaleFactor))
scaleFactor *= weight.get('cross section')
logger.debug("Cross Section: {0:20.10f} | {1:0.10f}".format(weight.get('cross section'), scaleFactor))
scaleFactor *= weight.get('filter efficiency')
logger.debug("Filter Efficiency: {0:20.10f} | {1:0.10f}".format(weight.get('filter efficiency'), scaleFactor))
scaleFactor *= weight.get('k-factor')
logger.debug("k-factor: {0:20.10f} | {1:0.10f}".format(weight.get('k-factor'), scaleFactor))
logger.debug( "‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾")
return scaleFactor
def add_sampleWeight_branch(did, weights, tree, branchname='weight_lumi'):
scaleFactor = get_scaleFactor(did, weights)
branch_val = array( 'f', [ scaleFactor ] )
branch_val_up = array( 'f', [ scaleFactor*(1+weights[did].get('rel uncert', -1.0)) ] )
branch_val_down = array( 'f', [ scaleFactor*(1-weights[did].get('rel uncert', 1.0)) ] )
branches = []
branches.append(tree.Branch('{0:s}'.format(branchname), branch_val, '{0:s}/F'.format(branchname)))
branches.append(tree.Branch('{0:s}_up'.format(branchname), branch_val_up, '{0:s}_up/F'.format(branchname)))
branches.append(tree.Branch('{0:s}_down'.format(branchname), branch_val_down, '{0:s}_down/F'.format(branchname)))
for i in range(tree.GetEntries()):
tree.GetEntry(i)
for b in branches:
b.Fill()
#tree.Fill()
if __name__ == "__main__":
# if we want multiple custom formatters, use inheriting
class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter):
pass
__version__ = subprocess.check_output(["git", "describe", "--always"], cwd=os.path.dirname(os.path.realpath(__file__))).strip()
parser = argparse.ArgumentParser(description='Determine if a sample has MC Weight issues',
usage='\033[93m%(prog)s\033[0m files [options]',
formatter_class=lambda prog: CustomFormatter(prog, max_help_position=30))
parser.add_argument('files', type=str, nargs='+', help='HistFitter outputs from MBJ code')
parser.add_argument('--filters', metavar='filters.json', type=str, help='JSON dictionary where key=DID and val=selection to apply')
parser.add_argument('--weights', metavar='weights.json', type=str, help='JSON dictionary of weights Can be http:// url or local file!')
parser.add_argument('--did-to-group', metavar='did_to_group.json', type=str, help='JSON dictionary mapping DID to group. Can be http:// url or local file!')
parser.add_argument('--trees', metavar='tree', type=str, nargs='+', help='TTrees to merge and copy over in signal and bkgd files',
default=['nominal',
'JET_GroupedNP_1__1up',
'JET_GroupedNP_1__1down',
'JET_GroupedNP_2__1up',
'JET_GroupedNP_2__1down',
'JET_GroupedNP_3__1up',
'JET_GroupedNP_3__1down',
'JET_JER_SINGLE_NP__1up'])
"""
'JET_JMS_GroupedNP_1__1up',
'JET_JMS_GroupedNP_1__1down'])
'EG_RESOLUTION_ALL__1down',
'EG_RESOLUTION_ALL__1up',
'EG_SCALE_ALL__1down',
'EG_SCALE_ALL__1up',
'MET_SoftTrk_ResoPara',
'MET_SoftTrk_ResoPerp',
'MET_SoftTrk_ScaleDown',
'MET_SoftTrk_ScaleUp',
'MUONS_ID__1down',
'MUONS_ID__1up',
'MUONS_MS__1down',
'MUONS_MS__1up',
'MUONS_SCALE__1down',
'MUONS_SCALE__1up"*/
"""
parser.add_argument('--include-dids', metavar='did', type=str, nargs='+', help='DIDs to use', default=[])
parser.add_argument('--groups-bkg', metavar='group', type=str, nargs='+', help='Groups of bkgd to merge',
default=['ttbar',
'singletop',
'diboson',
'topEW',
'W_sherpa',
'Z_sherpa'])
parser.add_argument('--groups-sig', metavar='group', type=str, nargs='+', help='Groups of signal to collect but not merge', default=['Gtt', 'Gbb'])
parser.add_argument('--output-suffix', metavar='tag#####', type=str, default='', help='Add a suffix to output files, perhaps to specify a nominal only: --output-suffix _nominal')
parser.add_argument('--do-data', action='store_true', help='Process and merge data files. Otherwise, skip this process.')
parser.add_argument('--do-jms-fix', action='store_true', help='Add JMS systematic branches according to the prescription in this code.')
parser.add_argument('--do-backward-compatibility', action='store_true', help='Rename the branch names to get things backgwards-compatible. See the code for what this entails.')
parser.add_argument('-v', '--verbose', dest='verbose', action='count', default=0, help='Enable verbose output of various levels. Default: no verbosity')
parser.add_argument('--version', action='version', version='\033[93m%(prog)s\033[0m \033[94m{version}\033[0m'.format(version=__version__), default='\033[94m{version}\033[0m'.format(version=__version__))
args = parser.parse_args()
# set verbosity for python printing
if args.verbose < 2:
logger.setLevel(20 - args.verbose*10)
else:
logger.setLevel(logging.NOTSET + 1)
"""
Load the files in -- do not necessarily need a filter
"""
try:
filters = json.load(file(args.filters))
except IOError:
logger.warning("No filter file. Is this expected? You gave me '{0:s}'".format(args.filters))
filters = {}
if args.weights.startswith('http'):
import requests
weights = json.load(requests.get(args.weights).text)
else:
weights = json.load(file(args.weights))
if args.did_to_group.startswith('http'):
import requests
did_to_group = json.load(requests.get(args.did_to_group).text)
else:
did_to_group = json.load(file(args.did_to_group))
"""
Next, ensure we have information on all DIDs used in the files.
"""
checkedDIDs = []
for f in args.files:
# skip data
if '.data.' in f: continue
did = get_did(f)
if did in checkedDIDs: continue
if did not in args.include_dids: continue
checkedDIDs.append(did)
logger.info("Checking {0:s}".format(did))
if did in weights:
logger.info("\t has weight")
else:
logger.error("\t does not have weight")
sys.exit(1)
if did in did_to_group:
logger.info("\t has group {0:s}".format(did_to_group[did]))
else:
logger.error("\t does not have a group assigned")
sys.exit(1)
logger.info("\t has filter: {0:s}".format(str(did in filters)))
# signal is fuzzy-matched, bkg is exact-matched
all_groups = [group for group in (args.groups_bkg + args.groups_sig) if group]
# check the groups and see if we have splitting information
for group in all_groups:
logger.info("Checking {0:s}".format(group))
logger.info("\t has defined splits: {0:s}".format(str(any(group in name for name in filters.keys()))))
for name in filters.keys():
if group not in name: continue
selection = filters[name]
logger.info("\t\t {0:s} \t {1:s}".format(name, selection))
"""
In this section,
- we read in all the data files (must have 'data' in the name)
- create a TChain of the 'nominal' branch
- create an output file for data
- clone the TChain as a TTree which gets saved to the output
- close the output file
TODO: Add protection if output file already exists.
"""
data_files = [f for f in args.files if '.data.' in f]
if args.do_data and len(data_files) > 0:
logger.info("Doing the data files")
tc = ROOT.TChain('nominal')
for f in data_files:
logger.debug('\t{0:s}'.format(f))
tc.Add(f)
logger.info('Creating output file {0:s}{1:s}.root'.format('Data', args.output_suffix))
output_file = ROOT.TFile('Data{0:s}.root'.format(args.output_suffix), 'UPDATE')
tree = tc.CloneTree(-1, "fast")
tree.SetName("Data")
tree.SetTitle("Data")
tree.SetAlias("MJSum_rc_r08pt10_nominal","MJSum_rc_r08pt10")
tree.SetDirectory(output_file)
output_file.cd()
tree.Write()
#output_file.Write()
output_file.Close()
logger.info('Finished creating output file')
"""
In this section,
- we read in all the bkg files
- sort everything / filter by group
- make temporary files for things that need hadding or filtering
- for each group
- create the output file
- for each branch
- create a TChain
- apply selection if desired (TChain::CopyTree)
- clone the TChain as a TTree which gets saved to the output
- apply weights by making a new branch
- clean up temporary files
TODO: Add protection if output file already exists
"""
files_by_did = {}
# first group by did to figure out if we need to merge files together or not
for f in args.files:
if f in data_files: continue
did = get_did(f)
if did not in args.include_dids: continue
if did in files_by_did:
files_by_did[did].append(f)
else:
files_by_did[did] = [f]
# here, we need to do some temporary TChains and copy the trees we want
tmpDir = tempfile.mkdtemp(dir=os.getcwd())
logger.info("Temporary directory created: {0:s}".format(tmpDir))
# make a dictionary mapping the group to the files inside the group
files_by_group = {}
for did,files in files_by_did.iteritems():
group = did_to_group[did]
# skip any dids that are not in groups we care about for bkg or signal
if group not in args.groups_bkg and not any(g in group for g in args.groups_sig if g): continue
logger.debug("Working on {0:s} which has {1:d} files".format(did, len(files)))
logger.debug("\t belongs to group {0:s}".format(group))
"""
always make a temporary file since we need to do at least one of the following
which is sorted from most likely to least likely
- add a new branch for sample weight (weight_lumi)
- merge multiple files of the same DID together
- apply a selection/filter to the ttrees
"""
logger.debug("Making a tmp file for {0:s} with {1:d} files".format(did, len(files)))
# wrap this portion in a try statement because we need to make sure we clean up after ourselves
try:
tmpFile = os.path.join(tmpDir, '{0:s}.root'.format(did))
logger.debug("Creating temporary file: {0:s}".format(tmpFile))
for treename in args.trees:
# open the file to update it
output_file = ROOT.TFile.Open(tmpFile, 'UPDATE')
# start making the tchain
logger.debug("Building a TChain for {0:s}".format(treename))
tc = ROOT.TChain(treename)
for f in files:
tc.Add(f)
# apply a selection/filter if defined
if did in filters: logger.debug("Applying filter: {0:s}".format(filters[did]))
""" THIS NEEDS TO BE FIXED ASAP
For some reason, we get
(Pdb) tc.CopyTree(filters.get(did, ''))
<ROOT.TTree object ("nominal") at 0x4948060>
(Pdb) copy_tree
<ROOT.TTree object at 0x(nil)>
(Pdb)
which results in errors like
[.[1m2016-07-17 14:06:04,288.[0m][.[1;34mDEBUG.[0m ] Cloning nominal to temporary file /tmp/tmpgGr47d363361 (.[1mmergeTrees.py.[0m:342)
.[?1034hTraceback (most recent call last):
File "mergeTrees.py", line 343, in <module>
copy_tree.CloneTree(-1, "fast")
ReferenceError: attempt to access a null-pointer
which makes it seem like ROOT is being slow at copying trees so we need to do it twice so it actually copies correctly...
A solution:
run CloneTree first to get the TTree into the new file, then run CopyTree on that cloned tree and use it instead if it needs to be filtered. This means we do not apply CopyTree to every tree, making the script faster since we only run CopyTree when needed and then will need to do some name mangling to get the CloneTree to have a temp name and the CopyTree to have the correct name.
"""
# for now, a temporary patch to fix the fucking problem
#logger.debug(tc.CopyTree(filters.get(did, '')))
copy_tree = False
while not copy_tree:
copy_tree = tc.CopyTree(filters.get(did, ''))
# clone the tree now into the current output file
logger.debug("Cloning {0:s} to temporary file {1:s}".format(treename, tmpFile))
clone_tree = copy_tree.CloneTree(-1, "fast")
logger.debug("Adding the sample weight for {0:s} to the tree".format(did))
add_sampleWeight_branch(did, weights, clone_tree)
if args.do_jms_fix and treename == 'nominal':
logger.debug("\tDoing the JMS fix")
for systname in ['JET_JMS_GroupedNP_1__1up','JET_JMS_GroupedNP_1__1down']:
logger.debug("\t\t{0:s} is being cloned".format(systname))
syst_tree = clone_tree.CloneTree(-1, "fast")
syst_tree.SetName(systname)
syst_tree.SetTitle(systname)
if systname == 'JET_JMS_GroupedNP_1__1up':
syst_tree.SetAlias("MJSum_rc_r08pt10_nominal","MJSum_rc_r08pt10_JMSUP")
elif systname == 'JET_JMS_GroupedNP_1__1down':
syst_tree.SetAlias("MJSum_rc_r08pt10_nominal", "MJSum_rc_r08pt10_JMSDOWN")
syst_tree.SetDirectory(output_file)
output_file.cd()
syst_tree.Write()
logger.debug("Setting an alias for the MJSum branch")
clone_tree.SetAlias("MJSum_rc_r08pt10_nominal","MJSum_rc_r08pt10")
clone_tree.SetDirectory(output_file)
output_file.cd()
clone_tree.Write()
#output_file.Write()
del tc
# flush contents and update streamerinfo
output_file.Close()
try:
files_by_group[group].append(tmpFile)
except KeyError:
files_by_group[group] = [tmpFile]
except:
# clean up after ourselves
import shutil
shutil.rmtree(tmpDir, ignore_errors=True)
raise
# JMS fix requires adding the two new TTrees in our loop and copying them from nominal
if args.do_jms_fix:
args.trees.extend(['JET_JMS_GroupedNP_1__1up','JET_JMS_GroupedNP_1__1down'])
signal_filename = 'Sig{0:s}.root'.format(args.output_suffix)
logger.info('Signal output file {0:s}'.format(signal_filename))
bkg_filename = 'Bkg{0:s}.root'.format(args.output_suffix)
logger.info('Background output file {0:s}'.format(bkg_filename))
# create the output files for signal and background
# this contains the output file that the given group is being written to
output_file = None
for group, files in files_by_group.iteritems():
if group in args.groups_bkg:
logger.debug('Group {0:s} is identified as BACKGROUND'.format(group))
output_filename = bkg_filename
else:
logger.debug('Group {0:s} is identified as SIGNAL'.format(group))
output_filename = signal_filename
logger.info('Group {0:s} will be written to {1:s}'.format(group, output_filename))
# loop over the trees we need to make
for treename in args.trees:
logger.debug('Building TChain for {0:s}'.format(treename))
tc = ROOT.TChain(treename)
for f in files:
logger.debug('\t{0:s}'.format(f))
tc.Add(f)
# open the file
output_file = ROOT.TFile(output_filename, 'UPDATE')
logger.debug('Cloning TChain now')
tree = tc.CloneTree(-1, "fast")
if args.do_backward_compatibility:
group = group.replace('singletop', 'SingleTop')
group = group.replace('topEW', 'TopEW')
group = group.replace('_sherpa', 'jets')
group = group.replace('_5000', '')
treename = treename.replace('nominal', 'NoSys')
# names need to be format <sample>_<systematic>
tree.SetName("_".join([group, treename]))
tree.SetTitle("_".join([group, treename]))
tree.SetDirectory(output_file)
output_file.cd()
tree.Write()
# check if we need to do some splitting
for name,selection in filters.get(group, []):
copy_tree = tree.CopyTree(selection)
# we're technically creating a new sample called <sample>_<name of selection>
copy_tree.SetName("_".join([group, name, treename]))
copy_tree.SetTitle("_".join([group, name, treename]))
copy_tree.Write()
output_file.Close()
import shutil
shutil.rmtree(tmpDir, ignore_errors=True)
logger.info("Don't forget to remove the temp directory {0:s}".format(tmpDir))
logger.info("Completely finished! Enjoy.")