Configuration#
A PocketCoffea analysis can be customized by writing a configuration file containing all the information needed to set up an analysis run.
The PocketCoffea configuration includes:
Input dataset specification
Analysis parameters (see Parameters page)
Custom processor specification
Skimming, preselection and categories
Weights configuration
Systematic variations configuration
Histograms output configuration
Running mode configuration: local, multiprocessing, cluster
Note
The configuration is wrapped by a Configurator
object, usually saved in a python script containing a cfg
variable.
A full simplified example is available here. In this page we describe in detail all the components of a more complete example for the ttHbb semileptonic channel.
from pocket_coffea.utils.configurator import Configurator
from pocket_coffea.lib.cut_definition import Cut

from pocket_coffea.lib.cut_functions import get_nPVgood, goldenJson, eventFlags
from pocket_coffea.lib.cut_functions import get_nObj_min, get_HLTsel, get_nBtagEq, get_nBtagMin
from pocket_coffea.parameters.cuts import passthrough
from pocket_coffea.parameters.histograms import *
import os

from pocket_coffea.workflows.tthbb_base_processor import ttHbbBaseProcessor
from pocket_coffea.lib.weights.common import common_weights

# importing custom cut functions
from custom_cut_functions import *

localdir = os.path.dirname(os.path.abspath(__file__))

# Loading default parameters
from pocket_coffea.parameters import defaults
default_parameters = defaults.get_default_parameters()
defaults.register_configuration_dir("config_dir", localdir+"/params")

# merging additional analysis specific parameters
parameters = defaults.merge_parameters_from_files(default_parameters,
                                                  f"{localdir}/params/object_preselection.yaml",
                                                  f"{localdir}/params/btagsf_calibration.yaml",
                                                  f"{localdir}/params/triggers.yaml",
                                                  update=True)

# Configurator instance
cfg = Configurator(
    parameters = parameters,
    datasets = {
        "jsons": [f"{localdir}/datasets/backgrounds_MC_ttbar_2018.json",
                  f"{localdir}/datasets/backgrounds_MC_ttbar_2017.json",
                  f"{localdir}/datasets/DATA_SingleEle.json",
                  ],
        "filter" : {
            "samples": ["TTToSemiLeptonic", "DATA_SingleEle"],
            "samples_exclude" : [],
            "year": ['2018','2017']
        },
        "subsamples": {
            "TTToSemiLeptonic": {
                "=1b": [get_nBtagEq(1, coll="Jet")],
                "=2b": [get_nBtagEq(2, coll="Jet")],
                ">2b": [get_nBtagMin(3, coll="Jet")]
            }
        }
    },

    workflow = ttHbbBaseProcessor,
    workflow_options = {},

    # Skimming and categorization
    skim = [
        get_nPVgood(1), eventFlags, goldenJson,
        get_nObj_min(4, 15., "Jet"),
        get_HLTsel()
    ],

    preselections = [semileptonic_presel_nobtag],

    categories = {
        "baseline": [passthrough],
        "1b": [get_nBtagEq(1, coll="BJetGood")],
        "2b": [get_nBtagEq(2, coll="BJetGood")],
        "3b": [get_nBtagEq(3, coll="BJetGood")],
        "4b": [get_nBtagEq(4, coll="BJetGood")]
    },

    # Weights configuration
    weights_classes = common_weights,
    weights = {
        "common": {
            "inclusive": ["genWeight","lumi","XS",
                          "pileup",
                          "sf_ele_reco", "sf_ele_id",
                          "sf_mu_id","sf_mu_iso",
                          "sf_btag", "sf_jet_puId",
                          ],
            "bycategory" : {
                "2jets_20pt" : [.....]
            }
        },
        "bysample": {
            "TTToSemiLeptonic": {
                "inclusive": [...],
                "bycategory": {
                    "2jets_20pt": [....]
                }
            }
        }
    },

    variations = {
        "weights": {
            "common": {
                "inclusive": ["pileup",
                              "sf_ele_reco", "sf_ele_id",
                              "sf_mu_id", "sf_mu_iso",
                              "sf_jet_puId", "sf_btag"
                              ],
                "bycategory" : {
                }
            },
            "bysample": {
                "TTToSemiLeptonic": {
                    "inclusive": [],
                    "bycategory": {}
                }
            }
        },
        "shape": {
            ....
        }
    },

    variables = {
        "HT" : HistConf([Axis(coll="events", field="events", bins=100, start=0, stop=200, label="HT")]),
        "leading_jet_pt_eta" : HistConf(
            [
                Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=0, label="Leading jet $p_T$"),
                Axis(coll="JetGood", field="eta", bins=40, start=-5, stop=5, pos=0, label="Leading jet $\eta$")
            ]),

        # Plotting all jets together
        "all_jets_pt_eta" : HistConf(
            [
                Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=None, label="All jets $p_T$"),
                Axis(coll="JetGood", field="eta", bins=40, start=-5, stop=5, pos=None, label="All jets $\eta$")
            ]),

        "subleading_jetpt_MET" : HistConf(
            [
                Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=1, label="Subleading jet $p_T$"),
                Axis(coll="MET", field="pt", bins=40, start=0, stop=100, label="MET")
            ]),

        **ele_hists(coll="ElectronGood", pos=0),
        **muon_hists(coll="MuonGood", pos=0),
        **count_hist(name="nElectronGood", coll="ElectronGood", bins=3, start=0, stop=3),
        **count_hist(name="nMuonGood", coll="MuonGood", bins=3, start=0, stop=3),
        **count_hist(name="nJets", coll="JetGood", bins=10, start=4, stop=14),
        **count_hist(name="nBJets", coll="BJetGood", bins=12, start=2, stop=14),
        **jet_hists(coll="JetGood", pos=0),
        **jet_hists(coll="JetGood", pos=1),
        **jet_hists(coll="JetGood", pos=2),
    },

    columns = {
        "common": {
            "inclusive": [],
            "bycategory": {}
        },
        "bysample": {
            "TTToSemiLeptonic" : {"inclusive": [ColOut("LeptonGood", ["pt","eta","phi"])]},
            "TTToSemiLeptonic__=1b" : {"inclusive": [ColOut("JetGood", ["pt","eta","phi"])]},
            "TTToSemiLeptonic__=2b" : {"inclusive": [ColOut("BJetGood", ["pt","eta","phi"])]},
        }
    }
)
Datasets#
The dataset configuration has the following structure:
cfg = Configurator(
    datasets = {
        "jsons": [f"{localdir}/datasets/backgrounds_MC_ttbar_2018.json",
                  f"{localdir}/datasets/backgrounds_MC_ttbar_2017.json",
                  f"{localdir}/datasets/DATA_SingleEle.json",
                  ],
        "filter" : {
            "samples": ["TTToSemiLeptonic", "DATA_SingleEle"],
            "samples_exclude" : [],
            "year": ['2018','2017']
        },
        "subsamples": {
            "TTToSemiLeptonic": {
                "=1b": [get_nBtagEq(1, coll="Jet")],
                "=2b": [get_nBtagEq(2, coll="Jet")],
                ">2b": [get_nBtagMin(3, coll="Jet")]
            }
        }
    },
    ....
)
The jsons key contains the list of dataset definition files to consider as inputs.
The filter dictionary gives the user the possibility to filter on the fly the desired samples to include or exclude from the full list taken from the json files. Samples can be filtered by name or by year.
The subsamples key makes it possible to define cuts that split the events into multiple sub-samples: see the datasets page for a more in-depth definition. A list of Cut objects is used to define each subsample; the AND of them is used to mask the events.
In the example, by using the subsamples option the TTToSemiLeptonic sample is effectively split by the framework into 3 pieces called TTToSemiLeptonic__=1b, TTToSemiLeptonic__=2b and TTToSemiLeptonic__>2b.
.Warning
Subsamples do not need to be exclusive. Subsample masks are applied before exporting histograms, columns and counting events.
Workflow#
from pocket_coffea.workflows.tthbb_base_processor import ttHbbBaseProcessor

cfg = Configurator(
    workflow = ttHbbBaseProcessor,
    workflow_options = {},
    ....
)
The workflow key directly specifies the processor class to use.
workflow_options: a dictionary with additional options for specific (user defined) processors.
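A user-defined processor can read these options at runtime. A minimal sketch (the option key and default value are purely illustrative), assuming the base class stores the dictionary as self.workflow_options as done in the example workflows:

import awkward as ak
from pocket_coffea.workflows.base import BaseProcessorABC

class MyProcessor(BaseProcessorABC):
    def __init__(self, cfg):
        super().__init__(cfg)
        # read a custom option passed through workflow_options
        # ("parton_jet_min_dR" is just an illustrative key)
        self.min_dr = self.workflow_options.get("parton_jet_min_dR", 0.3)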
Cuts and categories#
The events skimming, preselection and categorization are defined in a structured way in PocketCoffea: see Concepts#Filtering for a detailed explanation of the differences between the steps.
from pocket_coffea.lib.categorization import StandardSelection

cfg = Configurator(
    skim = [
        get_nPVgood(1), eventFlags, goldenJson,
        get_nObj_min(4, 15., "Jet"),
        get_HLTsel()
    ],

    preselections = [semileptonic_presel_nobtag],

    categories = StandardSelection({
        "baseline": [passthrough],
        "1b": [get_nBtagEq(1, coll="BJetGood")],
        "2b": [get_nBtagEq(2, coll="BJetGood")],
        "3b": [get_nBtagEq(3, coll="BJetGood")],
        "4b": [get_nBtagEq(4, coll="BJetGood")]
    }),
    ....
)
A Cut is a simple object grouping a name, a cut function and a dictionary of parameters.
The same Cut object can be reused at different points of the configuration.
Cut objects are defined in pocket_coffea.lib.cut_definition.
Have a look at the documentation about the Cut object and its
API.
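As a minimal sketch (the function and parameter names are purely illustrative), a Cut combines a parameters dictionary with a function that receives the events array and the parameters (plus extra metadata keyword arguments such as year and sample) and returns a per-event boolean mask:

from pocket_coffea.lib.cut_definition import Cut

def min_met(events, params, **kwargs):
    # boolean mask, one entry per event
    return events.MET.pt > params["met_pt"]

met_cut = Cut(
    name="met_50",
    params={"met_pt": 50.},
    function=min_met,
)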
Tip
Custom functions and modules, defined locally by the user and not part of the central PocketCoffea core, must be registered in a special way to be available to the Dask workers. Have a look at the Register user defined custom modules section.
PocketCoffea implements a set of factory methods for common cut functions: they are defined in cut_functions.
In the configuration the categorization is split into:
Skim: a list of Cut objects; events passing the AND of the list of cuts pass the skim.
Preselections: a list of Cut objects; the AND of them is applied.
Categories: the splitting of events for the histograms and columns output.
Save skimmed NanoAOD#
PocketCoffea can dump events passing the skim selection to NanoAOD root files. This can be useful when your skimming efficiency is high and you can trade the usage of some disk storage for higher processing speed.
The export of skimmed NanoAOD is activated by the save_skimmed_files argument of the Configurator object. If save_skimmed_files != None, the processing stops after the skimming and one root file for each chunk is saved in the folder specified by the argument.
It is recommended to use an xrootd endpoint: save_skimmed_files='root://eosuser.cern.ch://eos/user/...'.
cfg = Configurator(

    workflow = ttHbbBaseProcessor,
    workflow_options = {},

    save_skimmed_files = "root://eosuser.cern.ch://eos/user/x/xxx/skimmed_samples/Run2UL/",
    skim = [get_nPVgood(1),
            eventFlags,
            goldenJson,
            get_nBtagMin(3, minpt=15., coll="Jet", wp="M"),
            get_HLTsel(primaryDatasets=["SingleEle", "SingleMuon"])],
)
The PocketCoffea output file contains the list of skimmed files with the number of skimmed events in each file. Moreover,
the root files contain a new branch called skimRescaleGenWeight which stores, for each event, the scaling factor
needed to recover the original sum of genWeights and so correct for the skimming efficiency. The factor
is computed as (original sum of genWeights / sum of genWeights of the skimmed files) for each file. This factor must
multiply the sum of genWeights accumulated in each chunk by the processor that runs on top of skimmed
datasets. Therefore the dataset definition file for skimmed datasets must contain the isSkim:True metadata,
which is used by the processor to apply the rescaling.
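As an illustration, a short sketch of how the rescaling restores the original normalization when running over one skimmed file:

import awkward as ak

def original_genweight_sum(events):
    # skimRescaleGenWeight is constant within a file:
    # (original sum of genWeight) / (sum of genWeight after the skim),
    # so this sum recovers the genWeight sum of the unskimmed file.
    return ak.sum(events.genWeight * events.skimRescaleGenWeight)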
A full tutorial of the necessary steps to produce a skim and then use the PocketCoffea tools to prepare a new dataset configuration file can be found in the How To section.
Categorization utilities#
PocketCoffea defines different ways to categorize events. The code is available at pocket_coffea.lib.categorization.
StandardSelection: handles the definition of categories from a dictionary of Cut objects. Each key defines a category with a list of Cut objects which are applied with an AND.

categories = StandardSelection({
    "baseline": [passthrough],
    "1b": [get_nBtagEq(1, coll="BJetGood")],
    "2b": [get_nBtagEq(2, coll="BJetGood")],
    "3b": [get_nBtagEq(3, coll="BJetGood")],
    "4b": [get_nBtagEq(4, coll="BJetGood")]
}),
CartesianSelection: handles the definition of the cartesian product of categories. The class keeps a list of MultiCut objects, each defining a set of subcategories (or bins). The CartesianSelection utility automatically defines categories which are the cartesian product of the bins defined by each MultiCut. A StandardSelection object can be embedded in the CartesianSelection to define "common" categories not used in the cartesian product. This utility can be very useful to build a differential analysis.

For example, this is the configuration to build categories as \(((N_{jets}\ [4,5,\geq 6]) \times (N_{bjets}\ [3,4,5,\geq 6])) + \text{inclusive} + \text{4jets\_40pt}\):

categories = CartesianSelection(
    multicuts = [
        MultiCut(name="Njets",
                 cuts=[
                     get_nObj_eq(4, 15., "JetGood"),
                     get_nObj_eq(5, 15., "JetGood"),
                     get_nObj_min(6, 15., "JetGood"),
                 ],
                 cuts_names=["4j","5j","6j"]),
        MultiCut(name="Nbjet",
                 cuts=[
                     get_nObj_eq(3, 15., "BJetGood"),
                     get_nObj_eq(4, 15., "BJetGood"),
                     get_nObj_eq(5, 15., "BJetGood"),
                     get_nObj_min(6, coll="BJetGood"),
                 ],
                 cuts_names=["3b","4b","5b","6b"])
    ],
    common_cats = StandardSelection({
        "inclusive": [passthrough],
        "4jets_40pt": [get_nObj_min(4, 40., "JetGood")]
    })
),
Warning
The standard PackedSelection utility from coffea can handle a maximum of 64 categories. The CartesianSelection tool overcomes this limitation internally.
Weights#
Weights are handled in PocketCoffea through the WeightsManager
object (see API).
The configuration file specifies which weight is applied to which sample in which category.
from pocket_coffea.lib.weights.common import common_weights

cfg = Configurator(
    weights_classes = common_weights,
    weights = {
        "common": {
            "inclusive": ["genWeight","lumi","XS",
                          "pileup",
                          "sf_ele_reco", "sf_ele_id",
                          "sf_mu_id","sf_mu_iso",
                          "sf_btag", "sf_jet_puId",
                          ],
            "bycategory" : {
                "2jets_20pt" : [.....]
            }
        },
        "bysample": {
            "TTToSemiLeptonic": {
                "inclusive": [...],
                "bycategory": {
                    "2jets_20pt": [....]
                }
            }
        }
    },
    ....
)
To reduce boilerplate, the weights are specified following a decision-tree style and applied in a hierarchical fashion.
Weights can be assigned to all samples (common key), inclusively or by category.
Weights can also be assigned to specific samples, again inclusively or in specific categories.
The available weights for a configuration are defined by the weights_classes passed to the Configurator. The framework implements a set of common weight definitions: if the weights_classes argument is not passed, the common ones are used by default. The user can add new weight classes in the configuration and use the corresponding string in the weights configuration dictionary. The Weight classes implement a mechanism to check that the definition of each string is unique and that users cannot inadvertently overwrite already defined weights.
A list of the available weight definitions:
genWeight: MC generator weight
lumi: integrated luminosity
XS: sample cross-section
pileup: pileup scale factor
sf_ele_reco, sf_ele_id: electron reconstruction and ID scale factors. The working point is defined by the lepton_scale_factors key in the parameters (see Parameters docs)
sf_mu_id, sf_mu_iso: muon ID and isolation scale factors
sf_btag: btagPOG shape scale factors
sf_jet_puId: jet puID scale factor
If a weight is requested in the configuration but it doesn't exist, the framework emits an error before running.
On-the-fly custom weights#
Weights can be created by the user directly in the configuration. It is enough to define a new class deriving from the WeightWrapper class defined by the framework.
This wrapper is used to instantiate the weight object for each chunk of data with the corresponding parameters and metadata; in this way the user can customize the weight for different samples and conditions. Moreover, the WeightWrapper instance defines the weight name, the string to be used in the config, and the available variations.
import numpy as np
# Assumption: WeightData and WeightDataMultiVariation are importable from the
# same module as WeightWrapper; adjust the import to your PocketCoffea version.
from pocket_coffea.lib.weights_manager import (
    WeightWrapper, WeightData, WeightDataMultiVariation
)

# example of WeightWrapper definition
class CustomTopSF(WeightWrapper):

    name = "top_sf"
    has_variations = True

    def __init__(self, params, metadata):
        super().__init__(params, metadata)
        self.sf_file = params["top_sf"]["json_file"]
        # custom variations from config
        self._variations = params["top_sf"]["variations"]

    def compute(self, events, size, shape_variation):
        # custom computation (sf_function is a placeholder for the user's own
        # scale factor evaluation)
        if shape_variation == "nominal":
            sf_data = sf_function.compute(self.sf_file, events)
            return WeightDataMultiVariation(
                name = self.name,
                nominal = sf_data["nominal"],
                up = [sf_data[var] for var in self._variations["up"]],
                down = [sf_data[var] for var in self._variations["down"]]
            )
        else:
            return WeightData(
                name = self.name,
                nominal = np.ones(size),
            )
The class must then be passed to the Configurator in order to be available:
from pocket_coffea.lib.weights.common import common_weights

cfg = Configurator(
    weights_classes = common_weights + [CustomTopSF],  # note the addition here
    weights = {
        "common": {
            "inclusive": ["genWeight","lumi","XS",
                          "pileup",
                          "sf_ele_reco", "sf_ele_id",
                          "sf_mu_id","sf_mu_iso",
                          "sf_btag", "sf_jet_puId",
                          ],
            "bycategory" : {
                "2jets_20pt" : ["top_sf"  # custom weight
                               ]
            }
        },
        ...
    },
    ....
)
Often weights are even simpler to define: a plain computation can be wrapped in a lambda, without defining a full WeightWrapper class.
from pocket_coffea.lib.weights.weights import WeightLambda

my_custom_sf = WeightLambda.wrap_func(
    name="sf_custom",
    function=lambda params, metadata, events, size, shape_variations:
        call_to_my_fancy_function(events, params, metadata, shape_variations),
    has_variations=True
)
The return type of the lambda must be a WeightData or WeightDataMultiVariation object.
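For example, a weight without variations can simply return a WeightData with the nominal array only; a sketch following the keyword-style construction of the CustomTopSF example above (assuming WeightData is importable from the same module as WeightLambda):

import numpy as np
from pocket_coffea.lib.weights.weights import WeightLambda, WeightData

my_flat_sf = WeightLambda.wrap_func(
    name="sf_flat",
    # a constant 5% scale factor, purely illustrative
    function=lambda params, metadata, events, size, shape_variations:
        WeightData(name="sf_flat", nominal=np.full(size, 1.05)),
    has_variations=False,
)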
Tip
The user can create a library of custom weights and include them in the configuration.
Register user defined custom modules#
Users can define modules, libraries and functions locally in their configuration folder and import them in the
PocketCoffea configuration and workflows. In order to make them available to the Dask workers, without including them in
the PocketCoffea core library, it is sufficient to register the modules with cloudpickle.
Add this code in the configuration file.
import cloudpickle
cloudpickle.register_pickle_by_value(workflow)  # just an example of user defined modules for processors and cuts
cloudpickle.register_pickle_by_value(custom_cut_functions)
cloudpickle.register_pickle_by_value(custom_cuts)
Variations#
Systematic variations are also configured in the Configurator; weights and shape variations are supported.
The configuration is applied in a hierarchical fashion, as for the Weights, to compact the matrix of
samples and categories.
Weights variations: if the weights defined in the WeightsManager have up and down variations, they can be activated by just putting the weight name in the variations configuration. Up and down shapes will be exported for the histograms.

cfg = Configurator(
    ....
    variations = {
        "weights": {
            "common": {
                "inclusive": ["pileup",
                              "sf_ele_reco", "sf_ele_id",
                              "sf_mu_id", "sf_mu_iso",
                              "sf_jet_puId", "sf_btag"
                              ],
                "bycategory" : {
                }
            },
            "bysample": {
                "TTToSemiLeptonic": {
                    "inclusive": [],
                    "bycategory": {}
                }
            }
        },
        "shape": {
            ....
        }
    },
    ...
)
Shape variations: shape variations are related to lepton, jet and MET scale variations and similar systematics. The handling of these variations is more complex, since everything after the skimming (see docs) is rerun for each shape variation.
Have a look at the base processor get_shape_variations() function to learn about their implementation.

cfg = Configurator(
    ....
    variations = {
        "weights": .....,
        # Shape variations
        "shape": {
            "common": {
                "inclusive": ["JESTotal", "JER"]
            }
        }
    },
    ...
)
Warning
Only JES and JER variations have been implemented for the moment and are available to be used. The available JES variations depend on the jet calibration configuration defined in the parameters (docs).
Histograms configuration#
The PocketCoffea configuration allows the user to define histograms without modifying the processor code. The histogram configuration closely follows the interface of the scikit-hep/hist library, used by Coffea to handle histograms.
Histograms are identified by unique labels and built using a HistConf object. Each HistConf object has a list of Axis objects, which follow the interface of the hist library axes.
Important
The number of Axis objects contained in a HistConf is not limited! The user can work with 1,2,3,4…D histograms without
changing the interface. However, be aware of the memory issues that may affect large histograms with too many bins.
cfg = Configurator(
    variables = {
        "HT" : HistConf([Axis(coll="events", field="events", bins=100, start=0, stop=200, label="HT")]),
        "leading_jet_pt_eta" : HistConf(
            [
                Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=0, label="Leading jet $p_T$"),
                Axis(coll="JetGood", field="eta", bins=40, start=-5, stop=5, pos=0, label="Leading jet $\eta$")
            ]),

        # Plotting all jets together
        "all_jets_pt_eta" : HistConf(
            [
                Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=None, label="All jets $p_T$"),
                Axis(coll="JetGood", field="eta", bins=40, start=-5, stop=5, pos=None, label="All jets $\eta$")
            ]),

        "subleading_jetpt_MET" : HistConf(
            [
                Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=1, label="Subleading jet $p_T$"),
                Axis(coll="MET", field="pt", bins=40, start=0, stop=100, label="MET")
            ]),

        **ele_hists(coll="ElectronGood", pos=0),
        **muon_hists(coll="MuonGood", pos=0),
        **count_hist(name="nElectronGood", coll="ElectronGood", bins=3, start=0, stop=3),
        **count_hist(name="nMuonGood", coll="MuonGood", bins=3, start=0, stop=3),
        **count_hist(name="nJets", coll="JetGood", bins=10, start=4, stop=14),
        **count_hist(name="nBJets", coll="BJetGood", bins=12, start=2, stop=14),
        **jet_hists(coll="JetGood", pos=0),
        **jet_hists(coll="JetGood", pos=1),
        **jet_hists(coll="JetGood", pos=2),
    },
    ...
)
The HistConf
class has many options, particularly useful to exclude some categories or samples from a specific
histogram.
@dataclass
class HistConf:
    axes: List[Axis]
    storage: str = "weight"
    autofill: bool = True  # Handle the filling automatically
    variations: bool = True
    only_variations: List[str] = None
    exclude_samples: List[str] = None
    only_samples: List[str] = None
    exclude_categories: List[str] = None
    only_categories: List[str] = None
    no_weights: bool = False  # Do not fill the weights
    metadata_hist: bool = False  # Non-event variables, for processing metadata
    hist_obj = None
    collapse_2D_masks = False  # if 2D masks are applied on the events
    # and the data_ndim=1, when collapse_2D_mask=True the OR
    # of the masks on the axis=2 is performed to get the mask
    # on axis=1, otherwise an exception is raised
    collapse_2D_masks_mode = "OR"  # Use OR or AND to collapse 2D masks for data_ndim=1 if collapse_2D_masks == True
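For instance, a histogram can be filled only in some categories or skipped for some samples; a minimal sketch using the options above (HistConf and Axis are assumed to be available through the pocket_coffea.parameters.histograms import used in the configuration):

met_baseline = HistConf(
    [Axis(coll="MET", field="pt", bins=50, start=0, stop=200, label="MET $p_T$")],
    only_categories=["baseline"],        # fill only in the baseline category
    exclude_samples=["DATA_SingleEle"],  # do not fill for this sample
)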
The Axis object has many options: in particular, the array to be plotted is taken from the events mother array
using the coll and field attributes. If an array is global in the NanoAOD, the coll is events.
@dataclass
class Axis:
    field: str  # variable to plot
    label: str  # human readable label for the axis
    bins: int = None
    start: float = None
    stop: float = None
    coll: str = "events"  # Collection or events or metadata or custom
    name: str = None      # Identifier of the axis: by default built as coll.field, if not provided
    pos: int = None       # index in the collection to plot. If None plot all the objects on the same histogram
    type: str = "regular" # regular/variable/integer/intcat/strcat
    transform: str = None
    lim: Tuple[float] = (0, 0)
    underflow: bool = True
    overflow: bool = True
    growth: bool = False
Tip
A set of factory methods to build commonly used histogram configurations is available in
pocket_coffea.parameters.histograms.
They produce dictionaries of HistConf objects that need to be unpacked in the configuration file with the syntax: **jet_hists(coll="JetGood", pos=2)
Multidimensional arrays#
The pos attribute deserves a special mention. The user can specify which object in a collection to use for the
field to plot: if the collection contains more than one object, e.g. Jet, and pos=1, only the attributes of the 2nd object will be plotted. If the second object is missing, the
attributes are None-padded automatically.
Tip
If the collection contains multiple objects (e.g. the Jet collection) and the pos attribute is None, the array
is flattened before filling the histograms. This means that you can plot the \(p_T\) of all the jets in a single plot just
by using Axis(coll="Jet", field="pt", pos=None)
Columns output#
In PocketCoffea it is also possible to export arrays from the NanoAOD events: the configuration is handled with a
ColOut object.
The configuration follows the same structure as the Weights configuration:
a list of ColOut objects is assigned either inclusively to all the samples, or specifically to a sample and
category.
cfg = Configurator(
    # columns output configuration
    columns = {
        "common": {
            "inclusive": [],
            "bycategory": {}
        },
        "bysample": {
            "TTToSemiLeptonic" : {"inclusive": [ColOut("LeptonGood", ["pt","eta","phi"])]},
            "TTToSemiLeptonic__=1b" : {"inclusive": [ColOut("JetGood", ["pt","eta","phi"])]},
            "TTToSemiLeptonic__=2b" : {"inclusive": [ColOut("BJetGood", ["pt","eta","phi"])]},
        }
    }
)
The ColOut object defines which collection and fields get exported in the output file. Moreover, by default the number
of objects in the collection is saved only once alongside the fields. This is needed because the output accumulator contains
flattened arrays; the output can then be unflattened using the saved number of objects, as shown in the sketch after the dataclass below.
@dataclass
class ColOut:
    collection: str     # Collection
    columns: List[str]  # list of columns to export
    flatten: bool = True  # Flatten by default
    store_size: bool = True
    fill_none: bool = True
    fill_value: float = -999.0  # by default the None elements are filled
    pos_start: int = None  # First position in the collection to export. If None export from the first element
    pos_end: int = None    # Last position in the collection to export. If None export until the last element
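As an illustration of the unflattening mentioned above, a hedged sketch reading the accumulated output back. The exact nesting of the output dictionary, the output file name and the name of the size column (assumed here to be JetGood_N) depend on your configuration and PocketCoffea version:

import awkward as ak
from coffea.util import load

output = load("output_all.coffea")  # the accumulated PocketCoffea output
# sample -> dataset -> category -> column (names are assumptions, inspect your output)
cols = output["columns"]["TTToSemiLeptonic"]["TTToSemiLeptonic_2018"]["baseline"]
# rebuild the jagged per-event structure from the flattened column accumulator
jet_pt = ak.unflatten(cols["JetGood_pt"].value, cols["JetGood_N"].value)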
Similarly to the pos option of the Axis configuration, it is possible to specify a range of objects to restrict the
output over the collection.
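For example, to export only the two leading jets of the collection (a minimal fragment for the columns dictionary; the import path of ColOut is an assumption to be adapted):

from pocket_coffea.lib.columns_manager import ColOut  # assumed import path

# export only the objects in positions 0 and 1 (the two leading jets)
leading_jets = ColOut("JetGood", ["pt", "eta", "phi"], pos_start=0, pos_end=2)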
Warning
At the moment the output columns get accumulated over all the chunks of the processed datasets and returned as a single file. This may cause memory problems in case of a large number of events or exported data. A solution is to export the chunks in separate files, as described in the next section.
Exporting chunks in separate files#
When exporting arrays from the processor, the size of the output may become an issue. In fact, by default the coffea
processor accumulates the column_accumulators of each chunk to produce the total output at the end of the
processing. This accumulation may use too much memory and crash the processing.
To overcome this issue it is possible to export the Columns output of each chunk in a separate file,
without adding anything to the standard PocketCoffea output. The files can be saved in a local folder or sent remotely
with xrootd.
Warning
With this setup the output will be in parquet format, so it is not necessary to flatten the awkward arrays before saving. The full awkward structure can be kept in the output arrays.
To activate this mode, just add the option dump_columns_as_arrays_per_chunk in the workflow_options dictionary of
the Configurator.
The target directory can be local (no xrootd prefix) or an xrootd location.
The following configuration shows the setup in action. N.B.: the columns are not flattened (flatten=False), because the output parquet files directly contain awkward arrays (not column accumulators).
cfg = Configurator(
    parameters = parameters,
    datasets = {
        "jsons": [f"{localdir}/datasets/signal_ttHTobb_local.json",
                  f"{localdir}/datasets/backgrounds_MC_ttbar_local.json",
                  f"{localdir}/datasets/backgrounds_MC_TTbb_local.json"],
        "filter" : {
            "samples": ["ttHTobb", "TTToSemiLeptonic", "TTbbSemiLeptonic"],
            "samples_exclude" : [],
            "year": ["2016_PreVFP",
                     "2016_PostVFP",
                     "2017", "2018"]  # All the years
        }
    },

    workflow = PartonMatchingProcessor,
    workflow_options = {"parton_jet_min_dR": 0.3,
                        "dump_columns_as_arrays_per_chunk": "root://t3se01.psi.ch:1094//store/user/dvalsecc/ttHbb/output_columns_parton_matching/sig_bkg_05_07_2023_v1/"},

    ....
    columns = {
        "common": {
            "bycategory": {
                "semilep_LHE": [
                    ColOut("Parton", ["pt", "eta", "phi", "mass", "pdgId", "provenance"], flatten=False),
                    ColOut(
                        "PartonMatched",
                        ["pt", "eta", "phi", "mass", "pdgId", "provenance", "dRMatchedJet"], flatten=False
                    ),
                    ColOut(
                        "JetGood",
                        ["pt", "eta", "phi", "hadronFlavour", "btagDeepFlavB"], flatten=False
                    ),
                    ColOut(
                        "JetGoodMatched",
                        [
                            "pt",
                            "eta",
                            "phi",
                            "hadronFlavour",
                            "btagDeepFlavB",
                            "dRMatchedJet",
                        ], flatten=False
                    ),

                    ColOut("LeptonGood",
                           ["pt","eta","phi"], flatten=False,
                           pos_end=1, store_size=False),
                    ColOut("MET", ["phi","pt","significance"], flatten=False),
                    ColOut("Generator", ["x1","x2","id1","id2","xpdf1","xpdf2"], flatten=False),
                    ColOut("LeptonParton", ["pt","eta","phi","mass","pdgId"], flatten=False)
                ]
            }
        },
        "bysample": {
            "ttHTobb": {
                "bycategory": {
                    "semilep_LHE": [ColOut("HiggsParton",
                                           ["pt","eta","phi","mass","pdgId"], pos_end=1, store_size=False, flatten=False)]
                }
            }
        }
    },
)
This configuration will create a structure of folders containing the dataset name and the categories:
# main output folder
(pocket-coffea) ➜ sig_bkg_05_07_2023_v1 lrt
total 3.5K
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:06 TTbbSemiLeptonic_Powheg_2018
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:06 TTbbSemiLeptonic_Powheg_2016_PreVFP
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:06 TTToSemiLeptonic_2016_PreVFP
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:06 TTbbSemiLeptonic_Powheg_2016_PostVFP
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:07 TTbbSemiLeptonic_Powheg_2017
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:14 TTToSemiLeptonic_2016_PostVFP
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:20 TTToSemiLeptonic_2017
# Output by dataset
(pocket-coffea) ➜ sig_bkg_05_07_2023_v1 cd TTbbSemiLeptonic_Powheg_2018
(pocket-coffea) ➜ TTbbSemiLeptonic_Powheg_2018 lrt
# categories
drwxr-xr-x 24 dvalsecc ethz-higgs 512 Jul 5 15:12 semilep_LHE
# Chunks output
(pocket-coffea) ➜ TTbbSemiLeptonic_Powheg_2018 cd semilep_LHE
(pocket-coffea) ➜ semilep_LHE lrt
total 219M
-rw-r--r-- 1 dvalsecc ethz-higgs 161K Jul 5 15:06 58cae696-ff9a-11eb-8bcf-b4e45d9fbeef_%2FEvents%3B1_0-6000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 8.8M Jul 5 15:07 f90f7300-022f-11ec-8fd2-0c0013acbeef_%2FEvents%3B1_403500-807000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 9.2M Jul 5 15:07 b788eafa-0203-11ec-9ed1-0b0013acbeef_%2FEvents%3B1_429000-858000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 8.8M Jul 5 15:07 f90f7300-022f-11ec-8fd2-0c0013acbeef_%2FEvents%3B1_0-403500.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 11M Jul 5 15:07 df0073b2-05f2-11ec-936f-118810acbeef_%2FEvents%3B1_0-495000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 715K Jul 5 15:07 94c2a20e-ff92-11eb-9e5b-7e969e86beef_%2FEvents%3B1_0-28681.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 9.2M Jul 5 15:07 b788eafa-0203-11ec-9ed1-0b0013acbeef_%2FEvents%3B1_0-429000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 14M Jul 5 15:07 b379fc2e-0203-11ec-8947-030013acbeef_%2FEvents%3B1_0-639000.parquet
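Since each chunk is dumped as a plain parquet file, the outputs can be inspected directly with awkward; a minimal sketch, using one of the file names from the listing above:

import awkward as ak

chunk = ak.from_parquet(
    "TTbbSemiLeptonic_Powheg_2018/semilep_LHE/"
    "58cae696-ff9a-11eb-8bcf-b4e45d9fbeef_%2FEvents%3B1_0-6000.parquet"
)
print(chunk.fields)  # the exported collections keep their full awkward structure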