Configuration#
A PocketCoffea analysis can be customized by writing a configuration file, containing all the information needed to setup an analysis run.
The PocketCoffea configuration comprehends:
Input dataset specification
Analysis parameters (see Parameters page)
Custom processor specification
Skimming, preselection and categories
Weights configuration
Systematic variations configuration
Histograms output configuration
Running mode configuration: local, multiprocessing, cluster
Note
The configuration is wrapped by a Configurator object, usually saved in a python script containing a cfg variable.
A full simplified example is available here. In this page we will describe in details all the components of a more complete example about ttHbb semileptonic channel.
1from pocket_coffea.utils.configurator import Configurator
2from pocket_coffea.lib.cut_definition import Cut
3
4from pocket_coffea.lib.cut_functions import get_nPVgood, goldenJson, eventFlags
5from pocket_coffea.lib.cut_functions import get_nObj_min, get_HLTsel, get_nBtagEq, get_nBtagMin
6from pocket_coffea.parameters.cuts import passthrough
7from pocket_coffea.parameters.histograms import *
8import os
9
10from pocket_coffea.workflows.tthbb_base_processor import ttHbbBaseProcessor
11from pocket_coffea.lib.weights.common import common_weights
12from pocket_coffea.lib.calibrators.common import default_calibrators_sequence
13
14# importing custom cut functions
15from custom_cut_functions import *
16localdir = os.path.dirname(os.path.abspath(__file__))
17
18# Loading default parameters
19from pocket_coffea.parameters import defaults
20default_parameters = defaults.get_default_parameters()
21defaults.register_configuration_dir("config_dir", localdir+"/params")
22
23# merging additional analysis specific parameters
24parameters = defaults.merge_parameters_from_files(default_parameters,
25 f"{localdir}/params/object_preselection.yaml",
26 f"{localdir}/params/btagsf_calibration.yaml",
27 f"{localdir}/params/triggers.yaml",
28 update=True)
29
30# Configurator instance
31cfg = Configurator(
32 parameters = parameters,
33 datasets = {
34 "jsons": [f"{localdir}/datasets/backgrounds_MC_ttbar_2018.json",
35 f"{localdir}/datasets/backgrounds_MC_ttbar_2017.json",
36 f"{localdir}/datasets/DATA_SingleEle.json",
37 f"{localdir}/datasets/DATA_SingleEle.json",
38 ],
39 "filter" : {
40 "samples": ["TTToSemiLeptonic","DATA_SingleEle","DATA_SingleEle"],
41 "samples_exclude" : [],
42 "year": ['2018','2017']
43 },
44 "subsamples":{
45 "TTToSemiLeptonic": {
46 "=1b": [get_nBtagEq(1, coll="Jet")],
47 "=2b" : [get_nBtagEq(2, coll="Jet")],
48 ">2b" : [get_nBtagMin(3, coll="Jet")]
49 }
50 }
51 },
52
53 workflow = ttHbbBaseProcessor,
54 workflow_options = {},
55
56 # Skimming and categorization
57 skim = [
58 get_nPVgood(1), eventFlags, goldenJson,
59 get_nObj_min(4, 15., "Jet"),
60 get_HLTsel()
61 ],
62
63 preselections = [semileptonic_presel_nobtag],
64
65 categories = {
66 "baseline": [passthrough],
67 "1b" : [ get_nBtagEq(1, coll="BJetGood")],
68 "2b" : [ get_nBtagEq(2, coll="BJetGood")],
69 "3b" : [ get_nBtagEq(3, coll="BJetGood")],
70 "4b" : [ get_nBtagEq(4, coll="BJetGood")]
71 },
72
73 # Weights and calibrators configuration
74 weights_classes = common_weights,
75 calibrators = default_calibrators_sequence, # JetsCalibrator, METCalibrator, ElectronsScaleCalibrator
76
77 weights = {
78 "common": {
79 "inclusive": ["genWeight","lumi","XS",
80 "pileup",
81 "sf_ele_reco", "sf_ele_id",
82 "sf_mu_id","sf_mu_iso",
83 "sf_btag", "sf_jet_puId",
84 ],
85 "bycategory" : {
86 "2jets_20pt" : [.....]
87 }
88 },
89 "bysample": {
90 "TTToSemiLeptonic": {
91 "inclusive": [...],
92 "bycategory": {
93 "2jets_20pt": [....]
94 }
95 }
96 }
97 },
98
99 variations = {
100 "weights": {
101 "common": {
102 "inclusive": [ "pileup",
103 "sf_ele_reco", "sf_ele_id",
104 "sf_mu_id", "sf_mu_iso",
105 "sf_jet_puId","sf_btag"
106 ],
107 "bycategory" : {
108 }
109 },
110 "bysample": {
111 "TTToSemiLeptonic": {
112 "inclusive": [],
113 "bycategory": {}
114 }
115 }
116 },
117 "shape": {
118 ....
119 }
120 },
121
122
123 variables = {
124 "HT" : HistConf([Axis(coll="events", field="events", bins=100, start=0, stop=200, label="HT")] ),
125 "leading_jet_pt_eta" : HistConf(
126 [
127 Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=0, label="Leading jet $p_T$"),
128 Axis(coll="JetGood", field="eta", bins=40, start=-5, stop=5, pos=0, label="Leading jet $\eta$")
129 ] ),
130
131 #Plotting all jets together
132 "all_jets_pt_eta" : HistConf(
133 [
134 Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=None, label="All jets $p_T$"),
135 Axis(coll="JetGood", field="eta", bins=40, start=-5, stop=5, pos=None, label="All jets $\eta$")
136 ] ),
137
138 "subleading_jetpt_MET" : HistConf(
139 [
140 Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=0, label="Leading jet $p_T$"),
141 Axis(coll="MET", field="pt", bins=40, start=0, stop=100, label="MET")
142 ] ),
143
144 **ele_hists(coll="ElectronGood", pos=0),
145 **muon_hists(coll="MuonGood", pos=0),
146 **count_hist(name="nElectronGood", coll="ElectronGood",bins=3, start=0, stop=3),
147 **count_hist(name="nMuonGood", coll="MuonGood",bins=3, start=0, stop=3),
148 **count_hist(name="nJets", coll="JetGood",bins=10, start=4, stop=14),
149 **count_hist(name="nBJets", coll="BJetGood",bins=12, start=2, stop=14),
150 **jet_hists(coll="JetGood", pos=0),
151 **jet_hists(coll="JetGood", pos=1),
152 **jet_hists(coll="JetGood", pos=2),
153
154 },
155
156 columns = {
157 "common": {
158 "inclusive": [],
159 "bycategory": {}
160 },
161 "bysample": {
162 "TTToSemiLeptonic" : { "inclusive": [ColOut("LeptonGood",["pt","eta","phi"])]},
163 "TTToSemiLeptonic__=1b" :{ "inclusive": [ColOut("JetGood",["pt","eta","phi"])]},
164 "TTToSemiLeptonic__=2b":{ "inclusive": [ColOut("BJetGood",["pt","eta","phi"])]},
165 }
166 }
167)
168
Datasets#
The dataset configuration has the following structure:
1cfg = Configurator(
2 datasets={
3 "jsons": [
4 f"{localdir}/datasets/backgrounds_MC_ttbar_2018.json",
5 f"{localdir}/datasets/backgrounds_MC_ttbar_2017.json",
6 f"{localdir}/datasets/DATA_SingleEle.json",
7 f"{localdir}/datasets/DATA_SingleEle.json",
8 ],
9 "filter": {
10 "samples": ["TTToSemiLeptonic", "DATA_SingleEle", "DATA_SingleEle"],
11 "samples_exclude": [],
12 "year": ["2018", "2017"],
13 },
14 "subsamples": {
15 "TTToSemiLeptonic": {
16 "=1b": [get_nBtagEq(1, coll="Jet")],
17 "=2b": [get_nBtagEq(2, coll="Jet")],
18 ">2b": [get_nBtagMin(3, coll="Jet")],
19 }
20 },
21 },
22 # ....
23)
The
jsonskey contains the list of dataset definition file to consider as inputsThe
filterdictionary gives the user the possibility to filter on the fly the desidered samples to include or exclude from the full list taken from the jsons files. Samples can be filtered by name of by year.subsamplesmakes possible to define cuts splitting the events in multiple sub-samples. See the datasets page for a more in-depth definition of them. A list of Cut objects is used to define the subsample, an AND between them is used to mask the events.In the example, by using the
subsamplesoption effectively theTTToSemiLeptonicsample will be split in the framework in 3 pieces calledTTToSemiLeptonic__=1b,TTToSemiLeptonic__=2b,TTToSemiLeptonic__>2b.Warning
Subsamples do not need to be exclusive. Subsample masks are applied before exporting histograms, columns and counting events.
Subsample-specific Configuration#
PocketCoffea allows applying weights and systematic variations specific to subsamples. When subsamples are defined, they can be referenced in the weights and variations configuration using their full name (original sample name + subsample suffix).
In the example configuration, the TTToSemiLeptonic sample is split into three subsamples:
TTToSemiLeptonic__=1bTTToSemiLeptonic__=2bTTToSemiLeptonic__>2b
These subsamples can have different weight and variation configurations:
1cfg = Configurator(
2 datasets={
3 "subsamples": {
4 "TTToSemiLeptonic": {
5 "=1b": [get_nBtagEq(1, coll="Jet")],
6 "=2b": [get_nBtagEq(2, coll="Jet")],
7 ">2b": [get_nBtagMin(3, coll="Jet")],
8 }
9 }
10 },
11 weights={
12 "common": {
13 "inclusive": ["genWeight", "lumi", "XS", "pileup"],
14 },
15 "bysample": {
16 # Original sample weights (applied to all subsamples if not overridden)
17 "TTToSemiLeptonic": {
18 "inclusive": ["sf_btag", "sf_jet_puId"],
19 },
20 # Subsample-specific weights
21 "TTToSemiLeptonic__=1b": {
22 "inclusive": ["sf_btag_1b_specific"], # Custom b-tag SF for 1b events
23 "bycategory": {"baseline": ["additional_1b_weight"]},
24 },
25 "TTToSemiLeptonic__=2b": {
26 "inclusive": [
27 "sf_btag_2b_specific"
28 ], # Different b-tag SF for 2b events
29 },
30 "TTToSemiLeptonic__>2b": {
31 "inclusive": ["sf_btag_multi_specific", "top_pt_reweight"],
32 },
33 },
34 },
35 variations={
36 "weights": {
37 "bysample": {
38 "TTToSemiLeptonic__=1b": {
39 "inclusive": ["sf_btag_1b_specific"], # Only vary 1b-specific SF
40 },
41 "TTToSemiLeptonic__=2b": {
42 "inclusive": ["sf_btag_2b_specific"],
43 },
44 "TTToSemiLeptonic__>2b": {
45 "inclusive": ["sf_btag_multi_specific", "top_pt_reweight"],
46 },
47 }
48 },
49 "shape": {
50 "bysample": {
51 "TTToSemiLeptonic__>2b": {
52 "inclusive": [
53 "JESTotal",
54 "JER",
55 ] # Only apply shape variations to high b-jet multiplicity
56 }
57 }
58 },
59 },
60)
Tip
Subsample-specific configurations are particularly useful for:
Applying different scale factors based on event topology (e.g., different b-tagging efficiencies for different b-jet multiplicities)
Implementing data-driven background estimation techniques that require different weights per subsample
Warning
When configuring weights and variations for subsamples, ensure that the subsample name exactly matches the automatically generated name format: {original_sample}__{subsample_suffix}.
Workflow#
1from pocket_coffea.workflows.tthbb_base_processor import ttHbbBaseProcessor
2
3cfg = Configurator(
4 workflow=ttHbbBaseProcessor,
5 workflow_options={},
6 # ....
7)
workflowkey specifies directly the class to use.workflow_options: dictionary with additional options for specific processors (user defined)
Calibrators#
Object calibrations and systematic variations are handled by the Calibrators system. The calibrator sequence is configured in the Configurator:
1from pocket_coffea.lib.calibrators.common import default_calibrators_sequence
2
3cfg = Configurator(
4 # Default calibrator sequence
5 calibrators=default_calibrators_sequence,
6 # ....
7)
The calibrators parameter accepts a list of calibrator classes that will be applied sequentially to correct physics objects and handle systematic variations.
For a detailed explanation of how calibrators work in the PocketCoffea workflow, see the Object calibration and systematic variations section in the Concepts page. For implementation details and creating custom calibrators, see the dedicated Calibrators page.
Default Calibrator Sequence#
PocketCoffea provides a default sequence of calibrators for common corrections:
1default_calibrators_sequence = [JetsCalibrator, METCalibrator, ElectronsScaleCalibrator]
JetsCalibrator: Applies JEC/JER corrections and provides systematic variations (JESUp/Down, JERUp/Down, etc.)
METCalibrator: Propagates jet corrections to MET collection
ElectronsScaleCalibrator: Handles electron energy scale and smearing corrections
Warning
If the user does not specify a calibration sequence in the configuration, the default one defined in `pocket_coffea.lib.calibrators.common.common is used. It’s always better to define explicitely the calibration sequence!
Custom Calibrator Sequence#
Users can define custom calibrator sequences by creating their own list:
1from pocket_coffea.lib.calibrators.common import JetsCalibrator, METCalibrator
2from my_custom_calibrators import MyCustomMuonCalibrator
3
4# Custom sequence
5my_calibrators = [
6 JetsCalibrator,
7 METCalibrator,
8 MyCustomMuonCalibrator, # Custom calibrator
9]
10
11cfg = Configurator(
12 calibrators=my_calibrators,
13 # ....
14)
Calibrator Parameters#
Calibrator behavior is usually controlled through the parameters configuration. Key calibrator parameters include:
jets_calibration: Controls JEC/JER application and variations
lepton_scale_factors: Configures electron/muon scale factors and corrections
met_calibration: MET correction configuration
See the Parameters page for detailed calibrator parameter configuration, the Calibrators page for implementation details, and the Concepts page for how calibrators integrate into the workflow.
Tip
The calibrator sequence order matters! For example, METCalibrator must come after JetsCalibrator because it needs the corrected jets to properly propagate JEC effects to the MET.
Warning
All requested systematic variations from calibrators must be properly configured in the variations section of the configuration to be processed.
Cuts and categories#
The events skimming, preselection and categorization is defined in a structured way in PocketCoffea: see Concepts#Filtering for a detailed explanation of the difference between the steps.
1cfg = Configurator(
2 skim=[
3 get_nPVgood(1),
4 eventFlags,
5 goldenJson,
6 get_nObj_min(4, 15.0, "Jet"),
7 get_HLTsel(),
8 ],
9 preselections=[semileptonic_presel_nobtag],
10 categories=StandardSelection(
11 {
12 "baseline": [passthrough],
13 "1b": [get_nBtagEq(1, coll="BJetGood")],
14 "2b": [get_nBtagEq(2, coll="BJetGood")],
15 "3b": [get_nBtagEq(3, coll="BJetGood")],
16 "4b": [get_nBtagEq(4, coll="BJetGood")],
17 }
18 ),
19 # ....
20)
A Cut is a simple object grouping a name, a cut function, a dictionary of parameters.
The same Cut object can be used in different points of the configuration.
The Cut objects are defined in pocket_coffea.lib.cut_definition.
Have a look at the documentation about the Cut object and its
API.
Tip
Custom functions and modules, defined locally by the user and not part of the central PocketCoffea core, must be registered in a special way to be available to the Dask workers. Have a look at the Register user defined custom modules section.
PocketCoffea implements a set of factory methods for common cut functions: they are defined in cut_functions.
In the configuration the categorization is split in:
Skim: The skim configuration is a list of
Cutobject. Events passing the AND of the list of cuts pass the skim.Preselections: The preselection is a list of
Cutobject and AND between them is applied.Categories: Splitting of events for histograms and columns output.
Save skimmed NanoAOD#
PocketCoffea can dump events passing the skim selection to NanoAOD ROOT files. Running once on a skim and reusing it for every subsequent processing pass trades a one-off disk cost for much shorter turnaround later — particularly useful when you iterate on histograms, weights, or systematic variations and the skim selection itself is stable.
Skim export is enabled by setting the save_skimmed_files argument of the Configurator to an output directory (a
local path or, recommended, an XRootD endpoint such as root://eosuser.cern.ch://eos/user/.../skim/). When set, the
processor finishes the skim step, writes one ROOT file per processed chunk, and stops — categorization, weights and
histograms are not computed in this run.
1cfg = Configurator(
2 workflow=ttHbbBaseProcessor,
3 workflow_options={}, # default: skim_mode="skim"
4 save_skimmed_files="root://eosuser.cern.ch://eos/user/x/xxx/skimmed_samples/Run2UL/",
5 skim=[
6 get_nPVgood(1),
7 eventFlags,
8 goldenJson,
9 get_nBtagMin(3, minpt=15.0, coll="Jet", wp="M"),
10 get_HLTsel(primaryDatasets=["SingleEle", "SingleMuon"]),
11 ],
12)
Two skim modes#
The exact set of events that ends up in the skim is controlled by workflow_options["skim_mode"]. Two values are
recognised today (see BaseProcessorABC.process in pocket_coffea/workflows/base.py):
skim_mode |
What gets saved | Object corrections during selection |
|---|---|---|
"skim" (default) |
Events passing the skim cut list, evaluated on raw NanoAOD. |
None — skim runs before any calibrator. |
"presel_any_variation" |
Events passing either the skim or the full event preselection in at least one active systematic variation. | The full calibration loop is evaluated in dry-run mode just to compute preselection masks; the events written to disk are the un-calibrated ones. |
In default "skim" mode the skim is a pure “drop obviously useless events” stage — fast, loose, and only allowed to
look at branches that exist before any object correction. This is the standard recipe and what the rest of the
documentation assumes.
skim_mode: "presel_any_variation"#
The variation-aware mode lets you push the skim much closer to the analysis preselection without losing events that would be selected only under a systematic shift (e.g. a JES-up shifted jet that crosses the pT threshold). The flow is:
Run the normal skim on raw NanoAOD.
Initialise the
CalibratorsManagerand loop over every active shape variation, callingapply_object_preselection,count_objects,define_common_variables_before_presel, and finally evaluating the preselection mask without filtering the events.OR-combine the per-variation masks into a single boolean mask and apply it to the uncalibrated events.
Write the surviving events to disk, then exit — no histograms or columns are produced.
The recipe is correct for the set of calibration variations that were active when the skim was produced. Each downstream re-processing on the skim re-runs the same calibration loop, so a JES-up event that was kept thanks to its shifted-up pT is re-evaluated with the corrected pT and ends up in the right variation histogram. Storing the uncalibrated events is what lets that downstream re-calibration apply cleanly — it does not make the skim immune to changes in the calibration configuration itself.
1cfg = Configurator(
2 workflow=ttHbbBaseProcessor,
3 save_skimmed_files="root://eosuser.cern.ch://eos/user/x/xxx/skimmed_samples/Run3/",
4 workflow_options={
5 "skim_mode": "presel_any_variation",
6 # the explicit skim list is still applied first;
7 # the preselection-OR is added on top.
8 },
9 skim=[get_nPVgood(1), eventFlags, goldenJson, get_HLTsel(...)],
10 preselections=[
11 get_nObj_min(2, 25., "JetGood"),
12 get_nBtagMin(1, coll="BJetGood", wp="M"),
13 ],
14 calibrators=[JetsCalibrator, MetCalibrator, ElectronSFCalibrator],
15)
When to prefer it:
The preselection has a non-trivial efficiency, so a basic skim alone would still ship many useless events.
You want one canonical skim that already encodes the analysis acceptance for every shape systematic, so downstream re-runs only have to re-evaluate histograms (not preselection).
You are willing to spend more CPU in the skim step (the full calibration loop runs once per chunk) in exchange for a smaller, more analysis-shaped output.
When not to use it:
Skim selection alone is already loose; the dry-run preselection just adds CPU for negligible event reduction.
The preselection depends on quantities that aren’t yet computed in
define_common_variables_before_presel.
Stopping the run after the skim#
By default save_skimmed_files already short-circuits the rest of the processor — the chunk returns immediately after
export_skimmed_chunk(). If you have a derived workflow that calls extra steps from process_extra_after_skim and you
also want to bail out there in special debugging configurations, the workflow option skip_processing_after_skim: True
is honoured: it makes the variation-aware branch exit right after writing the skim ROOT file. The default skim path
already behaves this way; the option only matters when you wire it into your own subclass.
Output structure & gen-weight rescaling#
The main .coffea output of a skim run keeps the usual sum_genweights, cutflow and dataset metadata and adds two
extra keys:
skimmed_files:{dataset: [path/to/file_0.root, ...]}— the new ROOT files produced for each dataset.nskimmed_events:{dataset: [n_events_in_file_0, ...]}— event counts per chunk file.
Each output ROOT file carries an extra branch skimRescaleGenWeight, set per-event to
skimRescaleGenWeight = (original sum_genweight of the input file) / (sum_genweight surviving the skim)
Downstream, when the skim is used as input dataset, the dataset definition JSON must carry isSkim: True in its
metadata. The processor reads that flag in load_metadata (base.py:135-137) and computes sum_genweights as
sum(skimRescaleGenWeight * genWeight) instead of sum(genWeight), so the per-dataset normalisation matches the
unskimmed reference. Forgetting isSkim: True will silently under-normalise your MC.
For chunks that produced zero events the rescale factor is set to 0 (see export_skimmed_chunk in base.py); these
chunks contribute 0 to sum_genweights_skimmed but are still listed in skimmed_files and tracked by the rest of the
machinery.
Recovering sum_genweight for zero-event chunks#
There is one subtle failure mode of the per-event skimRescaleGenWeight mechanism: if an input NanoAOD chunk has
zero events surviving the skim, no ROOT file is written for that chunk, so the downstream per-chunk reconstruction
sum(skimRescaleGenWeight * genWeight) cannot recover its sum_genweight contribution. The original total would be
silently under-counted whenever the skim is tight enough that some input chunks vanish entirely.
To avoid this, save_skimed_dataset_definition writes the authoritative pre-skim dataset-level totals into the new
dataset JSON’s metadata:
"metadata": {
...,
"isSkim": "True",
"sum_genweights": 1.2345e+08,
"sum_signOf_genweights": 9.87e+06
}
These values come straight from the skim job’s .coffea accumulator, where they are computed before the skim mask
is applied (BaseProcessorABC.process); they therefore include every input chunk regardless of survival. The hadded
dataset definition produced by pocket-coffea hadd-skimmed-files carries the same fields, so a hadded skim is also
self-recovering.
At downstream postprocess time BaseProcessorABC.postprocess calls
pocket_coffea.utils.skim.apply_skim_sumgenweights_override which, for every isSkim dataset whose metadata carries
these fields, replaces the per-chunk-reconstructed accumulator["sum_genweights"] /
accumulator["sum_signOf_genweights"] entries with the authoritative values before rescale_sumgenweights runs. The
override is logged at INFO so the recovery is visible. Older skim outputs that lack the new metadata fields keep using
the per-chunk reconstruction unchanged.
A useful side effect: running downstream with --limit-files against an isSkim dataset now normalises histograms to
the original sum_genweight rather than to whatever subset of files happened to be processed — which is the
physically correct behaviour for cross-section normalisation.
Practical tips#
Run the skim on
condor@lxplus. Each condor job produces its own slice of skimmed files; failed jobs can be resubmitted individually with--recreate-jobs(seerunning.md). The dask scheduler is overkill for an embarrassingly parallel write-only step.haddthe per-chunk files afterwards withpocket-coffea hadd-skimmed-filesto reduce file count and size amplification (typical chunk files are ~tens of MB). The post-hadd helper produces an updated dataset definition JSON withisSkim: Truealready set.Mind the skim looseness. Even in
presel_any_variationmode the explicitskimlist is evaluated on raw NanoAOD before any object correction; it must remain loose under the systematic shifts that the preselection sees later.
See the end-to-end recipe in How-To → Skimming events for the concrete CLI invocation and post-processing steps.
Categorization utilities#
PocketCoffea defines different ways to categorize events. The code is available at pocket_coffea.lib.categorization.
StandardSelection: handles the definition of categories from a dictionary of Cut objects. Each key defines a category with a list of
Cutobjects which are applied with an AND.1categories = ( 2 StandardSelection( 3 { 4 "baseline": [passthrough], 5 "1b": [get_nBtagEq(1, coll="BJetGood")], 6 "2b": [get_nBtagEq(2, coll="BJetGood")], 7 "3b": [get_nBtagEq(3, coll="BJetGood")], 8 "4b": [get_nBtagEq(4, coll="BJetGood")], 9 } 10 ), 11)
CartesianSelection: handles the definition of cartesian product of categories. The class keeps a list of MultiCut objects, each defining a set of subcategories (or bins). The
CartesianSelectionutils defines automatically categories which are the cartesian products of the bins defined by each MultiCut. AStandardSelectionobject can be embedded in the CartesianSelection to defined “common” categories not used in the cartesian product. This utility can be very useful to build a differential analysis.For example, this is the configuration to build categories as \(((N_{jets} [4,5,>6]) \times (N_{bjets} [3,4,5,>6])) + \text{inclusive} + 4jets40pt\)
1categories = ( 2 CartesianSelection( 3 multicuts=[ 4 MultiCut( 5 name="Njets", 6 cuts=[ 7 get_nObj_eq(4, 15.0, "JetGood"), 8 get_nObj_eq(5, 15.0, "JetGood"), 9 get_nObj_min(6, 15.0, "JetGood"), 10 ], 11 cuts_names=["4j", "5j", "6j"], 12 ), 13 MultiCut( 14 name="Nbjet", 15 cuts=[ 16 get_nObj_eq(3, 15.0, "BJetGood"), 17 get_nObj_eq(4, 15.0, "BJetGood"), 18 get_nObj_eq(5, 15.0, "BJetGood"), 19 get_nObj_min(6, coll="BJetGood"), 20 ], 21 cuts_names=["3b", "4b", "5b", "6b"], 22 ), 23 ], 24 common_cats=StandardSelection( 25 { 26 "inclusive": [passthrough], 27 "4jets_40pt": [get_nObj_min(4, 40.0, "JetGood")], 28 } 29 ), 30 ), 31)
Warning
The standard
PackedSelectionutility from coffea can handle a maximum of 64 categories. TheCartesianSelectiontool overcomes this limitation internally.
Weights#
Weights are handled in PocketCoffea through the WeightsManager object (see API).
The configuration file specifies which weight is applied to which sample in which category.
1from pocket_coffea.lib.weights.common import common_weights
2
3cfg = Configurator(
4 weights_classes=common_weights,
5 weights={
6 "common": {
7 "inclusive": [
8 "genWeight",
9 "lumi",
10 "XS",
11 "pileup",
12 "sf_ele_reco",
13 "sf_ele_id",
14 "sf_mu_id",
15 "sf_mu_iso",
16 "sf_btag",
17 "sf_jet_puId",
18 ],
19 "bycategory": {"2jets_20pt": []},
20 },
21 "bysample": {
22 "TTToSemiLeptonic": {
23 "inclusive": [],
24 "bycategory": {"2jets_20pt": []},
25 }
26 },
27 },
28 # ....
29)
To reduce boilerplate configuration the weights are specified following a decision-tree style and applied in a hierarchical fashion.
Weights can be assigned to all samples (common key), inclusively or by category.
Weights can also be assigned to specific samples, again inclusively or in specific categories.
The available weights for a configuration are defined by the weights_classes passed to the Configurator. The
framework implements a set of common weights definitions: if the weights_classes argument is not passed, the common
ones are used by default. The user can add new weights classes in the configuration and use the corresponding string in
the weights configuration dictionary. The Weight classes implement a mechanism to check that the definition of the
string is unique and that users cannot inadvertly overwrite already defined weights.
A list of available weights definition:
genWeight: MC generator weight
lumi
XS: sample cross-section
pileup: pileup scale factor
sf_ele_reco, sf_ele_id: electron reconstruction and ID scalefactors. The working point is defined by the
lepton_scale_factorskey in the parameters (see Parameters docs)sf_mu_id, sf_mu_iso: muon id and isolation SF.
sf_btag: btagPOG shape scale factors
sf_jet_puId: jet puID SF
If a weight is requested in the configuration, but it doens’t exist, the framework emits an error before running.
On-the-flight custom weights#
Weights can be created by the user directly in the configuration. It is enough to define a new class deriving from WeightWrapper class defined by the framework.
This wrapper is used to instantiate the weight object for each chunk of data with the corresponding parameters and metadata. Doing so the user can customize the weight for different samples and conditions. Moreover the WeightWrapper instance defined the weight name, the string to be used in the config, and the available variations.
1from pocket_coffea.lib.weights_manager import WeightWrapper
2
3# example of WeightWrapper definition
4class CustomTopSF(WeightWrapper):
5
6 name = "top_sf"
7 has_variations = True
8
9 def __init__(self, params, metadata):
10 super().__init__(params, metadata)
11 self.sf_file = params["top_sf"]["json_file"]
12 # custom variations from config
13 self._variations = params["top_sf"]["variations"]
14
15 def compute(self, events, size, shape_variation):
16 # custom computation
17
18 if shape_variation == "nominal":
19 sf_data = sf_function.compute(self.sf_file, events)
20 return WeightDataMultiVariation(
21 name=self.name,
22 nominal=sf_data["nominal"],
23 up=[sf_data[var] for var in self._variations["up"]],
24 down=[sf_data[var] for var in self._variations["down"]],
25 )
26 else:
27 return WeightData(
28 name=self.name,
29 nominal=np.ones(size),
30 )
The class must be then passed to the configurator in order to be available:
1from pocket_coffea.lib.weights.common import common_weights
2
3cfg = Configurator(
4 weights_classes=common_weights + [CustomTopSF], # note the addition here
5 weights={
6 "common": {
7 "inclusive": [
8 "genWeight",
9 "lumi",
10 "XS",
11 "pileup",
12 "sf_ele_reco",
13 "sf_ele_id",
14 "sf_mu_id",
15 "sf_mu_iso",
16 "sf_btag",
17 "sf_jet_puId",
18 ],
19 "bycategory": {"2jets_20pt": ["top_sf"]}, # custom weight
20 },
21 # ...
22 },
23 # ....
24)
Moreover, often weights are easier to define: simple computations can be wrapped in a lambda without the need of defining a full WeightWrapper class.
1from pocket_coffea.lib.weights.weights import WeightLambda
2
3my_custom_sf = WeightLambda.wrap_func(
4 name="sf_custom",
5 function=lambda params, metadata, events, size, shape_variations: call_to_my_fancy_function(
6 events, params, metadata, shape_variations
7 ),
8 has_variations=True,
9)
The return type of the lambda must be a WeightData or WeightDataMultiVariation object.
Tip
The user can create a library of custom weights and include them in the configuration.
Register user defined custom modules#
Users can define modules, library and functions locally in their configuration folder and import then in the
PocketCoffea configuration and workflows. In order to make them available to the dask workers, without being included in
the PocketCoffea core library, it is sufficient to register the modules with cloudpickle.
Add this code in the configuration file.
1import cloudpickle
2
3cloudpickle.register_pickle_by_value(
4 workflow
5) # just an example of user defined modules for processors and cuts
6cloudpickle.register_pickle_by_value(custom_cut_functions)
7cloudpickle.register_pickle_by_value(custom_cuts)
Variations#
Systematics variations are also configured in the Configurator. Weights and shape variations are supported.
The configuration is applied in an hierarchical fashion as for the Weights, to compact the matrix of
samples and categories.
Weights variations: if the weights defined in the
WeightsManagerhas up and down variations, they can be activated by just putting the weight name in thevariationsconfiguration. Up and down shapes will be exported for histograms.1cfg = Configurator( 2 # .... 3 variations={ 4 "weights": { 5 "common": { 6 "inclusive": [ 7 "pileup", 8 "sf_ele_reco", 9 "sf_ele_id", 10 "sf_mu_id", 11 "sf_mu_iso", 12 "sf_jet_puId", 13 "sf_btag", 14 ], 15 "bycategory": {}, 16 }, 17 "bysample": {"TTToSemiLeptonic": {"inclusive": [], "bycategory": {}}}, 18 }, 19 "shape": { 20 # .... 21 }, 22 }, 23 # ... 24)
Shape variations: shape variations are related to lepton, jets and MET scale variations and similar systematics. These variations are provided by the calibrators configured in the Calibrators section. The handling of these variations is more complex since everything after skimming (see docs) is rerun for each shape variation.
Have a look at the base processor get_shape_variations() function to learn about their implementation.
cfg = Configurator( .... variations = { "weights": ..... # Shape variations "shape": { "common": "inclusive": ["JESTotal","JER"] } }, ... )Warning
The available shape variations depend on the calibrators configured in the Calibrators section. Currently, JES and JER variations are implemented and available through the
JetsCalibrator. The available JES variations depend on the jet calibration configuration defined in the parameters (docs).
Histograms configuration#
The PocketCoffea configuration allows the user to define histograms without modifying the processor code. The histogram configuration closely follows the interface of the scikit-hep/hist library, used by Coffea to handle histograms.
Histograms are identified by unique labels and built using a HistConf
object. Each HistConf object has a list of Axis objets, which follow the
interface of the hist library axes.
Important
The number of Axis contained in a HistConf is not limited! The user can work with 1,2,3,4…D histograms without
changing the interface. However, be aware of memory issues which may affect large histograms with too many bins.
1cfg = Configurator(
2 variables={
3 "HT": HistConf(
4 [
5 Axis(
6 coll="events",
7 field="events",
8 bins=100,
9 start=0,
10 stop=200,
11 label="HT",
12 )
13 ]
14 ),
15 "leading_jet_pt_eta": HistConf(
16 [
17 Axis(
18 coll="JetGood",
19 field="pt",
20 bins=40,
21 start=0,
22 stop=200,
23 pos=0,
24 label="Leading jet $p_T$",
25 ),
26 Axis(
27 coll="JetGood",
28 field="eta",
29 bins=40,
30 start=-5,
31 stop=5,
32 pos=0,
33 label="Leading jet $\eta$",
34 ),
35 ]
36 ),
37 # Plotting all jets together
38 "all_jets_pt_eta": HistConf(
39 [
40 Axis(
41 coll="JetGood",
42 field="pt",
43 bins=40,
44 start=0,
45 stop=200,
46 pos=None,
47 label="All jets $p_T$",
48 ),
49 Axis(
50 coll="JetGood",
51 field="eta",
52 bins=40,
53 start=-5,
54 stop=5,
55 pos=None,
56 label="All jets $\eta$",
57 ),
58 ]
59 ),
60 "subleading_jetpt_MET": HistConf(
61 [
62 Axis(
63 coll="JetGood",
64 field="pt",
65 bins=40,
66 start=0,
67 stop=200,
68 pos=0,
69 label="Leading jet $p_T$",
70 ),
71 Axis(coll="MET", field="pt", bins=40, start=0, stop=100, label="MET"),
72 ]
73 ),
74 **ele_hists(coll="ElectronGood", pos=0),
75 **muon_hists(coll="MuonGood", pos=0),
76 **count_hist(
77 name="nElectronGood", coll="ElectronGood", bins=3, start=0, stop=3
78 ),
79 **count_hist(name="nMuonGood", coll="MuonGood", bins=3, start=0, stop=3),
80 **count_hist(name="nJets", coll="JetGood", bins=10, start=4, stop=14),
81 **count_hist(name="nBJets", coll="BJetGood", bins=12, start=2, stop=14),
82 **jet_hists(coll="JetGood", pos=0),
83 **jet_hists(coll="JetGood", pos=1),
84 **jet_hists(coll="JetGood", pos=2),
85 },
86 ...,
87)
The HistConf class has many options, particularly useful to exclude some categories or samples from a specific
histogram.
1@dataclass
2class HistConf:
3 axes: List[Axis]
4 storage: str = "weight"
5 autofill: bool = True # Handle the filling automatically
6 variations: bool = True
7 only_variations: List[str] = None
8 exclude_samples: List[str] = None
9 only_samples: List[str] = None
10 exclude_categories: List[str] = None
11 only_categories: List[str] = None
12 no_weights: bool = False # Do not fill the weights
13 metadata_hist: bool = False # Non-event variables, for processing metadata
14 hist_obj = None
15 collapse_2D_masks = False # if 2D masks are applied on the events
16 # and the data_ndim=1, when collapse_2D_mask=True the OR
17 # of the masks on the axis=2 is performed to get the mask
18 # on axis=1, otherwise an exception is raised
19 collapse_2D_masks_mode = "OR" # Use OR or AND to collapse 2D masks for data_ndim=1 if collapse_2D_masks == True
The Axis object has many options: in particular the array to be plotted is taken from the events mother array
using the coll and field attributed. If an array is global in NanoAOD, the coll is events.
1@dataclass
2class Axis:
3 field: str # variable to plot
4 label: str # human readable label for the axis
5 bins: int = None
6 start: float = None
7 stop: float = None
8 coll: str = "events" # Collection or events or metadata or custom
9 name: str = None # Identifier of the axis: By default is built as coll.field, if not provided
10 pos: int = None # index in the collection to plot. If None plot all the objects on the same histogram
11 type: str = "regular" # regular/variable/integer/intcat/strcat
12 transform: str = None
13 lim: Tuple[float] = (0, 0)
14 underflow: bool = True
15 overflow: bool = True
16 growth: bool = False
Tip
A set of factory methods to build commonly used histogram configuration is available in
pocket_coffea.parameters.histograms.
They produce dictionaries of HistConf objects that need to be unpacked in the configuration file with the syntax: **jet_hists(coll="JetGood", pos=2)
Multidimensional arrays#
A special mention is worth it for the pos attributes. The user can specify which object in a collection to use for the
field to plot: if the collection contains more
than 1 object, e.g. Jet, and pos=1, only the attributes of the 2nd object will be plotted. If the second object is missing, the
attributes are None-padded automatically.
Tip
If the collection contains more objects (e.g. the Jet collection) and the attribute pos is None, the array
is flattened before filling the histograms. This means that you can plot the \(p_T\) of all the jets in a single plot just
by using Axes(coll="Jet", field="pt", pos=None)
Columns output#
In PocketCoffea it is also possible to export arrays from NanoAOD events: the configuration is handled with a
ColOut object.
The configuration follows the same structure of the Weights configuration.
A list of ColOut objects is assigner either inclusively and to all the samples or specifically to a sample and
category.
1cfg = Configurator(
2 # columns output configuration
3 columns={
4 "common": {"inclusive": [], "bycategory": {}},
5 "bysample": {
6 "TTToSemiLeptonic": {
7 "inclusive": [ColOut("LeptonGood", ["pt", "eta", "phi"])]
8 },
9 "TTToSemiLeptonic__=1b": {
10 "inclusive": [ColOut("JetGood", ["pt", "eta", "phi"])]
11 },
12 "TTToSemiLeptonic__=2b": {
13 "inclusive": [ColOut("BJetGood", ["pt", "eta", "phi"])]
14 },
15 },
16 }
17)
The ColOut object defines which collection and fields get exported in the output file, moreover by default the number
of object in the collection is saved only once along the fields. This is needed because the output accumulator contains
flattened arrays. The output can then be unflattened using the saved number of objects.
1@dataclass
2class ColOut:
3 collection: str # Collection
4 columns: List[str] # list of columns to export
5 flatten: bool = True # Flatten by defaul
6 store_size: bool = True
7 fill_none: bool = True
8 fill_value: float = -999.0 # by default the None elements are filled
9 pos_start: int = None # First position in the collection to export. If None export from the first element
10 pos_end: int = None # Last position in the collection to export. If None export until the last element
Similarly to the pos option for the Axes configuration, it is possible to specify a range of objects to restrict the
output over the collection.
Warning
At the moment the output columns gets accumulated over all the chunks of the processed datasets and returned as a single file. This may cause memory problems in case of a large number of events or exported data. A solution is to export single files separately: the option is under development.
Exporting chunks in separate files#
When exporting arrays from the processor, the size of the output may become an issue. In fact, by default the coffea
processor will accumulate the column_accumulators for each chunk to produce the total output at the end of the
processing. This process may accumulate too much memory and crush the processing.
To overcome this issue there is the possibility to export the Columns output of each chunk in a separate file,
without adding anything to the standard PocketCoffea output. The file can be saved in a local folder or sent remotely
with xrootd.
Warning
With this setup the output will be in parquet format, so it is not necessary to flatten the awkward arrays before saving. The full awkward structure can be kept in the output arrays.
To activate this mode, just add the option dump_columns_as_arrays_per_chunk in the workflow_options dictionary of
the Configurator.
The target directory can be local (no xrootD prefix) or a xRootD localtion.
The following configuration shows the setup in action. N.B.: the columns are not flattened (the default), because the output parquet file will contain directly awkward arrays (not column accumulators).
1cfg = Configurator(
2 parameters=parameters,
3 datasets={
4 "jsons": [
5 f"{localdir}/datasets/signal_ttHTobb_local.json",
6 f"{localdir}/datasets/backgrounds_MC_ttbar_local.json",
7 f"{localdir}/datasets/backgrounds_MC_TTbb_local.json",
8 ],
9 "filter": {
10 "samples": ["ttHTobb", "TTToSemiLeptonic", "TTbbSemiLeptonic"],
11 "samples_exclude": [],
12 "year": ["2016_PreVFP", "2016_PostVFP", "2017", "2018"], # All the years
13 },
14 },
15 workflow=PartonMatchingProcessor,
16 workflow_options={
17 "parton_jet_min_dR": 0.3,
18 "dump_columns_as_arrays_per_chunk": "root://t3se01.psi.ch:1094//store/user/dvalsecc/ttHbb/output_columns_parton_matching/sig_bkg_05_07_2023_v1/",
19 },
20 ....columns={
21 "common": {
22 "bycategory": {
23 "semilep_LHE": [
24 ColOut(
25 "Parton",
26 ["pt", "eta", "phi", "mass", "pdgId", "provenance"],
27 flatten=False,
28 ),
29 ColOut(
30 "PartonMatched",
31 [
32 "pt",
33 "eta",
34 "phi",
35 "mass",
36 "pdgId",
37 "provenance",
38 "dRMatchedJet",
39 ],
40 flatten=False,
41 ),
42 ColOut(
43 "JetGood",
44 ["pt", "eta", "phi", "hadronFlavour", "btagDeepFlavB"],
45 flatten=False,
46 ),
47 ColOut(
48 "JetGoodMatched",
49 [
50 "pt",
51 "eta",
52 "phi",
53 "hadronFlavour",
54 "btagDeepFlavB",
55 "dRMatchedJet",
56 ],
57 flatten=False,
58 ),
59 ColOut(
60 "LeptonGood",
61 ["pt", "eta", "phi"],
62 flatten=False,
63 pos_end=1,
64 store_size=False,
65 ),
66 ColOut("MET", ["phi", "pt", "significance"], flatten=False),
67 ColOut(
68 "Generator",
69 ["x1", "x2", "id1", "id2", "xpdf1", "xpdf2"],
70 flatten=False,
71 ),
72 ColOut(
73 "LeptonParton",
74 ["pt", "eta", "phi", "mass", "pdgId"],
75 flatten=False,
76 ),
77 ]
78 }
79 },
80 "bysample": {
81 "ttHTobb": {
82 "bycategory": {
83 "semilep_LHE": [
84 ColOut(
85 "HiggsParton",
86 ["pt", "eta", "phi", "mass", "pdgId"],
87 pos_end=1,
88 store_size=False,
89 flatten=False,
90 )
91 ]
92 }
93 }
94 },
95 },
96)
This configuration will create a structure of folders containing the dataset name and the categories:
# main output folder
(pocket-coffea) ➜ sig_bkg_05_07_2023_v1 lrt
total 3.5K
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:06 TTbbSemiLeptonic_Powheg_2018
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:06 TTbbSemiLeptonic_Powheg_2016_PreVFP
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:06 TTToSemiLeptonic_2016_PreVFP
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:06 TTbbSemiLeptonic_Powheg_2016_PostVFP
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:07 TTbbSemiLeptonic_Powheg_2017
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:14 TTToSemiLeptonic_2016_PostVFP
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul 5 15:20 TTToSemiLeptonic_2017
# Output by dataset
(pocket-coffea) ➜ sig_bkg_05_07_2023_v1 cd TTbbSemiLeptonic_Powheg_2018
(pocket-coffea) ➜ TTbbSemiLeptonic_Powheg_2018 lrt
# categories
drwxr-xr-x 24 dvalsecc ethz-higgs 512 Jul 5 15:12 semilep_LHE
# Chunks output
(pocket-coffea) ➜ TTbbSemiLeptonic_Powheg_2018 cd semilep_LHE
(pocket-coffea) ➜ semilep_LHE lrt
total 219M
-rw-r--r-- 1 dvalsecc ethz-higgs 161K Jul 5 15:06 58cae696-ff9a-11eb-8bcf-b4e45d9fbeef_%2FEvents%3B1_0-6000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 8.8M Jul 5 15:07 f90f7300-022f-11ec-8fd2-0c0013acbeef_%2FEvents%3B1_403500-807000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 9.2M Jul 5 15:07 b788eafa-0203-11ec-9ed1-0b0013acbeef_%2FEvents%3B1_429000-858000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 8.8M Jul 5 15:07 f90f7300-022f-11ec-8fd2-0c0013acbeef_%2FEvents%3B1_0-403500.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 11M Jul 5 15:07 df0073b2-05f2-11ec-936f-118810acbeef_%2FEvents%3B1_0-495000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 715K Jul 5 15:07 94c2a20e-ff92-11eb-9e5b-7e969e86beef_%2FEvents%3B1_0-28681.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 9.2M Jul 5 15:07 b788eafa-0203-11ec-9ed1-0b0013acbeef_%2FEvents%3B1_0-429000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 14M Jul 5 15:07 b379fc2e-0203-11ec-8947-030013acbeef_%2FEvents%3B1_0-639000.parquet