Configuration#

A PocketCoffea analysis can be customized by writing a configuration file containing all the information needed to set up an analysis run.

The PocketCoffea configuration comprises:

  • Input dataset specification

  • Analysis parameters (see Parameters page)

  • Custom processor specification

  • Skimming, preselection and categories

  • Weights configuration

  • Systematic variations configuration

  • Histograms output configuration

  • Running mode configuration: local, multiprocessing, cluster

Note

The configuration is wrapped by a Configurator object, usually saved in a Python script containing a cfg variable.

A full simplified example is available here. On this page we describe in detail all the components of a more complete example based on the ttHbb semileptonic channel.

from pocket_coffea.utils.configurator import Configurator
from pocket_coffea.lib.cut_definition import Cut
from pocket_coffea.lib.cut_functions import get_nObj_min, get_HLTsel, get_nBtagEq, get_nBtagMin
from pocket_coffea.lib.columns_manager import ColOut
from pocket_coffea.parameters.cuts import passthrough
from pocket_coffea.parameters.histograms import *
import os

from pocket_coffea.workflows.tthbb_base_processor import ttHbbBaseProcessor

# importing custom cut functions
from custom_cut_functions import *
localdir = os.path.dirname(os.path.abspath(__file__))

# Loading default parameters
from pocket_coffea.parameters import defaults
default_parameters = defaults.get_default_parameters()
defaults.register_configuration_dir("config_dir", localdir+"/params")

# merging additional analysis specific parameters
parameters = defaults.merge_parameters_from_files(default_parameters,
                                                  f"{localdir}/params/object_preselection.yaml",
                                                  f"{localdir}/params/btagsf_calibration.yaml",
                                                  f"{localdir}/params/triggers.yaml",
                                                  update=True)

# Configurator instance
cfg = Configurator(
    parameters = parameters,
    datasets = {
        "jsons": [f"{localdir}/datasets/backgrounds_MC_ttbar_2018.json",
                  f"{localdir}/datasets/backgrounds_MC_ttbar_2017.json",
                  f"{localdir}/datasets/DATA_SingleEle.json",
                  ],
        "filter" : {
            "samples": ["TTToSemiLeptonic", "DATA_SingleEle"],
            "samples_exclude" : [],
            "year": ['2018', '2017']
        },
        "subsamples": {
            "TTToSemiLeptonic": {
                "=1b": [get_nBtagEq(1, coll="Jet")],
                "=2b": [get_nBtagEq(2, coll="Jet")],
                ">2b": [get_nBtagMin(3, coll="Jet")]
            }
        }
    },

    workflow = ttHbbBaseProcessor,
    workflow_options = {},

    # Skimming and categorization
    skim = [
        get_nObj_min(4, 15., "Jet"),
        get_HLTsel()
    ],

    preselections = [semileptonic_presel_nobtag],

    categories = {
        "baseline": [passthrough],
        "1b": [get_nBtagEq(1, coll="BJetGood")],
        "2b": [get_nBtagEq(2, coll="BJetGood")],
        "3b": [get_nBtagEq(3, coll="BJetGood")],
        "4b": [get_nBtagEq(4, coll="BJetGood")]
    },

    # Weights configuration
    weights = {
        "common": {
            "inclusive": ["genWeight", "lumi", "XS",
                          "pileup",
                          "sf_ele_reco", "sf_ele_id",
                          "sf_mu_id", "sf_mu_iso",
                          "sf_btag", "sf_jet_puId",
                          ],
            "bycategory": {
                "2jets_20pt": [.....]
            }
        },
        "bysample": {
            "TTToSemiLeptonic": {
                "inclusive": [...],
                "bycategory": {
                    "2jets_20pt": [....]
                }
            }
        }
    },

    variations = {
        "weights": {
            "common": {
                "inclusive": ["pileup",
                              "sf_ele_reco", "sf_ele_id",
                              "sf_mu_id", "sf_mu_iso",
                              "sf_jet_puId", "sf_btag"
                              ],
                "bycategory": {
                }
            },
            "bysample": {
                "TTToSemiLeptonic": {
                    "inclusive": [],
                    "bycategory": {}
                }
            }
        },
        "shape": {
        ....
        }
    },

    variables = {
        "HT": HistConf([Axis(coll="events", field="HT", bins=100, start=0, stop=200, label="HT")]),
        "leading_jet_pt_eta": HistConf(
            [
                Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=0, label="Leading jet $p_T$"),
                Axis(coll="JetGood", field="eta", bins=40, start=-5, stop=5, pos=0, label="Leading jet $\eta$")
            ]),

        # Plotting all jets together
        "all_jets_pt_eta": HistConf(
            [
                Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=None, label="All jets $p_T$"),
                Axis(coll="JetGood", field="eta", bins=40, start=-5, stop=5, pos=None, label="All jets $\eta$")
            ]),

        "subleading_jetpt_MET": HistConf(
            [
                Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=1, label="Subleading jet $p_T$"),
                Axis(coll="MET", field="pt", bins=40, start=0, stop=100, label="MET")
            ]),

        **ele_hists(coll="ElectronGood", pos=0),
        **muon_hists(coll="MuonGood", pos=0),
        **count_hist(name="nElectronGood", coll="ElectronGood", bins=3, start=0, stop=3),
        **count_hist(name="nMuonGood", coll="MuonGood", bins=3, start=0, stop=3),
        **count_hist(name="nJets", coll="JetGood", bins=10, start=4, stop=14),
        **count_hist(name="nBJets", coll="BJetGood", bins=12, start=2, stop=14),
        **jet_hists(coll="JetGood", pos=0),
        **jet_hists(coll="JetGood", pos=1),
        **jet_hists(coll="JetGood", pos=2),
    },

    columns = {
        "common": {
            "inclusive": [],
            "bycategory": {}
        },
        "bysample": {
            "TTToSemiLeptonic": {"inclusive": [ColOut("LeptonGood", ["pt", "eta", "phi"])]},
            "TTToSemiLeptonic__=1b": {"inclusive": [ColOut("JetGood", ["pt", "eta", "phi"])]},
            "TTToSemiLeptonic__=2b": {"inclusive": [ColOut("BJetGood", ["pt", "eta", "phi"])]},
        }
    }
)

run_options = {
    "executor"        : "dask/lxplus",
    "env"             : "singularity",
    "workers"         : 1,
    "scaleout"        : 50,
    "worker_image"    : "/cvmfs/unpacked.cern.ch/gitlab-registry.cern.ch/cms-analysis/general/pocketcoffea:lxplus-cc7-latest",
    "queue"           : "microcentury",
    "walltime"        : "00:40:00",
    "mem_per_worker"  : "4GB",
    "disk_per_worker" : "1GB",
    "exclusive"       : False,
    "chunk"           : 400000,
    "retries"         : 50,
    "treereduction"   : 20,
    "adapt"           : False,
}

Datasets#

The dataset configuration has the following structure:

cfg = Configurator(
    datasets = {
        "jsons": [f"{localdir}/datasets/backgrounds_MC_ttbar_2018.json",
                  f"{localdir}/datasets/backgrounds_MC_ttbar_2017.json",
                  f"{localdir}/datasets/DATA_SingleEle.json",
                  ],
        "filter" : {
            "samples": ["TTToSemiLeptonic", "DATA_SingleEle"],
            "samples_exclude" : [],
            "year": ['2018', '2017']
        },
        "subsamples": {
            "TTToSemiLeptonic": {
                "=1b": [get_nBtagEq(1, coll="Jet")],
                "=2b": [get_nBtagEq(2, coll="Jet")],
                ">2b": [get_nBtagMin(3, coll="Jet")]
            }
        }
    },
    ....
)
  • The jsons key contains the list of dataset definition files to consider as input.

  • The filter dictionary gives the user the possibility to filter, on the fly, the desired samples to include in or exclude from the full list taken from the jsons files. Samples can be filtered by name or by year.

  • subsamples makes it possible to define cuts that split the events into multiple subsamples. See the datasets page for a more in-depth definition. Each subsample is defined by a list of Cut objects, and the AND of them is used to mask the events (a sketch combining two cuts is shown below, after the warning).

    In the example, using the subsamples option effectively splits the TTToSemiLeptonic sample into 3 pieces, called TTToSemiLeptonic__=1b, TTToSemiLeptonic__=2b and TTToSemiLeptonic__>2b.

    Warning

    Subsamples do not need to be exclusive. Subsample masks are applied before exporting histograms and columns, and before counting events.
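
Since a subsample is just a list of Cut objects combined with an AND, several cuts can be listed together. A minimal sketch, reusing the factory cuts shown above (the subsample name 1b_4j is illustrative):

from pocket_coffea.lib.cut_functions import get_nBtagEq, get_nObj_min

# The "1b_4j" subsample keeps events with exactly one b-tagged jet
# AND at least 4 jets with pt > 15 GeV: the cuts in the list are ANDed.
subsamples = {
    "TTToSemiLeptonic": {
        "1b_4j": [get_nBtagEq(1, coll="Jet"), get_nObj_min(4, 15., "Jet")]
    }
}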

Workflow#

from pocket_coffea.workflows.tthbb_base_processor import ttHbbBaseProcessor

cfg = Configurator(
    workflow = ttHbbBaseProcessor,
    workflow_options = {},
    ....
)
  • The workflow key directly specifies the processor class to use.

  • workflow_options: a dictionary with additional options for specific (user-defined) processors, as sketched below.
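
A sketch of how a processor can consume these options (the constructor signature and the self.workflow_options attribute follow the pattern of the processors shipped with PocketCoffea, but check the base class API; the option name is illustrative):

from pocket_coffea.workflows.tthbb_base_processor import ttHbbBaseProcessor

class MyProcessor(ttHbbBaseProcessor):
    def __init__(self, cfg):
        super().__init__(cfg=cfg)
        # read a user option, falling back to a default if not configured
        self.parton_jet_min_dR = self.workflow_options.get("parton_jet_min_dR", 0.3)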

Cuts and categories#

The event skimming, preselection and categorization are defined in a structured way in PocketCoffea: see Concepts#Filtering for a detailed explanation of the difference between these steps.

cfg = Configurator(
    skim = [
        get_nObj_min(4, 15., "Jet"),
        get_HLTsel()
    ],

    preselections = [semileptonic_presel_nobtag],

    categories = StandardSelection({
        "baseline": [passthrough],
        "1b": [get_nBtagEq(1, coll="BJetGood")],
        "2b": [get_nBtagEq(2, coll="BJetGood")],
        "3b": [get_nBtagEq(3, coll="BJetGood")],
        "4b": [get_nBtagEq(4, coll="BJetGood")]
    }),
    ....
)

A Cut is a simple object grouping a name, a cut function and a dictionary of parameters. The same Cut object can be used in different points of the configuration. Cut objects are defined in pocket_coffea.lib.cut_definition: have a look at the documentation about the Cut object and its API. A minimal sketch of a user-defined Cut is shown below.
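
A minimal sketch of a user-defined Cut (the function and parameter names are illustrative; the cut function receives the events and the params dictionary, absorbs any extra keyword arguments such as year or sample, and must return a per-event boolean mask):

import awkward as ak
from pocket_coffea.lib.cut_definition import Cut

def jets_pt_cut(events, params, year=None, sample=None, **kwargs):
    # keep events with at least params["njet"] jets above params["pt_min"]
    return ak.sum(events.Jet.pt > params["pt_min"], axis=1) >= params["njet"]

four_jets_30 = Cut(
    name="four_jets_30",
    params={"njet": 4, "pt_min": 30.0},
    function=jets_pt_cut,
)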

Tip

Custom functions and modules, defined locally by the user and not part of the central PocketCoffea core, must be registered in a special way to be available to the Dask workers. Have a look at the Register user defined custom modules section.

PocketCoffea implements a set of factory methods for common cut functions: they are defined in cut_functions.

In the configuration, the categorization is split into:

  • Skim: the skim configuration is a list of Cut objects. Events passing the AND of all the cuts pass the skim.

  • Preselections: the preselection is a list of Cut objects, and the AND of them is applied.

  • Categories: the splitting of events for the histogram and column outputs.

Categorization utilities#

PocketCoffea defines different ways to categorize events. The code is available at pocket_coffea.lib.categorization.

  • StandardSelection: handles the definition of categories from a dictionary of Cut objects. Each key defines a category with a list of Cut objects which are applied with an AND.

    categories = StandardSelection({
        "baseline": [passthrough],
        "1b": [get_nBtagEq(1, coll="BJetGood")],
        "2b": [get_nBtagEq(2, coll="BJetGood")],
        "3b": [get_nBtagEq(3, coll="BJetGood")],
        "4b": [get_nBtagEq(4, coll="BJetGood")]
    }),
    
  • CartesianSelection: handles the definition of the cartesian product of categories. The class keeps a list of MultiCut objects, each defining a set of subcategories (or bins). The CartesianSelection utility automatically defines the categories which are the cartesian products of the bins defined by each MultiCut. A StandardSelection object can be embedded in the CartesianSelection to define “common” categories not used in the cartesian product. This utility can be very useful to build a differential analysis.

    For example, this is the configuration to build the categories \((N_{jets} \in \{4, 5, \geq 6\}) \times (N_{bjets} \in \{3, 4, 5, \geq 6\})\), plus the common inclusive and 4jets_40pt categories:

    categories = CartesianSelection(
        multicuts = [
            MultiCut(name="Njets",
                     cuts=[
                         get_nObj_eq(4, 15., "JetGood"),
                         get_nObj_eq(5, 15., "JetGood"),
                         get_nObj_min(6, 15., "JetGood"),
                     ],
                     cuts_names=["4j", "5j", "6j"]),
            MultiCut(name="Nbjet",
                     cuts=[
                         get_nObj_eq(3, 15., "BJetGood"),
                         get_nObj_eq(4, 15., "BJetGood"),
                         get_nObj_eq(5, 15., "BJetGood"),
                         get_nObj_min(6, coll="BJetGood"),
                     ],
                     cuts_names=["3b", "4b", "5b", "6b"])
        ],
        common_cats = StandardSelection({
            "inclusive": [passthrough],
            "4jets_40pt": [get_nObj_min(4, 40., "JetGood")]
        })
    ),
    

    Warning

    The standard PackedSelection utility from coffea can handle a maximum of 64 categories. The CartesianSelection tool overcomes this limitation internally.

Weights#

Weights are handled in PocketCoffea through the WeightsManager object (see API). The configuration file specifies which weight is applied to which sample in which category.

cfg = Configurator(

    weights = {
        "common": {
            "inclusive": ["genWeight", "lumi", "XS",
                          "pileup",
                          "sf_ele_reco", "sf_ele_id",
                          "sf_mu_id", "sf_mu_iso",
                          "sf_btag", "sf_jet_puId",
                          ],
            "bycategory": {
                "2jets_20pt": [.....]
            }
        },
        "bysample": {
            "TTToSemiLeptonic": {
                "inclusive": [...],
                "bycategory": {
                    "2jets_20pt": [....]
                }
            }
        }
    },
    ....
)

To reduce boilerplate, the weights are specified following a decision-tree style and applied in a hierarchical fashion. Weights can be assigned to all samples (common key), inclusively or by category. Weights can also be assigned to specific samples, again inclusively or in specific categories. The sketch below illustrates the resolution logic.
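
This is an illustration, not the actual WeightsManager implementation: the weights applied to a given sample in a given category are the union of the four matching entries of the configuration.

def applied_weights(weights_cfg, sample, category):
    # common weights, applied to every sample
    w = list(weights_cfg["common"]["inclusive"])
    w += weights_cfg["common"].get("bycategory", {}).get(category, [])
    # sample-specific weights, inclusive and per category
    sample_cfg = weights_cfg.get("bysample", {}).get(sample, {})
    w += sample_cfg.get("inclusive", [])
    w += sample_cfg.get("bycategory", {}).get(category, [])
    return w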

A set of predefined weights, with centrally produced corrections and scale factors for the CMS Run 2 ultra-legacy analyses, is already implemented in PocketCoffea and available in the configuration through string identifiers:

  • genWeight: MC generator weight

  • lumi: integrated luminosity weight

  • XS: sample cross-section

  • pileup: pileup scale factor

  • sf_ele_reco, sf_ele_id: electron reconstruction and ID scale factors. The working point is defined by the lepton_scale_factors key in the parameters (see Parameters docs)

  • sf_mu_id, sf_mu_iso: muon ID and isolation scale factors.

  • sf_btag: btagPOG shape scale factors

  • sf_jet_puId: jet puID SF

If a weight is requested in the configuration but does not exist, the framework emits an error before running.

On-the-fly custom weights#

Weights can be created by the user directly in the configuration. The WeightCustom object allows the user to create a named function that is called for each chunk to produce an array of weights (and, optionally, their variations). Have a look at the API.

WeightCustom(
    name="custom_weight",
    function=lambda events, size, metadata: [("pt_weight", 1 + events.JetGood[:, 0].pt / 400.)]
)

The custom weight can be added to the configuration in place of the string identifier of a centrally-defined weight.

custom_w = WeightCustom(
    name="custom_weight",
    function=lambda events, size, metadata: [("pt_weight", 1 + events.JetGood[:, 0].pt / 400.)]
)

"weights": {
    "common": {
        "inclusive": [...],
        "bycategory": {
            "3jets": [custom_w]
        }
    }
}

Tip

The user can create a library of custom weights and include them in the configuration.

Register user defined custom modules#

Users can define modules, libraries and functions locally in their configuration folder and import them in the PocketCoffea configuration and workflows. In order to make them available to the Dask workers, without including them in the PocketCoffea core library, it is sufficient to register the modules with cloudpickle.

Add this code to the configuration file:

import cloudpickle
# register the user-defined modules (processors, cuts, ...) by value;
# the three modules below are just examples of user-defined code
cloudpickle.register_pickle_by_value(workflow)
cloudpickle.register_pickle_by_value(custom_cut_functions)
cloudpickle.register_pickle_by_value(custom_cuts)

Variations#

Systematic variations are also configured in the Configurator. Weight and shape variations are supported. The configuration is applied in a hierarchical fashion, as for the weights, to compact the matrix of samples and categories.

  • Weights variations: if a weight defined in the WeightsManager has up and down variations, they can be activated by just putting the weight name in the variations configuration. Up and down shapes will be exported for the histograms.

    cfg = Configurator(
        ....
        variations = {
            "weights": {
                "common": {
                    "inclusive": ["pileup",
                                  "sf_ele_reco", "sf_ele_id",
                                  "sf_mu_id", "sf_mu_iso",
                                  "sf_jet_puId", "sf_btag"
                                  ],
                    "bycategory": {
                    }
                },
                "bysample": {
                    "TTToSemiLeptonic": {
                        "inclusive": [],
                        "bycategory": {}
                    }
                }
            },
            "shape": {
            ....
            }
        },
        ...
    )
    
  • Shape variations: shape variations are related to lepton, jet and MET scale variations and similar systematics. The handling of these variations is more complex, since everything after the skimming (see docs) is rerun for each shape variation.

    Have a look at the base processor get_shape_variations() function to learn about their implementation.

    cfg = Configurator(
        ....
        variations = {
            "weights": { ..... },
            # Shape variations
            "shape": {
                "common": {
                    "inclusive": ["JESTotal", "JER"]
                }
            }
        },
        ...
    )
    

    Warning

    Only JES and JER variations have been implemented for the moment and are available to be used. The available JES variations depend on the jet calibration configuration defined in the parameters (docs).

Histograms configuration#

The PocketCoffea configuration allows the user to define histograms without modifying the processor code. The histogram configuration closely follows the interface of the scikit-hep/hist library, used by Coffea to handle histograms.

Histograms are identified by unique labels and built using a HistConf object. Each HistConf object has a list of Axis objects, which follow the interface of the hist library axes.

Important

The number of Axis objects contained in a HistConf is not limited! The user can work with 1, 2, 3, 4, …-dimensional histograms without changing the interface (a 3D sketch follows the example below). However, be aware of the memory issues that may affect large histograms with too many bins.

cfg = Configurator(
    variables = {
        "HT": HistConf([Axis(coll="events", field="HT", bins=100, start=0, stop=200, label="HT")]),
        "leading_jet_pt_eta": HistConf(
            [
                Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=0, label="Leading jet $p_T$"),
                Axis(coll="JetGood", field="eta", bins=40, start=-5, stop=5, pos=0, label="Leading jet $\eta$")
            ]),

        # Plotting all jets together
        "all_jets_pt_eta": HistConf(
            [
                Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=None, label="All jets $p_T$"),
                Axis(coll="JetGood", field="eta", bins=40, start=-5, stop=5, pos=None, label="All jets $\eta$")
            ]),

        "subleading_jetpt_MET": HistConf(
            [
                Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=1, label="Subleading jet $p_T$"),
                Axis(coll="MET", field="pt", bins=40, start=0, stop=100, label="MET")
            ]),

        **ele_hists(coll="ElectronGood", pos=0),
        **muon_hists(coll="MuonGood", pos=0),
        **count_hist(name="nElectronGood", coll="ElectronGood", bins=3, start=0, stop=3),
        **count_hist(name="nMuonGood", coll="MuonGood", bins=3, start=0, stop=3),
        **count_hist(name="nJets", coll="JetGood", bins=10, start=4, stop=14),
        **count_hist(name="nBJets", coll="BJetGood", bins=12, start=2, stop=14),
        **jet_hists(coll="JetGood", pos=0),
        **jet_hists(coll="JetGood", pos=1),
        **jet_hists(coll="JetGood", pos=2),
    },
    ...
)
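
For instance, a 3D histogram uses exactly the same interface as the 1D and 2D examples above; a minimal sketch (binning and labels are illustrative):

from pocket_coffea.parameters.histograms import *

# Leading-jet pt x leading-jet eta x MET pt in a single 3D histogram.
jet_pt_eta_MET_3d = HistConf([
    Axis(coll="JetGood", field="pt", bins=40, start=0, stop=200, pos=0, label="Leading jet $p_T$"),
    Axis(coll="JetGood", field="eta", bins=40, start=-5, stop=5, pos=0, label=r"Leading jet $\eta$"),
    Axis(coll="MET", field="pt", bins=40, start=0, stop=200, label="MET $p_T$"),
])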

The Axis object has many options: in particular, the array to be plotted is taken from the events mother array using the coll and field attributes. If an array is global in NanoAOD, the coll is events.

@dataclass
class Axis:
    field: str  # variable to plot
    label: str  # human readable label for the axis
    bins: int = None
    start: float = None
    stop: float = None
    coll: str = "events"  # Collection or events or metadata or custom
    name: str = None      # Identifier of the axis: by default built as coll.field if not provided
    pos: int = None       # index in the collection to plot. If None plot all the objects on the same histogram
    type: str = "regular" # regular/variable/integer/intcat/strcat
    transform: str = None
    lim: Tuple[float] = (0, 0)
    underflow: bool = True
    overflow: bool = True
    growth: bool = False

Tip

A set of factory methods to build commonly used histogram configurations is available in pocket_coffea.parameters.histograms. They produce dictionaries of HistConf objects that need to be unpacked in the configuration file with the syntax: **jet_hists(coll="JetGood", pos=2)

Multidimensional arrays#

The pos attribute deserves a special mention. The user can specify which object in a collection to use for the field to plot: if the collection contains more than one object, e.g. Jet, and pos=1, only the attributes of the 2nd object will be plotted. If the second object is missing, the attributes are None-padded automatically.

Tip

If the collection contains multiple objects (e.g. the Jet collection) and the pos attribute is None, the array is flattened before filling the histogram. This means that you can plot the \(p_T\) of all the jets in a single plot just by using Axis(coll="Jet", field="pt", pos=None)

Columns output#

In PocketCoffea it is also possible to export arrays from the NanoAOD events: the configuration is handled with ColOut objects.

The configuration follows the same structure as the Weights configuration. A list of ColOut objects is assigned either inclusively to all the samples or specifically to a sample and category.

cfg = Configurator(
    # columns output configuration
    columns = {
        "common": {
            "inclusive": [],
            "bycategory": {}
        },
        "bysample": {
            "TTToSemiLeptonic": {"inclusive": [ColOut("LeptonGood", ["pt", "eta", "phi"])]},
            "TTToSemiLeptonic__=1b": {"inclusive": [ColOut("JetGood", ["pt", "eta", "phi"])]},
            "TTToSemiLeptonic__=2b": {"inclusive": [ColOut("BJetGood", ["pt", "eta", "phi"])]},
        }
    }
)

The ColOut object defines which collection and fields get exported in the output file. Moreover, by default, the number of objects in the collection is saved only once alongside the fields. This is needed because the output accumulator contains flattened arrays; the output can then be unflattened using the saved number of objects, as sketched after the dataclass below.

@dataclass
class ColOut:
    collection: str  # Collection
    columns: List[str]  # list of columns to export
    flatten: bool = True  # flatten by default
    store_size: bool = True
    fill_none: bool = True
    fill_value: float = -999.0  # value used to fill the None elements
    pos_start: int = None  # first position in the collection to export. If None, export from the first element
    pos_end: int = None  # last position in the collection to export. If None, export until the last element
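
The unflattening step can be sketched with plain awkward operations (the arrays here are dummies standing in for an exported column and its stored size):

import awkward as ak
import numpy as np

pt_flat = np.array([40.1, 35.2, 80.3, 25.0, 60.7])  # flattened JetGood.pt column
n_jets = np.array([2, 1, 2])                        # stored number of objects per event
jet_pt = ak.unflatten(pt_flat, n_jets)
# jet_pt -> [[40.1, 35.2], [80.3], [25.0, 60.7]]: one list of jets per event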

Similarly to the pos option for the Axis configuration, it is possible to specify a range of objects (pos_start, pos_end) to restrict the output over the collection, as in the sketch below.
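
For example, a minimal sketch exporting only the two leading jets of each event (assuming ColOut is importable from pocket_coffea.lib.columns_manager):

from pocket_coffea.lib.columns_manager import ColOut

# export pt/eta/phi of only the first two objects in the JetGood collection
leading_two_jets = ColOut("JetGood", ["pt", "eta", "phi"], pos_start=0, pos_end=2)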

Warning

At the moment the output columns get accumulated over all the chunks of the processed datasets and returned as a single output. This may cause memory problems in case of a large number of events or exported data. A solution is to export the output of each chunk to a separate file, as described in the next section.

Exporting chunks in separate files#

When exporting arrays from the processor, the size of the output may become an issue. In fact, by default the coffea processor accumulates the column_accumulators of each chunk to produce the total output at the end of the processing. This accumulation may use too much memory and crash the processing.

To overcome this issue, it is possible to export the columns output of each chunk to a separate file, without adding anything to the standard PocketCoffea output. The files can be saved in a local folder or sent to a remote location with XRootD.

Warning

With this setup the output will be in parquet format, so it is not necessary to flatten the awkward arrays before saving: the full awkward structure can be kept in the output arrays.

To activate this mode, just add the option dump_columns_as_arrays_per_chunk in the workflow_options dictionary of the Configurator. The target directory can be local (no XRootD prefix) or an XRootD location.

The following configuration shows the setup in action. N.B.: the columns are not flattened (flatten=False, overriding the default), because the output parquet files will directly contain awkward arrays (not column accumulators).

cfg = Configurator(
    parameters = parameters,
    datasets = {
        "jsons": [f"{localdir}/datasets/signal_ttHTobb_local.json",
                  f"{localdir}/datasets/backgrounds_MC_ttbar_local.json",
                  f"{localdir}/datasets/backgrounds_MC_TTbb_local.json"],
        "filter" : {
            "samples": ["ttHTobb", "TTToSemiLeptonic", "TTbbSemiLeptonic"],
            "samples_exclude" : [],
            "year": ["2016_PreVFP",
                     "2016_PostVFP",
                     "2017", "2018"] # All the years
        }
    },

    workflow = PartonMatchingProcessor,
    workflow_options = {"parton_jet_min_dR": 0.3,
                        "dump_columns_as_arrays_per_chunk": "root://t3se01.psi.ch:1094//store/user/dvalsecc/ttHbb/output_columns_parton_matching/sig_bkg_05_07_2023_v1/"},

    ....
    columns = {
        "common": {
            "bycategory": {
                "semilep_LHE": [
                    ColOut("Parton", ["pt", "eta", "phi", "mass", "pdgId", "provenance"], flatten=False),
                    ColOut(
                        "PartonMatched",
                        ["pt", "eta", "phi", "mass", "pdgId", "provenance", "dRMatchedJet"], flatten=False
                    ),
                    ColOut(
                        "JetGood",
                        ["pt", "eta", "phi", "hadronFlavour", "btagDeepFlavB"], flatten=False
                    ),
                    ColOut(
                        "JetGoodMatched",
                        [
                            "pt",
                            "eta",
                            "phi",
                            "hadronFlavour",
                            "btagDeepFlavB",
                            "dRMatchedJet",
                        ], flatten=False
                    ),

                    ColOut("LeptonGood",
                           ["pt", "eta", "phi"], flatten=False,
                           pos_end=1, store_size=False),
                    ColOut("MET", ["phi", "pt", "significance"], flatten=False),
                    ColOut("Generator", ["x1", "x2", "id1", "id2", "xpdf1", "xpdf2"], flatten=False),
                    ColOut("LeptonParton", ["pt", "eta", "phi", "mass", "pdgId"], flatten=False)
                ]
            }
        },
        "bysample": {
            "ttHTobb": {
                "bycategory": {
                    "semilep_LHE": [ColOut("HiggsParton",
                                           ["pt", "eta", "phi", "mass", "pdgId"], pos_end=1, store_size=False, flatten=False)]
                }
            }
        }
    },
)

This configuration will create a structure of folders containing the dataset name and the categories:

# main output folder
(pocket-coffea)   sig_bkg_05_07_2023_v1 lrt
total 3.5K
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul  5 15:06 TTbbSemiLeptonic_Powheg_2018
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul  5 15:06 TTbbSemiLeptonic_Powheg_2016_PreVFP
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul  5 15:06 TTToSemiLeptonic_2016_PreVFP
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul  5 15:06 TTbbSemiLeptonic_Powheg_2016_PostVFP
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul  5 15:07 TTbbSemiLeptonic_Powheg_2017
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul  5 15:14 TTToSemiLeptonic_2016_PostVFP
drwxr-xr-x 3 dvalsecc ethz-higgs 512 Jul  5 15:20 TTToSemiLeptonic_2017

# Output by dataset
(pocket-coffea)   sig_bkg_05_07_2023_v1 cd TTbbSemiLeptonic_Powheg_2018
(pocket-coffea)   TTbbSemiLeptonic_Powheg_2018 lrt
# categories
drwxr-xr-x 24 dvalsecc ethz-higgs 512 Jul  5 15:12 semilep_LHE

# Chunks output
(pocket-coffea)   TTbbSemiLeptonic_Powheg_2018 cd semilep_LHE 
(pocket-coffea)   semilep_LHE lrt
total 219M
-rw-r--r-- 1 dvalsecc ethz-higgs 161K Jul  5 15:06 58cae696-ff9a-11eb-8bcf-b4e45d9fbeef_%2FEvents%3B1_0-6000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 8.8M Jul  5 15:07 f90f7300-022f-11ec-8fd2-0c0013acbeef_%2FEvents%3B1_403500-807000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 9.2M Jul  5 15:07 b788eafa-0203-11ec-9ed1-0b0013acbeef_%2FEvents%3B1_429000-858000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 8.8M Jul  5 15:07 f90f7300-022f-11ec-8fd2-0c0013acbeef_%2FEvents%3B1_0-403500.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs  11M Jul  5 15:07 df0073b2-05f2-11ec-936f-118810acbeef_%2FEvents%3B1_0-495000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 715K Jul  5 15:07 94c2a20e-ff92-11eb-9e5b-7e969e86beef_%2FEvents%3B1_0-28681.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs 9.2M Jul  5 15:07 b788eafa-0203-11ec-9ed1-0b0013acbeef_%2FEvents%3B1_0-429000.parquet
-rw-r--r-- 1 dvalsecc ethz-higgs  14M Jul  5 15:07 b379fc2e-0203-11ec-8947-030013acbeef_%2FEvents%3B1_0-639000.parquet
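
The per-chunk parquet files of a dataset/category can then be read back and concatenated with awkward, e.g. with this sketch (the path is illustrative and pyarrow is required):

import glob
import awkward as ak

# read back all the chunks of one dataset/category and concatenate them
files = glob.glob("sig_bkg_05_07_2023_v1/TTbbSemiLeptonic_Powheg_2018/semilep_LHE/*.parquet")
events = ak.concatenate([ak.from_parquet(f) for f in files])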