Skip to content

Commit ca6b1a1

Browse files
committed
first version of the nnpdf data package, with versioning
add utility function to read metadata just from the dataset name; deprecate a bunch of functions; fix include
1 parent 8896921 commit ca6b1a1

24 files changed

+686
-557
lines changed

deprecated_functions.py

+138
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
"""
2+
Note: this module will be removed after the next tag, don't use anything from here
3+
"""
4+
5+
import dataclasses
6+
import logging
7+
from operator import attrgetter
8+
9+
import pandas as pd
10+
11+
from nnpdf_data.coredata import CommonData
12+
13+
log = logging.getLogger(__name__)
14+
15+
log.warning(
16+
"You are loading deprecated functionality that use the old commondata parser. This is no longer supported and will be removed in the near future"
17+
)
18+
19+
20+
### Old commondata:
21+
### All code below this line is deprecated and will be removed
22+
def load_commondata_old(commondatafile, systypefile, setname):
    """Parse a commondata file and a systype file into a CommonData.

    Parameters
    ----------
    commondatafile : file or path to file
    systypefile : file or path to file
    setname : str
        name of the dataset; used for the consistency error message and
        stored in the returned object

    Returns
    -------
    commondata : CommonData
        An object containing the data and information from the commondata
        and systype files.

    Raises
    ------
    ValueError
        if the number of systematics or data points found in the table does
        not match the counts declared in the commondata header line.
    """
    # First parse commondata file
    # The header line (skipped here) is read separately below by
    # peek_commondata_metadata; the table is whitespace-separated.
    commondatatable = pd.read_csv(commondatafile, sep=r"\s+", skiprows=1, header=None)
    # Remove NaNs
    # TODO: replace commondata files with bad formatting
    # Build header
    commondataheader = ["entry", "process", "kin1", "kin2", "kin3", "data", "stat"]
    # Every systematic contributes an (ADD, MULT) pair of columns after the
    # seven fixed columns, hence the integer division by 2.
    nsys = (commondatatable.shape[1] - len(commondataheader)) // 2

    commondataheader += ["ADD", "MULT"] * nsys
    commondatatable.columns = commondataheader
    commondatatable.set_index("entry", inplace=True)
    ndata = len(commondatatable)
    # Label-based lookup: entries are 1-indexed, so this is the process of
    # the first data point (the old format uses one process per dataset).
    commondataproc = commondatatable["process"][1]
    # Check for consistency with commondata metadata
    cdmetadata = peek_commondata_metadata(commondatafile)
    if (nsys, ndata) != attrgetter("nsys", "ndata")(cdmetadata):
        raise ValueError(f"Commondata table information does not match metadata for {setname}")

    # Now parse the systype file
    systypetable = parse_systypes(systypefile)

    # Populate CommonData object
    return CommonData(
        setname=setname,
        ndata=ndata,
        commondataproc=commondataproc,
        nkin=3,
        nsys=nsys,
        commondata_table=commondatatable,
        systype_table=systypetable,
        legacy=True,
    )
def parse_systypes(systypefile):
    """Read a systype file into a dataframe indexed by the systematic index.

    The first line of the file (the systematics count) is skipped; every
    remaining whitespace-separated row provides the index, treatment and
    name of one systematic. A dataset without systematics produces an
    empty table with the same columns.
    """
    columns = ["sys_index", "treatment", "name"]
    try:
        table = pd.read_csv(systypefile, sep=r"\s+", names=columns, skiprows=1, header=None)
    # Some datasets e.g. CMSWCHARMRAT have no systematics
    except pd.errors.EmptyDataError:
        table = pd.DataFrame(columns=columns)
    else:
        table.dropna(axis="columns", inplace=True)

    return table.set_index("sys_index")
@dataclasses.dataclass(frozen=True)
class CommonDataMetadata:
    """Contains metadata information about the data being read"""

    # dataset name, as given on the first header line of the commondata file
    name: str
    # number of systematic uncertainties
    nsys: int
    # number of data points
    ndata: int
    # key into the kinematic-label dictionary (see get_kinlabel_key)
    process_type: str
def peek_commondata_metadata(commondatafilename):
    """Read some of the properties of the commondata object as a CommonData Metadata"""
    with open(commondatafilename) as stream:
        try:
            # First header line: <name> <nsys> <ndata>
            name, nsys_str, ndata_str = stream.readline().split()
            # Second line: the process type is the second whitespace token
            process_type_str = stream.readline().split()[1]
        except Exception:
            log.error(f"Error processing {commondatafilename}")
            raise

    return CommonDataMetadata(
        name, int(nsys_str), int(ndata_str), get_kinlabel_key(process_type_str)
    )
def get_plot_kinlabels(commondata):
    """Return the LaTex kinematic labels for a given Commondata"""
    # NOTE(review): KINLABEL_LATEX is not imported anywhere in this module as
    # shown in the diff — confirm it is brought into scope somewhere,
    # otherwise this raises NameError at call time.
    key = commondata.process_type

    # TODO: the keys in KINLABEL_LATEX need to be updated for the new commondata
    # Fall back to the raw process type when no label mapping exists.
    return KINLABEL_LATEX.get(key, key)
def get_kinlabel_key(process_label):
    """
    Since there is no 1:1 correspondence between latex keys and the old libNNPDF names
    we match the longest key such that the proc label starts with it.
    """
    # TODO this function is disappearing in this PR
    # Special-case: EWK_RAP_ASY shares the kinematics of EWK_RAP.
    label = "EWK_RAP" if process_label == "EWK_RAP_ASY" else process_label
    # Longest keys first, so the most specific prefix wins.
    keys_longest_first = sorted(KINLABEL_LATEX, key=len, reverse=True)
    try:
        return next(k for k in keys_longest_first if label.startswith(k))
    except StopIteration as err:
        raise ValueError(
            "Could not find a set of kinematic "
            "variables matching the process %s Check the "
            "labels defined in commondata.cc. " % (label)
        ) from err

doc/sphinx/source/vp/customplots.rst

+3-3
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ There are two ways to take advantage of resources produced using the
6565
* Using extra modules: Additional Python modules or files can be passed to
6666
``validphys`` using the ``--extra-modules`` (or ``-x``) flag. The
6767
functions in these modules then act ``validphys`` providers and can take
68-
resources from ``validpys`` as input. This approach allows the
68+
resources from ``validphys`` as input. This approach allows the
6969
immediate use of runcards or the default styles. One limitation is that
7070
there is currently no way of adding production rules or parsers in this
7171
way. Prefer this for actions that are too difficult to upstream to
@@ -76,7 +76,7 @@ There are two ways to take advantage of resources produced using the
7676
from matplotlib.figure import Figure
7777
from reportengine.figure import figure
7878

79-
from validphys.commondataparser import load_commondata
79+
from nnpdf_data.commondataparser import load_commondata
8080

8181
# A simple plot that probably should be in validphys to begin with.
8282

@@ -103,7 +103,7 @@ There are two ways to take advantage of resources produced using the
103103

104104

105105

106-
Note that both of these come at the cost of risking future breakage
106+
Note that both of these come at the cost of risking future breakage
107107
somewhat as we don't guarantee any sort of stability on the internal
108108
interfaces.
109109

doc/sphinx/source/vp/pydataobjs.rst

+6-6
Original file line numberDiff line numberDiff line change
@@ -143,8 +143,8 @@ Loading CommonData
143143
------------------
144144

145145
The underlying functions for loading CommonData can be found in
146-
:py:mod:`validphys.commondataparser`. The data is loaded
147-
as :py:class:`validphys.coredata.CommonData`, which uses the
146+
:py:mod:`nnpdf_data.commondataparser`. The data is loaded
147+
as :py:class:`nnpdf_data.coredata.CommonData`, which uses the
148148
`dataclasses <https://docs.python.org/3/library/dataclasses.html>`_ module
149149
which automatically generates some special methods for the class. The
150150
underlying data is stored as DataFrames, and so can be used
@@ -153,7 +153,7 @@ with the standard pandas machinery::
153153
import pandas as pd
154154

155155
from validphys.api import API
156-
from validphys.commondataparser import load_commondata
156+
from nnpdf_data.commondataparser import load_commondata
157157
# define dataset settings
158158
ds_input={'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10}
159159
# first get the CommonDataSpec
@@ -162,11 +162,11 @@ with the standard pandas machinery::
162162
assert isinstance(lcd.central_values, pd.Series)
163163
assert isinstance(lcd.systematics_table, pd.DataFrame)
164164

165-
The :py:class:`validphys.coredata.CommonData` class has a method which returns
165+
The :py:class:`nnpdf_data.coredata.CommonData` class has a method which returns
166166
a new instance of the class with cuts applied::
167167

168168
from validphys.api import API
169-
from validphys.commondataparser import load_commondata
169+
from nnpdf_data.commondataparser import load_commondata
170170
# define dataset and additional settings
171171
ds_input={'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10}
172172
inp = {
@@ -193,7 +193,7 @@ more convenient than calling the underlying functions::
193193
Loading Covariance Matrices
194194
---------------------------
195195

196-
Functions which take :py:class:`validphys.coredata.CommonData` s and return
196+
Functions which take :py:class:`nnpdf_data.coredata.CommonData` s and return
197197
covariance matrices can be found in
198198
:py:mod:`validphys.covmats`. As with the commondata
199199
the functions can be called in scripts directly::

nnpdf_data/examples_of_use.py

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""
2+
This file contains examples of use of ``nnpdf_data`` as a library.
3+
This library is currently in pre-alpha form and should not be considered stable.
4+
5+
The functions and examples in this file will be eventually removed but might become
6+
part of the library as an external user-facing interface.
7+
8+
There is currently no user-facing interface so no stability is expected.
9+
"""
10+
11+
from nnpdf_data import path_commondata
12+
from nnpdf_data.commondataparser import parse_new_metadata
13+
14+
15+
def parse_dataset(dataset, variant=None):
    """Given a dataset name, read the observable metadata as a CommonData object.
    A variant can be given.

    The returned ``ObservableMetaData`` holds references to every file that
    forms the dataset without loading any of them; pass it to
    ``load_commondata`` to actually read the data.

    Example
    -------
    >>> from nnpdf_data.commondataparser import load_commondata
    >>> cd_meta = parse_dataset("LHCB_Z0_7TEV_DIELECTRON_Y")
    >>> cd = load_commondata(cd_meta)
    >>> print(cd)
    CommonData(setname='LHCB_Z0_7TEV_DIELECTRON_Y', ndata=9, commondataproc='DY_Z_Y', nkin=3, nsys=11, legacy=False, legacy_names=['LHCBZ940PB'], kin_variables=['y', 'm_Z2', 'sqrts'])
    """
    # The observable is the last underscore-separated token of the name;
    # everything before it is the set folder under the commondata path.
    setname, observable = dataset.rsplit("_", 1)
    metadata_file = path_commondata / setname / "metadata.yaml"
    return parse_new_metadata(metadata_file, observable, variant=variant)

nnpdf_data/nnpdf_data/__init__.py

+11-83
Original file line numberDiff line numberDiff line change
@@ -1,93 +1,21 @@
1-
from functools import lru_cache
2-
import logging
31
import pathlib
42

5-
import yaml
6-
7-
log = logging.getLogger(__name__)
3+
from ._version import __version__
4+
from .commondataparser import parse_new_metadata
5+
from .validphys_compatibility import legacy_to_new_map
86

97
path_vpdata = pathlib.Path(__file__).parent
108
path_commondata = path_vpdata / "commondata"
11-
12-
# VP should not have access to this file, only to the products
13-
_path_legacy_mapping = path_commondata / "dataset_names.yml"
149
theory_cards = path_vpdata / "theory_cards"
1510

16-
with open(_path_legacy_mapping) as file:
17-
_legacy_to_new_mapping_raw = yaml.load(file, yaml.Loader)
18-
# Convert strings into a dictionary
19-
legacy_to_new_mapping = {
20-
k: ({"dataset": v} if isinstance(v, str) else v) for k, v in _legacy_to_new_mapping_raw.items()
21-
}
22-
23-
24-
@lru_cache
25-
def _warn_old_names():
26-
log.warning(
27-
"The usage of old names is deprecated and support will be dropped in future versions! Update your runcards"
28-
)
2911

30-
31-
@lru_cache
32-
def legacy_to_new_map(dataset_name, sys=None):
33-
"""Find the new dataset name and variant corresponding to an old dataset
34-
and systematics choice"""
35-
if dataset_name not in legacy_to_new_mapping:
36-
return dataset_name, None
37-
38-
new_name = legacy_to_new_mapping[dataset_name]
39-
variant = new_name.get("variant")
40-
new_name = new_name["dataset"]
41-
if sys is not None:
42-
if variant is None:
43-
raise KeyError(
44-
f"I cannot translate the combination of {dataset_name} and sys: {sys}. Please report this."
45-
)
46-
variant += f"_{sys}"
47-
48-
# Deprecation notice
49-
_warn_old_names()
50-
warn_text = f"Please change {dataset_name} to {new_name}"
12+
def load_dataset_metadata(dataset_name, variant=None):
    """Given a dataset name, return the metadata"""
    # Compatibility with old nnpdf names, these two lines
    # might disappear at any given point
    if variant is None:
        dataset_name, variant = legacy_to_new_map(dataset_name)

    # New-style names are <setname>_<observable>; the metadata for the whole
    # set lives in a single metadata.yaml next to the data files.
    setname, observable = dataset_name.rsplit("_", 1)
    metadata_file = path_commondata / setname / "metadata.yaml"
    return parse_new_metadata(metadata_file, observable, variant=variant)

0 commit comments

Comments
 (0)