Unweighting implementation #2083

Draft: wants to merge 8 commits into master
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry_dynamic_versioning.backend"

[tool.poetry]
name = "nnpdf"
version = "0.0.0"
version = "4.0.9.post745.dev0+c7e687b31.dirty"
description = "An open-source machine learning framework for global analyses of parton distributions."
readme = "README.md"
authors = [
@@ -47,6 +47,7 @@ postfit = "validphys.scripts.postfit:main"
# validphys helpers and scripts
vp-upload = "validphys.scripts.vp_upload:main"
wiki-upload = "validphys.scripts.wiki_upload:main"
+vp-unweight = "validphys.scripts.vp_unweight:main"
vp-get = "validphys.scripts.vp_get:main"
vp-comparefits = "validphys.scripts.vp_comparefits:main"
vp-fitrename = "validphys.scripts.vp_fitrename:main"
4 changes: 2 additions & 2 deletions validphys2/src/validphys/commondataparser.py
@@ -857,7 +857,7 @@ def load_commondata_new(metadata):
# the uncertainties
uncertainties_df = metadata.load_uncertainties()
# and the kinematics
-kin_df = metadata.load_kinematics()
+kin_df = metadata.load_kinematics(drop_minmax=False)

# Once we have loaded all uncertainty files, let's check how many sys we have
nsys = len(
@@ -915,7 +915,7 @@ def load_commondata_new(metadata):
setname=metadata.name,
ndata=metadata.ndata,
commondataproc=procname,
-nkin=3,
+nkin=9,
nsys=nsys,
commondata_table=commondata_table,
systype_table=systype_table,
12 changes: 11 additions & 1 deletion validphys2/src/validphys/coredata.py
@@ -13,7 +13,17 @@
from reportengine.compat import yaml
from validphys.utils import generate_path_filtered_data

KIN_NAMES = ["kin1", "kin2", "kin3"]
KIN_NAMES = [
"kin1min",
"kin1mid",
"kin1max",
"kin2min",
"kin2mid",
"kin2max",
"kin3min",
"kin3mid",
"kin3max",
]
log = logging.getLogger(__name__)


233 changes: 233 additions & 0 deletions validphys2/src/validphys/scripts/vp_unweight.py
@@ -0,0 +1,233 @@
import argparse
from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.special import xlogy


class Unweight:
def __init__(self, weights: np.ndarray, *chis: Tuple[int, np.ndarray]) -> None:
"""
Initialize the Unweight class.

Parameters
----------
weights : np.ndarray
Array of weights.
*chis : Tuple[int, np.ndarray]
Variable number of tuples, each containing a power `n` and a `chi` array.

Raises
------
AssertionError
If lengths of `chi` arrays are not consistent.
"""
self.chis = chis

length = len(chis[0][1])
for chi in chis:
assert len(chi[1]) == length, "Not all chis have the same length!"

if weights is None:
self.weights = np.ones(length)
else:
self.weights = weights

def entropy(self, p1: np.ndarray, p2: np.ndarray) -> float:
"""
Calculate the relative entropy (Kullback-Leibler divergence) between two probability distributions.

Parameters
----------
p1 : np.ndarray
Probability distribution 1.
p2 : np.ndarray
Probability distribution 2.

Returns
-------
float
Entropy value.
"""
log = xlogy(p1, p1 / p2)
log[np.isnan(log)] = 0
entropy = np.sum(log)
return entropy

def reweight(self, i: int = 0, thresh: float = 1e-12) -> None:
"""
Perform reweighting.

Parameters
----------
i : int, optional
Index of the chi array to reweight.
thresh : float, optional
Threshold value for setting small weights to zero. Defaults to 1e-12.
"""

n, chi = self.chis[i]
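# Bayesian reweighting: w_k ∝ chi_k^(n-1) * exp(-chi_k^2 / 2), evaluated in
# log space and recentred on the mean exponent for numerical stability.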
exp = (n - 1) * np.log(chi) - 1 / 2 * np.power(chi, 2.0)
self.reweights = np.exp(exp - np.mean(exp))
self.reweights = len(self.reweights) * self.reweights / np.sum(self.reweights)
self.reweights[self.reweights <= thresh] = 0
self.reprobs = self.reweights / len(self.reweights)

def unweight(self, Np: int) -> None:
"""
Perform unweighting.

Parameters
----------
Np : int
Number of points.
"""
pcum = np.zeros(len(self.reweights) + 1)
pcum[1:] = np.cumsum(self.reprobs)
unweights = np.zeros(len(self.reweights), dtype="int")
for k in range(len(self.reweights)):
for j in range(Np):
condition_one = j / Np - pcum[k] >= 0
condition_two = pcum[k + 1] - j / Np >= 0
if condition_one and condition_two:
unweights[k] += 1

self.unweights = unweights
self.unprobs = unweights / np.sum(unweights)

def effective_replicas(self, weights: np.ndarray, thresh: float = 1e-12) -> int:
"""
Calculate the effective number of replicas.

Parameters
----------
weights : np.ndarray
Array of weights.
thresh : float, optional
Threshold value neglecting small weights. Defaults to 1e-12.

Returns
-------
int
Effective number of replicas.
"""
N = len(weights)
weights = weights[weights > thresh]
Neff = int(np.exp(-1 / N * np.sum(xlogy(weights, weights / N))))
return Neff

def iterate(
self, thresh: float = 0, earlystopping: bool = True
) -> Tuple[np.ndarray, np.ndarray, int]:
"""
Iterate the unweighting process based on an entropy threshold.

Parameters
----------
thresh : float, optional
Entropy threshold value. Defaults to 0.
earlystopping : bool, optional
Whether to stop optimization early if threshold is reached. Defaults to True.

Returns
-------
Tuple[np.ndarray, np.ndarray, int]
Tuple containing arrays of Nps, entropies, and optimal Np value.
"""
Nps = np.logspace(0, np.log10(len(self.weights)) + 1, 50, dtype=np.int64)
entropies = np.zeros(len(Nps))
for i in range(len(Nps)):
self.unweight(Nps[i])
entropies[i] = self.entropy(self.unprobs, self.reprobs)
if entropies[i] <= thresh and earlystopping:
loc = i
break

if i == len(Nps) - 1:
try:
loc = np.where(entropies <= thresh)[0][0]
except IndexError:
print("Failed minimisation procedure! Defaulting to lowest entropy.")
loc = -1

Nopt = Nps[loc]

return Nps, entropies, Nopt

def plot_entropy(self, Neff: int) -> None:
"""
Plot the entropy as a function of the new number of replicas.

Parameters
----------
Neff : int
Number of effective replicas.
"""
N, E, _ = self.iterate()
fig = plt.figure()
ax = plt.axes(xscale="log")
ax.axvline(Neff, c="r", linestyle=":")
ax.plot(N, E)
ax.set_xlabel(r"Replica Number $N'_{rep}$", size=18)
ax.set_ylabel(r"Entropy $H$", size=18)
ax.tick_params(axis='x', direction='in', bottom=True, top=True)
ax.tick_params(axis='y', direction='in', left=True, right=True)
ax.minorticks_on()
ax.tick_params(axis='x', which='minor', direction='in', bottom=True, top=True)
ax.tick_params(axis='y', which='minor', direction='in', left=True, right=True)
fig.savefig("Entropy.jpg", bbox_inches='tight', pad_inches=0.2)


def main(chi2: np.ndarray, N: int, store: bool = True, plot_entropy: bool = False) -> pd.DataFrame:
"""
Perform the unweighting process and store the results.

Parameters
----------
chi2 : np.ndarray
Array of chi-squared values.
N : int
Number of experimental data points that the chi2 is based on.
store : bool, optional
Whether to store the resulting weights in a CSV file. Defaults to True.
plot_entropy : bool, optional
Whether to plot and save the entropy as a function of the number of new replicas. Defaults to False.

Returns
-------
pd.DataFrame
DataFrame containing the unweighted and reweighted values.
"""
u = Unweight(None, (N, np.sqrt(chi2)))
u.reweight()
Neff = u.effective_replicas(u.reweights)
u.unweight(Neff)

weights = pd.DataFrame()
weights["unweight"] = u.unweights
weights["reweight"] = u.reweights
weights["nrep"] = np.arange(1, len(weights) + 1)
weights = weights.set_index("nrep", drop=True)

if store:
weights.to_csv("weights.csv")

if plot_entropy:
u.plot_entropy(Neff)

return weights


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Unweighting using chi squared values.")
parser.add_argument("chi2_name", help="Add the filename of the chi2 dataset (.csv)")
parser.add_argument(
"N", type=int, help="Number of experimental data points that the chi2 is based on"
)
parser.add_argument(
"--plot_entropy", action="store_true", help="Call flag to enable entropy plotting."
)
args = parser.parse_args()
chi2 = pd.read_csv(args.chi2_name).values
Member:
How is the csv generated? This should be part of the script.

Ideally the script should have two steps:

  1. Check whether the csv exists and, if it doesn't, generate it

  2. Do the actual computation

In the validphys language these would be two actions, where the second depends on the first. We can turn them into proper validphys actions at the end; it can remain a script for the time being.

For the first action you can use a lot of vp stuff (e.g., functions to compute the chi2 or load PDFs).
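
A minimal sketch of what the first step could look like, using the API call named further below in this thread; the runcard-style values in inp and the prediction/central-value arrays are placeholders, not the actual validphys providers:

import numpy as np
import pandas as pd

from validphys.api import API

inp = {
    "dataset_inputs": [{"dataset": "NMC"}],  # placeholder runcard entry
    "pdf": "NNPDF40_nnlo_as_01180",
    "theoryid": 200,
    "use_cuts": "internal",
}

# Experimental covariance matrix via the API action quoted in this thread.
covmat = API.dataset_inputs_covmat_from_systematics(**inp)
cinv = np.linalg.inv(covmat)

# Placeholder arrays standing in for the per-replica predictions
# (ndata x nrep) and the data central values; in a real port these
# would come from the validphys results machinery instead.
ndata, nrep = covmat.shape[0], 100
rng = np.random.default_rng(0)
preds = rng.normal(size=(ndata, nrep))
central = rng.normal(size=ndata)

# chi2 of each replica against the experimental covariance matrix.
diffs = preds - central[:, None]
chi2 = np.einsum("ik,ij,jk->k", diffs, cinv, diffs)
pd.DataFrame({"chi2": chi2}).to_csv("chi2.csv", index=False)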

Collaborator Author:

I could not find a better solution for this. The calculation of the chi2 per replica is done in the nnpdf/external/ poljet code, and if I were to shift that calculation into validphys, I'm afraid I would have to migrate more functionality. I don't think the resulting solution would be more elegant, however.

Member:

Well, you are already using validphys in that code.

So a first stage can be a simple port. Later on, instead of doing self.l.check_commondata(data_name) or API.dataset_inputs_covmat_from_systematics(**inp), you would have a function that takes results as input; according to the definition of dataset_inputs in the runcard / input, you would get a results object already populated with the dataset, covariance matrix, and statistical / systematic errors.
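
For illustration, the provider described here might look roughly like this; the function name and the attributes on the results objects are assumptions, not the actual validphys API:

import numpy as np

def chi2_per_replica(results):
    # `results` as populated by validphys from the runcard's
    # dataset_inputs: a (data, theory) pair. Attribute names below
    # are assumed for the sketch.
    data_result, theory_result = results
    central = np.asarray(data_result.central_value)
    diffs = np.asarray(theory_result.error_members) - central[:, None]
    cinv = np.linalg.inv(data_result.covmat)
    return np.einsum("ik,ij,jk->k", diffs, cinv, diffs)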


main(chi2, args.N, plot_entropy=args.plot_entropy)
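
For reference, the entry point can also be exercised directly from Python; the filename and N below are placeholders:

import pandas as pd

from validphys.scripts.vp_unweight import main

chi2 = pd.read_csv("chi2.csv").values
weights = main(chi2, 4000, store=True, plot_entropy=False)
print(weights.head())  # columns: unweight, reweight; indexed by nrep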