-
Notifications
You must be signed in to change notification settings - Fork 9
Unweighting implementation #2083
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 3 commits
c7e687b
e6d3c51
2fd2f04
5dbadcd
9c6feab
bb3598b
2e51982
71271be
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
import numpy as np | ||
import pandas as pd | ||
from scipy.special import xlogy | ||
from typing import Tuple | ||
from tqdm import tqdm | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not too fond of tqdm because it usually makes logs and debugging harder to read. |
||
import argparse | ||
|
||
class Unweight: | ||
def __init__(self, weights: np.ndarray, *chis: Tuple[int, np.ndarray]) -> None: | ||
""" | ||
Initialize the Unweight class. | ||
|
||
Args: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We are trying to follow the numpy conventions https://numpydoc.readthedocs.io/en/latest/format.html#parameters at least for new code (and specially for parameters, returns, etc). It helps consistency. |
||
weights (np.ndarray): Array of weights. | ||
*chis (Tuple[int, np.ndarray]): Variable number of tuples containing power `n` and `chi` array. | ||
|
||
Raises: | ||
AssertionError: If lengths of `chi` arrays are not consistent. | ||
""" | ||
self.chis = chis | ||
|
||
length = len(chis[0][1]) | ||
for chi in chis: | ||
assert len(chi[1]) == length, "Not all chis have the same length!" | ||
|
||
if weights is None: | ||
self.weights = np.ones(length) | ||
else: | ||
self.weights = weights | ||
|
||
def entropy(self, p1, p2): | ||
""" | ||
Calculate the entropy between two probability distributions. | ||
|
||
Args: | ||
p1 (np.ndarray): Probability distribution 1. | ||
p2 (np.ndarray): Probability distribution 2. | ||
|
||
Returns: | ||
float: Entropy value. | ||
""" | ||
log = xlogy(p1, p1/p2) | ||
log[np.isnan(log)] = 0 | ||
entropy = np.sum(log) | ||
return entropy | ||
|
||
def reweight(self, i: int = 0, thresh: float = 1e-12) -> None: | ||
""" | ||
Perform reweighting. | ||
|
||
Args: | ||
i (int): Index of the chi array to reweight. | ||
thresh (float, optional): Threshold value for setting small weights to zero. Defaults to 1e-12. | ||
""" | ||
|
||
n, chi = self.chis[i] | ||
exp = (n-1)*np.log(chi) - 1/2*np.power(chi,2.0) | ||
self.reweights = np.exp(exp - np.mean(exp)) | ||
self.reweights = len(self.reweights)*self.reweights/np.sum(self.reweights) | ||
self.reweights[self.reweights <= thresh] = 0 | ||
self.reprobs = self.reweights/len(self.reweights) | ||
|
||
def unweight(self, Np: int) -> None: | ||
""" | ||
Perform unweighting. | ||
|
||
Args: | ||
Np (int): Number of points. | ||
""" | ||
pcum = np.zeros(len(self.reweights) + 1) | ||
pcum[1:] = np.cumsum(self.reprobs) | ||
unweights = np.zeros(len(self.reweights), dtype="int") | ||
for k in range(len(self.reweights)): | ||
for j in range(Np): | ||
condition_one = j/Np - pcum[k] >= 0 | ||
condition_two = pcum[k+1] - j/Np >= 0 | ||
if condition_one and condition_two: | ||
unweights[k] += 1 | ||
|
||
self.unweights = unweights | ||
self.unprobs = unweights/np.sum(unweights) | ||
|
||
def effective_replicas(self, weights: np.ndarray, thresh: float = 1e-12) -> int: | ||
""" | ||
Calculate the effective number of replicas. | ||
|
||
Args: | ||
weights (np.ndarray): Array of weights. | ||
thresh (float, optional): Threshold value neglecting small weights. Defaults to 1e-12. | ||
|
||
Returns: | ||
int: Effective number of replicas. | ||
""" | ||
N = len(weights) | ||
weights = weights[weights > thresh] | ||
Neff = int(np.exp(-1/N*np.sum(xlogy(weights,weights/N)))) | ||
return Neff | ||
|
||
def optimize(self, thresh: float, earlystopping: bool = True): | ||
""" | ||
Optimize the unweighting process based on entropy threshold. | ||
|
||
Args: | ||
thresh (float): Entropy threshold value. | ||
earlystopping (bool, optional): Whether to stop optimization early if threshold is reached. Defaults to True. | ||
|
||
Returns: | ||
Tuple[np.ndarray, np.ndarray, int]: Tuple containing arrays of Nps, entropies, and optimal Np value. | ||
""" | ||
Nps = np.logspace(1, np.log10(len(self.weights))+1, 50, dtype=np.int64) | ||
entropies = np.zeros(len(Nps)) | ||
for i in tqdm(range(len(Nps))): | ||
self.unweight(Nps[i]) | ||
entropies[i] = self.entropy(self.unprobs, self.reprobs) | ||
if entropies[i] <= thresh and earlystopping: | ||
loc = i | ||
break | ||
|
||
if i == len(Nps)-1: | ||
try: | ||
loc = np.where(entropies <= thresh)[0][0] | ||
except: | ||
print("Failed minimisation procedure! Defaulting to lowest entropy.") | ||
loc = -1 | ||
|
||
Nopt = Nps[loc] | ||
|
||
return Nps, entropies, Nopt | ||
|
||
def main(chi2, N, store = True): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add also here some doctrings explaining what it does. |
||
u = Unweight(None, (N, np.sqrt(chi2))) | ||
u.reweight() | ||
Neff = u.effective_replicas(u.reweights) | ||
u.unweight(Neff) | ||
|
||
weights = pd.DataFrame() | ||
weights["unweight"] = u.unweights | ||
weights["reweight"] = u.reweights | ||
weights["nrep"] = np.arange(1, len(weights)+1) | ||
weights = weights.set_index("nrep", drop = True) | ||
|
||
if store: | ||
weights.to_csv("weights.csv") | ||
|
||
return weights | ||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser(description="Unweighting using chi squared values.") | ||
parser.add_argument("chi2_name", help = "Add the filename of the chi2 dataset (.csv)") | ||
parser.add_argument("N", help = "Add the amount of experimental datapoints that the chi2 is based on") | ||
args = parser.parse_args() | ||
chi2 = pd.read_csv(args.chi2_name).values | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How is the csv generated? This should be part of the script. Ideally the script should have two steps
In the validphys language these would be two actions, where the second depends on the first, but we can make it into proper validphys actions at the end, it can remain a script for the time being. For the first action you can use a lot of vp stuff (e.g., functions to compute the chi2, or load pdfs). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This I could not find a better solution for. The calculation of the chi2 per replica is done in the nnpdf/external/ poljet code and if I would shift the calculation of the chi2 to validphys, I'm afraid I must migrate more functionality. However, I don't think the solution will be more elegant. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, you are already using validphys in that code. So a first stage can be a simple port, and then later on instead of doing |
||
|
||
main(chi2, args.N) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why did you disable versioning?
(it needs to be reenabled, but i'd like to know the exact problem that led to needing to disable it)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this happened because I built NNPDF locally. I reverted it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But in principle you should be able to install with
pip install -e .
(or without-e
) and it should not modify this file.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Strange. It did happen in my case...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is your environment managed by poetry?