Naming: pfc to pcf #499

Merged 1 commit on Jan 28, 2024
26 changes: 13 additions & 13 deletions oml/functional/metrics.py
@@ -109,25 +109,25 @@ def calc_retrieval_metrics(
return metrics


-def calc_topological_metrics(embeddings: Tensor, pfc_variance: Tuple[float, ...]) -> TMetricsDict:
+def calc_topological_metrics(embeddings: Tensor, pcf_variance: Tuple[float, ...]) -> TMetricsDict:
"""
Function to evaluate different topological metrics.

Args:
embeddings: Embeddings matrix with the shape of ``[n_embeddings, embeddings_dim]``.
-pfc_variance: Values in range [0, 1]. Find the number of components such that the amount
+pcf_variance: Values in range [0, 1]. Find the number of components such that the amount
of variance that needs to be explained is greater than the percentage specified
-by ``pfc_variance``.
+by ``pcf_variance``.

Returns:
Metrics dictionary.

"""
metrics: TMetricsDict = dict()

-if pfc_variance:
-    main_components = calc_pcf(embeddings, pfc_variance)
-    metrics["pcf"] = dict(zip(pfc_variance, main_components))
+if pcf_variance:
+    main_components = calc_pcf(embeddings, pcf_variance)
+    metrics["pcf"] = dict(zip(pcf_variance, main_components))

return metrics

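A usage note (not part of the diff): a minimal sketch of calling the renamed API, assuming calc_topological_metrics is importable from oml.functional.metrics as the file path above suggests; the embeddings are random placeholders.

```python
import torch

from oml.functional.metrics import calc_topological_metrics

embeddings = torch.randn(128, 64)  # hypothetical [n_embeddings, embeddings_dim] matrix
metrics = calc_topological_metrics(embeddings, pcf_variance=(0.5, 0.9, 0.99))

# metrics["pcf"] maps each requested variance threshold to the fraction of
# principal components needed to explain it.
print(metrics["pcf"])
```

Callers that still pass the old keyword pfc_variance will fail with a TypeError, which is why every call site and config below is updated in the same commit.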
@@ -484,16 +484,16 @@ def calc_fnmr_at_fmr(pos_dist: Tensor, neg_dist: Tensor, fmr_vals: Tuple[float,
return fnmr_at_fmr


-def calc_pcf(embeddings: Tensor, pfc_variance: Tuple[float, ...]) -> List[Tensor]:
+def calc_pcf(embeddings: Tensor, pcf_variance: Tuple[float, ...]) -> List[Tensor]:
"""
Function estimates the Principal Components Fraction (PCF) of embeddings using Principal Component Analysis.
The metric is defined as a fraction of components needed to explain the required variance in data.

Args:
embeddings: Embeddings matrix with the shape of ``[n_embeddings, embeddings_dim]``.
-pfc_variance: Values in range [0, 1]. Find the number of components such that the amount
+pcf_variance: Values in range [0, 1]. Find the number of components such that the amount
of variance that needs to be explained is greater than the fraction specified
-by ``pfc_variance``.
+by ``pcf_variance``.
Returns:
List of linear dimensions as fractions of the embeddings dimension.

@@ -527,22 +527,22 @@ def calc_pcf(embeddings: Tensor, pfc_variance: Tuple[float, ...]) -> List[Tensor
because the number of principal axes is superior to the desired explained variance threshold).

>>> embeddings = torch.eye(4, 10, dtype=torch.float)
->>> calc_pcf(embeddings, pfc_variance=(0.5, 1))
+>>> calc_pcf(embeddings, pcf_variance=(0.5, 1))
tensor([0.2000, 0.5000])

"""
# The code below mirrors code from scikit-learn repository:
# https://github.com/scikit-learn/scikit-learn/blob/f3f51f9b6/sklearn/decomposition/_pca.py#L491
-_check_if_in_range(pfc_variance, 0, 1, "pfc_variance")
+_check_if_in_range(pcf_variance, 0, 1, "pcf_variance")
try:
pca = PCA(embeddings)
-n_components = pca.calc_principal_axes_number(pfc_variance).to(embeddings)
+n_components = pca.calc_principal_axes_number(pcf_variance).to(embeddings)
metric = n_components / embeddings.shape[1]
except Exception:
# Mostly we handle the following error here:
# >>> The algorithm failed to converge because the input matrix is ill-conditioned
# >>> or has too many repeated singular values
-metric = [torch.tensor(float("nan"))] * len(pfc_variance)
+metric = [torch.tensor(float("nan"))] * len(pcf_variance)

return metric

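To make the metric concrete, here is a dependency-free sketch of the same idea (centre the embeddings, take singular values, accumulate the explained-variance ratio, count components). It mirrors the logic of calc_pcf above but is not OML's implementation and omits the NaN fallback shown in the except branch.

```python
from typing import Tuple

import torch


def pcf_sketch(embeddings: torch.Tensor, pcf_variance: Tuple[float, ...]) -> torch.Tensor:
    # Centre the embeddings and take their singular values.
    centered = embeddings - embeddings.mean(dim=0)
    singular_values = torch.linalg.svdvals(centered)
    # Squared singular values, normalised, give the explained-variance ratio per component.
    explained = singular_values.pow(2) / singular_values.pow(2).sum()
    ratio_cumsum = torch.cumsum(explained, dim=0)
    # Count the components needed to exceed each threshold, as a fraction of the embedding dim.
    n_components = torch.searchsorted(ratio_cumsum, torch.tensor(pcf_variance), side="right") + 1
    return n_components.float() / embeddings.shape[1]


print(pcf_sketch(torch.eye(4, 10, dtype=torch.float), pcf_variance=(0.5, 1.0)))
```

Unlike this sketch, the library version handles ill-conditioned inputs by returning NaNs, as the except branch above shows.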
10 changes: 5 additions & 5 deletions oml/metrics/embeddings.py
@@ -76,7 +76,7 @@ def __init__(
precision_top_k: Tuple[int, ...] = (5,),
map_top_k: Tuple[int, ...] = (5,),
fmr_vals: Tuple[float, ...] = tuple(),
-pfc_variance: Tuple[float, ...] = (0.5,),
+pcf_variance: Tuple[float, ...] = (0.5,),
categories_key: Optional[str] = None,
sequence_key: Optional[str] = None,
postprocessor: Optional[IDistancesPostprocessor] = None,
@@ -102,9 +102,9 @@ def __init__(
and ``fnmr@fmr=0.4``.
Note, computing this metric requires additional memory overhead,
that is why it's turned off by default.
-pfc_variance: Values in range [0, 1]. Find the number of components such that the amount
+pcf_variance: Values in range [0, 1]. Find the number of components such that the amount
of variance that needs to be explained is greater than the percentage specified
-by ``pfc_variance``.
+by ``pcf_variance``.
categories_key: Key to take the samples' categories from the batches (if you have ones)
sequence_key: Key to take sequence ids from the batches (if you have ones)
postprocessor: Postprocessor which applies some techniques like query reranking
@@ -124,7 +124,7 @@ def __init__(
self.precision_top_k = precision_top_k
self.map_top_k = map_top_k
self.fmr_vals = fmr_vals
-self.pfc_variance = pfc_variance
+self.pcf_variance = pcf_variance

self.categories_key = categories_key
self.sequence_key = sequence_key
@@ -205,7 +205,7 @@ def compute_metrics(self) -> TMetricsDict_ByLabels:  # type: ignore
"map_top_k": self.map_top_k,
"fmr_vals": self.fmr_vals,
}
-args_topological_metrics = {"pfc_variance": self.pfc_variance}
+args_topological_metrics = {"pcf_variance": self.pcf_variance}

metrics: TMetricsDict_ByLabels = dict()

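Downstream code that builds the metrics calculator has to switch keywords as well. A minimal sketch, assuming the class defined in oml/metrics/embeddings.py is EmbeddingMetrics (the class name is not visible in this diff) and that the keywords omitted here have defaults:

```python
from oml.metrics.embeddings import EmbeddingMetrics  # class name assumed, not shown in the diff

calculator = EmbeddingMetrics(
    precision_top_k=(5,),
    map_top_k=(5,),
    fmr_vals=tuple(),         # empty tuple: skip fnmr@fmr and its extra memory overhead
    pcf_variance=(0.5, 0.9),  # renamed keyword; the old pfc_variance spelling no longer exists
)
```

compute_metrics() then forwards pcf_variance to calc_topological_metrics through args_topological_metrics, as the hunk above shows.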
10 changes: 5 additions & 5 deletions oml/utils/misc_torch.py
@@ -406,15 +406,15 @@ def inverse_transform(self, embeddings: torch.Tensor) -> torch.Tensor:
self._check_dimensions(n_components)
return torch.matmul(embeddings, self.components[:n_components, :]) + self.mean

-def calc_principal_axes_number(self, pfc_variance: Tuple[float, ...]) -> torch.Tensor:
+def calc_principal_axes_number(self, pcf_variance: Tuple[float, ...]) -> torch.Tensor:
"""
Function estimates the number of principal axes that are required to explain the `explained_variance_ths`
variance.

Args:
-pfc_variance: Values in range [0, 1]. Find the number of components such that the amount
+pcf_variance: Values in range [0, 1]. Find the number of components such that the amount
of variance that needs to be explained is greater than the fraction specified
-by ``pfc_variance``.
+by ``pcf_variance``.
Returns:
List of the numbers of principal axes.

@@ -437,12 +437,12 @@ def calc_principal_axes_number(self, pfc_variance: Tuple[float, ...]) -> torch.T

>>> embeddings = torch.eye(4, 10, dtype=torch.float)
>>> pca = PCA(embeddings)
->>> pca.calc_principal_axes_number(pfc_variance=(0.5, 1))
+>>> pca.calc_principal_axes_number(pcf_variance=(0.5, 1))
tensor([2, 5])

"""
ratio_cumsum = torch.cumsum(self.explained_variance_ratio, dim=0)
-n_components = torch.searchsorted(ratio_cumsum, torch.tensor(pfc_variance), side="right") + 1
+n_components = torch.searchsorted(ratio_cumsum, torch.tensor(pcf_variance), side="right") + 1
return n_components

def _check_dimensions(self, n_components: int) -> None:
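A worked example of the counting line above, using made-up explained-variance ratios (the numbers are not from the PR):

```python
import torch

# Suppose three principal axes explain 60%, 30% and 10% of the variance.
ratio_cumsum = torch.tensor([0.6, 0.9, 1.0])
pcf_variance = (0.5, 0.9, 0.99)

n_components = torch.searchsorted(ratio_cumsum, torch.tensor(pcf_variance), side="right") + 1
print(n_components)  # tensor([1, 3, 3])
```

With side="right" plus one, the result is the smallest number of axes whose cumulative ratio strictly exceeds the threshold: one axis already covers 50%, while hitting 0.9 exactly pushes the count to three. The edge case mentioned in the docstring above stems from this same convention.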
(file path not shown)
@@ -38,7 +38,7 @@ metric_args:
metrics_to_exclude_from_visualization: [cmc,]
cmc_top_k: [1]
map_top_k: [5]
-pfc_variance: [0.5, 0.9, 0.99]
+pcf_variance: [0.5, 0.9, 0.99]
return_only_overall_category: True
visualize_only_overall_category: True

(file path not shown)
@@ -43,7 +43,7 @@ metric_args:
cmc_top_k: [1]
map_top_k: [5]
fmr_vals: [1]
-pfc_variance: [0.5, 0.9, 0.99]
+pcf_variance: [0.5, 0.9, 0.99]
return_only_overall_category: True
visualize_only_overall_category: True

(file path not shown)
@@ -41,7 +41,7 @@ metric_args:
cmc_top_k: [1] # to calculate cmc@1
map_top_k: [5] # to calculate map@5
fmr_vals: [0.01] # to calculate [email protected]
-pfc_variance: [0.5, 0.9, 0.99] # to calculate pfc@0.5, pfc@0.9, pfc@0.99
+pcf_variance: [0.5, 0.9, 0.99] # to calculate pcf@0.5, pcf@0.9, pcf@0.99
return_only_overall_category: True # set False if you want to see metric graphs for all the categories (doesn't matter for CARS, since it contains no categories)
visualize_only_overall_category: True # set False to see images where the model performed worse for each separated category (doesn't matter for CARS, since it contains no categories)

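Why the YAML keys must change too: metric_args is presumably unpacked into the metrics constructor as keyword arguments (an assumption; the pipeline code is not part of this diff), so an un-migrated pfc_variance entry would fail at construction time rather than being silently ignored. A self-contained illustration with a stand-in constructor:

```python
def make_metrics(cmc_top_k=(1,), map_top_k=(5,), fmr_vals=(), pcf_variance=(0.5,)):
    # Stand-in for the real metrics constructor: it only knows the new keyword.
    return dict(cmc_top_k=cmc_top_k, map_top_k=map_top_k, fmr_vals=fmr_vals, pcf_variance=pcf_variance)


migrated = {"cmc_top_k": (1,), "map_top_k": (5,), "fmr_vals": (0.01,), "pcf_variance": (0.5, 0.9, 0.99)}
make_metrics(**migrated)  # works with the renamed key

stale = dict(migrated)
stale["pfc_variance"] = stale.pop("pcf_variance")  # simulate an un-migrated config
# make_metrics(**stale)  # TypeError: make_metrics() got an unexpected keyword argument 'pfc_variance'
```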
2 changes: 1 addition & 1 deletion pipelines/features_extraction/extractor_cars/val_cars.yaml
@@ -27,7 +27,7 @@ metric_args:
map_top_k: [5]
precision_top_k: [5]
fmr_vals: [0.01]
-pfc_variance: [0.5, 0.9, 0.99]
+pcf_variance: [0.5, 0.9, 0.99]
return_only_overall_category: False
visualize_only_overall_category: True

2 changes: 1 addition & 1 deletion pipelines/features_extraction/extractor_cub/train_cub.yaml
@@ -39,7 +39,7 @@ metric_args:
cmc_top_k: [1]
map_top_k: [5]
fmr_vals: [0.01]
-pfc_variance: [0.5, 0.9, 0.99]
+pcf_variance: [0.5, 0.9, 0.99]
return_only_overall_category: True
visualize_only_overall_category: True

2 changes: 1 addition & 1 deletion pipelines/features_extraction/extractor_cub/val_cub.yaml
@@ -28,7 +28,7 @@ metric_args:
map_top_k: [5]
fmr_vals: [0.01]
precision_top_k: [5]
-pfc_variance: [0.5, 0.9, 0.99]
+pcf_variance: [0.5, 0.9, 0.99]
return_only_overall_category: False
visualize_only_overall_category: True

(file path not shown)
@@ -43,7 +43,7 @@ metric_args:
cmc_top_k: [1]
map_top_k: [5]
fmr_vals: [] # Since InShop is a big dataset you should be careful with increasing of the memory footprint, which is needed to calculate fmr
-pfc_variance: [0.5, 0.9, 0.99]
+pcf_variance: [0.5, 0.9, 0.99]
return_only_overall_category: True
visualize_only_overall_category: True

(file path not shown)
@@ -28,7 +28,7 @@ metric_args:
cmc_top_k: [1, 5]
map_top_k: [5]
precision_top_k: [5]
-pfc_variance: [0.5, 0.9, 0.99]
+pcf_variance: [0.5, 0.9, 0.99]
return_only_overall_category: True
visualize_only_overall_category: True

2 changes: 1 addition & 1 deletion pipelines/features_extraction/extractor_sop/train_sop.yaml
@@ -43,7 +43,7 @@ metric_args:
cmc_top_k: [1]
map_top_k: [5]
fmr_vals: [] # Since SOP is a big dataset you should be careful with increasing of the memory footprint, which is needed to calculate fmr
-pfc_variance: [0.5, 0.9, 0.99]
+pcf_variance: [0.5, 0.9, 0.99]
return_only_overall_category: True
visualize_only_overall_category: True

2 changes: 1 addition & 1 deletion pipelines/features_extraction/extractor_sop/val_sop.yaml
@@ -26,7 +26,7 @@ metric_args:
metrics_to_exclude_from_visualization: [cmc,]
cmc_top_k: [1, 5]
map_top_k: [5]
-pfc_variance: [0.5, 0.9, 0.99]
+pcf_variance: [0.5, 0.9, 0.99]
return_only_overall_category: False
visualize_only_overall_category: True

(file path not shown)
@@ -40,7 +40,7 @@ metric_args:
cmc_top_k: [1, 10, 20, 30, 100]
map_top_k: [5, 10]
fmr_vals: []
-pfc_variance: []
+pcf_variance: []
return_only_overall_category: True
visualize_only_overall_category: True

(file path not shown)
@@ -29,7 +29,7 @@ metric_args:
cmc_top_k: [1, 10, 20, 30, 100]
map_top_k: [5, 10]
precision_top_k: []
-pfc_variance: []
+pcf_variance: []
return_only_overall_category: True
visualize_only_overall_category: True

(file path not shown)
@@ -105,7 +105,7 @@ metric_args:
cmc_top_k: [1, 10, 20, 30, 100]
map_top_k: [5, 10]
fmr_vals: []
-pfc_variance: []
+pcf_variance: []
return_only_overall_category: True
visualize_only_overall_category: True

(file path not shown)
@@ -58,7 +58,7 @@ metric_args:
map_top_k: [5, 10]
fmr_vals: []
precision_top_k: []
-pfc_variance: []
+pcf_variance: []
return_only_overall_category: True
visualize_only_overall_category: True

(file path not shown)
@@ -19,7 +19,7 @@ def eye_case() -> Tuple[torch.Tensor, TMetricsDict]:

def test_calc_topological_metrics(eye_case: Tuple[torch.Tensor, TMetricsDict]) -> None:
embeddings, metrics_expected = eye_case
-args = {"pfc_variance": tuple(metrics_expected["pcf"].keys())}
+args = {"pcf_variance": tuple(metrics_expected["pcf"].keys())}
metrics_evaluated = calc_topological_metrics(embeddings, **args)
compare_dicts_recursively(metrics_evaluated, metrics_expected)

@@ -31,12 +31,12 @@ def test_calc_functions(
metric_func: Callable[[torch.Tensor, Tuple[int, ...]], torch.Tensor],
) -> None:
embeddings, metrics_expected = eye_case
-pfc_variance = tuple(metrics_expected[metric_name].keys())
-kwargs = {"embeddings": embeddings, "pfc_variance": pfc_variance}
+pcf_variance = tuple(metrics_expected[metric_name].keys())
+kwargs = {"embeddings": embeddings, "pcf_variance": pcf_variance}

kwargs = remove_unused_kwargs(kwargs, metric_func)
main_components_percentage = metric_func(**kwargs) # type: ignore
-metrics_calculated = dict(zip(pfc_variance, main_components_percentage))
+metrics_calculated = dict(zip(pcf_variance, main_components_percentage))
for p in metrics_expected[metric_name].keys():
values_expected = metrics_expected[metric_name][p]
values_calculated = metrics_calculated[p]
4 changes: 2 additions & 2 deletions tests/test_oml/test_metrics/test_embedding_metrics.py
@@ -163,7 +163,7 @@ def run_retrieval_metrics(case) -> None:  # type: ignore
precision_top_k=tuple(),
map_top_k=tuple(),
fmr_vals=tuple(),
-pfc_variance=tuple(),
+pcf_variance=tuple(),
postprocessor=get_trivial_postprocessor(top_n=2),
)

@@ -199,7 +199,7 @@ def run_across_epochs(case1, case2) -> None:  # type: ignore
precision_top_k=tuple(),
map_top_k=tuple(),
fmr_vals=tuple(),
-pfc_variance=tuple(),
+pcf_variance=tuple(),
postprocessor=get_trivial_postprocessor(top_n=3),
)
