Merge pull request #8 from fraunhoferportugal/dev
Simplification of Metric Goals Categorization
ivo-facoco authored Oct 29, 2024
2 parents 7d47146 + 50dab99 commit ebd2e4c
Showing 36 changed files with 183 additions and 244 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,22 @@
All notable changes to this project will be documented in this file.
This format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [0.1.2] - 2024-10-28
### Added
- CLI device specification (`--device`)
- CLI compute arguments saved to the reports dir (`config.json`)
- Preprocessing transform in image datasets when using the API

### Fixed
- Moved `sklearn` and `gudhi` dependencies to the main dependency tree

### Changed
- Default image feature extractor is now `vit_b_32`
- Confusing synthetic and input metric goals were aggregated into the `quality`, `privacy`, `annotation` and `utility` categories
- Moved metrics to specific folders based on `metric_group` (feature-based, data-based)


## [0.1.1] - 2024-10-24
### Fixed
- Fixed project configuration conflict between setup.py and pyproject.toml by reverting to poetry as the main build engine
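As a quick illustration of the two CLI-facing changes in the 0.1.2 entry above, a minimal run might look as follows. This is a sketch only: the fixture paths come from this repository's data/test tree, and it assumes --metric_goals accepts the four category names introduced by this release (quality, privacy, annotation, utility).

# Hypothetical invocation combining the new --device flag with the
# simplified metric goal categories (a sketch, not part of this diff):
pymdma --modality tabular \
    --validation_type input \
    --reference_type none \
    --evaluation_level dataset \
    --target_data data/test/tabular/input_val/dataset \
    --output_dir reports/tabular_metrics/ \
    --metric_goals quality privacy \
    --device cpu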
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.1.1
0.1.2
7 changes: 6 additions & 1 deletion notebooks/image_examples.ipynb
@@ -477,6 +477,11 @@
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
@@ -487,7 +492,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
"version": "3.12.7"
}
},
"nbformat": 4,
10 changes: 5 additions & 5 deletions pyproject.toml
@@ -2,7 +2,7 @@
# https://github.com/microsoft/vscode-python/blob/master/CHANGELOG.md#enhancements-1
[tool.poetry]
name = "pymdma"
version = "0.1.1"
version = "0.1.2"
description = "Multimodal Data Metrics for Auditing real and synthetic data"
authors = ["Fraunhofer AICOS <[email protected]>"]
maintainers = [
@@ -39,6 +39,8 @@ pot = {version = ">=0.9.4, <0.10.0"}
pydantic = {version = ">=2.8.2, <2.9.0"}
python-dotenv = {version = ">=1.0.0, <2.0.0"}
torch = {version = ">=2.1.0, <2.5.0"}
gudhi = {version = ">=3.9.0, <=4.0.0"}
scikit-learn = {version = ">1.4.0"}

# Image dependencies
pydom = {git = "https://github.com/umang-singhal/pydom.git", rev = "2554af8d0", optional = true}
@@ -59,11 +61,9 @@ statsmodels = {version = ">=0.14.4, <0.15.0", optional = true}
# sentence-transformers = {version = "^2.2.2", optional = true}
# python-multipart = {version = "0.0.6", optional = true}
# Tabular Dependencies
gudhi = {version = ">=3.9.0, <=4.0.0", optional = true}
numba = {version = ">=0.60.0, <0.80.0", optional = true}
pandas = {version = ">=2.0.0, <3.0.0", optional = true}
pycanon = {version = "1.0.1.post2", optional = true}
scikit-learn = {version = ">1.4.0", optional = true}
scipy = {version = ">=1.6.0, <2.0.0", optional = true}
spacy = {version = ">=3.7.4, <4.0.0", optional = true}
transformers = {version = ">=4.43.2, <5.0.0", optional = true}
@@ -73,12 +73,12 @@ word2number = {version = ">=1.1.0, <1.5.0", optional = true}

[tool.poetry.extras]
image = ["pydom", "torchvision", "torchmetrics", "pycocotools", "opencv-python", "torch-fidelity"]
tabular = ["gudhi", "numba", "pandas", "pycanon", "scikit-learn", "scipy", "spacy", "transformers", "umap-learn", "word2number", "statsmodels"]
tabular = ["numba", "pandas", "pycanon", "scipy", "spacy", "transformers", "umap-learn", "word2number", "statsmodels"]
time_series = ["tsfel", "wfdb", "statsmodels"]
# text = ["accelerate", "datasets", "nltk", "sentence-transformers", "transformers", "python-multipart"]
all = [
"pydom", "torchvision", "torchmetrics", "pycocotools", "opencv-python", "torch-fidelity",
"gudhi", "numba", "pandas", "pycanon", "scikit-learn", "scipy", "spacy", "transformers", "umap-learn", "word2number",
"numba", "pandas", "pycanon", "scipy", "spacy", "transformers", "umap-learn", "word2number",
"tsfel", "wfdb", "statsmodels"
# "accelerate", "datasets", "nltk", "sentence-transformers", "transformers", "python-multipart"
]
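Because gudhi and scikit-learn moved from the tabular extra into the main dependency tree, a plain install now pulls them in. A sketch, assuming the package is published under the name declared in [tool.poetry] above:

pip install pymdma                 # gudhi and scikit-learn now come with the base install
pip install "pymdma[tabular]"      # the tabular extra no longer needs to list them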
6 changes: 0 additions & 6 deletions requirements/requirements-text.txt

This file was deleted.

43 changes: 19 additions & 24 deletions scripts/image_metrics.sh
@@ -1,27 +1,22 @@
#!/bin/bash

pymdma \
--modality image \
--validation_type input \
--reference_type none \
--evaluation_level instance \
--target_data data/test/image/input_val/dataset \
--reference_data data/test/image/input_val/reference \
--batch_size 3\
--output_dir reports/image_metrics/ \
--annotation_file data/test/image/input_val/annotations/COCO_annotation_example_mask_exp.json

pymdma --modality image \
--validation_type synth \
--reference_type dataset \
--evaluation_level dataset \
--reference_data data/test/image/synthesis_val/reference \
--target_data data/test/image/synthesis_val/dataset \
--batch_size 3\
--metric_group feature \
--output_dir reports/image_metrics/ \
# --extractor_model_name inception_v3

# python3 src/main.py \
# --modality image \
# --validation_type input \
# --reference_type none \
# --evaluation_level instance \
# --target_data data/test/image/input_val/dataset \
# --reference_data data/test/image/input_val/reference \
# --batch_size 3\
# --output_dir reports/image_metrics/ \
# --metric_group quality \
# --annotation_file data/test/image/input_val/annotations/COCO_annotation_example_mask_exp.json
# --extractor_model_name inception
# --reference_data data/test/image/synthesis_val/reference \
pymdma \
--modality image \
--validation_type synth \
--reference_type dataset \
--evaluation_level dataset \
--target_data data/test/image/synthesis_val/dataset \
--reference_data data/test/image/synthesis_val/reference \
--batch_size 3\
--output_dir reports/image_metrics/ \
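The rewritten script drops the explicit --metric_group feature flag and leaves the extractor choice to the new vit_b_32 default noted in the changelog. A hedged sketch of overriding that default, reusing the flag from the commented line above and assuming inception_v3 remains a valid extractor name:

pymdma --modality image \
    --validation_type synth \
    --reference_type dataset \
    --evaluation_level dataset \
    --target_data data/test/image/synthesis_val/dataset \
    --reference_data data/test/image/synthesis_val/reference \
    --output_dir reports/image_metrics/ \
    --extractor_model_name inception_v3   # override the vit_b_32 default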
18 changes: 8 additions & 10 deletions scripts/tabular_metrics.sh
@@ -7,14 +7,12 @@ pymdma --modality tabular \
--target_data data/test/tabular/input_val/dataset \
--batch_size 1\
--output_dir reports/tabular_metrics/ \
--metric_group quality privacy

# pymdma --modality tabular \
# --validation_type synth \
# --evaluation_level dataset \
# --reference_type dataset \
# --reference_data data/test/tabular/synthesis_val/reference \
# --target_data data/test/tabular/input_val/dataset \
# --batch_size 1\
# --output_dir reports/tabular_metrics/ \
# --metric_group feature data
pymdma --modality tabular \
--validation_type synth \
--evaluation_level dataset \
--reference_type dataset \
--reference_data data/test/tabular/synthesis_val/reference \
--target_data data/test/tabular/input_val/dataset \
--batch_size 1\
--output_dir reports/tabular_metrics/ \
20 changes: 9 additions & 11 deletions scripts/time_series_metrics.sh
@@ -1,19 +1,17 @@
#!/bin/bash

# pymdma --modality time_series \
# --validation_type synth \
# --evaluation_level dataset \
# --reference_type dataset \
# --target_data data/test/time_series/synthesis_val/dataset \
# --reference_data data/test/time_series/synthesis_val/reference \
# --batch_size 2\
# --output_dir reports/tabular_metrics/ \
# --metric_group feature
pymdma --modality time_series \
--validation_type synth \
--evaluation_level dataset \
--reference_type dataset \
--target_data data/test/time_series/synthesis_val/dataset \
--reference_data data/test/time_series/synthesis_val/reference \
--batch_size 2\
--output_dir reports/tabular_metrics/

pymdma --modality time_series \
--validation_type input \
--evaluation_level instance \
--reference_type none \
--target_data data/test/time_series/input_val/dataset \
--output_dir reports/time_series/ \
--metric_group quality
--output_dir reports/time_series/
21 changes: 18 additions & 3 deletions src/pymdma/cli.py
@@ -16,6 +16,7 @@
DataModalities,
EvaluationLevel,
InputMetricGroups,
MetricGoal,
ReferenceType,
SyntheticMetricGroups,
ValidationTypes,
@@ -57,7 +58,7 @@ def parse_args():
type=str,
nargs="+",
default=None,
help="Metrics to be evaluated. E.g. feature, quality etc.",
help="Metrics to be evaluated. E.g. privacy, quality etc.",
)
parser.add_argument(
"--metric_goals",
@@ -123,6 +124,12 @@
default=1,
help="Number of workers to be used in the computation. Defaults to 1.",
)
parser.add_argument(
"--device",
type=str,
default="cpu",
help="Device to be used for computation. Defaults to 'cpu'.",
)
return parser.parse_args()
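The new --device argument simply threads a device string through to the compute call (see the device=args.device line in a later hunk). A sketch of a GPU run, assuming a torch-style device string such as cuda or cuda:0 is accepted:

pymdma --modality image \
    --validation_type synth \
    --reference_type dataset \
    --evaluation_level dataset \
    --target_data data/test/image/synthesis_val/dataset \
    --reference_data data/test/image/synthesis_val/reference \
    --output_dir reports/image_metrics/ \
    --device cuda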


@@ -138,7 +145,7 @@ def infer_data_source(data_modality: str, data_path: Path):
return data_path

# modality custom data parsers
module = import_module(f"{data_modality}.data.parsers")
module = import_module(f"pymdma.{data_modality}.data.parsers")
if data_path.suffix == ".jsonl":
return module.jsonl_files(data_path)

@@ -183,6 +190,9 @@ def main() -> None:
metric_goals=None,
)

if args.annotation_file is None:
s_func.pop("annotation", None)

for eval_group in list(s_func.keys()):
funcs = s_func[eval_group]
if len(funcs) == 0:
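The guard added above pops the annotation goal when no annotation file is supplied, so an input-validation run without COCO annotations simply skips those metrics instead of failing. A sketch, with fixture paths as in scripts/image_metrics.sh:

# No --annotation_file: the 'annotation' group is popped and the
# remaining goals run as usual.
pymdma --modality image \
    --validation_type input \
    --reference_type none \
    --evaluation_level instance \
    --target_data data/test/image/input_val/dataset \
    --reference_data data/test/image/input_val/reference \
    --output_dir reports/image_metrics/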
@@ -208,6 +218,7 @@
args.batch_size,
args.output_dir if args.allow_feature_cache else None,
annotation_file=args.annotation_file,
device=args.device,
)

logger.info(
@@ -253,7 +264,11 @@
with open(args.output_dir / "output.json", "w") as f:
f.write(json.dumps(output, indent=2))

logger.info(f"Results saved to {args.output_dir / 'output.json'}")
with open(args.output_dir / "config.json", "w") as f:
args_vals = {key: str(val) for key, val in dict(vars(args)).items()}
json.dump(args_vals, f, indent=2)

logger.info(f"Results saved to {args.output_dir}")


if __name__ == "__main__":
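With the block above, each run now leaves two artifacts in --output_dir: the metric results and a dump of the stringified CLI arguments that produced them. For example:

ls reports/image_metrics/
# output.json   metric results, as before
# config.json   stringified CLI arguments (new in this commit)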
3 changes: 2 additions & 1 deletion src/pymdma/common/compute.py
@@ -48,6 +48,7 @@ def __init__(
self.n_workers = n_workers

self.metrics = self._instanciate_metric_classes(group_classes)

self.global_context: Dict[str, any] = {}

self.extractors = set() if pretrained_extractor_name is None else {pretrained_extractor_name}
@@ -109,7 +110,7 @@ def _compute_and_reduce(

def _compute_task(metric, metric_args):
metric_name = metric.__class__.__name__
logger.info(f"Extractor: {metric.extractor_model_name} | Metric: {metric_name}")
logger.info(f"Metric: {metric_name}")
new_result = metric.compute(*metric_args, context=self.global_context)

# merge metric with already compute one (batch calculation)
5 changes: 2 additions & 3 deletions src/pymdma/common/definitions.py
@@ -1,8 +1,7 @@
from abc import ABC, abstractmethod
from typing import Optional

from pymdma.constants import METRIC_GOALS as MetricGoals
from pymdma.constants import EvaluationLevel, ReferenceType
from pymdma.constants import EvaluationLevel, MetricGoal, ReferenceType


class MetricClass:
@@ -40,7 +39,7 @@ def get_embeddings(self, model_name: str, **kwargs):
class Metric(ABC):
# evaluation params
evaluation_level: EvaluationLevel = EvaluationLevel.DATASET
metric_goal: MetricGoals
metric_goal: MetricGoal
reference_type: ReferenceType = ReferenceType.NONE

# metric specific
6 changes: 2 additions & 4 deletions src/pymdma/common/selection.py
@@ -12,9 +12,8 @@
DataModalities,
EvaluationLevel,
InputMetricGroups,
InputQualityMetrics,
MetricGoal,
ReferenceType,
SyntheticFeatureMetrics,
SyntheticMetricGroups,
ValidationTypes,
)
@@ -93,7 +92,6 @@
)
elif data_modality == "time_series":
from pymdma.time_series.input_layer import TimeSeriesInputLayer


return TimeSeriesInputLayer(
validation_type,
@@ -204,7 +202,7 @@
reference_type: ReferenceType,
evaluation_level: Optional[EvaluationLevel] = None,
metric_group: Optional[Union[SyntheticMetricGroups, InputMetricGroups]] = None,
metric_goals: Optional[List[Union[SyntheticFeatureMetrics, InputQualityMetrics]]] = None,
metric_goals: Optional[List[MetricGoal]] = None,
) -> Dict[str, List[Metric]]:
"""Helper function for selecting specific subset of measures.