diff --git a/.gitignore b/.gitignore
index afd3f20..17c0d9f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,5 @@ build/
 dist/
 .pytest_cache
 .ipynb_*
-*.__pyc
\ No newline at end of file
+*.__pyc
+venv/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..355f156
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,34 @@
+## Development Setup
+
+To set up a development environment and install an editable version of the `impresso_essentials` package, follow these steps:
+
+1. **Create a virtual environment**:
+
+   ```sh
+   python3 -m venv venv
+
+   ```
+
+2. **Activate the virtual environment**:
+
+   - On Windows:
+     ```sh
+     venv\Scripts\activate
+     ```
+   - On macOS/Linux:
+     ```sh
+     source venv/bin/activate
+     ```
+
+3. **Install the package in editable mode**:
+
+   ```sh
+   pip install -e .
+   ```
+
+4. **Run the tests**:
+   ```sh
+   pytest
+   ```
+
+This will discover and run all the tests in the tests directory.
diff --git a/impresso_essentials/io/s3_path_parser.py b/impresso_essentials/io/s3_path_parser.py
new file mode 100644
index 0000000..d89c754
--- /dev/null
+++ b/impresso_essentials/io/s3_path_parser.py
@@ -0,0 +1,159 @@
+# -*- coding: utf-8 -*-
+"""s3_path_parser.ipynb
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/1_-fTugHJfr7HsPFjnGTelvdC7E97NJ-K
+
+# s3 file path parser
+"""
+
+import re
+
+# Alternations (regex "a|b|c") of the values each path component may take
+phases = "sandbox|staging|final"
+processing_labels = (
+    "embeddings|entities|langident|lingproc|ocrqa|newsagencies|topics|textreuse"
+)
+processing_subtype_labels = "embeddings|images|entities"
+tasks = "ner|nel|tm|emb|lid|pos"
+subtasks = "newsagency"
+langs = "de|fr|en|lb|multilingual"
+
+
+# Verbose-mode pattern: must be compiled/matched with re.VERBOSE ({{n}} escapes the f-string)
+pattern = rf"""
+^s3://
+(?P<bucket>
+    (?P<stage_number>\d{{2}})
+    -processed-data-
+    (?P<phase>{phases})
+)
+/ (?P<processing_label>{processing_labels})
+(?: / (?P<processing_subtype_label>{processing_subtype_labels}) )?
+/
+(?P<run_id>
+    (?P=processing_label)
+    -
+    (?P<model_id>
+        (?P<task>{tasks})
+        (?:_(?P<subtask>{subtasks}))?
+        -
+        (?P<model_specificity>[A-Za-z][A-Za-z_-]*)
+        (?:_(?P<model_version>v(?P<model_major>\d+)\.(?P<model_minor>\d+)\.(?P<model_patch>\d+)))?
+        -
+        (?P<lang>{langs})
+    )
+    _
+    (?P<run_version>v(?P<run_major>\d+)-(?P<run_minor>\d+)-(?P<run_patch>\d+))
+)
+(?:/(?P<provider_alias>[A-Za-z]+))?
+(?:/(?P<media_alias>[A-Za-z0-9]+))
+/
+(?P<file_stem>
+    (?: (?P=media_alias) - )  # Backreference here
+    (?P<year>\d{{4}})
+    (?: - (?P=processing_label) )?  # Backreference here
+)
+\.jsonl\.bz2$
+"""
+
+# Compile the regex pattern
+regex = re.compile(pattern, re.VERBOSE)
+
+
+def parse_s3_path(path):
+    """
+    Parses the given S3 path according to the defined pattern.
+
+    Args:
+        path (str): The S3 path to parse.
+
+    Returns:
+        dict: A dictionary of the matched components, or None if no match is found.
+    """
+    match = regex.match(path)
+    if match:
+        return match.groupdict()
+    else:
+        return None
+
+
+def test_matching_paths(test_paths, verbose=True):
+    """
+    Tests paths that are expected to match the regex pattern.
+    """
+
+    print("Testing paths expected to match:")
+    for path in test_paths:
+        result = parse_s3_path(path)
+        if result:
+            print(f"✅ Passed: {path}")
+            # When verbose, dump every named group so the parse can be inspected
+            if verbose:
+                for key, value in result.items():
+                    print(f" {key}: {value}")
+        else:
+            print(f"❌ Failed: {path} (Expected to match but did not)")
+        print("-" * 80)
+
+
+def test_non_matching_paths(test_paths, verbose=True):
+    """
+    Tests paths expected not to match; when verbose, dumps the groups of any unexpected match.
+    """
+
+    print("Testing paths expected not to match:")
+    for path in test_paths:
+        result = parse_s3_path(path)
+        if not result:
+            print(f"✅ Passed: {path} (Correctly did not match)")
+        else:
+            print(f"❌ Failed: {path} (Expected not to match but did)")
+            if verbose:
+                for key, value in result.items():
+                    print(f" {key}: {value}")
+        print("-" * 80)
+
+
+correct_test_paths = [
+    # Full path with all components
+    "s3://01-processed-data-final/entities/embeddings/entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0/Reuters/UK/UK-2021.jsonl.bz2",
+    # Path without opt_processing_subtype_label
+    "s3://02-processed-data-staging/langident/langident-lid-fasttext_v1.0.0-multilingual_v2-0-1/BBC/BBC-2020-langident.jsonl.bz2",
+    # Path without provider_alias and model version
+    "s3://03-processed-data-sandbox/topics/topics-tm-lda_model-en_v3-2-4/EXP/EXP-2021-topics.jsonl.bz2",
+    # Path with optional subtask
+    "s3://01-processed-data-final/newsagencies/newsagencies-ner_newsagency-model_v1.2.0-en_v1-0-0/AFP/AFP-2021-newsagencies.jsonl.bz2",
+    # Path missing optional components
+    "s3://42-processed-data-final/embeddings/embeddings-tm-mallet-de_v4-0-0/MEDIA/MEDIA-2022.jsonl.bz2",
+    # Path with model version, media_alias only, and processing_label suffix in file_stem
+    "s3://42-processed-data-final/topics/topics-tm-bert_v3.0.0-en_v3-0-0/CNN/CNN-2024-topics.jsonl.bz2",
+    # Path with media_alias only and no processing_label suffix in file_stem
+    "s3://41-processed-data-staging/lingproc/lingproc-pos-spacy_v3.6.0-multilingual_v1-0-2/IMP/IMP-2024.jsonl.bz2",
+    # entity suggestion -- NOTE: the current pattern rejects this path (run_id does not start with processing_label)
+    "s3://42-processed-data-final/embeddings/images/image-embeddings/embeddings-resnet_dino_clip-v0-0-1/bnl/actionfem/actionfem-1927-image-embeddings.jsonl.bz2",
+]
+# Run the test functions
+# test_matching_paths(correct_test_paths,verbose=True)
+
+incorrect_test_paths = [
+    # Path with missing year in file_stem
+    "s3://06-processed-data-final/ocrqa/ocrqa-ner-en_core_web_lg_v2.2.2-en_v2-1-0/NewYorkTimes/USA/USA-ocrqa.jsonl.bz2",
+    # Path with no media_alias directory and no media_alias prefix in file_stem
+    "s3://07-processed-data-final/lingproc/lingproc-lid-fasttext_v1.0.0-en_v1-0-0/2023.jsonl.bz2",
+    # Path with invalid phase
+    "s3://10-processed-data-production/entities/entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0/2021-entities.jsonl.bz2",
+    # Path with incorrect file extension
+    "s3://11-processed-data-final/embeddings/embeddings-emb-word2vec_v4.0.0-multilingual_v4-0-0/2022-embeddings.txt",
+    # Path with no media_alias directory (file sits directly under run_id)
+    "s3://12-processed-data-final/topics/topics-tm-lda_v1.0.0-en_v1-0-0/2021-topics.jsonl.bz2",
+    # Path with incorrect model_id format
+    "s3://13-processed-data-final/entities/entities-unknownmodel_v1.0.0-en_v1-0-0/2021-entities.jsonl.bz2",
+    # Path whose file_stem prefix (MEDIA) differs from its media_alias directory (PROVIDER)
+    "s3://09-processed-data-final/lingproc/lingproc-tm-lda_v1.0.0-en_v1-0-0/PROVIDER/MEDIA-2025-lingproc.jsonl.bz2",
+]
+
+
+# test_non_matching_paths(incorrect_test_paths,verbose=False)
diff --git a/tests/test_s3_path_parser.py b/tests/test_s3_path_parser.py
new file mode 100644
index 0000000..325862b
--- /dev/null
+++ b/tests/test_s3_path_parser.py
@@ -0,0 +1,136 @@
+import re
+import pytest
+from impresso_essentials.io.s3_path_parser import pattern
+
+
+@pytest.mark.parametrize(
+    "s3_path, expected",
+    [
+        # Full path: subtype label, model version, provider_alias and media_alias
+        (
+            "s3://01-processed-data-final/entities/embeddings/entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0/Reuters/UK/UK-2021.jsonl.bz2",
+            {
+                "bucket": "01-processed-data-final",
+                "stage_number": "01",
+                "phase": "final",
+                "processing_label": "entities",
+                "processing_subtype_label": "embeddings",
+                "run_id": "entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0",
+                "model_id": "ner-en_core_web_sm_v3.1.0-en",
+                "task": "ner",
+                "subtask": None,
+                "model_specificity": "en_core_web_sm",
+                "model_version": "v3.1.0",
+                "model_major": "3",
+                "model_minor": "1",
+                "model_patch": "0",
+                "lang": "en",
+                "run_version": "v1-0-0",
+                "run_major": "1",
+                "run_minor": "0",
+                "run_patch": "0",
+                "provider_alias": "Reuters",
+                "media_alias": "UK",
+                "file_stem": "UK-2021",
+                "year": "2021",
+            },
+        ),
+        (
+            "s3://02-processed-data-staging/langident/langident-lid-fasttext_v1.0.0-multilingual_v2-0-1/BBC/BBC-2020-langident.jsonl.bz2",
+            {
+                "bucket": "02-processed-data-staging",
+                "stage_number": "02",
+                "phase": "staging",
+                "processing_label": "langident",
+                "processing_subtype_label": None,
+                "run_id": "langident-lid-fasttext_v1.0.0-multilingual_v2-0-1",
+                "model_id": "lid-fasttext_v1.0.0-multilingual",
+                "task": "lid",
+                "subtask": None,
+                "model_specificity": "fasttext",
+                "model_version": "v1.0.0",
+                "model_major": "1",
+                "model_minor": "0",
+                "model_patch": "0",
+                "lang": "multilingual",
+                "run_version": "v2-0-1",
+                "run_major": "2",
+                "run_minor": "0",
+                "run_patch": "1",
+                "provider_alias": None,
+                "media_alias": "BBC",
+                "file_stem": "BBC-2020-langident",
+                "year": "2020",
+            },
+        ),
+        (
+            "s3://03-processed-data-sandbox/topics/topics-tm-lda_model-en_v3-2-4/EXP/EXP-2021-topics.jsonl.bz2",
+            {
+                "bucket": "03-processed-data-sandbox",
+                "stage_number": "03",
+                "phase": "sandbox",
+                "processing_label": "topics",
+                "processing_subtype_label": None,
+                "run_id": "topics-tm-lda_model-en_v3-2-4",
+                "model_id": "tm-lda_model-en",
+                "task": "tm",
+                "subtask": None,
+                "model_specificity": "lda_model",
+                "model_version": None,
+                "model_major": None,
+                "model_minor": None,
+                "model_patch": None,
+                "lang": "en",
+                "run_version": "v3-2-4",
+                "run_major": "3",
+                "run_minor": "2",
+                "run_patch": "4",
+                "provider_alias": None,
+                "media_alias": "EXP",
+                "file_stem": "EXP-2021-topics",
+                "year": "2021",
+            },
+        ),
+        (
+            "s3://42-processed-data-final/topics/topics-tm-bert_v3.0.0-en_v3-0-0/CNN/CNN-2024-topics.jsonl.bz2",
+            {
+                "bucket": "42-processed-data-final",
+                "stage_number": "42",
+                "phase": "final",
+                "processing_label": "topics",
+                "processing_subtype_label": None,
+                "run_id": "topics-tm-bert_v3.0.0-en_v3-0-0",
+                "model_id": "tm-bert_v3.0.0-en",
+                "task": "tm",
+                "subtask": None,
+                "model_specificity": "bert",
+                "model_version": "v3.0.0",
+                "model_major": "3",
+                "model_minor": "0",
+                "model_patch": "0",
+                "lang": "en",
+                "run_version": "v3-0-0",
+                "run_major": "3",
+                "run_minor": "0",
+                "run_patch": "0",
+                "provider_alias": None,
+                "media_alias": "CNN",
+                "file_stem": "CNN-2024-topics",
+                "year": "2024",
+            },
+        ),
+    ],
+)
+def test_successful_s3_path_matches(s3_path, expected):
+    match = re.match(pattern, s3_path, re.VERBOSE)
+    assert match is not None, f"Pattern did not match for {s3_path}"
+    for key, value in expected.items():
+        assert (
+            match.group(key) == value
+        ), f"For {key}: expected {value}, got {match.group(key)}"
+
+
+def test_failed_s3_path_match():
+    path = "s3://42-processed-data-final/embeddings/images/image-embeddings/embeddings-resnet_dino_clip-v0-0-1/bnl/actionfem/actionfem-1927-image-embeddings.jsonl.bz2"
+    match = re.match(pattern, path, re.VERBOSE)
+    assert match is None, f"Pattern should not match for {path}"