diff --git a/README.md b/README.md
index f0b4ec1..1f6cf85 100644
--- a/README.md
+++ b/README.md
@@ -20,4 +20,8 @@
 
 ---
 
-`Stay tuned, more information will follow`
\ No newline at end of file
+`Stay tuned, more information will follow`
+
+Features:
+- Integration with git
+- In
\ No newline at end of file
diff --git a/examples/pyterrier/example-script.py b/examples/pyterrier/example-script.py
new file mode 100755
index 0000000..d20d2d7
--- /dev/null
+++ b/examples/pyterrier/example-script.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+from pathlib import Path
+
+import click
+import pandas as pd
+import pyterrier as pt
+from codecarbon import OfflineEmissionsTracker
+
+from autometadata import persist_ir_metadata
+
+
+@click.command()
+@click.argument("output-directory", type=Path)
+def main(output_directory):
+    with OfflineEmissionsTracker(country_iso_code="DEU") as tracker:
+        run = pd.DataFrame(
+            [
+                {"qid": "q-1", "docno": "doc-01", "rank": 1, "score": 10},
+                {"qid": "q-1", "docno": "doc-02", "rank": 2, "score": 9},
+                {"qid": "q-1", "docno": "doc-03", "rank": 3, "score": 8},
+            ]
+        )
+        output_directory.mkdir(exist_ok=True, parents=True)
+        pt.io.write_results(run, output_directory / "run.txt", format="trec")
+    persist_ir_metadata(output_directory, tracker)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.cfg b/setup.cfg
index 7c98f55..c866bbc 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -9,10 +9,16 @@ package_dir =
 packages = find_namespace:
 install_requires =
     GitPython
+    setuptools
+    py-cpuinfo
+    nvsmi
+    codecarbon
 
 [options.extras_require]
 test =
     pytest
+    approvaltests
+    python-terrier==0.12.0
 dev =
     coverage
     black
@@ -33,4 +39,4 @@ max-line-length = 120
 extend-ignore = E203
 include = src,test
 exclude = docs
-max-complexity = 10
\ No newline at end of file
+max-complexity = 10
diff --git a/src/autometadata/__init__.py b/src/autometadata/__init__.py
index df23aa0..6e7dcbb 100644
--- a/src/autometadata/__init__.py
+++ b/src/autometadata/__init__.py
@@ -1,6 +1,144 @@
-def dummy() -> int:
-    """
-    This is a dummy function that exists to test the initial project configuration. It should be deleted once actual
-    code was introduced!
-    """
-    return 42
+import json
+import platform
+import sys
+import traceback
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from codecarbon import EmissionsTracker
+from cpuinfo import get_cpu_info
+from git import InvalidGitRepositoryError, Repo
+from nvsmi import get_gpus
+from pkg_resources import working_set
+
+FILE_NAME = ".ir-metadata"
+
+
+def __ensure_output_directory_is_valid(outdir: Path):
+    """Ensure that ``outdir`` can take a new metadata file and create the directory if it is missing.
+
+    Raises:
+        ValueError: If ``outdir`` is falsy, is an existing non-directory, or already contains ``FILE_NAME``.
+    """
+    if not outdir:
+        raise ValueError("No output directory was passed.")
+
+    if outdir.exists() and not outdir.is_dir():
+        raise ValueError(f"The output path {outdir} exists but is not a directory.")
+
+    if (outdir / FILE_NAME).exists():
+        raise ValueError(f"The output directory {outdir} already contains a {FILE_NAME} file.")
+
+    if not outdir.is_dir():
+        outdir.mkdir(parents=True, exist_ok=True)
+
+
+def _executed_file_from_stacktrace() -> Path:
+    """Return the resolved path of the file that owns the outermost frame of the current call stack."""
+    return Path(traceback.extract_stack()[0].filename).resolve()
+
+
+def collect_git_repo_metadata(repo: Optional[Path] = None) -> Dict[str, Any]:
+    """Collect commit, active branch, and remotes of the git repository at or above ``repo``.
+
+    Args:
+        repo: Directory to start the lookup from; defaults to the directory of the executed file.
+
+    Returns:
+        A dict with the keys ``commit``, ``active_branch`` (``None`` while HEAD is detached), and ``remotes``.
+
+    Raises:
+        InvalidGitRepositoryError: If neither ``repo`` nor one of the probed parents is a git repository.
+    """
+    if not repo:
+        return collect_git_repo_metadata(_executed_file_from_stacktrace().parent)
+    git_repo = None
+    try:
+        git_repo = Repo(repo)
+    except InvalidGitRepositoryError:
+        parent_repo = repo.parent
+        cnt = 0
+
+        # Probe at most 7 parents. A path that is its own parent is the root on every platform.
+        while cnt < 7 and parent_repo != parent_repo.parent:
+            try:
+                cnt += 1
+                git_repo = Repo(parent_repo)
+                break
+            except InvalidGitRepositoryError:
+                parent_repo = parent_repo.parent
+
+    if not git_repo:
+        raise InvalidGitRepositoryError(f"I can not find a git repository in {repo}.")
+
+    remotes = {r.name: r.url for r in git_repo.remotes}
+
+    return {
+        "commit": git_repo.head.commit.hexsha,
+        # Repo.active_branch raises a TypeError on a detached HEAD (e.g., CI checkouts).
+        "active_branch": None if git_repo.head.is_detached else git_repo.active_branch.name,
+        "remotes": remotes,
+    }
+
+
+def get_python_info() -> Dict[str, Any]:
+    """Describe the running interpreter, its loaded top-level modules, and the installed distributions."""
+    modules = {i.split(".")[0] for i in sys.modules if i and not i.startswith("_")}
+    distributions = {f"{i.project_name}=={i.version}" for i in working_set}
+    return {
+        "sys": {
+            "executable": sys.executable,
+            "argv": sys.argv,
+            # Sorted so that repeated runs serialize the same order.
+            "modules": sorted(modules),
+            "version_info": f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
+        },
+        "pkg_resources": sorted(distributions),
+    }
+
+
+def get_gpu_info() -> List[Dict[str, Any]]:
+    """Return one JSON-decoded description per GPU reported by ``nvsmi``, or ``[]`` on ``FileNotFoundError``."""
+    try:
+        return [json.loads(gpu.to_json()) for gpu in get_gpus()]
+    except FileNotFoundError:
+        # Presumably the nvidia-smi binary is missing, i.e., there is no NVIDIA GPU to report.
+        return []
+
+
+def get_platform_info() -> Dict[str, Any]:
+    """Return system, machine, version, architecture, and processor as reported by ``platform``."""
+    return {
+        "system": platform.system(),
+        "machine": platform.machine(),
+        "version": platform.version(),
+        "architecture": platform.architecture(),
+        "processor": platform.processor(),
+    }
+
+
+def persist_ir_metadata(output_directory: Path, codecarbon_tracker: Optional[EmissionsTracker] = None):
+    """Collect metadata on the executed script and its environment and persist it as JSON.
+
+    Args:
+        output_directory: Target directory; it is created if missing and must not contain ``FILE_NAME`` yet.
+        codecarbon_tracker: Optional tracker whose ``final_emissions_data`` is added to the metadata.
+
+    Raises:
+        ValueError: If ``output_directory`` is not a valid target.
+        InvalidGitRepositoryError: If no git repository is found for the executed file.
+    """
+    __ensure_output_directory_is_valid(output_directory)
+    output_file = output_directory / FILE_NAME
+    collected_meta_data = get_python_info()
+    collected_meta_data["git"] = collect_git_repo_metadata()
+    collected_meta_data["cpuinfo"] = get_cpu_info()
+    collected_meta_data["gpus"] = get_gpu_info()
+    collected_meta_data["platform"] = get_platform_info()
+    executed_file = _executed_file_from_stacktrace()
+    # read_text opens and closes the file itself, so no handle is leaked.
+    collected_meta_data["file"] = {"name": executed_file.name, "content": executed_file.read_text()}
+    if codecarbon_tracker:
+        collected_meta_data["codecarbon_emissions"] = json.loads(codecarbon_tracker.final_emissions_data.toJSON())
+
+    output_file.write_text(json.dumps(collected_meta_data))
diff --git a/test/PythonScriptApprovalTests.test_for_valid_git_repo.approved.json b/test/PythonScriptApprovalTests.test_for_valid_git_repo.approved.json
new file mode 100644
index 0000000..075f51f
--- /dev/null
+++ b/test/PythonScriptApprovalTests.test_for_valid_git_repo.approved.json
@@ -0,0 +1,37 @@
+{
+    "cpuinfo": {
+        "arch": "X86_64"
+    },
+    "file": {
+        "content": "#!/usr/bin/env python3\nfrom pathlib import Path\n\nimport click\nimport pandas as pd\nimport pyterrier as pt\n\nfrom autometadata import persist_ir_metadata\n\n\n@click.command()\n@click.argument(\"output-directory\", type=Path)\ndef main(output_directory):\n    run = pd.DataFrame(\n        [\n            {\"qid\": \"q-1\", \"docno\": \"doc-01\", \"rank\": 1, \"score\": 10},\n            {\"qid\": \"q-1\", \"docno\": \"doc-02\", \"rank\": 2, \"score\": 9},\n            {\"qid\": \"q-1\", \"docno\": \"doc-03\", \"rank\": 3, \"score\": 8},\n        ]\n    )\n    output_directory.mkdir(exist_ok=True, parents=True)\n    pt.io.write_results(run, output_directory / \"run.txt\", format=\"trec\")\n    persist_ir_metadata(output_directory)\n\n\nif __name__ == \"__main__\":\n    main()\n",
+        "name": "example-script.py"
+    },
+    "git": {
+        "active_branch": "main",
+        "commit": "a810f6948292c8c6ffef4fc0c8172887a8715c8d",
+        "remotes": {
+            "origin": "url-does-not-exist"
+        }
+    },
+    "gpus": [],
+    "pkg_resources": [
+        "python-terrier==0.12.0"
+    ],
+    "platform": {
+        "architecture": "OMITTED",
+        "machine": "OMITTED",
+        "processor": "OMITTED",
+        "system": "OMITTED",
+        "version": "OMITTED"
+    },
+    "sys": {
+        "argv": [
+            "example-script.py"
+        ],
+        "executable": "python3",
+        "modules": [
+            "pyterrier"
+        ],
+        "version_info": "3.XY.XY"
+    }
+}
diff --git a/test/README.md b/test/README.md
new file mode 100644
index 0000000..939462a
--- /dev/null
+++ b/test/README.md
@@ -0,0 +1,7 @@
+# Datasets
+
+The examples contain multiple directories, each with a git repository that is not committed. We zip them via:
+
+```
+cd ../examples/ && zip -r ../test/test-resources.zip . && cd ../test
+```
diff --git a/test/dummytest.py b/test/dummytest.py
deleted file mode 100644
index 49caed7..0000000
--- a/test/dummytest.py
+++ /dev/null
@@ -1,13 +0,0 @@
-"""
-This file only exists to have a dummy test during initial setup. Please remove once actual tests have been added!
-"""
-
-import unittest
-
-from autometadata import dummy
-
-
-class TestDummy(unittest.TestCase):
-
-    def test_dummy(self):
-        self.assertEqual(dummy(), 42)
diff --git a/test/test-resources.zip b/test/test-resources.zip
new file mode 100644
index 0000000..2989554
Binary files /dev/null and b/test/test-resources.zip differ
diff --git a/test/test_python_script.py b/test/test_python_script.py
new file mode 100644
index 0000000..9b6837e
--- /dev/null
+++ b/test/test_python_script.py
@@ -0,0 +1,64 @@
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+import unittest
+import zipfile
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Generator
+
+from approvaltests import verify_as_json
+
+ROOT_DIR = Path(__file__).parent.parent.resolve()
+TEST_RESOURCES = ROOT_DIR / "test" / "test-resources.zip"
+
+
+@contextmanager
+def resource(resource_name: str) -> Generator[Path, None, None]:
+    """Unzip the test resources into a temporary directory and yield its sub-directory ``resource_name``."""
+    with tempfile.TemporaryDirectory() as f:
+        with zipfile.ZipFile(str(TEST_RESOURCES), "r") as zip_ref:
+            zip_ref.extractall(f)
+        ret = Path(f) / resource_name
+        assert ret.is_dir(), ret
+        yield ret
+
+
+def run_command_and_return_persisted_metadata(command):
+    """Run ``command(output_dir)`` as a subprocess and return the persisted metadata, normalized for approval."""
+    with tempfile.TemporaryDirectory() as f:
+        env = os.environ.copy()
+        env["PYTHONPATH"] = str(ROOT_DIR / "src")
+        subprocess.check_output(command(f), env=env, stderr=subprocess.STDOUT)
+        actual = json.loads(Path(f, ".ir-metadata").read_text())
+        actual["cpuinfo"] = {k: v for k, v in actual["cpuinfo"].items() if k == "arch"}
+        actual["sys"]["executable"] = "python3" if "python3" in actual["sys"]["executable"] else "UNEXPECTED"
+        actual["sys"]["version_info"] = "3.XY.XY" if actual["sys"]["version_info"].startswith("3.") else "UNEXPECTED"
+        actual["sys"]["argv"] = [i.split("/")[-1] for i in actual["sys"]["argv"] if "example" in i]
+        actual["platform"] = {k: "OMITTED" for k in actual["platform"]}
+        actual["sys"]["modules"] = [i for i in actual["sys"]["modules"] if "terrier" in i]
+        actual["pkg_resources"] = [i for i in actual["pkg_resources"] if "python-terrier" in i]
+        return actual
+
+
+class PythonScriptApprovalTests(unittest.TestCase):
+    def test_for_valid_git_repo(self):
+        with resource("pyterrier") as pyterrier_dir:
+            actual = run_command_and_return_persisted_metadata(
+                lambda i: ["python3", f"{pyterrier_dir}/example-script.py", i]
+            )
+
+        verify_as_json(actual)
+
+    def test_for_pyterrier_fails_if_not_in_git(self):
+        with resource("pyterrier") as pyterrier_dir:
+            shutil.rmtree(pyterrier_dir / ".git")
+
+            with self.assertRaises(subprocess.CalledProcessError) as context:
+                run_command_and_return_persisted_metadata(
+                    lambda i: ["python3", f"{pyterrier_dir}/example-script.py", i]
+                )
+
+        self.assertIn("InvalidGitRepositoryError", repr(context.exception.stdout))
diff --git a/test/test_utility_functions.py b/test/test_utility_functions.py
new file mode 100644
index 0000000..200cb23
--- /dev/null
+++ b/test/test_utility_functions.py
@@ -0,0 +1,30 @@
+import unittest
+from pathlib import Path
+
+from git import InvalidGitRepositoryError
+
+from autometadata import _executed_file_from_stacktrace, collect_git_repo_metadata, get_gpu_info
+
+ROOT_DIR = Path(__file__).parent.parent.resolve()
+
+
+class TestUtilityFunctions(unittest.TestCase):
+    def test_file_is_extracted(self):
+        expected = {"pytest", "run_pytest_script"}
+        actual = _executed_file_from_stacktrace().stem
+        self.assertIn(actual, expected)
+
+    def test_tmp_is_no_git_repo(self):
+        with self.assertRaises(InvalidGitRepositoryError) as context:
+            collect_git_repo_metadata(Path("/tmp"))
+
+        self.assertIn("I can not find a git repository", str(context.exception))
+
+    def test_with_current_git_repo_root_level(self):
+        self.assertIsNotNone(collect_git_repo_metadata(ROOT_DIR))
+
+    def test_with_current_git_repo_multiple_nonroot_level(self):
+        self.assertIsNotNone(collect_git_repo_metadata(ROOT_DIR / "src" / "autometadata"))
+
+    def test_get_gpus(self):
+        self.assertIsNotNone(get_gpu_info())