Skip to content

Commit

Permalink
Merge pull request #1 from webis-de/dev
Browse files Browse the repository at this point in the history
First rudimentary version
  • Loading branch information
mam10eks authored Dec 23, 2024
2 parents 6e6bdbf + 8b93f9d commit b17d280
Show file tree
Hide file tree
Showing 10 changed files with 289 additions and 21 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,8 @@

---

`Stay tuned, more information will follow`
`Stay tuned, more information will follow`

Features:
- Integration with git
- In
29 changes: 29 additions & 0 deletions examples/pyterrier/example-script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env python3
from pathlib import Path

import click
import pandas as pd
import pyterrier as pt
from codecarbon import OfflineEmissionsTracker

from autometadata import persist_ir_metadata


@click.command()
@click.argument("output-directory", type=Path)
def main(output_directory):
    # Example entry point: builds a tiny hard-coded run (three documents for query "q-1"), writes it
    # in TREC format to <output-directory>/run.txt and persists the metadata into the same directory.
    # A "#" comment is used here on purpose: click would expose a docstring as the command's help text.
    #
    # NOTE(review): the extent of the with-block below was reconstructed; confirm which statements
    # are supposed to be covered by the emissions tracker.
    with OfflineEmissionsTracker(country_iso_code="DEU") as tracker:
        run = pd.DataFrame(
            [
                {"qid": "q-1", "docno": "doc-01", "rank": 1, "score": 10},
                {"qid": "q-1", "docno": "doc-02", "rank": 2, "score": 9},
                {"qid": "q-1", "docno": "doc-03", "rank": 3, "score": 8},
            ]
        )
    # The output directory may not exist yet, so create it (including parents) before writing.
    output_directory.mkdir(exist_ok=True, parents=True)
    pt.io.write_results(run, output_directory / "run.txt", format="trec")
    # presumably the tracker has to be stopped (i.e., the with-block left) before its final emissions
    # can be read by persist_ir_metadata; verify against the codecarbon documentation.
    persist_ir_metadata(output_directory, tracker)


# Only run the command when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
8 changes: 7 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,16 @@ package_dir =
packages = find_namespace:
install_requires =
GitPython
setuptools
py-cpuinfo
nvsmi
codecarbon

[options.extras_require]
test =
pytest
approvaltests
python-terrier==0.12.0
dev =
coverage
black
Expand All @@ -33,4 +39,4 @@ max-line-length = 120
extend-ignore = E203
include = src,test
exclude = docs
max-complexity = 10
max-complexity = 10
118 changes: 112 additions & 6 deletions src/autometadata/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,112 @@
def dummy() -> int:
    """
    Placeholder that exists solely to verify the initial project configuration.

    It should be removed as soon as real code has been introduced.
    """
    answer = 42
    return answer
import json
import platform
import sys
import traceback
from pathlib import Path
from typing import Any, Dict, List, Optional

from codecarbon import EmissionsTracker
from cpuinfo import get_cpu_info
from git import InvalidGitRepositoryError, Repo
from nvsmi import get_gpus
from pkg_resources import working_set

FILE_NAME = ".ir-metadata"


def __ensure_output_directory_is_valid(outdir: Path):
    """
    Validate the directory that will receive the metadata file and create it if it is missing.

    Args:
        outdir: Directory into which the metadata file (named by FILE_NAME) is going to be written.

    Raises:
        ValueError: If no directory was passed, if the path exists but is not a directory, or if the
            directory already contains a metadata file.
    """
    if not outdir:
        raise ValueError("No output directory was specified.")

    if outdir.exists() and not outdir.is_dir():
        raise ValueError(f"The output path '{outdir}' exists but is not a directory.")

    metadata_file = outdir / FILE_NAME
    if metadata_file.exists():
        raise ValueError(f"The metadata file '{metadata_file}' already exists; refusing to overwrite it.")

    # exist_ok=True turns this into a no-op when the directory is already present.
    outdir.mkdir(parents=True, exist_ok=True)


def _executed_file_from_stacktrace() -> Path:
    """Return the resolved path of the file that owns the outermost frame of the current call stack."""
    outermost_frame = traceback.extract_stack()[0]
    return Path(outermost_frame.filename).resolve()


def collect_git_repo_metadata(repo: Optional[Path] = None) -> Dict[str, Any]:
    """
    Collect the commit, the active branch and the remotes of the git repository containing a directory.

    Args:
        repo: Directory inside a git repository. When omitted, the directory of the file returned by
            _executed_file_from_stacktrace() is used.

    Returns:
        A dictionary with the keys "commit" (hex SHA of HEAD), "active_branch" (name of the checked-out
        branch, or None when HEAD is detached) and "remotes" (mapping from remote name to remote URL).

    Raises:
        InvalidGitRepositoryError: If neither the directory nor any of the parent directories that are
            tried (at most seven, never the root "/") is a git repository.
    """
    if not repo:
        return collect_git_repo_metadata(_executed_file_from_stacktrace().parent)

    git_repo = None
    try:
        git_repo = Repo(repo)
    except InvalidGitRepositoryError:
        candidate = repo.parent
        # Walk upwards through at most seven ancestors; the root "/" itself is never tried.
        for _ in range(7):
            if candidate == Path("/"):
                break
            try:
                git_repo = Repo(candidate)
                break
            except InvalidGitRepositoryError:
                candidate = candidate.parent

    if git_repo is None:
        raise InvalidGitRepositoryError(f"I can not find a git repository in {repo}.")

    try:
        try:
            active_branch = git_repo.active_branch.name
        except TypeError:
            # GitPython raises TypeError when HEAD is detached (e.g., a checkout of a specific commit);
            # the remaining metadata is still valuable, so only the branch is reported as unknown.
            active_branch = None

        return {
            "commit": git_repo.head.commit.hexsha,
            "active_branch": active_branch,
            "remotes": {remote.name: remote.url for remote in git_repo.remotes},
        }
    finally:
        # Release the resources held by the Repo object once all values have been read.
        git_repo.close()


def get_python_info() -> Dict[str, Any]:
    """
    Describe the running Python interpreter and the installed distributions.

    Returns:
        A dictionary with two entries:
        "sys" holds the interpreter executable, the command line arguments, the top-level names of all
        currently imported modules that do not start with an underscore, and the interpreter version
        formatted as "major.minor.micro".
        "pkg_resources" holds one "name==version" string per distribution in the working set.
        Both lists are de-duplicated and sorted so that repeated runs produce the same output.
    """
    # list(...) takes a snapshot so that imports triggered elsewhere cannot disturb the iteration.
    top_level_modules = {name.split(".")[0] for name in list(sys.modules) if name and not name.startswith("_")}
    distributions = {f"{dist.project_name}=={dist.version}" for dist in working_set}
    version = sys.version_info

    return {
        "sys": {
            "executable": sys.executable,
            "argv": sys.argv,
            "modules": sorted(top_level_modules),
            "version_info": f"{version.major}.{version.minor}.{version.micro}",
        },
        "pkg_resources": sorted(distributions),
    }


def get_gpu_info() -> List[Dict[str, Any]]:
    """
    Describe every GPU reported by get_gpus() as a JSON-compatible dictionary.

    Returns:
        One dictionary per GPU, or an empty list when the lookup raises FileNotFoundError.
    """
    gpu_descriptions: List[Dict[str, Any]] = []
    try:
        for gpu in get_gpus():
            gpu_descriptions.append(json.loads(gpu.to_json()))
    except FileNotFoundError:
        return []
    return gpu_descriptions


def get_platform_info() -> Dict[str, Any]:
    """
    Gather basic facts about the host from the platform module.

    Returns:
        A dictionary that maps "system", "machine", "version", "architecture" and "processor" to the
        result of the equally named function of the platform module.
    """
    field_names = ("system", "machine", "version", "architecture", "processor")
    return {field_name: getattr(platform, field_name)() for field_name in field_names}


def persist_ir_metadata(output_directory: Path, codecarbon_tracker: Optional[EmissionsTracker] = None):
    """
    Collect metadata about the current execution and write it as JSON into the output directory.

    The metadata covers the Python environment, the git repository, the CPU, the GPUs, the platform and
    the name and content of the executed file. It is written to the file named by FILE_NAME.

    Args:
        output_directory: Directory that receives the metadata file. It is created if it is missing.
        codecarbon_tracker: Optional tracker; when given, its final emissions data is stored under the
            key "codecarbon_emissions".

    Raises:
        ValueError: If the output directory is invalid or already contains a metadata file.
        InvalidGitRepositoryError: If no git repository is found for the executed file.
    """
    __ensure_output_directory_is_valid(output_directory)
    output_file = output_directory / FILE_NAME

    collected_meta_data = get_python_info()
    collected_meta_data["git"] = collect_git_repo_metadata()
    collected_meta_data["cpuinfo"] = get_cpu_info()
    collected_meta_data["gpus"] = get_gpu_info()
    collected_meta_data["platform"] = get_platform_info()

    executed_file = _executed_file_from_stacktrace()
    collected_meta_data["file"] = {
        "name": executed_file.name,
        # read_text closes the file handle again; UTF-8 is the default encoding of Python source files.
        "content": executed_file.read_text(encoding="utf-8"),
    }

    if codecarbon_tracker:
        collected_meta_data["codecarbon_emissions"] = json.loads(codecarbon_tracker.final_emissions_data.toJSON())

    serialized_meta_data = json.dumps(collected_meta_data)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(serialized_meta_data)
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"cpuinfo": {
"arch": "X86_64"
},
"file": {
"content": "#!/usr/bin/env python3\nfrom pathlib import Path\n\nimport click\nimport pandas as pd\nimport pyterrier as pt\n\nfrom autometadata import persist_ir_metadata\n\n\n@click.command()\n@click.argument(\"output-directory\", type=Path)\ndef main(output_directory):\n run = pd.DataFrame(\n [\n {\"qid\": \"q-1\", \"docno\": \"doc-01\", \"rank\": 1, \"score\": 10},\n {\"qid\": \"q-1\", \"docno\": \"doc-02\", \"rank\": 2, \"score\": 9},\n {\"qid\": \"q-1\", \"docno\": \"doc-03\", \"rank\": 3, \"score\": 8},\n ]\n )\n output_directory.mkdir(exist_ok=True, parents=True)\n pt.io.write_results(run, output_directory / \"run.txt\", format=\"trec\")\n persist_ir_metadata(output_directory)\n\n\nif __name__ == \"__main__\":\n main()\n",
"name": "example-script.py"
},
"git": {
"active_branch": "main",
"commit": "a810f6948292c8c6ffef4fc0c8172887a8715c8d",
"remotes": {
"origin": "url-does-not-exist"
}
},
"gpus": [],
"pkg_resources": [
"python-terrier==0.12.0"
],
"platform": {
"architecture": "OMITTED",
"machine": "OMITTED",
"processor": "OMITTED",
"system": "OMITTED",
"version": "OMITTED"
},
"sys": {
"argv": [
"example-script.py"
],
"executable": "python3",
"modules": [
"pyterrier"
],
"version_info": "3.XY.XY"
}
}
7 changes: 7 additions & 0 deletions test/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Datasets

The examples consist of multiple directories, each of which contains its own git repository that is not committed to this repository. We therefore zip them via:

```
cd ../examples/ && zip -r ../test/test-resources.zip . && cd ../test
```
13 changes: 0 additions & 13 deletions test/dummytest.py

This file was deleted.

Binary file added test/test-resources.zip
Binary file not shown.
62 changes: 62 additions & 0 deletions test/test_python_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import json
import os
import shutil
import subprocess
import tempfile
import unittest
import zipfile
from contextlib import contextmanager
from pathlib import Path
from typing import Generator

from approvaltests import verify_as_json

ROOT_DIR = Path(__file__).parent.parent.resolve()
TEST_RESOURCES = ROOT_DIR / "test" / "test-resources.zip"


@contextmanager
def resource(resource_name: str) -> Generator[Path, None, None]:
    """
    Unpack the zipped test resources into a temporary directory and yield one of its directories.

    The temporary directory, and with it the yielded path, is removed when the context is left.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        with zipfile.ZipFile(str(TEST_RESOURCES), "r") as archive:
            archive.extractall(tmp_dir)

        resource_dir = Path(tmp_dir).joinpath(resource_name)
        assert resource_dir.is_dir(), resource_dir
        yield resource_dir


def run_command_and_return_persisted_metadata(command):
    """
    Run a command in a subprocess and return the normalized metadata that it persisted.

    Args:
        command: Callable that receives the path of a fresh temporary output directory (as str) and
            returns the argument list to execute.

    Returns:
        The parsed ".ir-metadata" file from the output directory, reduced to the parts that do not
        depend on the machine so that it can be compared against an approved snapshot.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status. Its output contains
            stdout and stderr, because stderr is redirected to stdout.
    """
    with tempfile.TemporaryDirectory() as f:
        env = os.environ.copy()
        # subprocess expects string values in env; a Path object is not accepted on every platform.
        env["PYTHONPATH"] = str(ROOT_DIR / "src")
        subprocess.check_output(command(f), env=env, stderr=subprocess.STDOUT)
        # Read inside the with-block (the directory is deleted afterwards) and close the handle again.
        with open(f"{f}/.ir-metadata", "r") as metadata_file:
            actual = json.load(metadata_file)

    # Replace or filter everything that differs between machines and Python installations.
    actual["cpuinfo"] = {k: v for k, v in actual["cpuinfo"].items() if k == "arch"}
    actual["sys"]["executable"] = "python3" if "python3" in actual["sys"]["executable"] else "UNEXPECTED"
    actual["sys"]["version_info"] = "3.XY.XY" if actual["sys"]["version_info"].startswith("3.") else "UNEXPECTED"
    actual["sys"]["argv"] = [i.split("/")[-1] for i in actual["sys"]["argv"] if "example" in i]
    actual["platform"] = {k: "OMITTED" for k in actual["platform"].keys()}
    actual["sys"]["modules"] = [i for i in actual["sys"]["modules"] if "terrier" in i]
    actual["pkg_resources"] = [i for i in actual["pkg_resources"] if "python-terrier" in i]
    return actual


class PythonScriptApprovalTests(unittest.TestCase):
    """Approval tests that execute the bundled pyterrier example script in a subprocess."""

    def test_for_valid_git_repo(self):
        # Inside a git repository the persisted metadata must match the approved snapshot.
        with resource("pyterrier") as script_dir:
            script = f"{script_dir}/example-script.py"
            metadata = run_command_and_return_persisted_metadata(lambda out_dir: ["python3", script, out_dir])

        verify_as_json(metadata)

    def test_for_pyterrier_fails_if_not_in_git(self):
        # Without the .git directory the script must fail and report the missing repository.
        with resource("pyterrier") as script_dir:
            shutil.rmtree(script_dir / ".git")
            script = f"{script_dir}/example-script.py"

            with self.assertRaises(subprocess.CalledProcessError) as raised:
                run_command_and_return_persisted_metadata(lambda out_dir: ["python3", script, out_dir])

        self.assertIn("InvalidGitRepositoryError", repr(raised.exception.stdout))
30 changes: 30 additions & 0 deletions test/test_utility_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import unittest
from pathlib import Path

from git import InvalidGitRepositoryError

from autometadata import _executed_file_from_stacktrace, collect_git_repo_metadata, get_gpu_info

ROOT_DIR = Path(__file__).parent.parent.resolve()


class TestUtilityFunctions(unittest.TestCase):
    """Unit tests for the helper functions of the autometadata package."""

    def test_file_is_extracted(self):
        # presumably these are the stems of the entry scripts of the supported test runners
        expected = {"pytest", "run_pytest_script"}
        actual = _executed_file_from_stacktrace().stem
        self.assertIn(actual, expected)

    def test_tmp_is_no_git_repo(self):
        with self.assertRaises(InvalidGitRepositoryError) as context:
            collect_git_repo_metadata(Path("/tmp"))

        # Check the raised exception itself: the repr of the assertRaises context manager never
        # contains the message, so an assertion on it could not fail.
        self.assertIn("I can not find a git repository", str(context.exception))

    def test_with_current_git_repo_root_level(self):
        self.assertIsNotNone(collect_git_repo_metadata(ROOT_DIR))

    def test_with_current_git_repo_multiple_nonroot_level(self):
        # The repository must also be found when starting two levels below its root.
        self.assertIsNotNone(collect_git_repo_metadata(ROOT_DIR / "src" / "autometadata"))

    def test_get_gpus(self):
        self.assertIsNotNone(get_gpu_info())

0 comments on commit b17d280

Please sign in to comment.