-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from webis-de/dev
First rudimentary version
- Loading branch information
Showing
10 changed files
with
289 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/usr/bin/env python3 | ||
from pathlib import Path | ||
|
||
import click | ||
import pandas as pd | ||
import pyterrier as pt | ||
from codecarbon import OfflineEmissionsTracker | ||
|
||
from autometadata import persist_ir_metadata | ||
|
||
|
||
# Example entry point: writes a tiny three-document TREC run into the given
# output directory while a codecarbon tracker is active, then persists the
# ir-metadata next to the run file.
# (Kept as comments on purpose: click would expose a docstring as --help text.)
@click.command()
@click.argument("output-directory", type=Path)
def main(output_directory):
    ranking_rows = [
        {"qid": "q-1", "docno": "doc-01", "rank": 1, "score": 10},
        {"qid": "q-1", "docno": "doc-02", "rank": 2, "score": 9},
        {"qid": "q-1", "docno": "doc-03", "rank": 3, "score": 8},
    ]
    with OfflineEmissionsTracker(country_iso_code="DEU") as tracker:
        run = pd.DataFrame(ranking_rows)
        output_directory.mkdir(exist_ok=True, parents=True)
        run_file = output_directory / "run.txt"
        pt.io.write_results(run, run_file, format="trec")
    # NOTE(review): the metadata is persisted after the tracker context has been
    # left, presumably so its final emissions data is available -- confirm
    # against the original indentation.
    persist_ir_metadata(output_directory, tracker)
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,112 @@ | ||
def dummy() -> int:
    """
    Placeholder that only exists to test the initial project configuration. Delete it once actual code has been
    introduced.

    Returns:
        The constant 42.
    """
    placeholder_value = 42
    return placeholder_value
import json | ||
import platform | ||
import sys | ||
import traceback | ||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional | ||
|
||
from codecarbon import EmissionsTracker | ||
from cpuinfo import get_cpu_info | ||
from git import InvalidGitRepositoryError, Repo | ||
from nvsmi import get_gpus | ||
from pkg_resources import working_set | ||
|
||
FILE_NAME = ".ir-metadata" | ||
|
||
|
||
def __ensure_output_directory_is_valid(outdir: Path):
    """Validate the directory that will receive the metadata file and create it if missing.

    Args:
        outdir: Directory into which the ``FILE_NAME`` file is going to be written.

    Raises:
        ValueError: If ``outdir`` is falsy (e.g. ``None``), if it exists but is not a
            directory, or if it already contains a file named ``FILE_NAME``.
    """
    if not outdir:
        raise ValueError("No output directory was passed.")

    if outdir.exists() and not outdir.is_dir():
        raise ValueError(f"The output path {outdir} exists but is not a directory.")

    if (outdir / FILE_NAME).exists():
        raise ValueError(f"The output directory {outdir} already contains a {FILE_NAME} file.")

    # exist_ok=True turns this into a no-op when the directory is already present.
    outdir.mkdir(parents=True, exist_ok=True)
|
||
|
||
def _executed_file_from_stacktrace() -> Path:
    """Return the resolved path of the file belonging to the first (oldest) entry of the current call stack."""
    oldest_frame = traceback.extract_stack()[0]
    return Path(oldest_frame.filename).resolve()
|
||
|
||
def collect_git_repo_metadata(repo: Optional[Path] = None) -> Dict[str, Any]:
    """Collect commit, branch and remotes of the git repository that contains ``repo``.

    Args:
        repo: Directory inside a git working tree. When omitted, the parent directory of
            the file returned by ``_executed_file_from_stacktrace`` is used.

    Returns:
        A dict with the keys ``commit`` (hex SHA of the HEAD commit), ``active_branch``
        (branch name, or ``None`` when HEAD is detached) and ``remotes`` (remote name
        mapped to its URL).

    Raises:
        InvalidGitRepositoryError: If neither ``repo`` nor any of the parent directories
            that were tried is a git repository.
    """
    if not repo:
        return collect_git_repo_metadata(_executed_file_from_stacktrace().parent)

    git_repo = None
    try:
        git_repo = Repo(repo)
    except InvalidGitRepositoryError:
        # Walk upwards: at most seven ancestors are tried and "/" itself is never tried.
        candidate = repo.parent
        attempts = 0

        while attempts < 7 and candidate != Path("/"):
            attempts += 1
            try:
                git_repo = Repo(candidate)
                break
            except InvalidGitRepositoryError:
                candidate = candidate.parent

    if not git_repo:
        raise InvalidGitRepositoryError(f"I can not find a git repository in {repo}.")

    try:
        active_branch = git_repo.active_branch.name
    except TypeError:
        # GitPython raises TypeError when HEAD is detached (e.g. a checkout of a
        # specific commit or tag); there is no branch name to report then.
        active_branch = None

    remotes = {r.name: r.url for r in git_repo.remotes}

    return {
        "commit": git_repo.head.commit.hexsha,
        "active_branch": active_branch,
        "remotes": remotes,
    }
|
||
|
||
def get_python_info() -> Dict[str, Any]:
    """Describe the running interpreter, the loaded modules and the installed distributions.

    Returns:
        A dict with two keys:

        * ``sys``: the interpreter ``executable``, ``argv``, the sorted unique top-level
          names of all entries in ``sys.modules`` whose name does not start with an
          underscore (``modules``), and ``version_info`` as ``major.minor.micro``.
        * ``pkg_resources``: sorted unique ``name==version`` strings for every
          distribution in ``working_set``.

    Both lists are sorted so that repeated runs in the same environment serialize identically.
    """
    # Snapshot the keys first so an import on another thread cannot resize the
    # dict while it is being iterated.
    loaded_names = tuple(sys.modules)
    top_level_modules = {name.split(".")[0] for name in loaded_names if name and not name.startswith("_")}
    distributions = {f"{dist.project_name}=={dist.version}" for dist in working_set}

    return {
        "sys": {
            "executable": sys.executable,
            "argv": sys.argv,
            "modules": sorted(top_level_modules),
            "version_info": f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
        },
        "pkg_resources": sorted(distributions),
    }
|
||
|
||
def get_gpu_info() -> List[Dict[str, Any]]:
    """Return one JSON-decoded dict per GPU yielded by ``get_gpus``.

    Returns:
        The decoded GPU descriptions, or an empty list when a ``FileNotFoundError``
        is raised while querying (presumably because the GPU query tool is not
        installed -- TODO confirm).
    """
    described_gpus: List[Dict[str, Any]] = []
    try:
        for gpu in get_gpus():
            described_gpus.append(json.loads(gpu.to_json()))
    except FileNotFoundError:
        return []
    return described_gpus
|
||
|
||
def get_platform_info() -> Dict[str, Any]:
    """Return the values of selected ``platform`` module queries, keyed by the query name.

    Returns:
        A dict with the keys ``system``, ``machine``, ``version``, ``architecture``
        and ``processor``, each holding the result of the equally named
        ``platform`` function.
    """
    queried_fields = ("system", "machine", "version", "architecture", "processor")
    return {field: getattr(platform, field)() for field in queried_fields}
|
||
|
||
def persist_ir_metadata(output_directory: Path, codecarbon_tracker: Optional[EmissionsTracker] = None):
    """Collect metadata about the current execution and write it as JSON into ``output_directory``.

    The metadata combines the Python/interpreter info, git repository info, CPU info,
    GPU info, platform info, and the name and source text of the executed file. When a
    codecarbon tracker is passed, its final emissions data is added under
    ``codecarbon_emissions``.

    Args:
        output_directory: Directory that receives the ``FILE_NAME`` file; it is created
            if it does not exist.
        codecarbon_tracker: Optional tracker whose ``final_emissions_data`` is stored.

    Raises:
        ValueError: If the output directory is invalid or already contains the metadata file.
        InvalidGitRepositoryError: If no git repository is found for the executed file.
    """
    __ensure_output_directory_is_valid(output_directory)
    output_file = output_directory / FILE_NAME

    collected_meta_data = get_python_info()
    collected_meta_data["git"] = collect_git_repo_metadata()
    collected_meta_data["cpuinfo"] = get_cpu_info()
    collected_meta_data["gpus"] = get_gpu_info()
    collected_meta_data["platform"] = get_platform_info()

    executed_file = _executed_file_from_stacktrace()
    # Read through a context manager so the file handle is closed deterministically.
    with open(executed_file, "r") as source_file:
        source_text = source_file.read()
    collected_meta_data["file"] = {"name": executed_file.name, "content": source_text}

    if codecarbon_tracker:
        collected_meta_data["codecarbon_emissions"] = json.loads(codecarbon_tracker.final_emissions_data.toJSON())

    serialized_meta_data = json.dumps(collected_meta_data)

    with open(output_file, "w") as f:
        f.write(serialized_meta_data)
37 changes: 37 additions & 0 deletions
37
test/PythonScriptApprovalTests.test_for_valid_git_repo.approved.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
{ | ||
"cpuinfo": { | ||
"arch": "X86_64" | ||
}, | ||
"file": { | ||
"content": "#!/usr/bin/env python3\nfrom pathlib import Path\n\nimport click\nimport pandas as pd\nimport pyterrier as pt\n\nfrom autometadata import persist_ir_metadata\n\n\n@click.command()\n@click.argument(\"output-directory\", type=Path)\ndef main(output_directory):\n run = pd.DataFrame(\n [\n {\"qid\": \"q-1\", \"docno\": \"doc-01\", \"rank\": 1, \"score\": 10},\n {\"qid\": \"q-1\", \"docno\": \"doc-02\", \"rank\": 2, \"score\": 9},\n {\"qid\": \"q-1\", \"docno\": \"doc-03\", \"rank\": 3, \"score\": 8},\n ]\n )\n output_directory.mkdir(exist_ok=True, parents=True)\n pt.io.write_results(run, output_directory / \"run.txt\", format=\"trec\")\n persist_ir_metadata(output_directory)\n\n\nif __name__ == \"__main__\":\n main()\n", | ||
"name": "example-script.py" | ||
}, | ||
"git": { | ||
"active_branch": "main", | ||
"commit": "a810f6948292c8c6ffef4fc0c8172887a8715c8d", | ||
"remotes": { | ||
"origin": "url-does-not-exist" | ||
} | ||
}, | ||
"gpus": [], | ||
"pkg_resources": [ | ||
"python-terrier==0.12.0" | ||
], | ||
"platform": { | ||
"architecture": "OMITTED", | ||
"machine": "OMITTED", | ||
"processor": "OMITTED", | ||
"system": "OMITTED", | ||
"version": "OMITTED" | ||
}, | ||
"sys": { | ||
"argv": [ | ||
"example-script.py" | ||
], | ||
"executable": "python3", | ||
"modules": [ | ||
"pyterrier" | ||
], | ||
"version_info": "3.XY.XY" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Datasets | ||
|
||
The examples contain multiple directories, each with a git repository that is not committed. We zip them via: | ||
|
||
``` | ||
cd ../examples/ && zip -r ../test/test-resources.zip . && cd ../test | ||
``` |
This file was deleted.
Oops, something went wrong.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import json | ||
import os | ||
import shutil | ||
import subprocess | ||
import tempfile | ||
import unittest | ||
import zipfile | ||
from contextlib import contextmanager | ||
from pathlib import Path | ||
from typing import Generator | ||
|
||
from approvaltests import verify_as_json | ||
|
||
ROOT_DIR = Path(__file__).parent.parent.resolve() | ||
TEST_RESOURCES = ROOT_DIR / "test" / "test-resources.zip" | ||
|
||
|
||
@contextmanager
def resource(resource_name: str) -> Generator[Path, None, None]:
    """Extract the test-resources archive into a temporary directory and yield one of its sub-directories.

    Args:
        resource_name: Name of the directory inside the archive that should be yielded.

    Yields:
        Path of the extracted directory; the temporary directory is removed when the
        context is left.
    """
    with tempfile.TemporaryDirectory() as extraction_root:
        with zipfile.ZipFile(str(TEST_RESOURCES), "r") as archive:
            archive.extractall(extraction_root)
        extracted = Path(extraction_root) / resource_name
        assert extracted.is_dir(), extracted
        yield extracted
|
||
|
||
def run_command_and_return_persisted_metadata(command):
    """Run a command that persists ir-metadata and return the normalized metadata.

    Args:
        command: Callable that receives the path of a fresh temporary output directory
            and returns the argument list passed to ``subprocess.check_output``.

    Returns:
        The parsed ``.ir-metadata`` content with machine-specific parts reduced or
        masked so it can be compared against an approved snapshot.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status;
            stderr is merged into the captured output.
    """
    with tempfile.TemporaryDirectory() as f:
        env = os.environ.copy()
        # Environment values should be plain strings, not Path objects.
        env["PYTHONPATH"] = str(ROOT_DIR / "src")
        subprocess.check_output(command(f), env=env, stderr=subprocess.STDOUT)

        # Use a context manager so the metadata file handle is closed right away.
        with open(f"{f}/.ir-metadata", "r") as metadata_file:
            actual = json.load(metadata_file)

        # Keep only values that are stable across machines and runs.
        actual["cpuinfo"] = {k: v for k, v in actual["cpuinfo"].items() if k == "arch"}
        actual["sys"]["executable"] = "python3" if "python3" in actual["sys"]["executable"] else "UNEXPECTED"
        actual["sys"]["version_info"] = "3.XY.XY" if actual["sys"]["version_info"].startswith("3.") else "UNEXPECTED"
        actual["sys"]["argv"] = [i.split("/")[-1] for i in actual["sys"]["argv"] if "example" in i]
        actual["platform"] = {k: "OMITTED" for k in actual["platform"].keys()}
        actual["sys"]["modules"] = [i for i in actual["sys"]["modules"] if "terrier" in i]
        actual["pkg_resources"] = [i for i in actual["pkg_resources"] if "python-terrier" in i]
        return actual
|
||
|
||
class PythonScriptApprovalTests(unittest.TestCase):
    """Approval tests that execute the pyterrier example script in a subprocess."""

    def test_for_valid_git_repo(self):
        """The metadata persisted from inside a git repository matches the approved snapshot."""
        with resource("pyterrier") as pyterrier_dir:
            script = f"{pyterrier_dir}/example-script.py"
            actual = run_command_and_return_persisted_metadata(lambda outdir: ["python3", script, outdir])

        verify_as_json(actual)

    def test_for_pyterrier_fails_if_not_in_git(self):
        """Without a ``.git`` directory the script fails and its output names the git error."""
        with resource("pyterrier") as pyterrier_dir:
            shutil.rmtree(pyterrier_dir / ".git")
            script = f"{pyterrier_dir}/example-script.py"

            with self.assertRaises(subprocess.CalledProcessError) as context:
                run_command_and_return_persisted_metadata(lambda outdir: ["python3", script, outdir])

        captured_output = repr(context.exception.stdout)
        self.assertIn("InvalidGitRepositoryError", captured_output)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import unittest | ||
from pathlib import Path | ||
|
||
from git import InvalidGitRepositoryError | ||
|
||
from autometadata import _executed_file_from_stacktrace, collect_git_repo_metadata, get_gpu_info | ||
|
||
ROOT_DIR = Path(__file__).parent.parent.resolve() | ||
|
||
|
||
class TestUtilityFunctions(unittest.TestCase):
    """Unit tests for the helper functions of ``autometadata``."""

    def test_file_is_extracted(self):
        """The executed file is the test runner's entry script."""
        expected = {"pytest", "run_pytest_script"}
        actual = _executed_file_from_stacktrace().stem
        self.assertIn(actual, expected)

    def test_tmp_is_no_git_repo(self):
        """``/tmp`` is not inside a git repository, so collecting git metadata must fail."""
        with self.assertRaises(InvalidGitRepositoryError) as context:
            collect_git_repo_metadata(Path("/tmp"))

        # Inspect the raised exception itself; the repr of the assertRaises context
        # object never contains the message, so checking it would be vacuous.
        self.assertIn("I can not find a git repository", str(context.exception))

    def test_with_current_git_repo_root_level(self):
        """Metadata can be collected from the repository root."""
        self.assertIsNotNone(collect_git_repo_metadata(ROOT_DIR))

    def test_with_current_git_repo_multiple_nonroot_level(self):
        """Metadata can be collected from a nested directory of the repository."""
        self.assertIsNotNone(collect_git_repo_metadata(ROOT_DIR / "src" / "autometadata"))

    def test_get_gpus(self):
        """GPU collection returns a value even when no GPU tooling is available."""
        self.assertIsNotNone(get_gpu_info())