
Commit 9474a92

Lite Classification (#749)
Authored by czaloom and ntlind
Co-authored-by: Nick <[email protected]>
1 parent 43a2b65 · commit 9474a92

24 files changed (+6643 −27 lines)

.github/workflows/lite-benchmark-evaluations.yml

Lines changed: 9 additions & 9 deletions
@@ -19,20 +19,20 @@ jobs:
       - name: install lite
         run: pip install -e .
         working-directory: ./lite
-      # - name: run classification benchmarks
-      #   run: python benchmark_script.py
-      #   working-directory: ./lite/benchmarks/classification
-      # - name: print classification results
-      #   run: |
-      #     export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('results.json', 'r')), indent=4));")
-      #     echo "$BENCHMARK_RESULTS"
-      #   working-directory: ./lite/benchmarks/classification
+      - name: run classification benchmarks
+        run: python benchmark_classification.py
+        working-directory: ./lite/benchmarks/
+      - name: print classification results
+        run: |
+          export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('clf_results.json', 'r')), indent=4));")
+          echo "$BENCHMARK_RESULTS"
+        working-directory: ./lite/benchmarks/
       - name: run object detection benchmarks
         run: python benchmark_objdet.py
         working-directory: ./lite/benchmarks/
       - name: print object detection results
         run: |
-          export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('manager_results.json', 'r')), indent=4));")
+          export BENCHMARK_RESULTS=$(python -c "import os;import json;print(json.dumps(json.load(open('objdet_results.json', 'r')), indent=4));")
          echo "$BENCHMARK_RESULTS"
         working-directory: ./lite/benchmarks/
       - run: make stop-env
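The new "print classification results" step packs its JSON pretty-printing into a single python -c one-liner. Expanded into ordinary Python it is roughly the sketch below; the file name and working directory are taken from the step itself, and running this as a standalone script (rather than inline) is an assumption made purely for illustration.

import json

# Load the benchmark output written by benchmark_classification.py
# (clf_results.json, relative to ./lite/benchmarks/) and pretty-print it
# so it is readable in the Actions log.
with open("clf_results.json", "r") as f:
    results = json.load(f)

print(json.dumps(results, indent=4))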

.github/workflows/lite-tests-and-coverage.yml

Lines changed: 14 additions & 0 deletions
@@ -19,6 +19,20 @@ jobs:
       - uses: actions/setup-python@v4
         with:
           python-version: "3.10"
+      - name: run classification tests and report coverage
+        run: |
+          pip install -e ".[test]"
+          COVERAGE_FILE=.coverage.classification python -m coverage run --omit "tests/*" -m pytest -v tests/classification/
+          python -m coverage combine
+          python -m coverage report -m
+          python -m coverage json
+          export TOTAL=$(python -c "import json;print(json.load(open('coverage.json'))['totals']['percent_covered_display'])")
+          echo "total=$TOTAL" >> $GITHUB_ENV
+          if (( $TOTAL < 90 )); then
+            echo "Coverage is below 90%"
+            exit 1
+          fi
+        working-directory: ./lite
       - name: run object detection tests and report coverage
         run: |
           pip install -e ".[test]"
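The coverage gate in the new classification step reads percent_covered_display out of coverage.json and fails the job when it drops below 90%. A rough Python equivalent of that shell logic, shown only as a sketch (the threshold and file name come from the step above; invoking it as a separate script is an assumption):

import json
import sys

# Read the totals block produced by `python -m coverage json`.
with open("coverage.json", "r") as f:
    total = float(json.load(f)["totals"]["percent_covered_display"])

print(f"total={total:g}")

# Mirror the workflow's `if (( $TOTAL < 90 ))` gate.
if total < 90:
    print("Coverage is below 90%")
    sys.exit(1)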
lite/benchmarks/benchmark_classification.py (new file)

Lines changed: 254 additions & 0 deletions
import json
import os
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from time import time

import requests
from tqdm import tqdm
from valor_lite.classification import DataLoader, MetricType


def time_it(fn):
    def wrapper(*args, **kwargs):
        start = time()
        results = fn(*args, **kwargs)
        return (time() - start, results)

    return wrapper


def download_data_if_not_exists(
    file_name: str,
    file_path: Path,
    url: str,
):
    """Download the data from a public bucket if it doesn't exist locally."""

    if not os.path.exists(file_path):
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            total_size = int(response.headers.get("content-length", 0))
            with open(file_path, "wb") as f:
                with tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    desc=file_name,
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
                            pbar.update(1024)
        else:
            raise RuntimeError(response)
    else:
        print(f"{file_name} already exists locally.")


def write_results_to_file(write_path: Path, results: list[dict]):
    """Write results to results.json"""
    current_datetime = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    if os.path.isfile(write_path):
        with open(write_path, "r") as file:
            file.seek(0)
            data = json.load(file)
    else:
        data = {}

    data[current_datetime] = results

    with open(write_path, "w+") as file:
        json.dump(data, file, indent=4)


@time_it
def ingest(
    loader: DataLoader,
    gt_path: Path,
    pd_path: Path,
    limit: int,
    chunk_size: int,
):
    accumulated_time = 0.0
    with open(gt_path, "r") as gf:
        with open(pd_path, "r") as pf:
            count = 0
            groundtruths = []
            predictions = []
            for gline, pline in zip(gf, pf):

                # groundtruth
                gt_dict = json.loads(gline)
                groundtruths.append(gt_dict)

                # prediction
                pd_dict = json.loads(pline)
                predictions.append(pd_dict)

                count += 1
                if count >= limit and limit > 0:
                    break
                elif len(groundtruths) < chunk_size or chunk_size == -1:
                    continue

                timer, _ = time_it(loader.add_data_from_valor_dict)(
                    zip(groundtruths, predictions), True
                )
                accumulated_time += timer
                groundtruths = []
                predictions = []

    if groundtruths:
        timer, _ = time_it(loader.add_data_from_valor_dict)(
            zip(groundtruths, predictions), True
        )
        accumulated_time += timer

    return accumulated_time


@dataclass
class Benchmark:
    limit: int
    n_datums: int
    n_groundtruths: int
    n_predictions: int
    n_labels: int
    chunk_size: int
    ingestion: float
    preprocessing: float
    precomputation: float
    evaluation: float
    detailed_evaluation: list[tuple[int, float]]

    def result(self) -> dict:
        return {
            "limit": self.limit,
            "n_datums": self.n_datums,
            "n_groundtruths": self.n_groundtruths,
            "n_predictions": self.n_predictions,
            "n_labels": self.n_labels,
            "chunk_size": self.chunk_size,
            "ingestion": {
                "loading_from_file": f"{round(self.ingestion - self.preprocessing, 2)} seconds",
                "numpy_conversion": f"{round(self.preprocessing, 2)} seconds",
                "finalization": f"{round(self.precomputation, 2)} seconds",
                "total": f"{round(self.ingestion + self.precomputation, 2)} seconds",
            },
            "base_evaluation": f"{round(self.evaluation, 2)} seconds",
            "detailed_evaluation": [
                {
                    "n_points": 10,
                    "n_examples": curve[0],
                    "computation": f"{round(curve[1], 2)} seconds",
                }
                for curve in self.detailed_evaluation
            ],
        }


def run_benchmarking_analysis(
    limits_to_test: list[int],
    results_file: str = "clf_results.json",
    chunk_size: int = -1,
    ingestion_timeout=30,
    evaluation_timeout=30,
):
    """Time various function calls and export the results."""
    current_directory = Path(__file__).parent
    write_path = current_directory / Path(results_file)

    gt_filename = "gt_classification.jsonl"
    pd_filename = "pd_classification.jsonl"

    # cache data locally
    for filename in [gt_filename, pd_filename]:
        file_path = current_directory / Path(filename)
        url = f"https://pub-fae71003f78140bdaedf32a7c8d331d2.r2.dev/{filename}"
        download_data_if_not_exists(
            file_name=filename, file_path=file_path, url=url
        )

    # iterate through datum limits
    results = list()
    for limit in limits_to_test:

        # === Base Evaluation ===
        loader = DataLoader()

        # ingest + preprocess
        (ingest_time, preprocessing_time,) = ingest(
            loader=loader,
            gt_path=current_directory / Path(gt_filename),
            pd_path=current_directory / Path(pd_filename),
            limit=limit,
            chunk_size=chunk_size,
        )  # type: ignore - time_it wrapper

        finalization_time, evaluator = time_it(loader.finalize)()

        if ingest_time > ingestion_timeout and ingestion_timeout != -1:
            raise TimeoutError(
                f"Base precomputation timed out with limit of {limit}."
            )

        # evaluate
        eval_time, _ = time_it(evaluator.evaluate)()
        if eval_time > evaluation_timeout and evaluation_timeout != -1:
            raise TimeoutError(
                f"Base evaluation timed out with {evaluator.n_datums} datums."
            )

        detail_no_examples_time, _ = time_it(evaluator.evaluate)(
            metrics_to_return=[*MetricType.base(), MetricType.ConfusionMatrix],
        )
        if (
            detail_no_examples_time > evaluation_timeout
            and evaluation_timeout != -1
        ):
            raise TimeoutError(
                f"Base evaluation timed out with {evaluator.n_datums} datums."
            )

        detail_three_examples_time, _ = time_it(evaluator.evaluate)(
            metrics_to_return=[*MetricType.base(), MetricType.ConfusionMatrix],
            number_of_examples=3,
        )
        if (
            detail_three_examples_time > evaluation_timeout
            and evaluation_timeout != -1
        ):
            raise TimeoutError(
                f"Base evaluation timed out with {evaluator.n_datums} datums."
            )

        results.append(
            Benchmark(
                limit=limit,
                n_datums=evaluator.n_datums,
                n_groundtruths=evaluator.n_groundtruths,
                n_predictions=evaluator.n_predictions,
                n_labels=evaluator.n_labels,
                chunk_size=chunk_size,
                ingestion=ingest_time,
                preprocessing=preprocessing_time,
                precomputation=finalization_time,
                evaluation=eval_time,
                detailed_evaluation=[
                    (0, detail_no_examples_time),
                    (3, detail_three_examples_time),
                ],
            ).result()
        )

    write_results_to_file(write_path=write_path, results=results)


if __name__ == "__main__":

    run_benchmarking_analysis(
        limits_to_test=[5000, 5000, 5000],
    )
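One detail of the new benchmark script worth spelling out: time_it returns an (elapsed_seconds, result) tuple, and ingest is itself decorated with it, so the call site unpacks two values (total wall time plus the accumulated add_data_from_valor_dict time) and needs the `# type: ignore` hint. A minimal, self-contained sketch of that behavior, using a hypothetical slow_add function purely for illustration:

from time import sleep, time


def time_it(fn):
    """Same shape as the decorator above: wrap fn and return (elapsed, result)."""

    def wrapper(*args, **kwargs):
        start = time()
        results = fn(*args, **kwargs)
        return (time() - start, results)

    return wrapper


@time_it
def slow_add(a: int, b: int) -> int:  # hypothetical example, not part of the commit
    sleep(0.1)
    return a + b


elapsed, result = slow_add(2, 3)
print(f"elapsed ~{elapsed:.2f}s, result={result}")  # elapsed ~0.10s, result=5

This same wrapping is why Benchmark.result() can report loading_from_file as ingestion minus preprocessing: the decorator measures total wall time, while the value returned by ingest isolates the numpy-conversion time.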

lite/benchmarks/benchmark_objdet.py

Lines changed: 2 additions & 2 deletions
@@ -67,7 +67,7 @@ def download_data_if_not_exists(
 
 
 def write_results_to_file(write_path: Path, results: list[dict]):
-    """Write results to manager_results.json"""
+    """Write results to json"""
     current_datetime = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
     if os.path.isfile(write_path):
         with open(write_path, "r") as file:
@@ -178,7 +178,7 @@ def result(self) -> dict:
 def run_benchmarking_analysis(
     limits_to_test: list[int],
     combinations: list[tuple[AnnotationType, AnnotationType]] | None = None,
-    results_file: str = "manager_results.json",
+    results_file: str = "objdet_results.json",
     chunk_size: int = -1,
     compute_pr: bool = True,
     compute_detailed: bool = True,

lite/examples/coco-yolo.ipynb renamed to lite/examples/object-detection.ipynb

Lines changed: 1 addition & 16 deletions
@@ -12,9 +12,7 @@
     "\n",
     "In this notebook, we'll walk through a detailed example of how you can use Valor to evaluate object detections made on [the COCO Panoptic dataset](https://cocodataset.org/#home). We'll use Ultralytics' `YOLOv8` model to predict what objects exist in various COCO photographs and compare performance between bounding box and image segmentation results.\n",
     "\n",
-    "For a conceptual introduction to Valor, [check out our project overview](https://striveworks.github.io/valor/). For a higher-level example notebook, [check out our \"Getting Started\" notebook](https://github.com/Striveworks/valor/blob/main/examples/getting_started.ipynb).\n",
-    "\n",
-    "Before using this notebook, please ensure that the Valor service is running on your machine (for start-up instructions, [click here](https://striveworks.github.io/valor/getting_started/)). To connect to a non-local instance of Valor, update `client = Client(\"http://0.0.0.0:8000\")` in the first code block to point to the correct URL."
+    "For a conceptual introduction to Valor, [check out our project overview](https://striveworks.github.io/valor/). For a higher-level example notebook, [check out our \"Getting Started\" notebook](https://github.com/Striveworks/valor/blob/main/examples/getting_started.ipynb)."
    ]
   },
   {
@@ -49,19 +47,6 @@
     "from valor_lite.detection import DataLoader, MetricType"
    ]
   },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "id": "a28f5e66",
-   "metadata": {},
-   "source": [
-    "The modules included in `./integrations` are helper modules that demonstrate how to ingest datasets and model inferences into Valor. The depth of each integration varies depending on the use case. \n",
-    "\n",
-    "The `coco_integration` is designed to download, extract, and upload all in one command as you are starting off with all the the data. \n",
-    "\n",
-    "The `yolo_integration` is much simpler; it is a collection of parser functions that convert YOLO model results into Valor types."
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 2,