
Commit f0ff22a

Merge pull request #1 from cleanlab/ulya-adj-ood-scores
Added code support for adjusted Entropy and MSP scoring.
2 parents b1ee8bd + e1338ba commit f0ff22a
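
The PR title refers to two standard out-of-distribution (OOD) scoring baselines: MSP (maximum softmax probability) and predictive entropy. Below is a minimal, self-contained sketch of the unadjusted versions of these scores, assuming pred_probs is an (n_examples, n_classes) array of softmax outputs; the "adjusted" variants added in this PR presumably build on these, and the sketch is illustrative rather than the PR's actual implementation.

    import numpy as np

    def msp_ood_score(pred_probs: np.ndarray) -> np.ndarray:
        # Maximum Softmax Probability (MSP): a peaked prediction suggests
        # in-distribution, so 1 - max probability serves as the OOD score.
        return 1.0 - pred_probs.max(axis=1)

    def entropy_ood_score(pred_probs: np.ndarray) -> np.ndarray:
        # Entropy of the predicted distribution, normalized to [0, 1];
        # near-uniform predictions (high entropy) flag likely OOD inputs.
        num_classes = pred_probs.shape[1]
        entropy = -np.sum(pred_probs * np.log(pred_probs + 1e-12), axis=1)
        return entropy / np.log(num_classes)

    # A confident prediction scores lower (more in-distribution) than a near-uniform one.
    pred_probs = np.array([[0.95, 0.03, 0.02],
                           [0.40, 0.35, 0.25]])
    print(msp_ood_score(pred_probs))      # ~[0.05, 0.60]
    print(entropy_ood_score(pred_probs))  # much smaller for the first row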

5 files changed: +736 -85 lines changed

src/experiments/OOD/0_Train_Models.ipynb (+4 -81)
@@ -13,17 +13,9 @@
 "import sys\n",
 "sys.path.insert(0, \"../\")\n",
 "\n",
-"\n",
 "from autogluon.vision import ImagePredictor, ImageDataset\n",
 "import numpy as np\n",
 "import pandas as pd\n",
-"import pickle\n",
-"import datetime\n",
-"from pathlib import Path\n",
-"from sklearn.ensemble import IsolationForest\n",
-"from sklearn.model_selection import StratifiedKFold\n",
-"from sklearn.metrics import roc_auc_score\n",
-"from sklearn.model_selection import train_test_split\n",
 "\n",
 "pd.set_option('display.max_rows', None)\n",
 "pd.set_option('display.max_columns', None)\n",
@@ -40,7 +32,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": null,
 "id": "5c9b59b4-c51c-4cdb-a958-46f227cdb5d8",
 "metadata": {},
 "outputs": [],
@@ -63,7 +55,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": null,
 "id": "cde63994-e833-4f87-93b6-e05b3c7ba479",
 "metadata": {},
 "outputs": [],
@@ -96,79 +88,10 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": null,
 "id": "1ae79a8d-bb68-46d5-b4b9-1f082da7d695",
 "metadata": {},
-"outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>image</th>\n",
- " <th>label</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>/Data/cifar100_png/train/apple/0001.png</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>/Data/cifar100_png/train/apple/0002.png</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>/Data/cifar100_png/train/apple/0003.png</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>/Data/cifar100_png/train/apple/0004.png</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>/Data/cifar100_png/train/apple/0005.png</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " image label\n",
- "0 /Data/cifar100_png/train/apple/0001.png 0\n",
- "1 /Data/cifar100_png/train/apple/0002.png 0\n",
- "2 /Data/cifar100_png/train/apple/0003.png 0\n",
- "3 /Data/cifar100_png/train/apple/0004.png 0\n",
- "4 /Data/cifar100_png/train/apple/0005.png 0"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+"outputs": [],
 "source": [
 "# Check out a dataset\n",
 "cifar_100_train_dataset.head()"

src/experiments/OOD/1_Evaluate_All_OOD_Experiments.ipynb (-4)
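
The diff below keeps this evaluation notebook's import of cleanlab's get_normalized_entropy, which supplies the entropy-based score. A hedged sketch of the typical call pattern follows (the exact signature is assumed here, and pred_probs is a placeholder array rather than output from the repo's trained models):

    import numpy as np
    from cleanlab.internal.label_quality_utils import get_normalized_entropy

    # Placeholder (n_examples, n_classes) softmax outputs; in the notebook these
    # would come from the trained AutoGluon ImagePredictor's predicted probabilities.
    pred_probs = np.array([[0.90, 0.05, 0.05],
                           [0.34, 0.33, 0.33]])

    # Assumed call pattern: per-example entropy normalized to [0, 1]; higher values
    # mark examples the model is unsure about (candidate OOD points).
    ood_scores = get_normalized_entropy(pred_probs)
    print(ood_scores)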
@@ -19,16 +19,12 @@
 "from autogluon.vision import ImagePredictor, ImageDataset\n",
 "import numpy as np\n",
 "import pandas as pd\n",
-"import pickle\n",
-"import datetime\n",
-"from pathlib import Path\n",
 "import umap\n",
 "import seaborn as sns\n",
 "from sklearn.ensemble import IsolationForest\n",
 "from sklearn.model_selection import StratifiedKFold\n",
 "from sklearn.metrics import roc_auc_score\n",
 "from sklearn.model_selection import train_test_split\n",
-"from IPython.display import HTML\n",
 "from matplotlib import pyplot as plt\n",
 "from cleanlab.internal.label_quality_utils import get_normalized_entropy\n",
 "\n",

New file: +243 lines
@@ -0,0 +1,243 @@
+{
+"cells": [
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "d61e5d70-45e1-4223-b569-7a4c9247876d",
+"metadata": {},
+"outputs": [],
+"source": [
+"%load_ext autoreload\n",
+"%autoreload 2\n",
+"\n",
+"import sys\n",
+"sys.path.insert(0, \"../\")\n",
+"\n",
+"from autogluon.vision import ImagePredictor, ImageDataset\n",
+"import numpy as np\n",
+"import pandas as pd\n",
+"\n",
+"pd.set_option('display.max_rows', None)\n",
+"pd.set_option('display.max_columns', None)\n",
+"pd.set_option('display.max_colwidth', None)"
+]
+},
+{
+"cell_type": "markdown",
+"id": "bc2ebf60-4338-45ce-b9ce-e0d2b5cc7f0d",
+"metadata": {},
+"source": [
+"## Read data"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "5c9b59b4-c51c-4cdb-a958-46f227cdb5d8",
+"metadata": {},
+"outputs": [],
+"source": [
+"# path to data\n",
+"CIFAR_10_DATA_PATH = \"/datasets/uly/ood-data/cifar10_png/\"\n",
+"CIFAR_100_DATA_PATH = \"/datasets/uly/ood-data/cifar100_png/\"\n",
+"MNIST_DATA_PATH = \"/datasets/uly/ood-data/mnist_png/\"\n",
+"FASHION_MNIST_DATA_PATH = \"/datasets/uly/ood-data/fashion_mnist_png/\"\n",
+"\n",
+"# read data from root folder\n",
+"cifar_10_train_dataset, _, cifar_10_test_dataset = ImageDataset.from_folders(root=CIFAR_10_DATA_PATH)\n",
+"cifar_100_train_dataset, _, cifar_100_test_dataset = ImageDataset.from_folders(root=CIFAR_100_DATA_PATH)\n",
+"mnist_train_dataset, _, mnist_test_dataset = ImageDataset.from_folders(root=MNIST_DATA_PATH)\n",
+"fashion_mnist_train_dataset, _, fashion_mnist_test_dataset = ImageDataset.from_folders(root=FASHION_MNIST_DATA_PATH)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "cde63994-e833-4f87-93b6-e05b3c7ba479",
+"metadata": {},
+"outputs": [],
+"source": [
+"# dictionary to store data path and model\n",
+"\n",
+"data_model_dict = {\n",
+" \"cifar-10\": {\n",
+" \"train_data\": cifar_10_train_dataset,\n",
+" \"test_data\": cifar_10_test_dataset,\n",
+" },\n",
+" \"cifar-100\": {\n",
+" \"train_data\": cifar_100_train_dataset,\n",
+" \"test_data\": cifar_100_test_dataset,\n",
+" },\n",
+" \"mnist\": {\n",
+" \"train_data\": mnist_train_dataset,\n",
+" \"test_data\": mnist_test_dataset,\n",
+" },\n",
+" \"fashion-mnist\": {\n",
+" \"train_data\": fashion_mnist_train_dataset,\n",
+" \"test_data\": fashion_mnist_test_dataset,\n",
+" },\n",
+"}"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "8606e688",
+"metadata": {},
+"outputs": [],
+"source": [
+"# Create mini train dataset for testing\n",
+"def get_imbalanced_dataset(dataset, fractions):\n",
+" assert len(fractions) == dataset['label'].nunique()\n",
+"\n",
+" imbalanced_dataset = pd.DataFrame(columns=dataset.columns)\n",
+" print(imbalanced_dataset)\n",
+" for i in range(len(fractions)):\n",
+" idf = dataset[dataset['label'] == i].sample(frac=fractions[i])\n",
+" print(f'label {i} will have {idf.shape[0]} examples')\n",
+" imbalanced_dataset = pd.concat([imbalanced_dataset, idf], ignore_index=True)\n",
+" print(f'total imbalanced dataset length {imbalanced_dataset.shape[0]}')\n",
+" return imbalanced_dataset\n",
+"\n",
+"### Uncomment below to create imbalanced datasets\n",
+"\n",
+"# cifar_100_num_classes = len(cifar_100_train_dataset['label'].unique())\n",
+"# cifar_100_distribution = [0.15] * int(cifar_100_num_classes * 0.9) + [1.] * int(cifar_100_num_classes * 0.1)\n",
+"# cifar_100_train_dataset = get_imbalanced_dataset(cifar_100_train_dataset, cifar_100_distribution)\n",
+"# cifar_10_train_dataset = get_imbalanced_dataset(cifar_10_train_dataset,[0.09,0.09,0.09,0.09,1.,1.,0.09,0.09,1.,1.])\n",
+"# mnist_train_dataset = get_imbalanced_dataset(mnist_train_dataset,[0.09,0.09,0.09,0.09,1.,1.,0.09,0.09,1.,1.])\n",
+"# fashion_mnist_train_dataset = get_imbalanced_dataset(fashion_mnist_train_dataset,[0.09,0.09,0.09,0.09,1.,1.,0.09,0.09,1.,1.])"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "1ae79a8d-bb68-46d5-b4b9-1f082da7d695",
+"metadata": {},
+"outputs": [],
+"source": [
+"# Check out a dataset\n",
+"mnist_train_dataset.head()"
+]
+},
+{
+"cell_type": "markdown",
+"id": "cc26ea6d-954c-4810-a561-50badcdd992d",
+"metadata": {},
+"source": [
+"## Train model"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "b854ab3a",
+"metadata": {},
+"outputs": [],
+"source": [
+"!mkdir models # Create models folder to save model results into"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "abfa0bb0-aa32-47ac-a453-9ac5a2d91c96",
+"metadata": {},
+"outputs": [],
+"source": [
+"%%time\n",
+"\n",
+"def train_ag_model(\n",
+" train_data,\n",
+" dataset_name,\n",
+" model_folder=\"./models/\", \n",
+" epochs=100,\n",
+" model=\"swin_base_patch4_window7_224\",\n",
+" time_limit=10*3600\n",
+"):\n",
+"\n",
+" # init model\n",
+" predictor = ImagePredictor(verbosity=0)\n",
+"\n",
+" MODEL_PARAMS = {\n",
+" \"model\": model,\n",
+" \"epochs\": epochs,\n",
+" }\n",
+"\n",
+" # run training\n",
+" predictor.fit(\n",
+" train_data=train_data,\n",
+" # tuning_data=,\n",
+" ngpus_per_trial=1,\n",
+" hyperparameters=MODEL_PARAMS,\n",
+" time_limit=time_limit,\n",
+" random_state=123,\n",
+" )\n",
+"\n",
+" # save model\n",
+" filename = f\"{model_folder}{model}_{dataset_name}.ag\"\n",
+" predictor.save(filename) \n",
+" \n",
+" return predictor"
+]
+},
+{
+"cell_type": "markdown",
+"id": "a2a4cfa4-f028-4236-a15d-e3d6e7df9f20",
+"metadata": {},
+"source": [
+"## Train model for all datasets"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "8bd6e11c-6856-4a4d-80b7-01b5635e5ffb",
+"metadata": {},
+"outputs": [],
+"source": [
+"model = \"swin_base_patch4_window7_224\"\n",
+"\n",
+"for key, data in data_model_dict.items():\n",
+"\n",
+" dataset = key\n",
+" train_dataset = data[\"train_data\"]\n",
+" \n",
+" print(f\"Dataset: {dataset}\")\n",
+" print(f\" Records: {train_dataset.shape}\")\n",
+" print(f\" Classes: {train_dataset.label.nunique()}\") \n",
+" \n",
+" _ = train_ag_model(train_dataset, dataset_name=dataset, model=model, epochs=100)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "ec1ed3e1",
+"metadata": {},
+"outputs": [],
+"source": []
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3 (ipykernel)",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.8.10"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 5
+}
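
The new notebook's get_imbalanced_dataset helper keeps only a per-class fraction of the training rows, which is how the commented-out cells create class-imbalanced variants of each dataset. A small standalone sketch of the same sampling logic on a toy DataFrame (the toy data and random_state below are illustrative, not from the repo):

    import pandas as pd

    # Toy stand-in for an ImageDataset-style DataFrame with 'image' and 'label' columns.
    toy = pd.DataFrame({
        "image": [f"img_{i}.png" for i in range(30)],
        "label": [i // 10 for i in range(30)],  # 3 classes, 10 rows each
    })

    def get_imbalanced_dataset(dataset, fractions):
        # Same idea as the notebook cell: keep fractions[i] of the rows for class i.
        assert len(fractions) == dataset["label"].nunique()
        parts = [
            dataset[dataset["label"] == i].sample(frac=fractions[i], random_state=0)
            for i in range(len(fractions))
        ]
        return pd.concat(parts, ignore_index=True)

    imbalanced = get_imbalanced_dataset(toy, [1.0, 0.5, 0.1])
    print(imbalanced["label"].value_counts())  # roughly 10, 5, and 1 rows per class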

0 commit comments