Skip to content

Commit b953219

Browse files
author
Corentin
committed
adding embedding prototype
1 parent 2caf08e commit b953219

8 files changed

+4494
-412
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,6 @@ dmypy.json
127127

128128
# Pyre type checker
129129
.pyre/
130+
131+
data/*
132+
logs/

.vscode/settings.json

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
"jupyter.kernels.filter": [
3+
{
4+
"path": "/opt/tljh/user/bin/python",
5+
"type": "pythonEnvironment"
6+
},
7+
{
8+
"path": "/home/meyer/.pyenv/versions/3.10.9/bin/python",
9+
"type": "pythonEnvironment"
10+
},
11+
{
12+
"path": "/usr/bin/python3",
13+
"type": "pythonEnvironment"
14+
},
15+
{
16+
"path": "/bin/python3",
17+
"type": "pythonEnvironment"
18+
},
19+
{
20+
"path": "/home/meyer/.pyenv/versions/3.8.16/bin/python",
21+
"type": "pythonEnvironment"
22+
},
23+
{
24+
"path": "/home/meyer/.pyenv/versions/3.9.16/bin/python",
25+
"type": "pythonEnvironment"
26+
}
27+
]
28+
}

coeur.ipynb

+746
Large diffs are not rendered by default.

myoquant-sdh-train.ipynb

+655-6
Large diffs are not rendered by default.

poetry.lock

+1,904-404
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+13-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ readme = "README.md"
77
packages = [{include = "myoquant_sdh_train"}]
88

99
[tool.poetry.dependencies]
10-
python = ">=3.8, <=3.11"
10+
python = ">=3.8, <3.11"
1111
confection = "^0.0.4"
1212
typer = "^0.7.0"
1313
rich = "^13.0.1"
@@ -16,10 +16,21 @@ matplotlib = "^3.6.3"
1616
pandas = "^1.5.2"
1717
dvc = "^2.41.1"
1818
tensorboard = "^2.11.0"
19-
embetter = "^0.3.0"
19+
embetter = {version = "^0.3", extras = ["all"]}
2020
doubtlab = "^0.2.4"
2121
ipykernel = "^6.20.1"
2222
tensorflow = "^2.11.0"
23+
ipywidgets = "^8.0.4"
24+
umap-learn = "^0.5.3"
25+
scikeras = {extras = ["tensorflow"], version = "^0.10.0"}
26+
imageio = "^2.25.0"
27+
pigeon-jupyter = "^0.1.0"
28+
scikit-image = "^0.19.3"
29+
openpyxl = "^3.1.0"
30+
deepl = "^1.13.0"
31+
python-dotenv = "^0.21.1"
32+
cohere = "^3.3.2"
33+
openai = "^0.26.4"
2334

2435

2536
[tool.poetry.group.dev.dependencies]

sdh-model-embetter.ipynb

+849
Large diffs are not rendered by default.

text-embetter.ipynb

+296
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import pandas as pd\n",
10+
"import glob\n",
11+
"import deepl\n",
12+
"from dotenv import load_dotenv\n",
13+
"import os\n",
14+
"load_dotenv() \n",
15+
"\n",
16+
"# df = pd.read_excel('data/nlmyo/processed/diag.xlsx')\n",
17+
"# # Read the text content from the filename column and add it to a new column\n",
18+
"# df[\"filepath\"] = df[\"filename\"].apply(lambda x: \"data/nlmyo/processed/\" + x)\n",
19+
"# df['raw_text'] = df['filepath'].apply(lambda x: open(x, 'r').read())\n",
20+
"\n",
21+
"\n",
22+
"# translator = deepl.Translator(os.getenv(\"DEEPL_KEY\")) \n",
23+
"# df['deepl_translation'] = df['raw_text'].apply(lambda x: translator.translate_text(x, target_lang=\"EN-US\").text)\n",
24+
"# save df to csv file\n",
25+
"# df.to_csv('data/nlmyo/processed/diag_translated.csv', index=False)"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": null,
31+
"metadata": {},
32+
"outputs": [],
33+
"source": [
34+
"import numpy as np\n",
35+
"from embetter.external import CohereEncoder, OpenAIEncoder\n",
36+
"from embetter.grab import ColumnGrabber\n",
37+
"from sklearn.pipeline import make_pipeline \n",
38+
"import pandas as pd\n",
39+
"from dotenv import load_dotenv\n",
40+
"import os\n",
41+
"\n",
42+
"from cohere import Client\n",
43+
"\n",
44+
"# load_dotenv() \n",
45+
"# client = Client(os.getenv(\"COHERE_KEY\"))\n",
46+
"# # This pipeline grabs the `deepl_translation` column from a dataframe\n",
47+
"\n",
48+
"# df = pd.read_csv('data/nlmyo/processed/diag_translated.csv')\n",
49+
"\n",
50+
"# text_emb_pipeline = make_pipeline(\n",
51+
"# ColumnGrabber(\"deepl_translation\"),\n",
52+
"# CohereEncoder(client=client, model=\"large\")\n",
53+
"# )\n",
54+
"\n",
55+
"# X = text_emb_pipeline.fit_transform(df, df['diag_simple'])\n",
56+
"# np.save('data/nlmyo/processed/report_translated_embed_cohere.npy', X)"
57+
]
58+
},
59+
{
60+
"cell_type": "code",
61+
"execution_count": null,
62+
"metadata": {},
63+
"outputs": [],
64+
"source": [
65+
"import numpy as np\n",
66+
"from embetter.external import CohereEncoder, OpenAIEncoder\n",
67+
"from embetter.grab import ColumnGrabber\n",
68+
"from sklearn.pipeline import make_pipeline \n",
69+
"import pandas as pd\n",
70+
"from dotenv import load_dotenv\n",
71+
"import os\n",
72+
"\n",
73+
"import openai\n",
74+
"\n",
75+
"# load_dotenv() \n",
76+
"\n",
77+
"# openai.organization = os.getenv(\"OPENAI_ORG\")\n",
78+
"# openai.api_key = os.getenv(\"OPENAI_KEY\")\n",
79+
"# # This pipeline grabs the `deepl_translation` column from a dataframe\n",
80+
"\n",
81+
"# df = pd.read_csv('data/nlmyo/processed/diag_translated.csv')\n",
82+
"\n",
83+
"# text_emb_pipeline = make_pipeline(\n",
84+
"# ColumnGrabber(\"deepl_translation\"),\n",
85+
"# OpenAIEncoder(model=\"text-embedding-ada-002\")\n",
86+
"# )\n",
87+
"\n",
88+
"# X = text_emb_pipeline.fit_transform(df, df['diag_simple'])\n",
89+
"# np.save('data/nlmyo/processed/report_translated_embed_openai.npy', X)"
90+
]
91+
},
92+
{
93+
"cell_type": "code",
94+
"execution_count": null,
95+
"metadata": {},
96+
"outputs": [],
97+
"source": [
98+
"import numpy as np\n",
99+
"import pandas as pd\n",
100+
"from sklearn.metrics import accuracy_score\n",
101+
"from sklearn.model_selection import StratifiedKFold, cross_val_score\n",
102+
"from sklearn.linear_model import LogisticRegression\n",
103+
"from sklearn.dummy import DummyClassifier\n",
104+
"\n",
105+
"#### Import the data\n",
106+
"df = pd.read_csv('data/nlmyo/processed/diag_translated.csv')\n",
107+
"Y = df['diag_simple'].values\n",
108+
"X_cohere = np.load('data/nlmyo/processed/report_translated_embed_cohere.npy') \n",
109+
"X_openai = np.load('data/nlmyo/processed/report_translated_embed_openai.npy')\n",
110+
"\n",
111+
"# Remove CFTD and unclear diagnosis\n",
112+
"df['diag_simple'].value_counts()\n",
113+
"df['diag_simple'] = df['diag_simple'].replace('CFTD', 'UNCLEAR')\n",
114+
"# Drop the rows with unclear diagnosis\n",
115+
"df = df[df['diag_simple'] != 'UNCLEAR']\n",
116+
"Y = df['diag_simple'].values\n",
117+
"# Do the same for the X array based on the df index\n",
118+
"X_cohere = X_cohere[df.index]\n",
119+
"X_openai = X_openai[df.index]\n",
120+
"cv_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)"
121+
]
122+
},
123+
{
124+
"cell_type": "code",
125+
"execution_count": null,
126+
"metadata": {},
127+
"outputs": [],
128+
"source": [
129+
"clf_dummy = DummyClassifier(strategy='prior')\n",
130+
"cv_scores_dummy = cross_val_score(clf_dummy, X_cohere, Y, cv=cv_fold)\n",
131+
"print(\"Dummy Classifier Results:\")\n",
132+
"print(f\"All CV Scores: {cv_scores_dummy}\")\n",
133+
"print(f\"Mean CV Score: {np.mean(cv_scores_dummy)}\")\n",
134+
"print(f\"Standard Deviation CV Score: {np.std(cv_scores_dummy)}\")"
135+
]
136+
},
137+
{
138+
"cell_type": "code",
139+
"execution_count": null,
140+
"metadata": {},
141+
"outputs": [],
142+
"source": [
143+
"clf = LogisticRegression(max_iter=3000)\n",
144+
"cv_scores = cross_val_score(clf, X_cohere, Y, cv=cv_fold)\n",
145+
"print(\"Results with Logistic Regression and Cohere Embeddings on English Translated Reports:\")\n",
146+
"print(f\"All CV Scores: {cv_scores}\")\n",
147+
"print(f\"Mean CV Score: {np.mean(cv_scores)}\")\n",
148+
"print(f\"Standard Deviation CV Score: {np.std(cv_scores)}\")"
149+
]
150+
},
151+
{
152+
"cell_type": "code",
153+
"execution_count": null,
154+
"metadata": {},
155+
"outputs": [],
156+
"source": [
157+
"clf = LogisticRegression(max_iter=3000)\n",
158+
"cv_scores = cross_val_score(clf, X_openai, Y, cv=cv_fold)\n",
159+
"print(\"Results with Logistic Regression and OpenAI Embeddings on English Translated Reports:\")\n",
160+
"print(f\"All CV Scores: {cv_scores}\")\n",
161+
"print(f\"Mean CV Score: {np.mean(cv_scores)}\")\n",
162+
"print(f\"Standard Deviation CV Score: {np.std(cv_scores)}\")"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": null,
168+
"metadata": {},
169+
"outputs": [],
170+
"source": [
171+
"from sklearn.base import BaseEstimator\n",
172+
"from sklearn.model_selection import GridSearchCV\n",
173+
"from sklearn.pipeline import Pipeline\n",
174+
"\n",
175+
"from sklearn.linear_model import LogisticRegression\n",
176+
"from sklearn.neural_network import MLPClassifier\n",
177+
"from sklearn.neighbors import KNeighborsClassifier\n",
178+
"from sklearn.svm import SVC\n",
179+
"from sklearn.gaussian_process import GaussianProcessClassifier\n",
180+
"from sklearn.ensemble import HistGradientBoostingClassifier\n",
181+
"from sklearn.tree import DecisionTreeClassifier\n",
182+
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n",
183+
"from sklearn.naive_bayes import GaussianNB\n",
184+
"\n",
185+
"class DummyEstimator(BaseEstimator):\n",
186+
" def fit(self): pass\n",
187+
" def score(self): pass\n",
188+
"\n",
189+
"\n",
190+
"# Create a pipeline\n",
191+
"pipe = Pipeline([('clf', DummyEstimator())]) # Placeholder Estimator\n",
192+
"\n",
193+
"# Candidate learning algorithms and their hyperparameters\n",
194+
"search_space = [{'clf': [LogisticRegression()],\n",
195+
" 'clf__max_iter': [1500]},\n",
196+
" {'clf': [GaussianNB()],},\n",
197+
" {'clf': [MLPClassifier()],\n",
198+
" 'clf__max_iter': [500]},\n",
199+
" {'clf': [KNeighborsClassifier()],},\n",
200+
" {'clf': [SVC()],},\n",
201+
" {'clf': [GaussianProcessClassifier()],},\n",
202+
" {'clf': [HistGradientBoostingClassifier()],},\n",
203+
" {'clf': [DecisionTreeClassifier()],},\n",
204+
" {'clf': [RandomForestClassifier()],},\n",
205+
" {'clf': [AdaBoostClassifier()],},\n",
206+
" ]\n",
207+
"\n",
208+
"\n",
209+
"# Create grid search \n",
210+
"gs = GridSearchCV(pipe, search_space, scoring=\"accuracy\", cv=cv_fold)\n",
211+
"gs.fit(X_cohere, Y)\n",
212+
"df_cv_search = pd.DataFrame(gs.cv_results_)\n",
213+
"df_cv_search.to_csv('data/nlmyo/processed/report_translated_embed_cohere_gridsearch.csv')\n",
214+
"df_cv_search"
215+
]
216+
},
217+
{
218+
"cell_type": "code",
219+
"execution_count": null,
220+
"metadata": {},
221+
"outputs": [],
222+
"source": [
223+
"from sklearn.base import BaseEstimator\n",
224+
"from sklearn.model_selection import GridSearchCV\n",
225+
"from sklearn.pipeline import Pipeline\n",
226+
"\n",
227+
"from sklearn.linear_model import LogisticRegression\n",
228+
"from sklearn.neural_network import MLPClassifier\n",
229+
"from sklearn.neighbors import KNeighborsClassifier\n",
230+
"from sklearn.svm import SVC\n",
231+
"from sklearn.gaussian_process import GaussianProcessClassifier\n",
232+
"from sklearn.ensemble import HistGradientBoostingClassifier\n",
233+
"from sklearn.tree import DecisionTreeClassifier\n",
234+
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n",
235+
"from sklearn.naive_bayes import GaussianNB\n",
236+
"\n",
237+
"class DummyEstimator(BaseEstimator):\n",
238+
" def fit(self): pass\n",
239+
" def score(self): pass\n",
240+
"\n",
241+
"# Create a pipeline\n",
242+
"pipe = Pipeline([('clf', DummyEstimator())]) # Placeholder Estimator\n",
243+
"\n",
244+
"# Candidate learning algorithms and their hyperparameters\n",
245+
"search_space = [{'clf': [LogisticRegression()],\n",
246+
" 'clf__max_iter': [1500]},\n",
247+
" {'clf': [GaussianNB()],},\n",
248+
" {'clf': [MLPClassifier()],\n",
249+
" 'clf__max_iter': [1000]},\n",
250+
" {'clf': [KNeighborsClassifier()],},\n",
251+
" {'clf': [SVC()],},\n",
252+
" {'clf': [GaussianProcessClassifier()],},\n",
253+
" {'clf': [HistGradientBoostingClassifier()],},\n",
254+
" {'clf': [DecisionTreeClassifier()],},\n",
255+
" {'clf': [RandomForestClassifier()],},\n",
256+
" {'clf': [AdaBoostClassifier()],},\n",
257+
" ]\n",
258+
"\n",
259+
"\n",
260+
"# Create grid search \n",
261+
"gs = GridSearchCV(pipe, search_space, scoring=\"accuracy\", cv=cv_fold)\n",
262+
"gs.fit(X_openai, Y)\n",
263+
"df_cv_search = pd.DataFrame(gs.cv_results_)\n",
264+
"df_cv_search.to_csv('data/nlmyo/processed/report_translated_embed_openai_gridsearch.csv')\n",
265+
"df_cv_search"
266+
]
267+
}
268+
],
269+
"metadata": {
270+
"kernelspec": {
271+
"display_name": ".venv",
272+
"language": "python",
273+
"name": "python3"
274+
},
275+
"language_info": {
276+
"codemirror_mode": {
277+
"name": "ipython",
278+
"version": 3
279+
},
280+
"file_extension": ".py",
281+
"mimetype": "text/x-python",
282+
"name": "python",
283+
"nbconvert_exporter": "python",
284+
"pygments_lexer": "ipython3",
285+
"version": "3.8.16"
286+
},
287+
"orig_nbformat": 4,
288+
"vscode": {
289+
"interpreter": {
290+
"hash": "72f151f06f73a7f1387c41c20c6e81dd1f2de7c0f647fc647e5076786050674c"
291+
}
292+
}
293+
},
294+
"nbformat": 4,
295+
"nbformat_minor": 2
296+
}

0 commit comments

Comments
 (0)