Skip to content

Commit 11e711f

Browse files
authored
[Response to Reviewers] Replace "all except bz" features with "random" features (#132)
* replace all except bz features with random * fix facet
1 parent 90b9b62 commit 11e711f

File tree

7 files changed

+140
-48
lines changed

7 files changed

+140
-48
lines changed

5.signature-exploration/0.evaluate_feature_spaces.ipynb

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
"source": [
4747
"import umap\n",
4848
"import pathlib\n",
49+
"import random\n",
4950
"import numpy as np\n",
5051
"import pandas as pd\n",
5152
"\n",
@@ -959,11 +960,22 @@
959960
],
960961
"source": [
961962
"# Get all features except those in the signature\n",
962-
"all_features_except_bz_sig_features = infer_cp_features(profile_df)\n",
963-
"all_features_except_bz_sig_features = [x for x in all_features_except_bz_sig_features if x not in bz_sig_features]\n",
963+
"all_features = infer_cp_features(profile_df)\n",
964+
"all_features_except_bz_sig_features = [x for x in all_features if x not in bz_sig_features]\n",
964965
"len(all_features_except_bz_sig_features)"
965966
]
966967
},
968+
{
969+
"cell_type": "code",
970+
"execution_count": 9,
971+
"id": "e33dbdf8-910c-42d1-877b-cea5f68c1690",
972+
"metadata": {},
973+
"outputs": [],
974+
"source": [
975+
"# Get random features\n",
976+
"random_features = random.sample(all_features_except_bz_sig_features, len(bz_sig_features))"
977+
]
978+
},
967979
{
968980
"cell_type": "markdown",
969981
"id": "a602bce6-6682-4760-b6d8-be45b8c76920",
@@ -974,7 +986,7 @@
974986
},
975987
{
976988
"cell_type": "code",
977-
"execution_count": 9,
989+
"execution_count": 10,
978990
"id": "bb325549-8f00-4c01-b6b8-39bf55bca976",
979991
"metadata": {},
980992
"outputs": [],
@@ -1003,20 +1015,27 @@
10031015
" profile_df,\n",
10041016
" features=all_features_except_bz_sig_features,\n",
10051017
" umap_category=\"all_except_bortezomib_signature_features\"\n",
1018+
")\n",
1019+
"\n",
1020+
"# 5) 45 random CellProfiler features\n",
1021+
"umap_random_sig_df = process_umap(\n",
1022+
" profile_df,\n",
1023+
" features=random_features,\n",
1024+
" umap_category=\"random_45_features\"\n",
10061025
")"
10071026
]
10081027
},
10091028
{
10101029
"cell_type": "code",
1011-
"execution_count": 10,
1030+
"execution_count": 11,
10121031
"id": "d699c95b-1d3d-4c15-bf70-1b70d5589dab",
10131032
"metadata": {},
10141033
"outputs": [
10151034
{
10161035
"name": "stdout",
10171036
"output_type": "stream",
10181037
"text": [
1019-
"(15828, 19)\n"
1038+
"(19785, 19)\n"
10201039
]
10211040
},
10221041
{
@@ -1227,7 +1246,7 @@
12271246
"4 wildtype all_features "
12281247
]
12291248
},
1230-
"execution_count": 10,
1249+
"execution_count": 11,
12311250
"metadata": {},
12321251
"output_type": "execute_result"
12331252
}
@@ -1239,7 +1258,8 @@
12391258
" umap_all_feature_df,\n",
12401259
" umap_fs_feature_df,\n",
12411260
" umap_bz_sig_df,\n",
1242-
" umap_non_bz_sig_df\n",
1261+
" umap_non_bz_sig_df,\n",
1262+
" umap_random_sig_df\n",
12431263
" ]\n",
12441264
").reset_index(drop=True)\n",
12451265
"\n",
@@ -1259,7 +1279,7 @@
12591279
},
12601280
{
12611281
"cell_type": "code",
1262-
"execution_count": 11,
1282+
"execution_count": 12,
12631283
"id": "24a12a25",
12641284
"metadata": {},
12651285
"outputs": [],
@@ -1314,20 +1334,32 @@
13141334
" feature_category = \"all_except_bortezomib_signature_features\",\n",
13151335
" low_k = low_k, \n",
13161336
" high_k = high_k\n",
1337+
")\n",
1338+
"\n",
1339+
"# 5) Random features clustering\n",
1340+
"random_feature_cluster_df = perform_clustering(\n",
1341+
" df = profile_df,\n",
1342+
" features = random_features,\n",
1343+
" pca_n_components=pca_n_components,\n",
1344+
" class_column = \"Metadata_clone_type\",\n",
1345+
" positive_class = \"wildtype\",\n",
1346+
" feature_category = \"random_45_features\",\n",
1347+
" low_k = low_k, \n",
1348+
" high_k = high_k\n",
13171349
")"
13181350
]
13191351
},
13201352
{
13211353
"cell_type": "code",
1322-
"execution_count": 12,
1354+
"execution_count": 13,
13231355
"id": "306e58fc",
13241356
"metadata": {},
13251357
"outputs": [
13261358
{
13271359
"name": "stdout",
13281360
"output_type": "stream",
13291361
"text": [
1330-
"(52, 6)\n"
1362+
"(65, 6)\n"
13311363
]
13321364
},
13331365
{
@@ -1425,7 +1457,7 @@
14251457
"4 6.906626e-06 all_features "
14261458
]
14271459
},
1428-
"execution_count": 12,
1460+
"execution_count": 13,
14291461
"metadata": {},
14301462
"output_type": "execute_result"
14311463
}
@@ -1437,7 +1469,8 @@
14371469
" all_feature_cluster_df,\n",
14381470
" fs_feature_cluster_df,\n",
14391471
" bz_feature_cluster_df,\n",
1440-
" non_bz_feature_cluster_df\n",
1472+
" non_bz_feature_cluster_df,\n",
1473+
" random_feature_cluster_df\n",
14411474
" ]\n",
14421475
").reset_index(drop=True)\n",
14431476
"\n",
@@ -1450,7 +1483,7 @@
14501483
],
14511484
"metadata": {
14521485
"kernelspec": {
1453-
"display_name": "Python [conda env:resistance-mechanisms-signature]",
1486+
"display_name": "Python [conda env:resistance-mechanisms-signature] *",
14541487
"language": "python",
14551488
"name": "conda-env-resistance-mechanisms-signature-py"
14561489
},

5.signature-exploration/1.visualize_feature_spaces.ipynb

Lines changed: 29 additions & 14 deletions
Large diffs are not rendered by default.
Loading
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:ef675198a10f6429be82575942af03ba3513c7ff25c16be3435123342177bec3
3-
size 2409
2+
oid sha256:39d72834ac9c1d9ae922e9e7ae04b0b8c9f47eeb156089a179b2008eb5e12195
3+
size 2979
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:eae59a6da779040b2bc22e4bf6af744f9e18b31c74c84f1070b7a85ec1a36d13
3-
size 314510
2+
oid sha256:9fbd31f73d2f77da2e03203f0cd33e09c79bd809e02b12224769f38672d6cac4
3+
size 392962

5.signature-exploration/scripts/nbconverted/0.evaluate_feature_spaces.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
import umap
2424
import pathlib
25+
import random
2526
import numpy as np
2627
import pandas as pd
2728

@@ -259,14 +260,21 @@ def perform_clustering(
259260

260261

261262
# Get all features except those in the signature
262-
all_features_except_bz_sig_features = infer_cp_features(profile_df)
263-
all_features_except_bz_sig_features = [x for x in all_features_except_bz_sig_features if x not in bz_sig_features]
263+
all_features = infer_cp_features(profile_df)
264+
all_features_except_bz_sig_features = [x for x in all_features if x not in bz_sig_features]
264265
len(all_features_except_bz_sig_features)
265266

266267

268+
# In[9]:
269+
270+
271+
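# Get random features: sample as many non-signature features as the signature has (NOTE(review): `random` is not seeded anywhere in this change, so the sample differs between runs - consider calling random.seed first)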
272+
random_features = random.sample(all_features_except_bz_sig_features, len(bz_sig_features))
273+
274+
267275
# ## Step 1: Calculate UMAP coordinates using five different feature spaces:
268276

269-
# In[9]:
277+
# In[10]:
270278

271279

272280
# 1) All feature umap
@@ -295,8 +303,15 @@ def perform_clustering(
295303
umap_category="all_except_bortezomib_signature_features"
296304
)
297305

306+
# 5) Random CellProfiler features, matched in number to the bortezomib signature (labelled "random_45_features")
307+
umap_random_sig_df = process_umap(
308+
profile_df,
309+
features=random_features,
310+
umap_category="random_45_features"
311+
)
312+
298313

299-
# In[10]:
314+
# In[11]:
300315

301316

302317
# Output umap summary
@@ -305,7 +320,8 @@ def perform_clustering(
305320
umap_all_feature_df,
306321
umap_fs_feature_df,
307322
umap_bz_sig_df,
308-
umap_non_bz_sig_df
323+
umap_non_bz_sig_df,
324+
umap_random_sig_df
309325
]
310326
).reset_index(drop=True)
311327

@@ -317,7 +333,7 @@ def perform_clustering(
317333

318334
# ## Step 2: Perform clustering and calculate enrichment and silhouette scores for the five feature spaces
319335

320-
# In[11]:
336+
# In[12]:
321337

322338

323339
low_k = 2
@@ -372,8 +388,20 @@ def perform_clustering(
372388
high_k = high_k
373389
)
374390

391+
# 5) Random features clustering
392+
random_feature_cluster_df = perform_clustering(
393+
df = profile_df,
394+
features = random_features,
395+
pca_n_components=pca_n_components,
396+
class_column = "Metadata_clone_type",
397+
positive_class = "wildtype",
398+
feature_category = "random_45_features",
399+
low_k = low_k,
400+
high_k = high_k
401+
)
375402

376-
# In[12]:
403+
404+
# In[13]:
377405

378406

379407
# Output clustering summary
@@ -382,7 +410,8 @@ def perform_clustering(
382410
all_feature_cluster_df,
383411
fs_feature_cluster_df,
384412
bz_feature_cluster_df,
385-
non_bz_feature_cluster_df
413+
non_bz_feature_cluster_df,
414+
random_feature_cluster_df
386415
]
387416
).reset_index(drop=True)
388417

5.signature-exploration/scripts/nbconverted/1.visualize_feature_spaces.r

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,20 @@ singscore_df <- readr::read_tsv(
1717
TotalScore = "d"
1818
)
1919
) %>%
20-
dplyr::select(Metadata_Plate, Metadata_Well, Metadata_batch, Metadata_dataset, TotalScore)
20+
dplyr::select(
21+
Metadata_Plate,
22+
Metadata_Well,
23+
Metadata_batch,
24+
Metadata_dataset,
25+
TotalScore
26+
)
2127

2228
print(dim(singscore_df))
2329
head(singscore_df, 3)
2430

2531
# Load umap summary and process
2632
umap_file <- file.path("results", "umap_feature_summary.tsv.gz")
33+
2734
umap_df <- readr::read_tsv(
2835
umap_file,
2936
col_types = readr::cols(
@@ -43,12 +50,16 @@ umap_df$Metadata_umap_category <- dplyr::recode(
4350
all_features = "All features",
4451
feature_selected = "Feature selected",
4552
all_except_bortezomib_signature_features = "All except BZ",
46-
bortezomib_signature_features = "BZ features"
53+
bortezomib_signature_features = "BZ features",
54+
random_45_features = "Random features"
4755
)
4856

57+
umap_df <- umap_df %>%
58+
dplyr::filter(Metadata_umap_category != "All except BZ")
59+
4960
umap_df$Metadata_umap_category <- factor(
5061
umap_df$Metadata_umap_category,
51-
levels = c("All features", "Feature selected", "All except BZ", "BZ features")
62+
levels = c("All features", "Feature selected", "Random features", "BZ features")
5263
)
5364

5465
print(dim(umap_df))
@@ -72,12 +83,16 @@ cluster_df$feature_category <- dplyr::recode(
7283
all_features = "All features",
7384
feature_selected = "Feature selected",
7485
all_except_bortezomib_signature_features = "All except BZ",
75-
bortezomib_signature_features = "BZ features"
86+
bortezomib_signature_features = "BZ features",
87+
random_45_features = "Random 45 features"
7688
)
7789

90+
cluster_df <- cluster_df %>%
91+
dplyr::filter(feature_category != "All except BZ")
92+
7893
cluster_df$feature_category <- factor(
7994
cluster_df$feature_category,
80-
levels = c("All features", "Feature selected", "All except BZ", "BZ features")
95+
levels = c("All features", "Feature selected", "Random 45 features", "BZ features")
8196
)
8297

8398
cluster_df$clustering_metric <- dplyr::recode(
@@ -179,13 +194,13 @@ cluster_gg <- (
179194
labels = c(
180195
"All features" = "All features",
181196
"Feature selected" = "Feature selected",
182-
"All except BZ" = "All except BZ",
197+
"Random 45 features" = "Random features",
183198
"BZ features" = "BZ features"
184199
),
185200
values = c(
186201
"All features" = "#1f78b4",
187202
"Feature selected" = "#33a02c",
188-
"All except BZ" = "#a6cee3",
203+
"Random 45 features" = "#a6cee3",
189204
"BZ features" = "#b2df8a"
190205
)
191206
)
@@ -194,13 +209,13 @@ cluster_gg <- (
194209
labels = c(
195210
"All features" = "All features",
196211
"Feature selected" = "Feature selected",
197-
"All except BZ" = "All except BZ",
212+
"Random 45 features" = "Random features",
198213
"BZ features" = "BZ features"
199214
),
200215
values = c(
201216
"All features" = "#1f78b4",
202217
"Feature selected" = "#33a02c",
203-
"All except BZ" = "#a6cee3",
218+
"Random 45 features" = "#a6cee3",
204219
"BZ features" = "#b2df8a"
205220
)
206221
)

0 commit comments

Comments
 (0)