[Response to Reviewers] Replace "all except bz" features with "random" features (#132)

gwaybio · web-flow · commit 11e711fd8241 · 2023-07-18T12:37:36.000-06:00
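* replace "all except bz" features with a size-matched random feature set in the feature space comparison ("all except bz" results are still computed, but filtered out of the figures)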

* fix facet
diff --git a/5.signature-exploration/0.evaluate_feature_spaces.ipynb b/5.signature-exploration/0.evaluate_feature_spaces.ipynb
@@ -46,6 +46,7 @@
    "source": [
     "import umap\n",
     "import pathlib\n",
+    "import random\n",
     "import numpy as np\n",
     "import pandas as pd\n",
     "\n",
@@ -959,11 +960,22 @@
    ],
    "source": [
     "# Get all features except those in the signature\n",
-    "all_features_except_bz_sig_features = infer_cp_features(profile_df)\n",
-    "all_features_except_bz_sig_features = [x for x in all_features_except_bz_sig_features if x not in bz_sig_features]\n",
+    "all_features = infer_cp_features(profile_df)\n",
+    "all_features_except_bz_sig_features = [x for x in all_features if x not in bz_sig_features]\n",
     "len(all_features_except_bz_sig_features)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "e33dbdf8-910c-42d1-877b-cea5f68c1690",
+   "metadata": {},
+   "outputs": [],
+   "source": [
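+    "# Get random features: sample (without replacement) as many non-signature features as the BZ signature contains. NOTE: no random.seed is set, so the selection differs between runs\n",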
+    "random_features = random.sample(all_features_except_bz_sig_features, len(bz_sig_features))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "a602bce6-6682-4760-b6d8-be45b8c76920",
@@ -974,7 +986,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "bb325549-8f00-4c01-b6b8-39bf55bca976",
    "metadata": {},
    "outputs": [],
@@ -1003,20 +1015,27 @@
     "    profile_df,\n",
     "    features=all_features_except_bz_sig_features,\n",
     "    umap_category=\"all_except_bortezomib_signature_features\"\n",
+    ")\n",
+    "\n",
+    "# 5) 45 random CellProfiler features\n",
+    "umap_random_sig_df = process_umap(\n",
+    "    profile_df,\n",
+    "    features=random_features,\n",
+    "    umap_category=\"random_45_features\"\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "d699c95b-1d3d-4c15-bf70-1b70d5589dab",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "(15828, 19)\n"
+      "(19785, 19)\n"
      ]
     },
     {
@@ -1227,7 +1246,7 @@
        "4            wildtype           all_features  "
       ]
      },
-     "execution_count": 10,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1239,7 +1258,8 @@
     "        umap_all_feature_df,\n",
     "        umap_fs_feature_df,\n",
     "        umap_bz_sig_df,\n",
-    "        umap_non_bz_sig_df\n",
+    "        umap_non_bz_sig_df,\n",
+    "        umap_random_sig_df\n",
     "    ]\n",
     ").reset_index(drop=True)\n",
     "\n",
@@ -1259,7 +1279,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "24a12a25",
    "metadata": {},
    "outputs": [],
@@ -1314,20 +1334,32 @@
     "    feature_category = \"all_except_bortezomib_signature_features\",\n",
     "    low_k = low_k, \n",
     "    high_k = high_k\n",
+    ")\n",
+    "\n",
+    "# 5) Random features clustering\n",
+    "random_feature_cluster_df = perform_clustering(\n",
+    "    df = profile_df,\n",
+    "    features = random_features,\n",
+    "    pca_n_components=pca_n_components,\n",
+    "    class_column = \"Metadata_clone_type\",\n",
+    "    positive_class = \"wildtype\",\n",
+    "    feature_category = \"random_45_features\",\n",
+    "    low_k = low_k, \n",
+    "    high_k = high_k\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "306e58fc",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "(52, 6)\n"
+      "(65, 6)\n"
      ]
     },
     {
@@ -1425,7 +1457,7 @@
        "4     6.906626e-06     all_features  "
       ]
      },
-     "execution_count": 12,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1437,7 +1469,8 @@
     "        all_feature_cluster_df,\n",
     "        fs_feature_cluster_df,\n",
     "        bz_feature_cluster_df,\n",
-    "        non_bz_feature_cluster_df\n",
+    "        non_bz_feature_cluster_df,\n",
+    "        random_feature_cluster_df\n",
     "    ]\n",
     ").reset_index(drop=True)\n",
     "\n",
@@ -1450,7 +1483,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:resistance-mechanisms-signature]",
+   "display_name": "Python [conda env:resistance-mechanisms-signature] *",
    "language": "python",
    "name": "conda-env-resistance-mechanisms-signature-py"
   },
diff --git a/5.signature-exploration/1.visualize_feature_spaces.ipynb b/5.signature-exploration/1.visualize_feature_spaces.ipynb
diff --git a/5.signature-exploration/figures/feature_space_comparison.png b/5.signature-exploration/figures/feature_space_comparison.png
diff --git a/5.signature-exploration/results/clustering_feature_summary.tsv.gz b/5.signature-exploration/results/clustering_feature_summary.tsv.gz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ef675198a10f6429be82575942af03ba3513c7ff25c16be3435123342177bec3
-size 2409
+oid sha256:39d72834ac9c1d9ae922e9e7ae04b0b8c9f47eeb156089a179b2008eb5e12195
+size 2979
diff --git a/5.signature-exploration/results/umap_feature_summary.tsv.gz b/5.signature-exploration/results/umap_feature_summary.tsv.gz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eae59a6da779040b2bc22e4bf6af744f9e18b31c74c84f1070b7a85ec1a36d13
-size 314510
+oid sha256:9fbd31f73d2f77da2e03203f0cd33e09c79bd809e02b12224769f38672d6cac4
+size 392962
diff --git a/5.signature-exploration/scripts/nbconverted/0.evaluate_feature_spaces.py b/5.signature-exploration/scripts/nbconverted/0.evaluate_feature_spaces.py
@@ -22,6 +22,7 @@
 
 import umap
 import pathlib
+import random
 import numpy as np
 import pandas as pd
 
@@ -259,14 +260,21 @@ def perform_clustering(
 
 
 # Get all features except those in the signature
-all_features_except_bz_sig_features = infer_cp_features(profile_df)
-all_features_except_bz_sig_features = [x for x in all_features_except_bz_sig_features if x not in bz_sig_features]
+all_features = infer_cp_features(profile_df)
+all_features_except_bz_sig_features = [x for x in all_features if x not in bz_sig_features]
 len(all_features_except_bz_sig_features)
 
 
+# In[9]:
+
+
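+# Get random features: sample (without replacement) as many non-signature features as the BZ signature contains. NOTE: no random.seed is set, so the selection differs between runs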
+random_features = random.sample(all_features_except_bz_sig_features, len(bz_sig_features))
+
+
 # ## Step 1: Calculate UMAP coordinates using four different feature spaces:
 
-# In[9]:
+# In[10]:
 
 
 # 1) All feature umap
@@ -295,8 +303,15 @@ def perform_clustering(
     umap_category="all_except_bortezomib_signature_features"
 )
 
+# 5) 45 random CellProfiler features
+umap_random_sig_df = process_umap(
+    profile_df,
+    features=random_features,
+    umap_category="random_45_features"
+)
+
 
-# In[10]:
+# In[11]:
 
 
 # Output umap summary
@@ -305,7 +320,8 @@ def perform_clustering(
         umap_all_feature_df,
         umap_fs_feature_df,
         umap_bz_sig_df,
-        umap_non_bz_sig_df
+        umap_non_bz_sig_df,
+        umap_random_sig_df
     ]
 ).reset_index(drop=True)
 
@@ -317,7 +333,7 @@ def perform_clustering(
 
 # ## Step 2: Perform clustering and calculate enrichment and silhouette scores for the four feature spaces
 
-# In[11]:
+# In[12]:
 
 
 low_k = 2
@@ -372,8 +388,20 @@ def perform_clustering(
     high_k = high_k
 )
 
+# 5) Random features clustering
+random_feature_cluster_df = perform_clustering(
+    df = profile_df,
+    features = random_features,
+    pca_n_components=pca_n_components,
+    class_column = "Metadata_clone_type",
+    positive_class = "wildtype",
+    feature_category = "random_45_features",
+    low_k = low_k, 
+    high_k = high_k
+)
 
-# In[12]:
+
+# In[13]:
 
 
 # Output clustering summary
@@ -382,7 +410,8 @@ def perform_clustering(
         all_feature_cluster_df,
         fs_feature_cluster_df,
         bz_feature_cluster_df,
-        non_bz_feature_cluster_df
+        non_bz_feature_cluster_df,
+        random_feature_cluster_df
     ]
 ).reset_index(drop=True)
 
diff --git a/5.signature-exploration/scripts/nbconverted/1.visualize_feature_spaces.r b/5.signature-exploration/scripts/nbconverted/1.visualize_feature_spaces.r
@@ -17,13 +17,20 @@ singscore_df <- readr::read_tsv(
         TotalScore = "d"
     )
 ) %>%
-    dplyr::select(Metadata_Plate, Metadata_Well, Metadata_batch, Metadata_dataset, TotalScore)
+    dplyr::select(
+        Metadata_Plate,
+        Metadata_Well,
+        Metadata_batch,
+        Metadata_dataset,
+        TotalScore
+    )
 
 print(dim(singscore_df))
 head(singscore_df, 3)
 
 # Load umap summary and process
 umap_file <- file.path("results", "umap_feature_summary.tsv.gz")
+
 umap_df <- readr::read_tsv(
     umap_file,
     col_types = readr::cols(
@@ -43,12 +50,16 @@ umap_df$Metadata_umap_category <- dplyr::recode(
     all_features = "All features",
     feature_selected = "Feature selected",
     all_except_bortezomib_signature_features = "All except BZ",
-    bortezomib_signature_features = "BZ features"
+    bortezomib_signature_features = "BZ features",
+    random_45_features = "Random features"
 )
 
+umap_df <- umap_df %>%
+    dplyr::filter(Metadata_umap_category != "All except BZ")
+
 umap_df$Metadata_umap_category <- factor(
     umap_df$Metadata_umap_category,
-    levels = c("All features", "Feature selected", "All except BZ", "BZ features")
+    levels = c("All features", "Feature selected", "Random features", "BZ features")
 )
 
 print(dim(umap_df))
@@ -72,12 +83,16 @@ cluster_df$feature_category <- dplyr::recode(
     all_features = "All features",
     feature_selected = "Feature selected",
     all_except_bortezomib_signature_features = "All except BZ",
-    bortezomib_signature_features = "BZ features"
+    bortezomib_signature_features = "BZ features",
+    random_45_features = "Random 45 features"
 )
 
+cluster_df <- cluster_df %>%
+    dplyr::filter(feature_category != "All except BZ")
+
 cluster_df$feature_category <- factor(
     cluster_df$feature_category,
-    levels = c("All features", "Feature selected", "All except BZ", "BZ features")
+    levels = c("All features", "Feature selected", "Random 45 features", "BZ features")
 )
 
 cluster_df$clustering_metric <- dplyr::recode(
@@ -179,13 +194,13 @@ cluster_gg <- (
         labels = c(
             "All features" =  "All features",
             "Feature selected" = "Feature selected",
-            "All except BZ" = "All except BZ",
+            "Random 45 features" = "Random features",
             "BZ features" = "BZ features"
         ),
         values = c(
             "All features" =  "#1f78b4",
             "Feature selected" = "#33a02c",
-            "All except BZ" = "#a6cee3",
+            "Random 45 features" = "#a6cee3",
             "BZ features" = "#b2df8a"
         )
     )
@@ -194,13 +209,13 @@ cluster_gg <- (
         labels = c(
             "All features" =  "All features",
             "Feature selected" = "Feature selected",
-            "All except BZ" = "All except BZ",
+            "Random 45 features" = "Random features",
             "BZ features" = "BZ features"
         ),
         values = c(
             "All features" =  "#1f78b4",
             "Feature selected" = "#33a02c",
-            "All except BZ" = "#a6cee3",
+            "Random 45 features" = "#a6cee3",
             "BZ features" = "#b2df8a"
         )
     )