cytomining · gwaybio · Aug 16, 2022 · Aug 13, 2022 · Aug 15, 2022 · Aug 15, 2022
diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
@@ -397,6 +397,34 @@ def get_subsample(self, df=None, compartment="cells", rename_col=True):
 
         self.is_subset_computed = True
 
+    def is_feature_col(self, col):
+        """Return True if ``col`` starts with a known compartment prefix.
+
+        Only the hard-coded ``Cell``, ``Cytoplasm`` and ``Nuclei`` prefixes
+        count; every other column name is reported as a non-feature.
+        """
+        return col.startswith(("Cell", "Cytoplasm", "Nuclei"))
+
+    def count(self, table):
+        """Return the number of rows stored in ``table``."""
+        result = self.conn.execute(f"SELECT COUNT(*) FROM {table}")
+        return next(result)[0]
+
+    def get_columns(self, table):
+        """Split the column names of ``table`` into metadata and features.
+
+        Names come from the cursor description of a one-row ``SELECT *``;
+        ``self.is_feature_col`` decides which list each name joins.
+        Returns ``(meta_cols, feat_cols)``, each in table column order.
+        """
+        result = self.conn.execute(f"SELECT * FROM {table} LIMIT 1")
+        names = [entry[0] for entry in result.cursor.description]
+
+        # Two filtered passes keep the original column order in each list.
+        feat_cols = [name for name in names if self.is_feature_col(name)]
+        meta_cols = [name for name in names if not self.is_feature_col(name)]
+        return meta_cols, feat_cols
+
     def load_compartment(self, compartment):
         """Creates the compartment dataframe.
 
@@ -410,9 +438,30 @@ def load_compartment(self, compartment):
         pandas.core.frame.DataFrame
             Compartment dataframe.
         """
-        compartment_query = "select * from {}".format(compartment)
-        df = pd.read_sql(sql=compartment_query, con=self.conn)
-        return df
+
+        # Row count and column split are needed up front to size the buffers
+        num_cells = self.count(compartment)
+        meta_cols, feat_cols = self.get_columns(compartment)
+        num_meta, num_feats = len(meta_cols), len(feat_cols)
+
+        # Features fill one pre-allocated float64 array (np.empty: unset)
+        feats = np.empty(shape=(num_cells, num_feats), dtype=np.float64)
+        # Metadata fills a pre-allocated DataFrame, one row per counted cell
+        metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))
+
+        # Select metadata columns first so each row splits at num_meta
+        columns = ", ".join(meta_cols + feat_cols)
+        query = f"select {columns} from {compartment}"
+        resultset = self.conn.execute(query)
+
+        # Fill row by row: leading values are metadata, the rest features
+        for i, row in enumerate(resultset):
+            metas.loc[i] = row[:num_meta]
+            feats[i] = row[num_meta:]
+
+        # Concatenate column-wise; feature columns come first in the result
+        return pd.concat(
+            [pd.DataFrame(columns=feat_cols, data=feats), metas], axis=1)
 
     def aggregate_compartment(
         self,
@@ -658,26 +707,27 @@ def merge_single_cells(
                 ]
 
                 if isinstance(sc_df, str):
-                    initial_df = self.load_compartment(compartment=left_compartment)
+                    sc_df = self.load_compartment(compartment=left_compartment)
 
                     if compute_subsample:
                         # Sample cells proportionally by self.strata
-                        self.get_subsample(df=initial_df, rename_col=False)
+                        self.get_subsample(df=sc_df, rename_col=False)
 
                         subset_logic_df = self.subset_data_df.drop(
                             self.image_df.columns, axis="columns"
                         )
 
-                        initial_df = subset_logic_df.merge(
-                            initial_df, how="left", on=subset_logic_df.columns.tolist()
-                        ).reindex(initial_df.columns, axis="columns")
+                        sc_df = subset_logic_df.merge(
+                            sc_df, how="left", on=subset_logic_df.columns.tolist()
+                        ).reindex(sc_df.columns, axis="columns")
 
-                    sc_df = initial_df.merge(
+                    sc_df = sc_df.merge(
                         self.load_compartment(compartment=right_compartment),
                         left_on=self.merge_cols + [left_link_col],
                         right_on=self.merge_cols + [right_link_col],
                         suffixes=merge_suffix,
                     )
+
                 else:
                     sc_df = sc_df.merge(
                         self.load_compartment(compartment=right_compartment),

diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -235,10 +235,12 @@ def test_SingleCells_count():
 
 def test_load_compartment():
     loaded_compartment_df = ap.load_compartment(compartment="cells")
-    pd.testing.assert_frame_equal(loaded_compartment_df, cells_df)
+    pd.testing.assert_frame_equal(
+        loaded_compartment_df, cells_df, check_dtype=False)
 
     # Test non-canonical compartment loading
-    pd.testing.assert_frame_equal(new_compartment_df, ap_new.load_compartment("new"))
+    pd.testing.assert_frame_equal(
+        new_compartment_df, ap_new.load_compartment("new"), check_dtype=False)
 
 
 def test_merge_single_cells():
@@ -307,6 +309,7 @@ def test_merge_single_cells():
                 pd.testing.assert_frame_equal(
                     norm_method_df.sort_index(axis=1),
                     manual_merge_normalize.sort_index(axis=1),
+                    check_dtype=False
                 )
 
     # Test non-canonical compartment merging
@@ -337,7 +340,8 @@ def test_merge_single_cells():
 
     default_feature_infer_df = ap_new.merge_single_cells(single_cell_normalize=True)
 
-    pd.testing.assert_frame_equal(norm_new_method_df, default_feature_infer_df)
+    pd.testing.assert_frame_equal(
+        norm_new_method_df, default_feature_infer_df, check_dtype=False)
     pd.testing.assert_frame_equal(
         norm_new_method_df, norm_new_method_no_feature_infer_df
     )