Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix of .merge_single_cells() to Load Single-Cell Data into Dataframes #219

Merged
merged 9 commits into from
Aug 16, 2022
68 changes: 59 additions & 9 deletions pycytominer/cyto_utils/cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,34 @@ def get_subsample(self, df=None, compartment="cells", rename_col=True):

self.is_subset_computed = True

def is_feature_col(self, col):
    """Check if column is a feature.

    Parameters
    ----------
    col : str
        Column name to classify.

    Returns
    -------
    bool
        True when ``col`` starts with "Cell", "Cytoplasm" or "Nuclei",
        False otherwise. The match is case-sensitive and limited to these
        three prefixes; any other column name is reported as non-feature.
    """
    # str.startswith accepts a tuple of prefixes, so one call covers all
    # three compartments instead of a chained ``or``.
    return col.startswith(("Cell", "Cytoplasm", "Nuclei"))

def count(self, table):
    """Count total number of rows for a table.

    Parameters
    ----------
    table : str
        Name of the table to count. It is interpolated directly into the
        SQL text, so it must be a trusted identifier.

    Returns
    -------
    The single value of the first row produced by
    ``SELECT COUNT(*) FROM <table>`` on ``self.conn``.
    """
    query = f"SELECT COUNT(*) FROM {table}"
    first_row = next(self.conn.execute(query))
    # Unpacking (rather than indexing) also checks that the row holds
    # exactly one value.
    [row_total] = first_row
    return row_total

def get_columns(self, table):
    """Get feature and metadata columns lists.

    Column names are read from the cursor description of a one-row
    ``SELECT *`` against ``table`` and classified with
    ``self.is_feature_col``.

    Parameters
    ----------
    table : str
        Name of the table to inspect. It is interpolated directly into
        the SQL text.

    Returns
    -------
    tuple of (list, list)
        ``(meta_cols, feat_cols)`` -- metadata column names first, then
        feature column names, each in the order reported by the cursor.
    """
    cursor = self.conn.execute(f"SELECT * FROM {table} LIMIT 1").cursor
    # The first item of every description entry is the column name.
    names = [entry[0] for entry in cursor.description]

    meta_cols, feat_cols = [], []
    for name in names:
        bucket = feat_cols if self.is_feature_col(name) else meta_cols
        bucket.append(name)

    return meta_cols, feat_cols

def load_compartment(self, compartment):
"""Creates the compartment dataframe.

Expand All @@ -410,9 +438,30 @@ def load_compartment(self, compartment):
pandas.core.frame.DataFrame
Compartment dataframe.
"""
compartment_query = "select * from {}".format(compartment)
df = pd.read_sql(sql=compartment_query, con=self.conn)
return df

# Get data useful to pre-alloc memory
num_cells = self.count(compartment)
meta_cols, feat_cols = self.get_columns(compartment)
num_meta, num_feats = len(meta_cols), len(feat_cols)

# Use pre-allocated np.array for data
feats = np.empty(shape=(num_cells, num_feats), dtype=np.float64)
# Use pre-allocated pd.DataFrame for metadata
metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))

# Query database for selected columns of chosen compartment
columns = ", ".join(meta_cols + feat_cols)
query = f"select {columns} from {compartment}"
resultset = self.conn.execute(query)
bunnech marked this conversation as resolved.
Show resolved Hide resolved

# Load data row by row for both meta information and features
for i, row in enumerate(resultset):
metas.loc[i] = row[:num_meta]
feats[i] = row[num_meta:]
bunnech marked this conversation as resolved.
Show resolved Hide resolved

# Return concatenated data and metainformation of compartment
return pd.concat(
gwaybio marked this conversation as resolved.
Show resolved Hide resolved
[pd.DataFrame(columns=feat_cols, data=feats), metas], axis=1)

def aggregate_compartment(
self,
Expand Down Expand Up @@ -658,26 +707,27 @@ def merge_single_cells(
]

if isinstance(sc_df, str):
initial_df = self.load_compartment(compartment=left_compartment)
sc_df = self.load_compartment(compartment=left_compartment)
bunnech marked this conversation as resolved.
Show resolved Hide resolved

if compute_subsample:
# Sample cells proportionally by self.strata
self.get_subsample(df=initial_df, rename_col=False)
self.get_subsample(df=sc_df, rename_col=False)

subset_logic_df = self.subset_data_df.drop(
self.image_df.columns, axis="columns"
)

initial_df = subset_logic_df.merge(
initial_df, how="left", on=subset_logic_df.columns.tolist()
).reindex(initial_df.columns, axis="columns")
sc_df = subset_logic_df.merge(
sc_df, how="left", on=subset_logic_df.columns.tolist()
).reindex(sc_df.columns, axis="columns")

sc_df = initial_df.merge(
sc_df = sc_df.merge(
self.load_compartment(compartment=right_compartment),
left_on=self.merge_cols + [left_link_col],
right_on=self.merge_cols + [right_link_col],
suffixes=merge_suffix,
)

else:
sc_df = sc_df.merge(
self.load_compartment(compartment=right_compartment),
Expand Down
10 changes: 7 additions & 3 deletions pycytominer/tests/test_cyto_utils/test_cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,10 +235,12 @@ def test_SingleCells_count():

def test_load_compartment():
loaded_compartment_df = ap.load_compartment(compartment="cells")
pd.testing.assert_frame_equal(loaded_compartment_df, cells_df)
pd.testing.assert_frame_equal(
loaded_compartment_df, cells_df, check_dtype=False)

# Test non-canonical compartment loading
pd.testing.assert_frame_equal(new_compartment_df, ap_new.load_compartment("new"))
pd.testing.assert_frame_equal(
new_compartment_df, ap_new.load_compartment("new"), check_dtype=False)
bunnech marked this conversation as resolved.
Show resolved Hide resolved


def test_merge_single_cells():
Expand Down Expand Up @@ -307,6 +309,7 @@ def test_merge_single_cells():
pd.testing.assert_frame_equal(
norm_method_df.sort_index(axis=1),
manual_merge_normalize.sort_index(axis=1),
check_dtype=False
gwaybio marked this conversation as resolved.
Show resolved Hide resolved
)

# Test non-canonical compartment merging
Expand Down Expand Up @@ -337,7 +340,8 @@ def test_merge_single_cells():

default_feature_infer_df = ap_new.merge_single_cells(single_cell_normalize=True)

pd.testing.assert_frame_equal(norm_new_method_df, default_feature_infer_df)
pd.testing.assert_frame_equal(
norm_new_method_df, default_feature_infer_df, check_dtype=False)
pd.testing.assert_frame_equal(
norm_new_method_df, norm_new_method_no_feature_infer_df
)
Expand Down