Implementing parquet loading in load_profiles function (#262)
* added new function `infer_profile_file_type`

* Fixed Unicode Bug

* fixed csv error

* improved variable names

* removed unwanted comments

* added extension based inference for parquet

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Gregory Way <[email protected]>

* Update pycytominer/tests/test_cyto_utils/test_load.py

Co-authored-by: Gregory Way <[email protected]>

* edited pathlib imports, documentation fixed

* applied black formatting

* added typing

* updated tests

* update tests

* testing update

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Dave Bunten <[email protected]>

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Dave Bunten <[email protected]>

* added black formatting

* update pathing

* fixed docs

* black formatting

* tests update

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Gregory Way <[email protected]>

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Gregory Way <[email protected]>

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Gregory Way <[email protected]>

* test update

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Gregory Way <[email protected]>

* fixed typo

* added comments

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Dave Bunten <[email protected]>

* replaced `.absolute()` with `.resolve()`

* applied black formatting

* removed try and except block

---------

Co-authored-by: Gregory Way <[email protected]>
Co-authored-by: Dave Bunten <[email protected]>
3 people authored Mar 23, 2023
1 parent b2c6cc4 commit 9340ff3
Showing 2 changed files with 133 additions and 20 deletions.
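For orientation before the diff: load_profiles now dispatches on the file extension. Paths ending in `.parquet` are read with pd.read_parquet using the pyarrow engine, while all other paths fall through to the existing delimiter-sniffing CSV route. A minimal usage sketch (the profiles.parquet file name is hypothetical, and pyarrow must be installed):

import pandas as pd
from pycytominer.cyto_utils.load import load_profiles

# write a small profile table to parquet (requires pyarrow)
df = pd.DataFrame({"Metadata_Well": ["A01", "A02"], "x": [1, 3], "y": [5, 3]})
df.to_parquet("profiles.parquet", engine="pyarrow")

# load_profiles detects the .parquet extension and uses pd.read_parquet;
# other paths still go through infer_delim + pd.read_csv
profiles = load_profiles("profiles.parquet")
pd.testing.assert_frame_equal(profiles, df)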
69 changes: 64 additions & 5 deletions pycytominer/cyto_utils/load.py
@@ -1,10 +1,52 @@
 import csv
 import gzip
 import pathlib
 import numpy as np
 import pandas as pd
+from typing import Union


-def infer_delim(file):
+def is_path_a_parquet_file(file: Union[str, pathlib.Path]) -> bool:
+    """Check whether the provided file path points to a parquet file.
+
+    Parquet files are identified by inspecting the file extension.
+    If the file does not end with `.parquet`, this returns False, else True.
+
+    Parameters
+    ----------
+    file : Union[str, pathlib.Path]
+        path to the file to check
+
+    Returns
+    -------
+    bool
+        True if the file path ends with `.parquet`, else False
+
+    Raises
+    ------
+    TypeError
+        Raised if a non-str, non-pathlib.Path object is passed in the `file` parameter
+    FileNotFoundError
+        Raised if the path provided in `file` does not exist
+    """
+    # type checking
+    if not isinstance(file, (str, pathlib.Path)):
+        raise TypeError(f"file must be a str or pathlib.Path, not {type(file)}")
+
+    # convert str to a pathlib.Path object and an absolute path;
+    # resolve(strict=True) also checks that the file exists
+    if isinstance(file, str):
+        file = pathlib.Path(file).resolve(strict=True)
+
+    # check whether the file extension is `.parquet`
+    if file.suffix.lower() == ".parquet":
+        return True
+
+    return False
+
+
+def infer_delim(file: str):
     """
     Sniff the delimiter in the given file
@@ -41,13 +83,23 @@ def load_profiles(profiles):
     Return
     ------
     pandas DataFrame of profiles

+    Raises
+    ------
+    FileNotFoundError
+        Raised if the provided profile file does not exist
     """
     if not isinstance(profiles, pd.DataFrame):
+        if is_path_a_parquet_file(profiles):
+            return pd.read_parquet(profiles, engine="pyarrow")
+
         try:
             delim = infer_delim(profiles)
             profiles = pd.read_csv(profiles, sep=delim)
         except FileNotFoundError:
             raise FileNotFoundError(f"{profiles} profile file not found")

     return profiles


@@ -124,9 +176,12 @@ def load_npz_features(npz_file, fallback_feature_prefix="DP", metadata=True):
     # Load metadata
     if "metadata" in files:
         metadata = npz["metadata"].item()
-        metadata_df = pd.DataFrame(metadata, index=range(0, df.shape[0]), dtype=str)
+        metadata_df = pd.DataFrame(
+            metadata, index=range(0, df.shape[0]), dtype=str
+        )
         metadata_df.columns = [
-            f"Metadata_{x}" if not x.startswith("Metadata_") else x for x in metadata_df
+            f"Metadata_{x}" if not x.startswith("Metadata_") else x
+            for x in metadata_df
         ]

     # Determine the appropriate metadata prefix
@@ -145,12 +200,16 @@ def load_npz_features(npz_file, fallback_feature_prefix="DP", metadata=True):

     # Append metadata with features
     if "metadata" in files:
-        df = metadata_df.merge(df, how="outer", left_index=True, right_index=True)
+        df = metadata_df.merge(
+            df, how="outer", left_index=True, right_index=True
+        )

     return df


-def load_npz_locations(npz_file, location_x_col_index=0, location_y_col_index=1):
+def load_npz_locations(
+    npz_file, location_x_col_index=0, location_y_col_index=1
+):
     """
     Load an npz file storing locations and, sometimes, metadata.
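A note on dependencies: the new parquet branch in load_profiles reads with engine="pyarrow", so pyarrow must be available at runtime. A minimal availability check (a sketch, not part of this PR) could look like:

import importlib.util

# fail early with a clear message if the optional pyarrow dependency is missing
if importlib.util.find_spec("pyarrow") is None:
    raise ImportError("reading .parquet profiles requires the pyarrow package")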
84 changes: 69 additions & 15 deletions pycytominer/tests/test_cyto_utils/test_load.py
@@ -10,7 +10,7 @@
     load_npz_features,
     load_npz_locations,
 )
-from pycytominer.cyto_utils.load import infer_delim
+from pycytominer.cyto_utils.load import infer_delim, is_path_a_parquet_file

 random.seed(123)

@@ -20,13 +20,17 @@
 # Set file paths for data-to-be-loaded
 output_data_file = os.path.join(tmpdir, "test_data.csv")
 output_data_comma_file = os.path.join(tmpdir, "test_data_comma.csv")
+output_data_parquet = os.path.join(tmpdir, "test_parquet.parquet")
 output_data_gzip_file = "{}.gz".format(output_data_file)
 output_platemap_file = os.path.join(tmpdir, "test_platemap.csv")
 output_platemap_comma_file = os.path.join(tmpdir, "test_platemap_comma.csv")
 output_platemap_file_gzip = "{}.gz".format(output_platemap_file)
 output_npz_file = os.path.join(tmpdir, "test_npz.npz")
 output_npz_with_model_file = os.path.join(tmpdir, "test_npz_withmodel.npz")
-output_npz_without_metadata_file = os.path.join(tmpdir, "test_npz_withoutmetadata.npz")
+output_npz_without_metadata_file = os.path.join(
+    tmpdir, "test_npz_withoutmetadata.npz"
+)


 # Example .npz file with real data
 example_npz_file = os.path.join(
@@ -54,10 +58,18 @@
 data_df = pd.concat(
     [
         pd.DataFrame(
-            {"Metadata_Well": ["A01", "A02", "A03"], "x": [1, 3, 8], "y": [5, 3, 1]}
+            {
+                "Metadata_Well": ["A01", "A02", "A03"],
+                "x": [1, 3, 8],
+                "y": [5, 3, 1],
+            }
         ),
         pd.DataFrame(
-            {"Metadata_Well": ["B01", "B02", "B03"], "x": [1, 3, 5], "y": [8, 3, 1]}
+            {
+                "Metadata_Well": ["B01", "B02", "B03"],
+                "x": [1, 3, 5],
+                "y": [8, 3, 1],
+            }
         ),
     ]
 ).reset_index(drop=True)
@@ -76,19 +88,29 @@
 # Write to temp files
 data_df.to_csv(output_data_file, sep="\t", index=False)
 data_df.to_csv(output_data_comma_file, sep=",", index=False)
-data_df.to_csv(output_data_gzip_file, sep="\t", index=False, compression="gzip")
+data_df.to_csv(
+    output_data_gzip_file, sep="\t", index=False, compression="gzip"
+)
+data_df.to_parquet(output_data_parquet, engine="pyarrow")

 platemap_df.to_csv(output_platemap_file, sep="\t", index=False)
 platemap_df.to_csv(output_platemap_comma_file, sep=",", index=False)
-platemap_df.to_csv(output_platemap_file_gzip, sep="\t", index=False, compression="gzip")
+platemap_df.to_csv(
+    output_platemap_file_gzip, sep="\t", index=False, compression="gzip"
+)

 # Write npz temp files
 key_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}
 npz_metadata_dict.update(npz_model_key)
-key_with_model_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}
+key_with_model_values = {
+    k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()
+}

 np.savez_compressed(output_npz_file, features=npz_feats, metadata=key_values)
 np.savez_compressed(
-    output_npz_with_model_file, features=npz_feats, metadata=key_with_model_values
+    output_npz_with_model_file,
+    features=npz_feats,
+    metadata=key_with_model_values,
 )
 np.savez_compressed(output_npz_without_metadata_file, features=npz_feats)

@@ -105,7 +127,6 @@ def test_infer_delim():


 def test_load_profiles():
-
     profiles = load_profiles(output_data_file)
     pd.testing.assert_frame_equal(data_df, profiles)

@@ -120,7 +141,6 @@ def test_load_profiles():


 def test_load_platemap():
-
     platemap = load_platemap(output_platemap_file, add_metadata_id=False)
     pd.testing.assert_frame_equal(platemap, platemap_df)

@@ -130,7 +150,9 @@ def test_load_platemap():
     platemap = load_platemap(output_platemap_file_gzip, add_metadata_id=False)
     pd.testing.assert_frame_equal(platemap, platemap_df)

-    platemap_with_annotation = load_platemap(output_platemap_file, add_metadata_id=True)
+    platemap_with_annotation = load_platemap(
+        output_platemap_file, add_metadata_id=True
+    )
     platemap_df.columns = [f"Metadata_{x}" for x in platemap_df.columns]
     pd.testing.assert_frame_equal(platemap_with_annotation, platemap_df)

@@ -151,7 +173,10 @@ def test_load_npz():
     assert npz_df.columns.tolist() == core_cols + ["DP_0", "DP_1"]

     assert npz_custom_prefix_df.shape == (6, 5)
-    assert npz_custom_prefix_df.columns.tolist() == core_cols + ["test_0", "test_1"]
+    assert npz_custom_prefix_df.columns.tolist() == core_cols + [
+        "test_0",
+        "test_1",
+    ]

     assert npz_with_model_df.shape == (6, 6)
     assert npz_with_model_df.columns.tolist() == core_cols + [
@@ -169,7 +194,9 @@

     # Check real data
     assert real_data_df.shape == (206, 54)
-    assert all([x in real_data_df.columns for x in core_cols + ["Metadata_Model"]])
+    assert all(
+        [x in real_data_df.columns for x in core_cols + ["Metadata_Model"]]
+    )
     assert len(real_data_df.Metadata_Model.unique()) == 1
     assert real_data_df.Metadata_Model.unique()[0] == "cnn"
     assert real_data_df.drop(
@@ -188,11 +215,38 @@ def test_load_npz():
         IndexError, match="OutOfBounds indexing via location_x_col_index"
     ):
         load_npz_locations(
-            example_npz_file_locations, location_x_col_index=2, location_y_col_index=1
+            example_npz_file_locations,
+            location_x_col_index=2,
+            location_y_col_index=1,
         )
     with pytest.raises(
         IndexError, match="OutOfBounds indexing via location_y_col_index"
     ):
         load_npz_locations(
-            example_npz_file_locations, location_x_col_index=0, location_y_col_index=2
+            example_npz_file_locations,
+            location_x_col_index=0,
+            location_y_col_index=2,
        )


+def test_is_path_a_parquet_file():
+    # check a parquet file path and a non-parquet file path
+    check_pass = is_path_a_parquet_file(output_data_parquet)
+    check_fail = is_path_a_parquet_file(output_data_file)
+
+    # check that the correct booleans are returned
+    assert check_pass is True
+    assert check_fail is False
+
+    # load a pandas DataFrame from the parquet file
+    parquet_df = pd.read_parquet(output_data_parquet)
+    parquet_profile_test = load_profiles(output_data_parquet)
+    pd.testing.assert_frame_equal(parquet_profile_test, parquet_df)
+
+    # load the csv file with the updated load_profiles()
+    csv_df = pd.read_csv(output_data_comma_file)
+    csv_profile_test = load_profiles(output_data_comma_file)
+    pd.testing.assert_frame_equal(csv_profile_test, csv_df)
+
+    # check that the same df is produced from the parquet and csv files
+    pd.testing.assert_frame_equal(parquet_profile_test, csv_profile_test)
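One behavior of is_path_a_parquet_file worth noting from the tests above: str inputs are resolved with strict=True, so the file must exist, while pathlib.Path inputs are only checked by extension. A quick sketch (the paths here are hypothetical and need not exist):

import pathlib

from pycytominer.cyto_utils.load import is_path_a_parquet_file

# Path objects skip the resolve(strict=True) existence check,
# so these calls work even though the files do not exist
assert is_path_a_parquet_file(pathlib.Path("profiles.parquet")) is True
assert is_path_a_parquet_file(pathlib.Path("profiles.csv")) is False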
