
Commit 9340ff3

axiomcura, gwaybio, and d33bs authored
Implementing parquet loading in load_profiles function (#262)

* added new function `infer_profile_file_type`
* Fixed Unicode Bug
* fixed csv error
* improved variable names
* removed unwanted comments
* added extension based inference for parquet
* Update pycytominer/cyto_utils/load.py (Co-authored-by: Gregory Way <[email protected]>)
* Update pycytominer/tests/test_cyto_utils/test_load.py (Co-authored-by: Gregory Way <[email protected]>)
* edited pathlib imports, documentation fixed
* applied black formatting
* added typing
* updated tests
* update tests
* testing update
* Update pycytominer/cyto_utils/load.py (Co-authored-by: Dave Bunten <[email protected]>)
* Update pycytominer/cyto_utils/load.py (Co-authored-by: Dave Bunten <[email protected]>)
* added black formatting
* update pathing
* fixed docs
* black formatting
* tests update
* Update pycytominer/cyto_utils/load.py (Co-authored-by: Gregory Way <[email protected]>)
* Update pycytominer/cyto_utils/load.py (Co-authored-by: Gregory Way <[email protected]>)
* Update pycytominer/cyto_utils/load.py (Co-authored-by: Gregory Way <[email protected]>)
* test update
* Update pycytominer/cyto_utils/load.py (Co-authored-by: Gregory Way <[email protected]>)
* fixed typo
* added comments
* Update pycytominer/cyto_utils/load.py (Co-authored-by: Dave Bunten <[email protected]>)
* replaced `.absolute()` with `.resolve()`
* applied black formatting
* removed try and except block

---------

Co-authored-by: Gregory Way <[email protected]>
Co-authored-by: Dave Bunten <[email protected]>
1 parent b2c6cc4 commit 9340ff3
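
The net effect of this commit: `load_profiles` now checks the file extension before reading, routing `.parquet` paths through `pd.read_parquet` with the pyarrow engine and everything else through the existing `infer_delim`/`pd.read_csv` path. A minimal sketch of the new behavior (the file name and DataFrame contents are illustrative, and pyarrow must be installed):

import pandas as pd
from pycytominer.cyto_utils.load import load_profiles

# Write a small example profile to parquet (illustrative path and contents)
df = pd.DataFrame({"Metadata_Well": ["A01", "A02"], "x": [1, 3], "y": [5, 3]})
df.to_parquet("profiles.parquet", engine="pyarrow")

# load_profiles dispatches on the .parquet suffix and returns the same frame
profiles = load_profiles("profiles.parquet")
pd.testing.assert_frame_equal(profiles, df)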

2 files changed: +133 -20 lines changed


pycytominer/cyto_utils/load.py

Lines changed: 64 additions & 5 deletions
@@ -1,10 +1,52 @@
 import csv
 import gzip
+import pathlib
 import numpy as np
 import pandas as pd
+from typing import Union
 
 
-def infer_delim(file):
+def is_path_a_parquet_file(file: Union[str, pathlib.Path]) -> bool:
+    """Checks if the provided file path is a parquet file.
+
+    Identify parquet files by inspecting the file extensions.
+    If the file does not end with `parquet`, this will return False, else True.
+
+    Parameters
+    ----------
+    file : Union[str, pathlib.Path]
+        path to parquet file
+
+    Returns
+    -------
+    bool
+        Returns True if the file path contains `.parquet`, else it will return
+        False
+
+    Raises
+    ------
+    TypeError
+        Raised if a non str or non-path object is passed in the `file` parameter
+    FileNotFoundError
+        Raised if the provided path in the `file` does not exist
+    """
+    # type checking
+    if not isinstance(file, (str, pathlib.Path)):
+        raise TypeError(f"file must be a str or pathlib.Path not {type(file)}")
+
+    # Convert str to pathlib.Path object and absolute path
+    # check if the file also exists while converting to absolute path
+    if isinstance(file, str):
+        file = pathlib.Path(file).resolve(strict=True)
+
+    # Check if file path is a parquet file
+    if file.suffix.lower() == ".parquet":
+        return True
+
+    return False
+
+
+def infer_delim(file: str):
     """
     Sniff the delimiter in the given file
 
@@ -41,13 +83,23 @@ def load_profiles(profiles):
     Return
     ------
     pandas DataFrame of profiles
+
+    Raises:
+    -------
+    FileNotFoundError
+        Raised if the provided profile does not exists
+
     """
     if not isinstance(profiles, pd.DataFrame):
+        if is_path_a_parquet_file(profiles):
+            return pd.read_parquet(profiles, engine="pyarrow")
+
         try:
             delim = infer_delim(profiles)
             profiles = pd.read_csv(profiles, sep=delim)
         except FileNotFoundError:
             raise FileNotFoundError(f"{profiles} profile file not found")
+
     return profiles
 
 
@@ -124,9 +176,12 @@ def load_npz_features(npz_file, fallback_feature_prefix="DP", metadata=True):
     # Load metadata
     if "metadata" in files:
         metadata = npz["metadata"].item()
-        metadata_df = pd.DataFrame(metadata, index=range(0, df.shape[0]), dtype=str)
+        metadata_df = pd.DataFrame(
+            metadata, index=range(0, df.shape[0]), dtype=str
+        )
         metadata_df.columns = [
-            f"Metadata_{x}" if not x.startswith("Metadata_") else x for x in metadata_df
+            f"Metadata_{x}" if not x.startswith("Metadata_") else x
+            for x in metadata_df
         ]
 
     # Determine the appropriate metadata prefix
@@ -145,12 +200,16 @@ def load_npz_features(npz_file, fallback_feature_prefix="DP", metadata=True):
 
     # Append metadata with features
     if "metadata" in files:
-        df = metadata_df.merge(df, how="outer", left_index=True, right_index=True)
+        df = metadata_df.merge(
+            df, how="outer", left_index=True, right_index=True
+        )
 
     return df
 
 
-def load_npz_locations(npz_file, location_x_col_index=0, location_y_col_index=1):
+def load_npz_locations(
+    npz_file, location_x_col_index=0, location_y_col_index=1
+):
     """
     Load an npz file storing locations and, sometimes, metadata.
 
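One subtlety of `is_path_a_parquet_file` as merged: only `str` inputs are resolved with `strict=True`, so a missing file passed as a string raises `FileNotFoundError`, while a `pathlib.Path` input is judged on its suffix alone, and the `.lower()` call makes the match case-insensitive. A short sketch of that behavior (the example paths are hypothetical and assumed not to exist):

import pathlib
import pytest
from pycytominer.cyto_utils.load import is_path_a_parquet_file

# Path inputs are not resolved, so only the suffix matters (case-insensitive)
assert is_path_a_parquet_file(pathlib.Path("data.PARQUET"))
assert not is_path_a_parquet_file(pathlib.Path("data.csv"))

# str inputs go through resolve(strict=True), so missing files raise
with pytest.raises(FileNotFoundError):
    is_path_a_parquet_file("missing/profiles.parquet")

# anything other than str or Path raises TypeError
with pytest.raises(TypeError):
    is_path_a_parquet_file(123)
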
pycytominer/tests/test_cyto_utils/test_load.py

Lines changed: 69 additions & 15 deletions
@@ -10,7 +10,7 @@
     load_npz_features,
     load_npz_locations,
 )
-from pycytominer.cyto_utils.load import infer_delim
+from pycytominer.cyto_utils.load import infer_delim, is_path_a_parquet_file
 
 random.seed(123)
 
@@ -20,13 +20,17 @@
 # Set file paths for data-to-be-loaded
 output_data_file = os.path.join(tmpdir, "test_data.csv")
 output_data_comma_file = os.path.join(tmpdir, "test_data_comma.csv")
+output_data_parquet = os.path.join(tmpdir, "test_parquet.parquet")
 output_data_gzip_file = "{}.gz".format(output_data_file)
 output_platemap_file = os.path.join(tmpdir, "test_platemap.csv")
 output_platemap_comma_file = os.path.join(tmpdir, "test_platemap_comma.csv")
 output_platemap_file_gzip = "{}.gz".format(output_platemap_file)
 output_npz_file = os.path.join(tmpdir, "test_npz.npz")
 output_npz_with_model_file = os.path.join(tmpdir, "test_npz_withmodel.npz")
-output_npz_without_metadata_file = os.path.join(tmpdir, "test_npz_withoutmetadata.npz")
+output_npz_without_metadata_file = os.path.join(
+    tmpdir, "test_npz_withoutmetadata.npz"
+)
+
 
 # Example .npz file with real data
 example_npz_file = os.path.join(
@@ -54,10 +58,18 @@
 data_df = pd.concat(
     [
         pd.DataFrame(
-            {"Metadata_Well": ["A01", "A02", "A03"], "x": [1, 3, 8], "y": [5, 3, 1]}
+            {
+                "Metadata_Well": ["A01", "A02", "A03"],
+                "x": [1, 3, 8],
+                "y": [5, 3, 1],
+            }
         ),
         pd.DataFrame(
-            {"Metadata_Well": ["B01", "B02", "B03"], "x": [1, 3, 5], "y": [8, 3, 1]}
+            {
+                "Metadata_Well": ["B01", "B02", "B03"],
+                "x": [1, 3, 5],
+                "y": [8, 3, 1],
+            }
         ),
     ]
 ).reset_index(drop=True)
@@ -76,19 +88,29 @@
 # Write to temp files
 data_df.to_csv(output_data_file, sep="\t", index=False)
 data_df.to_csv(output_data_comma_file, sep=",", index=False)
-data_df.to_csv(output_data_gzip_file, sep="\t", index=False, compression="gzip")
+data_df.to_csv(
+    output_data_gzip_file, sep="\t", index=False, compression="gzip"
+)
+data_df.to_parquet(output_data_parquet, engine="pyarrow")
+
 platemap_df.to_csv(output_platemap_file, sep="\t", index=False)
 platemap_df.to_csv(output_platemap_comma_file, sep=",", index=False)
-platemap_df.to_csv(output_platemap_file_gzip, sep="\t", index=False, compression="gzip")
+platemap_df.to_csv(
+    output_platemap_file_gzip, sep="\t", index=False, compression="gzip"
+)
 
 # Write npz temp files
 key_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}
 npz_metadata_dict.update(npz_model_key)
-key_with_model_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}
+key_with_model_values = {
+    k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()
+}
 
 np.savez_compressed(output_npz_file, features=npz_feats, metadata=key_values)
 np.savez_compressed(
-    output_npz_with_model_file, features=npz_feats, metadata=key_with_model_values
+    output_npz_with_model_file,
+    features=npz_feats,
+    metadata=key_with_model_values,
 )
 np.savez_compressed(output_npz_without_metadata_file, features=npz_feats)
 
@@ -105,7 +127,6 @@ def test_infer_delim():
 
 
 def test_load_profiles():
-
     profiles = load_profiles(output_data_file)
     pd.testing.assert_frame_equal(data_df, profiles)
 
@@ -120,7 +141,6 @@ def test_load_profiles():
 
 
 def test_load_platemap():
-
     platemap = load_platemap(output_platemap_file, add_metadata_id=False)
     pd.testing.assert_frame_equal(platemap, platemap_df)
 
@@ -130,7 +150,9 @@ def test_load_platemap():
     platemap = load_platemap(output_platemap_file_gzip, add_metadata_id=False)
     pd.testing.assert_frame_equal(platemap, platemap_df)
 
-    platemap_with_annotation = load_platemap(output_platemap_file, add_metadata_id=True)
+    platemap_with_annotation = load_platemap(
+        output_platemap_file, add_metadata_id=True
+    )
     platemap_df.columns = [f"Metadata_{x}" for x in platemap_df.columns]
     pd.testing.assert_frame_equal(platemap_with_annotation, platemap_df)
 
@@ -151,7 +173,10 @@ def test_load_npz():
     assert npz_df.columns.tolist() == core_cols + ["DP_0", "DP_1"]
 
     assert npz_custom_prefix_df.shape == (6, 5)
-    assert npz_custom_prefix_df.columns.tolist() == core_cols + ["test_0", "test_1"]
+    assert npz_custom_prefix_df.columns.tolist() == core_cols + [
+        "test_0",
+        "test_1",
+    ]
 
     assert npz_with_model_df.shape == (6, 6)
     assert npz_with_model_df.columns.tolist() == core_cols + [
@@ -169,7 +194,9 @@ def test_load_npz():
 
     # Check real data
     assert real_data_df.shape == (206, 54)
-    assert all([x in real_data_df.columns for x in core_cols + ["Metadata_Model"]])
+    assert all(
+        [x in real_data_df.columns for x in core_cols + ["Metadata_Model"]]
+    )
     assert len(real_data_df.Metadata_Model.unique()) == 1
     assert real_data_df.Metadata_Model.unique()[0] == "cnn"
     assert real_data_df.drop(
@@ -188,11 +215,38 @@ def test_load_npz():
         IndexError, match="OutOfBounds indexing via location_x_col_index"
     ):
         load_npz_locations(
-            example_npz_file_locations, location_x_col_index=2, location_y_col_index=1
+            example_npz_file_locations,
+            location_x_col_index=2,
+            location_y_col_index=1,
         )
     with pytest.raises(
         IndexError, match="OutOfBounds indexing via location_y_col_index"
     ):
         load_npz_locations(
-            example_npz_file_locations, location_x_col_index=0, location_y_col_index=2
+            example_npz_file_locations,
+            location_x_col_index=0,
+            location_y_col_index=2,
         )
+
+
+def test_is_path_a_parquet_file():
+    # checking parquet file
+    check_pass = is_path_a_parquet_file(output_data_parquet)
+    check_fail = is_path_a_parquet_file(output_data_file)
+
+    # checking if the correct booleans are returned
+    assert (check_pass, True)
+    assert (check_fail, False)
+
+    # loading in pandas dataframe from parquet file
+    parquet_df = pd.read_parquet(output_data_parquet)
+    parquet_profile_test = load_profiles(output_data_parquet)
+    pd.testing.assert_frame_equal(parquet_profile_test, parquet_df)
+
+    # loading csv file with new load_profile()
+    csv_df = pd.read_csv(output_data_comma_file)
+    csv_profile_test = load_profiles(output_data_comma_file)
+    pd.testing.assert_frame_equal(csv_profile_test, csv_df)
+
+    # checking if the same df is produced from parquet and csv files
+    pd.testing.assert_frame_equal(parquet_profile_test, csv_profile_test)
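
A caveat in the new test: `assert (check_pass, True)` and `assert (check_fail, False)` assert on two-element tuples, which are always truthy, so these two checks pass regardless of what the function returned. A stricter form (a suggestion, not part of this commit) would assert on the booleans directly:

# suggested stricter asserts (not part of this commit)
assert is_path_a_parquet_file(output_data_parquet) is True
assert is_path_a_parquet_file(output_data_file) is False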
