10
10
load_npz_features ,
11
11
load_npz_locations ,
12
12
)
13
- from pycytominer .cyto_utils .load import infer_delim
13
+ from pycytominer .cyto_utils .load import infer_delim , is_path_a_parquet_file
14
14
15
15
random .seed (123 )
16
16
20
20
# Set file paths for data-to-be-loaded
21
21
output_data_file = os .path .join (tmpdir , "test_data.csv" )
22
22
output_data_comma_file = os .path .join (tmpdir , "test_data_comma.csv" )
23
+ output_data_parquet = os .path .join (tmpdir , "test_parquet.parquet" )
23
24
output_data_gzip_file = "{}.gz" .format (output_data_file )
24
25
output_platemap_file = os .path .join (tmpdir , "test_platemap.csv" )
25
26
output_platemap_comma_file = os .path .join (tmpdir , "test_platemap_comma.csv" )
26
27
output_platemap_file_gzip = "{}.gz" .format (output_platemap_file )
27
28
output_npz_file = os .path .join (tmpdir , "test_npz.npz" )
28
29
output_npz_with_model_file = os .path .join (tmpdir , "test_npz_withmodel.npz" )
29
- output_npz_without_metadata_file = os .path .join (tmpdir , "test_npz_withoutmetadata.npz" )
30
+ output_npz_without_metadata_file = os .path .join (
31
+ tmpdir , "test_npz_withoutmetadata.npz"
32
+ )
33
+
30
34
31
35
# Example .npz file with real data
32
36
example_npz_file = os .path .join (
54
58
data_df = pd .concat (
55
59
[
56
60
pd .DataFrame (
57
- {"Metadata_Well" : ["A01" , "A02" , "A03" ], "x" : [1 , 3 , 8 ], "y" : [5 , 3 , 1 ]}
61
+ {
62
+ "Metadata_Well" : ["A01" , "A02" , "A03" ],
63
+ "x" : [1 , 3 , 8 ],
64
+ "y" : [5 , 3 , 1 ],
65
+ }
58
66
),
59
67
pd .DataFrame (
60
- {"Metadata_Well" : ["B01" , "B02" , "B03" ], "x" : [1 , 3 , 5 ], "y" : [8 , 3 , 1 ]}
68
+ {
69
+ "Metadata_Well" : ["B01" , "B02" , "B03" ],
70
+ "x" : [1 , 3 , 5 ],
71
+ "y" : [8 , 3 , 1 ],
72
+ }
61
73
),
62
74
]
63
75
).reset_index (drop = True )
76
88
# Write to temp files
77
89
data_df .to_csv (output_data_file , sep = "\t " , index = False )
78
90
data_df .to_csv (output_data_comma_file , sep = "," , index = False )
79
- data_df .to_csv (output_data_gzip_file , sep = "\t " , index = False , compression = "gzip" )
91
+ data_df .to_csv (
92
+ output_data_gzip_file , sep = "\t " , index = False , compression = "gzip"
93
+ )
94
+ data_df .to_parquet (output_data_parquet , engine = "pyarrow" )
95
+
80
96
platemap_df .to_csv (output_platemap_file , sep = "\t " , index = False )
81
97
platemap_df .to_csv (output_platemap_comma_file , sep = "," , index = False )
82
- platemap_df .to_csv (output_platemap_file_gzip , sep = "\t " , index = False , compression = "gzip" )
98
+ platemap_df .to_csv (
99
+ output_platemap_file_gzip , sep = "\t " , index = False , compression = "gzip"
100
+ )
83
101
84
102
# Write npz temp files
85
103
key_values = {k : npz_metadata_dict [k ] for k in npz_metadata_dict .keys ()}
86
104
npz_metadata_dict .update (npz_model_key )
87
- key_with_model_values = {k : npz_metadata_dict [k ] for k in npz_metadata_dict .keys ()}
105
+ key_with_model_values = {
106
+ k : npz_metadata_dict [k ] for k in npz_metadata_dict .keys ()
107
+ }
88
108
89
109
np .savez_compressed (output_npz_file , features = npz_feats , metadata = key_values )
90
110
np .savez_compressed (
91
- output_npz_with_model_file , features = npz_feats , metadata = key_with_model_values
111
+ output_npz_with_model_file ,
112
+ features = npz_feats ,
113
+ metadata = key_with_model_values ,
92
114
)
93
115
np .savez_compressed (output_npz_without_metadata_file , features = npz_feats )
94
116
@@ -105,7 +127,6 @@ def test_infer_delim():
105
127
106
128
107
129
def test_load_profiles ():
108
-
109
130
profiles = load_profiles (output_data_file )
110
131
pd .testing .assert_frame_equal (data_df , profiles )
111
132
@@ -120,7 +141,6 @@ def test_load_profiles():
120
141
121
142
122
143
def test_load_platemap ():
123
-
124
144
platemap = load_platemap (output_platemap_file , add_metadata_id = False )
125
145
pd .testing .assert_frame_equal (platemap , platemap_df )
126
146
@@ -130,7 +150,9 @@ def test_load_platemap():
130
150
platemap = load_platemap (output_platemap_file_gzip , add_metadata_id = False )
131
151
pd .testing .assert_frame_equal (platemap , platemap_df )
132
152
133
- platemap_with_annotation = load_platemap (output_platemap_file , add_metadata_id = True )
153
+ platemap_with_annotation = load_platemap (
154
+ output_platemap_file , add_metadata_id = True
155
+ )
134
156
platemap_df .columns = [f"Metadata_{ x } " for x in platemap_df .columns ]
135
157
pd .testing .assert_frame_equal (platemap_with_annotation , platemap_df )
136
158
@@ -151,7 +173,10 @@ def test_load_npz():
151
173
assert npz_df .columns .tolist () == core_cols + ["DP_0" , "DP_1" ]
152
174
153
175
assert npz_custom_prefix_df .shape == (6 , 5 )
154
- assert npz_custom_prefix_df .columns .tolist () == core_cols + ["test_0" , "test_1" ]
176
+ assert npz_custom_prefix_df .columns .tolist () == core_cols + [
177
+ "test_0" ,
178
+ "test_1" ,
179
+ ]
155
180
156
181
assert npz_with_model_df .shape == (6 , 6 )
157
182
assert npz_with_model_df .columns .tolist () == core_cols + [
@@ -169,7 +194,9 @@ def test_load_npz():
169
194
170
195
# Check real data
171
196
assert real_data_df .shape == (206 , 54 )
172
- assert all ([x in real_data_df .columns for x in core_cols + ["Metadata_Model" ]])
197
+ assert all (
198
+ [x in real_data_df .columns for x in core_cols + ["Metadata_Model" ]]
199
+ )
173
200
assert len (real_data_df .Metadata_Model .unique ()) == 1
174
201
assert real_data_df .Metadata_Model .unique ()[0 ] == "cnn"
175
202
assert real_data_df .drop (
@@ -188,11 +215,38 @@ def test_load_npz():
188
215
IndexError , match = "OutOfBounds indexing via location_x_col_index"
189
216
):
190
217
load_npz_locations (
191
- example_npz_file_locations , location_x_col_index = 2 , location_y_col_index = 1
218
+ example_npz_file_locations ,
219
+ location_x_col_index = 2 ,
220
+ location_y_col_index = 1 ,
192
221
)
193
222
with pytest .raises (
194
223
IndexError , match = "OutOfBounds indexing via location_y_col_index"
195
224
):
196
225
load_npz_locations (
197
- example_npz_file_locations , location_x_col_index = 0 , location_y_col_index = 2
226
+ example_npz_file_locations ,
227
+ location_x_col_index = 0 ,
228
+ location_y_col_index = 2 ,
198
229
)
230
+
231
+
232
def test_is_path_a_parquet_file():
    """Check parquet-path detection and that load_profiles reads parquet and csv identically.

    Fixes the original assertions, which were written as ``assert (x, True)``:
    asserting a non-empty tuple is always truthy, so those checks could never fail.
    """
    # checking parquet file detection (keyed off the file extension)
    assert is_path_a_parquet_file(output_data_parquet)
    assert not is_path_a_parquet_file(output_data_file)

    # loading in pandas dataframe from parquet file
    parquet_df = pd.read_parquet(output_data_parquet)
    parquet_profile_test = load_profiles(output_data_parquet)
    pd.testing.assert_frame_equal(parquet_profile_test, parquet_df)

    # loading csv file with load_profiles()
    csv_df = pd.read_csv(output_data_comma_file)
    csv_profile_test = load_profiles(output_data_comma_file)
    pd.testing.assert_frame_equal(csv_profile_test, csv_df)

    # checking if the same df is produced from parquet and csv files
    pd.testing.assert_frame_equal(parquet_profile_test, csv_profile_test)
0 commit comments