Implementing parquet loading in load_profiles function (#262)
* added new function `infer_profile_file_type`

* Fixed Unicode Bug

* fixed csv error

* improved variable names

* removed unwanted comments

* added extension based inference for parquet

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Gregory Way <[email protected]>

* Update pycytominer/tests/test_cyto_utils/test_load.py

Co-authored-by: Gregory Way <[email protected]>

* edited pathlib imports, documentation fixed

* applied black formatting

* added typing

* updated tests

* update tests

* testing update

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Dave Bunten <[email protected]>

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Dave Bunten <[email protected]>

* added black formatting

* update pathing

* fixed docs

* black formatting

* tests update

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Gregory Way <[email protected]>

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Gregory Way <[email protected]>

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Gregory Way <[email protected]>

* test update

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Gregory Way <[email protected]>

* fixed typo

* added comments

* Update pycytominer/cyto_utils/load.py

Co-authored-by: Dave Bunten <[email protected]>

* replaced `.absolute()` with `.resolve()`

* applied black formatting

* removed try and except block

---------

Co-authored-by: Gregory Way <[email protected]>
Co-authored-by: Dave Bunten <[email protected]>
3 people authored Mar 23, 2023
1 parent b2c6cc4 commit 9340ff3
Showing 2 changed files with 133 additions and 20 deletions.
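For orientation before the diff: load_profiles now dispatches on the file extension. Paths ending in `.parquet` are read with pd.read_parquet using the pyarrow engine, while all other paths fall through to the existing delimiter-sniffing CSV route. A minimal usage sketch (the profiles.parquet file name is hypothetical, and pyarrow must be installed):

import pandas as pd
from pycytominer.cyto_utils.load import load_profiles

# write a small profile table to parquet (requires pyarrow)
df = pd.DataFrame({"Metadata_Well": ["A01", "A02"], "x": [1, 3], "y": [5, 3]})
df.to_parquet("profiles.parquet", engine="pyarrow")

# load_profiles detects the .parquet extension and uses pd.read_parquet;
# other paths still go through infer_delim + pd.read_csv
profiles = load_profiles("profiles.parquet")
pd.testing.assert_frame_equal(profiles, df)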
69 changes: 64 additions & 5 deletions pycytominer/cyto_utils/load.py
@@ -1,10 +1,52 @@
 import csv
 import gzip
 import pathlib
 import numpy as np
 import pandas as pd
+from typing import Union


-def infer_delim(file):
+def is_path_a_parquet_file(file: Union[str, pathlib.Path]) -> bool:
+    """Check whether the provided file path points to a parquet file.
+
+    Parquet files are identified by inspecting the file extension.
+    If the file does not end with `.parquet`, this returns False, else True.
+
+    Parameters
+    ----------
+    file : Union[str, pathlib.Path]
+        path to the file to check
+
+    Returns
+    -------
+    bool
+        True if the file path ends with `.parquet`, else False
+
+    Raises
+    ------
+    TypeError
+        Raised if a non-str, non-pathlib.Path object is passed in the `file` parameter
+    FileNotFoundError
+        Raised if the path provided in `file` does not exist
+    """
+    # type checking
+    if not isinstance(file, (str, pathlib.Path)):
+        raise TypeError(f"file must be a str or pathlib.Path, not {type(file)}")
+
+    # convert str to a pathlib.Path object and an absolute path;
+    # resolve(strict=True) also checks that the file exists
+    if isinstance(file, str):
+        file = pathlib.Path(file).resolve(strict=True)
+
+    # check whether the file extension is `.parquet`
+    if file.suffix.lower() == ".parquet":
+        return True
+
+    return False
+
+
+def infer_delim(file: str):
     """
     Sniff the delimiter in the given file
@@ -41,13 +83,23 @@ def load_profiles(profiles):
     Return
     ------
     pandas DataFrame of profiles

+    Raises
+    ------
+    FileNotFoundError
+        Raised if the provided profile file does not exist
     """
     if not isinstance(profiles, pd.DataFrame):
+        if is_path_a_parquet_file(profiles):
+            return pd.read_parquet(profiles, engine="pyarrow")
+
         try:
             delim = infer_delim(profiles)
             profiles = pd.read_csv(profiles, sep=delim)
         except FileNotFoundError:
             raise FileNotFoundError(f"{profiles} profile file not found")

     return profiles


@@ -124,9 +176,12 @@ def load_npz_features(npz_file, fallback_feature_prefix="DP", metadata=True):
     # Load metadata
     if "metadata" in files:
         metadata = npz["metadata"].item()
-        metadata_df = pd.DataFrame(metadata, index=range(0, df.shape[0]), dtype=str)
+        metadata_df = pd.DataFrame(
+            metadata, index=range(0, df.shape[0]), dtype=str
+        )
         metadata_df.columns = [
-            f"Metadata_{x}" if not x.startswith("Metadata_") else x for x in metadata_df
+            f"Metadata_{x}" if not x.startswith("Metadata_") else x
+            for x in metadata_df
         ]

     # Determine the appropriate metadata prefix
@@ -145,12 +200,16 @@ def load_npz_features(npz_file, fallback_feature_prefix="DP", metadata=True):

     # Append metadata with features
     if "metadata" in files:
-        df = metadata_df.merge(df, how="outer", left_index=True, right_index=True)
+        df = metadata_df.merge(
+            df, how="outer", left_index=True, right_index=True
+        )

     return df


-def load_npz_locations(npz_file, location_x_col_index=0, location_y_col_index=1):
+def load_npz_locations(
+    npz_file, location_x_col_index=0, location_y_col_index=1
+):
     """
     Load an npz file storing locations and, sometimes, metadata.
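A note on dependencies: the new parquet branch in load_profiles reads with engine="pyarrow", so pyarrow must be available at runtime. A minimal availability check (a sketch, not part of this PR) could look like:

import importlib.util

# fail early with a clear message if the optional pyarrow dependency is missing
if importlib.util.find_spec("pyarrow") is None:
    raise ImportError("reading .parquet profiles requires the pyarrow package")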
84 changes: 69 additions & 15 deletions pycytominer/tests/test_cyto_utils/test_load.py
@@ -10,7 +10,7 @@
     load_npz_features,
     load_npz_locations,
 )
-from pycytominer.cyto_utils.load import infer_delim
+from pycytominer.cyto_utils.load import infer_delim, is_path_a_parquet_file

 random.seed(123)

@@ -20,13 +20,17 @@
 # Set file paths for data-to-be-loaded
 output_data_file = os.path.join(tmpdir, "test_data.csv")
 output_data_comma_file = os.path.join(tmpdir, "test_data_comma.csv")
+output_data_parquet = os.path.join(tmpdir, "test_parquet.parquet")
 output_data_gzip_file = "{}.gz".format(output_data_file)
 output_platemap_file = os.path.join(tmpdir, "test_platemap.csv")
 output_platemap_comma_file = os.path.join(tmpdir, "test_platemap_comma.csv")
 output_platemap_file_gzip = "{}.gz".format(output_platemap_file)
 output_npz_file = os.path.join(tmpdir, "test_npz.npz")
 output_npz_with_model_file = os.path.join(tmpdir, "test_npz_withmodel.npz")
-output_npz_without_metadata_file = os.path.join(tmpdir, "test_npz_withoutmetadata.npz")
+output_npz_without_metadata_file = os.path.join(
+    tmpdir, "test_npz_withoutmetadata.npz"
+)


 # Example .npz file with real data
 example_npz_file = os.path.join(
@@ -54,10 +58,18 @@
 data_df = pd.concat(
     [
         pd.DataFrame(
-            {"Metadata_Well": ["A01", "A02", "A03"], "x": [1, 3, 8], "y": [5, 3, 1]}
+            {
+                "Metadata_Well": ["A01", "A02", "A03"],
+                "x": [1, 3, 8],
+                "y": [5, 3, 1],
+            }
         ),
         pd.DataFrame(
-            {"Metadata_Well": ["B01", "B02", "B03"], "x": [1, 3, 5], "y": [8, 3, 1]}
+            {
+                "Metadata_Well": ["B01", "B02", "B03"],
+                "x": [1, 3, 5],
+                "y": [8, 3, 1],
+            }
         ),
     ]
 ).reset_index(drop=True)
@@ -76,19 +88,29 @@
 # Write to temp files
 data_df.to_csv(output_data_file, sep="\t", index=False)
 data_df.to_csv(output_data_comma_file, sep=",", index=False)
-data_df.to_csv(output_data_gzip_file, sep="\t", index=False, compression="gzip")
+data_df.to_csv(
+    output_data_gzip_file, sep="\t", index=False, compression="gzip"
+)
+data_df.to_parquet(output_data_parquet, engine="pyarrow")

 platemap_df.to_csv(output_platemap_file, sep="\t", index=False)
 platemap_df.to_csv(output_platemap_comma_file, sep=",", index=False)
-platemap_df.to_csv(output_platemap_file_gzip, sep="\t", index=False, compression="gzip")
+platemap_df.to_csv(
+    output_platemap_file_gzip, sep="\t", index=False, compression="gzip"
+)

 # Write npz temp files
 key_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}
 npz_metadata_dict.update(npz_model_key)
-key_with_model_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}
+key_with_model_values = {
+    k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()
+}

 np.savez_compressed(output_npz_file, features=npz_feats, metadata=key_values)
 np.savez_compressed(
-    output_npz_with_model_file, features=npz_feats, metadata=key_with_model_values
+    output_npz_with_model_file,
+    features=npz_feats,
+    metadata=key_with_model_values,
 )
 np.savez_compressed(output_npz_without_metadata_file, features=npz_feats)

@@ -105,7 +127,6 @@ def test_infer_delim():


 def test_load_profiles():
-
     profiles = load_profiles(output_data_file)
     pd.testing.assert_frame_equal(data_df, profiles)

@@ -120,7 +141,6 @@ def test_load_profiles():


 def test_load_platemap():
-
     platemap = load_platemap(output_platemap_file, add_metadata_id=False)
     pd.testing.assert_frame_equal(platemap, platemap_df)

@@ -130,7 +150,9 @@ def test_load_platemap():
     platemap = load_platemap(output_platemap_file_gzip, add_metadata_id=False)
     pd.testing.assert_frame_equal(platemap, platemap_df)

-    platemap_with_annotation = load_platemap(output_platemap_file, add_metadata_id=True)
+    platemap_with_annotation = load_platemap(
+        output_platemap_file, add_metadata_id=True
+    )
     platemap_df.columns = [f"Metadata_{x}" for x in platemap_df.columns]
     pd.testing.assert_frame_equal(platemap_with_annotation, platemap_df)

@@ -151,7 +173,10 @@ def test_load_npz():
     assert npz_df.columns.tolist() == core_cols + ["DP_0", "DP_1"]

     assert npz_custom_prefix_df.shape == (6, 5)
-    assert npz_custom_prefix_df.columns.tolist() == core_cols + ["test_0", "test_1"]
+    assert npz_custom_prefix_df.columns.tolist() == core_cols + [
+        "test_0",
+        "test_1",
+    ]

     assert npz_with_model_df.shape == (6, 6)
     assert npz_with_model_df.columns.tolist() == core_cols + [
@@ -169,7 +194,9 @@

     # Check real data
     assert real_data_df.shape == (206, 54)
-    assert all([x in real_data_df.columns for x in core_cols + ["Metadata_Model"]])
+    assert all(
+        [x in real_data_df.columns for x in core_cols + ["Metadata_Model"]]
+    )
     assert len(real_data_df.Metadata_Model.unique()) == 1
     assert real_data_df.Metadata_Model.unique()[0] == "cnn"
     assert real_data_df.drop(
@@ -188,11 +215,38 @@ def test_load_npz():
         IndexError, match="OutOfBounds indexing via location_x_col_index"
     ):
         load_npz_locations(
-            example_npz_file_locations, location_x_col_index=2, location_y_col_index=1
+            example_npz_file_locations,
+            location_x_col_index=2,
+            location_y_col_index=1,
         )
     with pytest.raises(
         IndexError, match="OutOfBounds indexing via location_y_col_index"
     ):
         load_npz_locations(
-            example_npz_file_locations, location_x_col_index=0, location_y_col_index=2
+            example_npz_file_locations,
+            location_x_col_index=0,
+            location_y_col_index=2,
        )


+def test_is_path_a_parquet_file():
+    # check a parquet file path and a non-parquet file path
+    check_pass = is_path_a_parquet_file(output_data_parquet)
+    check_fail = is_path_a_parquet_file(output_data_file)
+
+    # check that the correct booleans are returned
+    assert check_pass is True
+    assert check_fail is False
+
+    # load a pandas DataFrame from the parquet file
+    parquet_df = pd.read_parquet(output_data_parquet)
+    parquet_profile_test = load_profiles(output_data_parquet)
+    pd.testing.assert_frame_equal(parquet_profile_test, parquet_df)
+
+    # load the csv file with the updated load_profiles()
+    csv_df = pd.read_csv(output_data_comma_file)
+    csv_profile_test = load_profiles(output_data_comma_file)
+    pd.testing.assert_frame_equal(csv_profile_test, csv_df)
+
+    # check that the same df is produced from the parquet and csv files
+    pd.testing.assert_frame_equal(parquet_profile_test, csv_profile_test)
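One behavior of is_path_a_parquet_file worth noting from the tests above: str inputs are resolved with strict=True, so the file must exist, while pathlib.Path inputs are only checked by extension. A quick sketch (the paths here are hypothetical and need not exist):

import pathlib

from pycytominer.cyto_utils.load import is_path_a_parquet_file

# Path objects skip the resolve(strict=True) existence check,
# so these calls work even though the files do not exist
assert is_path_a_parquet_file(pathlib.Path("profiles.parquet")) is True
assert is_path_a_parquet_file(pathlib.Path("profiles.csv")) is False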
