from typing import Any, Dict, List, Optional, Tuple

import numpy as np
+ import polars
+ import pandas

from fog_x.database import (
    DatabaseConnector,
@@ -36,6 +38,22 @@ def __init__(
        step_data_connector: DatabaseConnector = None,
        storage: Optional[str] = None,
    ) -> None:
+ """
42
+
43
+ Args:
44
+ name (str): Name of this dataset. Used as the directory name when exporting.
45
+ path (str): Required. Local path of where this dataset should be stored.
46
+ features (optional Dict[str, FeatureType]): Description of `param1`.
47
+ enable_feature_inference (bool): enable inferring additional FeatureTypes
48
+
49
+ Example:
50
+ ```
51
+ >>> dataset = fog_x.Dataset('my_dataset', path='~/fog_x/my_dataset`)
52
+ ```
53
+
54
+ TODO:
55
+ * is replace_existing actually used anywhere?
56
+ """
        self.name = name
        path = os.path.expanduser(path)
        self.path = path
@@ -55,23 +73,24 @@ def __init__(
            if not os.path.exists(f"{path}/{name}"):
                os.makedirs(f"{path}/{name}")
            step_data_connector = LazyFrameConnector(f"{path}/{name}")
-        self.db_manager = DatabaseManager(
-            episode_info_connector, step_data_connector
-        )
+        self.db_manager = DatabaseManager(episode_info_connector, step_data_connector)
        self.db_manager.initialize_dataset(self.name, features)

        self.storage = storage
        self.obs_keys = []
        self.act_keys = []
        self.step_keys = []

-    def new_episode(
-        self, metadata: Optional[Dict[str, Any]] = None
-    ) -> Episode:
+    def new_episode(self, metadata: Optional[Dict[str, Any]] = None) -> Episode:
        """
        Create a new episode / trajectory.
-        TODO #1: support multiple processes writing to the same episode
-        TODO #2: close the previous episode if not closed
+
+        Returns:
+            Episode
+
+        TODO:
+            * support multiple processes writing to the same episode
+            * close the previous episode if not closed
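+
+        Example (illustrative sketch; `dataset` is a `fog_x.Dataset` and `"arm_action"` is a hypothetical feature name):
+        ```
+        >>> episode = dataset.new_episode()
+        >>> episode.add(feature="arm_action", value=np.array([0.1, 0.2]))
+        >>> episode.close()
+        ```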
        """
        return Episode(
            metadata=metadata,
@@ -113,6 +132,10 @@ def export(
    ) -> None:
        """
        Export the dataset.
+
+        Args:
+            export_path (optional str): location of exported data. Uses `dataset.path/export` by default.
+            format (str): Supported formats are `rtx`, `open-x`, and `rlds`.
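+
+        Example (illustrative; relies on the default `export_path`):
+        ```
+        >>> dataset.export(format="rtx")
+        ```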
        """
        if format == "rtx" or format == "open-x" or format == "rlds":
            if export_path == None:
@@ -207,20 +230,14 @@ def export(
                        and feature_spec.shape != ()
                    ):
                        # reverse the process
-                        value = np.load(io.BytesIO(v)).astype(
-                            feature_spec.np_dtype
-                        )
+                        value = np.load(io.BytesIO(v)).astype(feature_spec.np_dtype)
                    elif (
                        isinstance(feature_spec, tfds.core.features.Tensor)
                        and feature_spec.shape == ()
                    ):
                        value = np.array(v, dtype=feature_spec.np_dtype)
-                    elif isinstance(
-                        feature_spec, tfds.core.features.Image
-                    ):
-                        value = np.load(io.BytesIO(v)).astype(
-                            feature_spec.np_dtype
-                        )
+                    elif isinstance(feature_spec, tfds.core.features.Image):
+                        value = np.load(io.BytesIO(v)).astype(feature_spec.np_dtype)
                    else:
                        value = v
@@ -265,7 +282,18 @@ def load_rtx_episodes(
        additional_metadata: Optional[Dict[str, Any]] = None,
    ):
        """
-        Load the dataset.
+        Load robot data from Tensorflow Datasets.
+
+        Args:
+            name (str): Name of RT-X episodes, which can be found at [Tensorflow Datasets](https://www.tensorflow.org/datasets/catalog) under the Robotics category
+            split (optional str): the portion of data to load, see [Tensorflow Split API](https://www.tensorflow.org/datasets/splits)
+            additional_metadata (optional Dict[str, Any]): additional metadata to be associated with the loaded episodes
+
+        Example:
+        ```
+        >>> dataset.load_rtx_episodes(name="berkeley_autolab_ur5")
+        >>> dataset.load_rtx_episodes(
+        ...     name="berkeley_autolab_ur5",
+        ...     split="train[:10]",
+        ...     additional_metadata={"data_collector": "Alice", "custom_tag": "sample"},
+        ... )
+        ```
        """

        # this is only required if rtx format is used
@@ -325,26 +353,36 @@ def load_rtx_episodes(
                    fog_epsiode.add(
                        feature=str(k),
                        value=v.numpy(),
-                        feature_type=FeatureType(
-                            tf_feature_spec=data_type[k]
-                        ),
+                        feature_type=FeatureType(tf_feature_spec=data_type[k]),
                    )
                    self.step_keys.append(k)
            fog_epsiode.close()

-    def get_episode_info(self):
+    def get_episode_info(self) -> pandas.DataFrame:
        """
-        Return the metadata as pandas dataframe.
+        Returns:
+            metadata of all episodes as `pandas.DataFrame`
        """
        return self.db_manager.get_episode_info_table()

-    def get_step_data(self):
+    def get_step_data(self) -> polars.LazyFrame:
        """
-        Return the all step data as lazy dataframe.
+        Returns:
+            step data of all episodes
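+
+        Example (illustrative; `collect()` materializes the returned polars LazyFrame):
+        ```
+        >>> steps = dataset.get_step_data()
+        >>> steps.collect()
+        ```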
        """
        return self.db_manager.get_step_table_all()

-    def get_step_data_by_episode_ids(self, episode_ids: List[int], as_lazy_frame=True):
+    def get_step_data_by_episode_ids(
+        self, episode_ids: List[int], as_lazy_frame=True
+    ) -> List[polars.LazyFrame] | List[polars.DataFrame]:
+        """
+        Args:
+            episode_ids (List[int]): list of episode ids
+            as_lazy_frame (bool): whether to return polars.LazyFrame or polars.DataFrame
+
+        Returns:
+            step data of each episode
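+
+        Example (illustrative; episode ids 0 and 1 are placeholders):
+        ```
+        >>> lazy_frames = dataset.get_step_data_by_episode_ids([0, 1])
+        >>> dataframes = dataset.get_step_data_by_episode_ids([0, 1], as_lazy_frame=False)
+        ```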
+        """
        episodes = []
        for episode_id in episode_ids:
            if episode_id == None:
@@ -354,8 +392,17 @@ def get_step_data_by_episode_ids(self, episode_ids: List[int], as_lazy_frame = T
            else:
                episodes.append(self.db_manager.get_step_table(episode_id).collect())
        return episodes
-
-    def read_by(self, episode_info: Any = None):
+
+    def read_by(self, episode_info: Any = None) -> List[polars.LazyFrame]:
+        """
+        To be used with `Dataset.get_episode_info`.
+
+        Args:
+            episode_info (pandas.DataFrame): episode metadata used to determine which episodes to read
+
+        Returns:
+            episodes filtered by `episode_info`
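+
+        Example (illustrative; `data_collector` is a hypothetical metadata column set via `additional_metadata`):
+        ```
+        >>> info = dataset.get_episode_info()
+        >>> episodes = dataset.read_by(info[info["data_collector"] == "Alice"])
+        ```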
+        """
        episode_ids = list(episode_info["episode_id"])
        logger.info(f"Reading episodes as order: {episode_ids}")
        episodes = []
@@ -375,6 +422,11 @@ def get_episodes_from_metadata(self, metadata: Any = None):
        return episodes

    def pytorch_dataset_builder(self, metadata=None, **kwargs):
+        """
+        Load the current dataset as a PyTorch dataset.
+        To be used with `torch.utils.data.DataLoader`.
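+
+        Example (illustrative; batch size is an arbitrary choice):
+        ```
+        >>> from torch.utils.data import DataLoader
+        >>> loader = DataLoader(dataset.pytorch_dataset_builder(), batch_size=2)
+        ```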
+        """
+
        import torch
        from torch.utils.data import Dataset
@@ -414,15 +466,22 @@ def __getitem__(self, idx):
        return pytorch_dataset

    def get_as_huggingface_dataset(self):
+        """
+        Load the current dataset as a HuggingFace dataset.
+
+        TODO:
+            * currently the support for HuggingFace datasets is limited;
+              it only shows the capability of easily returning a hf dataset
+            * add features from the episode metadata
+            * allow selecting episodes based on queries;
+              doing so requires creating a new copy of the dataset on disk
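+
+        Example (illustrative):
+        ```
+        >>> hf_dataset = dataset.get_as_huggingface_dataset()
+        ```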
+        """
        import datasets

-        # TODO: currently the support for huggingg face dataset is limited
-        # it only shows its capability of easily returning a hf dataset
-        # TODO #1: add features from the episode metadata
-        # TODO #2: allow selecting episodes based on queries
-        # doing so requires creating a new copy of the dataset on disk
        dataset_path = self.path + "/" + self.name
-        parquet_files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path)]
+        parquet_files = [
+            os.path.join(dataset_path, f) for f in os.listdir(dataset_path)
+        ]

-        hf_dataset = datasets.load_dataset('parquet', data_files=parquet_files)
+        hf_dataset = datasets.load_dataset("parquet", data_files=parquet_files)
        return hf_dataset