PR #99: Revert to old Dataset class

ma595 · ma595 · commit 84bb85e5a364 · 2025-07-08T15:34:49.000+01:00
diff --git a/exercises/01_penguin_classification.ipynb b/exercises/01_penguin_classification.ipynb
@@ -167,29 +167,28 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from typing import List, Tuple, Dict\n",
+    "from typing import List, Tuple, Any, Dict\n",
     "\n",
     "# import some useful functions here, see https://pytorch.org/docs/stable/torch.html\n",
     "# where `tensor` is used for constructing tensors,\n",
     "# and using a lower-precision float32 is advised for performance\n",
-    "\n",
     "# Task 4: add imports here\n",
-    "# from torch import tensor, Tensor, float32\n",
+    "# from torch import tensor, float32\n",
     "\n",
     "from torch.utils.data import Dataset\n",
     "\n",
     "from palmerpenguins import load_penguins\n",
     "\n",
     "\n",
     "class PenguinDataset(Dataset):\n",
-    "    \"\"\"Simplified Penguin dataset for classification tasks.\n",
+    "    \"\"\"Penguin dataset class.\n",
     "\n",
     "    Parameters\n",
     "    ----------\n",
     "    input_keys : List[str]\n",
-    "        Column names to use as input features.\n",
+    "        The column titles to use in the input feature vectors.\n",
     "    target_key : str\n",
-    "        Categorical target column (e.g., \"species\").\n",
+    "        The column title to use as the target.\n",
     "    train : bool\n",
     "        If ``True``, this object will serve as the training set, and if\n",
     "        ``False``, the validation set.\n",
@@ -198,6 +197,7 @@
     "    -----\n",
     "    The validation split contains 10 male and 10 female penguins of each\n",
     "    species.\n",
+    "\n",
     "    \"\"\"\n",
     "\n",
     "    def __init__(\n",
@@ -206,44 +206,77 @@
     "        target_key: str,\n",
     "        train: bool,\n",
     "    ):\n",
-    "        \"\"\"Build `PenguinDataset` for classification.\"\"\"\n",
+    "        \"\"\"Build ``PenguinDataset``.\"\"\"\n",
     "        self.input_keys = input_keys\n",
     "        self.target_key = target_key\n",
     "\n",
-    "        # Load and clean full dataset\n",
     "        data = load_penguins()\n",
-    "        data = data.dropna().sort_values(by=sorted(data.columns)).reset_index(drop=True)\n",
-    "        data[\"sex\"] = (data[\"sex\"] == \"male\").astype(float)\n",
+    "        data = (\n",
+    "            data.loc[~data.isna().any(axis=1)]\n",
+    "            .sort_values(by=sorted(data.keys()))\n",
+    "            .reset_index(drop=True)\n",
+    "        )\n",
+    "        # Transform the sex field into a float, with male represented by 1.0, female by 0.0\n",
+    "        data.sex = (data.sex == \"male\").astype(float)\n",
     "        self.full_df = data\n",
     "\n",
-    "        # Create stratified validation split\n",
-    "        valid_df = data.groupby([\"species\", \"sex\"]).sample(n=10, random_state=123)\n",
-    "        train_df = data[~data.index.isin(valid_df.index)]\n",
+    "        valid_df = self.full_df.groupby(by=[\"species\", \"sex\"]).sample(\n",
+    "            n=10,\n",
+    "            random_state=123,\n",
+    "        )\n",
+    "        # The training items are simply the items *not* in the valid split\n",
+    "        train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]\n",
     "\n",
-    "        # Choose split\n",
-    "        split_df = train_df if train else valid_df\n",
+    "        self.split = {\"train\": train_df, \"valid\": valid_df}[\n",
+    "            \"train\" if train is True else \"valid\"\n",
+    "        ]\n",
     "\n",
     "        # Build label map from the full dataset\n",
-    "        unique_labels = sorted(self.full_df[target_key].unique())\n",
-    "        self.label_map = {label: idx for idx, label in enumerate(unique_labels)}\n",
+    "        unique_labels = sorted(self.full_df[self.target_key].unique())\n",
+    "        self.label_map: Dict[str, int] = {\n",
+    "            label: idx for idx, label in enumerate(unique_labels)\n",
+    "        }\n",
     "\n",
-    "        # Precompute tensors from split only\n",
-    "        self.features = tensor(split_df[input_keys].values, dtype=float32)\n",
-    "        self.targets = tensor(\n",
-    "            split_df[target_key].map(self.label_map).values, dtype=long\n",
-    "        )\n",
+    "    def __len__(self) -> int:\n",
+    "        \"\"\"Return the length of requested split.\n",
     "\n",
-    "    def get_label_map(self) -> Dict:\n",
-    "        \"\"\"Return the label-to-index mapping.\"\"\"\n",
-    "        return self.label_map\n",
+    "        Returns\n",
+    "        -------\n",
+    "        int\n",
+    "            The number of items in the dataset.\n",
     "\n",
-    "    def __len__(self) -> int:\n",
-    "        # Task 4 - Exercise #1: Return length of features\n",
-    "        return ...\n",
+    "        \"\"\"\n",
+    "        return len(self.split)\n",
+    "\n",
+    "    def __getitem__(self, idx: int) -> Tuple[Any, Any]:\n",
+    "        \"\"\"Return an input-target pair.\n",
     "\n",
-    "    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:\n",
-    "        # Task 4 - Exercise #2: Return example\n",
-    "        return ..."
+    "        Parameters\n",
+    "        ----------\n",
+    "        idx : int\n",
+    "            Index of the input-target pair to return.\n",
+    "\n",
+    "        Returns\n",
+    "        -------\n",
+    "        in_feats : Any\n",
+    "            Inputs.\n",
+    "        target : Any\n",
+    "            Targets.\n",
+    "\n",
+    "        \"\"\"\n",
+    "        # get the row index (idx) from the dataframe and\n",
+    "        # select relevant column features (provided as input_keys)\n",
+    "        feats = tuple(self.split.iloc[idx][self.input_keys])\n",
+    "\n",
+    "        # this gives a 'species' string, i.e. one of 'Gentoo', 'Chinstrap', or 'Adelie'\n",
+    "        tgt = self.split.iloc[idx][self.target_key]\n",
+    "\n",
+    "        # Task 4 -- Part (a): Convert the tuple features to PyTorch Tensors\n",
+    "\n",
+    "        # Task 4 -- Part (b): Convert the target (a string label) to its integer index, then to a 0-D tensor (scalar tensor).\n",
+    "\n",
+    "\n",
+    "        return feats, tgt"
    ]
   },
   {
@@ -300,12 +333,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Task 3 -- Part (a) and (b): Applying transforms to the data\n",
+    "### Task 4 -- Part (a) and (b): Convert Dataset outputs to PyTorch Tensors\n",
     "\n",
     "Modify the `PenguinDataset` class above so that the tuples of numbers are converted to PyTorch `torch.Tensor` s and the string targets are converted to indices.\n",
     "\n",
     "- Begin by importing relevant PyTorch functions.\n",
-    "- Complete `__len__()` and `__getitem__()` functions above.\n",
+    "- Complete the `__getitem__()` function above.\n",
     "\n",
     "Then create a training and validation set.\n",
     "\n",
@@ -314,7 +347,8 @@
     "  \n",
     "For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data.\n",
     "\n",
-    "- Is this transformation approach general? No, but it's a good start. "
+    "- Is this transformation approach general? No, but it's a good start. \n",
+    "  - For example, how would you switch between train-time and validation-time transformations?"
    ]
   },
   {
@@ -364,6 +398,8 @@
    "source": [
     "from torchvision.transforms import Compose\n",
     "\n",
+    "# from ml_workshop import PenguinDataset\n",
+    "\n",
     "# import some useful functions here, see https://pytorch.org/docs/stable/torch.html\n",
     "# where `tensor` is used for constructing tensors,\n",
     "# and using a lower-precision float32 is advised for performance\n",
diff --git a/worked-solutions/01_penguin_classification_solutions.ipynb b/worked-solutions/01_penguin_classification_solutions.ipynb
@@ -273,7 +273,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Task 1 -- Part (c): Apply umap to visualise the data"
+    "### Task 1 -- Part (c): Apply umap to visualise the data"
    ]
   },
   {
@@ -375,33 +375,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from typing import List, Tuple, Dict\n",
+    "from typing import List, Tuple, Any\n",
     "\n",
     "# import some useful functions here, see https://pytorch.org/docs/stable/torch.html\n",
     "# where `tensor` is used for constructing tensors,\n",
     "# and using a lower-precision float32 is advised for performance\n",
-    "\n",
     "# Task 4: add imports here\n",
-    "from torch import tensor, Tensor, float32, long\n",
+    "from torch import tensor, float32, long\n",
     "\n",
     "from torch.utils.data import Dataset\n",
     "\n",
     "from palmerpenguins import load_penguins\n",
     "\n",
     "\n",
     "class PenguinDataset(Dataset):\n",
-    "    \"\"\"Simplified Penguin dataset for classification tasks.\n",
+    "    \"\"\"Penguin dataset class.\n",
     "\n",
     "    Parameters\n",
     "    ----------\n",
     "    input_keys : List[str]\n",
-    "        Column names to use as input features.\n",
+    "        The column titles to use in the input feature vectors.\n",
     "    target_key : str\n",
-    "        Categorical target column (e.g., \"species\").\n",
+    "        The column title to use as the target.\n",
     "    train : bool\n",
     "        If ``True``, this object will serve as the training set, and if\n",
     "        ``False``, the validation set.\n",
@@ -410,49 +409,44 @@
     "    -----\n",
     "    The validation split contains 10 male and 10 female penguins of each\n",
     "    species.\n",
+    "\n",
     "    \"\"\"\n",
+    "\n",
     "    def __init__(\n",
     "        self,\n",
     "        input_keys: List[str],\n",
     "        target_key: str,\n",
     "        train: bool,\n",
     "    ):\n",
-    "        \"\"\"Build `PenguinDataset` for classification.\"\"\"\n",
+    "        \"\"\"Build ``PenguinDataset``.\"\"\"\n",
     "        self.input_keys = input_keys\n",
     "        self.target_key = target_key\n",
     "\n",
-    "        # Load and clean full dataset\n",
     "        data = load_penguins()\n",
     "        data = (\n",
-    "            data.dropna()\n",
-    "            .sort_values(by=sorted(data.columns))\n",
+    "            data.loc[~data.isna().any(axis=1)]\n",
+    "            .sort_values(by=sorted(data.keys()))\n",
     "            .reset_index(drop=True)\n",
     "        )\n",
-    "        data[\"sex\"] = (data[\"sex\"] == \"male\").astype(float)\n",
+    "        # Transform the sex field into a float, with male represented by 1.0, female by 0.0\n",
+    "        data.sex = (data.sex == \"male\").astype(float)\n",
     "        self.full_df = data\n",
     "\n",
-    "        # Create stratified validation split\n",
-    "        valid_df = data.groupby([\"species\", \"sex\"]).sample(n=10, random_state=123)\n",
-    "        train_df = data[~data.index.isin(valid_df.index)]\n",
+    "        valid_df = self.full_df.groupby(by=[\"species\", \"sex\"]).sample(\n",
+    "            n=10,\n",
+    "            random_state=123,\n",
+    "        )\n",
+    "        # The training items are simply the items *not* in the valid split\n",
+    "        train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]\n",
     "\n",
-    "        # Choose split\n",
-    "        split_df = train_df if train else valid_df\n",
+    "        self.split = {\"train\": train_df, \"valid\": valid_df}[\n",
+    "            \"train\" if train is True else \"valid\"\n",
+    "        ]\n",
     "\n",
     "        # Build label map from the full dataset\n",
-    "        unique_labels = sorted(self.full_df[target_key].unique())\n",
+    "        unique_labels = sorted(self.full_df[self.target_key].unique())\n",
     "        self.label_map = {label: idx for idx, label in enumerate(unique_labels)}\n",
     "\n",
-    "        # Precompute tensors from split only\n",
-    "        self.features = tensor(split_df[input_keys].values, dtype=float32)\n",
-    "        self.targets = tensor(\n",
-    "            split_df[target_key].map(self.label_map).values,\n",
-    "            dtype=long\n",
-    "        )\n",
-    "\n",
-    "    def get_label_map(self) -> Dict:\n",
-    "        \"\"\"Return the label-to-index mapping.\"\"\"\n",
-    "        return self.label_map\n",
-    "\n",
     "    def __len__(self) -> int:\n",
     "        \"\"\"Return the length of requested split.\n",
     "\n",
@@ -462,10 +456,9 @@
     "            The number of items in the dataset.\n",
     "\n",
     "        \"\"\"\n",
-    "        # Task 4 - Exercise #1: Return length of features\n",
-    "        return len(self.features)\n",
+    "        return len(self.split)\n",
     "\n",
-    "    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:\n",
+    "    def __getitem__(self, idx: int) -> Tuple[Any, Any]:\n",
     "        \"\"\"Return an input-target pair.\n",
     "\n",
     "        Parameters\n",
@@ -475,14 +468,25 @@
     "\n",
     "        Returns\n",
     "        -------\n",
-    "        in_feats : Tensor\n",
+    "        in_feats : Any\n",
     "            Inputs.\n",
-    "        target : Tensor\n",
+    "        target : Any\n",
     "            Targets.\n",
     "\n",
     "        \"\"\"\n",
-    "        # Task 4 - Exercise #2: Return example\n",
-    "        return self.features[idx], self.targets[idx]"
+    "        # get the row index (idx) from the dataframe and\n",
+    "        # select relevant column features (provided as input_keys)\n",
+    "        feats = tuple(self.split.iloc[idx][self.input_keys])\n",
+    "\n",
+    "        # this gives a 'species' string, i.e. one of 'Gentoo', 'Chinstrap', or 'Adelie'\n",
+    "        tgt = self.split.iloc[idx][self.target_key]\n",
+    "\n",
+    "        # Task 4 -- Part (a): Convert the tuple features to PyTorch Tensors\n",
+    "        feats = tensor(feats, dtype=float32)\n",
+    "\n",
+    "        # Task 4 -- Part (b): Convert the target (a string label) to its integer index, then to a 0-D tensor (scalar tensor).\n",
+    "        tgt = tensor(self.label_map[tgt], dtype=long)\n",
+    "        return feats, tgt"
    ]
   },
   {