# train.py

import os
import re
import time
import random
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from tqdm.autonotebook import tqdm
from torchvision.transforms import v2
from torch.utils.data import DataLoader, Subset
from torch.utils.tensorboard import SummaryWriter

from kitti360_dataset import Kitti360Dataset
from custom_transforms import SegmentationIdToTrainId
from invhuberloss import InvHuberLoss
from metrics import MeanIoU, RMSE
from log import LogMetric
from moviepy_frame_inference import create_moviepy_visualisation_in_tensorboard


def main(form_data, kitti360, HYPERPARAMETERS, model, save_model, checkpoint, debug):
  """Set up the KITTI-360 data, the selected network and its losses.

  This first part of the function builds the transforms, datasets, data
  loaders and device, then instantiates the requested model together with
  its optimiser, loss functions and loss weights.

  Args:
    form_data: Passed to `Kitti360Dataset` as `form_dir`.
    kitti360: Passed to `Kitti360Dataset` as `root`.
    HYPERPARAMETERS: Mapping read for `'batch_size'` and `'learning_rate'`.
    model: Name of the model to build: 'semantic', 'depth', 'dispnet',
      'multi-task', 'author' or 'no-relu-author'.
    save_model: Base directory joined with "checkpoints"/"best_models" when
      model files are written or restored.
    checkpoint: Falsy, or a mapping with 'run' and 'filename' keys naming a
      checkpoint to resume from.
    debug: Not referenced in this part of the function.

  Raises:
    RuntimeError: If `model` is not one of the supported names, or if no
      loss functions / weights were selected.
  """
  print(f"=> Selected model: {model}")
  # Normalise an image into mean ~= 0 and std ~= 1
  # (presumably per-channel statistics of this dataset; confirm their origin)
  normalise = v2.Normalize(mean=[0.3242, 0.3529, 0.3242], std=[0.2892, 0.3015, 0.3077])
  # Training images: colour jitter, conversion to a float image scaled by
  # ToDtype, then normalisation.
  transform = v2.Compose([
    v2.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
    # v2.RandomResizedCrop(size=(128, 416), scale=(0.9, 1.0)),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    # v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    # v2.Normalize(mean=[0.3242, 0.3529, 0.3242], std=[0.2892, 0.3015, 0.3077])
    normalise
  ])

  # Validation images are deliberately NOT normalised here: the test loop keeps
  # the raw image for visualisation and applies `normalise` itself.
  val_transform = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True)
  ])

  # Segmentation targets: label ids are remapped to train ids and stored as
  # integer (long) tensors without value scaling.
  segm_transform = v2.Compose([
    SegmentationIdToTrainId(Kitti360Dataset.labels),
    v2.ToImage(),
    # v2.ToDtype(torch.uint8, scale=True)
    v2.ToDtype(torch.long, scale=False)
  ])

  # Depth targets: converted to float32 with ToDtype's scaling enabled.
  dep_transform = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
  ])

  horizontal_flip = v2.RandomHorizontalFlip(p=0.5)


  kt360 = Kitti360Dataset(
    root=kitti360,
    form_dir=form_data,
    train=True,
    transform=transform,
    segm_only_transform=segm_transform,
    dep_only_transform=dep_transform
  )
  # NOTE(review): this calls the flip transform once on the Dataset object
  # itself, not on the samples it yields. It looks like the dataset comes back
  # unchanged and no flipping is applied during training -- confirm, and move
  # the flip into the dataset's joint image/target transform if so.
  kt360 = horizontal_flip(kt360)

  kt360_val = Kitti360Dataset(
    root=kitti360,
    form_dir=form_data,
    train=False,
    transform=val_transform,
    segm_only_transform=segm_transform,
    dep_only_transform=dep_transform
  )
  #kt360_val = horizontal_flip(kt360_val)

  # Training DataLoader (reshuffled every epoch)
  kt360_loader = DataLoader(
    kt360,
    batch_size=HYPERPARAMETERS['batch_size'],
    pin_memory=True,
    shuffle=True
  )

  # Validation DataLoader (fixed order)
  kt360_val_loader = DataLoader(
    kt360_val,
    batch_size=HYPERPARAMETERS['batch_size'],
    pin_memory=True,
    shuffle=False
  )

  # Prefer CUDA, then Apple MPS, and fall back to the CPU.
  device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
  )
  print(f"=> Using {device} device")

  # Sorted unique train ids, leaving out 255 and -1; its length is used as the
  # number of segmentation classes.
  trainId = sorted(set([x.trainId for x in kt360.labels if x.trainId != 255 and x.trainId != -1]))

  # Both are (segmentation, depth) pairs; the entry of a task the chosen model
  # does not have is None.
  weights = None
  loss_fn = None

  # Keep the name: `model` is rebound to the network instance below.
  model_name = model
  if model == "semantic":
    from SEM_LRefineNet import net
    # Load model
    model = net(len(trainId))
    model.to(device)
    optimiser = optim.Adam(model.parameters(), lr=HYPERPARAMETERS['learning_rate'])

    # Pixels labelled 255 do not contribute to the segmentation loss.
    seg_loss_function = nn.CrossEntropyLoss(ignore_index=255)
    loss_fn = (seg_loss_function, None)

    seg_weight = 1.0
    weights = (seg_weight, None)

  elif model == "depth":
    from DEPTH_LRefineNet import net
    # Load model
    model = net(len(trainId))
    model.to(device)
    optimiser = optim.Adam(model.parameters(), lr=HYPERPARAMETERS['learning_rate'])

    # presumably a depth of 0 marks pixels without ground truth -- confirm in InvHuberLoss
    depth_loss_function = InvHuberLoss(ignore_index=0)
    loss_fn = (None, depth_loss_function)

    depth_weight = 1.0
    weights = (None, depth_weight)

  elif model == "dispnet":
    from DispNetS import DispNetS as net
    # Load model
    model = net()
    model.to(device)
    optimiser = optim.Adam(model.parameters(), lr=HYPERPARAMETERS['learning_rate'])

    depth_loss_function = InvHuberLoss(ignore_index=0)
    loss_fn = (None, depth_loss_function)

    depth_weight = 1.0
    weights = (None, depth_weight)

  elif model == "multi-task":
    from MLRefineNet import net
    # Load model
    model = net(len(trainId))
    model.to(device)
    optimiser = optim.Adam(model.parameters(), lr=HYPERPARAMETERS['learning_rate'])

    seg_loss_function = nn.CrossEntropyLoss(ignore_index=255)
    depth_loss_function = InvHuberLoss(ignore_index=0)
    loss_fn = (seg_loss_function, depth_loss_function)

    # Both tasks contribute equally to the total loss.
    seg_weight, depth_weight = (0.5, 0.5)
    weights = (seg_weight, depth_weight)

  elif model == "author":
    from AUTHOR_MLRefineNet import net
    # Load model
    model = net(num_classes=len(trainId), num_tasks=2)
    model.to(device)
    optimiser = optim.Adam(model.parameters(), lr=HYPERPARAMETERS['learning_rate'])

    seg_loss_function = nn.CrossEntropyLoss(ignore_index=255)
    depth_loss_function = InvHuberLoss(ignore_index=0)
    loss_fn = (seg_loss_function, depth_loss_function)

    seg_weight, depth_weight = (0.5, 0.5)
    weights = (seg_weight, depth_weight)

  elif model == "no-relu-author":
    from NO_RELU_AUTHOR_MLRefineNet import net
    # Load model
    model = net(num_classes=len(trainId), num_tasks=2)
    model.to(device)
    optimiser = optim.Adam(model.parameters(), lr=HYPERPARAMETERS['learning_rate'])

    seg_loss_function = nn.CrossEntropyLoss(ignore_index=255)
    depth_loss_function = InvHuberLoss(ignore_index=0)
    loss_fn = (seg_loss_function, depth_loss_function)

    seg_weight, depth_weight = (0.5, 0.5)
    weights = (seg_weight, depth_weight)

  else:
    raise RuntimeError("Unexpected Error: model was not selected, please select one of the following: "
                       "'semantic', 'depth', 'dispnet', 'multi-task', 'author' or 'no-relu-author' (in terminal `--semantic`, `--depth`, `--dispnet`, `--multi-task`, `--author` or `--no-relu-author`)")

  # Defensive check: every branch above assigns both tuples.
  if weights is None or loss_fn is None:
    raise RuntimeError(f"Unexpected Error: training has None values on weights values {weights} and/or loss function {loss_fn}")

  #############################################################################
  # TRAINING & TESTING LOOPS                                                  #
  #                                                                           #
  # NOTE: SEE BELOW, AFTER THE 2 FUNCTIONS, FOR WHERE THEY ARE EXECUTED       #
  #############################################################################
  def train_loop(data_loader, model, model_name, optimiser, loss_fn, weights, epoch, writer, n_iter, start_timestamp, checkpoint):
      """Train `model` for one epoch, log to TensorBoard and save a checkpoint.

      `device`, `trainId` and `save_model` are read from the enclosing scope.

      Args:
        data_loader: Iterable yielding `(X, (y_seg, y_depth))` batches.
        model: Network to train; the outputs it must return depend on
          `model_name`.
        model_name: 'semantic', 'depth', 'dispnet', 'multi-task', 'author' or
          'no-relu-author'; selects the per-epoch routine.
        optimiser: Optimiser stepped once per batch.
        loss_fn: `(segmentation_loss, depth_loss)`; the entry of a task the
          model does not have is never called.
        weights: `(seg_weight, depth_weight)` multipliers of the two losses.
        epoch: Epoch number stored in the checkpoint and in its file name.
        writer: TensorBoard writer that receives the per-batch scalars.
        n_iter: Global batch counter, used as the TensorBoard step.
        start_timestamp: Run identifier used in the checkpoint directory name.
        checkpoint: Falsy, or a mapping with 'run' and 'filename' keys naming a
          checkpoint that is restored when `n_iter == 1`.

      Returns:
        The batch counter after this epoch (including the offset added when a
        checkpoint was restored).

      Raises:
        RuntimeError: If `model_name` is not one of the supported names.
      """
      model.train()

      loss_fn_seg, loss_fn_depth = loss_fn
      seg_weight, depth_weight = weights
      # The four depth outputs are compared with the target sub-sampled by
      # these strides, in this order.
      depth_strides = (1, 2, 4, 8)

      def to_numpy(tensor):
        """Return `tensor` detached, without a size-1 axis 1, as a NumPy array."""
        return tensor.detach().squeeze(dim=1).cpu().numpy()

      def strided(tensor, stride):
        """Keep every `stride`-th element along axes 2 and 3."""
        return tensor[:, :, ::stride, ::stride]

      def log_scalars(scalars, step):
        """Write every `tag: value` pair to TensorBoard at `step`."""
        for tag, value in scalars.items():
          writer.add_scalar(tag, value, step)

      def backpropagate(loss):
        """Apply one optimiser update for `loss`, then clear the gradients."""
        loss.backward()
        optimiser.step()
        optimiser.zero_grad()

      def depth_losses(disps, y_depth):
        """Return the per-scale depth losses and their equally weighted sum."""
        per_scale = [
          loss_fn_depth(disp.squeeze(dim=1), strided(y_depth, stride).squeeze(dim=1))
          for disp, stride in zip(disps, depth_strides)
        ]
        combined = 1/4 * per_scale[0] + 1/4 * per_scale[1] + 1/4 * per_scale[2] + 1/4 * per_scale[3]
        return per_scale, combined

      def update_rmses(rmses, disps, y_depth):
        """Feed each scale's prediction and sub-sampled target to its RMSE."""
        for rmse, disp, stride in zip(rmses, disps, depth_strides):
          rmse.update(to_numpy(disp), to_numpy(strided(y_depth, stride)))

      def rmse_tag(rmses):
        """Format the RMSE values the way they appear in checkpoint file names."""
        return "_".join(f"rmse{scale}d{rmse.val():>8f}" for scale, rmse in enumerate(rmses, start=1))

      def save_checkpoint(metric_tag, last_loss, n_iter):
        """Save model and optimiser state for this epoch in the run's directory."""
        checkpoints_path = os.path.join(save_model, "checkpoints", model_name, f"{start_timestamp}_run/")
        os.makedirs(checkpoints_path, exist_ok=True)
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        torch.save({
          'epoch': epoch,
          'n_iter': n_iter,
          'model_state_dict': model.state_dict(),
          'optimizer_state_dict': optimiser.state_dict(),
          # Loss of the last batch of the epoch, not an epoch average.
          'loss': last_loss,
        }, os.path.join(checkpoints_path, f"{model_name}_model_checkpoints_epoch{epoch}_{metric_tag}_{timestamp}.pt"))

      def semantic_loop(n_iter):
        """Epoch routine for the segmentation-only model."""
        meaniou = MeanIoU(len(trainId))
        progress_bar = tqdm(data_loader, desc="Training", leave=False)

        # The depth target is unused here, so it is not moved to the device.
        for X, (y_seg, _) in progress_bar:
          X, y_seg = X.to(device), y_seg.to(device)

          pred_seg = model(X)
          loss_seg = loss_fn_seg(pred_seg.squeeze(dim=1), y_seg.squeeze(dim=1))
          loss = seg_weight * loss_seg

          log_scalars({'Total_Loss_per_batch': loss, 'Loss_Seg_per_batch': loss_seg}, n_iter)

          backpropagate(loss)

          meaniou.update(to_numpy(pred_seg), to_numpy(y_seg))
          writer.add_scalar('Train_MeanIoU_over_batch', meaniou.val(), n_iter)

          progress_bar.set_postfix({
            "MeanIoU": f"{meaniou.val() * 100:.2f}%",
            "Loss": loss.item(),
            "Loss_Seg": loss_seg.item(),
          })
          n_iter += 1  # step for the TensorBoard `writer`

        save_checkpoint(f"miou{meaniou.val() * 100:.2f}", loss.item(), n_iter)
        meaniou.reset()
        return n_iter

      def depth_loop(n_iter):
        """Epoch routine for the depth-only models (four output scales)."""
        rmses = [RMSE() for _ in depth_strides]
        progress_bar = tqdm(data_loader, desc="Training", leave=False)

        # The segmentation target is unused here, so it is not moved to the device.
        for X, (_, y_depth) in progress_bar:
          X, y_depth = X.to(device), y_depth.to(device)

          disp1, disp2, disp3, disp4 = model(X)
          disps = (disp1, disp2, disp3, disp4)
          per_scale, loss_depth = depth_losses(disps, y_depth)
          loss = depth_weight * loss_depth

          log_scalars({'Total_Loss_per_batch': loss, 'Loss_Depth_per_batch': loss_depth}, n_iter)
          log_scalars({f'Loss_Depth{scale}_per_batch': value for scale, value in enumerate(per_scale, start=1)}, n_iter)

          backpropagate(loss)

          update_rmses(rmses, disps, y_depth)
          log_scalars({f'Train_RMSE{scale}_over_batch': rmse.val() for scale, rmse in enumerate(rmses, start=1)}, n_iter)

          progress_bar.set_postfix({
            **{f"RMSE_{scale}": rmse.val() for scale, rmse in enumerate(rmses, start=1)},
            "Loss": loss.item(),
            "Loss_Depth": loss_depth.item(),
          })
          n_iter += 1  # step for the TensorBoard `writer`

        save_checkpoint(rmse_tag(rmses), loss.item(), n_iter)
        for rmse in rmses:
          rmse.reset()
        return n_iter

      def multi_task_loop(n_iter):
        """Epoch routine for the joint model: four depth scales plus segmentation."""
        meaniou = MeanIoU(len(trainId))
        rmses = [RMSE() for _ in depth_strides]
        progress_bar = tqdm(data_loader, desc="Training", leave=False)

        for X, (y_seg, y_depth) in progress_bar:
          X, y_seg, y_depth = X.to(device), y_seg.to(device), y_depth.to(device)

          disp1, disp2, disp3, disp4, pred_seg = model(X)
          disps = (disp1, disp2, disp3, disp4)
          loss_seg = loss_fn_seg(pred_seg.squeeze(dim=1), y_seg.squeeze(dim=1))
          per_scale, loss_depth = depth_losses(disps, y_depth)
          loss = seg_weight * loss_seg + depth_weight * loss_depth

          log_scalars({
            'Total_Loss_per_batch': loss,
            'Loss_Seg_per_batch': loss_seg,
            'Loss_Depth_per_batch': loss_depth,
          }, n_iter)
          log_scalars({f'Loss_Depth{scale}_per_batch': value for scale, value in enumerate(per_scale, start=1)}, n_iter)

          backpropagate(loss)

          meaniou.update(to_numpy(pred_seg), to_numpy(y_seg))
          update_rmses(rmses, disps, y_depth)
          writer.add_scalar('Train_MeanIoU_over_batch', meaniou.val(), n_iter)
          log_scalars({f'Train_RMSE{scale}_over_batch': rmse.val() for scale, rmse in enumerate(rmses, start=1)}, n_iter)

          progress_bar.set_postfix({
            "MeanIoU": f"{meaniou.val() * 100:.2f}%",
            **{f"RMSE_{scale}": rmse.val() for scale, rmse in enumerate(rmses, start=1)},
            "Loss": loss.item(),
            "Loss_Seg": loss_seg.item(),
            "Loss_Depth": loss_depth.item(),
          })
          n_iter += 1  # step for the TensorBoard `writer`

        save_checkpoint(f"miou{meaniou.val() * 100:.2f}_{rmse_tag(rmses)}", loss.item(), n_iter)
        meaniou.reset()
        for rmse in rmses:
          rmse.reset()
        return n_iter

      def author_loop(n_iter):
        """Epoch routine for the 'author' models (one segmentation and one depth output)."""
        meaniou = MeanIoU(len(trainId))
        rmse1 = RMSE()
        progress_bar = tqdm(data_loader, desc="Training", leave=False)

        for X, (y_seg, y_depth) in progress_bar:
          X, y_seg, y_depth = X.to(device), y_seg.to(device), y_depth.to(device)

          pred_seg, pred_depth = model(X)
          # Resize both outputs to the resolution of their targets instead of
          # a hard-coded size, so other input resolutions also work.
          pred_seg = F.interpolate(pred_seg, size=y_seg.shape[-2:], mode='nearest')
          pred_depth = F.interpolate(pred_depth, size=y_depth.shape[-2:], mode='bilinear', align_corners=False)

          loss_seg = loss_fn_seg(pred_seg.squeeze(dim=1), y_seg.squeeze(dim=1))
          loss_depth = loss_fn_depth(pred_depth.squeeze(dim=1), y_depth.squeeze(dim=1))
          loss = seg_weight * loss_seg + depth_weight * loss_depth

          log_scalars({
            'Total_Loss_per_batch': loss,
            'Loss_Seg_per_batch': loss_seg,
            'Loss_Depth_per_batch': loss_depth,
          }, n_iter)

          backpropagate(loss)

          meaniou.update(to_numpy(pred_seg), to_numpy(y_seg))
          rmse1.update(to_numpy(pred_depth), to_numpy(y_depth))
          log_scalars({
            'Train_MeanIoU_over_batch': meaniou.val(),
            'Train_RMSE1_over_batch': rmse1.val(),
          }, n_iter)

          progress_bar.set_postfix({
            "MeanIoU": f"{meaniou.val() * 100:.2f}%",
            "RMSE1": rmse1.val(),
            "Loss": loss.item(),
            "Loss_Seg": loss_seg.item(),
            "Loss_Depth": loss_depth.item(),
          })
          n_iter += 1  # step for the TensorBoard `writer`

        save_checkpoint(f"miou{meaniou.val() * 100:.2f}_{rmse_tag([rmse1])}", loss.item(), n_iter)
        meaniou.reset()
        rmse1.reset()
        return n_iter

      epoch_loops = {
        "semantic": semantic_loop,
        "depth": depth_loop,
        "dispnet": depth_loop,
        "multi-task": multi_task_loop,
        "author": author_loop,
        "no-relu-author": author_loop,
      }
      # Fail before touching any checkpoint, and report the offending name.
      if model_name not in epoch_loops:
        raise RuntimeError(f"Unexpected Error: train loop does not support model name '{model_name}'")

      if checkpoint and n_iter == 1:
        # Resume: restore model and optimiser state, then advance the step
        # counter past the batches of the epochs already trained.
        checkpoints_path = os.path.join(save_model, "checkpoints", model_name, checkpoint['run'], checkpoint['filename'])
        model_checkpoint = torch.load(checkpoints_path, weights_only=True, map_location=device)
        model.load_state_dict(model_checkpoint['model_state_dict'])
        optimiser.load_state_dict(model_checkpoint['optimizer_state_dict'])
        n_iter += (model_checkpoint['epoch'] * len(data_loader))

      #################
      # Training Loop #
      #################
      return epoch_loops[model_name](n_iter)


  @torch.no_grad()
  def test_loop(data_loader, model, model_name, loss_fn, weights, writer, log_for_best, start_timestamp):
      model.eval()

      #########################################
      # HELPERS - COLOURING AND SAVING IMAGES #
      #########################################
      # Function to create a color mapping from train_id to RGB color
      def create_colour_mapping(classes):
          """Return a dict from each label's `trainId` to its `color`.

          When several labels share a `trainId`, the last one in `classes` wins.
          """
          mapping = {}
          for label in classes:
              mapping[label.trainId] = label.color
          return mapping

      # Function to apply color to the semantic segmentation mask
      def apply_colour_map(target_tensor, color_mapping):
          """Turn a batch of train-id masks into RGB images.

          Args:
            target_tensor: Integer mask with four axes (batch, channel, height,
              width); only channel 0 is read.
            color_mapping: Mapping from train id to an (R, G, B) sequence.

          Returns:
            A CPU `torch.uint8` tensor of shape (batch, 3, height, width).
            Pixels whose id is not a key of `color_mapping` stay black.
          """
          batch_size, _, h, w = target_tensor.shape

          # The output is built on the CPU, so compare on a host copy of the ids:
          # a mask living on another device cannot be relied on to index it.
          ids = target_tensor[:, 0].detach().cpu()

          # Channel-last layout lets one boolean mask paint all three channels
          # of every image in the batch at once.
          coloured = torch.zeros((batch_size, h, w, 3), dtype=torch.uint8)
          for train_id, color in color_mapping.items():
              coloured[ids == train_id] = torch.tensor(color, dtype=torch.uint8)

          return coloured.permute(0, 3, 1, 2).contiguous()
      

      def apply_turbo_colourmap(depth_tensor):
        """Colour one depth map with matplotlib's 'turbo' colormap.

        Args:
          depth_tensor: Depth map that is two-dimensional once its size-1 axes
            are squeezed away.

        Returns:
          A floating-point tensor with the three colour channels first.
        """
        depth_numpy = depth_tensor.detach().squeeze().cpu().numpy()

        # Rescale to [0, 1] for the colormap. A constant map has a zero range;
        # dividing by it would turn every pixel into NaN, so map it to 0 instead.
        depth_min = depth_numpy.min()
        depth_range = depth_numpy.max() - depth_min
        if depth_range == 0:
          depth_normalized = np.zeros_like(depth_numpy)
        else:
          depth_normalized = (depth_numpy - depth_min) / depth_range

        # The colormap returns RGBA in the last axis; drop the alpha channel.
        turbo_colormap = plt.colormaps['turbo']
        depth_colored = turbo_colormap(depth_normalized)[:, :, :3]

        # (H, W, 3) -> (3, H, W) so the images can be stacked into NCHW.
        return torch.from_numpy(depth_colored).permute(2, 0, 1)
      

      def save_sample_images(model, num_samples, model_name):
          """Log predictions for random validation samples to TensorBoard.

          Samples are drawn with replacement from `kt360_val`; `writer`,
          `epoch`, `normalise` and `device` come from the enclosing scope.
          Nothing is logged for an unrecognised `model_name`.

          Args:
            model: Network used to produce the predictions.
            num_samples: Number of validation samples to draw.
            model_name: 'semantic', 'depth', 'dispnet', 'multi-task', 'author'
              or 'no-relu-author'; decides how the model output is unpacked.
          """
          samples = [kt360_val[random.randint(0, len(kt360_val) - 1)] for _ in range(num_samples)]

          # Stack the per-sample tensors into (N, C, H, W) batches.
          X_input = torch.stack([X for X, _ in samples])
          y_seg_batch = torch.stack([y_seg for _, (y_seg, _) in samples])
          y_depth_batch = torch.stack([y_depth for _, (_, y_depth) in samples])

          # `X_input` stays un-normalised for display; only the model input is
          # normalised and moved to the device. The targets are used for
          # visualisation only, so they stay on the host.
          X_batch = normalise(X_input.clone()).to(device)

          pred_seg = None
          pred_depth = None
          if model_name == "semantic":
              pred_seg = model(X_batch)
          elif model_name == "depth" or model_name == "dispnet":
              pred_depth = model(X_batch)
          elif model_name == "multi-task":
              pred_depth, pred_seg = model(X_batch)
          elif model_name == "author" or model_name == "no-relu-author":
              pred_seg, pred_depth = model(X_batch)
              # Resize the outputs to the ground-truth resolution rather than a
              # hard-coded size.
              pred_seg = F.interpolate(pred_seg, size=y_seg_batch.shape[-2:], mode='nearest')
              pred_depth = F.interpolate(pred_depth, size=y_depth_batch.shape[-2:], mode='bilinear', align_corners=False)
          else:
              return

          writer.add_images("Qualitative_Results_RGB", X_input, epoch, dataformats='NCHW')

          if pred_seg is not None:
              colour_mapping = create_colour_mapping(Kitti360Dataset.labels)
              # Most likely class per pixel, kept as (N, 1, H, W) on the host.
              pred_classes = torch.argmax(pred_seg, dim=1, keepdim=True).cpu()
              writer.add_images("Qualitative_Results_Prediction_Semantic", apply_colour_map(pred_classes, colour_mapping), epoch, dataformats='NCHW')
              writer.add_images("Qualitative_Results_Ground_Truth_Semantic", apply_colour_map(y_seg_batch, colour_mapping), epoch, dataformats='NCHW')

          if pred_depth is not None:
              # Each map is colourised separately, i.e. normalised per image.
              pred_depth_colored = torch.stack([apply_turbo_colourmap(d) for d in pred_depth])
              y_depth_colored = torch.stack([apply_turbo_colourmap(d) for d in y_depth_batch])
              writer.add_images("Qualitative_Results_Prediction_Depth", pred_depth_colored, epoch, dataformats='NCHW')
              writer.add_images("Qualitative_Results_Ground_Truth_Depth", y_depth_colored, epoch, dataformats='NCHW')
      #########################################

      def semantic_loop():
        """Evaluate the segmentation-only model and keep the best weights.

        Runs the model over `data_loader`, logs the epoch's MeanIoU and sample
        images, and overwrites `<model_name>_best_model.pt` when the MeanIoU
        is at least, and the average loss at most, the values returned by
        `log_for_best`. `log_for_best` is then updated with this epoch's
        values.
        """
        num_batches = len(data_loader)
        test_loss = 0

        meaniou = MeanIoU(len(trainId))

        # Neither changes between batches, so unpack them once.
        loss_fn_seg, _ = loss_fn
        seg_weight, _ = weights

        progress_bar = tqdm(data_loader, desc="Testing", leave=False)

        # The depth target is unused here, so it is not moved to the device.
        for X, (y_seg, _) in progress_bar:
            # Inputs are normalised here, before being moved to the device.
            X, y_seg = normalise(X).to(device), y_seg.to(device)

            pred_seg = model(X)
            loss_seg = loss_fn_seg(pred_seg.squeeze(dim=1), y_seg.squeeze(dim=1))
            loss = seg_weight * loss_seg
            test_loss += loss

            meaniou.update(pred_seg.squeeze(dim=1).cpu().numpy(), y_seg.squeeze(dim=1).cpu().numpy())

            progress_bar.set_postfix({
                "MeanIoU": f"{meaniou.val() * 100:.2f}%",
                "Loss": loss.item(),
                "Loss_Seg": loss_seg.item(),
            })

        writer.add_scalar("MeanIoU_per_epoch", meaniou.val(), epoch)

        # Qualitative results for TensorBoard.
        save_sample_images(model=model, num_samples=16, model_name=model_name)

        best_models_path = os.path.join(save_model, "best_models", model_name, f"{start_timestamp}_run/")
        os.makedirs(best_models_path, exist_ok=True)

        test_loss /= num_batches
        previous_meaniou = log_for_best.getMetric()
        previous_loss = log_for_best.getLoss()
        # Higher MeanIoU is better, lower loss is better.
        if meaniou.val() >= previous_meaniou and test_loss <= previous_loss:
          torch.save({
              'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimiser.state_dict(),
              # NOTE(review): this is the last batch's loss, not the epoch
              # average in `test_loss` -- confirm which one is intended.
              'loss': loss.item(),
              }, os.path.join(best_models_path, f"{model_name}_best_model.pt"))

        tqdm.write(f"Test Error: \n MeanIoU: {meaniou.val() * 100:.2f}%, Avg loss: {test_loss:>8f} \n")

        # Record this epoch's metric and loss for the next comparison.
        log_for_best.update(meaniou.val(), test_loss)

        meaniou.reset()


      def depth_loop():
        """Run one evaluation pass for a single-output depth estimation model.

        Works on the names of the enclosing scope (`data_loader`, `model`,
        `loss_fn`, `weights`, `writer`, `epoch`, `log_for_best`, ...).
        For every batch the images are normalised, the weighted depth loss is
        computed and the running RMSE is updated. Afterwards the RMSE is written
        to TensorBoard, sample images are saved, the model is stored as
        `<model_name>_best_model.pt` when neither the RMSE nor the average loss
        got worse than the values held by `log_for_best`, and finally the
        current values are stored in `log_for_best`.
        """
        num_batches = len(data_loader)
        # Summed as a plain Python float so that no tensor (and no autograd
        # history, should gradients be enabled) is kept alive across batches.
        test_loss = 0.0

        rmse = RMSE()

        # Only the depth half of the (seg, depth) pairs is used here.
        _unused_loss_fn, loss_fn_depth = loss_fn
        _unused_weight, depth_weight = weights

        # Initialize tqdm with dynamic postfix
        progress_bar = tqdm(data_loader, desc="Testing", leave=False)

        for X, (_unused_seg, y_depth) in progress_bar:
            X = normalise(X)
            # Tensors to Device (the segmentation target is not needed by this model)
            X, y_depth = X.to(device), y_depth.to(device)

            pred_depth = model(X)

            loss_depth = loss_fn_depth(pred_depth.squeeze(dim=1), y_depth.squeeze(dim=1))
            loss = depth_weight * loss_depth
            test_loss += loss.item()

            rmse.update(pred_depth.squeeze(dim=1).cpu().numpy(), y_depth.squeeze(dim=1).cpu().numpy())

            # Update tqdm with metrics information
            progress_bar.set_postfix({
                "RMSE": rmse.val(),
                "Loss": loss.item(),
                "Loss_Depth": loss_depth.item()
            })

        writer.add_scalar("RMSE_per_epoch", rmse.val(), epoch)

        # Save qualitative images results
        save_sample_images(model=model, num_samples=16, model_name=model_name)

        best_models_path = os.path.join(save_model, "best_models", model_name, f"{start_timestamp}_run/")
        os.makedirs(best_models_path, exist_ok=True)

        test_loss /= num_batches
        previous_rmse = log_for_best.getMetric()
        previous_loss = log_for_best.getLoss()
        # RMSE is an error measure: lower is better (same direction as the loss).
        # Before the first evaluation `log_for_best` holds the placeholder pair
        # (0, np.inf); a stored loss of infinity therefore means there is no
        # real RMSE to compare against yet.
        first_evaluation = previous_loss == np.inf
        rmse_not_worse = first_evaluation or rmse.val() <= previous_rmse
        if rmse_not_worse and test_loss <= previous_loss:
          torch.save({
              'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimiser.state_dict(),
              # Record the averaged loss the selection was based on, not the
              # loss of whichever batch happened to be last.
              'loss': test_loss,
              }, os.path.join(best_models_path, f"{model_name}_best_model.pt"))

        tqdm.write(f"Test Error: \n RMSE: {(rmse.val()):>8f}, Avg loss: {test_loss:>8f} \n")

        # Update `log_for_best` with current metrics and loss
        log_for_best.update(rmse.val(), test_loss)

        # Reset metrics
        rmse.reset()


      def multi_task_loop():
        """Run one evaluation pass for the multi-task (segmentation + depth) model.

        Works on the names of the enclosing scope (`data_loader`, `model`,
        `loss_fn`, `weights`, `writer`, `epoch`, `log_for_best`, ...).
        The model is called once per batch and returns `(pred_depth, pred_seg)`.
        The weighted sum of both task losses is accumulated, and mean IoU and
        RMSE are updated. Afterwards both metrics are written to TensorBoard,
        sample images are saved, the model is stored as
        `<model_name>_best_model.pt` when the average loss did not get worse and
        at least one metric did not get worse compared with `log_for_best`, and
        finally the current values are stored in `log_for_best`.
        """
        num_batches = len(data_loader)
        # Summed as a plain Python float so that no tensor (and no autograd
        # history, should gradients be enabled) is kept alive across batches.
        test_loss = 0.0

        meaniou = MeanIoU(len(trainId))
        rmse = RMSE()

        loss_fn_seg, loss_fn_depth = loss_fn
        seg_weight, depth_weight = weights

        # Initialize tqdm with dynamic postfix
        progress_bar = tqdm(data_loader, desc="Testing", leave=False)

        for X, (y_seg, y_depth) in progress_bar:
            X = normalise(X)
            # Tensors to Device
            X, y_seg, y_depth = X.to(device), y_seg.to(device), y_depth.to(device)

            pred_depth, pred_seg = model(X)

            loss_seg = loss_fn_seg(pred_seg.squeeze(dim=1), y_seg.squeeze(dim=1))
            loss_depth = loss_fn_depth(pred_depth.squeeze(dim=1), y_depth.squeeze(dim=1))
            loss = seg_weight * loss_seg + depth_weight * loss_depth
            test_loss += loss.item()

            meaniou.update(pred_seg.squeeze(dim=1).cpu().numpy(), y_seg.squeeze(dim=1).cpu().numpy())
            rmse.update(pred_depth.squeeze(dim=1).cpu().numpy(), y_depth.squeeze(dim=1).cpu().numpy())

            # Update tqdm with metrics information
            progress_bar.set_postfix({
                "MeanIoU": f"{meaniou.val() * 100:.2f}%",
                "RMSE": rmse.val(),
                "Loss": loss.item(),
                "Loss_Seg": loss_seg.item(),
                "Loss_Depth": loss_depth.item()
            })

        writer.add_scalar("MeanIoU_per_epoch", meaniou.val(), epoch)
        writer.add_scalar("RMSE_per_epoch", rmse.val(), epoch)

        # Save qualitative images results
        save_sample_images(model=model, num_samples=16, model_name=model_name)

        best_models_path = os.path.join(save_model, "best_models", model_name, f"{start_timestamp}_run/")
        os.makedirs(best_models_path, exist_ok=True)

        test_loss /= num_batches
        previous_meaniou, previous_rmse = log_for_best.getMetric()
        previous_loss = log_for_best.getLoss()
        # Mean IoU: higher is better. RMSE and loss: lower is better.
        # NOTE(review): before the first evaluation the stored metrics are the
        # (0, 0) placeholders, so the mean-IoU clause is what lets the first
        # epoch through; this assumes MeanIoU.val() is never negative.
        metric_not_worse = meaniou.val() >= previous_meaniou or rmse.val() <= previous_rmse
        if metric_not_worse and test_loss <= previous_loss:
          torch.save({
              'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimiser.state_dict(),
              # Record the averaged loss the selection was based on, not the
              # loss of whichever batch happened to be last.
              'loss': test_loss,
              }, os.path.join(best_models_path, f"{model_name}_best_model.pt"))

        tqdm.write(f"Test Error: \n MeanIoU: {meaniou.val() * 100:.2f}%, RMSE: {(rmse.val()):>8f}, Avg loss: {test_loss:>8f} \n")

        # Update `log_for_best` with current metrics and loss
        log_for_best.update_tuple((meaniou.val(), rmse.val()), test_loss)

        # Reset metrics
        meaniou.reset()
        rmse.reset()

      
      def author_loop():
        """Run one evaluation pass for the author's multi-task model variants.

        Works on the names of the enclosing scope (`data_loader`, `model`,
        `loss_fn`, `weights`, `writer`, `epoch`, `log_for_best`, ...).
        The model is called once per batch and returns `(pred_seg, pred_depth)`;
        both predictions are resized to 128x416 before the losses and metrics
        are computed. Afterwards mean IoU and RMSE are written to TensorBoard,
        sample images are saved, the model is stored as
        `<model_name>_best_model.pt` when the average loss did not get worse and
        at least one metric did not get worse compared with `log_for_best`, and
        finally the current values are stored in `log_for_best`.
        """
        num_batches = len(data_loader)
        # Summed as a plain Python float so that no tensor (and no autograd
        # history, should gradients be enabled) is kept alive across batches.
        test_loss = 0.0

        meaniou = MeanIoU(len(trainId))
        rmse = RMSE()

        loss_fn_seg, loss_fn_depth = loss_fn
        seg_weight, depth_weight = weights

        # Initialize tqdm with dynamic postfix
        progress_bar = tqdm(data_loader, desc="Testing", leave=False)

        for X, (y_seg, y_depth) in progress_bar:
            X = normalise(X)
            # Tensors to Device
            X, y_seg, y_depth = X.to(device), y_seg.to(device), y_depth.to(device)

            pred_seg, pred_depth = model(X)
            # Resize the predictions to the fixed 128x416 resolution:
            # nearest-neighbour for segmentation, bilinear for depth.
            pred_seg = F.interpolate(pred_seg, size=(128, 416), mode='nearest')
            pred_depth = F.interpolate(pred_depth, size=(128, 416), mode='bilinear', align_corners=False)

            loss_seg = loss_fn_seg(pred_seg.squeeze(dim=1), y_seg.squeeze(dim=1))
            loss_depth = loss_fn_depth(pred_depth.squeeze(dim=1), y_depth.squeeze(dim=1))
            loss = seg_weight * loss_seg + depth_weight * loss_depth
            test_loss += loss.item()

            meaniou.update(pred_seg.squeeze(dim=1).cpu().numpy(), y_seg.squeeze(dim=1).cpu().numpy())
            rmse.update(pred_depth.squeeze(dim=1).cpu().numpy(), y_depth.squeeze(dim=1).cpu().numpy())

            # Update tqdm with metrics information
            progress_bar.set_postfix({
                "MeanIoU": f"{meaniou.val() * 100:.2f}%",
                "RMSE": rmse.val(),
                "Loss": loss.item(),
                "Loss_Seg": loss_seg.item(),
                "Loss_Depth": loss_depth.item()
            })

        writer.add_scalar("MeanIoU_per_epoch", meaniou.val(), epoch)
        writer.add_scalar("RMSE_per_epoch", rmse.val(), epoch)

        # Save qualitative images results
        save_sample_images(model=model, num_samples=16, model_name=model_name)

        best_models_path = os.path.join(save_model, "best_models", model_name, f"{start_timestamp}_run/")
        os.makedirs(best_models_path, exist_ok=True)

        test_loss /= num_batches
        previous_meaniou, previous_rmse = log_for_best.getMetric()
        previous_loss = log_for_best.getLoss()
        # Mean IoU: higher is better. RMSE and loss: lower is better.
        # NOTE(review): before the first evaluation the stored metrics are the
        # (0, 0) placeholders, so the mean-IoU clause is what lets the first
        # epoch through; this assumes MeanIoU.val() is never negative.
        metric_not_worse = meaniou.val() >= previous_meaniou or rmse.val() <= previous_rmse
        if metric_not_worse and test_loss <= previous_loss:
          torch.save({
              'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimiser.state_dict(),
              # Record the averaged loss the selection was based on, not the
              # loss of whichever batch happened to be last.
              'loss': test_loss,
              }, os.path.join(best_models_path, f"{model_name}_best_model.pt"))

        tqdm.write(f"Test Error: \n MeanIoU: {meaniou.val() * 100:.2f}%, RMSE: {(rmse.val()):>8f}, Avg loss: {test_loss:>8f} \n")

        # Update `log_for_best` with current metrics and loss
        log_for_best.update_tuple((meaniou.val(), rmse.val()), test_loss)

        # Reset metrics
        meaniou.reset()
        rmse.reset()


      ################
      # Testing Loop #
      ################
      if model_name == "semantic":
        semantic_loop()
      
      elif model_name == "depth" or model_name == "dispnet":
        depth_loop()

      elif model_name == "multi-task":
        multi_task_loop()
      
      elif model_name == "author" or model_name == "no-relu-author":
         author_loop()

      else:
        raise RuntimeError(f"Unexpected Error: Test Loop have both seg_weight and depth_weight being ({seg_weight}, {depth_weight})")


  writer = None
  run_timestamp = None
  if checkpoint:
    run_timestamp = re.match(r'(\d{8}_\d{6})_', checkpoint['run']).group(1)
    # tensorboard_path = f"{save_model}/tensorboard_events/{model_name}/{checkpoint['run']}"
    tensorboard_path = os.path.join(save_model, "tensorboard_events", model_name, checkpoint['run'])
    writer = SummaryWriter(log_dir=tensorboard_path)
  else:
    run_timestamp = time.strftime("%Y%m%d_%H%M%S")
    # tensorboard_path = f"{save_model}/tensorboard_events/{model_name}/{run_timestamp}_run/"
    tensorboard_path = os.path.join(save_model, "tensorboard_events", model_name, f"{run_timestamp}_run/")
    os.makedirs(tensorboard_path, exist_ok=True)
    writer = SummaryWriter(log_dir=tensorboard_path)

  if writer is None:
    raise RuntimeError(f"Unexpected Error: `torch.utils.tensorboard` `SummaryWriter` is none ({writer})")
  
  if run_timestamp is None:
    raise RuntimeError(f"Unexpected Error: `run_timestamp` is none ({run_timestamp})")

  # To Keep Track of Previous Metrics/Losses to check current
  # test loop Metrics/Losses to update best models
  log_for_best = LogMetric()
  if model_name == "multi-task" or model_name == "author" or model_name == "no-relu-author":
    # Loss is `np.inf` because loss will be always lower than that
    # (unless you normalised the image so bad or something weird happened to your
    # loss function)
    #                         metric  loss
    log_for_best.update_tuple((0, 0), np.inf)
  else:
    log_for_best.update(0, np.inf)

  # Integer value to keep tracking of training loop to log on `writer` tensorboard
  n_iter = 1


  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
  # > HERE WHERE THE EXECUTION STARTS >
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

  ########################
  # SUBSET FOR DEBUGGING #
  ########################
  if debug:
    subset_indices = list(range(64))
    kt360_subset = Subset(kt360, subset_indices)
    kt360_loader = DataLoader(
        kt360_subset,
        batch_size=HYPERPARAMETERS['batch_size'],
        pin_memory=True,
        shuffle=True
    )
    kt360_val_subset = Subset(kt360_val, subset_indices)
    kt360_val_loader = DataLoader(
        kt360_val_subset,
        batch_size=HYPERPARAMETERS['batch_size'],
        pin_memory=True,
        shuffle=False
    )
    print(f"=> [DEBUG MODE] len(kt360_loader) = {len(kt360_loader)}, len(kt360_val_loader) = {len(kt360_val_loader)}")
  
  ##########################################
  # TRAINING & VALIDATION LOOP STARTS HERE #
  ##########################################
  start_epoch = None
  if checkpoint:
     start_epoch = int(re.search(r'\d+', checkpoint['epoch']).group(0))
  else:
     start_epoch = 1
  
  if start_epoch is None:
     raise RuntimeError(f"Unexpected Error: `start_epoch` is none ({start_epoch})")

  # Outer loop for epochs with tqdm and dynamic postfix
  epoch_progress = tqdm(range(start_epoch, HYPERPARAMETERS['epochs'] + 1), desc="Epochs", leave=True)

  for epoch in epoch_progress:
      n_iter_ = train_loop(data_loader=kt360_loader, model=model, model_name=model_name, optimiser=optimiser,
                 loss_fn=loss_fn, weights=weights, epoch=epoch, writer=writer, n_iter=n_iter, start_timestamp=run_timestamp, checkpoint=checkpoint)
      test_loop(data_loader=kt360_val_loader, model=model, model_name=model_name, loss_fn=loss_fn,
                weights=weights, writer=writer, log_for_best=log_for_best, start_timestamp=run_timestamp)

      tqdm.write(f"Completed epoch {epoch}/{HYPERPARAMETERS['epochs']}")
      n_iter = n_iter_

  best_models_path = os.path.join(save_model, "best_models", model_name, f"{run_timestamp}_run/", f"{model_name}_best_model.pt")
  create_moviepy_visualisation_in_tensorboard(kitti360_path=kitti360, form_data_path=form_data,
                                              model=model, model_name=model_name, best_model_path=best_models_path,
                                              writer=writer, device=device)
  # Close `writer` for tensorboard
  writer.close()


def valid_directory(path):
    """Return `path` unchanged when it names an existing directory.

    Suitable as an argparse `type=` callable.

    Raises:
        argparse.ArgumentTypeError: if `path` is not an existing directory.
    """
    # Imported here because the file only imports argparse inside its
    # `if __name__ == '__main__'` section, so the name is not available when
    # this module is imported rather than executed.
    import argparse

    if not os.path.isdir(path):
        raise argparse.ArgumentTypeError(f"Directory '{path}' not found.")
    return path


if __name__ == '__main__':
  import argparse

  # ---------------------------------------------------------------------------
  # Command-line interface: dataset locations, hyperparameters, logging,
  # checkpoint resumption and the (mutually exclusive) model selection.
  # ---------------------------------------------------------------------------
  parser = argparse.ArgumentParser(description='Multi-task Training on MLRefineNet',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)

  parser.add_argument('formatted_data', metavar='DIR',
                    help='path to formatted dataset')
  parser.add_argument('kitti360', metavar='DIR',
                      help='path to KITTI-360 dataset')
  parser.add_argument('-e', '--epochs', type=int, metavar='N', help='number of epochs', default=10)
  parser.add_argument('-b', '--batch','--batch-size', type=int, metavar='N', help='batch size for the dataset', default=12)
  parser.add_argument('-lr','--learning-rate', type=float, metavar='N', help='learning rate for the Adam Optimiser', default=1e-3)
  parser.add_argument(
        '--save-model',
        type=valid_directory,
        default='./artifacts/',
        metavar='DIR',
        help='directory to save the model. Default is ./artifacts/'
  )
  parser.add_argument('--clearml', action='store_true', help='starts clearml for cloud-based logging')
  parser.add_argument('--project_name', type=str, help='name of the ClearML project')
  parser.add_argument('--task_name', type=str, help='name of the ClearML task')

  parser.add_argument('-ckpt', '--run-from-checkpoint', action='store_true', help='continue training from the latest checkpoint')
  parser.add_argument('-r', '--read-checkpoint', metavar='DIR', help='override checkpoints path e.g. ("20240902_202049_run")')
  parser.add_argument('--from-epoch', type=int, metavar='N', help='override from which epoch checkpoint to retrieve')

  parser.add_argument('--debug', action='store_true', help='use debug mode with small subset of the model (64 images)')

  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument('-s', '--semantic', action='store_true', help='use single output semantic segmentation model')
  group.add_argument('-d', '--depth', action='store_true', help='use single output depth estimation model')
  group.add_argument('-ds', '--dispnet', action='store_true', help='use SfmLearner\'s DispNetS depth estimation model')
  group.add_argument('-m', '--multi-task', action='store_true', help='use the multi-task model for both semantic segmentation and depth estimation outputs')
  group.add_argument('-a', '--author', action='store_true', help='use the **author\'s** multi-task model for both semantic segmentation and depth estimation outputs')
  group.add_argument('-na', '--no-relu-author', action='store_true', help='use the NO RELU6 at the end **author\'s** multi-task model for both semantic segmentation and depth estimation outputs')

  args = parser.parse_args()

  # A value of exactly 1 is valid, as the error message states; only values
  # below 1 are rejected.
  if args.epochs < 1 or args.batch < 1:
    parser.error(f"-e/--epochs and/or -b/--batch/--batch-size must be equal to or greater than 1 (--epoch: {args.epochs}, --batch: {args.batch})")

  # Check if --read-checkpoint is used without --run-from-checkpoint
  if args.read_checkpoint and not args.run_from_checkpoint:
    parser.error("--read-checkpoint can only be used with -ckpt/--run-from-checkpoint.")

  # Check if --from-epoch is used without --read-checkpoint
  if args.from_epoch is not None and not args.read_checkpoint:
    parser.error("--from-epoch can only be used with --read-checkpoint.")

  if args.clearml:
    if not args.project_name or not args.task_name:
      parser.error("--project_name and --task_name are required when --clearml is set")

    # Imported lazily so ClearML is only required when cloud logging is requested
    from clearml import Task
    task = Task.init(project_name=args.project_name,
                     task_name=args.task_name,
                     auto_connect_frameworks={'pytorch': False})  # Avoids uploading continuously the checkpoints which takes a lot of space

    task.set_parameters_as_dict(vars(args))
    print("=> ClearML Cloud-Based Logging Enabled")

  # The hyperparameters are the same whether or not ClearML logging is enabled
  HYPERPARAMETERS = {
    'epochs': args.epochs,
    'batch_size': args.batch,
    'learning_rate': args.learning_rate
  }

  if HYPERPARAMETERS['epochs'] is None or HYPERPARAMETERS['batch_size'] is None or HYPERPARAMETERS['learning_rate'] is None:
    raise RuntimeError(f"Unexpected Error: HYPERPARAMETERS contain none, --epochs {HYPERPARAMETERS['epochs']}, "
                       f"--batch {HYPERPARAMETERS['batch_size']}, --learning-rate {HYPERPARAMETERS['learning_rate']}")


  def run_from_checkpoint(model):
    """Work out which checkpoint, if any, training should resume from.

    Reads the parsed command-line `args`; every invalid situation (missing
    directory, no checkpoint files, unknown epoch, too few epochs requested)
    is reported through `parser.error`.

    Args:
      model: model name; checkpoints are looked up under
        `<save-model>/checkpoints/<model>/<run>/`.

    Returns:
      `None` when `-ckpt/--run-from-checkpoint` was not given, otherwise a
      dict with the keys "run" (run directory name), "epoch" (`"epoch<N>"`)
      and "filename" (name of the checkpoint file inside the run directory).
    """
    if not args.run_from_checkpoint:
      return None

    pattern = r'epoch(\d+)'

    checkpoint_path = os.path.join(args.save_model, "checkpoints", model)
    try:
      valid_directory(checkpoint_path)
    except argparse.ArgumentTypeError as err:
      parser.error(str(err))

    # Pick the run directory: the requested one, or the last one in sorted order
    if args.read_checkpoint:
      run_name = args.read_checkpoint
    else:
      run_names = sorted(os.listdir(checkpoint_path))
      if not run_names:
        parser.error(f"-ckpt/--run-from-checkpoint found no runs in '{checkpoint_path}'")
      run_name = run_names[-1]

    run_dir = os.path.join(checkpoint_path, run_name)
    if not os.path.isdir(run_dir):
      parser.error(f"checkpoint run directory '{run_dir}' not found")

    # (epoch text, filename) for every file carrying `epoch<N>` in its name,
    # ordered by numeric epoch; files without an epoch number are ignored.
    epoch_files = []
    for filename in os.listdir(run_dir):
      found = re.search(pattern, filename)
      if found:
        epoch_files.append((found.group(1), filename))
    epoch_files.sort(key=lambda entry: int(entry[0]))

    if not epoch_files:
      if args.read_checkpoint:
        parser.error(f"--read-checkpoint file {args.read_checkpoint} does not have any checkpoints!")
      parser.error(f"checkpoint run '{run_name}' does not have any checkpoints!")

    latest_epoch = epoch_files[-1][0]

    if args.read_checkpoint and args.from_epoch is not None:
      if args.from_epoch > int(latest_epoch):
        parser.error(f"--from-epoch should not bigger than the latest epoch checkpoint in --read-checkpoint {args.read_checkpoint}")
      elif args.from_epoch < 1:
        parser.error("--from-epoch value must be equal to or bigger than 1")
      chosen_epoch_point = args.from_epoch
      epoch_label = f"epoch{args.from_epoch}"
    else:
      chosen_epoch_point = int(latest_epoch)
      epoch_label = f"epoch{latest_epoch}"

    # Compare epoch numbers, not substrings: "epoch1" must not select "epoch10".
    checkpoint_file = next(
      (filename for epoch_text, filename in epoch_files if int(epoch_text) == chosen_epoch_point),
      None
    )
    if checkpoint_file is None:
      parser.error(f"--from-epoch {chosen_epoch_point} has no checkpoint file in --read-checkpoint {run_name}")

    if args.epochs <= chosen_epoch_point:
      parser.error(f"when continuing from checkpoint (`-ckpt`), --epoch must not be equal to or smaller than -ckpt/-r/--from-epoch (--epoch {args.epochs} <= -ckpt/-r/--from-epoch {chosen_epoch_point})")

    return {
      "run": run_name,
      "epoch": epoch_label,
      "filename": checkpoint_file
    }


  # Each mutually exclusive command-line flag selects one model name. The
  # table keeps the order in which the flags are examined.
  flag_to_model = (
    (args.semantic, "semantic"),
    (args.depth, "depth"),
    (args.dispnet, "dispnet"),
    (args.multi_task, "multi-task"),
    (args.author, "author"),
    (args.no_relu_author, "no-relu-author"),
  )
  model = next((name for is_selected, name in flag_to_model if is_selected), None)

  if model is None:
    raise RuntimeError(f"Unexpected Error: model is none ({model}), please select one of the following: "
                       "'semantic', 'depth', 'dispnet', 'multi-task', 'author' or 'no-relu-author' (in terminal `--semantic`, `--depth`, `--dispnet`, `--multi-task`, `--author` or `--no-relu-author`)")

  # Resolve the checkpoint to resume from (None for a fresh run), then train
  checkpoint = run_from_checkpoint(model)
  main(
    form_data=args.formatted_data,
    kitti360=args.kitti360,
    HYPERPARAMETERS=HYPERPARAMETERS,
    model=model,
    save_model=args.save_model,
    checkpoint=checkpoint,
    debug=args.debug
  )

  # For fresh ClearML runs, upload the best model and the TensorBoard events
  # from the last run directory (in sorted order) of this model.
  if args.clearml and args.run_from_checkpoint is False:
    best_models_path = os.path.join(args.save_model, "best_models", model)
    best_models_latest_run = sorted(os.listdir(best_models_path))[-1]
    best_model_file = os.path.join(best_models_path, best_models_latest_run, f"{model}_best_model.pt")
    task.upload_artifact(name=f"{model}_best_model", artifact_object=best_model_file)

    tensorboard_path = os.path.join(args.save_model, "tensorboard_events", model)
    tensorboard_latest_run = sorted(os.listdir(tensorboard_path))[-1]
    event_files = os.path.join(tensorboard_path, tensorboard_latest_run, "events.out.tfevents*")
    task.upload_artifact(name='TensorBoard Event', artifact_object=event_files)

    # close clearml task
    task.close()