import fnmatch
import os
import random

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset

9
class RandomScaleCrop(object):
    """
    Random scale-crop augmentation for an image and its aligned maps.

    A scale factor ``sc`` is drawn from ``scale``; a window of size
    (H/sc, W/sc) is cropped at a random position and resized back to the
    original (H, W). The returned depth is divided by ``sc`` so depth
    values stay consistent with the change of apparent scale (per the
    upstream MTAN discussion).

    Credit to Jialong Wu from https://github.com/lorenmt/mtan/issues/34.
    """

    def __init__(self, scale=(1.0, 1.2, 1.5)):
        # Stored as a tuple: a mutable default list would be shared across
        # all instances (classic Python mutable-default pitfall).
        self.scale = tuple(scale)

    def __call__(self, img, label, depth, normal):
        """
        Args:
            img:    (C, H, W) image tensor.
            label:  (H, W) tensor of per-pixel class labels.
            depth:  (1, H, W) depth map -- TODO confirm channel count with caller.
            normal: (C, H, W) surface-normal map.

        Returns:
            Tuple ``(img_, label_, depth_ / sc, normal_)``, each resized
            back to (H, W).
        """
        height, width = img.shape[-2:]
        sc = random.choice(self.scale)
        h, w = int(height / sc), int(width / sc)
        i = random.randint(0, height - h)
        j = random.randint(0, width - w)
        img_ = F.interpolate(
            img[None, :, i : i + h, j : j + w],
            size=(height, width),
            mode="bilinear",
            align_corners=True,
        ).squeeze(0)
        # Nearest-neighbour for labels: interpolation must not invent classes.
        label_ = (
            F.interpolate(
                label[None, None, i : i + h, j : j + w],
                size=(height, width),
                mode="nearest",
            )
            .squeeze(0)
            .squeeze(0)
        )
        depth_ = F.interpolate(
            depth[None, :, i : i + h, j : j + w], size=(height, width), mode="nearest"
        ).squeeze(0)
        normal_ = F.interpolate(
            normal[None, :, i : i + h, j : j + w],
            size=(height, width),
            mode="bilinear",
            align_corners=True,
        ).squeeze(0)
        return img_, label_, depth_ / sc, normal_
48
+
49
class RandomScaleCropCityScapes(object):
    """
    Random scale-crop augmentation for CityScapes: image + label + depth
    (no surface normals). Same scheme as ``RandomScaleCrop``: crop a
    window of size (H/sc, W/sc) at a random position and resize back to
    (H, W); depth is divided by ``sc`` to stay consistent with the scale
    change.

    Credit to Jialong Wu from https://github.com/lorenmt/mtan/issues/34.
    """

    def __init__(self, scale=(1.0, 1.2, 1.5)):
        # Tuple instead of a mutable default list (shared-state pitfall).
        self.scale = tuple(scale)

    def __call__(self, img, label, depth):
        """
        Args:
            img:   (C, H, W) image tensor.
            label: (H, W) tensor of per-pixel class labels.
            depth: (1, H, W) depth map -- TODO confirm channel count with caller.

        Returns:
            Tuple ``(img_, label_, depth_ / sc)``, each resized back to (H, W).
        """
        height, width = img.shape[-2:]
        sc = random.choice(self.scale)
        h, w = int(height / sc), int(width / sc)
        i = random.randint(0, height - h)
        j = random.randint(0, width - w)
        img_ = F.interpolate(
            img[None, :, i : i + h, j : j + w],
            size=(height, width),
            mode="bilinear",
            align_corners=True,
        ).squeeze(0)
        # Nearest-neighbour for labels so no new class values are created.
        label_ = (
            F.interpolate(
                label[None, None, i : i + h, j : j + w],
                size=(height, width),
                mode="nearest",
            )
            .squeeze(0)
            .squeeze(0)
        )
        depth_ = F.interpolate(
            depth[None, :, i : i + h, j : j + w], size=(height, width), mode="nearest"
        ).squeeze(0)
        return img_, label_, depth_ / sc
66
+
67
class CityScapes(Dataset):
    """
    CityScapes dataset for joint 7-class semantic segmentation and depth
    estimation, loading pre-processed ``.npy`` files laid out as::

        root/{train,val}/image/{index}.npy     # H x W x C
        root/{train,val}/label_7/{index}.npy   # H x W
        root/{train,val}/depth/{index}.npy     # H x W x 1

    We could further improve the performance with the data augmentation of NYUv2 defined in:
    [1] PAD-Net: Multi-Tasks Guided Prediction-and-Distillation Network for Simultaneous Depth Estimation and Scene Parsing
    [2] Pattern affinitive propagation across depth, surface normal and semantic segmentation
    [3] Mti-net: Multiscale task interaction networks for multi-task learning

    1. Random scale in a selected ratio 1.0, 1.2, and 1.5.
    2. Random horizontal flip.

    Please note that: all baselines and MTAN did NOT apply data augmentation in the original paper.
    """

    def __init__(self, root, train=True, augmentation=False):
        """
        Args:
            root: dataset root directory (``~`` is expanded).
            train: use the ``train`` split if True, else ``val``.
            augmentation: apply random scale-crop + horizontal flip.
        """
        self.train = train
        self.root = os.path.expanduser(root)
        self.augmentation = augmentation

        # Build the split path from self.root (the original code used the
        # raw ``root`` here, silently discarding the expanduser() result,
        # which broke "~/..." paths).
        split = 'train' if train else 'val'
        self.data_path = self.root + '/' + split

        # Dataset length = number of image .npy files in the split.
        self.data_len = len(fnmatch.filter(os.listdir(self.data_path + '/image'), '*.npy'))

    def __getitem__(self, index):
        # Files are stored channel-last; moveaxis(-1, 0) -> channel-first
        # for torch. Labels are 2-D and need no axis shuffle.
        image = torch.from_numpy(np.moveaxis(np.load(self.data_path + '/image/{:d}.npy'.format(index)), -1, 0))
        semantic = torch.from_numpy(np.load(self.data_path + '/label_7/{:d}.npy'.format(index)))
        depth = torch.from_numpy(np.moveaxis(np.load(self.data_path + '/depth/{:d}.npy'.format(index)), -1, 0))

        # Apply data augmentation if required.
        if self.augmentation:
            image, semantic, depth = RandomScaleCropCityScapes()(image, semantic, depth)
            # Random horizontal flip: width is dim 2 for CHW tensors and
            # dim 1 for the HW label map.
            if torch.rand(1) < 0.5:
                image = torch.flip(image, dims=[2])
                semantic = torch.flip(semantic, dims=[1])
                depth = torch.flip(depth, dims=[2])

        return image.float(), semantic.float(), depth.float()

    def __len__(self):
        return self.data_len