diff --git a/.github/workflows/run_singularity_versions.yml b/.github/workflows/run_singularity_versions.yml index fe576a30..c7862636 100644 --- a/.github/workflows/run_singularity_versions.yml +++ b/.github/workflows/run_singularity_versions.yml @@ -1,6 +1,16 @@ name: Test Support for different Singularity Versions -on: [push] +on: + pull_request: + types: [ready_for_review] + + pull_request_review: + types: [submitted] + + push: + branches: + - 'main' + - 'development' jobs: Tests: @@ -10,25 +20,25 @@ jobs: matrix: include: - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.5" + DISPLAY_NAME: "Singularity Container Examples with S3.7" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.5" + SINGULARITY_VERSION: "3.7" - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.6" + DISPLAY_NAME: "Singularity Container Examples with S3.8" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.6" + SINGULARITY_VERSION: "3.8" - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.7" + DISPLAY_NAME: "Singularity Container Examples with S3.9" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.7" + SINGULARITY_VERSION: "3.9" - python-version: 3.7 - DISPLAY_NAME: "Singularity Container Examples with S3.8" + DISPLAY_NAME: "Singularity Container Examples with S3.10" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: false - SINGULARITY_VERSION: "3.8" + SINGULARITY_VERSION: "3.10" fail-fast: false diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 3c22a210..4fecec7d 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -15,26 +15,36 @@ jobs: DISPLAY_NAME: "Singularity Tests + CODECOV" RUN_TESTS: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" RUN_CODECOV: true + - python-version: 3.7 DISPLAY_NAME: "Codestyle" RUN_CODESTYLE: true + USE_SINGULARITY: false + - python-version: 3.7 DISPLAY_NAME: "Singularity Container Examples" RUN_CONTAINER_EXAMPLES: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" + - python-version: 3.7 DISPLAY_NAME: "Local Examples" RUN_LOCAL_EXAMPLES: true USE_SINGULARITY: false + - python-version: 3.8 DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" + - python-version: 3.9 DISPLAY_NAME: "Singularity Tests" RUN_TESTS: true USE_SINGULARITY: true + SINGULARITY_VERSION: "3.8" fail-fast: false name: Tests ${{ matrix.python-version }} ${{ matrix.DISPLAY_NAME }} @@ -42,6 +52,7 @@ jobs: env: RUN_TESTS: ${{ matrix.RUN_TESTS }} USE_SINGULARITY: ${{ matrix.USE_SINGULARITY }} + SINGULARITY_VERSION: ${{ matrix.SINGULARITY_VERSION }} RUN_CODECOV: ${{ matrix.RUN_CODECOV }} RUN_CODESTYLE: ${{ matrix.RUN_CODESTYLE }} RUN_CONTAINER_EXAMPLES: ${{ matrix.RUN_CONTAINER_EXAMPLES }} @@ -58,6 +69,10 @@ jobs: uses: actions/setup-go@v2 with: go-version: '1.14.15' # The Go version to download (if necessary) and use. + - name: Set up Singularity + if: matrix.USE_SINGULARITY == true + run: | + chmod +x ci_scripts/install_singularity.sh && source ./ci_scripts/install_singularity.sh - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/README.md b/README.md index b74b1a00..ec0a442e 100644 --- a/README.md +++ b/README.md @@ -54,14 +54,14 @@ cd HPOBench pip install . ``` -**Note:** This does not install *singularity (version 3.6)*. 
Please follow the steps described here: [user-guide](https://sylabs.io/guides/3.6/user-guide/quick_start.html#quick-installation-steps). +**Note:** This does not install *singularity (version 3.8)*. Please follow the steps described here: [user-guide](https://sylabs.io/guides/3.8/user-guide/quick_start.html#quick-installation-steps). If you run into problems, using the most recent singularity version might help: [here](https://singularity.hpcng.org/admin-docs/master/installation.html) ## Containerized Benchmarks -We provide all benchmarks as containerized versions to (i) isolate their dependencies and (ii) keep them reproducible. Our containerized benchmarks do not rely on external dependencies and thus do not change over time. For this, we rely on [Singularity (version 3.6)](https://sylabs.io/guides/3.6/user-guide/) and for now upload all containers to a [gitlab registry](https://gitlab.tf.uni-freiburg.de/muelleph/hpobench-registry/container_registry) +We provide all benchmarks as containerized versions to (i) isolate their dependencies and (ii) keep them reproducible. Our containerized benchmarks do not rely on external dependencies and thus do not change over time. For this, we rely on [Singularity (version 3.8)](https://sylabs.io/guides/3.8/user-guide/) and for now upload all containers to a [gitlab registry](https://gitlab.tf.uni-freiburg.de/muelleph/hpobench-registry/container_registry) -The only other requirements are: [ConfigSpace](https://github.com/automl/ConfigSpace), *scipy* and *numpy* +The only other requirements are: [ConfigSpace](https://github.com/automl/ConfigSpace), *numpy*, *oslo* and *Pyro4* ### Run a Benchmark Locally @@ -139,10 +139,9 @@ If you use a benchmark in your experiments, please specify the version number of the used container to ensure reproducibility. When starting an experiment, HPOBench writes automatically these two version numbers to the log. ### Troubleshooting and Further Notes - - **Singularity throws an 'Invalid Image format' exception** - Use a singularity version > 3. For users of the Meta-Cluster in Freiburg, you have to set the following path: - ```export PATH=/usr/local/kislurm/singularity-3.5/bin/:$PATH``` + Use a singularity version >= 3.8. If you have multiple singularity installations, you have to add the correct singularity version to your $PATH, e.g. + ```export PATH=/usr/local/kislurm/singularity-3.8/bin/:$PATH``` - **A Benchmark fails with `SystemError: Could not start an instance of the benchmark. Retried 5 times` but the container can be started locally with `singularity instance start test`** diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index b68a1b88..2d229f74 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -40,35 +40,9 @@ else echo "Skip installing packages for local examples" fi -if [[ "$USE_SINGULARITY" == "true" ]]; then - echo "Install Singularity" - - sudo apt-get update && sudo apt-get install -y \ - build-essential \ - libssl-dev \ - uuid-dev \ - libgpgme11-dev \ - squashfs-tools \ - libseccomp-dev \ - wget \ - pkg-config \ - git \ - cryptsetup - - export VERSION=3.5.3 && # adjust this as necessary \ - wget https://github.com/sylabs/singularity/archive/refs/tags/v${VERSION}.tar.gz && \ - tar -xzf v${VERSION}.tar.gz && \ - cd singularity-${VERSION} - - ./mconfig && \ - make -C builddir && \ - sudo make -C builddir install - - cd .. - install_packages="${install_packages}placeholder," -else - echo "Skip installing Singularity" -fi +# We add a placeholder / No-OP operator. 
When running the container examples, we don't install any +# additional packages. That causes an error, since `pip install .[]` does not work. +install_packages="${install_packages}NOP," # remove the trailing comma install_packages="$(echo ${install_packages} | sed 's/,*\r*$//')" diff --git a/ci_scripts/install_singularity.sh b/ci_scripts/install_singularity.sh index 292df85b..9a89e4a3 100644 --- a/ci_scripts/install_singularity.sh +++ b/ci_scripts/install_singularity.sh @@ -1,6 +1,6 @@ #!/usr/bin/env sh -echo "Install Singularity" +echo "Inside Singularity Installation Script" sudo apt-get update && sudo apt-get install -y \ build-essential \ @@ -14,21 +14,33 @@ sudo apt-get update && sudo apt-get install -y \ git \ cryptsetup -if [[ "$SINGULARITY_VERSION" == "3.5" ]]; then - export VERSION=3.5.3 -elif [[ "$SINGULARITY_VERSION" == "3.6" ]]; then - export VERSION=3.6.4 -elif [[ "$SINGULARITY_VERSION" == "3.7" ]]; then +if [[ "$SINGULARITY_VERSION" == "3.7" ]]; then export VERSION=3.7.3 + export FILENAME=singularity-"${VERSION}" + export EXTRACTED_FILENAME=singularity + elif [[ "$SINGULARITY_VERSION" == "3.8" ]]; then - export VERSION=3.8.0 + export VERSION=3.8.4 + export FILENAME=singularity-ce-"${VERSION}" + export EXTRACTED_FILENAME=singularity-ce-"${VERSION}" + +elif [[ "$SINGULARITY_VERSION" == "3.9" ]]; then + export VERSION=3.9.3 + export FILENAME=singularity-ce-"${VERSION}" + export EXTRACTED_FILENAME=singularity-ce-"${VERSION}" + +elif [[ "$SINGULARITY_VERSION" == "3.10" ]]; then + export VERSION=3.10.0 + export FILENAME=singularity-ce-"${VERSION}" + export EXTRACTED_FILENAME=singularity-ce-"${VERSION}" + else echo "Skip installing Singularity" fi -wget https://github.com/sylabs/singularity/archive/refs/tags/v${VERSION}.tar.gz && \ -tar -xzf v${VERSION}.tar.gz && \ -cd singularity-${VERSION} && \ +wget https://github.com/sylabs/singularity/releases/download/v"${VERSION}"/"${FILENAME}".tar.gz && \ +tar -xzf "${FILENAME}".tar.gz && \ +cd "${EXTRACTED_FILENAME}" && \ ./mconfig && \ make -C builddir && \ sudo make -C builddir install diff --git a/extra_requirements/lm_benchmark.json b/extra_requirements/lm_benchmark.json new file mode 100644 index 00000000..8f263bba --- /dev/null +++ b/extra_requirements/lm_benchmark.json @@ -0,0 +1,6 @@ +{ + "lm": [ + "torch==1.3.0", + "tqdm>=3.0.0" + ] +} \ No newline at end of file diff --git a/extra_requirements/mo_cnn.json b/extra_requirements/mo_cnn.json new file mode 100644 index 00000000..35914e3e --- /dev/null +++ b/extra_requirements/mo_cnn.json @@ -0,0 +1,7 @@ +{ + "mo_cnn": [ + "tqdm>=3.0.0", + "torch==1.9.0", + "pandas==1.2.4" + ] +} diff --git a/extra_requirements/multi_objective.json b/extra_requirements/multi_objective.json new file mode 100644 index 00000000..146c06a7 --- /dev/null +++ b/extra_requirements/multi_objective.json @@ -0,0 +1,3 @@ +{ + "mo_adult": ["pandas==1.2.4","scikit-learn==0.24.2","tqdm>=3.1.4"] +} \ No newline at end of file diff --git a/extra_requirements/yahpo_gym.json b/extra_requirements/yahpo_gym.json new file mode 100644 index 00000000..77bea14d --- /dev/null +++ b/extra_requirements/yahpo_gym.json @@ -0,0 +1,3 @@ +{ + "yahpo_gym": ["yahpo_gym@git+https://github.com/pfistfl/yahpo_gym#egg=yahpo_gym&subdirectory=yahpo_gym"] +} diff --git a/hpobench/benchmarks/mo/__init__.py b/hpobench/benchmarks/mo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/benchmarks/mo/adult_benchmark.py b/hpobench/benchmarks/mo/adult_benchmark.py new file mode 100644 index 00000000..a12e8a70 --- /dev/null 
+++ b/hpobench/benchmarks/mo/adult_benchmark.py @@ -0,0 +1,445 @@ +""" +Changelog: +========== + +0.0.1: +* First implementation of the Multi-Objective Fair Adult Benchmark. +""" +import logging +import time +from typing import Union, Dict, List, Any, Tuple + +import ConfigSpace as CS +import numpy as np +from ConfigSpace.conditions import GreaterThanCondition +from sklearn.metrics import accuracy_score +from sklearn.neural_network import MLPClassifier + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark +from hpobench.dependencies.mo.fairness_metrics import fairness_risk, STATISTICAL_DISPARITY, UNEQUALIZED_ODDS, \ + UNEQUAL_OPPORTUNITY +from hpobench.dependencies.mo.scalar import get_fitted_scaler +from hpobench.util.data_manager import AdultDataManager + +__version__ = '0.0.1' + +logger = logging.getLogger('ADULT_FAIR') + + +class AdultBenchmark(AbstractMultiObjectiveBenchmark): + def __init__(self, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Multi-objective fairness HPO task. Optimize the HP of a NN on the adult data set. + + Parameters + ---------- + rng : np.random.RandomState, int, None + Random seed for the benchmark's random state. + """ + super(AdultBenchmark, self).__init__(rng=rng, **kwargs) + + data_manager = AdultDataManager() + self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test = data_manager.load() + self.output_class = np.unique(self.y_train) + self.feature_names = data_manager.feature_names + self.sensitive_feature = data_manager.sensitive_names + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for the MLP. + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter('n_fc_layers', default_value=3, lower=1, upper=4, log=False), + CS.UniformIntegerHyperparameter('fc_layer_0', default_value=16, lower=2, upper=32, log=True), + CS.UniformIntegerHyperparameter('fc_layer_1', default_value=16, lower=2, upper=32, log=True), + CS.UniformIntegerHyperparameter('fc_layer_2', default_value=16, lower=2, upper=32, log=True), + CS.UniformIntegerHyperparameter('fc_layer_3', default_value=16, lower=2, upper=32, log=True), + CS.UniformFloatHyperparameter('alpha', lower=10**-5, upper=10**-1, default_value=10**-2, log=True), + CS.UniformFloatHyperparameter('learning_rate_init', lower=10**-5, upper=1, default_value=10**-3, log=True), + CS.UniformFloatHyperparameter('beta_1', lower=10**-3, upper=0.99, default_value=10**-3, log=True), + CS.UniformFloatHyperparameter('beta_2', lower=10**-3, upper=0.99, default_value=10**-3, log=True), + CS.UniformFloatHyperparameter('tol', lower=10**-5, upper=10**-2, default_value=10**-3, log=True), + ]) + + cs.add_conditions([ + # Add the fc_layer_1 (2nd layer) if we allow more than 1 `n_fc_layers`, and so on... 
+ GreaterThanCondition(cs.get_hyperparameter('fc_layer_1'), cs.get_hyperparameter('n_fc_layers'), 1), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_2'), cs.get_hyperparameter('n_fc_layers'), 2), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_3'), cs.get_hyperparameter('n_fc_layers'), 3), + ]) + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters. + + Fidelities + ---------- + budget: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameter( + CS.UniformIntegerHyperparameter( + 'budget', lower=1, upper=200, default_value=200, log=False + ) + ) + return fidelity_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return { + 'name': 'Multi-objective Asynchronous Successive Halving', + 'references': + ['@article{schmucker2021multi,' + 'title={Multi-objective Asynchronous Successive Halving},' + 'author={Schmucker, Robin and Donini, Michele and Zafar, Muhammad Bilal and Salinas,' + ' David and Archambeau, C{\'e}dric},' + 'journal={arXiv preprint arXiv:2106.12639},' + 'year={2021}']} + + @staticmethod + def get_objective_names() -> List[str]: + """Get a list of objectives evaluated in the objective_function. """ + return ['accuracy', 'DSP', 'DEO', 'DFP'] + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Objective function for the multi-objective adult benchmark. + + We train a NN and evaluate its performance using fairness metrics. + This function returns the performance on the validation set. + However, we report also train and test performance. + + Parameters + ---------- + configuration: Dict, CS.Configuration + Configuration for the MLP model. + fidelity: Dict, None + budget: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + kwargs + + Returns + ------- + Dict - + function_value : Dict - validation metrics after training on train + accuracy: float + DSO: float + DEO: float + DFP: float + cost : time to train the network + info : Dict + train_accuracy : float + valid_accuracy : float + test_accuracy : float + training_cost : float - time to train the network. 
see `training_cost` + total_cost : float - elapsed time for the entire obj_func call, + eval_train_cost : float - time to compute metrics on training split + eval_valid_cost : float - time to compute metrics on validation split + eval_test_cost : float - time to compute metrics on test split + train_DSO : float + train_DEO : float + train_DFP : float + valid_DSO : float + valid_DEO : float + valid_DFP : float + test_DSO : float + test_DEO : float + test_DFP : float + fidelity : int + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + if shuffle: + self._shuffle_data(rng=self.rng, shuffle_valid=False) + + ts_start = time.time() + + budget = fidelity['budget'] + logger.debug(f"budget for evaluation of config:{budget}") + logger.debug(f"config for evaluation:{configuration}") + + sensitive_rows_train = self.X_train[:, self.feature_names.index(self.sensitive_feature)] + sensitive_rows_val = self.X_valid[:, self.feature_names.index(self.sensitive_feature)] + sensitive_rows_test = self.X_test[:, self.feature_names.index(self.sensitive_feature)] + + X_train, X_valid, X_test = self.X_train.copy(), self.X_valid.copy(), self.X_test.copy() + + # Normalize data + scaler = get_fitted_scaler(X_train, "Standard") + if scaler is not None: + X_train = scaler(X_train) + X_valid = scaler(X_valid) + X_test = scaler(X_test) + + # Create model. The parameters fc_layer_1-3 might not be included in the search space. + hidden = [configuration['fc_layer_0'], + configuration.get('fc_layer_1', None), + configuration.get('fc_layer_2', None), + configuration.get('fc_layer_3', None)][:configuration['n_fc_layers']] + + for item in ['fc_layer_0', 'fc_layer_1', 'fc_layer_2', 'fc_layer_3', 'n_fc_layers']: + if item in configuration: + configuration.pop(item) + + # We deviate here from the original implementation. They have called `budget`-times mlp.partial_fit(). + # We call `.fit()` due to efficiency aspects. 
+ mlp = MLPClassifier(**configuration, hidden_layer_sizes=hidden, shuffle=shuffle, + random_state=self.rng, max_iter=budget) + + mlp.fit(X_train, self.y_train) + training_cost = time.time() - ts_start + + train_accuracy, train_statistical_disparity, train_unequal_opportunity, train_unequalized_odds, \ + eval_train_runtime = \ + AdultBenchmark._compute_metrics_on_split(X_train, self.y_train, sensitive_rows_train, mlp) + + val_accuracy, val_statistical_disparity, val_unequal_opportunity, val_unequalized_odds, eval_valid_runtime = \ + AdultBenchmark._compute_metrics_on_split(X_valid, self.y_valid, sensitive_rows_val, mlp) + + test_accuracy, test_statistical_disparity, test_unequal_opportunity, test_unequalized_odds, eval_test_runtime =\ + AdultBenchmark._compute_metrics_on_split(X_test, self.y_test, sensitive_rows_test, mlp) + + logger.debug(f"config: {configuration}, val_acc: {val_accuracy}, test_score: {test_accuracy}, " + f"train score: {train_accuracy}, dsp: {val_statistical_disparity}, " + f"deo :{val_unequal_opportunity}, dfp :{val_unequalized_odds}") + + elapsed_time = time.time() - ts_start + + return {'function_value': {'accuracy': float(val_accuracy), + 'DSO': float(val_statistical_disparity), + 'DEO': float(val_unequal_opportunity), + 'DFP': float(val_unequalized_odds) + }, + 'cost': training_cost, + 'info': {'train_accuracy': float(train_accuracy), + 'valid_accuracy': float(val_accuracy), + 'test_accuracy': float(test_accuracy), + 'training_cost': training_cost, + 'total_cost': elapsed_time, + 'eval_train_cost': eval_train_runtime, + 'eval_valid_cost': eval_valid_runtime, + 'eval_test_cost': eval_test_runtime, + 'train_DSO': float(train_statistical_disparity), + 'train_DEO': float(train_unequal_opportunity), + 'train_DFP': float(train_unequalized_odds), + 'valid_DSO': float(val_statistical_disparity), + 'valid_DEO': float(val_unequal_opportunity), + 'valid_DFP': float(val_unequalized_odds), + 'test_DSO': float(test_statistical_disparity), + 'test_DEO': float(test_unequal_opportunity), + 'test_DFP': float(test_unequalized_odds), + 'fidelity': budget + } + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: Union[bool, None] = False, + **kwargs) -> Dict: + """ + Objective function for the multi-objective adult benchmark. + + We train a NN and evaluate its performance using fairness metrics. + This function returns the performance on the test set. + + Parameters + ---------- + configuration: Dict, CS.Configuration + Configuration for the MLP model. + Use default configuration if None. + fidelity: Dict, CS.Configuration, None + epoch: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. 
+ kwargs + + Returns + ------- + Dict - + function_value : Dict - test metrics reported after training on (train+valid) + accuracy: float + DSO: float + DEO: float + DFP: float + cost : float - time to train the network. see `training_cost` + info : Dict + train_accuracy : float + test_accuracy : float + training_cost : float + total_cost : float - elapsed time for the entire obj_func_test call, + eval_train_cost : float - time to compute metrics on training split + eval_test_cost : float - time to compute metrics on test split + train_DSO : float + train_DEO : float + train_DFP : float + test_DSO : float + test_DEO : float + test_DFP : float + fidelity : int + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self._shuffle_data(self.rng, shuffle_valid=True) + + ts_start = time.time() + + budget = fidelity['budget'] + + X_train, X_valid, X_test = self.X_train.copy(), self.X_valid.copy(), self.X_test.copy() + X_train = np.vstack((X_train, X_valid)) + y_train = np.vstack((self.y_train[:, np.newaxis], self.y_valid[:, np.newaxis])).ravel() + + sensitive_rows_train = X_train[:, self.feature_names.index(self.sensitive_feature)] + sensitive_rows_test = X_test[:, self.feature_names.index(self.sensitive_feature)] + + # Normalize data + scaler = get_fitted_scaler(X_train, "Standard") + if scaler is not None: + X_train = scaler(X_train) + X_test = scaler(X_test) + + # Create model. The parameters fc_layer_1-3 might not be included in the search space. + hidden = [configuration['fc_layer_0'], + configuration.get('fc_layer_1', None), + configuration.get('fc_layer_2', None), + configuration.get('fc_layer_3', None)][:configuration['n_fc_layers']] + + for item in ['fc_layer_0', 'fc_layer_1', 'fc_layer_2', 'fc_layer_3', 'n_fc_layers']: + if item in configuration: + configuration.pop(item) + + # We deviate here from the original implementation. They have called `budget`-times mlp.partial_fit(). + # We call `.fit()` due to efficiency aspects. 
+ mlp = MLPClassifier(**configuration, hidden_layer_sizes=hidden, shuffle=shuffle, + random_state=rng, max_iter=budget) + mlp.fit(X_train, y_train) + training_cost = time.time() - ts_start + + train_accuracy, train_statistical_disparity, train_unequal_opportunity, train_unequalized_odds, \ + eval_train_runtime = \ + AdultBenchmark._compute_metrics_on_split(X_train, y_train, sensitive_rows_train, mlp) + + test_accuracy, test_statistical_disparity, test_unequal_opportunity, test_unequalized_odds, eval_test_runtime =\ + AdultBenchmark._compute_metrics_on_split(X_test, self.y_test, sensitive_rows_test, mlp) + + elapsed_time = time.time() - ts_start + + logger.debug(f"config:{configuration}, test_score: {test_accuracy}, train score:{train_accuracy}," + f"dsp:{test_statistical_disparity}, deo :{test_unequal_opportunity}, dfp :{test_unequalized_odds}") + + return {'function_value': {'accuracy': float(test_accuracy), + 'DSO': float(test_statistical_disparity), + 'DEO': float(test_unequal_opportunity), + 'DFP': float(test_unequalized_odds) + }, + 'cost': training_cost, + 'info': {'train_accuracy': float(train_accuracy), + 'test_accuracy': float(test_accuracy), + 'training_cost': training_cost, + 'total_cost': elapsed_time, + 'eval_train_cost': eval_train_runtime, + 'eval_test_cost': eval_test_runtime, + 'train_DSO': float(train_statistical_disparity), + 'train_DEO': float(train_unequal_opportunity), + 'train_DFP': float(train_unequalized_odds), + 'test_DSO': float(test_statistical_disparity), + 'test_DEO': float(test_unequal_opportunity), + 'test_DFP': float(test_unequalized_odds), + 'fidelity': budget + } + } + + @staticmethod + def _compute_metrics_on_split( + x_split: np.ndarray, y_split: np.ndarray, sensitive_rows: Any, mlp: Any + ) -> Tuple: + + start = time.time() + _y_pred = mlp.predict(x_split) + accuracy = accuracy_score(y_split, _y_pred) + statistical_disparity = fairness_risk(x_split, y_split, sensitive_rows, mlp, STATISTICAL_DISPARITY) + unequal_opportunity = fairness_risk(x_split, y_split, sensitive_rows, mlp, UNEQUAL_OPPORTUNITY) + unequalized_odds = fairness_risk(x_split, y_split, sensitive_rows, mlp, UNEQUALIZED_ODDS) + runtime = time.time() - start + return accuracy, statistical_disparity, unequal_opportunity, unequalized_odds, runtime + + def _shuffle_data(self, rng=None, shuffle_valid=False) -> None: + """ + Reshuffle the training data. + + Parameters + ---------- + rng + If 'rng' is None, the training idx are shuffled according to the class-random-state + shuffle_valid: bool, None + If true, shuffle the validation data. Defaults to False. + """ + random_state = rng_helper.get_rng(rng, self.rng) + + train_idx = np.arange(len(self.X_train)) + random_state.shuffle(train_idx) + self.X_train = self.X_train[train_idx] + self.y_train = self.y_train[train_idx] + + if shuffle_valid: + valid_idx = np.arange(len(self.X_valid)) + random_state.shuffle(valid_idx) + self.X_valid = self.X_valid[valid_idx] + self.y_valid = self.y_valid[valid_idx] + + +__all__ = ['AdultBenchmark'] diff --git a/hpobench/benchmarks/mo/cnn_benchmark.py b/hpobench/benchmarks/mo/cnn_benchmark.py new file mode 100644 index 00000000..d8bfd939 --- /dev/null +++ b/hpobench/benchmarks/mo/cnn_benchmark.py @@ -0,0 +1,575 @@ +""" +Changelog: +========== + +0.0.1: +* First implementation of the Multi-Objective CNN Benchmark. 
+""" +import logging +import random +import time +from typing import Union, Dict, List, Tuple, Any + +import ConfigSpace as CS +import numpy as np +import torch +import torch.nn as nn +import tqdm +from ConfigSpace.conditions import GreaterThanCondition +from torch.utils.data import TensorDataset, DataLoader + +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark +from hpobench.util.data_manager import CNNDataManager + +__version__ = '0.0.1' + +logger = logging.getLogger('MO_CNN') + + +class AccuracyTop1: + + def __init__(self): + self.reset() + + self.sum = 0 + self.cnt = 0 + + def reset(self): + self.sum = 0 + self.cnt = 0 + + def __call__(self, y_true: torch.Tensor, y_pred: torch.Tensor) -> float: + self.sum += y_pred.topk(1)[1].eq(y_true.argmax(-1).reshape(-1, 1).expand(-1, 1)).float().sum().to('cpu').numpy() + self.cnt += y_pred.size(0) + return self.sum / self.cnt + + +class Net(nn.Module): + """ + The model to optimize + """ + + def __init__(self, config: Dict, input_shape: Tuple = (3, 28, 28), + num_classes: Union[int, None] = 10): + super(Net, self).__init__() + inp_ch = input_shape[0] + layers = [] + for i in range(config['n_conv_layers']): + out_ch = config['conv_layer_{}'.format(i)] + ks = config['kernel_size'] + layers.append(nn.Conv2d(inp_ch, out_ch, kernel_size=ks, padding=(ks - 1) // 2)) + layers.append(nn.ReLU()) + if config['batch_norm']: + layers.append(nn.BatchNorm2d(out_ch)) + layers.append(nn.MaxPool2d(kernel_size=2, stride=2)) + inp_ch = out_ch + + self.conv_layers = nn.Sequential(*layers) + self.pooling = nn.AdaptiveAvgPool2d(1) if config['global_avg_pooling'] else nn.Identity() + self.output_size = num_classes + + self.fc_layers = nn.ModuleList() + + inp_n = self._get_conv_output(input_shape) + + layers = [nn.Flatten()] + for i in range(config['n_fc_layers']): + out_n = config['fc_layer_{}'.format(i)] + + layers.append(nn.Linear(inp_n, out_n)) + layers.append(nn.ReLU()) + + inp_n = out_n + + layers.append(nn.Linear(inp_n, num_classes)) + self.fc_layers = nn.Sequential(*layers) + + # generate input sample and forward to get shape + def _get_conv_output(self, shape: Tuple) -> int: + bs = 1 + input = torch.autograd.Variable(torch.rand(bs, *shape)) + output_feat = self.conv_layers(input) + output_feat = self.pooling(output_feat) + n_size = output_feat.data.view(bs, -1).size(1) + return n_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv_layers(x) + x = self.pooling(x) + x = self.fc_layers(x) + return x + + def train_fn(self, optimizer: torch.optim.Optimizer, criterion: Any, loader: DataLoader, device: torch.device): + """ + Training method + + Parameters + ---------- + optimizer + optimization algorithm + criterion + loss function + loader + data loader for either training or testing set + device + Either CPU or GPU + Returns + ------- + accuracy on the data + """ + accuracy = AccuracyTop1() + self.train() + + acc = 0 + for images, labels in loader: + images = images.to(device) + labels = labels.to(device) + + optimizer.zero_grad() + logits = self(images) + + loss = criterion(logits, labels.argmax(-1)) + loss.backward() + optimizer.step() + + acc = accuracy(labels, logits) + + return acc + + def eval_fn(self, loader: DataLoader, device: torch.device): + """ + Evaluation method + + Parameters + ---------- + loader: + data loader for either training or testing set + device: + torch device + + Returns + ------- + accuracy on the data + """ + accuracy = AccuracyTop1() + self.eval() 
+ + acc = 0 + with torch.no_grad(): # no gradient needed + for images, labels in loader: + images = images.to(device) + labels = labels.to(device) + + outputs = self(images) + acc = accuracy(labels, outputs) + + return acc + + +class CNNBenchmark(AbstractMultiObjectiveBenchmark): + def __init__(self, dataset: str, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Parameters + ---------- + dataset : str + One of fashion, flower. + rng : np.random.RandomState, int, None + Random seed for the benchmark's random state. + """ + + super(CNNBenchmark, self).__init__(rng=rng) + allowed_datasets = ["fashion", "flower"] + assert dataset in allowed_datasets, f'Requested data set is not supported. Must be one of ' \ + f'{", ".join(allowed_datasets)}, but was {dataset}' + logger.info(f'Start Benchmark on dataset {dataset}') + + self.dataset = dataset + self.__seed_everything() + + # Dataset loading + data_manager = CNNDataManager(dataset=self.dataset) + self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test = data_manager.load() + + self.output_classes = self.y_train.shape[1] + self.input_shape = self.X_train.shape[1:4] + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for + the CNN model. + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + cs = CS.ConfigurationSpace(seed=seed) + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter('n_conv_layers', default_value=3, lower=1, upper=3, log=False), + CS.UniformIntegerHyperparameter('n_fc_layers', default_value=3, lower=1, upper=3, log=False), + CS.UniformIntegerHyperparameter('conv_layer_0', default_value=128, lower=16, upper=1024, log=True), + CS.UniformIntegerHyperparameter('conv_layer_1', default_value=128, lower=16, upper=1024, log=True), + CS.UniformIntegerHyperparameter('conv_layer_2', default_value=128, lower=16, upper=1024, log=True), + CS.UniformIntegerHyperparameter('fc_layer_0', default_value=32, lower=2, upper=512, log=True), + CS.UniformIntegerHyperparameter('fc_layer_1', default_value=32, lower=2, upper=512, log=True), + CS.UniformIntegerHyperparameter('fc_layer_2', default_value=32, lower=2, upper=512, log=True), + + CS.UniformIntegerHyperparameter('batch_size', lower=1, upper=512, default_value=128, log=True), + CS.UniformFloatHyperparameter('learning_rate_init', lower=10**-5, upper=1, default_value=10**-3, log=True), + CS.CategoricalHyperparameter('batch_norm', default_value=False, choices=[False, True]), + CS.CategoricalHyperparameter('global_avg_pooling', default_value=True, choices=[False, True]), + CS.CategoricalHyperparameter('kernel_size', default_value=5, choices=[7, 5, 3]) + ]) + + cs.add_conditions([ + # Add the conv_layer_1 (2nd layer) if we allow more than 1 (>1) `n_conv_layers`, and so on... 
+ GreaterThanCondition(cs.get_hyperparameter('conv_layer_1'), cs.get_hyperparameter('n_conv_layers'), 1), + GreaterThanCondition(cs.get_hyperparameter('conv_layer_2'), cs.get_hyperparameter('n_conv_layers'), 2), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_1'), cs.get_hyperparameter('n_fc_layers'), 1), + GreaterThanCondition(cs.get_hyperparameter('fc_layer_2'), cs.get_hyperparameter('n_fc_layers'), 2), + ]) + + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters + + Fidelities + ---------- + budget: int - [1, 25] + Number of epochs to train + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters([ + CS.UniformIntegerHyperparameter('budget', lower=1, upper=25, default_value=25, log=False) + ]) + return fidelity_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return { + 'name': 'Bag of baselines for multi-objective joint neural architecture search and ' + 'hyperparameter optimization', + 'references': ['@article{guerrero2021bag,' + 'title = {Bag of baselines for multi - objective joint neural architecture search and ' + 'hyperparameter optimization},' + 'author = {Guerrero-Viu, Julia and Hauns, Sven and Izquierdo, Sergio and Miotto, ' + 'Guilherme and Schrodi, Simon and Biedenkapp, Andre and Elsken, Thomas and Deng, ' + 'Difan and Lindauer, Marius and Hutter, Frank},},' + 'journal = {arXiv preprint arXiv:2105.01015},' + 'year = {2021}}', + ], + 'code': 'https://github.com/automl/multi-obj-baselines', + } + + @staticmethod + def get_objective_names() -> List[str]: + """Get the names of the objectives reported in the objective function.""" + return ['accuracy', 'model_size'] + + def init_model(self, config: Union[CS.Configuration, Dict]) -> Net: + """ + Function that returns the model initialized based on the configuration and fidelity + """ + if isinstance(config, CS.Configuration): + config = config.get_dictionary() + return Net(config, self.input_shape, self.output_classes) + + def __seed_everything(self): + """Helperfunction: Make the benchmark deterministic by setting the correct seeds""" + seed = self.rng.randint(0, 100000) + logger.debug(f'Generate seed: {seed}') + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + + def _shuffle_data(self, rng=None, shuffle_valid=False) -> None: + """ + Reshuffle the training data. + + Parameters + ---------- + rng + If 'rng' is None, the training idx are shuffled according to the class-random-state + shuffle_valid: bool, None + If true, shuffle the validation data. Defaults to False. 
+ """ + random_state = rng_helper.get_rng(rng, self.rng) + + train_idx = np.arange(len(self.X_train)) + random_state.shuffle(train_idx) + self.X_train = self.X_train[train_idx] + self.y_train = self.y_train[train_idx] + + if shuffle_valid: + valid_idx = np.arange(len(self.X_valid)) + random_state.shuffle(valid_idx) + self.X_valid = self.X_valid[valid_idx] + self.y_valid = self.y_valid[valid_idx] + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Train a CNN on either the flower or the fashion data set and return the performance on the validation + data split. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the CNN Model + fidelity: Dict, CS.Configuration, None + epoch: int - Values: [1, 50] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + kwargs + + Returns + ------- + Dict - + function_value : Dict + negative_accuracy: float + 1 - validation accuracy + log_model_size: float + log10 of the number of parameters + cost : time to train the network + info : Dict + train_accuracy : float, + training_cost : float, + valid_accuracy : float, + valid_cost : float, + test_accuracy : float, + test_cost : float, + model_size : int, + fidelity : Dict + used fidelities in this evaluation + """ + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + self.__seed_everything() + + if shuffle: + self._shuffle_data(rng=self.rng, shuffle_valid=False) + + time_in = time.time() + + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + logger.info(f'We use the device: {device}') + + # initializing model + model = self.init_model(configuration).to(device) + epochs = fidelity['budget'] + + optimizer = torch.optim.Adam(model.parameters(), lr=configuration['learning_rate_init']) + criterion = torch.nn.CrossEntropyLoss() + + ds_train = TensorDataset(self.X_train, self.y_train) + ds_train = DataLoader(ds_train, batch_size=configuration['batch_size'], shuffle=True) + + ds_val = TensorDataset(self.X_valid, self.y_valid) + ds_val = DataLoader(ds_val, batch_size=configuration['batch_size'], shuffle=True) + + ds_test = TensorDataset(self.X_test, self.y_test) + ds_test = DataLoader(ds_test, batch_size=configuration['batch_size'], shuffle=True) + + start = time.time() + t = tqdm.tqdm(total=epochs) + + train_accuracy = 0 + for epoch in range(epochs): + train_accuracy = model.train_fn(optimizer, criterion, ds_train, device).item() + t.set_postfix(train_accuracy=train_accuracy) + t.update() + training_runtime = time.time() - start + + num_params = np.sum([p.numel() for p in model.parameters()]).item() + start = time.time() + val_accuracy = model.eval_fn(ds_val, device).item() + eval_valid_runtime = time.time() - 
start + start = time.time() + test_accuracy = model.eval_fn(ds_test, device).item() + eval_test_runtime = time.time() - start + + t.set_postfix( + train_acc=train_accuracy, + val_acc=val_accuracy, + tst_acc=test_accuracy, + len=np.log10(num_params), + train_runtime=training_runtime, + eval_valid_runtime=eval_valid_runtime, + eval_test_runtime=eval_test_runtime, + ) + t.close() + + elapsed_time = time.time() - time_in + + return {'function_value': {'negative_accuracy': 1 - val_accuracy, + 'log_model_size': float(np.log10(num_params))}, + 'cost': float(training_runtime), + 'info': {'train_accuracy': train_accuracy, + 'training_cost': training_runtime, + 'valid_accuracy': val_accuracy, + 'valid_cost': eval_valid_runtime, + 'test_accuracy': test_accuracy, + 'test_cost': eval_test_runtime, + 'total_time': elapsed_time, + 'model_size': num_params, + 'fidelity': fidelity} + } + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + shuffle: bool = False, + **kwargs) -> Dict: + """ + Train a CNN on both the train adn validation split of either the flower or the fashion data set and + get the test results. + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the CNN Model + fidelity: Dict, CS.Configuration, None + epoch: int - Values: [1, 50] + Number of epochs an architecture was trained. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + shuffle: bool, None + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. 
+ kwargs + + Returns + ------- + Dict - + function_value : Dict + negative_accuracy: float + 1 - test accuracy + log_model_size: float + log10 of the number of parameters + cost : time to train the network + info : Dict + train_accuracy : float, + training_cost : float, + test_accuracy : float, + test_cost : float, + model_size : int, + fidelity : Dict + used fidelities in this evaluation + """ + + time_in = time.time() + + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + self.__seed_everything() + + if shuffle: + self._shuffle_data(rng=self.rng, shuffle_valid=False) + + train_X = torch.vstack((self.X_train, self.X_valid)) + y_train = torch.cat((self.y_train, self.y_valid)) + + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + # initializing model + model = self.init_model(configuration).to(device) + epochs = fidelity['budget'] + + optimizer = torch.optim.Adam(model.parameters(), lr=configuration['learning_rate_init']) + criterion = torch.nn.CrossEntropyLoss() + + ds_train = TensorDataset(train_X, y_train) + ds_train = DataLoader(ds_train, batch_size=configuration['batch_size'], shuffle=True) + + ds_test = TensorDataset(self.X_test, self.y_test) + ds_test = DataLoader(ds_test, batch_size=configuration['batch_size'], shuffle=True) + + start = time.time() + t = tqdm.tqdm(total=epochs) + + train_accuracy = 0 + for epoch in range(epochs): + train_accuracy = model.train_fn(optimizer, criterion, ds_train, device).item() + t.set_postfix(train_accuracy=train_accuracy) + t.update() + training_runtime = time.time() - start + + num_params = np.sum([p.numel() for p in model.parameters()]) + start = time.time() + test_accuracy = model.eval_fn(ds_test, device).item() + eval_test_runtime = time.time() - start + + t.set_postfix( + train_acc=train_accuracy, + tst_acc=test_accuracy, + len=np.log10(num_params), + eval_train_runtime=training_runtime, + eval_test_runtime=eval_test_runtime, + + ) + t.close() + + elapsed_time = time.time() - time_in + + return {'function_value': {'negative_accuracy': 1 - test_accuracy, + 'log_model_size': float(np.log10(num_params))}, + 'cost': training_runtime, + 'info': {'train_accuracy': train_accuracy, + 'training_cost': training_runtime, + 'test_accuracy': test_accuracy, + 'test_cost': eval_test_runtime, + 'total_time': elapsed_time, + 'model_size': num_params, + 'fidelity': fidelity} + } + + +class FashionCNNBenchmark(CNNBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(FashionCNNBenchmark, self).__init__(dataset='fashion', rng=rng, **kwargs) + + +class FlowerCNNBenchmark(CNNBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(FlowerCNNBenchmark, self).__init__(dataset='flower', rng=rng, **kwargs) + + +__all__ = ["FashionCNNBenchmark", + "FlowerCNNBenchmark"] diff --git a/hpobench/benchmarks/mo/lm_benchmark.py b/hpobench/benchmarks/mo/lm_benchmark.py new file mode 100644 index 00000000..fd016d01 --- /dev/null +++ b/hpobench/benchmarks/mo/lm_benchmark.py @@ -0,0 +1,383 @@ +""" +Changelog: +========== + +0.0.1: +* First implementation of the Multi-Objective Language Model Benchmark. 
+""" +from typing import Union, Dict, List +import ConfigSpace as CS +import numpy as np +import torch +import torch.nn as nn +import logging +import hpobench.util.rng_helper as rng_helper +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark +from hpobench.util.data_manager import LanguageModelDataManager +from hpobench.dependencies.lm.tokenize_util import batchify +from hpobench.dependencies.lm.model import TransformerModel +import time +import math +import tqdm +import random + +__version__ = '0.0.1' + +logger = logging.getLogger('LM_Bench') + + +class LanguageModelBenchmark(AbstractMultiObjectiveBenchmark): + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Tranformer based multi-objective language model benchmark + + Parameters + ---------- + rng : np.random.RandomState, int, None + Random seed for the benchmarks + + Transformer Model is based on : "https://arxiv.org/pdf/1706.03762.pdf" + """ + super(LanguageModelBenchmark, self).__init__(rng=rng) + + self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + data_manager = LanguageModelDataManager(self.device) + self.X_train, self.X_valid, self.X_test = data_manager.load() + self.ntokens = len(data_manager.corpus.dictionary) + self.__seed_everything() + self.variable = {"eval_batch_size": 10, + "nlayers": 2, + "bptt": 35, + "tied": True, + # Number of attention head + "nhead": 2, + "ntoken": self.ntokens + } + + def __seed_everything(self): + """Helperfunction: Make the benchmark deterministic by setting the correct seeds""" + seed = self.rng.randint(0, 100000) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """Parameter space to be optimized --- contains the hyperparameters + """ + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformIntegerHyperparameter( + 'batch_size', default_value=128, lower=8, upper=256 + ), + CS.UniformIntegerHyperparameter( + 'emsize', default_value=128, lower=32, upper=1024, log=True + ), + CS.UniformIntegerHyperparameter( + 'lr_factor', default_value=50, lower=1, upper=100, log=True + ), + CS.UniformFloatHyperparameter( + 'lr', default_value=5, lower=1, upper=50, log=True + ), + CS.UniformFloatHyperparameter( + 'dropout', default_value=0.99, lower=0, upper=0.99 + ), + CS.UniformFloatHyperparameter( + 'clip', default_value=0.99, lower=0.1, upper=2 + ) + + ]) + return cs + + @staticmethod + def get_objective_names() -> List[str]: + return ['log_perplexity', 'accuracy', 'time'] + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters + + Fidelities: + - epoch: int + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + + fidelity_space = CS.ConfigurationSpace(seed=seed) + fidelity_space.add_hyperparameters([ + CS.UniformIntegerHyperparameter( + 'budget', lower=1, upper=81, default_value=81, log=False + ) + ]) + return fidelity_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return { + 'name': 'Multi-objective Asynchronous Successive Halving', + 'references': 
['@article{schmucker2021multi,'
+                 'title={Multi-objective Asynchronous Successive Halving},'
+                 'author={Schmucker, Robin and Donini, Michele and Zafar, Muhammad Bilal and Salinas,'
+                 ' David and Archambeau, C{\'e}dric},'
+                 'journal={arXiv preprint arXiv:2106.12639},'
+                 'year={2021}',
+                 ],
+        }
+
+    def init_model(self, config: Union[CS.Configuration, Dict]):
+        """ Function that returns the model initialized based on the configuration and fidelity
+        """
+        model = TransformerModel(
+            self.variable['ntoken'], config['emsize'], self.variable['nhead'], config['emsize'],
+            self.variable['nlayers'], config['dropout'])
+
+        return model
+
+    @AbstractMultiObjectiveBenchmark.check_parameters
+    def objective_function(self, configuration: Union[CS.Configuration, Dict],
+                           fidelity: Union[Dict, CS.Configuration, None] = None,
+                           rng: Union[np.random.RandomState, int, None] = None,
+                           **kwargs) -> Dict:
+        """
+
+        Parameters
+        ----------
+        configuration
+        fidelity: Dict, None
+            budget: int - Values: [1, 81]
+            Number of epochs an architecture was trained.
+            Note: the number of epochs is 1-indexed! (Results after the first epoch: budget = 1)
+
+            Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None.
+        rng : np.random.RandomState, int, None
+            Random seed to use in the benchmark.
+
+            To prevent overfitting on a single seed, it is possible to pass a
+            parameter ``rng`` as 'int' or 'np.random.RandomState' to this function.
+            If this parameter is not given, the default random state is used.
+
+        kwargs
+
+        Returns
+        -------
+        Dict -
+            function_value : Dict
+                log_perplexity : float
+                accuracy : float - 1 minus the validation accuracy
+                time : float - training plus validation time
+            cost : elapsed time for the entire objective function call
+            info : Dict
+                train_accuracy : float,
+                validation_accuracy : float,
+                test_accuracy : float,
+                log_perplexity : float,
+                perplexity : float,
+                negative_log_perplexity : float,
+                training_cost : float,
+                valid_cost : float,
+                test_cost : float,
+                fidelity : Dict
+                    used fidelities in this evaluation
+        """
+
+        self.rng = rng_helper.get_rng(self.rng, rng)
+        self.__seed_everything()
+
+        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+        ts_start = time.time()
+
+        # batchify data
+        batch_size = configuration['batch_size']
+        train_data = batchify(self.X_train, batch_size=batch_size).to(device)
+        val_data = batchify(self.X_valid, batch_size=self.variable["eval_batch_size"]).to(device)
+        test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(device)
+
+        epochs = fidelity['budget']
+
+        model = self.init_model(configuration).to(device)
+
+        criterion = nn.CrossEntropyLoss()
+
+        learning_rate = configuration['lr']
+        learning_rate_factor = configuration['lr_factor']
+        clip = configuration['clip']
+        best_val_loss = None
+        train_time = 0
+        eval_time = 0
+
+        t = tqdm.tqdm(total=epochs)
+        for epoch in range(epochs):
+            epoch_start_time = time.time()
+            train_loss, train_acc = model.train_fun(self.ntokens, criterion, train_data, learning_rate, clip)
+            train_time += time.time() - epoch_start_time
+            start = time.time()
+            val_loss, val_acc = model.eval_fun(self.ntokens, criterion, val_data)
+            val_loss = np.clip(val_loss, 1e-10, 10)
+            eval_time += time.time() - start
+
+            t.set_postfix(val_accuracy=val_acc)
+            t.update()
+
+            # Taken from original experimental setup
+            if not np.isfinite(val_loss):
+                val_loss = 7
+
+            # Save the model if the validation loss is the best we've seen so far.
+            if not best_val_loss or val_loss < best_val_loss:
+                best_val_loss = val_loss
+            else:
+                # Anneal the learning rate if no improvement has been seen in the validation dataset.
+                learning_rate /= learning_rate_factor
+
+        start_time = time.time()
+        _, test_acc = model.eval_fun(self.ntokens, criterion, test_data)
+        eval_test_runtime = time.time() - start_time
+
+        perplexity = math.exp(best_val_loss)
+        log_perplexity = best_val_loss
+        neg_log_perplexity = 10 - best_val_loss
+        elapsed_time = time.time() - ts_start
+
+        return {'function_value': {'log_perplexity': log_perplexity,
+                                   'accuracy': 1 - val_acc.item(),
+                                   'time': train_time + eval_time
+                                   },
+                'cost': elapsed_time,
+                'info': {'train_accuracy': train_acc.item(),
+                         'validation_accuracy': val_acc.item(),
+                         'test_accuracy': test_acc.item(),
+                         'log_perplexity': log_perplexity,
+                         'perplexity': perplexity,
+                         'negative_log_perplexity': neg_log_perplexity,
+                         'training_cost': train_time,
+                         'valid_cost': eval_time,
+                         'test_cost': eval_test_runtime,
+                         'fidelity': fidelity
+                         }
+                }
+
+    @AbstractMultiObjectiveBenchmark.check_parameters
+    def objective_function_test(self, configuration: Union[CS.Configuration, Dict],
+                                fidelity: Union[Dict, None] = None,
+                                rng: Union[np.random.RandomState, int, None] = None,
+                                **kwargs) -> Dict:
+        """
+        Get the test results. Runs a given configuration on the largest budget (here: 81).
+
+        Parameters
+        ----------
+        configuration
+        fidelity: Dict, None
+            budget: int - Value: 81
+            Number of epochs an architecture was trained.
+            Note: Only results for the maximum budget (81 epochs) are available.
+
+            Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None.
+        rng : np.random.RandomState, int, None
+            Random seed to use in the benchmark.
+
+            To prevent overfitting on a single seed, it is possible to pass a
+            parameter ``rng`` as 'int' or 'np.random.RandomState' to this function.
+            If this parameter is not given, the default random state is used.
+
+        kwargs
+
+        Returns
+        -------
+        Dict -
+            function_value : Dict
+                log_perplexity : float
+                accuracy : float - 1 minus the test accuracy
+                time : float - training plus evaluation time
+            cost : elapsed time for the entire objective function call
+            info : Dict
+                train_accuracy : float,
+                test_accuracy : float,
+                log_perplexity : float,
+                perplexity : float,
+                negative_log_perplexity : float,
+                training_cost : float,
+                test_cost : float,
+                fidelity : Dict
+                    used fidelities in this evaluation
+        """
+
+        assert fidelity['budget'] == 81, 'Only test data for the maximum budget of 81 epochs is available.'
+        ts_start = time.time()
+
+        self.rng = rng_helper.get_rng(self.rng, rng)
+        self.__seed_everything()
+
+        # batchify data
+        batch_size = configuration['batch_size']
+        train_data = batchify(self.X_train, batch_size=batch_size)
+        val_data = batchify(self.X_valid, batch_size=batch_size)
+        train_data = np.vstack((train_data, val_data))
+        train_data = torch.tensor(train_data).to(self.device)
+        test_data = batchify(self.X_test, batch_size=self.variable["eval_batch_size"]).to(self.device)
+
+        epochs = fidelity['budget']
+
+        model = self.init_model(configuration).to(self.device)
+
+        criterion = nn.CrossEntropyLoss()
+
+        learning_rate = configuration['lr']
+        learning_rate_factor = configuration['lr_factor']
+        clip = configuration['clip']
+        best_test_loss = None
+        train_time = 0
+        eval_time = 0
+        t = tqdm.tqdm(total=epochs)
+        for epoch in range(1, epochs + 1):
+            epoch_start_time = time.time()
+            train_loss, train_acc = model.train_fun(self.ntokens, criterion, train_data, learning_rate,
+                                                    clip)
+            train_time += time.time() - epoch_start_time
+            start = time.time()
+
+            test_loss, test_acc = model.eval_fun(self.ntokens, criterion, test_data)
+            test_loss = np.clip(test_loss, 1e-10, 10)
+            eval_time += time.time() - start
+
+            t.set_postfix(test_accuracy=test_acc)
+            t.update()
+            if not np.isfinite(test_loss):
+                test_loss = 7
+
+            # Save the model if the test loss is the best we've seen so far.
+            if not best_test_loss or test_loss < best_test_loss:
+                best_test_loss = test_loss
+            else:
+                # Anneal the learning rate if no improvement has been seen on the test data.
+                learning_rate /= learning_rate_factor
+
+        perplexity = math.exp(best_test_loss)
+        log_perplexity = best_test_loss
+        neg_log_perplexity = 10 - best_test_loss
+        elapsed_time = time.time() - ts_start
+
+        return {'function_value': {'log_perplexity': log_perplexity,
+                                   'accuracy': 1 - test_acc.item(),
+                                   'time': train_time + eval_time
+                                   },
+                'cost': elapsed_time,
+                'info': {'train_accuracy': train_acc.item(),
+                         'test_accuracy': test_acc.item(),
+                         'log_perplexity': log_perplexity,
+                         'perplexity': perplexity,
+                         'negative_log_perplexity': neg_log_perplexity,
+                         'training_cost': train_time,
+                         'test_cost': eval_time,
+                         'fidelity': fidelity
+                         }
+                }
+
+
+__all__ = ["LanguageModelBenchmark"]
diff --git a/hpobench/benchmarks/nas/nasbench_201.py b/hpobench/benchmarks/nas/nasbench_201.py
index 17bac321..0c2324c2 100644
--- a/hpobench/benchmarks/nas/nasbench_201.py
+++ b/hpobench/benchmarks/nas/nasbench_201.py
@@ -27,6 +27,10 @@
 Changelog:
 ==========
+0.0.6
+* Add the multi-objective version of this benchmark by returning flops, model size, latency and misclassification rate
+* Integrate #138: Improve the docstrings about the seeds.
+
 0.0.5
 * Add for each benchmark a new one with a different fidelity space. The new fidelity space corresponds to the fidelity
   space in the DEHB paper.
@@ -54,16 +58,18 @@ import numpy as np import hpobench.util.rng_helper as rng_helper -from hpobench.abstract_benchmark import AbstractBenchmark +from hpobench.abstract_benchmark import AbstractBenchmark, AbstractMultiObjectiveBenchmark + from hpobench.util.data_manager import NASBench_201Data -__version__ = '0.0.5' + +__version__ = '0.0.6' MAX_NODES = 4 logger = logging.getLogger('NASBENCH201') -class NasBench201BaseBenchmark(AbstractBenchmark): +class NasBench201BaseMOBenchmark(AbstractMultiObjectiveBenchmark): def __init__(self, dataset: str, rng: Union[np.random.RandomState, int, None] = None, **kwargs): """ @@ -129,6 +135,8 @@ def __init__(self, dataset: str, - In the original data, the training splits are always marked with the key 'train' but they use different identifiers to refer to the available evaluation splits. We report them also in the table below. - We exclude the data set cifar10 from this benchmark. + - In NasBench201, not all architectures have values for the three seeds. To increase robustness, we have patched + missing values with the values from an available seed. Some further remarks: - cifar10-valid is trained on the train split and tested on the validation split. @@ -145,13 +153,13 @@ def __init__(self, dataset: str, Random seed for the benchmark's random state. """ # noqa: E501 - super(NasBench201BaseBenchmark, self).__init__(rng=rng) + super(NasBench201BaseMOBenchmark, self).__init__(rng=rng) data_manager = NASBench_201Data(dataset=dataset) self.dataset = dataset self.data = data_manager.load() - self.config_to_structure = NasBench201BaseBenchmark.config_to_structure_func(max_nodes=MAX_NODES) + self.config_to_structure = NasBench201BaseMOBenchmark.config_to_structure_func(max_nodes=MAX_NODES) def dataset_mapping(self, dataset): mapping = {'cifar10-valid': ('x-valid', 'ori-test'), @@ -160,7 +168,7 @@ def dataset_mapping(self, dataset): return mapping[dataset] # pylint: disable=arguments-differ - @AbstractBenchmark.check_parameters + @AbstractMultiObjectiveBenchmark.check_parameters def objective_function(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, CS.Configuration, None] = None, rng: Union[np.random.RandomState, int, None] = None, @@ -205,7 +213,15 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : training precision + function_value : Dict + misclassification_rate : float + 1 - validation accuracy + num_flops : float + Number of floating point operations in M + model_size : float + Model size in MB + latency : float + Time to evaluate a configuration in seconds cost : time to train the network info : Dict train_precision : float @@ -264,22 +280,38 @@ def objective_function(self, configuration: Union[CS.Configuration, Dict], test_times = [np.sum((self.data[seed][structure_str]['eval_times'][f'{test_key}@{199}']) for e in range(1, epoch + 1)) for seed in data_seed] - return {'function_value': float(100 - np.mean(valid_accuracies)), - 'cost': float(np.sum(valid_times) + np.sum(train_times)), - 'info': {'train_precision': float(100 - np.mean(train_accuracies)), - 'train_losses': float(np.mean(train_losses)), - 'train_cost': float(np.sum(train_times)), - 'valid_precision': float(100 - np.mean(valid_accuracies)), - 'valid_losses': float(np.mean(valid_losses)), - 'valid_cost': float(np.sum(valid_times) + np.sum(train_times)), - 'test_precision': float(100 - np.mean(test_accuracies)), - 'test_losses': float(np.mean(test_losses)), - 'test_cost': float(np.sum(train_times)) + 
float(np.sum(test_times)), - 'fidelity': fidelity - } - } - - @AbstractBenchmark.check_parameters + # Number of floating point operations in million + num_flops = [self.data[seed][structure_str]['flop'] for seed in data_seed] + + # Number of trainable model parameters in MB + model_size = [self.data[seed][structure_str]['params'] for seed in data_seed] + + # Time to evaluate in seconds + latency = [self.data[seed][structure_str]['latency'] for seed in data_seed] + + return { + 'function_value': { + 'misclassification_rate': float(100 - np.mean(valid_accuracies)), + 'num_flops': float(np.mean(num_flops)), + 'model_size': float(np.mean(model_size)), + 'latency': float(np.mean(latency)), + }, + 'cost': float(np.sum(valid_times) + np.sum(train_times)), + 'info': { + 'train_precision': float(100 - np.mean(train_accuracies)), + 'train_losses': float(np.mean(train_losses)), + 'train_cost': float(np.sum(train_times)), + 'valid_precision': float(100 - np.mean(valid_accuracies)), + 'valid_losses': float(np.mean(valid_losses)), + 'valid_cost': float(np.sum(valid_times) + np.sum(train_times)), + 'test_precision': float(100 - np.mean(test_accuracies)), + 'test_losses': float(np.mean(test_losses)), + 'test_cost': float(np.sum(train_times)) + float(np.sum(test_times)), + 'fidelity': fidelity + } + } + + @AbstractMultiObjectiveBenchmark.check_parameters def objective_function_test(self, configuration: Union[CS.Configuration, Dict], fidelity: Union[Dict, None] = None, rng: Union[np.random.RandomState, int, None] = None, @@ -294,10 +326,9 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], ---------- configuration fidelity: Dict, None - epoch: int - Values: [1, 200] + epoch: int - Values: [200] Number of epochs an architecture was trained. - Note: the number of epoch is 1 indexed. (Results after the first epoch: epoch = 1) - + Note: We only have test performance on the last epoch. Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. rng : np.random.RandomState, int, None Random seed to use in the benchmark. @@ -311,7 +342,15 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], Returns ------- Dict - - function_value : evaluation precision + function_value : Dict + misclassification_rate : float + 1 - test accuracy + num_flops : float + Number of floating point operations in M + model_size : float + Model size in MB + latency : float + Time to evaluate a configuration in seconds cost : time to the network + time to validate info : Dict train_precision @@ -327,10 +366,19 @@ def objective_function_test(self, configuration: Union[CS.Configuration, Dict], # to test and the corresponding time cost assert fidelity['epoch'] == 200, 'Only test data for the 200. epoch is available. ' + if 'data_seed' in kwargs: + all_seeds_available = all([seed in kwargs['data_seed'] for seed in (777, 888, 999)]) + if not all_seeds_available: + logger.warning('You have not specified all available seeds for the ' + '`objective_function_test`. However, we are going to ignore them, ' + ' because we report test values only as mean across all seeds.' 
+ f' Your given seeds: {kwargs["seed"]}') + del kwargs['data_seed'] + result = self.objective_function(configuration=configuration, fidelity=fidelity, data_seed=(777, 888, 999), rng=rng, **kwargs) - result['function_value'] = result['info']['test_precision'] + result['function_value']['misclassification_rate'] = result['info']['test_precision'] result['cost'] = result['info']['test_cost'] return result @@ -349,7 +397,7 @@ def config_to_structure(config): op_name = config[node_str] x_list.append((op_name, j)) genotypes.append(tuple(x_list)) - return NasBench201BaseBenchmark._Structure(genotypes) + return NasBench201BaseMOBenchmark._Structure(genotypes) return config_to_structure @staticmethod @@ -387,7 +435,7 @@ def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSp seed = seed if seed is not None else np.random.randint(1, 100000) cs = CS.ConfigurationSpace(seed=seed) - search_space = NasBench201BaseBenchmark.get_search_spaces('cell', 'nas-bench-201') + search_space = NasBench201BaseMOBenchmark.get_search_spaces('cell', 'nas-bench-201') hps = [CS.CategoricalHyperparameter(f'{i}<-{j}', search_space) for i in range(1, MAX_NODES) for j in range(i)] cs.add_hyperparameters(hps) return cs @@ -420,6 +468,10 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: return fidel_space + @staticmethod + def get_objective_names() -> List[str]: + return ['misclassification_rate', 'num_flops', 'model_size', 'latency'] + @staticmethod def get_meta_information() -> Dict: """ Returns the meta information for the benchmark """ @@ -471,25 +523,296 @@ def __getitem__(self, index): return self.nodes[index] -class Cifar10ValidNasBench201Benchmark(NasBench201BaseBenchmark): +class Cifar10ValidNasBench201MOBenchmark(NasBench201BaseMOBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(Cifar10ValidNasBench201MOBenchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) + + +class Cifar100NasBench201MOBenchmark(NasBench201BaseMOBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(Cifar100NasBench201MOBenchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) + + +class ImageNetNasBench201MOBenchmark(NasBench201BaseMOBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(ImageNetNasBench201MOBenchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) + + +class NasBench201SOBenchmark(AbstractBenchmark): + def __init__(self, dataset: str, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Benchmark interface to the NASBench201 Benchmarks. The NASBench201 contains + results for architectures on 4 different data sets. + + We have split the "api" file from NASBench201 in separate files per data set. + The original "api" file contains all data sets, but loading this single file took too much RAM. + + We recommend to not call this base class directly but using the correct subclass below. + + The parameter ``dataset`` indicates which data set was used for training. + + For each data set the metrics + 'train_acc1es', 'train_losses', 'train_times', 'eval_acc1es', 'eval_times', 'eval_losses' are available. + However, the data sets report them on different data splits (train, train + valid, test, valid or test+valid). + + We summarize all information about the data sets in the following tables. 
+ + Datastet Metric Avail.Epochs Explanation returned by HPOBENCH + ---------------------------------------------------------------------------------------- + cifar10-valid train [0-199] training set + cifar10-valid x-valid [0-199] validation set objective function + cifar10-valid x-test + cifar10-valid ori-test 199 test set objective function test + + cifar100 train [0-199] training set + cifar100 x-valid 199 validation set + cifar100 x-test 199 test set objective function test + cifar100 ori-test [0-199] validation + test set objective function + + ImageNet16-120 train [0-199] training set + ImageNet16-120 x-valid 199 validation set + ImageNet16-120 x-test 199 test set objective function test + ImageNet16-120 ori-test [0-199] validation + test set objective function + + + We have also extracted the incumbents per split. We report the incumbent accuracy and loss performance + i) by taking the maximum value across all seeds and configurations + ii) averaged across the three available seeds + + i) The best possible incumbents (NO AVG!) ii) The "average" incumbent + Datastet Metric (Index of Arch, Accuracy) (Index, Loss) (Index of Arch, Accuracy) (Index, Loss) + ---------------------------------------------------------------------------------------------------------------------------------------------------------- + cifar10-valid train (258, 100.0) (2778, 0.001179278278425336) (10154, 100) (2778, 0.0013082386429297428) + cifar10-valid x-valid (6111, 91.71999999023437) (14443, 0.3837750501537323) (6111, 91.60666665039064) (3888, 0.3894046771335602) + cifar10-valid x-test + cifar10-valid ori-test (14174, 91.65) (3385, 0.3850496160507202) (1459, 91.52333333333333) (3385, 0.3995230517864227) + + cifar100 train (9930, 99.948) (9930, 0.012630240231156348) (9930, 99.93733333333334) (9930, 0.012843489621082942) + cifar100 x-valid (13714, 73.71999998779297) (13934, 1.1490126512527465) (9930, 73.4933333577474) (7361, 1.1600867895126343) + cifar100 x-test (1459, 74.28000004882813) (15383, 1.1427113876342774) (9930, 73.51333332112631) (7337, 1.1747569534301758) + cifar100 ori-test (9930, 73.88) (13706, 1.1610547459602356) (9930, 73.50333333333333) (7361, 1.1696554500579834) + + ImageNet16-120 train (9930, 73.2524719841793) (9930, 0.9490517352046979) (9930, 73.22918040138735) (9930, 0.9524298415108582) + ImageNet16-120 x-valid (13778, 47.39999985758463) (10721, 2.0826991437276203) (10676, 46.73333327229818) (10721, 2.0915397168795264) + ImageNet16-120 x-test (857, 48.03333317057292) (12887, 2.0940088628133138) (857, 47.31111100599501) (11882, 2.106453532218933) + ImageNet16-120 ori-test (857, 47.083333353678384) (11882, 2.0950548852284747) (857, 46.8444444647895) (11882, 2.1028235816955565) + + + Note: + - The parameter epoch is 0 indexed! + - In the original data, the training splits are always marked with the key 'train' but they use different + identifiers to refer to the available evaluation splits. We report them also in the table below. + - We exclude the data set cifar10 from this benchmark. + - In NasBench201, not all architectures have values for the three seeds. To increase robustness, we have patched + missing values with the values from an available seed. + + Some further remarks: + - cifar10-valid is trained on the train split and tested on the validation split. + - The train metrics are dictionaries with epochs (e.g. 0, 1, 2) as key and the metric as value. + The evaluation metrics, however, have as key the identifiers, e.g. ori-test@0, with 0 indicating the epoch. 
+ Also, each data set reports values for all 200 epochs for a metric on the specified split + and a single value on the 200th epoch for the other splits. + + Parameters + ---------- + dataset : str + One of cifar10-valid, cifar10, cifar100, ImageNet16-120. + rng : np.random.RandomState, int, None + Random seed for the benchmark's random state. + """ # noqa: E501 + + super(NasBench201SOBenchmark, self).__init__(rng=rng, **kwargs) + self.mo_benchmark = NasBench201BaseMOBenchmark(rng=rng, dataset=dataset, **kwargs) + + # pylint: disable=arguments-differ + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + data_seed: Union[List, Tuple, int, None] = (777, 888, 999), + **kwargs) -> Dict: + """ + Objective function for the NASBench201 benchmark. + This functions sends a query to NASBench201 and evaluates the configuration. + As already explained in the class definition, different data sets are trained on different splits. + + The table above gives a detailed summary over the available splits, epochs, and which identifier are used per + dataset. + + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed! (Results after the first epoch: epoch = 1) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + data_seed : List, Tuple, None, int + The nasbench_201 benchmark include for each run 3 different seeds: 777, 888, 999. + The user can specify which seed to use. If more than one seed is given, the results are averaged + across the seeds but then the training time is the sum of the costs per seed. + When this value is explicitly set to `None`, the function will chose randomly one out of [777, 888, 999]. + + Note: + For some architectures (configurations) no run was available. We've set missing values to an + available value from another seed. Therefore, it is possible that run results are exactly the same for + different seeds. + + kwargs + + Returns + ------- + Dict - + function_value : training precision + cost : time to train the network + info : Dict + train_precision : float + train_losses : float + train_cost : float + Time needed to train the network for 'epoch' many epochs. If more than one seed is given, + this field is the sum of the training time per network + eval_precision : float + eval_losses : float + eval_cost : float + Time needed to train the network for 'epoch many epochs plus the time to evaluate the network on the + evaluation split. 
If more than one seed is given, this field is the sum of the eval cost per network + fidelity : Dict + used fidelities in this evaluation + """ + results = self.mo_benchmark.objective_function( + configuration=configuration, fidelity=fidelity, rng=rng, data_seed=data_seed, **kwargs + ) + + results['function_value'] = results['function_value']['misclassification_rate'] + return results + + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Get the validated results from the NASBench201. Runs a given configuration on the largest budget (here: 200). + The test function uses all data set seeds (777, 888, 999). + + See also :py:meth:`~hpobench.benchmarks.nas.nasbench_201.objective_function` + + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [1, 200] + Number of epochs an architecture was trained. + Note: the number of epoch is 1 indexed. (Results after the first epoch: epoch = 1) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + + kwargs + + Returns + ------- + Dict - + function_value : evaluation precision + cost : time to the network + time to validate + info : Dict + train_precision + train_losses + train_cost + eval_precision + eval_losses + eval_cost + fidelity : used fidelities in this evaluation + """ + + results = self.mo_benchmark.objective_function_test( + configuration=configuration, fidelity=fidelity, rng=rng, **kwargs + ) + + results['function_value'] = results['function_value']['misclassification_rate'] + return results + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Return the CS representation of the search space. + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + + Parameters + ---------- + seed : int, None + Random seed for the configuration space. + + Returns + ------- + CS.ConfigurationSpace - + Containing the benchmark's hyperparameter + """ + return NasBench201BaseMOBenchmark.get_configuration_space(seed=seed) + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the NAS Benchmark 201. + + Fidelities: + - epoch: int + The loss / accuracy at `epoch`. Can be from 0 to 199. 
+ + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + return NasBench201BaseMOBenchmark.get_fidelity_space(seed=seed) + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return NasBench201BaseMOBenchmark.get_meta_information() + + +class Cifar10ValidNasBench201Benchmark(NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar10ValidNasBench201Benchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) -class Cifar100NasBench201Benchmark(NasBench201BaseBenchmark): +class Cifar100NasBench201Benchmark(NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(Cifar100NasBench201Benchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) -class ImageNetNasBench201Benchmark(NasBench201BaseBenchmark): +class ImageNetNasBench201Benchmark(NasBench201SOBenchmark): def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): super(ImageNetNasBench201Benchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) -class _NasBench201BaseBenchmarkOriginal(NasBench201BaseBenchmark): +class _NasBench201BaseBenchmarkOriginal(NasBench201SOBenchmark): @staticmethod def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -528,7 +851,7 @@ def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @staticmethod def get_meta_information() -> Dict: """ Returns the meta information for the benchmark """ - meta_information = NasBench201BaseBenchmark.get_meta_information() + meta_information = NasBench201SOBenchmark.get_meta_information() meta_information['note'] = \ 'This version of the benchmark implements the fidelity space defined in the DEHB paper.' \ 'See [DEHB](https://github.com/automl/DEHB/tree/937dd5cf48e79f6d587ea2ff408cb5ad9a8dce46/dehb/examples)' @@ -558,4 +881,7 @@ def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs "ImageNetNasBench201Benchmark", "Cifar10ValidNasBench201BenchmarkOriginal", "Cifar100NasBench201BenchmarkOriginal", - "ImageNetNasBench201BenchmarkOriginal"] + "ImageNetNasBench201BenchmarkOriginal", + "Cifar10ValidNasBench201MOBenchmark", + "Cifar100NasBench201MOBenchmark", + "ImageNetNasBench201MOBenchmark"] diff --git a/hpobench/benchmarks/surrogates/yahpo_gym.py b/hpobench/benchmarks/surrogates/yahpo_gym.py new file mode 100644 index 00000000..19522700 --- /dev/null +++ b/hpobench/benchmarks/surrogates/yahpo_gym.py @@ -0,0 +1,193 @@ +""" +How to use this benchmark: +-------------------------- + +We recommend using the containerized version of this benchmark. +If you want to use this benchmark locally (without running it via the corresponding container), +you need to perform the following steps. + +Prerequisites: +============== +Conda environment in which the HPOBench is installed (pip install .). Activate your environment. +``` +conda activate +``` + +1. Clone from github: +===================== +``` +git clone HPOBench +``` + +2. 
Clone and install +==================== +``` +cd /path/to/HPOBench +pip install .[yahpo_gym] + +``` + +Changelog: +========== +0.0.1: +* First implementation +""" +import os +import logging +from typing import Union, Dict, List + +import ConfigSpace as CS +import numpy as np + +from yahpo_gym.benchmark_set import BenchmarkSet +from hpobench.abstract_benchmark import AbstractMultiObjectiveBenchmark, AbstractBenchmark + +__version__ = '0.0.1' + +logger = logging.getLogger('YAHPOGym') + + +class YAHPOGymMOBenchmark(AbstractMultiObjectiveBenchmark): + + def __init__(self, scenario: str, instance: str, + rng: Union[np.random.RandomState, int, None] = None): + """ + For a list of available scenarios and instances see + 'https://slds-lmu.github.io/yahpo_gym/scenarios.html' + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + rng : np.random.RandomState, int, None + """ + + # When in the containerized version, redirect to the data inside the container. + if 'YAHPO_CONTAINER' in os.environ: + from yahpo_gym.local_config import LocalConfiguration + local_config = LocalConfiguration() + local_config.init_config(data_path='/home/data/yahpo_data') + + self.scenario = scenario + self.instance = instance + self.benchset = BenchmarkSet(scenario, active_session=True) + self.benchset.set_instance(instance) + + logger.info(f'Start Benchmark for scenario {scenario} and instance {instance}') + super(YAHPOGymMOBenchmark, self).__init__(rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_opt_space(drop_fidelity_params=True, seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.benchset.get_fidelity_space(seed=seed) + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + # No batch predicts, so we can grab the first item + out = self.benchset.objective_function({**configuration, **fidelity})[0] + # Convert to float for serialization + out = {k: float(v) for k, v in out.items()} + + # Get runtime name + cost = out[self.benchset.config.runtime_name] + + return {'function_value': out, + "cost": cost, + 'info': {'fidelity': fidelity}} + + @AbstractMultiObjectiveBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[CS.Configuration, Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) \ + -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + # pylint: disable=arguments-differ + def get_objective_names(self) -> List[str]: + return self.benchset.config.y_names + + @staticmethod + def get_meta_information(): + """ Returns the meta information for the benchmark """ + return {'name': 'YAHPO Gym', + 'references': ['@misc{pfisterer2021yahpo,', + 'title={YAHPO Gym -- Design Criteria and a new Multifidelity ' + ' Benchmark for Hyperparameter Optimization},', + 'author = {Florian Pfisterer and Lennart Schneider and' + ' Julia 
Moosbauer and Martin Binder' + ' and Bernd Bischl},', + 'eprint={2109.03670},', + 'archivePrefix={arXiv},', + 'year = {2021}}'], + 'code': 'https://github.com/pfistfl/yahpo_gym/yahpo_gym'} + + +class YAHPOGymBenchmark(AbstractBenchmark): + + def __init__(self, scenario: str, instance: str, objective: str = None, + rng: Union[np.random.RandomState, int, None] = None): + """ + For a list of available scenarios and instances see + 'https://slds-lmu.github.io/yahpo_gym/scenarios.html' + Parameters + ---------- + scenario : str + Name for the surrogate data. Must be one of ["lcbench", "fcnet", "nb301", "rbv2_svm", + "rbv2_ranger", "rbv2_rpart", "rbv2_glmnet", "rbv2_aknn", "rbv2_xgboost", "rbv2_super"] + instance : str + A valid instance for the scenario. See `self.benchset.instances`. + objective : str + Name of the (single-crit) objective. See `self.benchset.config.y_names`. + Initialized to None, picks the first element in y_names. + rng : np.random.RandomState, int, None + """ + + self.backbone = YAHPOGymMOBenchmark(scenario=scenario, instance=instance, rng=rng) + self.objective = objective + + super(YAHPOGymBenchmark, self).__init__(rng=rng) + + @AbstractBenchmark.check_parameters + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + + mo_results = self.backbone.objective_function(configuration=configuration, + fidelity=fidelity, + **kwargs) + + # If not objective is set, we just grab the first returned entry. + if self.objective is None: + self.objective = self.backbone.benchset.config.y_names[0] + + obj_value = mo_results['function_value'][self.objective] + + return {'function_value': obj_value, + "cost": mo_results['cost'], + 'info': {'fidelity': fidelity, 'objectives': mo_results['function_value']}} + + @AbstractBenchmark.check_parameters + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, CS.Configuration, None] = None, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + return self.objective_function(configuration, fidelity=fidelity, rng=rng) + + # pylint: disable=arguments-differ + def get_configuration_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.backbone.get_configuration_space(seed=seed) + + # pylint: disable=arguments-differ + def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: + return self.backbone.get_fidelity_space(seed=seed) + + @staticmethod + def get_meta_information() -> Dict: + return YAHPOGymMOBenchmark.get_meta_information() diff --git a/hpobench/container/benchmarks/mo/__init__.py b/hpobench/container/benchmarks/mo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/container/benchmarks/mo/adult_benchmark.py b/hpobench/container/benchmarks/mo/adult_benchmark.py new file mode 100644 index 00000000..dbdcaf4d --- /dev/null +++ b/hpobench/container/benchmarks/mo/adult_benchmark.py @@ -0,0 +1,12 @@ +""" Benchmark for the Multi-Objective Adult Benchmark from hpobench/benchmarks/mo/adult_benchmark.py +""" + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class AdultBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'AdultBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'fair_adult') + kwargs['latest'] = 
kwargs.get('container_tag', '0.0.1') + super(AdultBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/mo/cnn_benchmark.py b/hpobench/container/benchmarks/mo/cnn_benchmark.py new file mode 100644 index 00000000..c9a1d009 --- /dev/null +++ b/hpobench/container/benchmarks/mo/cnn_benchmark.py @@ -0,0 +1,22 @@ +""" Benchmark for the Multi-Objective CNN Benchmark from hpobench/benchmarks/mo/cnn_benchmark.py +""" + +from hpobench.container.client_abstract_benchmark import AbstractMOBenchmarkClient + + +class FlowerCNNBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'FlowerCNNBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'mo_cnn') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['gpu'] = kwargs.get('gpu', True) + super(FlowerCNNBenchmark, self).__init__(**kwargs) + + +class FashionCNNBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'FashionCNNBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'mo_cnn') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['gpu'] = kwargs.get('gpu', True) + super(FashionCNNBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/mo/lm_benchmark.py b/hpobench/container/benchmarks/mo/lm_benchmark.py new file mode 100644 index 00000000..0f506c43 --- /dev/null +++ b/hpobench/container/benchmarks/mo/lm_benchmark.py @@ -0,0 +1,13 @@ +""" Benchmark for the Multi-Objective Language Model Benchmark from hpobench/benchmarks/mo/lm_benchmark.py +""" + +from hpobench.container.client_abstract_benchmark import AbstractMOBenchmarkClient + + +class LanguageModelBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'LanguageModelBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'lm_benchmark') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + kwargs['gpu'] = kwargs.get('gpu', True) + super(LanguageModelBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/benchmarks/nas/nasbench_201.py b/hpobench/container/benchmarks/nas/nasbench_201.py index 5eb9c68f..2a948c6b 100644 --- a/hpobench/container/benchmarks/nas/nasbench_201.py +++ b/hpobench/container/benchmarks/nas/nasbench_201.py @@ -54,9 +54,36 @@ def __init__(self, **kwargs): super(ImageNetNasBench201BenchmarkOriginal, self).__init__(**kwargs) +class Cifar10ValidNasBench201MOBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar10ValidNasBench201MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') + super(Cifar10ValidNasBench201MOBenchmark, self).__init__(**kwargs) + + +class Cifar100NasBench201MOBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar100NasBench201MOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') + super(Cifar100NasBench201MOBenchmark, self).__init__(**kwargs) + + +class ImageNetNasBench201MOBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'ImageNetNasBench201MOBenchmark') + kwargs['container_name'] = 
kwargs.get('container_name', 'nasbench_201') + kwargs['latest'] = kwargs.get('container_tag', '0.0.6') + super(ImageNetNasBench201MOBenchmark, self).__init__(**kwargs) + + __all__ = ["Cifar10ValidNasBench201Benchmark", "Cifar100NasBench201Benchmark", "ImageNetNasBench201Benchmark", "Cifar10ValidNasBench201BenchmarkOriginal", "Cifar100NasBench201BenchmarkOriginal", - "ImageNetNasBench201BenchmarkOriginal"] + "ImageNetNasBench201BenchmarkOriginal", + "Cifar10ValidNasBench201MOBenchmark", + "Cifar100NasBench201MOBenchmark", + "ImageNetNasBench201MOBenchmark"] diff --git a/hpobench/container/benchmarks/od/__init__.py b/hpobench/container/benchmarks/od/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/container/benchmarks/surrogates/yahpo_gym.py b/hpobench/container/benchmarks/surrogates/yahpo_gym.py new file mode 100644 index 00000000..9774975d --- /dev/null +++ b/hpobench/container/benchmarks/surrogates/yahpo_gym.py @@ -0,0 +1,20 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +from hpobench.container.client_abstract_benchmark import AbstractBenchmarkClient, AbstractMOBenchmarkClient + + +class YAHPOGymBenchmark(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_gym') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(YAHPOGymBenchmark, self).__init__(**kwargs) + + +class YAHPOGymMOBenchmark(AbstractMOBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'YAHPOGymMOBenchmark') + kwargs['container_name'] = kwargs.get('container_name', 'yahpo_gym') + kwargs['latest'] = kwargs.get('container_tag', '0.0.1') + super(YAHPOGymMOBenchmark, self).__init__(**kwargs) diff --git a/hpobench/container/recipes/mo/Singularity.AdultBenchmark b/hpobench/container/recipes/mo/Singularity.AdultBenchmark new file mode 100644 index 00000000..d373caa2 --- /dev/null +++ b/hpobench/container/recipes/mo/Singularity.AdultBenchmark @@ -0,0 +1,25 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER sharmaa@informatik.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y + apt install build-essential git wget -y + + cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout master \ + && pip install .[mo_adult] \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.adult_benchmark $@ \ No newline at end of file diff --git a/hpobench/container/recipes/mo/Singularity.CNNBenchmark b/hpobench/container/recipes/mo/Singularity.CNNBenchmark new file mode 100644 index 00000000..c9870968 --- /dev/null +++ b/hpobench/container/recipes/mo/Singularity.CNNBenchmark @@ -0,0 +1,26 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER sharmaa@informatik.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y + apt install build-essential git wget -y + + cd /home \ + && cd /home \ + && git clone https://github.com/automl/HPOBench.git \ + && cd HPOBench \ + && git checkout master \ + && pip install .[mo_cnn] \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py 
mo.cnn_benchmark $@ \ No newline at end of file diff --git a/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark b/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark new file mode 100644 index 00000000..770da7f9 --- /dev/null +++ b/hpobench/container/recipes/mo/Singularity.LanguageModelBenchmark @@ -0,0 +1,30 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER sharmaa@informatik.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y + apt install build-essential git wget -y + + cd /home \ + && mkdir data && cd data \ + && wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt \ + && wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt \ + && wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt \ + && cd /home \ + && git clone https://github.com/ayushi-3536/HPOBench.git \ + && cd HPOBench \ + && git checkout wikitext \ + && pip install .[lm_benchmark] \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge + + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py mo.lm_benchmark $@ \ No newline at end of file diff --git a/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark b/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark new file mode 100644 index 00000000..66ee63b1 --- /dev/null +++ b/hpobench/container/recipes/surrogates/Singularity.YAHPOGymBenchmark @@ -0,0 +1,39 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER pfistererf@googlemail.com +VERSION v0.0.1 + +%help + This is a template for a Singularity recipe + +%environment + YAHPO_CONTAINER=1 + export YAHPO_CONTAINER + +%post + apt update -y + apt install build-essential git wget -y + + /usr/local/bin/python -m pip install --upgrade pip + + cd /home \ + && mkdir data && cd data \ + && git clone --depth 1 -b main https://github.com/pfistfl/yahpo_data.git\ + + cd /home \ + && git clone https://github.com/pfistfl/HPOBench.git \ + && cd HPOBench \ + && echo "Please never push a recipe that checks out any other branch than development or master" \ + && git checkout master \ + && pip install .[yahpo_gym] \ + && echo "Please don't touch the following lines" \ + && cd / \ + && mkdir /var/lib/hpobench/ \ + && chmod -R 777 /var/lib/hpobench/ \ + && rm -rf /var/lib/apt/lists/* \ + && pip cache purge \ + +%runscript + python -s /home/HPOBench/hpobench/container/server_abstract_benchmark.py surrogates.yahpo_gym $@ diff --git a/hpobench/dependencies/lm/__init__.py b/hpobench/dependencies/lm/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/dependencies/lm/model.py b/hpobench/dependencies/lm/model.py new file mode 100644 index 00000000..4d9e8e97 --- /dev/null +++ b/hpobench/dependencies/lm/model.py @@ -0,0 +1,148 @@ +import torch +import torch.nn as nn +import math +import torch.nn.functional as F + + +class PositionalEncoding(nn.Module): + r"""Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. 
math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + r"""Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + + +class TransformerModel(nn.Module): + """Container module with an encoder, a transformer module, and a decoder.""" + + def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5, bptt=35, rng=None): + super(TransformerModel, self).__init__() + try: + from torch.nn import TransformerEncoder, TransformerEncoderLayer + except Exception: + raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') + self.model_type = 'Transformer' + self.src_mask = None + self.pos_encoder = PositionalEncoding(ninp, dropout) + encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) + self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) + self.encoder = nn.Embedding(ntoken, ninp) + self.ninp = ninp + self.decoder = nn.Linear(ninp, ntoken) + self.init_weights() + self.bptt = bptt + + def _generate_square_subsequent_mask(self, sz): + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + return mask + + def get_batch(self, source, i): + seq_len = min(self.bptt, len(source) - 1 - i) + data = source[i:i + seq_len] + target = source[i + 1:i + 1 + seq_len].view(-1) + return data, target + + def init_weights(self): + initrange = 0.1 + self.encoder.weight.data.uniform_(-initrange, initrange) + self.decoder.bias.data.zero_() + self.decoder.weight.data.uniform_(-initrange, initrange) + + def forward(self, src, has_mask=True): + if has_mask: + device = src.device + if self.src_mask is None or self.src_mask.size(0) != len(src): + mask = self._generate_square_subsequent_mask(len(src)).to(device) + self.src_mask = mask + else: + self.src_mask = None + src = self.encoder(src) * math.sqrt(self.ninp) + src = self.pos_encoder(src) + output = self.transformer_encoder(src, self.src_mask) + output = self.decoder(output) + return F.log_softmax(output, dim=-1) + + def train_fun(self, ntokens, criterion, train_data, lr, clip): + # Turn on training mode which enables dropout. + self.train() + total_loss = 0. + total_acc = 0. + for batch, i in enumerate(range(0, train_data.size(0) - 1, self.bptt)): + data, targets = self.get_batch(train_data, i) + # Starting each batch, we detach the hidden state from how it was previously produced. 
+ # If we didn't, the model would try backpropagating all the way to start of the dataset. + self.zero_grad() + output = self(data) + output_flat = output.view(-1, ntokens) + loss = criterion(output_flat, targets) + loss.backward() + + # calculate loss and accuracy + total_loss += len(data) * loss.item() + winners = output_flat.argmax(dim=1) + corrects = (winners == targets) + accuracy = corrects.sum().float() / float(targets.size(0)) + total_acc += len(data) * accuracy + + # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. + torch.nn.utils.clip_grad_norm_(self.parameters(), clip) + for p in self.parameters(): + p.data.add_(-lr, p.grad.data) + + avg_acc = total_acc / (len(train_data) - 1) + return total_loss / (len(train_data) - 1), avg_acc + + def eval_fun(self, ntokens, criterion, data_source): + # Turn on evaluation mode which disables dropout. + self.eval() + total_loss = 0. + total_acc = 0. + with torch.no_grad(): + for i in range(0, data_source.size(0) - 1, self.bptt): + data, targets = self.get_batch(data_source, i) + output = self(data) + output_flat = output.view(-1, ntokens) + total_loss += len(data) * criterion(output_flat, targets).item() + + # inserted accuracy + winners = output_flat.argmax(dim=1) + corrects = (winners == targets) + accuracy = corrects.sum().float() / float(targets.size(0)) + total_acc += len(data) * accuracy + + avg_acc = total_acc / (len(data_source) - 1) + return total_loss / (len(data_source) - 1), avg_acc diff --git a/hpobench/dependencies/lm/tokenize_util.py b/hpobench/dependencies/lm/tokenize_util.py new file mode 100644 index 00000000..f68e850d --- /dev/null +++ b/hpobench/dependencies/lm/tokenize_util.py @@ -0,0 +1,55 @@ +import torch + + +class Dictionary(object): + def __init__(self): + self.word2idx = {} + self.idx2word = [] + + def add_word(self, word): + if word not in self.word2idx: + self.idx2word.append(word) + self.word2idx[word] = len(self.idx2word) - 1 + return self.word2idx[word] + + def __len__(self): + return len(self.idx2word) + + +class Corpus(object): + def __init__(self, logger): + self.dictionary = Dictionary() + self.logger = logger + + def tokenize(self, path): + """Tokenizes a text file.""" + # Add words to the dictionary + with open(path, 'r', encoding="utf8") as f: + for line in f: + words = line.split() + [''] + for word in words: + self.dictionary.add_word(word) + # Tokenize file content + with open(path, 'r', encoding="utf8") as f: + idss = [] + for line in f: + words = line.split() + [''] + ids = [] + try: + for word in words: + ids.append(self.dictionary.word2idx[word]) + except Exception: + self.logger.debug("word2idx:{}", self.dictionary.word2idx) + idss.append(torch.tensor(ids).type(torch.int64)) + ids = torch.cat(idss) + return ids + + +def batchify(data, batch_size): + # Work out how cleanly we can divide the dataset into bsz parts. + nbatch = data.size(0) // batch_size + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, nbatch * batch_size) + # Evenly divide the data across the bsz batches. 
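+ # Illustrative example (editorial comment): with 10 tokens and batch_size=4, nbatch = 2,
+ # the last 2 tokens are dropped, and the reshaped result below has shape
+ # (nbatch, batch_size) = (2, 4).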
+ data = data.view(batch_size, -1).t().contiguous() + return data diff --git a/hpobench/dependencies/mo/__init__.py b/hpobench/dependencies/mo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hpobench/dependencies/mo/fairness_metrics.py b/hpobench/dependencies/mo/fairness_metrics.py new file mode 100644 index 00000000..7776fbd9 --- /dev/null +++ b/hpobench/dependencies/mo/fairness_metrics.py @@ -0,0 +1,110 @@ +""" +This file contains functionality to compute various fairness related risk scores. +""" + +import numpy as np + +STATISTICAL_DISPARITY = 'statistical_disparity' # P(1 | group A) - P(1 | group B) +UNEQUAL_OPPORTUNITY = 'unequal_opportunity' # P(1 | group A, 0) - P(1 | group B, 0) +UNEQUALIZED_ODDS = 'unequalized_odds' # P(1 | group A, 1) - P(1 | group B, 1) + +TPR0 = 'tpr0' +TPR1 = 'tpr1' +TPR_DIF = 'tpr_dif' +TPR_MIN = 'tpr_min' + +FAIRNESS_METRICS = [STATISTICAL_DISPARITY, UNEQUAL_OPPORTUNITY, UNEQUALIZED_ODDS, TPR0, TPR1, TPR_DIF, TPR_MIN] + +PRED_THRESHOLD = 0.5 + + +def fairness_risk(x, y, sensitive_rows, model, unfairness_metric): + """ + Returns the fairness_risk based on the definition of the unfairness_metric, currently supporting: + statistical_disparity: P(positive prediction | group A) = P(positive prediction | group B) + + Parameters + ---------- + x: np.ndarray + inputs + y: np.ndarray + labels in {0, 1} such that 0 is a "positive" label, 1 "negative" + sensitive_rows: np.ndarray + binary array indicating which rows correspond to the protected group + model: + trained sklearn model + unfairness_metric: str + string with unfairness condition + + Returns + ------- + float + """ + predicted_probs = model.predict_proba(x) + if unfairness_metric == STATISTICAL_DISPARITY: + prob_not_recidive_a = np.mean( + predicted_probs[sensitive_rows == 1][:, 0]) + prob_not_recidive_b = np.mean( + predicted_probs[sensitive_rows == 0][:, 0]) + return abs(prob_not_recidive_a - prob_not_recidive_b) + elif unfairness_metric == UNEQUAL_OPPORTUNITY: + prob_not_recidive_a = np.mean( + predicted_probs[sensitive_rows == 1 & (y == 0)][:, 0]) + prob_not_recidive_b = np.mean( + predicted_probs[sensitive_rows == 0 & (y == 0)][:, 0]) + return abs(prob_not_recidive_a - prob_not_recidive_b) + elif unfairness_metric == UNEQUALIZED_ODDS: + prob_not_recidive_a = np.mean( + predicted_probs[sensitive_rows == 1 & (y == 1)][:, 0]) + prob_not_recidive_b = np.mean( + predicted_probs[sensitive_rows == 0 & (y == 1)][:, 0]) + return abs(prob_not_recidive_a - prob_not_recidive_b) + elif unfairness_metric == TPR0: + y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0) + y0 = y[sensitive_rows == 0] + y0_pred = y_pred[sensitive_rows == 0] + tp0 = np.sum((y0_pred == 1) & (y0 == 1)) + fn0 = np.sum((y0_pred == 0) & (y0 == 1)) + tpr0 = tp0 / (tp0 + fn0) + return tpr0 + elif unfairness_metric == TPR1: + y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0) + y1 = y[sensitive_rows == 1] + y1_pred = y_pred[sensitive_rows == 1] + tp1 = np.sum((y1_pred == 1) & (y1 == 1)) + fn1 = np.sum((y1_pred == 0) & (y1 == 1)) + tpr1 = tp1 / (tp1 + fn1) + return tpr1 + elif unfairness_metric == TPR_DIF: + y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0) + y0 = y[sensitive_rows == 0] + y0_pred = y_pred[sensitive_rows == 0] + tp0 = np.sum((y0_pred == 1) & (y0 == 1)) + fn0 = np.sum((y0_pred == 0) & (y0 == 1)) + tpr0 = tp0 / (tp0 + fn0) + + y1 = y[sensitive_rows == 1] + y1_pred = y_pred[sensitive_rows == 1] + tp1 = np.sum((y1_pred == 1) & (y1 == 1)) + fn1 = np.sum((y1_pred == 0) & (y1 
== 1)) + tpr1 = tp1 / (tp1 + fn1) + return abs(tpr0 - tpr1) + elif unfairness_metric == TPR_MIN: + y_pred = np.where(predicted_probs[:, 0] >= PRED_THRESHOLD, 1, 0) + y0 = y[sensitive_rows == 0] + y0_pred = y_pred[sensitive_rows == 0] + tp0 = np.sum((y0_pred == 1) & (y0 == 1)) + fn0 = np.sum((y0_pred == 0) & (y0 == 1)) + tpr0 = tp0 / (tp0 + fn0) + + y1 = y[sensitive_rows == 1] + y1_pred = y_pred[sensitive_rows == 1] + tp1 = np.sum((y1_pred == 1) & (y1 == 1)) + fn1 = np.sum((y1_pred == 0) & (y1 == 1)) + tpr1 = tp1 / (tp1 + fn1) + return min(tpr0, tpr1) + else: + raise ValueError( + f'{unfairness_metric} is not a valid unfairness condition. ' + f'Please specify one among ({STATISTICAL_DISPARITY}, {UNEQUAL_OPPORTUNITY}, {UNEQUALIZED_ODDS})' + ) diff --git a/hpobench/dependencies/mo/scalar.py b/hpobench/dependencies/mo/scalar.py new file mode 100644 index 00000000..3f434fde --- /dev/null +++ b/hpobench/dependencies/mo/scalar.py @@ -0,0 +1,36 @@ +import numpy as np +from typing import Union + +try: + from sklearn.preprocessing import MinMaxScaler, StandardScaler +except ImportError: + print("scikit-learn not installed") + + +def get_fitted_scaler(x_train: np.ndarray, name: Union[None, str] = None): + """ + Instantiates a scaler by a given name and fits the scaler with x_train. + Parameters + ---------- + x_train: np.ndarray + Train data + + name: str, None + Name of the scaling method. Defaults to no scaling. + + Returns + ------- + + """ + + if name == "MinMax": + scaler = MinMaxScaler(feature_range=(0, 1), copy=True) + elif name == "Standard": + scaler = StandardScaler(copy=True) + elif name is None or name == "None": + return None + else: + raise NotImplementedError() + + scaler.fit(x_train) + return lambda x: scaler.transform(x) diff --git a/hpobench/util/data_manager.py b/hpobench/util/data_manager.py index a2e33121..cce04868 100644 --- a/hpobench/util/data_manager.py +++ b/hpobench/util/data_manager.py @@ -37,7 +37,6 @@ except ImportError: print("pandas is not installed, can't download datasets for the ml.tabular_benchmarks (not needed for containers)") - import hpobench @@ -845,10 +844,163 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar X_train, y_train = data[:n_train, 1:], data[:n_train, 0] X_val, y_val = data[n_train:n_train + n_val, 1:], data[n_train:n_train + n_val, 0] X_test, y_test = data[n_train + n_val:, 1:], data[n_train + n_val:, 0] + return X_train, y_train, X_val, y_val, X_test, y_test + + +class CNNDataManager(HoldoutDataManager): + + def __init__(self, dataset: str): + + super(CNNDataManager, self).__init__() + self.logger.debug('CNNDataManager: Starting to load data') + + allowed_datasets = ["fashion", "flower"] + assert dataset in allowed_datasets, f'Requested data set is not supported. Must be one of ' \ + f'{", ".join(allowed_datasets)}, but was {dataset}' + + self.url_source = f'https://github.com/ayushi-3536/DatasetHost/blob/main/{dataset}.tar.gz?raw=true' + self.dataset = dataset + self.save_dir = hpobench.config_file.data_dir / "CNN" / f'{dataset}' + self.compressed_data = self.save_dir / f'{dataset}.tar.gz' + self.create_save_directory(self.save_dir) + + def load(self): + """ + Loads CNN Benchmark from data directory as defined in hpobenchrc.data_directory. + Downloads data if necessary. 
+
+        Returns
+        -------
+        X_train: np.ndarray
+        y_train: np.ndarray
+        X_val: np.ndarray
+        y_val: np.ndarray
+        X_test: np.ndarray
+        y_test: np.ndarray
+        """
+
+        t = time()
+        self._download()
+        X_trn, y_trn, X_val, y_val, X_tst, y_tst = self._load()
+        self.logger.info(f'CNNDataManager: Data successfully loaded after {time() - t:.2f}')
+
+        return X_trn, y_trn, X_val, y_val, X_tst, y_tst
+
+    def _download(self):
+
+        # Check if data is already downloaded.
+        # Use a file lock to ensure that no two processes try to download the same files at the same time.
+        if self.compressed_data.exists():
+            self.logger.debug('CNNDataManager: Data already downloaded')
+        else:
+            self.logger.info(f'CNNDataManager: Start downloading data from {self.url_source} '
+                             f'to {self.save_dir}')
+            self._download_file_with_progressbar(data_url=self.url_source, data_file=self.compressed_data)
+            self._untar_data(compressed_file=self.compressed_data, save_dir=self.save_dir)
+
+    def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """
+        Load the data from file and split it into train, test and validation split.
+
+        Returns
+        -------
+        X_train: np.ndarray
+        y_train: np.ndarray
+        X_val: np.ndarray
+        y_val: np.ndarray
+        X_test: np.ndarray
+        y_test: np.ndarray
+        """
+
+        data_extract_path = self.save_dir / "data"
+        X_train = np.load(data_extract_path / 'x_train.npy')
+        y_train = np.load(data_extract_path / 'y_train.npy')
+
+        X_val = np.load(data_extract_path / 'x_val.npy')
+        y_val = np.load(data_extract_path / 'y_val.npy')
+
+        # Read Test datasets
+        X_test = np.load(data_extract_path / 'x_test.npy')
+        y_test = np.load(data_extract_path / 'y_test.npy')
+
+        def __cast_x_y(x, y) -> Tuple:
+            import torch
+            return torch.tensor(x).float().permute(0, 3, 1, 2), torch.tensor(y).long()
+
+        X_train, y_train = __cast_x_y(X_train, y_train)
+        X_val, y_val = __cast_x_y(X_val, y_val)
+        X_test, y_test = __cast_x_y(X_test, y_test)
+
+        return X_train, y_train, X_val, y_val, X_test, y_test
+
+
+class LanguageModelDataManager(HoldoutDataManager):
+
+    def __init__(self, device):
+        from hpobench.dependencies.lm.tokenize_util import Corpus
+        super(LanguageModelDataManager, self).__init__()
+        self.logger.debug('LanguageModelDataManager: Starting to load data')
+
+        self.urls = {
+            "train": "https://raw.githubusercontent.com/pytorch/examples/master/"
+                     "word_language_model/data/wikitext-2/train.txt",
+            "valid": "https://raw.githubusercontent.com/pytorch/examples/master/"
+                     "word_language_model/data/wikitext-2/valid.txt",
+            "test": "https://raw.githubusercontent.com/pytorch/examples/master/"
+                    "word_language_model/data/wikitext-2/test.txt",
+        }
+
+        self.save_dir = hpobench.config_file.data_dir / "wikitext"
+        self.create_save_directory(self.save_dir)
+        self.corpus = Corpus(logger=self.logger)
+        self.device = device
+        self.tokenize_path = self.save_dir / "tokenize"
+
+    def load(self):
+        """
+        Loads the WikiText-2 corpus from the data directory as defined in hpobenchrc.data_directory.
+        Downloads the data if necessary.
+
+        Returns
+        -------
+        X_train: np.ndarray
+        X_valid: np.ndarray
+        X_test: np.ndarray
+        """
+
+        t = time()
+        self._download()
+        self.X_train, self.X_valid, self.X_test = self._load()
+        self.logger.info(f'LanguageModelDataManager: Data successfully loaded after {time() - t:.2f}')
+        return self.X_train, self.X_valid, self.X_test
+
+    @lockutils.synchronized('not_thread_process_safe', external=True,
+                            lock_path=f'{hpobench.config_file.cache_dir}/language_model', delay=0.5)
+    def _download(self):
+        for data in self.urls:
+            if (self.save_dir / f'{data}.txt').exists():
+                self.logger.debug(f'LanguageModelDataManager: tokenized {data}.txt already exists')
+            else:
+                self._download_file_with_progressbar(self.urls[data], self.save_dir / f"{data}.txt")
+
+    def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """
+        Load the tokenized data from file and return the train, validation and test splits.
+
+        Returns
+        -------
+        X_train: np.ndarray
+        X_valid: np.ndarray
+        X_test: np.ndarray
+        """
+
+        X_train = self.corpus.tokenize(self.save_dir / 'train.txt')
+        X_valid = self.corpus.tokenize(self.save_dir / 'valid.txt')
+        X_test = self.corpus.tokenize(self.save_dir / 'test.txt')
+        return X_train, X_valid, X_test
+
+
 class YearPredictionMSDData(HoldoutDataManager):
 
     def __init__(self):
@@ -926,6 +1078,165 @@ def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndar
         return X_trn, y_trn, X_val, y_val, X_tst, y_tst
 
 
+class AdultDataManager(HoldoutDataManager):
+
+    def __init__(self):
+        super(AdultDataManager, self).__init__()
+        self.logger.debug('AdultDataManager: Starting to load data')
+        self.urls = {"data": "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
+                     "test_data": "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"}
+
+        self.feature_names = ['age', 'fnlwgt', 'education-num', 'marital-status', 'relationship', 'race',
+                              'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'country',
+                              'employment_type']
+        self.sensitive_names = 'sex'
+
+        self._save_dir = hpobench.config_file.data_dir / "adult"
+        self._data_extract_path = self._save_dir / "processed_data"
+        self.create_save_directory(self._data_extract_path)
+
+    def load(self):
+        """
+        Loads the Adult (fairness) dataset from the data directory as defined in hpobenchrc.data_directory.
+        Downloads the data if necessary.
+
+        Returns
+        -------
+        X_train: np.ndarray
+        y_train: np.ndarray
+        X_val: np.ndarray
+        y_val: np.ndarray
+        X_test: np.ndarray
+        y_test: np.ndarray
+        """
+
+        t = time()
+        self._download()
+        X_trn, y_trn, X_val, y_val, X_tst, y_tst = self._load()
+        self.logger.info(f'AdultDataManager: Data successfully loaded after {time() - t:.2f}')
+
+        return X_trn, y_trn, X_val, y_val, X_tst, y_tst
+
+    def _download(self):
+
+        if not (self._save_dir / "adult.data").exists():
+            self._download_file_with_progressbar(self.urls["data"], self._save_dir / "adult.data")
+
+        if not (self._save_dir / "adult.test").exists():
+            self._download_file_with_progressbar(self.urls["test_data"], self._save_dir / "adult.test")
+
+    def _load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """
+        Load the data from file and split it into train, test and validation split.
+ + Returns + ------- + X_train: np.ndarray + y_train: np.ndarray + X_val: np.ndarray + y_val: np.ndarray + X_test: np.ndarray + y_test: np.ndarray + """ + processed_files = ['x_train', 'x_valid', 'x_test', 'y_train', 'y_valid', 'y_test'] + file_is_missing = not all([(self._data_extract_path / f'{file}.npy').exists() for file in processed_files]) + + if file_is_missing: + columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", + "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", + "hours-per-week", "country", "salary"] + train_data = pd.read_csv(self._save_dir / 'adult.data', names=columns, sep=',', na_values='?') + test_data = pd.read_csv(self._save_dir / 'adult.test', names=columns, sep=',', skiprows=1, na_values='?') + + X, y = self._process_adult_data(train_data) + X_test, y_test = self._process_adult_data(test_data) + + n_trn = int(X.shape[0] * 0.7) + # Creation of Train and Test dataset + X_train, y_train = X[:n_trn], y[:n_trn] + X_valid, y_valid = X[n_trn:], y[n_trn:] + + np.save(self._data_extract_path / 'x_train.npy', X_train) + np.save(self._data_extract_path / 'x_valid.npy', X_valid) + np.save(self._data_extract_path / 'x_test.npy', X_test) + + np.save(self._data_extract_path / 'y_train.npy', y_train) + np.save(self._data_extract_path / 'y_valid.npy', y_valid) + np.save(self._data_extract_path / 'y_test.npy', y_test) + + else: + X_train = np.load(self._data_extract_path / 'x_train.npy') + X_valid = np.load(self._data_extract_path / 'x_valid.npy') + X_test = np.load(self._data_extract_path / 'x_test.npy') + + y_train = np.load(self._data_extract_path / 'y_train.npy') + y_valid = np.load(self._data_extract_path / 'y_valid.npy') + y_test = np.load(self._data_extract_path / 'y_test.npy') + + return X_train, y_train, X_valid, y_valid, X_test, y_test + + def _process_adult_data(self, df) -> Tuple[np.ndarray, np.ndarray]: + # mapping all categories of marital status to Single(1) or Couple(0) + df['marital-status'] = df['marital-status'].replace( + [' Divorced', ' Married-spouse-absent', ' Never-married', ' Separated', ' Widowed'], 'Single') + df['marital-status'] = df['marital-status'].replace([' Married-AF-spouse', ' Married-civ-spouse'], 'Couple') + df['marital-status'] = df['marital-status'].map({'Couple': 0, 'Single': 1}) + + # mapping race + race_map = {' White': 0, ' Amer-Indian-Eskimo': 1, ' Asian-Pac-Islander': 2, ' Black': 3, ' Other': 4} + df['race'] = df['race'].map(race_map) + + # categorizing all work classes into 4 major categories + def get_workclass(x): + if x['workclass'] == ' Federal-gov' or x['workclass'] == ' Local-gov' or x['workclass'] == ' State-gov': + return 'govt' + elif x['workclass'] == ' Private': + return 'private' + elif x['workclass'] == ' Self-emp-inc' or x['workclass'] == ' Self-emp-not-inc': + return 'self_employed' + else: + return 'without_pay' + + df['employment_type'] = df.apply(get_workclass, axis=1) + employment_map = {'govt': 0, 'private': 1, 'self_employed': 2, 'without_pay': 3} + df['employment_type'] = df['employment_type'].map(employment_map) + + # mapping relationship map + rel_map = {' Unmarried': 0, ' Wife': 1, ' Husband': 2, ' Not-in-family': 3, ' Own-child': 4, + ' Other-relative': 5} + df['relationship'] = df['relationship'].map(rel_map) + + # maping capital gain/loss to binary values + df.loc[(df['capital-gain'] > 0), 'capital-gain'] = 1 + df.loc[(df['capital-gain'] == 0, 'capital-gain')] = 0 + df.loc[(df['capital-loss'] > 0), 'capital-loss'] = 1 + df.loc[(df['capital-loss'] 
== 0, 'capital-loss')] = 0 + + # defining salary map + salary_map = {' <=50K': 1, ' >50K': 0, ' <=50K.': 1, ' >50K.': 0, } + df['salary'] = df['salary'].map(salary_map).astype(int) + + df['sex'] = df['sex'].map({' Male': 1, ' Female': 0}).astype(int) + + # replacing all missing values with np.nan + df['country'] = df['country'].replace(' ?', np.nan) + df['workclass'] = df['workclass'].replace(' ?', np.nan) + df['occupation'] = df['occupation'].replace(' ?', np.nan) + + # categorizing countries into "Non-US" and "US" + df.loc[df['country'] != ' United-States', 'country'] = 'Non-US' + df.loc[df['country'] == ' United-States', 'country'] = 'US' + df['country'] = df['country'].map({'US': 1, 'Non-US': 0}).astype(int) + + df.drop(labels=['workclass', 'education', 'occupation'], axis=1, inplace=True) + X = df.drop(['salary'], axis=1) + y = df['salary'] + + return X.to_numpy(), y.to_numpy() + + class TabularDataManager(DataManager): def __init__(self, model: str, task_id: [int, str], data_dir: [str, Path, None] = None): super(TabularDataManager, self).__init__() diff --git a/requirements.txt b/requirements.txt index 73ae9818..aad54f85 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -scipy>=1.4.1 numpy>=1.18.1 ConfigSpace>=0.4.12 Pyro4==4.80 diff --git a/tests/test_adult.py b/tests/test_adult.py new file mode 100644 index 00000000..d7a030b7 --- /dev/null +++ b/tests/test_adult.py @@ -0,0 +1,37 @@ +import logging +import pytest + +logging.basicConfig(level=logging.DEBUG) + + +def test_adult_benchmark(): + from hpobench.container.benchmarks.mo.adult_benchmark import AdultBenchmark + + # Check Seeding + benchmark = AdultBenchmark(rng=0) + cs = benchmark.get_configuration_space(seed=0) + cfg_1 = cs.sample_configuration() + + cs = benchmark.get_configuration_space(seed=0) + cfg_2 = cs.sample_configuration() + + assert cfg_1 == cfg_2 + + test_config = { + 'alpha': 0.00046568046379195655, 'beta_1': 0.14382335124614148, 'beta_2': 0.0010007892350251595, + 'fc_layer_0': 4, 'fc_layer_1': 2, 'fc_layer_2': 2, 'fc_layer_3': 3,'n_fc_layers': 4, + 'learning_rate_init': 0.0005343227125594117, + 'tol': 0.0004134759007834719 + } + + result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + + assert result_1['info']['valid_accuracy'] == pytest.approx(0.7539, rel=0.001) + assert result_1['info']['valid_accuracy'] == result_1['function_value']['accuracy'] + assert result_1['info']['train_accuracy'] == pytest.approx(0.76145, rel=0.001) + assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] + + result_1 = benchmark.objective_function_test(test_config, rng=1, fidelity={'budget': 3}) + assert result_1['function_value']['accuracy'] == pytest.approx(0.76377, rel=0.001) + assert result_1['function_value']['accuracy'] == result_1['info']['test_accuracy'] diff --git a/tests/test_mo_cnn.py b/tests/test_mo_cnn.py new file mode 100644 index 00000000..308c59ad --- /dev/null +++ b/tests/test_mo_cnn.py @@ -0,0 +1,48 @@ +import pytest + + +def test_mo_cnn_seeding(): + from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark + b1 = FlowerCNNBenchmark(rng=0) + b2 = FlowerCNNBenchmark(rng=0) + test_config = { + 'batch_norm': True, 'batch_size': 71, 'conv_layer_0': 194, 'conv_layer_1': 152, + 'conv_layer_2': 92, 'fc_layer_0': 65, 'fc_layer_1': 19, 'fc_layer_2': 273, + 'global_avg_pooling': True, 'kernel_size': 5, 'learning_rate_init': 0.09091283280651452, + 
'n_conv_layers': 2, 'n_fc_layers': 2 + } + + result_1 = b1.objective_function(test_config, rng=1, fidelity={'budget': 3}) + result_2 = b2.objective_function(test_config, rng=1, fidelity={'budget': 3}) + for metric in result_1['function_value'].keys(): + assert result_1['function_value'][metric] == pytest.approx(result_2['function_value'][metric], abs=0.001) + + +def test_mo_cnn_benchmark(): + from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark + + # Check Seeding + benchmark = FlowerCNNBenchmark(rng=0) + cs = benchmark.get_configuration_space(seed=0) + cfg_1 = cs.sample_configuration() + + cs = benchmark.get_configuration_space(seed=0) + cfg_2 = cs.sample_configuration() + + assert cfg_1 == cfg_2 + + test_config = { + 'batch_norm': True, 'batch_size': 71, 'conv_layer_0': 194, 'conv_layer_1': 152, + 'conv_layer_2': 92, 'fc_layer_0': 65, 'fc_layer_1': 19, 'fc_layer_2': 273, + 'global_avg_pooling': True, 'kernel_size': 5, 'learning_rate_init': 0.09091283280651452, + 'n_conv_layers': 2, 'n_fc_layers': 2 + } + + result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 3}) + print(f'MO CNN: Valid Accuracy = {result_1["info"]["valid_accuracy"]}') + print(f'MO CNN: Train Accuracy = {result_1["info"]["train_accuracy"]}') + # assert result_1['info']['train_accuracy'] == pytest.approx(0.1044, rel=0.001) + # assert result_1['info']['valid_accuracy'] == pytest.approx(0.1029, rel=0.001) + assert result_1['info']['valid_accuracy'] == pytest.approx(1 - result_1['function_value']['negative_accuracy'], abs=0.001) + assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] diff --git a/tests/test_nasbench_201.py b/tests/test_nasbench_201.py index 22c24b34..70e46de9 100644 --- a/tests/test_nasbench_201.py +++ b/tests/test_nasbench_201.py @@ -1,11 +1,11 @@ import logging logging.basicConfig(level=logging.DEBUG) - import pytest -from hpobench.benchmarks.nas.nasbench_201 import ImageNetNasBench201Benchmark, Cifar100NasBench201Benchmark, \ +from hpobench.container.benchmarks.nas.nasbench_201 import ImageNetNasBench201Benchmark, Cifar100NasBench201Benchmark, \ Cifar10ValidNasBench201Benchmark - +from hpobench.benchmarks.nas.nasbench_201 import \ + Cifar10ValidNasBench201MOBenchmark as LocalCifar10ValidNasBench201MOBenchmark from hpobench.util.container_utils import disable_container_debug, enable_container_debug skip_message = 'We currently skip this test because it takes too much time.' 
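
The MO CNN tests above exercise the dict-valued results returned by the new multi-objective benchmarks. As a rough usage sketch outside of pytest (not part of this changeset; it assumes the `FlowerCNNBenchmark` container can be pulled and that Singularity is installed as described in the README), the same pattern looks like this:

```python
from hpobench.container.benchmarks.mo.cnn_benchmark import FlowerCNNBenchmark

# Instantiating the containerized benchmark pulls the container on first use.
benchmark = FlowerCNNBenchmark(rng=0)
config = benchmark.get_configuration_space(seed=0).sample_configuration().get_dictionary()

# A small budget keeps the run cheap; the tests above use {'budget': 3}.
result = benchmark.objective_function(config, rng=1, fidelity={'budget': 3})

# 'function_value' maps each objective name to its value, e.g. 'negative_accuracy'.
for objective, value in result['function_value'].items():
    print(f'{objective}: {value}')

# As asserted in the tests, validation accuracy is the negated objective.
valid_accuracy = 1 - result['function_value']['negative_accuracy']
```
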
@@ -23,67 +23,87 @@ def test_nasbench201_cifar10valid(enable_debug): b = Cifar10ValidNasBench201Benchmark(rng=0) - cs = b.get_configuration_space(seed=0) - config = cs.sample_configuration() - fidelity = {'epoch': 199} - - result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - - assert result['function_value'] == pytest.approx(0.411, abs=0.1) - assert result['cost'] == pytest.approx(6650.88, abs=0.1) - assert result['info']['train_precision'] == result['function_value'] - assert result['info']['train_cost'] == result['cost'] - - result = b.objective_function_test(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - - with pytest.raises(AssertionError): + cs_1 = b.get_configuration_space(seed=0) + config_1 = cs_1.sample_configuration() + cs_2 = b.get_configuration_space(seed=0) + config_2 = cs_2.sample_configuration() + assert config_1 == config_2 + + config = { + '1<-0': 'nor_conv_1x1', + '2<-0': 'nor_conv_3x3', + '2<-1': 'nor_conv_3x3', + '3<-0': 'nor_conv_1x1', + '3<-1': 'nor_conv_1x1', + '3<-2': 'nor_conv_3x3' + } + result = b.objective_function(configuration=config, fidelity={'epoch': 199}, data_seed=(777, 888, 999)) + assert result['function_value'] == pytest.approx(9.78, abs=0.1) + assert result['cost'] == pytest.approx(11973.20, abs=0.1) + assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_cost'] == result['cost'] + + result = b.objective_function_test(configuration=config, fidelity={'epoch': 200}) + assert result['function_value'] == pytest.approx(9.70, abs=0.1) + assert result['cost'] == pytest.approx(10426.33, abs=0.2) + assert result['info']['test_precision'] == result['function_value'] + assert result['info']['test_cost'] == result['cost'] + + with pytest.raises(ValueError): result = b.objective_function_test(configuration=config, fidelity={'epoch': 10}) + @pytest.mark.skip(reason=skip_message) def test_nasbench201_cifar100(enable_debug): b = Cifar100NasBench201Benchmark(rng=0) - cs = b.get_configuration_space(seed=0) - config = cs.sample_configuration() + config = {'1<-0': 'nor_conv_1x1', + '2<-0': 'nor_conv_3x3', + '2<-1': 'nor_conv_3x3', + '3<-0': 'nor_conv_1x1', + '3<-1': 'nor_conv_1x1', + '3<-2': 'nor_conv_3x3'} fidelity = {'epoch': 199} result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - assert result is not None - assert result['function_value'] == pytest.approx(7.8259, abs=0.1) - assert result['cost'] == pytest.approx(13301.76, abs=0.1) - assert result['info']['train_precision'] == result['function_value'] - assert result['info']['train_cost'] == result['cost'] + assert result['function_value'] == pytest.approx(29.5233, abs=0.1) + assert result['cost'] == pytest.approx(19681.70, abs=0.1) + assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_cost'] == result['cost'] @pytest.mark.skip(reason=skip_message) def test_nasbench201_Image(enable_debug): b = ImageNetNasBench201Benchmark(rng=0) - - cs = b.get_configuration_space(seed=0) - config = cs.sample_configuration() + config = {'1<-0': 'nor_conv_1x1', + '2<-0': 'nor_conv_3x3', + '2<-1': 'nor_conv_3x3', + '3<-0': 'nor_conv_1x1', + '3<-1': 'nor_conv_1x1', + '3<-2': 'nor_conv_3x3'} fidelity = {'epoch': 199} result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) - assert result is not None - assert result['function_value'] == pytest.approx(62.858, abs=0.1) - assert result['cost'] == 
pytest.approx(40357.56, abs=0.1) - assert result['info']['train_precision'] == result['function_value'] - assert result['info']['train_cost'] == result['cost'] + assert result['function_value'] == pytest.approx(55.2167, abs=0.1) + assert result['cost'] == pytest.approx(57119.22, abs=0.1) + assert result['info']['valid_precision'] == result['function_value'] + assert result['info']['valid_cost'] == result['cost'] def test_nasbench201_fidelity_space(): - fs = Cifar10ValidNasBench201Benchmark.get_fidelity_space() + fs = LocalCifar10ValidNasBench201MOBenchmark.get_fidelity_space() assert len(fs.get_hyperparameters()) == 1 def test_nasbench201_config(): - cs = Cifar10ValidNasBench201Benchmark.get_configuration_space(seed=0) + + cs = LocalCifar10ValidNasBench201MOBenchmark.get_configuration_space(seed=0) c = cs.sample_configuration() - func = Cifar10ValidNasBench201Benchmark.config_to_structure_func(4) - struct = func(c) + func = LocalCifar10ValidNasBench201MOBenchmark.config_to_structure_func(4) + struct = func(c) assert struct.__repr__() == '_Structure(4 nodes with |nor_conv_1x1~0|+|nor_conv_3x3~0|nor_conv_3x3~1|+' \ '|nor_conv_1x1~0|nor_conv_1x1~1|nor_conv_3x3~2|)' assert len(struct) == 4 diff --git a/tests/test_wikitext.py b/tests/test_wikitext.py new file mode 100644 index 00000000..727a8ea4 --- /dev/null +++ b/tests/test_wikitext.py @@ -0,0 +1,28 @@ +import logging +import pytest + +logging.basicConfig(level=logging.DEBUG) + + +def test_wikitext_benchmark(): + from hpobench.benchmarks.mo.lm_benchmark import LanguageModelBenchmark + + # Check Seeding + benchmark = LanguageModelBenchmark(rng=0) + cs = benchmark.get_configuration_space(seed=1) + cfg_1 = cs.sample_configuration() + + cs = benchmark.get_configuration_space(seed=1) + cfg_2 = cs.sample_configuration() + + assert cfg_1 == cfg_2 + + test_config = { + 'batch_size': 144, 'clip': 1.458859796107597, 'dropout': 0.5967357423109274, + 'emsize': 575, 'lr': 5.245378070737081, 'lr_factor': 15 + } + + result_1 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 1}) + result_2 = benchmark.objective_function(test_config, rng=1, fidelity={'budget': 1}) + assert result_1['info']['train_accuracy'] == pytest.approx(0.76145, rel=0.001) + assert result_1['info']['train_accuracy'] == result_2['info']['train_accuracy'] diff --git a/tests/test_yahpo.py b/tests/test_yahpo.py new file mode 100644 index 00000000..97a7d06d --- /dev/null +++ b/tests/test_yahpo.py @@ -0,0 +1,77 @@ +import sys +from typing import Dict, List + +import pytest + +from hpobench.container.benchmarks.surrogates.yahpo_gym import YAHPOGymBenchmark, YAHPOGymMOBenchmark + + +def test_yahpo_init(): + b = YAHPOGymBenchmark(scenario="lcbench", instance="167152", objective="val_accuracy") + + fs = b.get_fidelity_space(seed=0) + fidelity = fs.sample_configuration().get_dictionary() + assert isinstance(fidelity, Dict) + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration().get_dictionary() + + # Some tests are dependent on the python version. 
+ if sys.version.startswith('3.9'): + assert fidelity['epoch'] == pytest.approx(29, abs=0.001) + assert config['OpenML_task_id'] == "167152" + assert config['num_layers'] == pytest.approx(4, abs=0.001) + assert config['max_units'] == pytest.approx(289, abs=0.0001) + assert config['weight_decay'] == pytest.approx(0.04376, abs=0.001) + assert config['learning_rate'] == pytest.approx(0.01398, abs=0.0001) + assert config['batch_size'] == pytest.approx(106, abs=0.001) + + constant_fidelity = {'epoch': 29} + constant_config = { + 'OpenML_task_id': '167152', 'batch_size': 106, 'learning_rate': 0.013981961408994055, + 'max_dropout': 0.6027633760716439, 'max_units': 289, 'momentum': 0.47705277141162516, + 'num_layers': 4, 'weight_decay': 0.04376434525415663 + } + + result = b.objective_function(configuration=constant_config, fidelity=constant_fidelity) + assert result['function_value'] == pytest.approx(61.297, abs=0.1) + assert result['cost'] == pytest.approx(119.4965, abs=0.1) + assert isinstance(result['info'], Dict) + + +def test_yahpo_mo(): + b = YAHPOGymMOBenchmark(scenario="lcbench", instance="167152") + + fs = b.get_fidelity_space(seed=0) + fidelity = fs.sample_configuration().get_dictionary() + assert isinstance(fidelity, Dict) + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration().get_dictionary() + + # Some tests are dependent on the python version. + if sys.version.startswith('3.9'): + assert fidelity['epoch'] == pytest.approx(29, abs=0.001) + assert config['OpenML_task_id'] == "167152" + assert config['num_layers'] == pytest.approx(4, abs=0.001) + assert config['max_units'] == pytest.approx(289, abs=0.0001) + assert config['weight_decay'] == pytest.approx(0.04376, abs=0.001) + assert config['learning_rate'] == pytest.approx(0.01398, abs=0.0001) + assert config['batch_size'] == pytest.approx(106, abs=0.001) + + constant_fidelity = {'epoch': 29} + constant_config = { + 'OpenML_task_id': '167152', 'batch_size': 106, 'learning_rate': 0.013981961408994055, + 'max_dropout': 0.6027633760716439, 'max_units': 289, 'momentum': 0.47705277141162516, + 'num_layers': 4, 'weight_decay': 0.04376434525415663 + } + + result = b.objective_function(configuration=constant_config, fidelity=constant_fidelity) + assert isinstance(result['function_value'], Dict) + assert result['function_value']['val_accuracy'] == pytest.approx(61.2971, abs=0.0001) + assert result['cost'] == pytest.approx(119.4965, abs=0.0001) + + names = b.get_objective_names() + assert isinstance(names, List) + assert len(names) == 6 + assert names[2] == 'val_cross_entropy'
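
For completeness, a hedged sketch of how the new YAHPO multi-objective surrogate can be queried outside of the test above, assuming the `yahpo_gym` container and its lcbench data resolve as in `tests/test_yahpo.py` (the dict keys of `function_value` are taken from the assertions there):

```python
from hpobench.container.benchmarks.surrogates.yahpo_gym import YAHPOGymMOBenchmark

benchmark = YAHPOGymMOBenchmark(scenario="lcbench", instance="167152")

# The test above checks that six objective names are returned and that
# names[2] == 'val_cross_entropy'.
objectives = benchmark.get_objective_names()

config = benchmark.get_configuration_space(seed=0).sample_configuration().get_dictionary()
fidelity = benchmark.get_fidelity_space(seed=0).sample_configuration().get_dictionary()

result = benchmark.objective_function(configuration=config, fidelity=fidelity)
for name in objectives:
    # 'function_value' is a dict; objectives that are not reported print None.
    print(name, result['function_value'].get(name))
```
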
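
Similarly, the data manager and scaler helpers introduced earlier in this diff compose as follows. This is a minimal sketch under the assumption that the Adult data is reachable at the UCI URLs hard-coded above and that scikit-learn is installed:

```python
from hpobench.util.data_manager import AdultDataManager
from hpobench.dependencies.mo.scalar import get_fitted_scaler

# Downloads adult.data / adult.test on first use, then caches the processed splits.
x_train, y_train, x_valid, y_valid, x_test, y_test = AdultDataManager().load()

# Fit the scaler on the training split only; get_fitted_scaler returns None
# when name is None / "None", i.e. when no scaling is requested.
scaler = get_fitted_scaler(x_train, "MinMax")
if scaler is not None:
    x_train, x_valid, x_test = scaler(x_train), scaler(x_valid), scaler(x_test)
```
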