
Commit f546777

core for collection
1 parent 63700b3 commit f546777

13 files changed: +1472 -1 lines changed

.gitignore

+172
@@ -0,0 +1,172 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

test.py
mlc_run.sh
.ml-job-preset.yml
wandb
run.sh
wandb_sweep.yaml
**/*.hdf5
**/*.DS_Store
*.ipynb
*.hdf5
*.gz

README.md

+18 -1
@@ -1 +1,18 @@
-# xland-minigrid-datasets

# XLand-100B: A Large-Scale Multi-Task Dataset for In-Context Reinforcement Learning

Official code for the 'XLand-100B: A Large-Scale Multi-Task Dataset for In-Context Reinforcement Learning' paper. We provide the utilities used to collect the datasets, as well as the code used for the experiments with the baselines, namely AD and DPT. As these parts are semantically unrelated, they are kept in separate directories for simplicity (in the cleanrl style).

Both XLand-100B and XLand-Trivial-20B are hosted on a public S3 bucket and are freely available to everyone under the CC BY-SA 4.0 licence. See the README in each directory for instructions.

## Downloading the datasets

We advise starting with the Trivial dataset for debugging due to its smaller size and faster download time. Both datasets have an identical structure. For additional details, we refer to the paper.

The datasets can be downloaded with the `curl` utility (or any other, such as `wget`) as follows:
```commandline
# XLand-Trivial-20B, approx 60GB size
curl -L -o xland-trivial-20b.hdf5 https://sc.link/A4rEW

# XLand-100B, approx 325GB size
curl -L -o xland-100b.hdf5 https://sc.link/MoCvZ
```
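
After downloading, you can quickly check that the file is intact and inspect its structure. The snippet below is a minimal sketch, assuming the layout produced by `collection/combine.py` (one numbered HDF5 group per learning history, each with `states`, `actions`, `rewards`, `dones` and `expert_actions` datasets); the file name is just the one used in the `curl` example above.
```python
import h5py

# path from the download example above; adjust if you saved the file elsewhere
with h5py.File("xland-trivial-20b.hdf5", "r") as df:
    print("number of learning histories:", len(df.keys()))

    # histories are stored as groups keyed by string indices: "0", "1", ...
    history = df["0"]
    print("attributes:", dict(history.attrs))
    for name in ("states", "actions", "rewards", "dones", "expert_actions"):
        print(name, history[name].shape, history[name].dtype)
```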

collection/.gitkeep

Whitespace-only changes.

collection/README.md

+86
@@ -0,0 +1,86 @@
# Dataset collection

Here we provide the code used to collect the datasets. We adapted the single-task recurrent PPO implementation from the original XLand-MiniGrid baselines. We used wandb sweeps to pretrain the base agent and to collect individual learning histories at scale on multiple GPUs. We then combined all the individual histories into a single dataset with `combine.py`.

If you notice any discrepancies with the paper, don't hesitate to open an issue and report it!

## Pretraining

Pretraining is simple. We provide the pretraining config in `configs/pretrain_base.yaml`. To start:
```commandline
python training/train.py \
    --config_path='configs/pretrain_base.yaml' \
    --checkpoint_path='path-for-the-final-checkpoint' \
    --wandb_logging=True
```
We used pretraining only for the main dataset (tasks from the medium benchmark).

## Collecting

We used wandb sweeps for collection. We provide base configs for the trivial and medium datasets in `configs/trivial_base.yaml` and `configs/medium_base.yaml`, respectively.

### Trivial

First, create a wandb sweep config:
```yaml
# trivial_wandb.yaml
entity: <your-entity>
project: xminigrid-datasets
program: training/train.py
method: grid
parameters:
  config_path:
    value: "configs/trivial_base.yaml"
  group:
    value: "xland-minigrid-datasets-trivial-v0"
  dataset_path:
    value: <path-to-your-dir-for-data>
  dataset_num_histories:
    value: 32
  ruleset_id:
    min: 0
    max: 10000
    distribution: int_uniform
```
Next, register the sweep with `wandb sweep trivial_wandb.yaml` to get the sweep ID. To start collection, run `wandb agent <sweep-id>`, as shown below.
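
For reference, a typical launch then looks like the following. This is only a sketch: the multi-GPU fan-out via `CUDA_VISIBLE_DEVICES` is one possible way to run several agents in parallel, and the `<...>` placeholders must be replaced with your own entity and the sweep ID that wandb prints.
```commandline
# register the sweep; wandb prints an ID of the form <entity>/<project>/<sweep-id>
wandb sweep trivial_wandb.yaml

# launch one agent per GPU; each agent pulls ruleset_id values from the grid
CUDA_VISIBLE_DEVICES=0 wandb agent <your-entity>/xminigrid-datasets/<sweep-id> &
CUDA_VISIBLE_DEVICES=1 wandb agent <your-entity>/xminigrid-datasets/<sweep-id> &
```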

### Medium

Likewise, create a config:
```yaml
# medium_wandb.yaml
entity: <your-entity>
project: xminigrid-datasets
program: training/train.py
method: grid
parameters:
  config_path:
    value: "configs/medium_base.yaml"
  group:
    value: "xland-minigrid-datasets-medium-v0"
  dataset_path:
    value: <path-to-your-dir-for-data>
  pretrained_checkpoint_path:
    value: <path-to-your-pre-trained-checkpoint>
  dataset_num_histories:
    value: 32
  ruleset_id:
    min: 0
    max: 30000
    distribution: int_uniform
```
Unlike trivial, you must additionally specify the path to the pre-trained checkpoint (you can use `None` to train from scratch). After that, register the sweep with `wandb sweep medium_wandb.yaml` to get the sweep ID and start collection with `wandb agent <sweep-id>`, just as for the trivial sweep.

## Combining

We used a simple `combine.py` script to combine all individual learning histories into one dataset. As described in the paper, we already tuned the hdf5 chunk size that worked best in our experiments; however, you can customise it by changing the hardcoded values in the code.

For example, here we filter out all runs with a final return below 0.3:
```commandline
python combine.py \
    --wandb-entity=your-entity \
    --wandb-sweep=your-collection-sweep \
    --data-path=your-data-path \
    --combined-path=your-combined-path \
    --final-return-thrs=0.3
```
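
Before combining, it can also be useful to sanity-check a single collected history. The snippet below is a minimal sketch, assuming the per-run files are gzip-compressed HDF5 (which is how `combine.py` below reads them); the file name is hypothetical.
```python
import gzip

import h5py

# hypothetical file name; individual histories are stored as gzip-compressed hdf5 files
with gzip.open("path-to-your-dir-for-data/history-ruleset-42.gz", "rb") as gf:
    with h5py.File(gf, "r") as df:
        print("ruleset id:", df.attrs["ruleset-id"])
        for name in ("states", "actions", "rewards", "dones", "expert_actions"):
            print(name, df[name].shape, df[name].dtype)
```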

collection/combine.py

+112
@@ -0,0 +1,112 @@
import argparse
import glob
import gzip
import os

import h5py
import wandb
from tqdm.auto import tqdm


def get_run(sweep_runs, ruleset_id):
    wandb_run = [r for r in sweep_runs if r.config["ruleset_id"] == ruleset_id]
    assert len(wandb_run) == 1
    return wandb_run[0]


def extract_id(filename):
    return int(os.path.basename(filename).split("-")[-1].split(".")[0])


def main(args):
    print("Processing sweep runs...")
    api = wandb.Api()
    all_runs = api.runs(args.wandb_entity)
    # keep only runs that belong to the collection sweep
    dataset_runs = [r for r in tqdm(all_runs) if hasattr(r.sweep, "id") and r.sweep.id == args.wandb_sweep]

    print("Combining...")
    files = glob.glob(os.path.join(args.data_path, "*.gz"))
    # sort individual history files by the id encoded in their file name
    files = sorted(files, key=lambda f: extract_id(f))

    with h5py.File(args.combined_path, "w", rdcc_nbytes=5e9, rdcc_nslots=20000) as new_df:
        idx = 0
        for file in tqdm(files):
            try:
                with gzip.open(file, "rb") as gf:
                    with h5py.File(gf, "r") as df:
                        # checking that agent achieved return >= thrs, else skip
                        wandb_run = get_run(dataset_runs, df.attrs["ruleset-id"])

                        if "final_return" not in wandb_run.summary:
                            print(f"Corrupted run {file}, skipping...")
                            continue

                        if wandb_run.summary["final_return"] < args.final_return_thrs:
                            continue

                        assert str(idx) not in new_df.keys(), "key already exists"
                        g = new_df.create_group(str(idx))
                        g.attrs.update(df.attrs)

                        g.create_dataset(
                            "states",
                            shape=df["states"].shape,
                            dtype=df["states"].dtype,
                            data=df["states"][:],
                            compression="gzip",
                            compression_opts=6,
                            chunks=(1, 4096, 5, 5),
                        )
                        g.create_dataset(
                            "actions",
                            shape=df["actions"].shape,
                            dtype=df["actions"].dtype,
                            data=df["actions"][:],
                            compression="gzip",
                            compression_opts=6,
                            chunks=(1, 4096),
                        )
                        g.create_dataset(
                            "rewards",
                            shape=df["rewards"].shape,
                            dtype=df["rewards"].dtype,
                            data=df["rewards"][:],
                            compression="gzip",
                            compression_opts=6,
                            chunks=(1, 4096),
                        )
                        g.create_dataset(
                            "dones",
                            shape=df["dones"].shape,
                            dtype=df["dones"].dtype,
                            data=df["dones"][:],
                            compression="gzip",
                            compression_opts=6,
                            chunks=(1, 4096),
                        )
                        g.create_dataset(
                            "expert_actions",
                            shape=df["expert_actions"].shape,
                            dtype=df["expert_actions"].dtype,
                            data=df["expert_actions"][:],
                            compression="gzip",
                            compression_opts=6,
                            chunks=(1, 4096),
                        )

            except OSError:
                print(f"Corrupted file {file}, skipping...")
                continue

            # increment only for histories that were actually written
            idx = idx + 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--wandb-entity", type=str)
    parser.add_argument("--wandb-sweep", type=str)
    parser.add_argument("--final-return-thrs", type=float, default=0.3)
    parser.add_argument("--data-path", type=str)
    parser.add_argument("--combined-path", type=str)
    args = parser.parse_args()
    main(args)
