From 58cce071ef52a1defcb22f2323a5bd11e5fc7277 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 15 Feb 2024 15:33:24 -0800 Subject: [PATCH] Support typos and noise modifiers (#428) * Update opustrainer * Adjust configs * Add evaluation modifiers * Reduce noise * Add tests for typos and noise * Fix typos augmenter * Fix linting issues * Update docs * Update opustrainer * Adjust configs * Add evaluation modifiers * Reduce noise * Add tests for typos and noise * Fix typos augmenter * Fix linting issues * Update docs * Fix test * Update opus trainer * Remove noise parameters from config * Update opustrainer with fixes * Run linter * Fix tests after merge * Disable noise for student * Update lockfile * Fix formatting * Disable typos for student * Rename assert functions * Switch back to faster validation * Document decision on using augmentations * Fix typo --- docs/opus-trainer.md | 38 +++--- pipeline/data/dataset_importer.py | 36 +++-- pipeline/data/requirements/data.in | 2 +- pipeline/data/requirements/data.txt | 8 +- .../train/configs/opustrainer/backward.yml | 4 - .../train/configs/opustrainer/student.yml | 16 ++- .../train/configs/opustrainer/teacher.yml | 7 +- .../train/configs/training/teacher.train.yml | 2 +- pipeline/train/requirements/train.in | 2 +- pipeline/train/requirements/train.txt | 6 +- poetry.lock | 6 +- pyproject.toml | 2 +- tests/test_data_importer.py | 125 +++++++++++++----- 13 files changed, 166 insertions(+), 88 deletions(-) diff --git a/docs/opus-trainer.md b/docs/opus-trainer.md index 105a7e6a6..db6ba573a 100644 --- a/docs/opus-trainer.md +++ b/docs/opus-trainer.md @@ -23,8 +23,9 @@ Supported augmentations: - **Upper case** - make some sentences from the dataset upper case - **Title case** - use title case for some sentences from the dataset - **Typos** - add random typos in some words -- **Tags** - add emojis and other random Unicode symbols in the source and target sentences - (requires alignments information for the training corpus) 
+- **Noise** - inserts lines with random unicode noise
+- **Tags (inline noise)** - add emojis and other random Unicode symbols in the source and target sentences
+  (requires space tokenized alignments for the training corpus)
 
 It is possible to specify the probability of augmentation 
 (which will roughly correspond to the percentage of augmented sentences):
@@ -73,17 +74,27 @@ modifiers:
 - UpperCase: 0.1 # Apply randomly to 10% of sentences
 - TitleCase: 0.1
 - Typos: 0.05
+- Noise: 0.0005
+  min_word_length: 2 # Minimum word length for each word in the noisy sentence
+  max_word_length: 5 # Maximum word length for each word in the noisy sentence
+  max_words: 6 # Maximum number of words in each noisy sentence
 - Tags: 0.05
-  augment: 0.05
-  replace: 0.05
-  spm_vocab:
-
+  augment: 1
 seed: 1111
 
 # parallel sentences + token alignments
 num_fields: 3
 ```
 
+## Models
+
+Current strategy is to run as many supported augmentations as possible for the teacher
+and student models and skip augmentation entirely for the backward model.
+This is mostly based on the intuition that we do not need the backward model to be robust and would rather prioritize quality that is usually affected by the noisier data.
+Even though the student is supposed to learn on the exact output of the teacher model, training on augmented data seems to be working in practice.
+
+We might rethink this strategy in the future after running more experiments.
+ ## Evaluation @@ -105,17 +116,15 @@ For example: ### Supported modifiers -`aug-typos` - applies typos with a probability of 0.1 - -`aug-title` - applies title case with probability 0.1 +`aug-typos` - applies 4 random typos to all sentences in the dataset -`aug-title-strict` - applies title case to all sentences +`aug-title` - applies title case to the whole dataset -`aug-upper` - applies upper case with probability 0.1 +`aug-upper` - applies upper case to the whole dataset -`aug-upper-strict` - applies upper case to all sentences +`aug-noise` - generates extra lines with noise (1 line of noise for each line of the dataset, so the dataset becomes twice longer) -`aug-mix` - applies, title case and upper case sequentially with 0.1 probability each +`aug-mix` - applies all the existing modifiers with 0.1 probability each ### Example training config ```yaml @@ -127,9 +136,8 @@ For example: - flores_devtest - flores_aug-mix_devtest - flores_aug-title_devtest - - flores_aug-title-strict_devtest - flores_aug-upper_devtest - - flores_aug-upper-strict_devtest - flores_aug-typos_devtest + - flores_aug-noise_devtest ``` diff --git a/pipeline/data/dataset_importer.py b/pipeline/data/dataset_importer.py index 615e5c2ff..56600d353 100755 --- a/pipeline/data/dataset_importer.py +++ b/pipeline/data/dataset_importer.py @@ -11,11 +11,13 @@ import argparse import os +import random import re import subprocess import sys -from typing import Iterable, List +from typing import Dict, Iterable, List +from opustrainer.modifiers.noise import NoiseModifier from opustrainer.modifiers.surface import TitleCaseModifier, UpperCaseModifier from opustrainer.modifiers.typos import TypoModifier from opustrainer.types import Modifier @@ -44,18 +46,26 @@ def __call__(self, batch: List[str]) -> Iterable[str]: MIX_PROB = 0.1 # 10% will be augmented in the mix + +def get_typos_probs() -> Dict[str, float]: + # select 4 random types of typos + typos = set(random.sample(list(TypoModifier.modifiers.keys()), 
k=4)) + # set probability 1 for selected typos and 0 for the rest + probs = {typo: 1.0 if typo in typos else 0.0 for typo in TypoModifier.modifiers.keys()} + return probs + + modifier_map = { - "aug-typos": TypoModifier(MIX_PROB), - "aug-title": TitleCaseModifier(MIX_PROB), - "aug-title-strict": TitleCaseModifier(1.0), - "aug-upper": UpperCaseModifier(MIX_PROB), - "aug-upper-strict": UpperCaseModifier(1.0), - "aug-mix": CompositeModifier( + "aug-typos": lambda: TypoModifier(1.0, **get_typos_probs()), + "aug-title": lambda: TitleCaseModifier(1.0), + "aug-upper": lambda: UpperCaseModifier(1.0), + "aug-noise": lambda: NoiseModifier(1.0), + "aug-mix": lambda: CompositeModifier( [ - # TODO: enable typos, issue https://github.com/mozilla/firefox-translations-training/issues/262 - # TypoModifier(NOISE_RATE), + TypoModifier(MIX_PROB, **get_typos_probs()), TitleCaseModifier(MIX_PROB), UpperCaseModifier(MIX_PROB), + NoiseModifier(MIX_PROB), ] ), } @@ -88,8 +98,6 @@ def augment(output_prefix: str, aug_modifer: str): if aug_modifer not in modifier_map: raise ValueError(f"Invalid modifier {aug_modifer}. 
Allowed values: {modifier_map.keys()}") - modifer = modifier_map[aug_modifer] - # file paths for compressed and uncompressed corpus uncompressed_src = f"{output_prefix}.{SRC}" uncompressed_trg = f"{output_prefix}.{TRG}" @@ -97,7 +105,11 @@ def augment(output_prefix: str, aug_modifer: str): compressed_trg = f"{output_prefix}.{TRG}.{COMP_EXT}" corpus = read_corpus_tsv(compressed_src, compressed_trg, uncompressed_src, uncompressed_trg) - modified = list(modifer(corpus)) + modified = [] + for line in corpus: + # recreate modifier for each line to apply randomization (for typos) + modifier = modifier_map[aug_modifer]() + modified += modifier([line]) write_modified(modified, uncompressed_src, uncompressed_trg) diff --git a/pipeline/data/requirements/data.in b/pipeline/data/requirements/data.in index 93f5abff4..833b10020 100644 --- a/pipeline/data/requirements/data.in +++ b/pipeline/data/requirements/data.in @@ -1,3 +1,3 @@ # use the latest main, switch to PyPi when released -git+https://github.com/hplt-project/OpusTrainer.git@9133e1525c7ee37f53ea14ee6a180152bf7ea192 +git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21 diff --git a/pipeline/data/requirements/data.txt b/pipeline/data/requirements/data.txt index 13ddbe543..f79cc2202 100644 --- a/pipeline/data/requirements/data.txt +++ b/pipeline/data/requirements/data.txt @@ -8,18 +8,16 @@ click==8.1.7 # via sacremoses joblib==1.3.2 # via sacremoses -opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@9133e1525c7ee37f53ea14ee6a180152bf7ea192 - # via -r pipeline/train/requirements/train.in +opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21 + # via -r pipeline/data/requirements/data.in pyyaml==6.0.1 # via opustrainer regex==2023.10.3 # via sacremoses -sacremoses==0.0.53 +sacremoses==0.1.1 # via opustrainer sentencepiece==0.1.99 # via opustrainer -six==1.16.0 - # via sacremoses tqdm==4.66.1 # via sacremoses 
typo==0.1.5 diff --git a/pipeline/train/configs/opustrainer/backward.yml b/pipeline/train/configs/opustrainer/backward.yml index 571f6a536..e3ec606cc 100644 --- a/pipeline/train/configs/opustrainer/backward.yml +++ b/pipeline/train/configs/opustrainer/backward.yml @@ -8,10 +8,6 @@ train: - original 1.0 - until original 10 # Train for 10 epochs. Only OpusTrainer can control epochs, it's all one big epoch for Marian -modifiers: -- UpperCase: 0.07 # Apply randomly to 7% of sentences -- TitleCase: 0.05 -#- Typos: 0.05 seed: 1111 num_fields: 2 diff --git a/pipeline/train/configs/opustrainer/student.yml b/pipeline/train/configs/opustrainer/student.yml index e6e4d0584..532aa8223 100644 --- a/pipeline/train/configs/opustrainer/student.yml +++ b/pipeline/train/configs/opustrainer/student.yml @@ -8,19 +8,21 @@ train: - original 1.0 - until original inf # General training until marian early stops -# TODO: augment corpus before decoding or reduce augmentation rate -# TODO: https://github.com/mozilla/firefox-translations-training/issues/272 + modifiers: - UpperCase: 0.07 # Apply randomly to 7% of sentences - TitleCase: 0.05 -# TODO: enable typos, issue https://github.com/mozilla/firefox-translations-training/issues/262 +# do not use typos modifier for the student, it causes issues with alignments #- Typos: 0.05 -# TODO: enable tags, currently doesn't work because of the issue with tokenization +# TODO: enable noise for student when we switch to space tokenized alignments for Tags +#- Noise: 0.0005 +# min_word_length: 2 # Minimum word length for each word in the noisy sentence +# max_word_length: 5 # Maximum word length for each word in the noisy sentence +# max_words: 6 # Maximum number of words in each noisy sentence +# TODO: enable inline noise, currently doesn't work because it requires space tokenized alignments # TODO: https://github.com/mozilla/firefox-translations-training/issues/261 #- Tags: 0.05 -# augment: 0.05 -# replace: 0.05 -# spm_vocab: +# augment: 1 seed: 1111 # 
parallel sentences + token alignments diff --git a/pipeline/train/configs/opustrainer/teacher.yml b/pipeline/train/configs/opustrainer/teacher.yml index 5d097cc37..d4426c4fd 100644 --- a/pipeline/train/configs/opustrainer/teacher.yml +++ b/pipeline/train/configs/opustrainer/teacher.yml @@ -21,8 +21,11 @@ finetune: modifiers: - UpperCase: 0.07 # Apply randomly to 7% of sentences - TitleCase: 0.05 -# TODO: enable typos, issue https://github.com/mozilla/firefox-translations-training/issues/262 -#- Typos: 0.05 +- Typos: 0.05 +- Noise: 0.0005 + min_word_length: 2 # Minimum word length for each word in the noisy sentence + max_word_length: 5 # Maximum word length for each word in the noisy sentence + max_words: 6 # Maximum number of words in each noisy sentence # random seed should be different for different teacher models diff --git a/pipeline/train/configs/training/teacher.train.yml b/pipeline/train/configs/training/teacher.train.yml index d34086853..9b70a5cb5 100644 --- a/pipeline/train/configs/training/teacher.train.yml +++ b/pipeline/train/configs/training/teacher.train.yml @@ -4,7 +4,7 @@ learn-rate: 0.0003 # Turn this down if you get a diverged model, maybe 0.0001 optimizer-delay: 2 # Roughly GPU devices * optimizer-delay = 8, but keep as an integer lr-report: True save-freq: 5000 -valid-freq: 5000 +valid-freq: 3000 valid-max-length: 300 valid-mini-batch: 8 early-stopping: 20 diff --git a/pipeline/train/requirements/train.in b/pipeline/train/requirements/train.in index 93f5abff4..833b10020 100644 --- a/pipeline/train/requirements/train.in +++ b/pipeline/train/requirements/train.in @@ -1,3 +1,3 @@ # use the latest main, switch to PyPi when released -git+https://github.com/hplt-project/OpusTrainer.git@9133e1525c7ee37f53ea14ee6a180152bf7ea192 +git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21 diff --git a/pipeline/train/requirements/train.txt b/pipeline/train/requirements/train.txt index 0eb918f08..d7d301010 100644 --- 
a/pipeline/train/requirements/train.txt +++ b/pipeline/train/requirements/train.txt @@ -8,18 +8,16 @@ click==8.1.7 # via sacremoses joblib==1.3.2 # via sacremoses -opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@9133e1525c7ee37f53ea14ee6a180152bf7ea192 +opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21 # via -r pipeline/train/requirements/train.in pyyaml==6.0.1 # via opustrainer regex==2023.10.3 # via sacremoses -sacremoses==0.0.53 +sacremoses==0.1.1 # via opustrainer sentencepiece==0.1.99 # via opustrainer -six==1.16.0 - # via sacremoses tqdm==4.66.1 # via sacremoses typo==0.1.5 diff --git a/poetry.lock b/poetry.lock index 004b57db8..f1cca6357 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1915,8 +1915,8 @@ typo = "0.1.5" [package.source] type = "git" url = "https://github.com/hplt-project/OpusTrainer.git" -reference = "9133e1525c7ee37f53ea14ee6a180152bf7ea192" -resolved_reference = "9133e1525c7ee37f53ea14ee6a180152bf7ea192" +reference = "c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21" +resolved_reference = "c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21" [[package]] name = "packaging" @@ -3773,4 +3773,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "f40b2383ba105036dca33b317913ea6bff866d92ce3c7fff1ee28989df6d2060" +content-hash = "98a4ba0a5622fb1e61eab3393822a763cae6a3162f017333ae15a0090ecfa8e7" diff --git a/pyproject.toml b/pyproject.toml index 49854d728..25e30caaa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ mtdata="0.3.2" requests="2.26.0" pytest="7.4.3" # use the latest main, switch to PyPi when released -opustrainer = {git = "https://github.com/hplt-project/OpusTrainer.git", rev="9133e1525c7ee37f53ea14ee6a180152bf7ea192"} +opustrainer = {git = "https://github.com/hplt-project/OpusTrainer.git", rev="c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21"} requests-mock = "^1.11.0" sh = "^2.0.6" zstandard = "^0.22.0" diff --git 
a/tests/test_data_importer.py b/tests/test_data_importer.py index 6ad3a31f7..a91623b0d 100644 --- a/tests/test_data_importer.py +++ b/tests/test_data_importer.py @@ -19,7 +19,7 @@ from pipeline.data.dataset_importer import run_import # the augmentation is probabilistic, here is a range for 0.1 probability -AUG_MAX_RATE = 0.3 +AUG_MAX_RATE = 0.35 AUG_MIN_RATE = 0.01 @@ -32,15 +32,50 @@ def is_title_case(text): return all((word[0].isupper() or not word.isalpha()) for word in text.split()) +def is_title_lines(src_l, trg_l, aug_src_l, aug_trg_l): + return is_title_case(aug_src_l) and is_title_case(aug_trg_l) + + def is_upper_case(text): return all((word.isupper() or not word.isalpha()) for word in text.split()) -def get_aug_rate(file, check_func): - lines = read_lines(file) - aug_num = len([l for l in lines if check_func(l)]) - rate = aug_num / len(lines) - print(f"augmentation rate for {file}: {rate}") +def is_upper_lines(src_l, trg_l, aug_src_l, aug_trg_l): + return is_upper_case(aug_src_l) and is_upper_case(aug_trg_l) + + +def src_is_different(src_l, trg_l, aug_src_l, aug_trg_l): + return src_l != aug_src_l + + +def assert_all_equal(*items): + assert len(set(items)) == 1 + + +def assert_twice_longer(src, trg, aug_src, aug_trg): + assert src * 2 == aug_src + assert trg * 2 == aug_trg + + +def get_aug_rate(src, trg, aug_src, aug_trg, check_func, check_len=None): + src, trg, aug_src, aug_trg = ( + read_lines(src), + read_lines(trg), + read_lines(aug_src), + read_lines(aug_trg), + ) + if check_len: + check_len(len(src), len(trg), len(aug_src), len(aug_trg)) + + if len(src) != len(aug_src): + rate = 0 + else: + aug_num = 0 + for lines in zip(src, trg, aug_src, aug_trg): + if check_func(*lines): + aug_num += 1 + rate = aug_num / len(src) + return rate @@ -108,47 +143,73 @@ def test_mono_source_import(language, importer, dataset, data_dir): assert len(read_lines(mono_data)) > 0 -augmentation_params = [ - ("sacrebleu_aug-upper_wmt19", is_upper_case, AUG_MIN_RATE, 
AUG_MAX_RATE), - ("sacrebleu_aug-upper-strict_wmt19", is_upper_case, 1.0, 1.0), - ("sacrebleu_aug-title_wmt19", is_title_case, AUG_MIN_RATE, AUG_MAX_RATE), - ("sacrebleu_aug-title-strict_wmt19", is_title_case, 1.0, 1.0), -] - - -@pytest.mark.parametrize("params", augmentation_params, ids=[d[0] for d in augmentation_params]) +@pytest.mark.parametrize( + "params", + [ + ("sacrebleu_aug-upper_wmt19", is_upper_lines, assert_all_equal, 1.0, 1.0), + ("sacrebleu_aug-title_wmt19", is_title_lines, assert_all_equal, 1.0, 1.0), + # there's a small chance for the string to stay the same + ("sacrebleu_aug-typos_wmt19", src_is_different, assert_all_equal, 0.95, 1.0), + # noise modifier generates extra lines + ("sacrebleu_aug-noise_wmt19", lambda x: True, assert_twice_longer, 0.0, 0.0), + ], + ids=["upper", "title", "typos", "noise"], +) def test_specific_augmentation(params, data_dir): - dataset, check_func, min_rate, max_rate = params - prefix = data_dir.join(dataset) - output_src = f"{prefix}.{SRC}.{ARTIFACT_EXT}" - output_trg = f"{prefix}.{TRG}.{ARTIFACT_EXT}" - - run_import("corpus", dataset, prefix) + dataset, check_func, check_len, min_rate, max_rate = params + original_dataset = "sacrebleu_wmt19" + prefix_aug = data_dir.join(dataset) + prefix_original = data_dir.join(original_dataset) + output_src = f"{prefix_aug}.{SRC}.{ARTIFACT_EXT}" + output_trg = f"{prefix_aug}.{TRG}.{ARTIFACT_EXT}" + original_src = f"{prefix_original}.{SRC}.{ARTIFACT_EXT}" + original_trg = f"{prefix_original}.{TRG}.{ARTIFACT_EXT}" + run_import("corpus", original_dataset, prefix_original) + + run_import("corpus", dataset, prefix_aug) data_dir.print_tree() assert os.path.exists(output_src) assert os.path.exists(output_trg) - - for file in (output_src, output_trg): - rate = get_aug_rate(file, check_func) - assert rate >= min_rate - assert rate <= max_rate + rate = get_aug_rate(original_src, original_trg, output_src, output_trg, check_func, check_len) + assert rate >= min_rate + assert rate <= max_rate 
def test_augmentation_mix(data_dir): dataset = "sacrebleu_aug-mix_wmt19" + original_dataset = "sacrebleu_wmt19" prefix = data_dir.join(dataset) + prefix_original = data_dir.join(original_dataset) output_src = f"{prefix}.{SRC}.{ARTIFACT_EXT}" output_trg = f"{prefix}.{TRG}.{ARTIFACT_EXT}" + original_src = f"{prefix_original}.{SRC}.{ARTIFACT_EXT}" + original_trg = f"{prefix_original}.{TRG}.{ARTIFACT_EXT}" + run_import("corpus", original_dataset, prefix_original) run_import("corpus", dataset, prefix) data_dir.print_tree() assert os.path.exists(output_src) assert os.path.exists(output_trg) - - for file in (output_src, output_trg): - for check_func in (is_upper_case, is_title_case): - rate = get_aug_rate(file, check_func) - assert rate <= AUG_MAX_RATE - assert rate >= AUG_MIN_RATE + src, trg, aug_src, aug_trg = ( + read_lines(original_src), + read_lines(original_trg), + read_lines(output_src), + read_lines(output_trg), + ) + len_noise_src = len(aug_src) - len(src) + len_noise_trg = len(aug_trg) - len(trg) + # check noise rate + for noise, original in [(len_noise_src, len(src)), (len_noise_trg, len(trg))]: + noise_rate = noise / original + assert noise_rate > AUG_MIN_RATE + assert noise_rate < AUG_MAX_RATE + + # check augmentation rate without noise + for aug, original in [(aug_src, src), (aug_trg, trg)]: + len_unchanged = len(set(aug).intersection(set(original))) + len_original = len(original) + aug_rate = (len_original - len_unchanged) / len(original) + assert aug_rate > AUG_MIN_RATE + assert aug_rate < AUG_MAX_RATE