Support typos and noise modifiers (#428)

* Update opustrainer * Adjust configs * Add evaluation modifiers * Reduce noise * Add tests for typos and noise * Fix typos augmenter * Fix linting issues * Update docs * Update opustrainer * Adjust configs * Add evaluation modifiers * Reduce noise * Add tests for typos and noise * Fix typos augmenter * Fix linting issues * Update docs * Fix test * Update opus trainer * Remove noise parameters from config * Update opustrainer with fixes * Run linter * Fix tests after merge * Disable noise for student * Update lockfile * Fix formatting * Disable typos for student * Rename assert functions * Switch back to faster validation * Document decision on using augmentations * Fix typo
mozilla · Feb 15, 2024 · 58cce07 · 58cce07
1 parent 092fd98
commit 58cce07
Show file tree

Hide file tree

Showing 13 changed files with 166 additions and 88 deletions.
diff --git a/docs/opus-trainer.md b/docs/opus-trainer.md
@@ -23,8 +23,9 @@ Supported augmentations:
 - **Upper case** - make some sentences from the dataset upper case
 - **Title case** - use title case for some sentences from the dataset
 - **Typos** - add random typos in some words
-- **Tags** - add emojis and other random Unicode symbols in the source and target sentences 
-  (requires alignments information for the training corpus)
+- **Noise** - insert extra lines consisting of random Unicode noise
+- **Tags (inline noise)** - add emojis and other random Unicode symbols in the source and target sentences 
+  (requires space tokenized alignments for the training corpus)
 
 It is possible to specify the probability of augmentation 
 (which will roughly correspond to the percentage of augmented sentences):
@@ -73,17 +74,27 @@ modifiers:
 - UpperCase: 0.1 # Apply randomly to 10% of sentences
 - TitleCase: 0.1
 - Typos: 0.05
+- Noise: 0.0005
+  min_word_length: 2 # Minimum word length for each word in the noisy sentence
+  max_word_length: 5 # Maximum word length for each word in the noisy sentence
+  max_words: 6 # Maximum number of words in each noisy sentence
 - Tags: 0.05
-  augment: 0.05
-  replace: 0.05
-  spm_vocab: <vocab>
-  
+  augment: 1
 seed: 1111
 
 # parallel sentences + token alignments
 num_fields: 3
 ```
 
+## Models
+
+The current strategy is to run as many supported augmentations as possible for the teacher
+and student models and to skip augmentation entirely for the backward model.
+This is mostly based on the intuition that the backward model does not need to be robust, so we would rather prioritize its quality, which usually suffers when training on noisier data.
+Even though the student is supposed to learn on the exact output of the teacher model, training on augmented data seems to be working in practice.
+
+We might rethink this strategy in the future after running more experiments.
+
 
 ## Evaluation
 
@@ -105,17 +116,15 @@ For example:
 
 ### Supported modifiers
 
-`aug-typos` - applies typos with a probability of 0.1
-
-`aug-title` - applies title case with probability 0.1
+`aug-typos` - applies 4 randomly selected types of typos to every sentence in the dataset
 
-`aug-title-strict` - applies title case to all sentences
+`aug-title` - applies title case to the whole dataset
 
-`aug-upper` -  applies upper case with probability 0.1
+`aug-upper` -  applies upper case to the whole dataset
 
-`aug-upper-strict` - applies upper case to all sentences
+`aug-noise` - generates extra lines with noise (1 line of noise for each line of the dataset, so the dataset becomes twice as long)
 
-`aug-mix` - applies, title case and upper case sequentially with 0.1 probability each
+`aug-mix` - applies all the existing modifiers with 0.1 probability each
 
 ### Example training config
 ```yaml
@@ -127,9 +136,8 @@ For example:
     - flores_devtest
     - flores_aug-mix_devtest
     - flores_aug-title_devtest
-    - flores_aug-title-strict_devtest
     - flores_aug-upper_devtest
-    - flores_aug-upper-strict_devtest
     - flores_aug-typos_devtest
+    - flores_aug-noise_devtest
 ```
 
diff --git a/pipeline/data/dataset_importer.py b/pipeline/data/dataset_importer.py
@@ -11,11 +11,13 @@
 
 import argparse
 import os
+import random
 import re
 import subprocess
 import sys
-from typing import Iterable, List
+from typing import Dict, Iterable, List
 
+from opustrainer.modifiers.noise import NoiseModifier
 from opustrainer.modifiers.surface import TitleCaseModifier, UpperCaseModifier
 from opustrainer.modifiers.typos import TypoModifier
 from opustrainer.types import Modifier
@@ -44,18 +46,26 @@ def __call__(self, batch: List[str]) -> Iterable[str]:
 
 MIX_PROB = 0.1  # 10% will be augmented in the mix
 
+
+def get_typos_probs() -> Dict[str, float]:
+    # select 4 random types of typos
+    typos = set(random.sample(list(TypoModifier.modifiers.keys()), k=4))
+    # set probability 1 for selected typos and 0 for the rest
+    probs = {typo: 1.0 if typo in typos else 0.0 for typo in TypoModifier.modifiers.keys()}
+    return probs
+
+
 modifier_map = {
-    "aug-typos": TypoModifier(MIX_PROB),
-    "aug-title": TitleCaseModifier(MIX_PROB),
-    "aug-title-strict": TitleCaseModifier(1.0),
-    "aug-upper": UpperCaseModifier(MIX_PROB),
-    "aug-upper-strict": UpperCaseModifier(1.0),
-    "aug-mix": CompositeModifier(
+    "aug-typos": lambda: TypoModifier(1.0, **get_typos_probs()),
+    "aug-title": lambda: TitleCaseModifier(1.0),
+    "aug-upper": lambda: UpperCaseModifier(1.0),
+    "aug-noise": lambda: NoiseModifier(1.0),
+    "aug-mix": lambda: CompositeModifier(
         [
-            # TODO: enable typos, issue https://github.com/mozilla/firefox-translations-training/issues/262
-            # TypoModifier(NOISE_RATE),
+            TypoModifier(MIX_PROB, **get_typos_probs()),
             TitleCaseModifier(MIX_PROB),
             UpperCaseModifier(MIX_PROB),
+            NoiseModifier(MIX_PROB),
         ]
     ),
 }
@@ -88,16 +98,18 @@ def augment(output_prefix: str, aug_modifer: str):
     if aug_modifer not in modifier_map:
         raise ValueError(f"Invalid modifier {aug_modifer}. Allowed values: {modifier_map.keys()}")
 
-    modifer = modifier_map[aug_modifer]
-
     # file paths for compressed and uncompressed corpus
     uncompressed_src = f"{output_prefix}.{SRC}"
     uncompressed_trg = f"{output_prefix}.{TRG}"
     compressed_src = f"{output_prefix}.{SRC}.{COMP_EXT}"
     compressed_trg = f"{output_prefix}.{TRG}.{COMP_EXT}"
 
     corpus = read_corpus_tsv(compressed_src, compressed_trg, uncompressed_src, uncompressed_trg)
-    modified = list(modifer(corpus))
+    modified = []
+    for line in corpus:
+        # recreate modifier for each line to apply randomization (for typos)
+        modifier = modifier_map[aug_modifer]()
+        modified += modifier([line])
     write_modified(modified, uncompressed_src, uncompressed_trg)
 
 

diff --git a/pipeline/data/requirements/data.in b/pipeline/data/requirements/data.in
@@ -1,3 +1,3 @@
 # use the latest main, switch to PyPi when released
-git+https://github.com/hplt-project/OpusTrainer.git@9133e1525c7ee37f53ea14ee6a180152bf7ea192
+git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
 
diff --git a/pipeline/data/requirements/data.txt b/pipeline/data/requirements/data.txt
@@ -8,18 +8,16 @@ click==8.1.7
     # via sacremoses
 joblib==1.3.2
     # via sacremoses
-opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@9133e1525c7ee37f53ea14ee6a180152bf7ea192
-    # via -r pipeline/train/requirements/train.in
+opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
+    # via -r pipeline/data/requirements/data.in
 pyyaml==6.0.1
     # via opustrainer
 regex==2023.10.3
     # via sacremoses
-sacremoses==0.0.53
+sacremoses==0.1.1
     # via opustrainer
 sentencepiece==0.1.99
     # via opustrainer
-six==1.16.0
-    # via sacremoses
 tqdm==4.66.1
     # via sacremoses
 typo==0.1.5

diff --git a/pipeline/train/configs/opustrainer/backward.yml b/pipeline/train/configs/opustrainer/backward.yml
@@ -8,10 +8,6 @@ train:
   - original 1.0
   - until original 10 # Train for 10 epochs. Only OpusTrainer can control epochs, it's all one big epoch for Marian
 
-modifiers:
-- UpperCase: 0.07 # Apply randomly to 7% of sentences
-- TitleCase: 0.05
-#- Typos: 0.05
 
 seed: 1111
 num_fields: 2
diff --git a/pipeline/train/configs/opustrainer/student.yml b/pipeline/train/configs/opustrainer/student.yml
@@ -8,19 +8,21 @@ train:
   - original 1.0
   - until original inf # General training until marian early stops
 
-# TODO: augment corpus before decoding or reduce augmentation rate
-# TODO: https://github.com/mozilla/firefox-translations-training/issues/272
+
 modifiers:
 - UpperCase: 0.07 # Apply randomly to 7% of sentences
 - TitleCase: 0.05
-# TODO: enable typos, issue https://github.com/mozilla/firefox-translations-training/issues/262
+# do not use typos modifier for the student, it causes issues with alignments
 #- Typos: 0.05
-# TODO: enable tags, currently doesn't work because of the issue with  tokenization
+# TODO: enable noise for student when we switch to space tokenized alignments for Tags
+#- Noise: 0.0005
+#  min_word_length: 2 # Minimum word length for each word in the noisy sentence
+#  max_word_length: 5 # Maximum word length for each word in the noisy sentence
+#  max_words: 6 # Maximum number of words in each noisy sentence
+# TODO: enable inline noise, currently doesn't work because it requires space tokenized alignments
 # TODO: https://github.com/mozilla/firefox-translations-training/issues/261
 #- Tags: 0.05
-#  augment: 0.05
-#  replace: 0.05
-#  spm_vocab: <vocab>
+#  augment: 1
 
 seed: 1111
 # parallel sentences + token alignments

diff --git a/pipeline/train/configs/opustrainer/teacher.yml b/pipeline/train/configs/opustrainer/teacher.yml
@@ -21,8 +21,11 @@ finetune:
 modifiers:
 - UpperCase: 0.07 # Apply randomly to 7% of sentences
 - TitleCase: 0.05
-# TODO: enable typos, issue https://github.com/mozilla/firefox-translations-training/issues/262
-#- Typos: 0.05
+- Typos: 0.05
+- Noise: 0.0005
+  min_word_length: 2 # Minimum word length for each word in the noisy sentence
+  max_word_length: 5 # Maximum word length for each word in the noisy sentence
+  max_words: 6 # Maximum number of words in each noisy sentence
 
 
 # random seed should be different for different teacher models

diff --git a/pipeline/train/configs/training/teacher.train.yml b/pipeline/train/configs/training/teacher.train.yml
@@ -4,7 +4,7 @@ learn-rate: 0.0003 # Turn this down if you get a diverged model, maybe 0.0001
 optimizer-delay: 2 # Roughly GPU devices * optimizer-delay = 8, but keep as an integer
 lr-report: True
 save-freq: 5000
-valid-freq: 5000
+valid-freq: 3000
 valid-max-length: 300
 valid-mini-batch: 8
 early-stopping: 20

diff --git a/pipeline/train/requirements/train.in b/pipeline/train/requirements/train.in
@@ -1,3 +1,3 @@
 # use the latest main, switch to PyPi when released
-git+https://github.com/hplt-project/OpusTrainer.git@9133e1525c7ee37f53ea14ee6a180152bf7ea192
+git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
 
diff --git a/pipeline/train/requirements/train.txt b/pipeline/train/requirements/train.txt
@@ -8,18 +8,16 @@ click==8.1.7
     # via sacremoses
 joblib==1.3.2
     # via sacremoses
-opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@9133e1525c7ee37f53ea14ee6a180152bf7ea192
+opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
     # via -r pipeline/train/requirements/train.in
 pyyaml==6.0.1
     # via opustrainer
 regex==2023.10.3
     # via sacremoses
-sacremoses==0.0.53
+sacremoses==0.1.1
     # via opustrainer
 sentencepiece==0.1.99
     # via opustrainer
-six==1.16.0
-    # via sacremoses
 tqdm==4.66.1
     # via sacremoses
 typo==0.1.5

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,7 +39,7 @@ mtdata="0.3.2"
 requests="2.26.0"
 pytest="7.4.3"
 # use the latest main, switch to PyPi when released
-opustrainer = {git = "https://github.com/hplt-project/OpusTrainer.git", rev="9133e1525c7ee37f53ea14ee6a180152bf7ea192"}
+opustrainer = {git = "https://github.com/hplt-project/OpusTrainer.git", rev="c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21"}
 requests-mock = "^1.11.0"
 sh = "^2.0.6"
 zstandard = "^0.22.0"