Add documentation to various parts of the scripts and pipeline (#298)
gregtatum authored Dec 15, 2023
1 parent 8ad592a commit 742fb8f
Showing 21 changed files with 382 additions and 180 deletions.
207 changes: 106 additions & 101 deletions configs/tc.prod.yml
@@ -1,9 +1,101 @@
# Example of a production config for Taskcluster
#
# See the training guide for more information:
# https://mozilla.github.io/firefox-translations-training/training-guide.html
#
# The defaults for these parameters are available at:
# taskcluster/translations_taskgraph/parameters.py

# An "experiment" is an individual training run.
experiment:
# Provide an identifiable name for your experiment.
name: baseline_en_ru

# The source and target languages. This is the language tag part of the
# BCP 47 locale identifier.
src: en
trg: ru

# The metric used to select the best model during validation.
# cross-entropy, ce-mean-words, perplexity, valid-script, translation, bleu,
# bleu-segmented, chrf
# See: https://github.com/marian-nmt/marian/blob/65bf82ffce52f4854295d8b98482534f176d494e/src/common/config_parser.cpp#L588-L592
best-model: chrf

# Use the OpusCleaner tool in the data cleaning step.
# https://github.com/hplt-project/OpusCleaner
use-opuscleaner: "true"

# Bicleaner is a tool that aims at detecting noisy sentence pairs in a parallel corpus.
# See: docs/bicleaner.md
bicleaner:
default-threshold: 0.5
dataset-thresholds:
opus_CCAligned/v1: 0.7
opus_LinguaTools-WikiTitles/v2014: 0.7
opus_OpenSubtitles/v2018: 0.8 # Example of a higher filtering level.
opus_ParaCrawl/v9: 0.0 # Example of cleaning being disabled.
opus_WikiMatrix/v1: 0.7
opus_bible-uedin/v1: 0.7

# Limits the maximum number of sentences used from the monolingual data for both the
# source and target languages.
mono-max-sentences-src: 100_000_000
mono-max-sentences-trg: 20_000_000

# The number of sentences per file when the monolingual data is split into separate files.
# Also see taskcluster.split-chunks below.
split-length: 2_000_000

# How many bytes of the corpus are sampled to be used in SentencePiece
# tokenization training.
spm-sample-size: 10_000_000

# Determines how many teachers to train.
# See: docs/teacher-ensemble.md
teacher-ensemble: 2

# Path to a pretrained backward model (optional).
backward-model: NOT-YET-SUPPORTED

# Path to a pretrained vocabulary (optional).
vocab: NOT-YET-SUPPORTED

# The lists of datasets. Each dataset is identified by a corpus key of the form
# "{IMPORTER}_{DATASET}".
#
# TODO(docs) - Document augmentation in corpus keys.
#
# To find datasets and their corpus keys run:
#   poetry run utils/find_corpus.py en ru
datasets:
# The datasets used for validation while training. These should not be the same as the
# test or train datasets. They determine when to stop training.
devtest:
- flores_dev
- sacrebleu_wmt08
- mtdata_Neulab-tedtalks_dev-1-eng-hun

# The datasets used for the final evaluation to determine the quality of the trained
# model.
test:
- flores_devtest
- sacrebleu_wmt09
- mtdata_Neulab-tedtalks_test-1-eng-hun

# The parallel training data.
train:
- opus_Books/v1
- opus_CCAligned/v1
- opus_CCMatrix/v1
- opus_DGT/v2019
- opus_ECB/v1
- opus_ECDC/v2016-03-16
- opus_ELITR-ECA/v1
- opus_ELRC-2019-EUIPO_2017/v1
- opus_ELRC-2715-EMEA/v1

# Monolingual data sources for the source language.
mono-src:
- news-crawl_news.2021
- news-crawl_news.2020
@@ -20,6 +112,8 @@ datasets:
- news-crawl_news.2009
- news-crawl_news.2008
- news-crawl_news.2007

# Monolingual data sources for the target language.
mono-trg:
- news-crawl_news.2021
- news-crawl_news.2020
@@ -36,107 +130,10 @@ datasets:
- news-crawl_news.2009
- news-crawl_news.2008
- news-crawl_news.2007
test:
- flores_devtest
- sacrebleu_wmt09
- mtdata_Neulab-tedtalks_test-1-eng-hun
train:
- opus_Books/v1
- opus_CCAligned/v1
- opus_CCMatrix/v1
- opus_DGT/v2019
- opus_ECB/v1
- opus_ECDC/v2016-03-16
- opus_ELITR-ECA/v1
- opus_ELRC-2019-EUIPO_2017/v1
- opus_ELRC-2715-EMEA/v1
- opus_ELRC-2744-vaccination/v1
- opus_ELRC-2876-EU_publications_medi/v1
- opus_ELRC-3064-wikipedia_health/v1
- opus_ELRC-3203-antibiotic/v1
- opus_ELRC-3294-EUROPARL_covid/v1
- opus_ELRC-3465-EC_EUROPA_covid/v1
- opus_ELRC-3566-EUR_LEX_covid/v1
- opus_ELRC-3607-presscorner_covid/v1
- opus_ELRC-5067-SciPar/v1
- opus_ELRC-EC_EUROPA/v1
- opus_ELRC-EMEA/v1
- opus_ELRC-EUIPO_2017/v1
- opus_ELRC-EUROPARL_covid/v1
- opus_ELRC-EUR_LEX/v1
- opus_ELRC-EU_publications/v1
- opus_ELRC-antibiotic/v1
- opus_ELRC-presscorner_covid/v1
- opus_ELRC-vaccination/v1
- opus_ELRC-wikipedia_health/v1
- opus_ELRC_2922/v1
- opus_ELRC_2923/v1
- opus_ELRC_3382/v1
- opus_EMEA/v3
- opus_EUbookshop/v2
- opus_EUconst/v1
- opus_Europarl/v8
- opus_GNOME/v1
- opus_GlobalVoices/v2018q4
- opus_JRC-Acquis/v3.0
- opus_KDE4/v2
- opus_LinguaTools-WikiTitles/v2014
- opus_NeuLab-TedTalks/v1
- opus_OpenSubtitles/v2018
- opus_PHP/v1
- opus_ParaCrawl/v9
- opus_QED/v2.0a
- opus_TED2020/v1
- opus_Tatoeba/v2022-03-03
- opus_TildeMODEL/v2018
- opus_Ubuntu/v14.10
- opus_WikiMatrix/v1
- opus_Wikipedia/v1.0
- opus_XLEnt/v1.2
- opus_bible-uedin/v1
- opus_wikimedia/v20210402
- mtdata_ELRC-antibiotic-1-eng-hun
- mtdata_ELRC-ec_europa_covid-1-eng-hun
- mtdata_ELRC-emea-1-eng-hun
- mtdata_ELRC-eu_publications_medical_v2-1-eng-hun
- mtdata_ELRC-euipo_2017-1-eng-hun
- mtdata_ELRC-eur_lex_covid-1-eng-hun
- mtdata_ELRC-presscorner_covid-1-eng-hun
- mtdata_ELRC-vaccination-1-eng-hun
- mtdata_ELRC-wikipedia_health-1-eng-hun
- mtdata_EU-dcep-1-eng-hun
- mtdata_EU-eac_forms-1-eng-hun
- mtdata_EU-eac_reference-1-eng-hun
- mtdata_EU-ecdc-1-eng-hun
- mtdata_LinguaTools-wikititles-2014-eng-hun
- mtdata_Neulab-tedtalks_train-1-eng-hun
- mtdata_Tilde-ecb-2017-eng-hun
- mtdata_Tilde-eesc-2017-eng-hun
- mtdata_Tilde-ema-2016-eng-hun
- mtdata_Tilde-rapid-2016-eng-hun
- mtdata_Lindat-khresmoi_summary_dev-2-eng-hun
- mtdata_Lindat-khresmoi_summary_test-2-eng-hun
experiment:
backward-model: NOT-YET-SUPPORTED
best-model: chrf
bicleaner:
dataset-thresholds:
opus_CCAligned/v1: 0.7
opus_LinguaTools-WikiTitles/v2014: 0.7
opus_OpenSubtitles/v2018: 0.8
opus_ParaCrawl/v9: 0.0
opus_WikiMatrix/v1: 0.7
opus_bible-uedin/v1: 0.7
default-threshold: 0.5
mono-max-sentences-src: 100000000
mono-max-sentences-trg: 20000000
name: baseline_enhu
split-length: 2000000
spm-sample-size: 10000000
src: en
teacher-ensemble: 2
trg: hu
vocab: NOT-YET-SUPPORTED

# Arguments that are provided to Marian, the underlying machine learning framework used
# to train the translation models.
# https://marian-nmt.github.io/docs/cmd/marian/
marian-args:
decoding-backward:
beam-size: '12'
@@ -152,6 +149,14 @@ marian-args:
early-stopping: '20'
training-student-finetuned:
early-stopping: '20'

# Run all of the training pipeline with "all", or run a specific stage such as
# "merge-corpus". For more information see:
#
# https://mozilla.github.io/firefox-translations-training/task-cluster.html#running-up-to-a-specific-step
target-stage: all

taskcluster:
# After the parallel corpora are merged and de-duplicated, the combined file is
# then split into an even number of chunks.
split-chunks: 10
3 changes: 2 additions & 1 deletion docs/Gemfile
@@ -3,5 +3,6 @@
source "https://rubygems.org"

gem "jekyll", "~> 4.3"
gem "just-the-docs", "~> 0.7.0"
gem "jekyll-relative-links", "~> 0.7.0"
gem "jekyll-remote-theme", "~> 0.4.3"
gem "just-the-docs", "~> 0.7.0"
3 changes: 3 additions & 0 deletions docs/Gemfile.lock
@@ -33,6 +33,8 @@ GEM
webrick (~> 1.7)
jekyll-include-cache (0.2.1)
jekyll (>= 3.7, < 5.0)
jekyll-relative-links (0.7.0)
jekyll (>= 3.3, < 5.0)
jekyll-remote-theme (0.4.3)
addressable (~> 2.0)
jekyll (>= 3.5, < 5.0)
@@ -81,6 +83,7 @@ PLATFORMS

DEPENDENCIES
jekyll (~> 4.3)
jekyll-relative-links (~> 0.7.0)
jekyll-remote-theme (~> 0.4.3)
just-the-docs (~> 0.7.0)

1 change: 1 addition & 0 deletions docs/_config.yml
@@ -10,4 +10,5 @@ aux_links:
"GitHub":
- "https://github.com/mozilla/firefox-translations-training"
plugins:
- jekyll-relative-links
- jekyll-remote-theme
50 changes: 50 additions & 0 deletions docs/bicleaner.md
@@ -0,0 +1,50 @@
---
layout: default
title: Bicleaner
parent: Data cleaning
---
# Bicleaner

Bicleaner is a tool that detects noisy sentence pairs in a parallel corpus. The classifier scores parallel sentences from 0 to 1, where 0 means a very noisy translation and 1 means a good translation. In the pipeline, Bicleaner AI is used first if [the language is available][ai-releases]; otherwise it falls back to the original, non-AI Bicleaner.

See:
* [https://github.com/bitextor/bicleaner-ai](https://github.com/bitextor/bicleaner-ai)
* [https://github.com/bitextor/bicleaner](https://github.com/bitextor/bicleaner)

For supported languages see:
* [Bicleaner AI Releases][ai-releases]
* [Bicleaner Releases][releases]

New language releases should be added to: `taskcluster/ci/fetch/bicleaner.yml`

## How to configure for training

The configuration specifies a default threshold and optional per-dataset thresholds. A sentence pair is kept only if its score is **above** the applicable threshold.

- `0.5` should be a [good default value].
- Increase the threshold for noisier datasets.
- Set the threshold to `0` to skip cleaning entirely.

## Recommendations for specific datasets

| Data set | Threshold | Reason |
| ------------- | --------- | ------- |
| OpenSubtitles | 0.8 | This is a noisier dataset |
| ParaCrawl | 0 | This dataset has already been cleaned by bicleaner. See [Bicleaner AI: Bicleaner Goes Neural], section 4.2.2 |

## Example config:

```
bicleaner:
default-threshold: 0.5
dataset-thresholds:
opus_CCAligned/v1: 0.7
opus_OpenSubtitles/v2018: 0.8
opus_ParaCrawl/v9: 0
...
```

[good default value]: https://github.com/bitextor/bicleaner-ai/wiki/How-to-train-your-Bicleaner-AI#bicleaning-a-corpus
[ai-releases]: https://github.com/bitextor/bicleaner-ai-data/releases
[releases]: https://github.com/bitextor/bicleaner-data/releases
[Bicleaner AI: Bicleaner Goes Neural]: https://aclanthology.org/2022.lrec-1.87.pdf
13 changes: 7 additions & 6 deletions docs/cleaning.md
@@ -2,6 +2,7 @@
layout: default
title: Data cleaning
nav_order: 5
has_children: true
---

# Data cleaning
@@ -10,17 +11,16 @@ Making datasets less noisy to improve quality of translation.

## Regular pipeline


Config setting:
```
use-opuscleaner: false
```

### Dataset fixing

Some datasets require fixes like detokenization.
Dataset and language specific fixes are implemented in [https://github.com/mozilla/firefox-translations-training/tree/main/pipeline/clean/fixes](https://github.com/mozilla/firefox-translations-training/tree/main/pipeline/clean/fixes).
Naming convention:
- `<dataset_name>.sh` for parallel dataset cleaning
- `<dataset_name>.<lang>.sh` for language specific cleaning of parallel or monolingual dataset
- `/` in dataset name should be replaced with `_`
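For example, under these conventions a fix for the `opus_ParaCrawl/v9` dataset would live in the fixes directory linked above (hypothetical file names, shown only to illustrate the naming rules):

```
pipeline/clean/fixes/opus_ParaCrawl_v9.sh      # fix applied to the parallel dataset
pipeline/clean/fixes/opus_ParaCrawl_v9.ru.sh   # Russian-specific fix for the same dataset
```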
@@ -32,8 +32,8 @@ Make sure the language is present in [clean_parallel](https://github.com/mozilla

### Bicleaner

It is recommended to use Bicleaner ML models to filter noisy data.
See more details on how to configure it in the [Model training guide, Bicleaner section](training-guide.md/#bicleaner).
It is recommended to use Bicleaner ML models to filter noisy data.
See the [bicleaner documentation](bicleaner.md) for more details on how to configure it.


## OpusCleaner
Expand All @@ -46,7 +46,8 @@ Config setting:
```

## Custom filter configs

The idea behind OpusCleaner is to customize the filter rules for each language pair and dataset
to get a training corpus with less noise and train higher quality translation models.

Filtering rules can be tuned in an interactive UI.
1 change: 1 addition & 0 deletions docs/pipeline-steps.md
@@ -2,6 +2,7 @@
layout: default
title: Pipeline steps
nav_order: 3
has_children: true
---

# Pipeline steps
21 changes: 21 additions & 0 deletions docs/teacher-ensemble.md
@@ -0,0 +1,21 @@
---
layout: default
title: Teacher Ensemble
parent: Pipeline steps
---

# Teacher Ensemble

Teacher models are larger and slower translation models that have higher BLEU scores. In the pipeline they are used to distill smaller and faster student models at the cost of a lower BLEU score.

In the config files you can specify how many teachers to train via the `experiment.teacher-ensemble` key. The teachers will be identical except that they are initialized with different random seeds. This has been shown to improve performance during student distillation, as the translation probabilities from the teachers are combined.
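For example, the production Taskcluster config earlier in this commit trains two teachers (minimal excerpt):

```
experiment:
  # Two identical teachers, initialized with different random seeds.
  teacher-ensemble: 2
```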

While our current implementation only changes seeds, it's also possible to have ensembles that use different configurations or are trained on different datasets.

Recommendations from [Efficient machine translation](https://nbogoychev.com/efficient-machine-translation/#ensembling):

> One very easy way to improve translation quality of the teacher is to produce an ensemble of systems that produce translation together. This is done by training identical systems, initialising them with different random seed. The more systems, the better, although returns are diminishing.
>
> For example, if we want to have an ensemble of two systems, we need two separate configuration files for training, where the seed parameter is different. Configuration one would have seed: 1111, whereas configuration two would have seed: 2222.

We typically use two teacher models in our training.
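As a minimal, hypothetical sketch of the setup described in the quote above, the two per-teacher Marian training configurations would be identical apart from the seed value:

```
# teacher-1 training config (hypothetical fragment)
seed: 1111

# teacher-2 training config (hypothetical fragment)
seed: 2222
```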