diff --git a/.devcontainer/dockerfile b/.devcontainer/dockerfile index 6212c64c..06547678 100644 --- a/.devcontainer/dockerfile +++ b/.devcontainer/dockerfile @@ -20,6 +20,7 @@ RUN apt-get update && \ apt-get install --no-install-recommends -y \ python$PYTHON_VERSION \ python$PYTHON_VERSION-distutils \ + python$PYTHON_VERSION-dev \ git vim curl gdb ca-certificates gnupg2 tar make gcc libssl-dev zlib1g-dev libncurses5-dev \ libbz2-dev libreadline-dev libreadline6-dev libxml2-dev xz-utils libgdbm-dev libgdbm-compat-dev tk-dev dirmngr \ libxmlsec1-dev libsqlite3-dev libffi-dev liblzma-dev lzma lzma-dev uuid-dev && \ @@ -39,4 +40,6 @@ RUN pip install -U pip setuptools \ COPY ./.devcontainer/clearml.conf /root/clearml.conf +ENV EFLOMAL_PATH=/workspaces/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin + CMD ["bash"] diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d38f9aac..380c8e10 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,6 +59,8 @@ jobs: poetry run pyright - name: Test with pytest run: poetry run pytest --cov --cov-report=xml + env: + EFLOMAL_PATH: /home/runner/work/machine.py/machine.py/.venv/lib/python${{ matrix.python-version }}/site-packages/eflomal/bin - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v4 env: diff --git a/dockerfile b/dockerfile index 08f20a61..77e49bc0 100755 --- a/dockerfile +++ b/dockerfile @@ -1,9 +1,7 @@ # syntax=docker/dockerfile:1.7-labs - ARG PYTHON_VERSION=3.12 ARG UBUNTU_VERSION=noble ARG POETRY_VERSION=1.6.1 -ARG CUDA_VERSION=12.6.1-base-ubuntu24.04 FROM python:$PYTHON_VERSION-slim AS builder ARG POETRY_VERSION @@ -25,7 +23,7 @@ COPY poetry.lock pyproject.toml /src RUN poetry export --with=gpu --without-hashes -f requirements.txt > requirements.txt -FROM nvidia/cuda:$CUDA_VERSION +FROM python:$PYTHON_VERSION ARG PYTHON_VERSION ENV PIP_DISABLE_PIP_VERSION_CHECK=on @@ -64,4 +62,6 @@ RUN --mount=type=cache,target=/root/.cache \ RUN python 
# NOTE: this is a temporary solution to be able to use the eflomal aligner inside of machine.py.
# The vast majority of this code is taken from the silnlp repository.

import os
import subprocess
from contextlib import ExitStack
from importlib.util import find_spec
from math import sqrt
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import IO, Iterable, List, Sequence, Tuple

from ..corpora import AlignedWordPair
from ..corpora.token_processors import escape_spaces, lowercase, normalize
from ..tokenization import LatinWordTokenizer
from ..translation import SymmetrizationHeuristic, WordAlignmentMatrix


# From silnlp.common.package_utils
def is_eflomal_available() -> bool:
    """Return True when the optional ``eflomal`` package can be located by the import system."""
    return find_spec("eflomal") is not None


if is_eflomal_available():
    from eflomal import read_text, write_text  # type: ignore

# Location of the eflomal executable: "<$EFLOMAL_PATH>/eflomal", or "./eflomal" when the variable is unset.
EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal")
TOKENIZER = LatinWordTokenizer()

# Names of the link files that train() produces and align() consumes inside the model directory.
_FORWARD_LINKS_FILENAME = "forward-align.txt"
_REVERSE_LINKS_FILENAME = "reverse-align.txt"


def _require_eflomal() -> None:
    """Raise RuntimeError when the optional eflomal package is missing."""
    if not is_eflomal_available():
        raise RuntimeError("eflomal is not installed.")


# From silnlp.alignment.tools
def execute_eflomal(
    source_path: Path,
    target_path: Path,
    forward_links_path: Path,
    reverse_links_path: Path,
    n_iterations: Tuple[int, int, int],
) -> None:
    """Run the eflomal executable on a pair of prepared input files.

    Args:
        source_path: eflomal-format source file, passed as ``-s``.
        target_path: eflomal-format target file, passed as ``-t``.
        forward_links_path: output path passed as ``-f``.
        reverse_links_path: output path passed as ``-r``.
        n_iterations: the three values passed to the ``-1``, ``-2`` and ``-3`` options.

    Raises:
        RuntimeError: if the eflomal package is not installed.
        subprocess.CalledProcessError: if the executable exits with a non-zero status.
    """
    _require_eflomal()

    args = [
        str(EFLOMAL_PATH),
        "-s",
        str(source_path),
        "-t",
        str(target_path),
        "-f",
        str(forward_links_path),
        "-r",
        str(reverse_links_path),
        "-m",
        "3",
        "-n",
        "3",
        "-N",
        "0.2",
        "-1",
        str(n_iterations[0]),
        "-2",
        str(n_iterations[1]),
        "-3",
        str(n_iterations[2]),
    ]
    # stderr is discarded, so the exit status is the only failure signal we get. check=True turns a failed run
    # into an exception here instead of a missing or stale links file later on.
    subprocess.run(args, stderr=subprocess.DEVNULL, check=True)


# From silnlp.alignment.eflomal
def to_word_alignment_matrix(alignment_str: str) -> WordAlignmentMatrix:
    """Parse an alignment string into a matrix just large enough to hold every aligned index."""
    word_pairs = AlignedWordPair.from_string(alignment_str)
    # Dimensions are "largest index + 1"; an alignment without pairs yields a 0 x 0 matrix.
    row_count = max((pair.source_index + 1 for pair in word_pairs), default=0)
    column_count = max((pair.target_index + 1 for pair in word_pairs), default=0)
    return WordAlignmentMatrix.from_word_pairs(row_count, column_count, word_pairs)


# From silnlp.alignment.eflomal
def to_eflomal_text_file(input: Iterable[str], output_file: IO[bytes], prefix_len: int = 0, suffix_len: int = 0) -> int:
    """Write sentences to ``output_file`` using eflomal's reader/writer and return the sentence count.

    Raises:
        RuntimeError: if the eflomal package is not installed.
    """
    # read_text/write_text are only imported when eflomal is present; fail with a clear error, not a NameError.
    _require_eflomal()
    sents, index = read_text(input, True, prefix_len, suffix_len)
    n_sents = len(sents)
    voc_size = len(index)
    write_text(output_file, tuple(sents), voc_size)
    return n_sents


# From silnlp.alignment.eflomal
def prepare_files(
    src_input: Iterable[str], src_output_file: IO[bytes], trg_input: Iterable[str], trg_output_file: IO[bytes]
) -> int:
    """Write both sides of the corpus in eflomal format and return the shared sentence count.

    Raises:
        ValueError: if the two sides contain a different number of sentences.
    """
    n_src_sents = to_eflomal_text_file(src_input, src_output_file)
    n_trg_sents = to_eflomal_text_file(trg_input, trg_output_file)
    if n_src_sents != n_trg_sents:
        raise ValueError("Mismatched file sizes")
    return n_src_sents


def tokenize(sent: str) -> Sequence[str]:
    """Split a sentence into tokens with the module-level Latin word tokenizer."""
    return list(TOKENIZER.tokenize(sent))


def normalize_for_alignment(sent: Sequence[str]) -> str:
    """Join tokens into one space-separated line after space-escaping, NFC normalization and lowercasing."""
    return " ".join(lowercase(normalize("NFC", escape_spaces(sent))))


# From silnlp.alignment.eflomal
class EflomalAligner:
    """Trains eflomal on a tokenized parallel corpus and symmetrizes the resulting link files."""

    def __init__(self, model_dir: Path) -> None:
        # Directory that receives the forward/reverse link files.
        self._model_dir = model_dir

    def train(self, src_toks: Sequence[Sequence[str]], trg_toks: Sequence[Sequence[str]]) -> None:
        """Run eflomal over the corpus and store the forward/reverse links in the model directory.

        Raises:
            RuntimeError: if the eflomal package is not installed.
            ValueError: if the two sides contain a different number of sentences.
            subprocess.CalledProcessError: if the eflomal executable fails.
        """
        _require_eflomal()

        self._model_dir.mkdir(parents=True, exist_ok=True)
        forward_links_path = self._model_dir / _FORWARD_LINKS_FILENAME
        reverse_links_path = self._model_dir / _REVERSE_LINKS_FILENAME

        if len(src_toks) == 0 and len(trg_toks) == 0:
            # Nothing to align. The iteration formula below divides by sqrt(n_sentences), so an empty corpus
            # would raise ZeroDivisionError; write empty link files so align() returns an empty list.
            forward_links_path.write_text("", encoding="utf-8")
            reverse_links_path.write_text("", encoding="utf-8")
            return

        with TemporaryDirectory() as temp_dir:
            src_eflomal_path = Path(temp_dir, "source")
            trg_eflomal_path = Path(temp_dir, "target")
            with ExitStack() as stack:
                src_output_file = stack.enter_context(src_eflomal_path.open("wb"))
                trg_output_file = stack.enter_context(trg_eflomal_path.open("wb"))
                # Write input files for the eflomal binary
                n_sentences = prepare_files(
                    [normalize_for_alignment(s) for s in src_toks],
                    src_output_file,
                    [normalize_for_alignment(s) for s in trg_toks],
                    trg_output_file,
                )
            # Both input files are closed (and therefore flushed) before the binary reads them.

            # Iteration counts shrink as the corpus grows; the final value is never below 2.
            iters = max(2, int(round(1.0 * 5000 / sqrt(n_sentences))))
            iters4 = max(1, iters // 4)
            n_iterations = (max(2, iters4), iters4, iters)

            # Run wrapper for the eflomal binary
            execute_eflomal(
                src_eflomal_path,
                trg_eflomal_path,
                forward_links_path,
                reverse_links_path,
                n_iterations,
            )

    def align(self, sym_heuristic: str = "grow-diag-final-and") -> List[str]:
        """Symmetrize the stored forward and reverse links and return one alignment string per sentence.

        Args:
            sym_heuristic: heuristic name; it is upper-cased and "-" is replaced by "_" to look up the
                matching ``SymmetrizationHeuristic`` member.

        Raises:
            KeyError: if the name does not match any ``SymmetrizationHeuristic`` member.
        """
        forward_align_path = self._model_dir / _FORWARD_LINKS_FILENAME
        reverse_align_path = self._model_dir / _REVERSE_LINKS_FILENAME

        alignments: List[str] = []
        heuristic = SymmetrizationHeuristic[sym_heuristic.upper().replace("-", "_")]
        with ExitStack() as stack:
            forward_file = stack.enter_context(forward_align_path.open("r", encoding="utf-8-sig"))
            reverse_file = stack.enter_context(reverse_align_path.open("r", encoding="utf-8-sig"))

            for forward_line, reverse_line in zip(forward_file, reverse_file):
                forward_matrix = to_word_alignment_matrix(forward_line.strip())
                reverse_matrix = to_word_alignment_matrix(reverse_line.strip())
                # Bring both matrices to a common shape before combining them.
                src_len = max(forward_matrix.row_count, reverse_matrix.row_count)
                trg_len = max(forward_matrix.column_count, reverse_matrix.column_count)

                forward_matrix.resize(src_len, trg_len)
                reverse_matrix.resize(src_len, trg_len)

                forward_matrix.symmetrize_with(reverse_matrix, heuristic)

                alignments.append(str(forward_matrix))

        return alignments
def _align(
    self,
    src_segments: Sequence[str],
    pretranslations: Sequence[PretranslationInfo],
    progress_reporter: PhasedProgressReporter,
    check_canceled: Optional[Callable[[], None]],
) -> Sequence[PretranslationInfo]:
    """Word-align each source segment with its pretranslation and store the result on the pretranslation.

    Every entry of ``pretranslations`` is updated in place with ``source_toks``, ``translation_toks`` and
    ``alignment``; the same sequence is returned.

    Args:
        src_segments: source texts, in the same order as ``pretranslations``.
        pretranslations: entries whose ``translation`` field already holds the translated text.
        progress_reporter: reporter whose next phase is the alignment phase.
        check_canceled: optional callback invoked between steps so the job can be canceled.

    Raises:
        RuntimeError: if the aligner returns a different number of alignments than there are pretranslations.
    """
    if check_canceled is not None:
        check_canceled()

    # The caller has already logged that alignment is starting, so it is not logged a second time here.
    # start_next_phase() is entered as a context manager, the same way _batch_inference enters it.
    with progress_reporter.start_next_phase():
        src_tokenized = [tokenize(s) for s in src_segments]
        trg_tokenized = [tokenize(pt_info["translation"]) for pt_info in pretranslations]

        with TemporaryDirectory() as td:
            aligner = EflomalAligner(Path(td))
            logger.info("Training aligner")
            aligner.train(src_tokenized, trg_tokenized)

            if check_canceled is not None:
                check_canceled()

            logger.info("Aligning pretranslations")
            alignments = aligner.align()

        if check_canceled is not None:
            check_canceled()

        # Fail with a descriptive error instead of an IndexError (or silently dropped data) on a count mismatch.
        if len(alignments) != len(pretranslations):
            raise RuntimeError(
                f"Expected {len(pretranslations)} alignments but the aligner returned {len(alignments)}."
            )

        for pretranslation, src_toks, trg_toks, alignment in zip(
            pretranslations, src_tokenized, trg_tokenized, alignments
        ):
            pretranslation["source_toks"] = list(src_toks)
            pretranslation["translation_toks"] = list(trg_toks)
            pretranslation["alignment"] = alignment

    return pretranslations
inference_batch_size: 1024 + align_pretranslations: false huggingface: parent_model_name: facebook/nllb-200-distilled-1.3B train_params: diff --git a/machine/jobs/translation_file_service.py b/machine/jobs/translation_file_service.py index 16e9f2e7..54c4ae90 100644 --- a/machine/jobs/translation_file_service.py +++ b/machine/jobs/translation_file_service.py @@ -16,6 +16,9 @@ class PretranslationInfo(TypedDict): textId: str # noqa: N815 refs: List[str] translation: str + source_toks: List[str] + translation_toks: List[str] + alignment: str SOURCE_FILENAME = "train.src.txt" @@ -62,6 +65,9 @@ def generator() -> Generator[PretranslationInfo, None, None]: textId=pi["textId"], refs=list(pi["refs"]), translation=pi["translation"], + source_toks=list(pi["source_toks"]), + translation_toks=list(pi["translation_toks"]), + alignment=pi["alignment"], ) return ContextManagedGenerator(generator()) diff --git a/poetry.lock b/poetry.lock index bd6b9039..4d8ded6e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -733,6 +733,79 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] +[[package]] +name = "cython" +version = "3.0.12" +description = "The Cython compiler for writing C extensions in the Python language." 
+optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +files = [ + {file = "Cython-3.0.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba67eee9413b66dd9fbacd33f0bc2e028a2a120991d77b5fd4b19d0b1e4039b9"}, + {file = "Cython-3.0.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bee2717e5b5f7d966d0c6e27d2efe3698c357aa4d61bb3201997c7a4f9fe485a"}, + {file = "Cython-3.0.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7cffc3464f641c8d0dda942c7c53015291beea11ec4d32421bed2f13b386b819"}, + {file = "Cython-3.0.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d3a8f81980ffbd74e52f9186d8f1654e347d0c44bfea6b5997028977f481a179"}, + {file = "Cython-3.0.12-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8d32856716c369d01f2385ad9177cdd1a11079ac89ea0932dc4882de1aa19174"}, + {file = "Cython-3.0.12-cp310-cp310-win32.whl", hash = "sha256:712c3f31adec140dc60d064a7f84741f50e2c25a8edd7ae746d5eb4d3ef7072a"}, + {file = "Cython-3.0.12-cp310-cp310-win_amd64.whl", hash = "sha256:d6945694c5b9170cfbd5f2c0d00ef7487a2de7aba83713a64ee4ebce7fad9e05"}, + {file = "Cython-3.0.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:feb86122a823937cc06e4c029d80ff69f082ebb0b959ab52a5af6cdd271c5dc3"}, + {file = "Cython-3.0.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfdbea486e702c328338314adb8e80f5f9741f06a0ae83aaec7463bc166d12e8"}, + {file = "Cython-3.0.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:563de1728c8e48869d2380a1b76bbc1b1b1d01aba948480d68c1d05e52d20c92"}, + {file = "Cython-3.0.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:398d4576c1e1f6316282aa0b4a55139254fbed965cba7813e6d9900d3092b128"}, + {file = "Cython-3.0.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1e5eadef80143026944ea8f9904715a008f5108d1d644a89f63094cc37351e73"}, + 
{file = "Cython-3.0.12-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5a93cbda00a5451175b97dea5a9440a3fcee9e54b4cba7a7dbcba9a764b22aec"}, + {file = "Cython-3.0.12-cp311-cp311-win32.whl", hash = "sha256:3109e1d44425a2639e9a677b66cd7711721a5b606b65867cb2d8ef7a97e2237b"}, + {file = "Cython-3.0.12-cp311-cp311-win_amd64.whl", hash = "sha256:d4b70fc339adba1e2111b074ee6119fe9fd6072c957d8597bce9a0dd1c3c6784"}, + {file = "Cython-3.0.12-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fe030d4a00afb2844f5f70896b7f2a1a0d7da09bf3aa3d884cbe5f73fff5d310"}, + {file = "Cython-3.0.12-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a7fec4f052b8fe173fe70eae75091389955b9a23d5cec3d576d21c5913b49d47"}, + {file = "Cython-3.0.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0faa5e39e5c8cdf6f9c3b1c3f24972826e45911e7f5b99cf99453fca5432f45e"}, + {file = "Cython-3.0.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d53de996ed340e9ab0fc85a88aaa8932f2591a2746e1ab1c06e262bd4ec4be7"}, + {file = "Cython-3.0.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ea3a0e19ab77266c738aa110684a753a04da4e709472cadeff487133354d6ab8"}, + {file = "Cython-3.0.12-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c151082884be468f2f405645858a857298ac7f7592729e5b54788b5c572717ba"}, + {file = "Cython-3.0.12-cp312-cp312-win32.whl", hash = "sha256:3083465749911ac3b2ce001b6bf17f404ac9dd35d8b08469d19dc7e717f5877a"}, + {file = "Cython-3.0.12-cp312-cp312-win_amd64.whl", hash = "sha256:c0b91c7ebace030dd558ea28730de8c580680b50768e5af66db2904a3716c3e3"}, + {file = "Cython-3.0.12-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4ee6f1ea1bead8e6cbc4e64571505b5d8dbdb3b58e679d31f3a84160cebf1a1a"}, + {file = "Cython-3.0.12-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57aefa6d3341109e46ec1a13e3a763aaa2cbeb14e82af2485b318194be1d9170"}, + {file = 
"Cython-3.0.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:879ae9023958d63c0675015369384642d0afb9c9d1f3473df9186c42f7a9d265"}, + {file = "Cython-3.0.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36fcd584dae547de6f095500a380f4a0cce72b7a7e409e9ff03cb9beed6ac7a1"}, + {file = "Cython-3.0.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:62b79dcc0de49efe9e84b9d0e2ae0a6fc9b14691a65565da727aa2e2e63c6a28"}, + {file = "Cython-3.0.12-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4aa255781b093a8401109d8f2104bbb2e52de7639d5896aefafddc85c30e0894"}, + {file = "Cython-3.0.12-cp313-cp313-win32.whl", hash = "sha256:77d48f2d4bab9fe1236eb753d18f03e8b2619af5b6f05d51df0532a92dfb38ab"}, + {file = "Cython-3.0.12-cp313-cp313-win_amd64.whl", hash = "sha256:86c304b20bd57c727c7357e90d5ba1a2b6f1c45492de2373814d7745ef2e63b4"}, + {file = "Cython-3.0.12-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ff5c0b6a65b08117d0534941d404833d516dac422eee88c6b4fd55feb409a5ed"}, + {file = "Cython-3.0.12-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:680f1d6ed4436ae94805db264d6155ed076d2835d84f20dcb31a7a3ad7f8668c"}, + {file = "Cython-3.0.12-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc24609613fa06d0d896309f7164ba168f7e8d71c1e490ed2a08d23351c3f41"}, + {file = "Cython-3.0.12-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1879c073e2b34924ce9b7ca64c212705dcc416af4337c45f371242b2e5f6d32"}, + {file = "Cython-3.0.12-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:bfb75123dd4ff767baa37d7036da0de2dfb6781ff256eef69b11b88b9a0691d1"}, + {file = "Cython-3.0.12-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:f39640f8df0400cde6882e23c734f15bb8196de0a008ae5dc6c8d1ec5957d7c8"}, + {file = "Cython-3.0.12-cp36-cp36m-win32.whl", hash = 
"sha256:8c9efe9a0895abee3cadfdad4130b30f7b5e57f6e6a51ef2a44f9fc66a913880"}, + {file = "Cython-3.0.12-cp36-cp36m-win_amd64.whl", hash = "sha256:63d840f2975e44d74512f8f34f1f7cb8121c9428e26a3f6116ff273deb5e60a2"}, + {file = "Cython-3.0.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:75c5acd40b97cff16fadcf6901a91586cbca5dcdba81f738efaf1f4c6bc8dccb"}, + {file = "Cython-3.0.12-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e62564457851db1c40399bd95a5346b9bb99e17a819bf583b362f418d8f3457a"}, + {file = "Cython-3.0.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ccd1228cc203b1f1b8a3d403f5a20ad1c40e5879b3fbf5851ce09d948982f2c"}, + {file = "Cython-3.0.12-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25529ee948f44d9a165ff960c49d4903267c20b5edf2df79b45924802e4cca6e"}, + {file = "Cython-3.0.12-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:90cf599372c5a22120609f7d3a963f17814799335d56dd0dcf8fe615980a8ae1"}, + {file = "Cython-3.0.12-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:9f8c48748a9c94ea5d59c26ab49ad0fad514d36f894985879cf3c3ca0e600bf4"}, + {file = "Cython-3.0.12-cp37-cp37m-win32.whl", hash = "sha256:3e4fa855d98bc7bd6a2049e0c7dc0dcf595e2e7f571a26e808f3efd84d2db374"}, + {file = "Cython-3.0.12-cp37-cp37m-win_amd64.whl", hash = "sha256:120681093772bf3600caddb296a65b352a0d3556e962b9b147efcfb8e8c9801b"}, + {file = "Cython-3.0.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:731d719423e041242c9303c80cae4327467299b90ffe62d4cc407e11e9ea3160"}, + {file = "Cython-3.0.12-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3238a29f37999e27494d120983eca90d14896b2887a0bd858a381204549137a"}, + {file = "Cython-3.0.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b588c0a089a9f4dd316d2f9275230bad4a7271e5af04e1dc41d2707c816be44b"}, + {file = 
"Cython-3.0.12-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ab9f5198af74eb16502cc143cdde9ca1cbbf66ea2912e67440dd18a36e3b5fa"}, + {file = "Cython-3.0.12-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8ee841c0e114efa1e849c281ac9b8df8aa189af10b4a103b1c5fd71cbb799679"}, + {file = "Cython-3.0.12-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:43c48b5789398b228ea97499f5b864843ba9b1ab837562a9227c6f58d16ede8b"}, + {file = "Cython-3.0.12-cp38-cp38-win32.whl", hash = "sha256:5e5f17c48a4f41557fbcc7ee660ccfebe4536a34c557f553b6893c1b3c83df2d"}, + {file = "Cython-3.0.12-cp38-cp38-win_amd64.whl", hash = "sha256:309c081057930bb79dc9ea3061a1af5086c679c968206e9c9c2ec90ab7cb471a"}, + {file = "Cython-3.0.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54115fcc126840926ff3b53cfd2152eae17b3522ae7f74888f8a41413bd32f25"}, + {file = "Cython-3.0.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:629db614b9c364596d7c975fa3fb3978e8c5349524353dbe11429896a783fc1e"}, + {file = "Cython-3.0.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:af081838b0f9e12a83ec4c3809a00a64c817f489f7c512b0e3ecaf5f90a2a816"}, + {file = "Cython-3.0.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:34ce459808f7d8d5d4007bc5486fe50532529096b43957af6cbffcb4d9cc5c8d"}, + {file = "Cython-3.0.12-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d6c6cd6a75c8393e6805d17f7126b96a894f310a1a9ea91c47d141fb9341bfa8"}, + {file = "Cython-3.0.12-cp39-cp39-win32.whl", hash = "sha256:a4032e48d4734d2df68235d21920c715c451ac9de15fa14c71b378e8986b83be"}, + {file = "Cython-3.0.12-cp39-cp39-win_amd64.whl", hash = "sha256:dcdc3e5d4ce0e7a4af6903ed580833015641e968d18d528d8371e2435a34132c"}, + {file = "Cython-3.0.12-py2.py3-none-any.whl", hash = "sha256:0038c9bae46c459669390e53a1ec115f8096b2e4647ae007ff1bf4e6dee92806"}, + {file = "cython-3.0.12.tar.gz", hash = 
"sha256:b988bb297ce76c671e28c97d017b95411010f7c77fa6623dd0bb47eed1aee1bc"}, +] + [[package]] name = "datasets" version = "2.21.0" @@ -881,6 +954,23 @@ toml = ["toml"] vault = ["hvac"] yaml = ["ruamel.yaml"] +[[package]] +name = "eflomal" +version = "2.0.0" +description = "pip installable eflomal" +optional = false +python-versions = "*" +files = [ + {file = "eflomal-2.0.0.tar.gz", hash = "sha256:b71183dcf85bf4f59f44ef7a59f5268df1c17c0c8d8093f77b220025ffdba100"}, +] + +[package.dependencies] +Cython = "*" +numpy = "*" + +[package.extras] +test = ["pytest"] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -4780,11 +4870,11 @@ type = ["pytest-mypy"] [extras] huggingface = ["datasets", "sacremoses", "transformers"] -jobs = ["clearml", "dynaconf", "json-stream"] +jobs = ["clearml", "dynaconf", "eflomal", "json-stream"] sentencepiece = ["sentencepiece"] thot = ["sil-thot"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "ff353baa0a9c4519a6bef585b095c141da9c20b6dad4ef47c0af3ea57c92e6ee" +content-hash = "b650f3e8499b348a527c5e5f0e89ba90e55fb7df93bb907cc8d8e5fdd6b63cb0" diff --git a/pyproject.toml b/pyproject.toml index 8f6527d9..822c5ee5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ botocore = "^1.35.41" boto3 = "^1.19.41" dynaconf = "^3.2.5" json-stream = "^1.3.0" +eflomal = { markers = "sys_platform == 'linux'", version = "^2.0.0" } [tool.poetry.group.dev.dependencies] pytest = "^8.3.2" @@ -95,7 +96,7 @@ accelerate = { version = "^0.26.1", markers = "sys_platform == 'win32' or sys_pl sentencepiece = ["sentencepiece"] thot = ["sil-thot"] huggingface = ["transformers", "datasets", "sacremoses"] -jobs = ["clearml", "json-stream", "dynaconf"] +jobs = ["clearml", "json-stream", "dynaconf", "eflomal"] [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py index a5e416d6..227b909b 100644 --- 
a/tests/jobs/test_nmt_engine_build_job.py +++ b/tests/jobs/test_nmt_engine_build_job.py @@ -17,6 +17,7 @@ PretranslationInfo, TranslationFileService, ) +from machine.jobs.eflomal_aligner import is_eflomal_available from machine.translation import ( Phrase, Trainer, @@ -36,6 +37,23 @@ def test_run(decoy: Decoy) -> None: pretranslations = json.loads(env.target_pretranslations) assert len(pretranslations) == 1 assert pretranslations[0]["translation"] == "Please, I have booked a room." + if is_eflomal_available(): + assert pretranslations[0]["source_toks"] == [ + "Por", + "favor", + ",", + "tengo", + "reservada", + "una", + "habitación", + ".", + ] + assert pretranslations[0]["translation_toks"] == ["Please", ",", "I", "have", "booked", "a", "room", "."] + assert len(pretranslations[0]["alignment"]) > 0 + else: + assert pretranslations[0]["source_toks"] == [] + assert pretranslations[0]["translation_toks"] == [] + assert len(pretranslations[0]["alignment"]) == 0 decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1) @@ -113,6 +131,9 @@ def __init__(self, decoy: Decoy) -> None: textId="text1", refs=["ref1"], translation="Por favor, tengo reservada una habitación.", + source_toks=[], + translation_toks=[], + alignment="", ) ] ) @@ -134,7 +155,15 @@ def open_target_pretranslation_writer(env: _TestEnvironment) -> Iterator[DictToJ ) self.job = NmtEngineBuildJob( - MockSettings({"src_lang": "es", "trg_lang": "en", "save_model": "save-model", "inference_batch_size": 100}), + MockSettings( + { + "src_lang": "es", + "trg_lang": "en", + "save_model": "save-model", + "inference_batch_size": 100, + "align_pretranslations": True, + } + ), self.nmt_model_factory, self.translation_file_service, ) diff --git a/tests/jobs/test_smt_engine_build_job.py b/tests/jobs/test_smt_engine_build_job.py index 0cf2d948..16afcacf 100644 --- a/tests/jobs/test_smt_engine_build_job.py +++ b/tests/jobs/test_smt_engine_build_job.py @@ -137,6 
+137,9 @@ def __init__(self, decoy: Decoy) -> None: textId="text1", refs=["ref1"], translation="Por favor, tengo reservada una habitación.", + source_toks=[], + translation_toks=[], + alignment="", ) ] ) @@ -158,7 +161,14 @@ def open_target_pretranslation_writer(env: _TestEnvironment) -> Iterator[DictToJ ) self.job = SmtEngineBuildJob( - MockSettings({"build_id": "mybuild", "inference_batch_size": 100, "thot_mt": {"tokenizer": "latin"}}), + MockSettings( + { + "build_id": "mybuild", + "inference_batch_size": 100, + "thot_mt": {"tokenizer": "latin"}, + "align_pretranslations": False, + } + ), self.smt_model_factory, self.translation_file_service, )