├── .gitignore ├── LICENSE ├── README.md ├── giza.py ├── giza_aligner.py ├── lexicon.py ├── requirements.txt └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 |
140 | .bin/
141 | .vscode/
142 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 SIL International
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Giza-py: MGIZA++ Command-line Runner
2 |
3 | Giza-py is a simple, Python-based command-line runner for MGIZA++, a popular tool for building word alignment models.
4 |
5 | ## Installation
6 |
7 | ### Python
8 |
9 | Giza-py requires [Python 3.7](https://www.python.org/downloads/) or greater.
10 |
11 | ### Giza-py
12 |
13 | To install Giza-py, clone the repo and install the pip dependencies:
14 |
15 | ```
16 | git clone https://github.com/sillsdev/giza-py.git
17 | cd giza-py
18 | pip install -r requirements.txt
19 | ```
20 |
21 | ### MGIZA++
22 |
23 | To install MGIZA++ on Linux/macOS, follow these steps:
24 |
25 | 1. Download the [Boost C++ library](https://www.boost.org/) and unzip it.
26 | 2. Build Boost:
27 |
28 | ```
29 | cd <boost dir>
30 | ./bootstrap.sh --prefix=./build --with-libraries=thread,system
31 | ./b2 install
32 | ```
33 |
34 | 3. Clone the MGIZA++ repo:
35 |
36 | ```
37 | git clone https://github.com/moses-smt/mgiza.git
38 | ```
39 |
40 | 4. Build MGIZA++ (CMake is required):
41 |
42 | ```
43 | cd <mgiza dir>/mgizapp
44 | cmake -DBOOST_ROOT=<boost dir>/build -DBoost_USE_STATIC_LIBS=ON -DCMAKE_INSTALL_PREFIX=<giza-py dir>/.bin .
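# <boost dir> is the unzipped Boost folder from step 2, <mgiza dir> is the cloned mgiza repo,
# and <giza-py dir> is the cloned giza-py repo; installing into its .bin folder lets
# giza.py's default --bin .bin locate the MGIZA++ binaries.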
45 | make
46 | make install
47 | ```
48 |
49 | ## Usage
50 |
51 | ### Generating alignments
52 |
53 | To generate alignments using MGIZA++, run the following command:
54 |
55 | ```
56 | python3 giza.py --source <source corpus> --target <target corpus> --alignments <output alignments>
57 | ```
58 |
59 | The source and target corpus files must be text files where tokens are separated by spaces. Giza-py will output the alignments in Pharaoh format.
60 |
61 | Alignment probabilities for each aligned word pair can be output by using the `--include-probs` argument. Giza-py will include alignment probabilities in the generated alignment file. The probabilities are separated from each word pair using a colon `:` delimiter. Here is an example of the Pharaoh format with probabilities included:
62 |
63 | ```
64 | 7-0:0.22661511 5-3:0.4715056 3-6:0.67267063 1-7:0.10234439
65 | 0-0:0.75820181 4-1:0.24716581 8-4:0.72411429
66 | ```
67 |
68 | _Note: The probabilities included in the alignment file are only alignment probabilities and do not include translation probabilities. If you want translation probabilities, they can be obtained by [generating a lexicon](#generating-a-lexicon)._
69 |
70 | ### Models
71 |
72 | By default, Giza-py will generate alignments using the IBM-4 model. To specify a different model, use the `--model` argument.
73 |
74 | ```
75 | python3 giza.py --source <source corpus> --target <target corpus> --alignments <output alignments> --model hmm
76 | ```
77 |
78 | The number of iterations for each stage of training can be specified using the `--m{model_number}` arguments. The following example will train an IBM-4 model with 10 iterations for the IBM-1 stage:
79 |
80 | ```
81 | python3 giza.py --source <source corpus> --target <target corpus> --alignments <output alignments> --m1 10
82 | ```
83 |
84 | The following are the parameters for configuring the number of iterations for each supported model:
85 |
86 | - ibm1
87 |   - m1: IBM-1 (default: 5 iterations)
88 | - ibm2
89 |   - m1: IBM-1 (default: 5 iterations)
90 |   - m2: IBM-2 (default: 5 iterations)
91 | - hmm
92 |   - m1: IBM-1 (default: 5 iterations)
93 |   - mh: HMM (default: 5 iterations)
94 | - ibm3
95 |   - m1: IBM-1 (default: 5 iterations)
96 |   - mh: HMM (default: 5 iterations)
97 |   - m3: IBM-3 (default: 5 iterations)
98 | - ibm4
99 |   - m1: IBM-1 (default: 5 iterations)
100 |   - mh: HMM (default: 5 iterations)
101 |   - m3: IBM-3 (default: 5 iterations)
102 |   - m4: IBM-4 (default: 5 iterations)
103 |
104 | ### Symmetrization
105 |
106 | Giza-py generates symmetrized alignments using direct and inverse alignment models. By default, Giza-py will symmetrize alignments using the "grow-diag-final-and" heuristic. To specify a different heuristic, use the `--sym-heuristic` argument.
107 |
108 | ```
109 | python3 giza.py --source <source corpus> --target <target corpus> --alignments <output alignments> --sym-heuristic intersection
110 | ```
111 |
112 | Giza-py supports many different symmetrization heuristics:
113 |
114 | - union
115 | - intersection
116 | - och
117 | - grow
118 | - grow-diag
119 | - grow-diag-final
120 | - grow-diag-final-and
121 |
122 | ### Generating a lexicon
123 |
124 | Giza-py can also extract a bilingual lexicon from the trained alignment model.
125 |
126 | ```
127 | python3 giza.py --source <source corpus> --target <target corpus> --lexicon <output lexicon>
128 | ```
129 |
130 | The lexicon is extracted as a tab-separated text file. The score for each word pair is the maximum probability from the direct and inverse alignment models.
131 |
132 | The lexicon can be filtered by using the `--lexicon-threshold` argument. Giza-py will filter out all translations with a probability that is less than or equal to the specified threshold.
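### Example: reading the alignment file

For reference, here is a minimal, hypothetical parser for the alignment files that Giza-py writes (the `read_pharaoh` helper below is not part of this repo). It handles both the plain Pharaoh format and the `--include-probs` variant:

```
from pathlib import Path
from typing import Dict, List, Tuple


def read_pharaoh(path: Path) -> List[Dict[Tuple[int, int], float]]:
    """Return one {(src_index, trg_index): prob} dict per sentence pair."""
    sentences: List[Dict[Tuple[int, int], float]] = []
    for line in path.read_text(encoding="utf-8").splitlines():
        pairs: Dict[Tuple[int, int], float] = {}
        for token in line.split():
            # "7-0:0.22661511" with --include-probs, otherwise just "7-0"
            word_pair, _, prob_str = token.partition(":")
            src_index_str, trg_index_str = word_pair.split("-")
            pairs[(int(src_index_str), int(trg_index_str))] = float(prob_str) if prob_str else 1.0
        sentences.append(pairs)
    return sentences
```

An empty line (written for empty sentence pairs) yields an empty dict, so the result stays parallel with the input corpora.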
133 |
--------------------------------------------------------------------------------
/giza.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import tempfile
3 | from pathlib import Path
4 | from typing import List
5 | from giza_aligner import HmmGizaAligner, Ibm1GizaAligner, Ibm2GizaAligner, Ibm3GizaAligner, Ibm4GizaAligner
6 |
7 |
8 | def main() -> None:
9 |     parser = argparse.ArgumentParser(description="Aligns the parallel corpus for an experiment")
10 |     parser.add_argument("--bin", type=str, default=".bin", metavar="PATH", help="The mgiza++ folder")
11 |     parser.add_argument("--source", type=str, required=True, metavar="PATH", help="The source corpus")
12 |     parser.add_argument("--target", type=str, required=True, metavar="PATH", help="The target corpus")
13 |     parser.add_argument("--alignments", type=str, default=None, metavar="PATH", help="The output alignments")
14 |     parser.add_argument(
15 |         "--include-probs",
16 |         default=False,
17 |         action="store_true",
18 |         help="Include alignment probabilities in output alignments",
19 |     )
20 |     parser.add_argument("--lexicon", type=str, default=None, metavar="PATH", help="The output lexicon")
21 |     parser.add_argument(
22 |         "--lexicon-threshold", type=float, default=0.0, metavar="THRESHOLD", help="The lexicon probability threshold"
23 |     )
24 |     parser.add_argument(
25 |         "--model",
26 |         type=str,
27 |         choices=["ibm1", "ibm2", "hmm", "ibm3", "ibm4"],
28 |         default="ibm4",
29 |         help="The word alignment model",
30 |     )
31 |     parser.add_argument(
32 |         "--sym-heuristic",
33 |         type=str,
34 |         choices=["union", "intersection", "och", "grow", "grow-diag", "grow-diag-final", "grow-diag-final-and"],
35 |         default="grow-diag-final-and",
36 |         help="The symmetrization heuristic",
37 |     )
38 |     parser.add_argument("--m1", type=int, default=None, metavar="ITERATIONS", help="The number of IBM-1 iterations")
39 |     parser.add_argument("--m2", type=int, default=None, metavar="ITERATIONS", help="The number of IBM-2 iterations")
40 |     parser.add_argument("--mh", type=int, default=None, metavar="ITERATIONS", help="The number of HMM iterations")
41 |     parser.add_argument("--m3", type=int, default=None, metavar="ITERATIONS", help="The number of IBM-3 iterations")
42 |     parser.add_argument("--m4", type=int, default=None, metavar="ITERATIONS", help="The number of IBM-4 iterations")
43 |     parser.add_argument("--maxsentencelength", type=int, default=101, metavar="LENGTH", help="The maximum sentence length")
44 |     parser.add_argument("--maxfertility", type=int, default=10, metavar="FERTILITY", help="The maximum fertility parameter")
45 |     parser.add_argument("--quiet", default=False, action="store_true", help="Quiet display")
46 |     args = parser.parse_args()
47 |
48 |     bin_dir = Path(args.bin)
49 |
50 |     model: str = args.model
51 |     model = model.lower()
52 |
53 |     optArgs: List[str] = [
54 |         "-ml", str(args.maxsentencelength),
55 |         "-maxfertility", str(args.maxfertility)
56 |     ]
57 |
58 |     with tempfile.TemporaryDirectory() as td:
59 |         temp_dir = Path(td)
60 |         if model == "ibm1":
61 |             aligner = Ibm1GizaAligner(bin_dir, temp_dir, m1=args.m1)
62 |         elif model == "ibm2":
63 |             aligner = Ibm2GizaAligner(bin_dir, temp_dir, m1=args.m1, m2=args.m2)
64 |         elif model == "hmm":
65 |             aligner = HmmGizaAligner(bin_dir, temp_dir, m1=args.m1, mh=args.mh)
66 |         elif model == "ibm3":
67 |             aligner = Ibm3GizaAligner(bin_dir, temp_dir, m1=args.m1, m2=args.m2, mh=args.mh, m3=args.m3)
68 |         elif model == "ibm4":
69 |             aligner = Ibm4GizaAligner(bin_dir, temp_dir, m1=args.m1, m2=args.m2,
mh=args.mh, m3=args.m3, m4=args.m4) 70 | else: 71 | raise RuntimeError("Invalid model type.") 72 | 73 | source_path = Path(args.source) 74 | target_path = Path(args.target) 75 | print("Training...", end="" if args.quiet else "\n", flush=args.quiet) 76 | aligner.train(source_path, target_path, quiet=args.quiet, optArgs=optArgs) 77 | if args.quiet: 78 | print(" done.") 79 | 80 | if args.alignments is not None: 81 | alignments_file_path = Path(args.alignments) 82 | print("Generating alignments...", end="", flush=True) 83 | aligner.align(alignments_file_path, args.include_probs, args.sym_heuristic) 84 | print(" done.") 85 | if args.lexicon is not None: 86 | lexicon_path = Path(args.lexicon) 87 | print("Extracting lexicon...", end="", flush=True) 88 | aligner.extract_lexicon(lexicon_path, args.lexicon_threshold) 89 | print(" done.") 90 | 91 | 92 | if __name__ == "__main__": 93 | main() 94 | -------------------------------------------------------------------------------- /giza_aligner.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import shutil 3 | import subprocess 4 | from bisect import insort_left 5 | from math import ceil 6 | from pathlib import Path 7 | from typing import IO, Any, Dict, Iterable, Iterator, List, Optional, Set, TextIO, Tuple 8 | 9 | from machine.translation import SymmetrizationHeuristic, WordAlignmentMatrix 10 | 11 | from lexicon import Lexicon 12 | from utils import load_corpus, parse_giza_alignments, remove_bom_inplace, write_corpus 13 | 14 | MAX_SENT_LENGTH = 101 15 | PROB_SMOOTH = 1e-7 16 | IBM4_SMOOTH_FACTOR = 0.2 17 | 18 | 19 | class GizaAligner: 20 | def __init__( 21 | self, 22 | bin_dir: Path, 23 | model_dir: Path, 24 | m1: Optional[int] = None, 25 | m2: Optional[int] = None, 26 | mh: Optional[int] = None, 27 | m3: Optional[int] = None, 28 | m4: Optional[int] = None, 29 | ) -> None: 30 | self.bin_dir = bin_dir 31 | self.model_dir = model_dir 32 | self.m1 = m1 33 | self.m2 = m2 34 | self.mh = mh 35 | self.m3 = m3 36 | self.m4 = m4 37 | 38 | @property 39 | def file_suffix(self) -> str: 40 | suffix = "" 41 | if self.m3 is None or self.m3 > 0 or self.m4 is None or self.m4 > 0: 42 | suffix = "3.final" 43 | elif self.mh is None or self.mh > 0: 44 | suffix = f"hmm.{5 if self.mh is None else self.mh}" 45 | elif self.m2 is not None and self.m2 > 0: 46 | suffix = f"2.{self.m2}" 47 | elif self.m1 is None or self.m1 > 0: 48 | suffix = f"1.{5 if self.m1 is None else self.m1}" 49 | return suffix 50 | 51 | def train(self, src_file_path: Path, trg_file_path: Path, quiet: bool = False, optArgs: List[str] = []) -> None: 52 | self.model_dir.mkdir(exist_ok=True) 53 | dest_src_file_path = self.model_dir / "src.txt" 54 | shutil.copyfile(src_file_path, dest_src_file_path) 55 | src_file_path = dest_src_file_path 56 | dest_trg_file_path = self.model_dir / "trg.txt" 57 | shutil.copyfile(trg_file_path, dest_trg_file_path) 58 | trg_file_path = dest_trg_file_path 59 | 60 | remove_bom_inplace(src_file_path) 61 | remove_bom_inplace(trg_file_path) 62 | 63 | if self.m4 is None or self.m4 > 0: 64 | self._execute_mkcls(src_file_path, "src", quiet) 65 | self._execute_mkcls(trg_file_path, "trg", quiet) 66 | 67 | src_trg_snt_file_path, trg_src_snt_file_path = self._execute_plain2snt( 68 | src_file_path, trg_file_path, "src", "trg", quiet 69 | ) 70 | 71 | self._execute_snt2cooc(src_trg_snt_file_path, quiet) 72 | self._execute_snt2cooc(trg_src_snt_file_path, quiet) 73 | 74 | src_trg_prefix = src_trg_snt_file_path.with_suffix("") 75 | 
src_trg_output_prefix = src_trg_prefix.parent / (src_trg_prefix.name + "_invswm") 76 | self._execute_mgiza(src_trg_snt_file_path, src_trg_output_prefix, quiet, optArgs=optArgs) 77 | src_trg_alignments_file_path = src_trg_output_prefix.with_suffix(f".A{self.file_suffix}.all") 78 | self._save_alignments(src_trg_output_prefix, src_trg_alignments_file_path) 79 | 80 | trg_src_output_prefix = src_trg_prefix.parent / (src_trg_prefix.name + "_swm") 81 | self._execute_mgiza(trg_src_snt_file_path, trg_src_output_prefix, quiet, optArgs=optArgs) 82 | trg_src_alignments_file_path = trg_src_output_prefix.with_suffix(f".A{self.file_suffix}.all") 83 | self._save_alignments(trg_src_output_prefix, trg_src_alignments_file_path) 84 | 85 | def align( 86 | self, 87 | alignments_file_path: Path, 88 | include_probs: bool = False, 89 | sym_heuristic: str = "grow-diag-final-and", 90 | ) -> None: 91 | src_trg_alignments_file_path = self.model_dir / f"src_trg_invswm.A{self.file_suffix}.all" 92 | trg_src_alignments_file_path = self.model_dir / f"src_trg_swm.A{self.file_suffix}.all" 93 | sym_alignments_file_path = self.model_dir / "alignments.txt" 94 | self._symmetrize( 95 | src_trg_alignments_file_path, 96 | trg_src_alignments_file_path, 97 | sym_alignments_file_path, 98 | sym_heuristic, 99 | ) 100 | 101 | src_file_path = self.model_dir / "src.txt" 102 | trg_file_path = self.model_dir / "trg.txt" 103 | 104 | with open(alignments_file_path, "w", encoding="utf-8", newline="\n") as alignments_file, open( 105 | sym_alignments_file_path, 106 | "r", 107 | encoding="utf-8-sig", 108 | ) as sym_alignments_file: 109 | alignment_probs_data: Any = None 110 | direct_alignments_file: Optional[IO] = None 111 | inverse_alignments_file: Optional[IO] = None 112 | direct_alignments: Optional[Iterator[Set[Tuple[int, int]]]] = None 113 | inverse_alignments: Optional[Iterator[Set[Tuple[int, int]]]] = None 114 | if include_probs: 115 | alignment_probs_data = self._init_alignment_probs_data() 116 | direct_alignments_file = open(src_trg_alignments_file_path, "r", encoding="utf-8-sig") 117 | direct_alignments = iter(parse_giza_alignments(direct_alignments_file)) 118 | inverse_alignments_file = open(trg_src_alignments_file_path, "r", encoding="utf-8-sig") 119 | inverse_alignments = iter(parse_giza_alignments(inverse_alignments_file)) 120 | try: 121 | for src_str, trg_str in zip(load_corpus(src_file_path), load_corpus(trg_file_path)): 122 | if len(src_str) == 0 or len(trg_str) == 0: 123 | alignments_file.write("\n") 124 | continue 125 | 126 | src_tokens = src_str.split() 127 | trg_tokens = trg_str.split() 128 | alignment_str = sym_alignments_file.readline().strip() 129 | 130 | if direct_alignments is not None and inverse_alignments is not None: 131 | direct_alignment = next(direct_alignments) 132 | inverse_alignment = next(inverse_alignments) 133 | 134 | direct_probs = self._get_alignment_probs( 135 | alignment_probs_data, src_tokens, trg_tokens, direct_alignment, True 136 | ) 137 | inverse_probs = self._get_alignment_probs( 138 | alignment_probs_data, trg_tokens, src_tokens, inverse_alignment, False 139 | ) 140 | 141 | new_alignment_str = "" 142 | for word_pair_str in alignment_str.split(): 143 | src_index_str, trg_index_str = word_pair_str.split("-", maxsplit=2) 144 | src_index = int(src_index_str) 145 | trg_index = int(trg_index_str) 146 | direct_prob = direct_probs.get((src_index, trg_index), 0.0) 147 | inverse_prob = inverse_probs.get((trg_index, src_index), 0.0) 148 | prob = round(max(direct_prob, inverse_prob), 8) 149 | if 
len(new_alignment_str) != 0: 150 | new_alignment_str += " " 151 | new_alignment_str += f"{src_index}-{trg_index}:{prob}" 152 | alignment_str = new_alignment_str 153 | alignments_file.write(alignment_str + "\n") 154 | finally: 155 | if direct_alignments_file is not None: 156 | direct_alignments_file.close() 157 | if inverse_alignments_file is not None: 158 | inverse_alignments_file.close() 159 | 160 | def extract_lexicon(self, out_file_path: Path, threshold: float = 0.0) -> None: 161 | src_vocab = self._load_vocab("src") 162 | trg_vocab = self._load_vocab("trg") 163 | direct_lexicon = self._load_lexicon(src_vocab, trg_vocab, "invswm", threshold=threshold) 164 | inverse_lexicon = self._load_lexicon(trg_vocab, src_vocab, "swm", threshold=threshold) 165 | lexicon = Lexicon.symmetrize(direct_lexicon, inverse_lexicon, threshold=threshold) 166 | lexicon.write(out_file_path) 167 | 168 | def _execute_mkcls(self, input_file_path: Path, output_prefix: str, quiet: bool) -> None: 169 | mkcls_path = self.bin_dir / "mkcls" 170 | if platform.system() == "Windows": 171 | mkcls_path = mkcls_path.with_suffix(".exe") 172 | if not mkcls_path.is_file(): 173 | raise RuntimeError("mkcls is not installed.") 174 | 175 | output_file_path = self.model_dir / f"{output_prefix}.vcb.classes" 176 | 177 | args: List[str] = [ 178 | str(mkcls_path), 179 | "-n10", 180 | f"-p{input_file_path}", 181 | f"-V{output_file_path}", 182 | ] 183 | subprocess.run(args, stdout=subprocess.DEVNULL if quiet else None, stderr=subprocess.DEVNULL if quiet else None) 184 | 185 | def _execute_plain2snt( 186 | self, src_file_path: Path, trg_file_path: Path, output_src_prefix: str, output_trg_prefix: str, quiet: bool 187 | ) -> Tuple[Path, Path]: 188 | plain2snt_path = self.bin_dir / "plain2snt" 189 | if platform.system() == "Windows": 190 | plain2snt_path = plain2snt_path.with_suffix(".exe") 191 | if not plain2snt_path.is_file(): 192 | raise RuntimeError("plain2snt is not installed.") 193 | 194 | src_trg_snt_file_path = self.model_dir / f"{output_src_prefix}_{output_trg_prefix}.snt" 195 | trg_src_snt_file_path = self.model_dir / f"{output_trg_prefix}_{output_src_prefix}.snt" 196 | 197 | args: List[str] = [ 198 | str(plain2snt_path), 199 | str(src_file_path), 200 | str(trg_file_path), 201 | "-vcb1", 202 | str(self.model_dir / f"{output_src_prefix}.vcb"), 203 | "-vcb2", 204 | str(self.model_dir / f"{output_trg_prefix}.vcb"), 205 | "-snt1", 206 | str(src_trg_snt_file_path), 207 | "-snt2", 208 | str(trg_src_snt_file_path), 209 | ] 210 | subprocess.run(args, stdout=subprocess.DEVNULL if quiet else None, stderr=subprocess.DEVNULL) 211 | return src_trg_snt_file_path, trg_src_snt_file_path 212 | 213 | def _execute_snt2cooc(self, snt_file_path: Path, quiet: bool) -> None: 214 | snt2cooc_path = self.bin_dir / "snt2cooc" 215 | if platform.system() == "Windows": 216 | snt2cooc_path = snt2cooc_path.with_suffix(".exe") 217 | if not snt2cooc_path.is_file(): 218 | raise RuntimeError("snt2cooc is not installed.") 219 | 220 | snt_dir = snt_file_path.parent 221 | prefix = snt_file_path.stem 222 | prefix1, prefix2 = prefix.split("_", maxsplit=2) 223 | 224 | args: List[str] = [ 225 | str(snt2cooc_path), 226 | str(self.model_dir / f"{prefix}.cooc"), 227 | str(snt_dir / f"{prefix1}.vcb"), 228 | str(snt_dir / f"{prefix2}.vcb"), 229 | str(snt_file_path), 230 | ] 231 | subprocess.run(args, stdout=subprocess.DEVNULL if quiet else None, stderr=subprocess.DEVNULL) 232 | 233 | def _execute_mgiza(self, snt_file_path: Path, output_path: Path, quiet: bool, optArgs: List[str]=[]) 
-> None: 234 | mgiza_path = self.bin_dir / "mgiza" 235 | if platform.system() == "Windows": 236 | mgiza_path = mgiza_path.with_suffix(".exe") 237 | if not mgiza_path.is_file(): 238 | raise RuntimeError("mgiza is not installed.") 239 | 240 | snt_dir = snt_file_path.parent 241 | prefix = snt_file_path.stem 242 | prefix1, prefix2 = prefix.split("_", maxsplit=2) 243 | 244 | args: List[str] = [ 245 | str(mgiza_path), 246 | "-C", 247 | str(snt_file_path), 248 | "-CoocurrenceFile", 249 | str(snt_dir / f"{prefix}.cooc"), 250 | "-S", 251 | str(snt_dir / f"{prefix1}.vcb"), 252 | "-T", 253 | str(snt_dir / f"{prefix2}.vcb"), 254 | "-o", 255 | str(output_path), 256 | ] + optArgs 257 | 258 | if self.m1 is not None: 259 | args.extend(["-m1", str(self.m1)]) 260 | if self.m2 is not None and (self.mh is None or self.mh == 0): 261 | args.extend(["-m2", str(self.m2)]) 262 | if self.mh is None: 263 | args.extend(["-mh", "0"]) 264 | if self.mh is not None: 265 | args.extend(["-mh", str(self.mh)]) 266 | if self.m3 is not None: 267 | args.extend(["-m3", str(self.m3)]) 268 | if self.m4 is not None: 269 | args.extend(["-m4", str(self.m4)]) 270 | 271 | if self.m3 == 0 and self.m4 == 0: 272 | if self.mh is None or self.mh > 0: 273 | args.extend(["-th", str(5 if self.mh is None else self.mh)]) 274 | elif self.m2 is not None and self.m2 > 0: 275 | args.extend(["-t2", str(self.m2)]) 276 | elif self.m1 is None or self.m1 > 0: 277 | args.extend(["-t1", str(5 if self.m1 is None else self.m1)]) 278 | subprocess.run(args, stdout=subprocess.DEVNULL if quiet else None, stderr=subprocess.DEVNULL if quiet else None) 279 | 280 | def _save_alignments(self, model_prefix: Path, output_file_path: Path) -> None: 281 | alignments: List[Tuple[int, str]] = [] 282 | for input_file_path in model_prefix.parent.glob(model_prefix.name + f".A{self.file_suffix}.part*"): 283 | with open(input_file_path, "r", encoding="utf-8") as in_file: 284 | line_index = 0 285 | segment_index = 0 286 | cur_alignment: str = "" 287 | for line in in_file: 288 | cur_alignment += line 289 | alignment_line_index = line_index % 3 290 | if alignment_line_index == 0: 291 | start = line.index("(") 292 | end = line.index(")") 293 | segment_index = int(line[start + 1 : end]) 294 | elif alignment_line_index == 2: 295 | alignments.append((segment_index, cur_alignment.strip())) 296 | cur_alignment = "" 297 | line_index += 1 298 | 299 | write_corpus( 300 | output_file_path, 301 | map(lambda a: str(a[1]), sorted(alignments, key=lambda a: a[0])), 302 | ) 303 | 304 | def _symmetrize( 305 | self, direct_align_path: Path, inverse_align_path: Path, output_path: Path, sym_heuristic: str 306 | ) -> None: 307 | heuristic = SymmetrizationHeuristic[sym_heuristic.upper().replace("-", "_")] 308 | with open(direct_align_path, "r", encoding="utf-8-sig") as direct_file, open( 309 | inverse_align_path, "r", encoding="utf-8-sig" 310 | ) as inverse_file, open(output_path, "w", encoding="utf-8", newline="\n") as out_file: 311 | for matrix, inv_matrix in zip(_parse_giza_alignments(direct_file), _parse_giza_alignments(inverse_file)): 312 | src_len = max(matrix.row_count, inv_matrix.column_count) 313 | trg_len = max(matrix.column_count, inv_matrix.row_count) 314 | 315 | matrix.resize(src_len, trg_len) 316 | inv_matrix.resize(trg_len, src_len) 317 | 318 | inv_matrix.transpose() 319 | matrix.symmetrize_with(inv_matrix, heuristic) 320 | 321 | out_file.write(str(matrix) + "\n") 322 | 323 | def _init_alignment_probs_data(self) -> Any: 324 | return None 325 | 326 | def _get_alignment_probs( 327 | self, 
data: Any, src_words: List[str], trg_words: List[str], alignment: Set[Tuple[int, int]], is_direct: bool 328 | ) -> Dict[Tuple[int, int], float]: 329 | return {word_pair: 1.0 / (len(src_words) + 1) for word_pair in alignment} 330 | 331 | def _load_vocab(self, side: str) -> List[str]: 332 | vocab_path = self.model_dir / f"{side}.vcb" 333 | vocab: List[str] = ["NULL", "UNK"] 334 | for line in load_corpus(vocab_path): 335 | index_str, word, _ = line.split() 336 | assert int(index_str) == len(vocab) 337 | vocab.append(word) 338 | return vocab 339 | 340 | def _load_lexicon( 341 | self, 342 | src_vocab: List[str], 343 | trg_vocab: List[str], 344 | align_model: str, 345 | threshold: float = 0.0, 346 | include_special_tokens: bool = False, 347 | ) -> Lexicon: 348 | lexicon = Lexicon() 349 | model_path = self.model_dir / f"src_trg_{align_model}.t{self.file_suffix}" 350 | for line in load_corpus(model_path): 351 | src_index_str, trg_index_str, prob_str = line.split(maxsplit=3) 352 | src_index = int(src_index_str) 353 | trg_index = int(trg_index_str) 354 | if include_special_tokens or (src_index > 1 and trg_index > 1): 355 | src_word = src_vocab[src_index] 356 | trg_word = trg_vocab[trg_index] 357 | prob = float(prob_str) 358 | if prob > threshold: 359 | lexicon[src_word, trg_word] = prob 360 | return lexicon 361 | 362 | 363 | def _parse_giza_alignments(stream: TextIO) -> Iterable[WordAlignmentMatrix]: 364 | line_index = 0 365 | target: List[str] = [] 366 | for line in stream: 367 | line = line.strip() 368 | if line.startswith("#"): 369 | line_index = 0 370 | elif line_index == 1: 371 | target = line.split() 372 | elif line_index == 2: 373 | start = line.find("({") 374 | end = line.find("})") 375 | src_index = -1 376 | source: List[str] = [] 377 | pairs: Set[Tuple[int, int]] = set() 378 | while start != -1 and end != -1: 379 | if src_index > -1: 380 | trg_indices_str = line[start + 2 : end].strip() 381 | trg_indices = trg_indices_str.split() 382 | for trg_index in trg_indices: 383 | pairs.add((src_index, int(trg_index) - 1)) 384 | start = line.find("({", start + 2) 385 | if start >= 0: 386 | src_word = line[end + 3 : start] 387 | source.append(src_word) 388 | end = line.find("})", end + 2) 389 | src_index += 1 390 | yield WordAlignmentMatrix.from_word_pairs(len(source), len(target), pairs) 391 | line_index += 1 392 | 393 | 394 | class Ibm1GizaAligner(GizaAligner): 395 | def __init__(self, bin_dir: Path, model_dir: Path, m1: Optional[int] = None) -> None: 396 | super().__init__(bin_dir, model_dir, m1=m1, mh=0, m3=0, m4=0) 397 | 398 | 399 | class Ibm2GizaAligner(GizaAligner): 400 | def __init__(self, bin_dir: Path, model_dir: Path, m1: Optional[int] = None, m2: Optional[int] = None) -> None: 401 | super().__init__(bin_dir, model_dir, m1=m1, m2=5 if m2 is None else m2, mh=0, m3=0, m4=0) 402 | 403 | def _init_alignment_probs_data(self) -> Any: 404 | return { 405 | "direct_alignment_table": self._load_alignment_table("invswm"), 406 | "inverse_alignment_table": self._load_alignment_table("swm"), 407 | } 408 | 409 | def _save_alignments(self, model_prefix: Path, output_file_path: Path) -> None: 410 | shutil.move(str(model_prefix) + f".A{self.file_suffix}", output_file_path) 411 | 412 | def _get_alignment_probs( 413 | self, data: Any, src_words: List[str], trg_words: List[str], alignment: Set[Tuple[int, int]], is_direct: bool 414 | ) -> Dict[Tuple[int, int], float]: 415 | alignment_table: Dict[Tuple[int, int], Dict[int, float]] 416 | if is_direct: 417 | alignment_table = data["direct_alignment_table"] 418 | 
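# Both tables come from _load_alignment_table below: (target position j, source length) -> {source position i: p(i | j, slen)}.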
else: 419 | alignment_table = data["inverse_alignment_table"] 420 | 421 | probs: Dict[Tuple[int, int], float] = {} 422 | for src_index, trg_index in alignment: 423 | i = src_index + 1 424 | j = trg_index + 1 425 | prob = 0.0 426 | elem = alignment_table.get((j, len(src_words))) 427 | if elem is not None: 428 | prob = elem.get(i, 0.0) 429 | probs[(src_index, trg_index)] = max(PROB_SMOOTH, prob) 430 | return probs 431 | 432 | def _load_alignment_table(self, align_model: str) -> Dict[Tuple[int, int], Dict[int, float]]: 433 | table: Dict[Tuple[int, int], Dict[int, float]] = {} 434 | totals: Dict[Tuple[int, int], float] = {} 435 | ext = "ap" if platform.system() == "Windows" else "a" 436 | for line in load_corpus(self.model_dir / f"src_trg_{align_model}.{ext}{self.file_suffix}"): 437 | fields = line.split(maxsplit=5) 438 | i = int(fields[0]) 439 | j = int(fields[1]) 440 | slen = int(fields[2]) 441 | count = float(fields[4]) 442 | key = (j, slen) 443 | counts = table.get(key) 444 | if counts is None: 445 | counts = {} 446 | table[key] = counts 447 | counts[i] = count 448 | total = totals.get(key, 0.0) 449 | totals[key] = total + count 450 | 451 | for key, counts in table.items(): 452 | total = totals[key] 453 | for j, count in counts.items(): 454 | counts[j] = count / total 455 | 456 | return table 457 | 458 | 459 | def normalize(values: List[float]) -> None: 460 | sum_values = sum(values) 461 | for i in range(len(values)): 462 | if sum_values > 0: 463 | values[i] /= sum_values 464 | else: 465 | values[i] = 1.0 / len(values) 466 | 467 | 468 | def smooth(values: List[float], p: float) -> None: 469 | pp = p / len(values) 470 | for i in range(len(values)): 471 | values[i] = (1.0 - p) * values[i] + pp 472 | 473 | 474 | class HmmGizaAligner(GizaAligner): 475 | def __init__(self, bin_dir: Path, model_dir: Path, m1: Optional[int] = None, mh: Optional[int] = None) -> None: 476 | super().__init__(bin_dir, model_dir, m1=m1, mh=mh, m3=0, m4=0) 477 | 478 | def _init_alignment_probs_data(self) -> Any: 479 | return { 480 | "direct_alignment_table": self._load_alignment_table("invswm"), 481 | "direct_alpha_table": self._load_alpha_table("invswm"), 482 | "inverse_alignment_table": self._load_alignment_table("swm"), 483 | "inverse_alpha_table": self._load_alpha_table("swm"), 484 | } 485 | 486 | def _get_alignment_probs( 487 | self, data: Any, src_words: List[str], trg_words: List[str], alignment: Set[Tuple[int, int]], is_direct: bool 488 | ) -> Dict[Tuple[int, int], float]: 489 | alignment_table: Dict[int, float] 490 | alpha_table: Dict[int, List[float]] 491 | if is_direct: 492 | alignment_table = data["direct_alignment_table"] 493 | alpha_table = data["direct_alpha_table"] 494 | else: 495 | alignment_table = data["inverse_alignment_table"] 496 | alpha_table = data["inverse_alpha_table"] 497 | 498 | probs_table: List[List[float]] = [] 499 | for i1 in range(len(src_words) * 2): 500 | i1_real = i1 % len(src_words) 501 | al: List[float] = [] 502 | for i2 in range(len(src_words)): 503 | al.append(alignment_table.get(i1_real - i2, 1.0 / (2 * (len(src_words) - 1)))) 504 | normalize(al) 505 | smooth(al, 0.2) 506 | i1_probs: List[float] = [] 507 | for i2 in range(len(src_words) * 2): 508 | i2_real = i2 % len(src_words) 509 | empty_i2 = i2 >= len(src_words) 510 | if empty_i2: 511 | prob = 0.4 if i1_real == i2_real else 0 512 | else: 513 | prob = al[i2_real] 514 | i1_probs.append(prob) 515 | normalize(i1_probs) 516 | probs_table.append(i1_probs) 517 | 518 | alpha = alpha_table[len(src_words)] 519 | 520 | asymm_al: 
List[int] = [-1] * len(trg_words)
521 |         for src_index, trg_index in alignment:
522 |             asymm_al[trg_index] = src_index
523 |
524 |         probs: Dict[Tuple[int, int], float] = {}
525 |         prev_src_index = -1
526 |         for trg_index in range(len(asymm_al)):
527 |             src_index = asymm_al[trg_index]
528 |             if prev_src_index == -1:
529 |                 if src_index == -1:
530 |                     src_index = len(src_words)
531 |                 else:
532 |                     probs[(src_index, trg_index)] = alpha[src_index]
533 |             elif src_index == -1:
534 |                 if prev_src_index < len(src_words):
535 |                     src_index = prev_src_index + len(src_words)
536 |             else:
537 |                 probs[(src_index, trg_index)] = probs_table[prev_src_index][src_index]
538 |             prev_src_index = src_index
539 |         return probs
540 |
541 |     def _load_alignment_table(self, align_model: str) -> Dict[int, float]:
542 |         table: Dict[int, float] = {}
543 |         for line in load_corpus(self.model_dir / f"src_trg_{align_model}.h{self.file_suffix}"):
544 |             fields = line.split()
545 |             for i in range(7, len(fields), 2):
546 |                 pos = int(fields[i])
547 |                 value = float(fields[i + 1])
548 |                 table[pos] = value
549 |
550 |         return table
551 |
552 |     def _load_alpha_table(self, align_model: str) -> Dict[int, List[float]]:
553 |         table: Dict[int, List[float]] = {}
554 |         for line in load_corpus(self.model_dir / f"src_trg_{align_model}.h{self.file_suffix}.alpha"):
555 |             fields = line.split()
556 |             src_len = int(fields[0]) // 2
557 |             values: List[float] = []
558 |             for i in range(2, len(fields)):
559 |                 value = float(fields[i])
560 |                 values.append(value)
561 |             normalize(values)
562 |             table[src_len] = values
563 |
564 |         return table
565 |
566 |
567 | class Ibm3GizaAligner(GizaAligner):
568 |     def __init__(
569 |         self,
570 |         bin_dir: Path,
571 |         model_dir: Path,
572 |         m1: Optional[int] = None,
573 |         m2: Optional[int] = None,
574 |         mh: Optional[int] = None,
575 |         m3: Optional[int] = None,
576 |     ) -> None:
577 |         super().__init__(bin_dir, model_dir, m1=m1, m2=m2, mh=mh, m3=m3, m4=0)
578 |
579 |     def _init_alignment_probs_data(self) -> Any:
580 |         return {
581 |             "direct_distortion_table": self._load_distortion_table("invswm"),
582 |             "inverse_distortion_table": self._load_distortion_table("swm"),
583 |         }
584 |
585 |     def _get_alignment_probs(
586 |         self, data: Any, src_words: List[str], trg_words: List[str], alignment: Set[Tuple[int, int]], is_direct: bool
587 |     ) -> Dict[Tuple[int, int], float]:
588 |         distortion_table: Dict[Tuple[int, int], Dict[int, float]]
589 |         if is_direct:
590 |             distortion_table = data["direct_distortion_table"]
591 |         else:
592 |             distortion_table = data["inverse_distortion_table"]
593 |
594 |         probs: Dict[Tuple[int, int], float] = {}
595 |         for src_index, trg_index in alignment:
596 |             i = src_index + 1
597 |             j = trg_index + 1
598 |             prob = 0.0
599 |             elem = distortion_table.get((i, len(trg_words)))
600 |             if elem is not None:
601 |                 prob = elem.get(j, 0.0)
602 |             probs[(src_index, trg_index)] = max(PROB_SMOOTH, prob)
603 |         return probs
604 |
605 |     def _load_distortion_table(self, align_model: str) -> Dict[Tuple[int, int], Dict[int, float]]:
606 |         table: Dict[Tuple[int, int], Dict[int, float]] = {}
607 |         for line in load_corpus(self.model_dir / f"src_trg_{align_model}.d{self.file_suffix}"):
608 |             fields = line.split(maxsplit=5)
609 |             j = int(fields[0])
610 |             i = int(fields[1])
611 |             tlen = int(fields[3])
612 |             prob = float(fields[4])
613 |             key = (i, tlen)
614 |             probs = table.get(key)
615 |             if probs is None:
616 |                 probs = {}
617 |                 table[key] = probs
618 |             probs[j] = prob
619 |         return table
620 |
621 |
622 | class Ibm4GizaAligner(GizaAligner):
623 |     def
__init__( 624 | self, 625 | bin_dir: Path, 626 | model_dir: Path, 627 | m1: Optional[int] = None, 628 | m2: Optional[int] = None, 629 | mh: Optional[int] = None, 630 | m3: Optional[int] = None, 631 | m4: Optional[int] = None, 632 | ) -> None: 633 | super().__init__(bin_dir, model_dir, m1=m1, m2=m2, mh=mh, m3=m3, m4=m4) 634 | 635 | def _init_alignment_probs_data(self) -> Any: 636 | return { 637 | "src_word_classes": self._load_word_classes("src"), 638 | "trg_word_classes": self._load_word_classes("trg"), 639 | "direct_head_distortion_table": self._load_head_distortion_table("invswm"), 640 | "inverse_head_distortion_table": self._load_head_distortion_table("swm"), 641 | "direct_nonhead_distortion_table": self._load_nonhead_distortion_table("invswm"), 642 | "inverse_nonhead_distortion_table": self._load_nonhead_distortion_table("swm"), 643 | } 644 | 645 | def _get_alignment_probs( 646 | self, data: Any, src_words: List[str], trg_words: List[str], alignment: Set[Tuple[int, int]], is_direct: bool 647 | ) -> Dict[Tuple[int, int], float]: 648 | head_distortion_table: Dict[Tuple[int, int], Dict[int, float]] 649 | nonhead_distortion_table: Dict[int, Dict[int, float]] 650 | src_classes: Dict[str, int] 651 | trg_classes: Dict[str, int] 652 | if is_direct: 653 | head_distortion_table = data["direct_head_distortion_table"] 654 | nonhead_distortion_table = data["direct_nonhead_distortion_table"] 655 | src_classes = data["src_word_classes"] 656 | trg_classes = data["trg_word_classes"] 657 | else: 658 | head_distortion_table = data["inverse_head_distortion_table"] 659 | nonhead_distortion_table = data["inverse_nonhead_distortion_table"] 660 | src_classes = data["trg_word_classes"] 661 | trg_classes = data["src_word_classes"] 662 | 663 | cepts: List[List[int]] = [[] for _ in range(0, len(src_words) + 1)] 664 | for src_index, trg_index in alignment: 665 | i = src_index + 1 666 | j = trg_index + 1 667 | insort_left(cepts[i], j) 668 | 669 | probs: Dict[Tuple[int, int], float] = {} 670 | for src_index, trg_index in alignment: 671 | i = src_index + 1 672 | j = trg_index + 1 673 | t = trg_words[j - 1] 674 | trg_word_class = trg_classes[t] 675 | if cepts[i][0] == j: 676 | prev_cept = i - 1 677 | while prev_cept > 0 and len(cepts[prev_cept]) == 0: 678 | prev_cept -= 1 679 | if prev_cept == 0: 680 | src_word_class = 0 681 | center = 0 682 | else: 683 | s_prev_cept = src_words[prev_cept - 1] 684 | src_word_class = src_classes[s_prev_cept] 685 | center = int(ceil(sum(cepts[prev_cept]) / len(cepts[prev_cept]))) 686 | dj = j - center 687 | prob = 0.0 688 | elem = head_distortion_table.get((src_word_class, trg_word_class)) 689 | if elem is not None: 690 | prob = elem.get(dj, 0.0) 691 | probs[(src_index, trg_index)] = max( 692 | PROB_SMOOTH, 693 | IBM4_SMOOTH_FACTOR / (2 * len(trg_words) - 1) + (1 - IBM4_SMOOTH_FACTOR) * prob, 694 | ) 695 | else: 696 | pos_in_cept = cepts[i].index(j) 697 | prev_in_cept = cepts[i][pos_in_cept - 1] 698 | dj = j - prev_in_cept 699 | prob = 0.0 700 | elem = nonhead_distortion_table.get(trg_word_class) 701 | if elem is not None: 702 | prob = elem.get(dj, 0.0) 703 | probs[(src_index, trg_index)] = max( 704 | PROB_SMOOTH, IBM4_SMOOTH_FACTOR / (len(trg_words) - 1) + (1 - IBM4_SMOOTH_FACTOR) * prob 705 | ) 706 | return probs 707 | 708 | def _load_word_classes(self, side: str) -> Dict[str, int]: 709 | word_classes: Dict[str, int] = {} 710 | classes: Dict[str, int] = {} 711 | for line in load_corpus(self.model_dir / f"{side}.vcb.classes"): 712 | word, word_class_str = line.split("\t", maxsplit=2) 
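# mkcls writes "word<TAB>class" lines; remap each class label to a dense 1-based index (0 is used for the NULL position in _get_alignment_probs).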
713 | class_index = classes.get(word_class_str) 714 | if class_index is None: 715 | class_index = len(classes) + 1 716 | classes[word_class_str] = class_index 717 | word_classes[word] = class_index 718 | return word_classes 719 | 720 | def _load_head_distortion_table(self, align_model: str) -> Dict[Tuple[int, int], Dict[int, float]]: 721 | table: Dict[Tuple[int, int], Dict[int, float]] = {} 722 | for line in load_corpus(self.model_dir / f"src_trg_{align_model}.d4.final"): 723 | fields = line.split() 724 | trg_word_class = int(fields[3]) 725 | src_word_class = int(fields[4]) 726 | key = (src_word_class, trg_word_class) 727 | probs = table.get(key) 728 | if probs is None: 729 | probs = {} 730 | table[key] = probs 731 | for index, prob_str in enumerate(fields[9:]): 732 | if prob_str != "0": 733 | dj = index - MAX_SENT_LENGTH 734 | probs[dj] = float(prob_str) 735 | return table 736 | 737 | def _load_nonhead_distortion_table(self, align_model: str) -> Dict[int, Dict[int, float]]: 738 | table: Dict[int, Dict[int, float]] = {} 739 | ext = "db4" if platform.system() == "Windows" else "D4" 740 | is_key_line = True 741 | trg_word_class = 0 742 | for line in load_corpus(self.model_dir / f"src_trg_{align_model}.{ext}.final"): 743 | fields = line.split() 744 | if is_key_line: 745 | trg_word_class = int(fields[3]) 746 | else: 747 | probs = table.get(trg_word_class) 748 | if probs is None: 749 | probs = {} 750 | table[trg_word_class] = probs 751 | for index, prob_str in enumerate(fields): 752 | if prob_str != "0": 753 | dj = index - MAX_SENT_LENGTH 754 | probs[dj] = float(prob_str) 755 | is_key_line = not is_key_line 756 | return table 757 | -------------------------------------------------------------------------------- /lexicon.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Iterable, Iterator, Set, Tuple 2 | from pathlib import Path 3 | 4 | 5 | class Lexicon: 6 | @classmethod 7 | def symmetrize(cls, direct_lexicon: "Lexicon", inverse_lexicon: "Lexicon", threshold: float = 0.0) -> "Lexicon": 8 | src_words: Set[str] = set(direct_lexicon.source_words) 9 | src_words.update(inverse_lexicon.target_words) 10 | 11 | trg_words: Set[str] = set(inverse_lexicon.source_words) 12 | trg_words.update(direct_lexicon.target_words) 13 | 14 | lexicon = Lexicon() 15 | for src_word in src_words: 16 | for trg_word in trg_words: 17 | direct_prob = direct_lexicon[src_word, trg_word] 18 | inverse_prob = inverse_lexicon[trg_word, src_word] 19 | prob = max(direct_prob, inverse_prob) 20 | if prob > threshold: 21 | lexicon[src_word, trg_word] = prob 22 | return lexicon 23 | 24 | def __init__(self) -> None: 25 | self._table: Dict[str, Dict[str, float]] = {} 26 | 27 | def __getitem__(self, indices: Tuple[str, str]) -> float: 28 | src_word, trg_word = indices 29 | src_entry = self._table.get(src_word) 30 | if src_entry is None: 31 | return 0 32 | return src_entry.get(trg_word, 0) 33 | 34 | def __setitem__(self, indices: Tuple[str, str], value: float) -> None: 35 | if value == 0: 36 | return 37 | src_word, trg_word = indices 38 | src_entry = self._table.get(src_word) 39 | if src_entry is None: 40 | src_entry = {} 41 | self._table[src_word] = src_entry 42 | src_entry[trg_word] = value 43 | 44 | def __iter__(self) -> Iterator[Tuple[str, str, float]]: 45 | return ( 46 | (src_word, trg_word, prob) 47 | for (src_word, trg_words) in self._table.items() 48 | for (trg_word, prob) in trg_words.items() 49 | ) 50 | 51 | @property 52 | def source_words(self) -> Iterable[str]: 53 | 
return self._table.keys() 54 | 55 | @property 56 | def target_words(self) -> Iterable[str]: 57 | trg_words: Set[str] = set() 58 | for src_entry in self._table.values(): 59 | trg_words.update(src_entry.keys()) 60 | return trg_words 61 | 62 | def write(self, file_path: Path) -> None: 63 | with open(file_path, "w", encoding="utf-8", newline="\n") as file: 64 | for src_word, trg_word, prob in sorted(self, key=lambda t: (t[0], -t[2], t[1])): 65 | file.write(f"{src_word}\t{trg_word}\t{round(prob, 8)}\n") 66 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sil-machine -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import os 3 | from pathlib import Path 4 | from typing import IO, Iterable, Iterator, Set, Tuple 5 | 6 | 7 | def write_corpus(corpus_path: Path, sentences: Iterable[str]) -> None: 8 | with open(corpus_path, "w", encoding="utf-8", newline="\n") as file: 9 | for sentence in sentences: 10 | file.write(sentence + "\n") 11 | 12 | 13 | def load_corpus(corpus_path: Path) -> Iterator[str]: 14 | with open(corpus_path, "r", encoding="utf-8-sig") as in_file: 15 | for line in in_file: 16 | line = line.strip() 17 | yield line 18 | 19 | 20 | def parse_giza_alignments(alignments_file: IO[str]) -> Iterable[Set[Tuple[int, int]]]: 21 | line_index = 0 22 | for line in alignments_file: 23 | line = line.strip() 24 | if line.startswith("#"): 25 | line_index = 0 26 | elif line_index == 2: 27 | start = line.find("({") 28 | end = line.find("})") 29 | src_index = -1 30 | pairs: Set[Tuple[int, int]] = set() 31 | while start != -1 and end != -1: 32 | if src_index > -1: 33 | trg_indices_str = line[start + 2 : end].strip() 34 | trg_indices = trg_indices_str.split() 35 | pairs.update(((src_index, int(trg_index) - 1) for trg_index in trg_indices)) 36 | start = line.find("({", start + 2) 37 | if start >= 0: 38 | end = line.find("})", end + 2) 39 | src_index += 1 40 | yield pairs 41 | line_index += 1 42 | 43 | 44 | def remove_bom_inplace(path): 45 | """Removes BOM mark, if it exists, from a file and rewrites it in-place""" 46 | buffer_size = 4096 47 | bom_length = len(codecs.BOM_UTF8) 48 | 49 | with open(path, "r+b") as fp: 50 | chunk = fp.read(buffer_size) 51 | if chunk.startswith(codecs.BOM_UTF8): 52 | i = 0 53 | chunk = chunk[bom_length:] 54 | while chunk: 55 | fp.seek(i) 56 | fp.write(chunk) 57 | i += len(chunk) 58 | fp.seek(bom_length, os.SEEK_CUR) 59 | chunk = fp.read(buffer_size) 60 | fp.seek(-bom_length, os.SEEK_CUR) 61 | fp.truncate() 62 | --------------------------------------------------------------------------------
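As a sanity check on `remove_bom_inplace` above (the chunked in-place shift is easy to misread), here is a minimal, hypothetical round-trip; it assumes `utils.py` is importable from the working directory:

```
import codecs
import tempfile
from pathlib import Path

from utils import remove_bom_inplace

with tempfile.TemporaryDirectory() as td:
    path = Path(td) / "corpus.txt"
    # A UTF-8 file that starts with a BOM, as written by some editors.
    path.write_bytes(codecs.BOM_UTF8 + "das Haus\nthe house\n".encode("utf-8"))
    remove_bom_inplace(path)
    # The BOM is gone and the remaining bytes are untouched.
    assert path.read_bytes() == b"das Haus\nthe house\n"
```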