├── .gitignore ├── LICENSE ├── README.md ├── giza.py ├── giza_aligner.py ├── lexicon.py ├── requirements.txt └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 |
140 | .bin/
141 | .vscode/
142 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 SIL International
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Giza-py: MGIZA++ Command-line Runner
2 |
3 | Giza-py is a simple, Python-based command-line runner for MGIZA++, a popular tool for building word alignment models.
4 |
5 | ## Installation
6 |
7 | ### Python
8 |
9 | Giza-py requires [Python 3.7](https://www.python.org/downloads/) or greater.
10 |
11 | ### Giza-py
12 |
13 | To install Giza-py, clone the repo and install the pip dependencies:
14 |
15 | ```
16 | git clone https://github.com/sillsdev/giza-py.git
17 | cd giza-py
18 | pip install -r requirements.txt
19 | ```
20 |
21 | ### MGIZA++
22 |
23 | To install MGIZA++ on Linux/macOS, follow these steps:
24 |
25 | 1. Download the [Boost C++ library](https://www.boost.org/) and unzip it.
26 | 2. Build Boost:
27 |
28 | ```
29 | cd <boost dir>
30 | ./bootstrap.sh --prefix=./build --with-libraries=thread,system
31 | ./b2 install
32 | ```
33 |
34 | 3. Clone the MGIZA++ repo:
35 |
36 | ```
37 | git clone https://github.com/moses-smt/mgiza.git
38 | ```
39 |
40 | 4. Build MGIZA++ (CMake is required):
41 |
42 | ```
43 | cd <mgiza dir>/mgizapp
44 | cmake -DBOOST_ROOT=<boost dir>/build -DBoost_USE_STATIC_LIBS=ON -DCMAKE_INSTALL_PREFIX=<giza-py dir>/.bin .
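# <boost dir> is the unzipped Boost folder from step 2, <mgiza dir> is the cloned mgiza repo,
# and <giza-py dir> is the cloned giza-py repo; installing into its .bin folder lets
# giza.py's default --bin .bin locate the MGIZA++ binaries.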
45 | make
46 | make install
47 | ```
48 |
49 | ## Usage
50 |
51 | ### Generating alignments
52 |
53 | To generate alignments using MGIZA++, run the following command:
54 |
55 | ```
56 | python3 giza.py --source <source corpus> --target <target corpus> --alignments <output alignments>
57 | ```
58 |
59 | The source and target corpus files must be text files where tokens are separated by spaces. Giza-py will output the alignments in Pharaoh format.
60 |
61 | Alignment probabilities for each aligned word pair can be output by using the `--include-probs` argument. Giza-py will include alignment probabilities in the generated alignment file. The probabilities are separated from each word pair using a colon `:` delimiter. Here is an example of the Pharaoh format with probabilities included:
62 |
63 | ```
64 | 7-0:0.22661511 5-3:0.4715056 3-6:0.67267063 1-7:0.10234439
65 | 0-0:0.75820181 4-1:0.24716581 8-4:0.72411429
66 | ```
67 |
68 | _Note: The probabilities included in the alignment file are only alignment probabilities and do not include translation probabilities. If you want translation probabilities, they can be obtained by [generating a lexicon](#generating-a-lexicon)._
69 |
70 | ### Models
71 |
72 | By default, Giza-py will generate alignments using the IBM-4 model. To specify a different model, use the `--model` argument.
73 |
74 | ```
75 | python3 giza.py --source <source corpus> --target <target corpus> --alignments <output alignments> --model hmm
76 | ```
77 |
78 | The number of iterations for each stage of training can be specified using the `--m{model_number}` arguments. The following example will train an IBM-4 model with 10 iterations for the IBM-1 stage:
79 |
80 | ```
81 | python3 giza.py --source <source corpus> --target <target corpus> --alignments <output alignments> --m1 10
82 | ```
83 |
84 | The following are the parameters for configuring the number of iterations for each supported model:
85 |
86 | - ibm1
87 |   - m1: IBM-1 (default: 5 iterations)
88 | - ibm2
89 |   - m1: IBM-1 (default: 5 iterations)
90 |   - m2: IBM-2 (default: 5 iterations)
91 | - hmm
92 |   - m1: IBM-1 (default: 5 iterations)
93 |   - mh: HMM (default: 5 iterations)
94 | - ibm3
95 |   - m1: IBM-1 (default: 5 iterations)
96 |   - mh: HMM (default: 5 iterations)
97 |   - m3: IBM-3 (default: 5 iterations)
98 | - ibm4
99 |   - m1: IBM-1 (default: 5 iterations)
100 |   - mh: HMM (default: 5 iterations)
101 |   - m3: IBM-3 (default: 5 iterations)
102 |   - m4: IBM-4 (default: 5 iterations)
103 |
104 | ### Symmetrization
105 |
106 | Giza-py generates symmetrized alignments using direct and inverse alignment models. By default, Giza-py will symmetrize alignments using the "grow-diag-final-and" heuristic. To specify a different heuristic, use the `--sym-heuristic` argument.
107 |
108 | ```
109 | python3 giza.py --source <source corpus> --target <target corpus> --alignments <output alignments> --sym-heuristic intersection
110 | ```
111 |
112 | Giza-py supports many different symmetrization heuristics:
113 |
114 | - union
115 | - intersection
116 | - och
117 | - grow
118 | - grow-diag
119 | - grow-diag-final
120 | - grow-diag-final-and
121 |
122 | ### Generating a lexicon
123 |
124 | Giza-py can also extract a bilingual lexicon from the trained alignment model.
125 |
126 | ```
127 | python3 giza.py --source <source corpus> --target <target corpus> --lexicon <output lexicon>
128 | ```
129 |
130 | The lexicon is extracted as a tab-separated text file. The score for each word pair is the maximum probability from the direct and inverse alignment models.
131 |
132 | The lexicon can be filtered by using the `--lexicon-threshold` argument. Giza-py will filter out all translations with a probability that is less than or equal to the specified threshold.
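### Example: reading the alignment file

For reference, here is a minimal, hypothetical parser for the alignment files that Giza-py writes (the `read_pharaoh` helper below is not part of this repo). It handles both the plain Pharaoh format and the `--include-probs` variant:

```
from pathlib import Path
from typing import Dict, List, Tuple


def read_pharaoh(path: Path) -> List[Dict[Tuple[int, int], float]]:
    """Return one {(src_index, trg_index): prob} dict per sentence pair."""
    sentences: List[Dict[Tuple[int, int], float]] = []
    for line in path.read_text(encoding="utf-8").splitlines():
        pairs: Dict[Tuple[int, int], float] = {}
        for token in line.split():
            # "7-0:0.22661511" with --include-probs, otherwise just "7-0"
            word_pair, _, prob_str = token.partition(":")
            src_index_str, trg_index_str = word_pair.split("-")
            pairs[(int(src_index_str), int(trg_index_str))] = float(prob_str) if prob_str else 1.0
        sentences.append(pairs)
    return sentences
```

An empty line (written for empty sentence pairs) yields an empty dict, so the result stays parallel with the input corpora.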
133 |
--------------------------------------------------------------------------------
/giza.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import tempfile
3 | from pathlib import Path
4 | from typing import List
5 | from giza_aligner import HmmGizaAligner, Ibm1GizaAligner, Ibm2GizaAligner, Ibm3GizaAligner, Ibm4GizaAligner
6 |
7 |
8 | def main() -> None:
9 |     parser = argparse.ArgumentParser(description="Aligns the parallel corpus for an experiment")
10 |     parser.add_argument("--bin", type=str, default=".bin", metavar="PATH", help="The mgiza++ folder")
11 |     parser.add_argument("--source", type=str, required=True, metavar="PATH", help="The source corpus")
12 |     parser.add_argument("--target", type=str, required=True, metavar="PATH", help="The target corpus")
13 |     parser.add_argument("--alignments", type=str, default=None, metavar="PATH", help="The output alignments")
14 |     parser.add_argument(
15 |         "--include-probs",
16 |         default=False,
17 |         action="store_true",
18 |         help="Include alignment probabilities in output alignments",
19 |     )
20 |     parser.add_argument("--lexicon", type=str, default=None, metavar="PATH", help="The output lexicon")
21 |     parser.add_argument(
22 |         "--lexicon-threshold", type=float, default=0.0, metavar="THRESHOLD", help="The lexicon probability threshold"
23 |     )
24 |     parser.add_argument(
25 |         "--model",
26 |         type=str,
27 |         choices=["ibm1", "ibm2", "hmm", "ibm3", "ibm4"],
28 |         default="ibm4",
29 |         help="The word alignment model",
30 |     )
31 |     parser.add_argument(
32 |         "--sym-heuristic",
33 |         type=str,
34 |         choices=["union", "intersection", "och", "grow", "grow-diag", "grow-diag-final", "grow-diag-final-and"],
35 |         default="grow-diag-final-and",
36 |         help="The symmetrization heuristic",
37 |     )
38 |     parser.add_argument("--m1", type=int, default=None, metavar="ITERATIONS", help="The number of IBM-1 iterations")
39 |     parser.add_argument("--m2", type=int, default=None, metavar="ITERATIONS", help="The number of IBM-2 iterations")
40 |     parser.add_argument("--mh", type=int, default=None, metavar="ITERATIONS", help="The number of HMM iterations")
41 |     parser.add_argument("--m3", type=int, default=None, metavar="ITERATIONS", help="The number of IBM-3 iterations")
42 |     parser.add_argument("--m4", type=int, default=None, metavar="ITERATIONS", help="The number of IBM-4 iterations")
43 |     parser.add_argument("--maxsentencelength", type=int, default=101, metavar="LENGTH", help="The maximum sentence length")
44 |     parser.add_argument("--maxfertility", type=int, default=10, metavar="FERTILITY", help="The maximum fertility parameter")
45 |     parser.add_argument("--quiet", default=False, action="store_true", help="Quiet display")
46 |     args = parser.parse_args()
47 |
48 |     bin_dir = Path(args.bin)
49 |
50 |     model: str = args.model
51 |     model = model.lower()
52 |
53 |     optArgs: List[str] = [
54 |         "-ml", str(args.maxsentencelength),
55 |         "-maxfertility", str(args.maxfertility)
56 |     ]
57 |
58 |     with tempfile.TemporaryDirectory() as td:
59 |         temp_dir = Path(td)
60 |         if model == "ibm1":
61 |             aligner = Ibm1GizaAligner(bin_dir, temp_dir, m1=args.m1)
62 |         elif model == "ibm2":
63 |             aligner = Ibm2GizaAligner(bin_dir, temp_dir, m1=args.m1, m2=args.m2)
64 |         elif model == "hmm":
65 |             aligner = HmmGizaAligner(bin_dir, temp_dir, m1=args.m1, mh=args.mh)
66 |         elif model == "ibm3":
67 |             aligner = Ibm3GizaAligner(bin_dir, temp_dir, m1=args.m1, m2=args.m2, mh=args.mh, m3=args.m3)
68 |         elif model == "ibm4":
69 |             aligner = Ibm4GizaAligner(bin_dir, temp_dir, m1=args.m1, m2=args.m2,
mh=args.mh, m3=args.m3, m4=args.m4) 70 | else: 71 | raise RuntimeError("Invalid model type.") 72 | 73 | source_path = Path(args.source) 74 | target_path = Path(args.target) 75 | print("Training...", end="" if args.quiet else "\n", flush=args.quiet) 76 | aligner.train(source_path, target_path, quiet=args.quiet, optArgs=optArgs) 77 | if args.quiet: 78 | print(" done.") 79 | 80 | if args.alignments is not None: 81 | alignments_file_path = Path(args.alignments) 82 | print("Generating alignments...", end="", flush=True) 83 | aligner.align(alignments_file_path, args.include_probs, args.sym_heuristic) 84 | print(" done.") 85 | if args.lexicon is not None: 86 | lexicon_path = Path(args.lexicon) 87 | print("Extracting lexicon...", end="", flush=True) 88 | aligner.extract_lexicon(lexicon_path, args.lexicon_threshold) 89 | print(" done.") 90 | 91 | 92 | if __name__ == "__main__": 93 | main() 94 | -------------------------------------------------------------------------------- /giza_aligner.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import shutil 3 | import subprocess 4 | from bisect import insort_left 5 | from math import ceil 6 | from pathlib import Path 7 | from typing import IO, Any, Dict, Iterable, Iterator, List, Optional, Set, TextIO, Tuple 8 | 9 | from machine.translation import SymmetrizationHeuristic, WordAlignmentMatrix 10 | 11 | from lexicon import Lexicon 12 | from utils import load_corpus, parse_giza_alignments, remove_bom_inplace, write_corpus 13 | 14 | MAX_SENT_LENGTH = 101 15 | PROB_SMOOTH = 1e-7 16 | IBM4_SMOOTH_FACTOR = 0.2 17 | 18 | 19 | class GizaAligner: 20 | def __init__( 21 | self, 22 | bin_dir: Path, 23 | model_dir: Path, 24 | m1: Optional[int] = None, 25 | m2: Optional[int] = None, 26 | mh: Optional[int] = None, 27 | m3: Optional[int] = None, 28 | m4: Optional[int] = None, 29 | ) -> None: 30 | self.bin_dir = bin_dir 31 | self.model_dir = model_dir 32 | self.m1 = m1 33 | self.m2 = m2 34 | self.mh = mh 35 | self.m3 = m3 36 | self.m4 = m4 37 | 38 | @property 39 | def file_suffix(self) -> str: 40 | suffix = "" 41 | if self.m3 is None or self.m3 > 0 or self.m4 is None or self.m4 > 0: 42 | suffix = "3.final" 43 | elif self.mh is None or self.mh > 0: 44 | suffix = f"hmm.{5 if self.mh is None else self.mh}" 45 | elif self.m2 is not None and self.m2 > 0: 46 | suffix = f"2.{self.m2}" 47 | elif self.m1 is None or self.m1 > 0: 48 | suffix = f"1.{5 if self.m1 is None else self.m1}" 49 | return suffix 50 | 51 | def train(self, src_file_path: Path, trg_file_path: Path, quiet: bool = False, optArgs: List[str] = []) -> None: 52 | self.model_dir.mkdir(exist_ok=True) 53 | dest_src_file_path = self.model_dir / "src.txt" 54 | shutil.copyfile(src_file_path, dest_src_file_path) 55 | src_file_path = dest_src_file_path 56 | dest_trg_file_path = self.model_dir / "trg.txt" 57 | shutil.copyfile(trg_file_path, dest_trg_file_path) 58 | trg_file_path = dest_trg_file_path 59 | 60 | remove_bom_inplace(src_file_path) 61 | remove_bom_inplace(trg_file_path) 62 | 63 | if self.m4 is None or self.m4 > 0: 64 | self._execute_mkcls(src_file_path, "src", quiet) 65 | self._execute_mkcls(trg_file_path, "trg", quiet) 66 | 67 | src_trg_snt_file_path, trg_src_snt_file_path = self._execute_plain2snt( 68 | src_file_path, trg_file_path, "src", "trg", quiet 69 | ) 70 | 71 | self._execute_snt2cooc(src_trg_snt_file_path, quiet) 72 | self._execute_snt2cooc(trg_src_snt_file_path, quiet) 73 | 74 | src_trg_prefix = src_trg_snt_file_path.with_suffix("") 75 | 
src_trg_output_prefix = src_trg_prefix.parent / (src_trg_prefix.name + "_invswm") 76 | self._execute_mgiza(src_trg_snt_file_path, src_trg_output_prefix, quiet, optArgs=optArgs) 77 | src_trg_alignments_file_path = src_trg_output_prefix.with_suffix(f".A{self.file_suffix}.all") 78 | self._save_alignments(src_trg_output_prefix, src_trg_alignments_file_path) 79 | 80 | trg_src_output_prefix = src_trg_prefix.parent / (src_trg_prefix.name + "_swm") 81 | self._execute_mgiza(trg_src_snt_file_path, trg_src_output_prefix, quiet, optArgs=optArgs) 82 | trg_src_alignments_file_path = trg_src_output_prefix.with_suffix(f".A{self.file_suffix}.all") 83 | self._save_alignments(trg_src_output_prefix, trg_src_alignments_file_path) 84 | 85 | def align( 86 | self, 87 | alignments_file_path: Path, 88 | include_probs: bool = False, 89 | sym_heuristic: str = "grow-diag-final-and", 90 | ) -> None: 91 | src_trg_alignments_file_path = self.model_dir / f"src_trg_invswm.A{self.file_suffix}.all" 92 | trg_src_alignments_file_path = self.model_dir / f"src_trg_swm.A{self.file_suffix}.all" 93 | sym_alignments_file_path = self.model_dir / "alignments.txt" 94 | self._symmetrize( 95 | src_trg_alignments_file_path, 96 | trg_src_alignments_file_path, 97 | sym_alignments_file_path, 98 | sym_heuristic, 99 | ) 100 | 101 | src_file_path = self.model_dir / "src.txt" 102 | trg_file_path = self.model_dir / "trg.txt" 103 | 104 | with open(alignments_file_path, "w", encoding="utf-8", newline="\n") as alignments_file, open( 105 | sym_alignments_file_path, 106 | "r", 107 | encoding="utf-8-sig", 108 | ) as sym_alignments_file: 109 | alignment_probs_data: Any = None 110 | direct_alignments_file: Optional[IO] = None 111 | inverse_alignments_file: Optional[IO] = None 112 | direct_alignments: Optional[Iterator[Set[Tuple[int, int]]]] = None 113 | inverse_alignments: Optional[Iterator[Set[Tuple[int, int]]]] = None 114 | if include_probs: 115 | alignment_probs_data = self._init_alignment_probs_data() 116 | direct_alignments_file = open(src_trg_alignments_file_path, "r", encoding="utf-8-sig") 117 | direct_alignments = iter(parse_giza_alignments(direct_alignments_file)) 118 | inverse_alignments_file = open(trg_src_alignments_file_path, "r", encoding="utf-8-sig") 119 | inverse_alignments = iter(parse_giza_alignments(inverse_alignments_file)) 120 | try: 121 | for src_str, trg_str in zip(load_corpus(src_file_path), load_corpus(trg_file_path)): 122 | if len(src_str) == 0 or len(trg_str) == 0: 123 | alignments_file.write("\n") 124 | continue 125 | 126 | src_tokens = src_str.split() 127 | trg_tokens = trg_str.split() 128 | alignment_str = sym_alignments_file.readline().strip() 129 | 130 | if direct_alignments is not None and inverse_alignments is not None: 131 | direct_alignment = next(direct_alignments) 132 | inverse_alignment = next(inverse_alignments) 133 | 134 | direct_probs = self._get_alignment_probs( 135 | alignment_probs_data, src_tokens, trg_tokens, direct_alignment, True 136 | ) 137 | inverse_probs = self._get_alignment_probs( 138 | alignment_probs_data, trg_tokens, src_tokens, inverse_alignment, False 139 | ) 140 | 141 | new_alignment_str = "" 142 | for word_pair_str in alignment_str.split(): 143 | src_index_str, trg_index_str = word_pair_str.split("-", maxsplit=2) 144 | src_index = int(src_index_str) 145 | trg_index = int(trg_index_str) 146 | direct_prob = direct_probs.get((src_index, trg_index), 0.0) 147 | inverse_prob = inverse_probs.get((trg_index, src_index), 0.0) 148 | prob = round(max(direct_prob, inverse_prob), 8) 149 | if 
len(new_alignment_str) != 0: 150 | new_alignment_str += " " 151 | new_alignment_str += f"{src_index}-{trg_index}:{prob}" 152 | alignment_str = new_alignment_str 153 | alignments_file.write(alignment_str + "\n") 154 | finally: 155 | if direct_alignments_file is not None: 156 | direct_alignments_file.close() 157 | if inverse_alignments_file is not None: 158 | inverse_alignments_file.close() 159 | 160 | def extract_lexicon(self, out_file_path: Path, threshold: float = 0.0) -> None: 161 | src_vocab = self._load_vocab("src") 162 | trg_vocab = self._load_vocab("trg") 163 | direct_lexicon = self._load_lexicon(src_vocab, trg_vocab, "invswm", threshold=threshold) 164 | inverse_lexicon = self._load_lexicon(trg_vocab, src_vocab, "swm", threshold=threshold) 165 | lexicon = Lexicon.symmetrize(direct_lexicon, inverse_lexicon, threshold=threshold) 166 | lexicon.write(out_file_path) 167 | 168 | def _execute_mkcls(self, input_file_path: Path, output_prefix: str, quiet: bool) -> None: 169 | mkcls_path = self.bin_dir / "mkcls" 170 | if platform.system() == "Windows": 171 | mkcls_path = mkcls_path.with_suffix(".exe") 172 | if not mkcls_path.is_file(): 173 | raise RuntimeError("mkcls is not installed.") 174 | 175 | output_file_path = self.model_dir / f"{output_prefix}.vcb.classes" 176 | 177 | args: List[str] = [ 178 | str(mkcls_path), 179 | "-n10", 180 | f"-p{input_file_path}", 181 | f"-V{output_file_path}", 182 | ] 183 | subprocess.run(args, stdout=subprocess.DEVNULL if quiet else None, stderr=subprocess.DEVNULL if quiet else None) 184 | 185 | def _execute_plain2snt( 186 | self, src_file_path: Path, trg_file_path: Path, output_src_prefix: str, output_trg_prefix: str, quiet: bool 187 | ) -> Tuple[Path, Path]: 188 | plain2snt_path = self.bin_dir / "plain2snt" 189 | if platform.system() == "Windows": 190 | plain2snt_path = plain2snt_path.with_suffix(".exe") 191 | if not plain2snt_path.is_file(): 192 | raise RuntimeError("plain2snt is not installed.") 193 | 194 | src_trg_snt_file_path = self.model_dir / f"{output_src_prefix}_{output_trg_prefix}.snt" 195 | trg_src_snt_file_path = self.model_dir / f"{output_trg_prefix}_{output_src_prefix}.snt" 196 | 197 | args: List[str] = [ 198 | str(plain2snt_path), 199 | str(src_file_path), 200 | str(trg_file_path), 201 | "-vcb1", 202 | str(self.model_dir / f"{output_src_prefix}.vcb"), 203 | "-vcb2", 204 | str(self.model_dir / f"{output_trg_prefix}.vcb"), 205 | "-snt1", 206 | str(src_trg_snt_file_path), 207 | "-snt2", 208 | str(trg_src_snt_file_path), 209 | ] 210 | subprocess.run(args, stdout=subprocess.DEVNULL if quiet else None, stderr=subprocess.DEVNULL) 211 | return src_trg_snt_file_path, trg_src_snt_file_path 212 | 213 | def _execute_snt2cooc(self, snt_file_path: Path, quiet: bool) -> None: 214 | snt2cooc_path = self.bin_dir / "snt2cooc" 215 | if platform.system() == "Windows": 216 | snt2cooc_path = snt2cooc_path.with_suffix(".exe") 217 | if not snt2cooc_path.is_file(): 218 | raise RuntimeError("snt2cooc is not installed.") 219 | 220 | snt_dir = snt_file_path.parent 221 | prefix = snt_file_path.stem 222 | prefix1, prefix2 = prefix.split("_", maxsplit=2) 223 | 224 | args: List[str] = [ 225 | str(snt2cooc_path), 226 | str(self.model_dir / f"{prefix}.cooc"), 227 | str(snt_dir / f"{prefix1}.vcb"), 228 | str(snt_dir / f"{prefix2}.vcb"), 229 | str(snt_file_path), 230 | ] 231 | subprocess.run(args, stdout=subprocess.DEVNULL if quiet else None, stderr=subprocess.DEVNULL) 232 | 233 | def _execute_mgiza(self, snt_file_path: Path, output_path: Path, quiet: bool, optArgs: List[str]=[]) 
-> None: 234 | mgiza_path = self.bin_dir / "mgiza" 235 | if platform.system() == "Windows": 236 | mgiza_path = mgiza_path.with_suffix(".exe") 237 | if not mgiza_path.is_file(): 238 | raise RuntimeError("mgiza is not installed.") 239 | 240 | snt_dir = snt_file_path.parent 241 | prefix = snt_file_path.stem 242 | prefix1, prefix2 = prefix.split("_", maxsplit=2) 243 | 244 | args: List[str] = [ 245 | str(mgiza_path), 246 | "-C", 247 | str(snt_file_path), 248 | "-CoocurrenceFile", 249 | str(snt_dir / f"{prefix}.cooc"), 250 | "-S", 251 | str(snt_dir / f"{prefix1}.vcb"), 252 | "-T", 253 | str(snt_dir / f"{prefix2}.vcb"), 254 | "-o", 255 | str(output_path), 256 | ] + optArgs 257 | 258 | if self.m1 is not None: 259 | args.extend(["-m1", str(self.m1)]) 260 | if self.m2 is not None and (self.mh is None or self.mh == 0): 261 | args.extend(["-m2", str(self.m2)]) 262 | if self.mh is None: 263 | args.extend(["-mh", "0"]) 264 | if self.mh is not None: 265 | args.extend(["-mh", str(self.mh)]) 266 | if self.m3 is not None: 267 | args.extend(["-m3", str(self.m3)]) 268 | if self.m4 is not None: 269 | args.extend(["-m4", str(self.m4)]) 270 | 271 | if self.m3 == 0 and self.m4 == 0: 272 | if self.mh is None or self.mh > 0: 273 | args.extend(["-th", str(5 if self.mh is None else self.mh)]) 274 | elif self.m2 is not None and self.m2 > 0: 275 | args.extend(["-t2", str(self.m2)]) 276 | elif self.m1 is None or self.m1 > 0: 277 | args.extend(["-t1", str(5 if self.m1 is None else self.m1)]) 278 | subprocess.run(args, stdout=subprocess.DEVNULL if quiet else None, stderr=subprocess.DEVNULL if quiet else None) 279 | 280 | def _save_alignments(self, model_prefix: Path, output_file_path: Path) -> None: 281 | alignments: List[Tuple[int, str]] = [] 282 | for input_file_path in model_prefix.parent.glob(model_prefix.name + f".A{self.file_suffix}.part*"): 283 | with open(input_file_path, "r", encoding="utf-8") as in_file: 284 | line_index = 0 285 | segment_index = 0 286 | cur_alignment: str = "" 287 | for line in in_file: 288 | cur_alignment += line 289 | alignment_line_index = line_index % 3 290 | if alignment_line_index == 0: 291 | start = line.index("(") 292 | end = line.index(")") 293 | segment_index = int(line[start + 1 : end]) 294 | elif alignment_line_index == 2: 295 | alignments.append((segment_index, cur_alignment.strip())) 296 | cur_alignment = "" 297 | line_index += 1 298 | 299 | write_corpus( 300 | output_file_path, 301 | map(lambda a: str(a[1]), sorted(alignments, key=lambda a: a[0])), 302 | ) 303 | 304 | def _symmetrize( 305 | self, direct_align_path: Path, inverse_align_path: Path, output_path: Path, sym_heuristic: str 306 | ) -> None: 307 | heuristic = SymmetrizationHeuristic[sym_heuristic.upper().replace("-", "_")] 308 | with open(direct_align_path, "r", encoding="utf-8-sig") as direct_file, open( 309 | inverse_align_path, "r", encoding="utf-8-sig" 310 | ) as inverse_file, open(output_path, "w", encoding="utf-8", newline="\n") as out_file: 311 | for matrix, inv_matrix in zip(_parse_giza_alignments(direct_file), _parse_giza_alignments(inverse_file)): 312 | src_len = max(matrix.row_count, inv_matrix.column_count) 313 | trg_len = max(matrix.column_count, inv_matrix.row_count) 314 | 315 | matrix.resize(src_len, trg_len) 316 | inv_matrix.resize(trg_len, src_len) 317 | 318 | inv_matrix.transpose() 319 | matrix.symmetrize_with(inv_matrix, heuristic) 320 | 321 | out_file.write(str(matrix) + "\n") 322 | 323 | def _init_alignment_probs_data(self) -> Any: 324 | return None 325 | 326 | def _get_alignment_probs( 327 | self, 
data: Any, src_words: List[str], trg_words: List[str], alignment: Set[Tuple[int, int]], is_direct: bool 328 | ) -> Dict[Tuple[int, int], float]: 329 | return {word_pair: 1.0 / (len(src_words) + 1) for word_pair in alignment} 330 | 331 | def _load_vocab(self, side: str) -> List[str]: 332 | vocab_path = self.model_dir / f"{side}.vcb" 333 | vocab: List[str] = ["NULL", "UNK"] 334 | for line in load_corpus(vocab_path): 335 | index_str, word, _ = line.split() 336 | assert int(index_str) == len(vocab) 337 | vocab.append(word) 338 | return vocab 339 | 340 | def _load_lexicon( 341 | self, 342 | src_vocab: List[str], 343 | trg_vocab: List[str], 344 | align_model: str, 345 | threshold: float = 0.0, 346 | include_special_tokens: bool = False, 347 | ) -> Lexicon: 348 | lexicon = Lexicon() 349 | model_path = self.model_dir / f"src_trg_{align_model}.t{self.file_suffix}" 350 | for line in load_corpus(model_path): 351 | src_index_str, trg_index_str, prob_str = line.split(maxsplit=3) 352 | src_index = int(src_index_str) 353 | trg_index = int(trg_index_str) 354 | if include_special_tokens or (src_index > 1 and trg_index > 1): 355 | src_word = src_vocab[src_index] 356 | trg_word = trg_vocab[trg_index] 357 | prob = float(prob_str) 358 | if prob > threshold: 359 | lexicon[src_word, trg_word] = prob 360 | return lexicon 361 | 362 | 363 | def _parse_giza_alignments(stream: TextIO) -> Iterable[WordAlignmentMatrix]: 364 | line_index = 0 365 | target: List[str] = [] 366 | for line in stream: 367 | line = line.strip() 368 | if line.startswith("#"): 369 | line_index = 0 370 | elif line_index == 1: 371 | target = line.split() 372 | elif line_index == 2: 373 | start = line.find("({") 374 | end = line.find("})") 375 | src_index = -1 376 | source: List[str] = [] 377 | pairs: Set[Tuple[int, int]] = set() 378 | while start != -1 and end != -1: 379 | if src_index > -1: 380 | trg_indices_str = line[start + 2 : end].strip() 381 | trg_indices = trg_indices_str.split() 382 | for trg_index in trg_indices: 383 | pairs.add((src_index, int(trg_index) - 1)) 384 | start = line.find("({", start + 2) 385 | if start >= 0: 386 | src_word = line[end + 3 : start] 387 | source.append(src_word) 388 | end = line.find("})", end + 2) 389 | src_index += 1 390 | yield WordAlignmentMatrix.from_word_pairs(len(source), len(target), pairs) 391 | line_index += 1 392 | 393 | 394 | class Ibm1GizaAligner(GizaAligner): 395 | def __init__(self, bin_dir: Path, model_dir: Path, m1: Optional[int] = None) -> None: 396 | super().__init__(bin_dir, model_dir, m1=m1, mh=0, m3=0, m4=0) 397 | 398 | 399 | class Ibm2GizaAligner(GizaAligner): 400 | def __init__(self, bin_dir: Path, model_dir: Path, m1: Optional[int] = None, m2: Optional[int] = None) -> None: 401 | super().__init__(bin_dir, model_dir, m1=m1, m2=5 if m2 is None else m2, mh=0, m3=0, m4=0) 402 | 403 | def _init_alignment_probs_data(self) -> Any: 404 | return { 405 | "direct_alignment_table": self._load_alignment_table("invswm"), 406 | "inverse_alignment_table": self._load_alignment_table("swm"), 407 | } 408 | 409 | def _save_alignments(self, model_prefix: Path, output_file_path: Path) -> None: 410 | shutil.move(str(model_prefix) + f".A{self.file_suffix}", output_file_path) 411 | 412 | def _get_alignment_probs( 413 | self, data: Any, src_words: List[str], trg_words: List[str], alignment: Set[Tuple[int, int]], is_direct: bool 414 | ) -> Dict[Tuple[int, int], float]: 415 | alignment_table: Dict[Tuple[int, int], Dict[int, float]] 416 | if is_direct: 417 | alignment_table = data["direct_alignment_table"] 418 | 
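# Both tables come from _load_alignment_table below: (target position j, source length) -> {source position i: p(i | j, slen)}.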
else: 419 | alignment_table = data["inverse_alignment_table"] 420 | 421 | probs: Dict[Tuple[int, int], float] = {} 422 | for src_index, trg_index in alignment: 423 | i = src_index + 1 424 | j = trg_index + 1 425 | prob = 0.0 426 | elem = alignment_table.get((j, len(src_words))) 427 | if elem is not None: 428 | prob = elem.get(i, 0.0) 429 | probs[(src_index, trg_index)] = max(PROB_SMOOTH, prob) 430 | return probs 431 | 432 | def _load_alignment_table(self, align_model: str) -> Dict[Tuple[int, int], Dict[int, float]]: 433 | table: Dict[Tuple[int, int], Dict[int, float]] = {} 434 | totals: Dict[Tuple[int, int], float] = {} 435 | ext = "ap" if platform.system() == "Windows" else "a" 436 | for line in load_corpus(self.model_dir / f"src_trg_{align_model}.{ext}{self.file_suffix}"): 437 | fields = line.split(maxsplit=5) 438 | i = int(fields[0]) 439 | j = int(fields[1]) 440 | slen = int(fields[2]) 441 | count = float(fields[4]) 442 | key = (j, slen) 443 | counts = table.get(key) 444 | if counts is None: 445 | counts = {} 446 | table[key] = counts 447 | counts[i] = count 448 | total = totals.get(key, 0.0) 449 | totals[key] = total + count 450 | 451 | for key, counts in table.items(): 452 | total = totals[key] 453 | for j, count in counts.items(): 454 | counts[j] = count / total 455 | 456 | return table 457 | 458 | 459 | def normalize(values: List[float]) -> None: 460 | sum_values = sum(values) 461 | for i in range(len(values)): 462 | if sum_values > 0: 463 | values[i] /= sum_values 464 | else: 465 | values[i] = 1.0 / len(values) 466 | 467 | 468 | def smooth(values: List[float], p: float) -> None: 469 | pp = p / len(values) 470 | for i in range(len(values)): 471 | values[i] = (1.0 - p) * values[i] + pp 472 | 473 | 474 | class HmmGizaAligner(GizaAligner): 475 | def __init__(self, bin_dir: Path, model_dir: Path, m1: Optional[int] = None, mh: Optional[int] = None) -> None: 476 | super().__init__(bin_dir, model_dir, m1=m1, mh=mh, m3=0, m4=0) 477 | 478 | def _init_alignment_probs_data(self) -> Any: 479 | return { 480 | "direct_alignment_table": self._load_alignment_table("invswm"), 481 | "direct_alpha_table": self._load_alpha_table("invswm"), 482 | "inverse_alignment_table": self._load_alignment_table("swm"), 483 | "inverse_alpha_table": self._load_alpha_table("swm"), 484 | } 485 | 486 | def _get_alignment_probs( 487 | self, data: Any, src_words: List[str], trg_words: List[str], alignment: Set[Tuple[int, int]], is_direct: bool 488 | ) -> Dict[Tuple[int, int], float]: 489 | alignment_table: Dict[int, float] 490 | alpha_table: Dict[int, List[float]] 491 | if is_direct: 492 | alignment_table = data["direct_alignment_table"] 493 | alpha_table = data["direct_alpha_table"] 494 | else: 495 | alignment_table = data["inverse_alignment_table"] 496 | alpha_table = data["inverse_alpha_table"] 497 | 498 | probs_table: List[List[float]] = [] 499 | for i1 in range(len(src_words) * 2): 500 | i1_real = i1 % len(src_words) 501 | al: List[float] = [] 502 | for i2 in range(len(src_words)): 503 | al.append(alignment_table.get(i1_real - i2, 1.0 / (2 * (len(src_words) - 1)))) 504 | normalize(al) 505 | smooth(al, 0.2) 506 | i1_probs: List[float] = [] 507 | for i2 in range(len(src_words) * 2): 508 | i2_real = i2 % len(src_words) 509 | empty_i2 = i2 >= len(src_words) 510 | if empty_i2: 511 | prob = 0.4 if i1_real == i2_real else 0 512 | else: 513 | prob = al[i2_real] 514 | i1_probs.append(prob) 515 | normalize(i1_probs) 516 | probs_table.append(i1_probs) 517 | 518 | alpha = alpha_table[len(src_words)] 519 | 520 | asymm_al: 
List[int] = [-1] * len(trg_words)
521 |         for src_index, trg_index in alignment:
522 |             asymm_al[trg_index] = src_index
523 |
524 |         probs: Dict[Tuple[int, int], float] = {}
525 |         prev_src_index = -1
526 |         for trg_index in range(len(asymm_al)):
527 |             src_index = asymm_al[trg_index]
528 |             if prev_src_index == -1:
529 |                 if src_index == -1:
530 |                     src_index = len(src_words)
531 |                 else:
532 |                     probs[(src_index, trg_index)] = alpha[src_index]
533 |             elif src_index == -1:
534 |                 if prev_src_index < len(src_words):
535 |                     src_index = prev_src_index + len(src_words)
536 |             else:
537 |                 probs[(src_index, trg_index)] = probs_table[prev_src_index][src_index]
538 |             prev_src_index = src_index
539 |         return probs
540 |
541 |     def _load_alignment_table(self, align_model: str) -> Dict[int, float]:
542 |         table: Dict[int, float] = {}
543 |         for line in load_corpus(self.model_dir / f"src_trg_{align_model}.h{self.file_suffix}"):
544 |             fields = line.split()
545 |             for i in range(7, len(fields), 2):
546 |                 pos = int(fields[i])
547 |                 value = float(fields[i + 1])
548 |                 table[pos] = value
549 |
550 |         return table
551 |
552 |     def _load_alpha_table(self, align_model: str) -> Dict[int, List[float]]:
553 |         table: Dict[int, List[float]] = {}
554 |         for line in load_corpus(self.model_dir / f"src_trg_{align_model}.h{self.file_suffix}.alpha"):
555 |             fields = line.split()
556 |             src_len = int(fields[0]) // 2
557 |             values: List[float] = []
558 |             for i in range(2, len(fields)):
559 |                 value = float(fields[i])
560 |                 values.append(value)
561 |             normalize(values)
562 |             table[src_len] = values
563 |
564 |         return table
565 |
566 |
567 | class Ibm3GizaAligner(GizaAligner):
568 |     def __init__(
569 |         self,
570 |         bin_dir: Path,
571 |         model_dir: Path,
572 |         m1: Optional[int] = None,
573 |         m2: Optional[int] = None,
574 |         mh: Optional[int] = None,
575 |         m3: Optional[int] = None,
576 |     ) -> None:
577 |         super().__init__(bin_dir, model_dir, m1=m1, m2=m2, mh=mh, m3=m3, m4=0)
578 |
579 |     def _init_alignment_probs_data(self) -> Any:
580 |         return {
581 |             "direct_distortion_table": self._load_distortion_table("invswm"),
582 |             "inverse_distortion_table": self._load_distortion_table("swm"),
583 |         }
584 |
585 |     def _get_alignment_probs(
586 |         self, data: Any, src_words: List[str], trg_words: List[str], alignment: Set[Tuple[int, int]], is_direct: bool
587 |     ) -> Dict[Tuple[int, int], float]:
588 |         distortion_table: Dict[Tuple[int, int], Dict[int, float]]
589 |         if is_direct:
590 |             distortion_table = data["direct_distortion_table"]
591 |         else:
592 |             distortion_table = data["inverse_distortion_table"]
593 |
594 |         probs: Dict[Tuple[int, int], float] = {}
595 |         for src_index, trg_index in alignment:
596 |             i = src_index + 1
597 |             j = trg_index + 1
598 |             prob = 0.0
599 |             elem = distortion_table.get((i, len(trg_words)))
600 |             if elem is not None:
601 |                 prob = elem.get(j, 0.0)
602 |             probs[(src_index, trg_index)] = max(PROB_SMOOTH, prob)
603 |         return probs
604 |
605 |     def _load_distortion_table(self, align_model: str) -> Dict[Tuple[int, int], Dict[int, float]]:
606 |         table: Dict[Tuple[int, int], Dict[int, float]] = {}
607 |         for line in load_corpus(self.model_dir / f"src_trg_{align_model}.d{self.file_suffix}"):
608 |             fields = line.split(maxsplit=5)
609 |             j = int(fields[0])
610 |             i = int(fields[1])
611 |             tlen = int(fields[3])
612 |             prob = float(fields[4])
613 |             key = (i, tlen)
614 |             probs = table.get(key)
615 |             if probs is None:
616 |                 probs = {}
617 |                 table[key] = probs
618 |             probs[j] = prob
619 |         return table
620 |
621 |
622 | class Ibm4GizaAligner(GizaAligner):
623 |     def
__init__( 624 | self, 625 | bin_dir: Path, 626 | model_dir: Path, 627 | m1: Optional[int] = None, 628 | m2: Optional[int] = None, 629 | mh: Optional[int] = None, 630 | m3: Optional[int] = None, 631 | m4: Optional[int] = None, 632 | ) -> None: 633 | super().__init__(bin_dir, model_dir, m1=m1, m2=m2, mh=mh, m3=m3, m4=m4) 634 | 635 | def _init_alignment_probs_data(self) -> Any: 636 | return { 637 | "src_word_classes": self._load_word_classes("src"), 638 | "trg_word_classes": self._load_word_classes("trg"), 639 | "direct_head_distortion_table": self._load_head_distortion_table("invswm"), 640 | "inverse_head_distortion_table": self._load_head_distortion_table("swm"), 641 | "direct_nonhead_distortion_table": self._load_nonhead_distortion_table("invswm"), 642 | "inverse_nonhead_distortion_table": self._load_nonhead_distortion_table("swm"), 643 | } 644 | 645 | def _get_alignment_probs( 646 | self, data: Any, src_words: List[str], trg_words: List[str], alignment: Set[Tuple[int, int]], is_direct: bool 647 | ) -> Dict[Tuple[int, int], float]: 648 | head_distortion_table: Dict[Tuple[int, int], Dict[int, float]] 649 | nonhead_distortion_table: Dict[int, Dict[int, float]] 650 | src_classes: Dict[str, int] 651 | trg_classes: Dict[str, int] 652 | if is_direct: 653 | head_distortion_table = data["direct_head_distortion_table"] 654 | nonhead_distortion_table = data["direct_nonhead_distortion_table"] 655 | src_classes = data["src_word_classes"] 656 | trg_classes = data["trg_word_classes"] 657 | else: 658 | head_distortion_table = data["inverse_head_distortion_table"] 659 | nonhead_distortion_table = data["inverse_nonhead_distortion_table"] 660 | src_classes = data["trg_word_classes"] 661 | trg_classes = data["src_word_classes"] 662 | 663 | cepts: List[List[int]] = [[] for _ in range(0, len(src_words) + 1)] 664 | for src_index, trg_index in alignment: 665 | i = src_index + 1 666 | j = trg_index + 1 667 | insort_left(cepts[i], j) 668 | 669 | probs: Dict[Tuple[int, int], float] = {} 670 | for src_index, trg_index in alignment: 671 | i = src_index + 1 672 | j = trg_index + 1 673 | t = trg_words[j - 1] 674 | trg_word_class = trg_classes[t] 675 | if cepts[i][0] == j: 676 | prev_cept = i - 1 677 | while prev_cept > 0 and len(cepts[prev_cept]) == 0: 678 | prev_cept -= 1 679 | if prev_cept == 0: 680 | src_word_class = 0 681 | center = 0 682 | else: 683 | s_prev_cept = src_words[prev_cept - 1] 684 | src_word_class = src_classes[s_prev_cept] 685 | center = int(ceil(sum(cepts[prev_cept]) / len(cepts[prev_cept]))) 686 | dj = j - center 687 | prob = 0.0 688 | elem = head_distortion_table.get((src_word_class, trg_word_class)) 689 | if elem is not None: 690 | prob = elem.get(dj, 0.0) 691 | probs[(src_index, trg_index)] = max( 692 | PROB_SMOOTH, 693 | IBM4_SMOOTH_FACTOR / (2 * len(trg_words) - 1) + (1 - IBM4_SMOOTH_FACTOR) * prob, 694 | ) 695 | else: 696 | pos_in_cept = cepts[i].index(j) 697 | prev_in_cept = cepts[i][pos_in_cept - 1] 698 | dj = j - prev_in_cept 699 | prob = 0.0 700 | elem = nonhead_distortion_table.get(trg_word_class) 701 | if elem is not None: 702 | prob = elem.get(dj, 0.0) 703 | probs[(src_index, trg_index)] = max( 704 | PROB_SMOOTH, IBM4_SMOOTH_FACTOR / (len(trg_words) - 1) + (1 - IBM4_SMOOTH_FACTOR) * prob 705 | ) 706 | return probs 707 | 708 | def _load_word_classes(self, side: str) -> Dict[str, int]: 709 | word_classes: Dict[str, int] = {} 710 | classes: Dict[str, int] = {} 711 | for line in load_corpus(self.model_dir / f"{side}.vcb.classes"): 712 | word, word_class_str = line.split("\t", maxsplit=2) 
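# mkcls writes "word<TAB>class" lines; remap each class label to a dense 1-based index (0 is used for the NULL position in _get_alignment_probs).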
713 | class_index = classes.get(word_class_str) 714 | if class_index is None: 715 | class_index = len(classes) + 1 716 | classes[word_class_str] = class_index 717 | word_classes[word] = class_index 718 | return word_classes 719 | 720 | def _load_head_distortion_table(self, align_model: str) -> Dict[Tuple[int, int], Dict[int, float]]: 721 | table: Dict[Tuple[int, int], Dict[int, float]] = {} 722 | for line in load_corpus(self.model_dir / f"src_trg_{align_model}.d4.final"): 723 | fields = line.split() 724 | trg_word_class = int(fields[3]) 725 | src_word_class = int(fields[4]) 726 | key = (src_word_class, trg_word_class) 727 | probs = table.get(key) 728 | if probs is None: 729 | probs = {} 730 | table[key] = probs 731 | for index, prob_str in enumerate(fields[9:]): 732 | if prob_str != "0": 733 | dj = index - MAX_SENT_LENGTH 734 | probs[dj] = float(prob_str) 735 | return table 736 | 737 | def _load_nonhead_distortion_table(self, align_model: str) -> Dict[int, Dict[int, float]]: 738 | table: Dict[int, Dict[int, float]] = {} 739 | ext = "db4" if platform.system() == "Windows" else "D4" 740 | is_key_line = True 741 | trg_word_class = 0 742 | for line in load_corpus(self.model_dir / f"src_trg_{align_model}.{ext}.final"): 743 | fields = line.split() 744 | if is_key_line: 745 | trg_word_class = int(fields[3]) 746 | else: 747 | probs = table.get(trg_word_class) 748 | if probs is None: 749 | probs = {} 750 | table[trg_word_class] = probs 751 | for index, prob_str in enumerate(fields): 752 | if prob_str != "0": 753 | dj = index - MAX_SENT_LENGTH 754 | probs[dj] = float(prob_str) 755 | is_key_line = not is_key_line 756 | return table 757 | -------------------------------------------------------------------------------- /lexicon.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Iterable, Iterator, Set, Tuple 2 | from pathlib import Path 3 | 4 | 5 | class Lexicon: 6 | @classmethod 7 | def symmetrize(cls, direct_lexicon: "Lexicon", inverse_lexicon: "Lexicon", threshold: float = 0.0) -> "Lexicon": 8 | src_words: Set[str] = set(direct_lexicon.source_words) 9 | src_words.update(inverse_lexicon.target_words) 10 | 11 | trg_words: Set[str] = set(inverse_lexicon.source_words) 12 | trg_words.update(direct_lexicon.target_words) 13 | 14 | lexicon = Lexicon() 15 | for src_word in src_words: 16 | for trg_word in trg_words: 17 | direct_prob = direct_lexicon[src_word, trg_word] 18 | inverse_prob = inverse_lexicon[trg_word, src_word] 19 | prob = max(direct_prob, inverse_prob) 20 | if prob > threshold: 21 | lexicon[src_word, trg_word] = prob 22 | return lexicon 23 | 24 | def __init__(self) -> None: 25 | self._table: Dict[str, Dict[str, float]] = {} 26 | 27 | def __getitem__(self, indices: Tuple[str, str]) -> float: 28 | src_word, trg_word = indices 29 | src_entry = self._table.get(src_word) 30 | if src_entry is None: 31 | return 0 32 | return src_entry.get(trg_word, 0) 33 | 34 | def __setitem__(self, indices: Tuple[str, str], value: float) -> None: 35 | if value == 0: 36 | return 37 | src_word, trg_word = indices 38 | src_entry = self._table.get(src_word) 39 | if src_entry is None: 40 | src_entry = {} 41 | self._table[src_word] = src_entry 42 | src_entry[trg_word] = value 43 | 44 | def __iter__(self) -> Iterator[Tuple[str, str, float]]: 45 | return ( 46 | (src_word, trg_word, prob) 47 | for (src_word, trg_words) in self._table.items() 48 | for (trg_word, prob) in trg_words.items() 49 | ) 50 | 51 | @property 52 | def source_words(self) -> Iterable[str]: 53 | 
return self._table.keys() 54 | 55 | @property 56 | def target_words(self) -> Iterable[str]: 57 | trg_words: Set[str] = set() 58 | for src_entry in self._table.values(): 59 | trg_words.update(src_entry.keys()) 60 | return trg_words 61 | 62 | def write(self, file_path: Path) -> None: 63 | with open(file_path, "w", encoding="utf-8", newline="\n") as file: 64 | for src_word, trg_word, prob in sorted(self, key=lambda t: (t[0], -t[2], t[1])): 65 | file.write(f"{src_word}\t{trg_word}\t{round(prob, 8)}\n") 66 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sil-machine -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import os 3 | from pathlib import Path 4 | from typing import IO, Iterable, Iterator, Set, Tuple 5 | 6 | 7 | def write_corpus(corpus_path: Path, sentences: Iterable[str]) -> None: 8 | with open(corpus_path, "w", encoding="utf-8", newline="\n") as file: 9 | for sentence in sentences: 10 | file.write(sentence + "\n") 11 | 12 | 13 | def load_corpus(corpus_path: Path) -> Iterator[str]: 14 | with open(corpus_path, "r", encoding="utf-8-sig") as in_file: 15 | for line in in_file: 16 | line = line.strip() 17 | yield line 18 | 19 | 20 | def parse_giza_alignments(alignments_file: IO[str]) -> Iterable[Set[Tuple[int, int]]]: 21 | line_index = 0 22 | for line in alignments_file: 23 | line = line.strip() 24 | if line.startswith("#"): 25 | line_index = 0 26 | elif line_index == 2: 27 | start = line.find("({") 28 | end = line.find("})") 29 | src_index = -1 30 | pairs: Set[Tuple[int, int]] = set() 31 | while start != -1 and end != -1: 32 | if src_index > -1: 33 | trg_indices_str = line[start + 2 : end].strip() 34 | trg_indices = trg_indices_str.split() 35 | pairs.update(((src_index, int(trg_index) - 1) for trg_index in trg_indices)) 36 | start = line.find("({", start + 2) 37 | if start >= 0: 38 | end = line.find("})", end + 2) 39 | src_index += 1 40 | yield pairs 41 | line_index += 1 42 | 43 | 44 | def remove_bom_inplace(path): 45 | """Removes BOM mark, if it exists, from a file and rewrites it in-place""" 46 | buffer_size = 4096 47 | bom_length = len(codecs.BOM_UTF8) 48 | 49 | with open(path, "r+b") as fp: 50 | chunk = fp.read(buffer_size) 51 | if chunk.startswith(codecs.BOM_UTF8): 52 | i = 0 53 | chunk = chunk[bom_length:] 54 | while chunk: 55 | fp.seek(i) 56 | fp.write(chunk) 57 | i += len(chunk) 58 | fp.seek(bom_length, os.SEEK_CUR) 59 | chunk = fp.read(buffer_size) 60 | fp.seek(-bom_length, os.SEEK_CUR) 61 | fp.truncate() 62 | --------------------------------------------------------------------------------
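As a sanity check on `remove_bom_inplace` above (the chunked in-place shift is easy to misread), here is a minimal, hypothetical round-trip; it assumes `utils.py` is importable from the working directory:

```
import codecs
import tempfile
from pathlib import Path

from utils import remove_bom_inplace

with tempfile.TemporaryDirectory() as td:
    path = Path(td) / "corpus.txt"
    # A UTF-8 file that starts with a BOM, as written by some editors.
    path.write_bytes(codecs.BOM_UTF8 + "das Haus\nthe house\n".encode("utf-8"))
    remove_bom_inplace(path)
    # The BOM is gone and the remaining bytes are untouched.
    assert path.read_bytes() == b"das Haus\nthe house\n"
```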