├── LICENSE ├── README.md ├── align_anchors.py ├── anchoring.pyx ├── buffer_work_space.pyx ├── environment.yml ├── eval_data ├── README.md ├── bleualign │ ├── deu │ │ ├── test0.txt │ │ ├── test1.txt │ │ ├── test2.txt │ │ ├── test3.txt │ │ ├── test4.txt │ │ ├── test5.txt │ │ └── test6.txt │ ├── fra │ │ ├── test0.txt │ │ ├── test1.txt │ │ ├── test2.txt │ │ ├── test3.txt │ │ ├── test4.txt │ │ ├── test5.txt │ │ └── test6.txt │ └── gold │ │ ├── dev.txt │ │ ├── test0.txt │ │ ├── test1.txt │ │ ├── test2.txt │ │ ├── test3.txt │ │ ├── test4.txt │ │ ├── test5.txt │ │ └── test6.txt └── parice │ ├── eng │ ├── es_1.txt │ ├── n_1.txt │ ├── n_2.txt │ ├── n_3.txt │ ├── s_1.txt │ ├── s_2.txt │ ├── s_3.txt │ ├── t_1.txt │ ├── t_2.txt │ └── u_1.txt │ ├── gold │ ├── es_1.txt │ ├── n_1.txt │ ├── n_2.txt │ ├── n_3.txt │ ├── s_1.txt │ ├── s_2.txt │ ├── s_3.txt │ ├── t_1.txt │ ├── t_2.txt │ └── u_1.txt │ └── isl │ ├── es_1.txt │ ├── n_1.txt │ ├── n_2.txt │ ├── n_3.txt │ ├── s_1.txt │ ├── s_2.txt │ ├── s_3.txt │ ├── t_1.txt │ ├── t_2.txt │ └── u_1.txt ├── evaluation └── evaluate.py ├── file_read_back.pyx ├── files2align.py ├── galechurch.pyx ├── greedy.pyx ├── repeatedTimer.py ├── reportInfo.py ├── requirements.txt ├── sentAlign.py └── utilities.pyx /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SentAlign 2 | 3 | SentAlign is a sentence alignment tool for parallel corpora. It uses [LaBSE](https://aclanthology.org/2022.acl-long.62.pdf) embeddings to find sentence pairs that are similar in meaning 4 | and an alignment algorithm based on Dijkstra's algorithm to find the optimal alignment. Dynamic programming with cosine similarity tends to favour many-to-many alignments over 1-1 alignments. To counteract this we end with re-evaluating each alignment by inspecting mergers, insertions and deletions. The approach is described in more detail in the paper [SentAlign: Accurate and Scalable Sentence Alignment](). 5 | 6 | ### License 7 | Copyright 2023 Steinþór Steingrímsson 8 | 9 | SentAlign is released under the [Apache License, Version 2.0](LICENSE). 
10 | 11 | 12 | ### Building the environment 13 | 14 | If you haven't already, check out the repository: 15 | ```bash 16 | git clone https://github.com/steinst/SentAlign.git 17 | cd SentAlign 18 | ``` 19 | 20 | The environment can be built using the provided environment.yml file: 21 | ```bash 22 | conda env create -f environment.yml 23 | ``` 24 | 25 | ### Running the aligner 26 | We assume that the documents to be aligned have the same names in the source and target language, but are kept in folders named using the language code. For example, if we want to align the files in the folder `/path/to/files` we would have the following structure: 27 | ```bash 28 | /path/to/files/eng/file1.txt 29 | /path/to/files/eng/file2.txt 30 | ... 31 | /path/to/files/isl/file1.txt 32 | /path/to/files/isl/file2.txt 33 | ``` 34 | 35 | Assuming a Conda environment has been built as described above, the environment has to be activated before SentAlign is run: 36 | ```bash 37 | conda activate SentAlign 38 | ``` 39 | 40 | Start by creating a list of files to align: 41 | 42 | ```bash 43 | python3 files2align.py -dir /path/to/files --source-language eng 44 | ``` 45 | 46 | Then you run the alignments. Aligning English and Icelandic files: 47 | 48 | ```bash 49 | python3 sentAlign.py -dir /path/to/files -sl eng -tl isl 50 | ``` 51 | 52 | The aligner generates two types of files in the output folder: `/path/to/files/output`. Files ending with `.path` contain the alignments and LaBSE scores. They are formatted like this: 53 | 54 | ``` 55 | [0]:[0,1]:0.7565563 56 | [1]:[2]:0.99999994 57 | [2]:[3]:0.92132425 58 | [3]:[4]:0.87890404 59 | [4]:[5,6]:0.9721296 60 | []:[7]:0 61 | [5]:[8]:0.89229476 62 | [6]:[9]:0.70389956 63 | ... 64 | ``` 65 | 66 | and `.aligned` files containing the aligned sentence pairs as well as the LaBSE score.
Source sentence in the first column, target sentence in the second and LaBSE score in the third: 67 | 68 | 69 | 70 | ## Evaluating Test Sets 71 | The SentAlign paper evaluates the aligner on two evaluation sets: the German-French evaluation set comprising data from the text+berg corpus and published with BleuAlign, and an Icelandic-English test set using data from the Parice corpus. 72 | 73 | To reproduce the results, run the following commands for the German-French test set: 74 | 75 | ```bash 76 | python3 files2align.py -dir eval_data/bleualign --source-language deu 77 | python3 sentAlign.py -dir eval_data/bleualign -sl deu -tl fra 78 | python3 evaluation/evaluate.py -t eval_data/bleualign/output/test*.txt.path -g eval_data/bleualign/gold/test*.txt 79 | ``` 80 | Which should give you the following results: 81 | ``` 82 | --------------------------------- 83 | | | Strict | Lax | 84 | | Precision | 0.935 | 0.998 | 85 | | Recall | 0.929 | 0.933 | 86 | | F1 | 0.932 | 0.964 | 87 | --------------------------------- 88 | ``` 89 | 90 | ## Parameters 91 | 92 | ### Input and output settings 93 | ```bash 94 | '--corpus-folder', '-dir' 95 | '--source-language', '-sl', default='eng' 96 | '--target-language', '-tl', default='isl' 97 | '--filename', '-f', help='Name of source and target file(s) to be aligned', type=str, nargs='+' 98 | '--output-folder', '-out', default='output' 99 | ``` 100 | ### Aligner settings 101 | ```bash 102 | '-n', '--num_overlaps', type=int, default=4, help='Maximum number of allowed overlaps.' 
103 | '--max-concatenations', '-concats', type=int, help='Maximum number of concatenated sentences per language', default=4 104 | '--free-concatenations', '-freejoins', type=int, help='Maximum number of concatenations before penalty is applied', default=2 105 | '--score-cutoff', '-cutoff', type=float, help='Minimum similarity score for a sentence pair to be considered', default=0.4 106 | '--minimum-length-words', '-minwords', type=int, help='Minimum number of words per language, for a sentence pair to be considered', default=1 107 | '--maximum-length-words', '-maxwords', type=int, help='Maximum number of words per language, for a sentence pair to be considered', default=110 108 | '--penalty-after-words', '-penwords', type=int, help='Maximum number of words per language, before a length penalty is applied', default=80 109 | '--penalty-per-word', '-wordpen', type=float, help='Penalty applied for each word when maximum number of unpenalized words have been reached', default=0.01 110 | '--anchoring-delimiter', '-anchor', type=int, help='Maximum nodes in the alignment graph, before applying hard delimiters.', default=4000000 111 | '--maximum-length-gale-church', '-maxgc', type=float, help='Maximum number of sentences in file for Gale-Church alignment. 
If longer, only greedy alignment selection applied', default=10000 112 | ``` 113 | ### Other settings 114 | ```bash 115 | '--proc-device', '-device', help='cuda for gpu, cpu if you don''t have an NVIDIA graphics card', default='cuda' 116 | '--num-proc', '-proc', help='number of processors to allocate for the pathfinding calculations', default=8 117 | ``` 118 | 119 | ## Publications 120 | 121 | If you use SentAlign, please cite the SentAlign paper: 122 | 123 | ```bibtex 124 | @inproceedings{sentalign-2023, 125 | title = {{SentAlign: Accurate and Scalable Sentence Alignment}}, 126 | author = "Steingrímsson, Steinþór and 127 | Loftsson, Hrafn and 128 | Way, Andy", 129 | booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations", 130 | month = dec, 131 | year = "2023", 132 | address = "Singapore, Singapore", 133 | publisher = "Association for Computational Linguistics", 134 | } 135 | ``` 136 | SentAlign is also described in Steinþór Steingrímsson's PhD thesis: 137 | 138 | ```bibtex 139 | @phdthesis{Steingrimsson2023Phd, 140 | title = {Effectively compiling parallel corpora for machine translation in resource-scarce conditions}, 141 | school = {Reykjavik University}, 142 | author = {Steingrímsson, Steinþór}, 143 | year = {2023}, 144 | } 145 | ``` 146 | -------------------------------------------------------------------------------- /anchoring.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | 3 | """ 4 | Cython helpers that pick anchor points from Gale-Church alignments confirmed by LaBSE scores 5 | """ 6 | cimport cython 7 | import numpy as np 8 | cimport numpy as np 9 | 10 | # Keeps the Gale-Church alignments that also occur among the [source, target] anchor pairs whose spans lie inside the given window and whose LaBSE score exceeds minimum_anchor_score, then appends [str(source_len), str(target_len)]; src_emb_dict and trg_emb_dict are not used. 11 | def calculate_anchor_nomatrix_set(galechurch_alignments, src_emb_dict, trg_emb_dict, float minimum_anchor_score, 12 | anchor_source_list_lines, anchor_target_list_lines, int source_len, int target_len, 13 | anchor_source_list, anchor_target_list, int start_source, int end_source, 14 | int
start_target, int end_target, labse_score_matrix, 15 | source_loc_start_list, source_loc_end_list, target_loc_start_list, target_loc_end_list): 16 | cdef int i_ctr 17 | cdef int j_ctr 18 | cdef str i, j 19 | cdef int source_loc_start, source_loc_end, target_loc_start, target_loc_end 20 | cdef float labse_score 21 | 22 | labse_alignments = [] 23 | i_ctr = 0 24 | for i in anchor_source_list: 25 | j_ctr = 0 26 | source_loc_start = source_loc_start_list[i_ctr] 27 | source_loc_end = source_loc_end_list[i_ctr] 28 | if source_loc_start >= start_source and source_loc_end < end_source: 29 | for j in anchor_target_list: 30 | target_loc_start = target_loc_start_list[j_ctr] 31 | target_loc_end = target_loc_end_list[j_ctr] 32 | if target_loc_start >= start_target and target_loc_end < end_target: 33 | labse_score = labse_score_matrix[i_ctr][j_ctr] 34 | if labse_score > minimum_anchor_score: 35 | labse_alignments.append([anchor_source_list_lines[i_ctr].strip(), anchor_target_list_lines[j_ctr].strip()]) 36 | j_ctr += 1 37 | i_ctr += 1 38 | 39 | a_list = [value for value in galechurch_alignments if value in labse_alignments] + [[str(source_len), str(target_len)]] 40 | return a_list 41 | 42 | # Same selection driven by np.where(labse_matrix > minimum_ancor_score); candidate spans must lie more than 2 positions inside each window edge, and [str(end_source), str(end_target)] is appended as the final anchor. 43 | def calculate_anchor_set(source_start_matrix, source_end_matrix, target_start_matrix, target_end_matrix, 44 | galechurch_alignments, labse_matrix, double minimum_ancor_score, anchor_source_list_lines, 45 | anchor_target_list_lines, int start_source, int end_source, int start_target, int end_target): 46 | 47 | cdef int i_ctr, j_ctr 48 | labse_alignments = [] 49 | 50 | above_threshold = np.where(labse_matrix > minimum_ancor_score) 51 | 52 | for i_ctr, j_ctr in zip(above_threshold[0], above_threshold[1]): 53 | if source_start_matrix[i_ctr] > start_source + 2 and source_end_matrix[i_ctr] < end_source - 2: 54 | if target_start_matrix[j_ctr] > start_target + 2 and target_end_matrix[j_ctr] < end_target - 2: 55 | labse_alignments.append([anchor_source_list_lines[i_ctr].strip(), 
anchor_target_list_lines[j_ctr].strip()]) 56 | 57 | anchor_list = [value for value in galechurch_alignments if value in labse_alignments] + [[str(end_source), str(end_target)]] 58 | return anchor_list 59 | -------------------------------------------------------------------------------- /buffer_work_space.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | 3 | """ 4 | A Cython adaptation of the BufferWorkSpace helper from file_read_backwards 5 | """ 6 | """Adapted from:""" 7 | """BufferWorkSpace module.""" 8 | """Author: Robin Robin""" 9 | """Email: robinsquare42@gmail.com""" 10 | """Version: 2.0.0""" 11 | """Code available at: https://github.com/RobinNil/file_read_backwards""" 12 | 13 | cimport cython 14 | 15 | import os 16 | 17 | new_lines = ["\r\n", "\n", "\r"] 18 | new_lines_bytes = [n.encode("ascii") for n in new_lines] # we only support encodings that are backward compatible with ASCII 19 | 20 | 21 | cdef class BufferWorkSpace: 22 | """It is a helper module for FileReadBackwards.""" 23 | cdef int chunk_size 24 | cdef int read_position 25 | cdef bytes read_buffer 26 | cdef int n 27 | cdef int seek_position 28 | cdef int read_size 29 | cdef int i 30 | cdef int l 31 | cdef fp 32 | # NOTE(review): read_position is declared as a C int; a file size above INT_MAX presumably cannot be stored here — confirm for very large files 33 | def __init__(self, fp, int chunk_size): 34 | """Convention for the data. 35 | When read_buffer is not None, it represents contents of the file from `read_position` onwards 36 | that has not been processed/returned. 37 | read_position represents the file pointer position that has been read into read_buffer 38 | initialized to be just past the end of file. 39 | """ 40 | self.fp = fp 41 | self.read_position = _get_file_size(self.fp) # set the previously read position to the end of the file 42 | self.read_buffer = None 43 | self.chunk_size = chunk_size 44 | 45 | def add_to_buffer(self, bytes content, int read_position): 46 | """Add additional bytes content as read from the read_position. 
47 | Args: 48 | content (bytes): data to be added to the BufferWorkSpace buffer. 49 | read_position (int): where in the file pointer the data was read from. 50 | """ 51 | self.read_position = read_position 52 | if self.read_buffer is None: 53 | self.read_buffer = content 54 | else: 55 | self.read_buffer = content + self.read_buffer 56 | 57 | def yieldable(self): 58 | """Return True if there is a line that the buffer can return, False otherwise.""" 59 | if self.read_buffer is None: 60 | return False 61 | 62 | t = _remove_trailing_new_line(self.read_buffer) 63 | n = _find_furthest_new_line(t) 64 | if n >= 0: 65 | return True 66 | 67 | # we have read in entire file and have some unprocessed lines 68 | if self.read_position == 0 and self.read_buffer is not None: 69 | return True 70 | return False 71 | 72 | def return_line(self): 73 | """Return a new line if it is available. 74 | Precondition: self.yieldable() must be True 75 | """ 76 | assert(self.yieldable()) 77 | 78 | t = _remove_trailing_new_line(self.read_buffer) 79 | i = _find_furthest_new_line(t) 80 | 81 | if i >= 0: 82 | l = i + 1 83 | after_new_line = slice(l, None) 84 | up_to_include_new_line = slice(0, l) 85 | r = t[after_new_line] 86 | self.read_buffer = t[up_to_include_new_line] 87 | else: # the case where we have read in entire file and at the "last" line 88 | r = t 89 | self.read_buffer = None 90 | return r 91 | 92 | def read_until_yieldable(self): 93 | """Read in additional chunks until it is yieldable.""" 94 | while not self.yieldable(): 95 | read_content, read_position = _get_next_chunk(self.fp, self.read_position, self.chunk_size) 96 | self.add_to_buffer(read_content, read_position) 97 | 98 | def has_returned_every_line(self): 99 | """Return True if every single line in the file has been returned, False otherwise.""" 100 | if self.read_position == 0 and self.read_buffer is None: 101 | return True 102 | return False 103 | 104 | # Return the size of the file behind fp, as reported by os.fstat. 105 | def _get_file_size(fp): 106 | return os.fstat(fp.fileno()).st_size 
107 | 108 | 109 | def _get_next_chunk(fp, int previously_read_position, int chunk_size): 110 | """Return next chunk of data that we would read from the file pointer. 111 | Args: 112 | fp: file-like object 113 | previously_read_position: file pointer position that we have read from 114 | chunk_size: desired read chunk_size 115 | Returns: 116 | (bytestring, int): data that has been read in, the file pointer position where the data has been read from 117 | """ 118 | seek_position, read_size = _get_what_to_read_next(fp, previously_read_position, chunk_size) 119 | fp.seek(seek_position) 120 | read_content = fp.read(read_size) 121 | read_position = seek_position 122 | return read_content, read_position 123 | 124 | 125 | def _get_what_to_read_next(fp, int previously_read_position, int chunk_size): 126 | """Return information on which file pointer position to read from and how many bytes. 127 | Args: 128 | fp: file-like object 129 | previously_read_position (int): The file pointer position that has been read previously 130 | chunk_size(int): ideal io chunk_size 131 | Returns: 132 | (int, int): The next seek position, how many bytes to read next 133 | """ 134 | seek_position = max(previously_read_position - chunk_size, 0) 135 | read_size = chunk_size 136 | 137 | # examples: say, our new_lines are potentially "\r\n", "\n", "\r" 138 | # find a reading point where it is not "\n", rewind further if necessary 139 | # if we have "\r\n" and we read in "\n", 140 | # the next iteration would treat "\r" as a different new line. 141 | # Q: why don't I just check if it is b"\n", but use a function ? 142 | # A: so that we can potentially expand this into generic sets of separators, later on. 
143 | while seek_position > 0: 144 | fp.seek(seek_position) 145 | if _is_partially_read_new_line(fp.read(1)): 146 | seek_position -= 1 147 | read_size += 1 # as we rewind further, let's make sure we read more to compensate 148 | else: 149 | break 150 | 151 | # take care of special case when we are back to the beginning of the file 152 | read_size = min(previously_read_position - seek_position, read_size) 153 | return seek_position, read_size 154 | 155 | 156 | def _remove_trailing_new_line(l): 157 | """Remove a single instance of new line at the end of l if it exists. 158 | Returns: 159 | bytestring 160 | """ 161 | # remove only 1 instance of newline 162 | # match longest separator first (hence the reverse=True), we want to match "\r\n" rather than "\n" if we can 163 | for n in sorted(new_lines_bytes, key=lambda x: len(x), reverse=True): 164 | if l.endswith(n): 165 | remove_new_line = slice(None, -len(n)) 166 | return l[remove_new_line] 167 | return l 168 | 169 | 170 | def _find_furthest_new_line(read_buffer): 171 | """Return -1 if read_buffer does not contain new line otherwise the position of the rightmost newline. 172 | Args: 173 | read_buffer (bytestring) 174 | Returns: 175 | int: The right most position of new line character in read_buffer if found, else -1 176 | """ 177 | new_line_positions = [read_buffer.rfind(n) for n in new_lines_bytes] 178 | return max(new_line_positions) 179 | 180 | 181 | def _is_partially_read_new_line(b): 182 | """Return True when b is part of a new line separator found at index >= 1, False otherwise. 
183 | Args: 184 | b (bytestring) 185 | Returns: 186 | bool 187 | """ 188 | for n in new_lines_bytes: 189 | if n.find(b) >= 1: 190 | return True 191 | return False -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: SentAlign 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | dependencies: 6 | - python=3.8 7 | - transformers=4.22.2 8 | - numpy=1.22.1 9 | - cython=0.29.27 10 | - pytorch-gpu=1.13.0 11 | -------------------------------------------------------------------------------- /eval_data/README.md: -------------------------------------------------------------------------------- 1 | ## Evaluation Sets 2 | 3 | For convenience we provide the two evaluation sets used to evaluate the alignment tool. 4 | 5 | ### Bleualign and Text+Berg evaluation set 6 | The bleualign folder contains the manually aligned German-French evaluation set created from the Text+Berg corpus, first used to evaluate Bleualign and commonly used for sentence alignment evaluation since. If you use that evaluation set you should cite the Bleualign paper: 7 | 8 | ```bibtex 9 | @inproceedings{Sennrich2010MTbasedSA, 10 | title = {{MT-based Sentence Alignment for OCR-generated Parallel Texts}}, 11 | author = "Rico Sennrich and Martin Volk", 12 | booktitle = "Proceedings of the 9th Conference of the Association for Machine Translation in the Americas: Research Papers", 13 | address = "Denver, Colorado", 14 | year = "2010", 15 | publisher = "Association for Machine Translation in the Americas", 16 | url = "https://aclanthology.org/2010.amta-papers.14", 17 | } 18 | ``` 19 | 20 | ### ParIce evaluation set 21 | The parice folder contains an evaluation set for English-Icelandic sentence alignment from 10 aligned documents in five subcorpora of the ParIce corpus. 
The evaluation set is [distributed](https://repository.clarin.is/repository/xmlui/handle/20.500.12537/150) under a CC BY 4.0 license. If you use that evaluation set, please consider citing the ParIce paper and the SentAlign paper where the evaluation set was first used. 22 | 23 | The ParIce paper: 24 | 25 | ```bibtex 26 | @inproceedings{barkarson-steingrimsson-2019-compiling, 27 | title = {{Compiling and Filtering {P}ar{I}ce: An {E}nglish-{I}celandic Parallel Corpus}}, 28 | author = "Barkarson, Starka{\dh}ur and Steingr{\'\i}msson, Stein{\th}{\'o}r", 29 | booktitle = "Proceedings of the 22nd Nordic Conference on Computational Linguistics", 30 | month = sep # "{--}" # oct, 31 | year = "2019", 32 | address = "Turku, Finland", 33 | publisher = {Link{\"o}ping University Electronic Press}, 34 | url = "https://aclanthology.org/W19-6115", 35 | pages = "140--145", 36 | } 37 | ``` 38 | The SentAlign paper: 39 | 40 | ```bibtex 41 | @inproceedings{sentalign-2023, 42 | title = {{SentAlign: Accurate and Scalable Sentence Alignment}}, 43 | author = "Steingrímsson, Steinþór and 44 | Loftsson, Hrafn and 45 | Way, Andy", 46 | booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations", 47 | month = dec, 48 | year = "2023", 49 | address = "Singapore, Singapore", 50 | publisher = "Association for Computational Linguistics", 51 | } 52 | ``` 53 | 54 | -------------------------------------------------------------------------------- /eval_data/bleualign/deu/test2.txt: -------------------------------------------------------------------------------- 1 | .in Tag in Uschenen 2 | Hanspeter Sigrist , Oberbalm 3 | In den Wänden von 4 | Uschenen 5 | ( Weg der Jugend ) 6 | Ein Klettergebiet macht Geschichte In der Anfangsphase , als sich in der Schweiz die Entwicklung zum Freiklettern erst abzuzeichnen begann , wurden am Felsband von Üschenen , oberhalb von Kandersteg , Anstiege eröffnet , die zu den schwierigsten der Schweiz zählten 
. 7 | Die Routen bewegten sich zunächst im Bereich des sechsten , dann des siebten und schliesslich des achten Grades . 8 | Besonders Aufsehen erregten die konsequent von unten eröffneten Routen wie Le Toit , Quo Vadis , Via del Ladro Corda und Kolibri . 9 | Als man aber auch in Üschenen begann , die ersten , meist kürzeren Anstiege abseilend zu eröffnen , wurde es plötzlich etwas stiller um dieses Klettergebiet . 10 | Während die einen diese neue Praxis verärgerte , glaubten andere , die vorhandenen Möglichkeiten seien bereits weitgehend ausgeschöpft . 11 | Und so konzentrierte sich das Interesse auf Gebiete , die noch weniger erschlossen waren . 12 | Erst 1988 rückten durch die Eröffnung neuer Routen die landschaftlich überaus reizvoll gelegenen Felsen von Üschenen wieder ins Blickfeld . 13 | In der Zwischenzeit hatte man auch gelernt , die verschiedenen Aspekte mit mehr Objektivität und der nötigen Toleranz zu betrachten , sind doch die Unterschiede zwischen von unten und von oben eingerichteten Klettereien nun jedem klar und Vor und Nachteile bekannt geworden . 14 | Beides ist möglich , beides hat seine Berechtigung und nicht zuletzt auch seinen besonderen Reiz . 15 | Es gibt anspruchsvolle Anstiege wie zum Beispiel die von unten eröffnete Route Kumulus von Martin Stettier , die eine gesunde Moral und hohes Können erfordern und damit auch über einen ganz eigenen Erlebniswert verfügen . 16 | Dasselbe gilt für die von den rein klettertechnischen Schwierigkeiten her gesehenen Spitzenrouten und Anziehungspunkte für leistungsstarke Kletterer Bscbüttigütti{\Q ) und Fusion ( 10- ) . 17 | Diese stellen jedoch andere Anforderungen - nicht nur an den Kletterer , welcher der Besonderheit der Route mit Konsequenz und grösster Konzentration begegnen muss , sondern auch an den Sichernden , der viel zu einem schnellen Gelingen eines solch anspruchsvollen Unternehmens beitragen kann . 
18 | Die beiden Spitzenrouten wurden 1988 erstmals Rotpunkt geklettert : 19 | die Fusion durch Jürg von Känel im Oktober , und das langjährige Projekt Bschüttigüttigelang dem Autor an einem neblig-kalten Tag im Sommer . 20 | Dies , nachdem die Route neu eingerichtet und die Linienführung im obersten Teil noch bestimmt werden musste . 21 | Die Durchsteigung derartiger Routen bietet - besonders wenn sie , wie in diesem Fall , nach nur sehr kurzer Vorbereitungszeit bereits im ersten Vorstiegsversuch gelingt sehr intensive Klettererlebnisse und gehört deshalb zu den Höhepunkten im Leben eines Kletterers . 22 | Jene Augenblicke , in denen alle Voraussetzungen gegeben sind , damit eine Route im 10. Schwierigkeitsgrad in sehr kurzer Zeit gelingt , lassen sich kaum im voraus bestimmen oder planen . 23 | Zumindest mir scheint diese Fähigkeit nicht gegeben zu sein - selbst wenn ich mich vorher lange und intensiv mit mir und meinem Ziel auseinandergesetzt habe . 24 | Vielleicht kann man aber eine solche Herausforderung auch nur bestehen , wenn ihr eine besondere Situation vorausgegangen ist . 25 | Eine Situation , die , von jedem Erfolgsdruck befreit , hemmende Blockierungen löst und so erst die notwendige Ausgewogenheit der Bewegungsabläufe ermöglicht . 26 | Manchmal spielen aber auch klimatische Bedingungen eine grosse Rolle , indem sie erlauben , die Kraft voll auszuspielen - oder eben nicht . 27 | Kaum etwas vermag mich in einem schwierigen Aufstieg mehr zu irritieren als das durch eine etwas zu hohe Temperatur hervorgerufene unangenehme Gefühl , fast unmerklich , aber ständig von jedem Griff zu rutschen . 28 | In solchen Fällen konzentriere ich mich dann ganz von selbst auf dieses Problem und werde damit vom Klettern abgelenkt . 29 | Es kann aber auch vorkommen , dass ein Umfeld , so zum Beispiel zufällig anwesende Personen , derart motivierend wirkt , dass ein Kletterer sich plötzlich mit anscheinend grösster Sicherheit am Fels bewegen kann . 
30 | Allerdings zeigt sich dann meist einige Zeit später , dass solche nicht zuletzt durch Im weit ausladenden Dach der Route « Fusion » ( 10- ) 31 | A 32 | äussere Faktoren zustande gekommenen Leistungen mit enormem Kraftaufwand verbunden sind . 33 | Um so wertvoller und erlebnisintensiver werden deshalb jene Momente empfunden , in denen man sich den Anforderungen gewachsen fühlt , selbst wenn die Ziele sehr hoch gesteckt sind . 34 | Die persönliche Situation , die ( Atmosphäre ) , der Tag , die Person des Sichernden all das und vielleicht noch mehr müssen ideal zusammenpassen , um ein durchgehend positives Umfeld zu schaffen . 35 | Routenziele Bei der Fusion handelt es sich um eine Kombination aus einer bestehenden Route im 9. Schwierigkeitsgrad und einer davon abzweigenden Traverse über ein ausladendes Dach . 36 | Der Ort , wo die ( Fusion ) stattfinden soll , ist gleichzeitig auch die Schlüsselstelle der gesamten Tour . 37 | Nach einer steilen , mit messerscharfen kleinen Griffen bestückten Passage und einem eindrücklich ausladenden Dach folgt die Stelle , an der sich alles entscheidet . 38 | Das Vor und Nachher ist verhältnismässig leicht in den Griff zu bekommen , nicht aber die Loslösung von der ursprünglichen Linie , die gerade aufwärts weiterführen würde . 39 | Ich habe mir die Route zusammen mit Heinz Gut ein erstes Mal an einem wunderschönen Herbstnachmittag im November angeschaut . 40 | Neben einem kurzen mit der etwas speziellen Linienführung und den originellen Bewegungsabläufen vor und nach der Schlüsselpassage reicht es gerade noch für je einen Vorstiegsversuch . 41 | An der Schlüsselstelle sind wir aber bereits völlig ausgepumpt , chancenlos , den ( Absprung ) von der geraden Linie überhaupt zu wagen . 42 | Voller Ehrfurcht beginnen wir auf der Heimfahrt von der neuesten zu sprechen . 43 | Eine Woche später droht sich die Zeit des stabilen schönen Herbstwetters ihrem Ende zu nähern . 
44 | Und damit scheint auch der Traum von der Fusion für dieses Jahr ausgeträumt zu sein . 45 | Heinz weilt in Südfrankreich , er hat dort Ziele , die ihm eher machbar erscheinen . 46 | Und Gabriele will nach Italien . 47 | Sie mag den Nebel hier nicht und befürchtet , um diese Jahreszeit an den voralpinen Felsen ohnehin nur zu frieren . 48 | Am heutigen warmen und windstillen Tag sollte dies jedoch nicht der Fall sein , weshalb ich mich entschliesse , nochmals die Fusion zu versuchen und dann abends loszufahren . 49 | So können wir den vielleicht letzten sonnigen Herbsttag nützen , und Gabriele hätte die Möglichkeit , sich etwas von ihrer weiten nächtlichen Anfahrt aus Deutschland zu erholen . 50 | Die eigenen Ziele zu nennen ist oft gar nicht so leicht . 51 | Geklettert wird nach wie vor zu zweit , und meist haben beide Partner ihre Routenziele und Vorstellungen , die auf einen zufriedenstellenden Nenner gebracht werden müssen . 52 | Deshalb scheint es mir manchmal recht schwierig , die eigenen Interessen vor mir selbst und dem Partner einzugestehen und gegebenenfalls sogar durchzusetzen , befürchte ich dabei doch , im Falle eines Misserfolges seine Hilfe zu Unrecht beansprucht zu haben . 53 | Andererseits sollten wir trotz derartiger Bedenken vielleicht vermehrt daran glauben , dass solche Unterstützung gerne geleistet wird und von Herzen kommt , wenn der Partner spürt , dass der richtige Moment da ist , um dem andern zu helfen , sein Ziel zu erreichen . 54 | In der Route « Bschütti-gütti ) ( 10 ) , der schwierigsten Route im Klettergebiet von Üschpnpn Und es ist der richtige Moment ! 55 | Üschenen präsentiert sich von seiner allerschönsten Seite . 56 | Die Luft ist frisch , und am Fusse der Felsen kann man sich noch in die warme Sonne legen . 57 | Die Berge sind schon weit hinunter eingeschneit , und bis auf einen einzelnen Kletterer , der am Einrichten einer neuen Route ist , sind wir hier oben allein . 58 | Ein 7. 
Grad zum Einklettern und einige kurze Testzüge in den schwierigen Passagen der Route mit besonderer Aufmerksamkeit auf den Schlüsselzug dienen der Einstimmung . 59 | Das linke Handgelenk schmerzt bei der extrem aufgestellten Fingerhaltung an dem runden Griff . 60 | Die kleine , unscheinbare Warze bohrt sich in die Fingerkuppe des rechten Zeigefingers . 61 | Hier muss ich mich mit aller Kraft festhalten , nur dann ist der weite dynamische Zug an den Fingerschlitz möglich . 62 | Dieser erste Vorstiegsversuch gelingt gar nicht schlecht . 63 | Beim Einhängen der blauen fixen Schlinge bin ich aber instabil und brauche deshalb viel Kraft . 64 | Zudem liegt die Hand unter dem Seil . 65 | Das kostet zu viel Zeit , um sie für das schwierige Nachgreifen freizubekommen . 66 | Ein zweiter Anlauf erfolgt nur wenig später , solange der richtige Teil der Bewegungsabläufe noch im Gefühl ist . 67 | Gabriele hat mich für das schwierige Einhängemanöver beruhigt und mir Mut gemacht . 68 | Ihre Anweisungen helfen mir in diesem Moment sehr viel , und es geht auch gleich deutlich besser . 69 | Nur um wenige Millimeter verfehle ich den Griff . 70 | Pause . 71 | Wir wandern ein wenig umher und schauen uns den neuen kleinen Klettergarten an . 72 | Dann auch das Bschüttigütti . 73 | Immer wieder fasziniert mich diese Linie durch die steil aufschiessende Platte mit ihren nur winzigen Einkerbungen . 74 | Ich fühle mich gut . 75 | Ausgewogen und ruhig . 76 | Vielleicht deshalb , weil es mir in diesem Moment hier oben , inmitten einer wunderschönen Landschaft , an nichts fehlt . 77 | Die Pause ist um , und ich muss wieder etwas tun : 78 | Den nächsten Versuch wagen . 79 | Wiederum bekunde ich Mühe , die Bewegung im entscheidenden Moment genau zu erfühlen und zu kontrollieren . 80 | Vor dem dynamischen Zug nehme ich noch zu viel Schwung , was bei der Kleinheit der Griffe ohnehin ein Unsinn ist . 81 | Selber fällt mir der kleine Fehler aber kaum auf . 
82 | Ich spüre nur , dass etwas noch nicht ganz stimmt . 83 | Gabriele gesteht mir noch einen Versuch zu , nur noch diesen einen . 84 | Beim Losklettern frage ich mich kurz , ob sie wohl ungeduldig ist oder ob andere Überlegungen dahinterstecken . 85 | Bestimmt würde sie mich noch einen weiteren Anlauf machen lassen , selbst wenn ich genau weiss , dass Konzentration und Kraft dazu nicht mehr ausreichen , dass die Haut an der Fingerspitze bald durchreisst und bei aller Feilscherei um einen fünften Versuch dieser mit Sicherheit danebengehen würde . 86 | Im Dach ist kein Platz für derartige Überlegungen . 87 | Die weiten Züge erfordern volle Aufmerksamkeit . 88 | Dann der Überkreuzer an den winzigen Griff , der nötige Druck auf den Fussen , der Zwischengriff , und - mit etwas Glück - erwische ich den Schlitz mit einem Teil der Fingerspitzen . 89 | Ein kurzes Nachfassen und ich habe ihn richtig in der Hand . 90 | Die Traverse hat begonnen und muss jetzt noch konzentriert zu Ende geführt werden - bis an das vordere Dachende . 91 | Einhängen des letzten Hakens und ein entschlossener weiter Zug an den Ausstiegsgriff . 92 | Die letzten Sonnenstrahlen und die freundliche Sicherungshilfe von Ernst Müller - er ist alleine hier oben unterwegs - erlauben uns sogar noch , den Quergang ein zweites Mal zu klettern , dabei einige Bilder zu machen und den schönen Moment noch etwas auszukosten . 93 | In bester Laune packen wir unsere Sachen zusammen und verabschieden uns von diesem einzigartigen Ort . 94 | Die Sonne ist hinter dem Lohner verschwunden , und es wird schnell kalt . 95 | Die Bewegung auf dem Abstieg wärmt uns aber bald wieder auf . 
96 | -------------------------------------------------------------------------------- /eval_data/bleualign/deu/test4.txt: -------------------------------------------------------------------------------- 1 | ■rinnerungen Piz Buin und Piz Platta 2 | Romedi Reinalter , S-chanf 3 | Piz Platta im Oberhalbstein 4 | Die Skitouren der SAC-Sektion Bernina auf den Piz Buin und den Piz Platta in den Rhätischen Alpen gehören schon lange der Vergangenheit an . 5 | Dies erinnert mich an mein den damaligen Teilnehmern gegebenes Versprechen , die persönlichen Eindrücke niederzuschreiben , ebenso aber auch an die Gründe , warum es dann doch nicht dazu gekommen ist . 6 | Einmal in den hektischen Alltagsbetrieb zurückgekehrt , musste bald diesem , bald jenem Priorität gegeben werden , so dass vieles , das auf den ersten Blick weniger von Belang erschien , auf die lange Bank geschoben wurde . 7 | Aber aufgeschoben ist nicht aufgeho- ben , denn ein solches Versprechen wirkt weiter , macht sich bemerkbar , bleibt als ständiger leiser Vorwurf bestehen . 8 | Soeben habe ich am Fusse der Crasta Mora , auf einem südexponierten Hang gepicknickt . 9 | Unten in der Talebene , wo sich noch eine dünne , aber harte Schneeschicht hat halten können , gleitet ein Langläufer den schneefreien Rändern des einstigen Bachverlaufs des Beverin ausweichend , leicht auf und absteigend dahin . 10 | Meine Augen folgen ihm , bis er in der Ferne verschwindet , und meine Gedanken schweifen zurück . 11 | Zurück zu den Skitouren der Sektion Bernina auf den Piz Buin und den Piz Platta . 12 | Wenn ich mich jetzt zu erinnern versuche , was damals vorgefallen ist , muss ich geste hen , dass mir viele kleine , lustige Einzelgeschichten und Anekdoten , die sich in der Gruppe abgespielt haben , nicht mehr vollständig präsent sind . 
13 | Bei einzelnen Vorkommnissen haben sich die Konturen zum Teil verwischt , sie wirken verschwommen und leben erst dann wieder auf , wenn zu gegebener Zeit , in Anwesenheit der damaligen Teilnehmer , die Eindrücke rekonstruiert werden können . 14 | Davon ausgenommen sind natürlich jene Erleb- Nächste Doppelseite : 15 | Im Aufstieg über den Ostgrat von der Fuorcla Buin zum Piz Buin Grond nisse , denen eine starke subjektive Komponente anhaftet . 16 | Wenn irgendwann , irgendwo das Vergangene , das noch nicht vollständig verdaut ist , wie ein Film im Eiltempo sich abspult , gelange ich oft zu einer differenzierteren Betrachtungsweise oder sogar zu einer anderen Sicht der Dinge . 17 | Jede Tour setzt sich aus einer Aneinanderreihung von vielen einzelnen Details zusammen , die erst in ihrem gegenseitigen Verhältnis wieder einen Gesamteindruck vermitteln . 18 | Eigentlich nichts Aussergewöhnliches , etwas , das sich im täglichen Leben im Gemsen , eine auf Skitouren stets wieder anzutreffende Wildart mer wieder abspielt . 19 | Eine Tour kann aus grandiosen Eindrücken bestehen , aber es kann auch sein , dass Einzelheiten eine derart grosse Bedeutung erhalten , dass sie alles andere überstrahlen ; 20 | so zum Beispiel die Schmerzen , die eine Blase beim Laufen verursacht , der Tanz mit hohen Bergschuhen in einer kleinen , getäferten Wirtsstube im abgelegenen Maiensäss , die Wolkenbilder , die am Fuss des Piz Platta bei einem Wirbelsturm entstanden , oder die Laute der Schneehühner , die frühmorgens beim Wegmarsch von der Buinhütte zu vernehmen waren . 21 | Für mich wäre es nun sinnlos , die ganze Tour schriftlich nachzuvollziehen , vielleicht auch zu mühsam , die Erinnerungsbruchstücke aneinanderzureihen . 22 | Immer wieder frage ich mich - fragt sich wohl jeder - , weshalb man in aller Frühe aufsteht , bei klirrender Kälte schlotternd am Parkplatz bei der Post in St. 
Moritz Bad auf die Tourenkameraden wartet , einen langen , beschwerlichen Aufstieg auf sich nimmt und schwierige Passagen bei der Abfahrt meistert . 23 | Draussen in der Natur kann ich mehr Abstand zum Alltäglichen und vertieften Zugang zu mir selber gewinnen . 24 | Es ist nicht so , dass ■<©•■ . 25 | Die beiden Buin , Piz Buin Grond und Piz Buin Pitschen , im Silvrettagebiet ( Unterengadin ) Flechtengesellschaft im Sonnenlicht ich den Schwierigkeiten entgehen möchte , im Gegenteil . 26 | Die dabei gewonnene Distanz bietet mir die Möglichkeit , das , was mich beschäftigt , von einer anderen Seite anzupacken oder nach neuen Wegen zu suchen . 27 | Es kam schon vor , dass es mich in meiner periodisch auftretenden Isoliertheit grosse innere Überwindung kostete , an einer Sektionstour teilzunehmen , von der ich dann aber frohen Mutes und in meiner inneren Welt bestärkt zurückkehrte . 28 | Dabei Messen sich öfters gute Lebensgefühle , die in schwierigen Zeiten um so seltener werden , wieder erwecken . 29 | Trotzdem bieten die Berge dem mit Schwierigkeiten behafteten Menschen keinen Ausweg aus einem unerfüllten Leben . 30 | Jeder von uns verfügt über einen Lebensraum , der ihm mehr oder weniger vertraut ist . 31 | Dies ist sein Alltag , in dem es gilt , sich durchzuschlagen . 
32 | Man pflegt zu sagen , der Mensch sei ein ( Gewohnheitstier ) , doch stets wird die andere , neue Welt , die delta)] """ 66 | cdef long lx, ly 67 | cdef double delta 68 | lx, ly = sum(sx), sum(sy) 69 | m = (lx + ly * mean_xy) / 2 70 | try: 71 | delta = (lx - ly * mean_xy) / sqrt(m * variance_xy) 72 | except ZeroDivisionError: 73 | return float('-inf') 74 | return - 100 * (LOG2 + norm_logsf(abs(delta))) 75 | 76 | 77 | cdef calc_cost(int i, int j, x, y, m): 78 | costs = [] 79 | if i == j == 0: 80 | return (0,0,0) 81 | else: 82 | costs.append(min((m[i - di, j - dj][0] + length_cost(x[i - di:i], y[j - dj:j]) + bead_cost, di, dj) for (di, dj), bead_cost in bead_costs.items() if i - di >= 0 and j - dj >= 0)) 83 | return min(costs) 84 | 85 | 86 | cdef calc_cost_large(int i, int j, x, y, scorecutoff, m): 87 | costs = [] 88 | if i == j == 0: 89 | return (0,0,0) 90 | else: 91 | try: 92 | costs.append(min((m[i - di][j - dj][0] + length_cost(x[i - di:i], y[j - dj:j]) + bead_cost, di, dj) for (di, dj), bead_cost in bead_costs.items() if i - di >= 0 and j - dj >= 0)) 93 | except Exception as e: 94 | print(e) 95 | print(scorecutoff, i, j) 96 | return (scorecutoff, i, j) 97 | return min(costs) 98 | 99 | 100 | def _align(x, y, int longlength=1000, scorecutoff=None, my_basename='filename'): 101 | m = {} 102 | cdef int max_length 103 | cdef long highest_score 104 | cdef int i, j, currline 105 | cdef long c, di, dj 106 | 107 | max_length = max(len(x), len(y)) 108 | temp = tempfile.NamedTemporaryFile(prefix='gc_' + my_basename + '_', delete=False, mode='w') 109 | highest_score = 0 110 | 111 | if (max_length > longlength): 112 | for i in range(len(x)+1): 113 | m[i] = {} 114 | for j in range(len(y)+1): 115 | min_i_j = calc_cost_large(i, j, x, y, scorecutoff, m) 116 | m[i][j] = min_i_j 117 | 118 | for key, value in m[i].items(): 119 | if not isinf(value[0]): 120 | if scorecutoff is None: 121 | temp.write(str(key) + '|' + str(int(value[0])) + '|' + str(value[1]) + '|' + str(value[2]) + 
'\t') 122 | elif int(value[0]) < scorecutoff: 123 | temp.write(str(key) + '|' + str(int(value[0])) + '|' + str(value[1]) + '|' + str(value[2]) + '\t') 124 | temp.write('\n') 125 | if i - 3 >= 0: 126 | m[i - 3] = {} 127 | temp.close() 128 | 129 | if (max_length > longlength): 130 | i, j = len(x), len(y) 131 | try: 132 | with FileReadBackwards(temp.name) as fi: 133 | tempdict = {} 134 | currLine = i 135 | for line in fi: 136 | di = 0 137 | if currLine == i: 138 | currvalues = line.strip().split('\t') 139 | for cv in currvalues: 140 | v = cv.split('|') 141 | try: 142 | tempdict[int(v[0])] = (int(v[1]), int(v[2]), int(v[3])) 143 | except: #length cost was too large for all combinations 144 | tempdict[0] = (100000000, 1, 0) 145 | while di == 0: 146 | try: 147 | (c, di, dj) = tempdict[j] 148 | except: #Exception for the case of length cost having been cut off when generating the temp file 149 | (c, di, dj) = (100000000, 0, 1) 150 | if c > highest_score: 151 | highest_score = c 152 | if di == dj == 0: 153 | break 154 | yield (i - di, i), (j - dj, j), highest_score, len(x), len(y) 155 | i -= di 156 | j -= dj 157 | if di <= dj <= 0: 158 | break 159 | 160 | currLine -= 1 161 | except Exception as e: 162 | print(e) 163 | print('error in _align') 164 | sys.exit(1) 165 | print('finished fi') 166 | else: 167 | for i in range(len(x)+1): 168 | for j in range(len(y)+1): 169 | m[i, j] = calc_cost(i, j, x, y, m) 170 | 171 | while True: 172 | (c, di, dj) = m[i, j] 173 | if c > highest_score: 174 | highest_score = c 175 | if di == dj == 0: 176 | break 177 | yield (i-di, i), (j-dj, j), highest_score, len(x), len(y) 178 | i -= di 179 | j -= dj 180 | remove(temp.name) 181 | 182 | 183 | cdef char_length(sentence): 184 | """ Length of a sentence in characters """ 185 | return len(sentence.replace(' ', '')) 186 | 187 | 188 | def align(sx, sy, int longlength, scorecutoff, my_basename): 189 | """ Align two groups of sentences """ 190 | cx = list(map(char_length, sx)) 191 | cy = 
def gale_church(corpus_x, corpus_y, int longlength, scorecutoff):
    """Align two block-structured corpus files and return the alignments.

    Both files are split into blocks with ``read_blocks``; blocks are paired
    positionally with ``zip`` (surplus blocks of the longer file are ignored)
    and every pair is handed to ``align``.

    Args:
        corpus_x: Path of the source-side corpus file.
        corpus_y: Path of the target-side corpus file.
        longlength: Forwarded unchanged to ``align``.
        scorecutoff: Forwarded unchanged to ``align``.

    Returns:
        A tuple ``(alignments_out, highest_score, len_x, len_y)``.
        ``alignments_out`` is every alignment yielded by ``align`` followed by
        a newline, concatenated in order. The remaining three values are the
        ones yielded together with the *last* alignment; they are ``0`` when
        no alignment was produced at all.
    """
    alignment_lines = []
    # Defaults for the "nothing to align" case (empty file, no blocks):
    # without them the return statement would hit unbound locals.
    highest_score = 0
    len_x = 0
    len_y = 0
    with open(corpus_x) as fx, open(corpus_y) as fy:
        for block_x, block_y in zip(read_blocks(fx), read_blocks(fy)):
            for alignment, highest_score, len_x, len_y in align(
                    block_x, block_y, longlength, scorecutoff, basename(corpus_x)):
                alignment_lines.append(alignment + '\n')
    # Join once instead of growing a string with += inside the loop.
    return ''.join(alignment_lines), highest_score, len_x, len_y
@cython.boundscheck(False)
@cython.wraparound(False)
def get_highest_labse_anchor(start_source: int, start_target: int, anchor: list[str], anchor_source_list: list[str], anchor_target_list: list[str], anchor_source_list_lines: list[str], anchor_target_list_lines: list[str], src_emb_dict, trg_emb_dict):
    """Find the best-scoring sentence pair between a start point and an anchor.

    Scores are computed on the fly as the dot product of the target and source
    embeddings looked up in ``trg_emb_dict`` / ``src_emb_dict``; no
    precomputed score matrix is used.

    Args:
        start_source: Source index where the search span begins.
        start_target: Target index where the search span begins.
        anchor: Pair whose items give the end of the span. Each item is either
            a comma-separated string (first field is used) or already a number.
        anchor_source_list: Source sentences, used as embedding keys.
        anchor_target_list: Target sentences, used as embedding keys.
        anchor_source_list_lines: Per-sentence source line ids
            (comma-separated strings, first field is used).
        anchor_target_list_lines: Per-sentence target line ids
            (comma-separated strings, first field is used).
        src_emb_dict: Mapping from stripped source sentence to its embedding.
        trg_emb_dict: Mapping from stripped target sentence to its embedding.

    Returns:
        ``[source_line, target_line]`` (stripped strings) for the pair with
        the highest positive score, or ``None`` when no pair qualifies.
    """
    cdef int end_source, end_target, source_len, target_len, source_ctr, target_ctr
    cdef int min_target, max_target
    cdef float maximum_source_target_difference
    cdef double labse_score, highest_score
    cdef highest_anchor

    highest_score = 0.0
    highest_anchor = None
    # The anchor items may be "12,13"-style strings or plain numbers; only
    # the conversion failures are caught so that KeyboardInterrupt/SystemExit
    # are no longer swallowed.
    try:
        end_source = int(anchor[0].split(',')[0])
    except (AttributeError, TypeError, ValueError):
        end_source = anchor[0]
    try:
        end_target = int(anchor[1].split(',')[0])
    except (AttributeError, TypeError, ValueError):
        end_target = anchor[1]

    source_len = end_source - start_source
    target_len = end_target - start_target
    # Skip the first 20% of the span so that the resulting spans get larger.
    start_source = int(start_source + (0.2 * source_len))
    start_target = int(start_target + (0.2 * target_len))

    maximum_source_target_difference = max(int(1.15*(abs(source_len-target_len))), int(0.15*((target_len+source_len)/2)))

    for source_ctr in range(start_source, end_source-2):
        # This only handles lists in which every line is present and in order.
        if start_source + 2 < int(anchor_source_list_lines[source_ctr].strip().split(',')[0]) < end_source - 2:
            min_target = max(start_target, min(int(source_ctr-maximum_source_target_difference), end_target))
            max_target = min(end_target, max(start_target, int(source_ctr+maximum_source_target_difference)))
            for target_ctr in range(min_target+2, max_target-2):
                if min_target + 2 < int(anchor_target_list_lines[target_ctr].strip().split(',')[0]) < max_target - 2:
                    try:
                        labse_score = trg_emb_dict[anchor_target_list[target_ctr].strip()].dot(src_emb_dict[anchor_source_list[source_ctr].strip()].transpose())
                    except Exception:
                        # Best effort: a pair that cannot be scored (e.g. a
                        # missing embedding) simply never becomes the anchor.
                        labse_score = 0
                    if labse_score > highest_score:
                        highest_score = labse_score
                        highest_anchor = [str(anchor_source_list_lines[source_ctr]).strip(), str(anchor_target_list_lines[target_ctr]).strip()]
    return highest_anchor
def greedy_anchor_selection(int start_source, int start_target, anchor,
        anchor_source_list_lines, anchor_target_list_lines, float minimum_score, np.ndarray[double, ndim=2] labse_score_matrix):
    """Pick the best-scoring cell of the score matrix inside a span.

    Args:
        start_source: Source index where the span begins.
        start_target: Target index where the span begins.
        anchor: Pair whose items give the end of the span. Each item is either
            a comma-separated string (first field is used) or already a number.
        anchor_source_list_lines: Per-row source line ids (convertible with
            ``int``).
        anchor_target_list_lines: Per-column target line ids (convertible
            with ``int``).
        minimum_score: Only cells strictly above this score are considered.
        labse_score_matrix: 2-D score matrix indexed ``[source, target]``.

    Returns:
        ``[source_line, target_line]`` (stripped strings) for the highest
        scoring qualifying cell, or ``None`` when no cell qualifies.
    """
    cdef int end_source, end_target, source_len, target_len, i_ctr, j_ctr
    cdef int min_target, max_target
    cdef float maximum_source_target_difference
    cdef double labse_score, highest_score
    cdef highest_anchor

    highest_score = 0.0
    highest_anchor = None
    # The anchor items may be "12,13"-style strings or plain numbers.
    try:
        end_source = int(anchor[0].split(',')[0])
    except (AttributeError, TypeError, ValueError):
        end_source = anchor[0]
    try:
        end_target = int(anchor[1].split(',')[0])
    except (AttributeError, TypeError, ValueError):
        end_target = anchor[1]

    source_len = end_source - start_source
    target_len = end_target - start_target
    # Skip the first 20% of the span so that the resulting spans get larger.
    start_source = int(start_source + (0.2 * source_len))
    start_target = int(start_target + (0.2 * target_len))

    maximum_source_target_difference = max(int(1.15*(abs(source_len-target_len))), int(0.15*((target_len+source_len)/2)))

    rows, cols = np.where(labse_score_matrix[start_source:end_source, start_target:end_target] > minimum_score)
    # np.where reports positions relative to the sliced window; shift them
    # back so that they index the full matrix and the line lists. Without the
    # shift the threshold is tested on one cell and the score read from
    # another.
    for i_ctr, j_ctr in zip(rows + start_source, cols + start_target):
        if int(anchor_source_list_lines[i_ctr]) > start_source + 2 and int(anchor_source_list_lines[i_ctr]) < end_source - 2:
            min_target = max(start_target, min(int(i_ctr-maximum_source_target_difference), end_target))
            max_target = min(end_target, max(start_target, int(i_ctr+maximum_source_target_difference)))
            if int(anchor_target_list_lines[j_ctr]) > min_target + 2 and int(anchor_target_list_lines[j_ctr]) < max_target - 2:
                labse_score = labse_score_matrix[i_ctr, j_ctr]
                if labse_score > highest_score:
                    highest_score = labse_score
                    highest_anchor = [str(anchor_source_list_lines[i_ctr]).strip(), str(anchor_target_list_lines[j_ctr]).strip()]
    return highest_anchor


@cython.boundscheck(False)
@cython.wraparound(False)
def greedy_anchor_selection_large(double minimum_score, np.ndarray[double, ndim=2] labse_score_matrix):
    """Pick the best-scoring cell of a whole score matrix.

    Same selection rule as ``greedy_anchor_selection`` but applied to the
    complete matrix, returning matrix indices instead of line ids.

    Args:
        minimum_score: Only cells strictly above this score are considered.
        labse_score_matrix: 2-D score matrix indexed ``[source, target]``.

    Returns:
        ``[source_index, target_index]`` of the highest scoring qualifying
        cell, or ``None`` when no cell qualifies.
    """
    cdef int source_len, target_len, i_ctr, j_ctr
    cdef int start_source, start_target
    cdef int min_target, max_target
    cdef float maximum_source_target_difference
    cdef double labse_score, highest_score
    cdef highest_anchor

    highest_score = 0
    highest_anchor = None

    # shape[...] also works for a matrix without rows, where
    # labse_score_matrix[0] is not available.
    source_len = labse_score_matrix.shape[0]
    target_len = labse_score_matrix.shape[1]

    # Skip the first 20% in both directions.
    start_source = int(0.2 * source_len)
    start_target = int(0.2 * target_len)

    maximum_source_target_difference = max(int(1.15*(abs(source_len-target_len))), int(0.15*((target_len+source_len)/2)))

    rows, cols = np.where(labse_score_matrix[start_source:source_len-1, start_target:target_len-1] > minimum_score)
    # Shift the window-relative positions back to full-matrix indices.
    for i_ctr, j_ctr in zip(rows + start_source, cols + start_target):
        if i_ctr > start_source + 2 and i_ctr < source_len - 2:
            min_target = max(start_target, min(int(i_ctr-maximum_source_target_difference), target_len))
            max_target = min(target_len, max(start_target, int(i_ctr+maximum_source_target_difference)))
            if j_ctr > min_target + 2 and j_ctr < max_target - 2:
                labse_score = labse_score_matrix[i_ctr, j_ctr]
                if labse_score > highest_score:
                    highest_score = labse_score
                    highest_anchor = [i_ctr, j_ctr]
    return highest_anchor
class RepeatedTimer(object):
    """Call ``function(*args, **kwargs)`` every ``interval`` seconds.

    The timer starts itself on construction and keeps re-arming a
    ``threading.Timer`` until ``stop`` is called.
    """

    def __init__(self, interval, function, *args, **kwargs):
        """Store the callback and its arguments, then arm the first timer."""
        self._timer = None
        self.is_running = False
        self.interval = interval
        self.function = function
        self.args = args
        self.kwargs = kwargs
        self.start()

    def _run(self):
        """Timer callback: re-arm first, then invoke the user function."""
        # Re-arming before the call keeps the period independent of how long
        # the callback itself takes.
        self.is_running = False
        self.start()
        self.function(*self.args, **self.kwargs)

    def start(self):
        """Arm a new timer unless one is already pending."""
        if self.is_running:
            return
        self._timer = Timer(self.interval, self._run)
        self._timer.start()
        self.is_running = True

    def stop(self):
        """Cancel the pending timer and mark the instance as stopped."""
        self._timer.cancel()
        self.is_running = False
class ReportInfo:
    """Progress and timing record for the file currently being aligned."""

    def __init__(self, starttime, total_files):
        """Initialise all counters and timestamps.

        Args:
            starttime: Start time of the whole run, stored as given.
            total_files: Number of files to process; also the initial value
                of ``files_left``.
        """
        self.starttime = starttime
        self.file_start_time = datetime.datetime.now()
        self.align_start_time = datetime.datetime.now()
        # NOTE(review): the *_elapsed_* / *_estimated_* fields below start as
        # wall-clock datetimes but are later overwritten with durations
        # (see update_times); kept as-is for compatibility.
        self.align_estimated_time = datetime.datetime.now()
        self.input_file = ''
        self.total_files = total_files
        self.files_left = total_files
        self.current_path_knot = 1
        self.total_path_knots = 0
        self.totalestimatedtime = datetime.datetime.now()
        self.file_elapsed_time = datetime.datetime.now()
        self.align_elapsed_time = datetime.datetime.now()
        self.gale_church_elapsed_time = datetime.datetime.now()
        self.calc_anchors_elapsed_time = datetime.datetime.now()
        self.calc_elapsed_labse = datetime.datetime.now()
        self.greedy_algorithm_elapsed_time = datetime.datetime.now()
        self.file_processing_stage = 'Initializing '
        self.source_file_length = 0
        self.target_file_length = 0
        self.anchors = []

    def print_info(self):
        """Return a multi-line, newline-terminated summary of the current file."""
        fields = (
            ('File', self.input_file),
            ('File Elapsed Time', self.file_elapsed_time),
            ('Source file length', self.source_file_length),
            ('Target file length', self.target_file_length),
            ('Nodes', self.total_path_knots),
            ('Gale-Church Elapsed Time', self.gale_church_elapsed_time),
            ('Align Elapsed Time', self.align_elapsed_time),
            ('Calc Anchors Elapsed Time', self.calc_anchors_elapsed_time),
            ('Greedy Algorithm Elapsed Time', self.greedy_algorithm_elapsed_time),
            ('Anchors', self.anchors),
        )
        return ''.join(f'{label}: {value}\n' for label, value in fields)

    def init_file(self, input_file):
        """Register a new input file and restart the per-file clock."""
        self.input_file = input_file
        self.file_processing_stage = 'Initializing...'
        self.file_start_time = datetime.datetime.now()

    def set_file(self, source_file_length, target_file_length):
        """Record the lengths of the source and target files."""
        self.source_file_length = source_file_length
        self.target_file_length = target_file_length
        self.file_processing_stage = 'Initializing '

    def set_status(self, status):
        """Set the processing stage to ``status``."""
        self.file_processing_stage = status

    def set_aligning(self, total_path_knots):
        """Enter the aligning stage and count the file as started."""
        self.total_path_knots = total_path_knots
        self.align_start_time = datetime.datetime.now()
        self.file_processing_stage = 'Aligning '
        self.files_left = self.files_left - 1

    def set_anchoring(self):
        """Enter the anchoring stage."""
        self.file_processing_stage = 'Anchoring '

    def set_elapsed_gale_church(self, galechurch):
        """Store the time spent in the Gale-Church step."""
        self.gale_church_elapsed_time = galechurch

    def set_elapsed_calc_labse(self, calclabse):
        """Store the time spent calculating LaBSE scores."""
        self.calc_elapsed_labse = calclabse

    def set_elapsed_calc_anchors(self, calctime):
        """Store the time spent calculating anchors."""
        self.calc_anchors_elapsed_time = calctime

    def set_total_calculations(self, total_calculations):
        """Set the total number of path knots to be calculated."""
        self.total_path_knots = total_calculations

    def add_nodes(self, calculated_nodes):
        """Advance the progress counter by ``calculated_nodes``."""
        self.current_path_knot += calculated_nodes

    def set_elapsed_greedy(self, greedytime):
        """Store the time spent in the greedy algorithm."""
        self.greedy_algorithm_elapsed_time = greedytime

    def set_elapsed_align(self, aligntime):
        """Store the time spent aligning."""
        self.align_elapsed_time = aligntime

    def set_anchors(self, anchors):
        """Store the anchors found for the current file."""
        self.anchors = anchors

    def update_aligning(self, current_path_knot):
        """Set the progress counter to ``current_path_knot``."""
        self.current_path_knot = current_path_knot

    def update_times(self):
        """Refresh the elapsed times and the alignment time estimate."""
        now = datetime.datetime.now()
        self.file_elapsed_time = now - self.file_start_time
        self.align_elapsed_time = now - self.align_start_time
        # No progress yet: keep the previous estimate instead of dividing by
        # zero.
        if self.current_path_knot > 0:
            self.align_estimated_time = (
                self.align_elapsed_time * (self.total_path_knots / self.current_path_knot)
            ) + (self.align_start_time - self.file_start_time)
@cython.boundscheck(False)
@cython.wraparound(False)
def create_labse_score_matrix(anchor_source_list: list[str], anchor_target_list: list[str], src_emb_dict: dict, trg_emb_dict: dict):
    """Build the source-by-target score matrix from precomputed embeddings.

    Cell ``[s, t]`` holds the dot product of the target embedding with the
    transposed source embedding. Embeddings are looked up in ``trg_emb_dict``
    and ``src_emb_dict`` by the whitespace-stripped sentence text.

    Args:
        anchor_source_list: Source sentences; one matrix row per entry.
        anchor_target_list: Target sentences; one matrix column per entry.
        src_emb_dict: Maps stripped source sentences to their embeddings.
        trg_emb_dict: Maps stripped target sentences to their embeddings.

    Returns:
        A ``len(anchor_source_list) x len(anchor_target_list)`` float64 typed
        memoryview. A pair whose score cannot be computed (for example, a
        sentence missing from its dictionary) has the error printed and its
        cell set to 0.
    """
    cdef str i, j
    cdef int i_ctr, j_ctr
    cdef double[:,:] labse_score_matrix = np.zeros((len(anchor_source_list), len(anchor_target_list)), dtype=np.float64)

    for i_ctr, i in enumerate(anchor_source_list):
        # Iterate the target sentences themselves: range() on a list raises
        # TypeError, and the loop variable is used as a sentence below.
        for j_ctr, j in enumerate(anchor_target_list):
            try:
                labse_score_matrix[i_ctr, j_ctr] = trg_emb_dict[j.strip()].dot(src_emb_dict[i.strip()].transpose())
            except Exception as e:
                # Best effort: report the failing pair and keep a neutral score.
                print(e)
                labse_score_matrix[i_ctr, j_ctr] = 0
    return labse_score_matrix
@cython.boundscheck(False)
@cython.wraparound(False)
def loc_start_end_matrices(anchor_lines: list[str]):
    """Split comma-separated anchor lines into parallel start and end lists.

    Each line is split on ``','``; the first field is converted with ``int``
    and appended to the start list, the second to the end list. Any further
    fields are ignored.

    Returns:
        A ``(start_list, end_list)`` tuple of integer lists, in input order.
    """
    cdef list starts = []
    cdef list ends = []
    for anchor_line in anchor_lines:
        fields = anchor_line.split(',')
        starts.append(int(fields[0]))
        ends.append(int(fields[1]))
    return starts, ends