├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── .zenodo.json ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASING.md ├── pip-requirements.txt ├── setup.cfg ├── setup.py ├── src └── lingrex │ ├── __init__.py │ ├── align.py │ ├── borrowing.py │ ├── cognates.py │ ├── colex.py │ ├── copar.py │ ├── evaluate.py │ ├── fuzzy.py │ ├── reconstruct.py │ ├── regularity.py │ ├── trimming.py │ └── util.py ├── tests ├── conftest.py ├── data │ ├── east-polynesian.tsv │ ├── hillburmish.tsv │ └── wordlist.tsv ├── test_align.py ├── test_borrowing.py ├── test_cognates.py ├── test_colex.py ├── test_copar.py ├── test_evaluate.py ├── test_fuzzy.py ├── test_reconstruct.py ├── test_regularity.py ├── test_trimming.py ├── test_util.py ├── test_workflows.py └── workflows │ ├── bodt-2019 │ ├── bodt-khobwa-cleaned.tsv │ ├── predict.py │ ├── results │ │ └── README.md │ └── test-prediction.py │ ├── list-2019 │ ├── data │ │ ├── burmish-240-8.tsv │ │ ├── chinese-203-19.tsv │ │ ├── chinese-623-14.tsv │ │ ├── east-polynesian.tsv │ │ ├── japanese-200-10.tsv │ │ └── polynesian-210-10.tsv │ ├── general.py │ ├── predict.py │ └── results │ │ ├── burmish-240-8-75.txt │ │ ├── burmish-240-8-individual-75.tsv │ │ ├── chinese-623-14-75.txt │ │ ├── chinese-623-14-individual-75.tsv │ │ ├── japanese-200-10-75.txt │ │ ├── japanese-200-10-individual-75.tsv │ │ ├── out-burmish.tsv │ │ ├── out-chinese.tsv │ │ ├── out-japanese.tsv │ │ ├── out-polynesian.tsv │ │ ├── polynesian-210-10-75.txt │ │ └── polynesian-210-10-individual-75.tsv │ └── wu-2020 │ ├── 4_crosssemantic.py │ ├── 5_correspondence.py │ ├── D_Chen_aligned.tsv │ ├── D_Chen_all_patterns.tsv │ ├── D_Chen_crossids.tsv │ └── D_Chen_patterns.tsv └── tox.ini /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: [3.8, 3.9, "3.10"] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install .[test] 27 | - name: Test with pytest 28 | run: | 29 | pytest 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .info 6 | tests/bak/* 7 | dev/ 8 | # C extensions 9 | *.so 10 | *.swp 11 | .idea 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # MacOS 107 | *.DS_Store 108 | -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "LingRex. Linguistic Reconstruction with LingPy", 3 | "creators": [ 4 | { 5 | "name": "Johann-Mattis List" 6 | }, 7 | { 8 | "name": "Robert Forkel" 9 | } 10 | ], 11 | "access_right": "open", 12 | "keywords": [ 13 | "linguistics" 14 | ], 15 | "license": { 16 | "id": "CC-BY-4.0" 17 | }, 18 | "upload_type": "software", 19 | "communities": [ 20 | { 21 | "identifier": "digling" 22 | }, 23 | { 24 | "identifier": "calc" 25 | } 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 LingPy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE 2 | graft src 3 | global-exclude *.py[co] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LingRex: Linguistic Reconstruction with LingPy 2 | 3 | [![Build Status](https://github.com/lingpy/lingrex/workflows/tests/badge.svg)](https://github.com/lingpy/lingrex/actions?query=workflow%3Atests) 4 | [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.1544943.svg)](https://doi.org/10.5281/zenodo.1544943) 5 | [![PyPI version](https://badge.fury.io/py/lingrex.png)](https://badge.fury.io/py/lingrex) 6 | 7 | LingRex offers the code needed for the automatic inference of sound correspondence patterns as described in the following paper: 8 | 9 | > List, J.-M. (2019): Automatic inference of sound correspondence patterns across multiple languages. Computational Linguistics 45.1. 137-161. [DOI: 10.1162/coli_a_00344](https://doi.org/10.1162/coli_a_00344) 10 | 11 | To test this workflow, please check the workflow code example in [`tests/workflows/list-2019`](tests/workflows/list-2019). 12 | 13 | LingRex also offers the code for a baseline algorithm for automatic word prediction, that is, for supervised automatic phonological reconstruction, described in the following paper: 14 | 15 | > List, J.-M., R. Forkel, and N. W. Hill (2022): A New Framework for Fast Automated Phonological Reconstruction Using Trimmed Alignments and Sound Correspondence Patterns. Proceedings of the 3rd International Workshop on Computational Approaches to Historical Language Change (LChange 2022). Dublin, Ireland. https://aclanthology.org/2022.lchange-1.9 16 | 17 | This algorithm is also used as a baseline for a Shared Task on the Prediction of Cognate Reflexes (https://sigtyp.github.io/st2022.html), organized as part of the SIGTYP Workshop at NAACL 2022. 18 | 19 | > List, J.-M., E. Vylomova, R. Forkel, N. Hill, and R. Cotterell (2022): The SIGTYP shared task on the prediction of cognate reflexes. In: Proceedings of the 4th Workshop on Computational Typology and Multilingual NLP. Association for Computational Linguistics. 52-62. https://aclanthology.org/2022.sigtyp-1.7 20 | 21 | Methods for the handling of partial cognates were introduced in a study by Wu and List (2023): 22 | 23 | > Wu, M.-S. and J.-M. List (2023): Annotating cognates in phylogenetic studies of South-East Asian languages. Language Dynamics and Change. https://doi.org/10.1163/22105832-bja10023 24 | 25 | Methods for the trimming of phonetic alignments were introduced in a study by Blum and List (2023): 26 | 27 | > Blum, F. and J.-M. List (2023): Trimming phonetic alignments improves the inference of sound correspondence patterns from multilingual wordlists. In: Proceedings of the 5th Workshop on Computational Typology and Multilingual NLP. Association for Computational Linguistics. 52-64. https://aclanthology.org/2023.sigtyp-1.6.pdf 28 | 29 | Methods for the handling and creation of fuzzy / uncertain phonological reconstructions were introduced in a study by List et al. (2023): 30 | 31 | > List, J.-M., N. W. Hill, F. Blum, and R. Forkel (2023): A New Framework for the Representation and Computation of Uncertainty in Phonological Reconstruction. Proceedings of the 4th Workshop on Computational Approaches to Historical Language Change. 22-32.
https://aclanthology.org/2023.lchange-1.3 32 | 33 | When using this package in your research, please make sure to cite the respective papers, depending on the algorithms you use, and cite the software package as follows: 34 | 35 | > List, J.-M. and R. Forkel (2023): LingRex: Linguistic Reconstruction with LingPy. [Computer software, Version 1.4.0]. With contributions by Frederic Blum and Mei-Shin Wu. Leipzig: Max Planck Institute for Evolutionary Anthropology. https://pypi.org/project/lingrex 36 | 37 | Since this software package itself makes use of LingPy's alignment algorithms, you should also cite the LingPy package itself: 38 | 39 | > List, J.-M. and R. Forkel (2023): LingPy. A Python library for quantitative tasks in historical linguistics. Version 2.6.10. Max Planck Institute for Evolutionary Anthropology: Leipzig. https://lingpy.org 40 | 41 | ## Installation 42 | 43 | Install the package via `pip`: 44 | 45 | ```shell 46 | pip install lingrex 47 | ``` 48 | 49 | ## Further Examples 50 | 51 | The borrowing detection algorithm implemented in LingRex is introduced in the 52 | following paper: 53 | 54 | > List, J.-M. and R. Forkel (2021): Automated identification of borrowings in multilingual wordlists [version 1; peer review: 3 approved, 1 approved with reservations]. Open Research Europe 1.79. 1-11. [DOI: 10.12688/openreseurope.13843.1](https://doi.org/10.12688/openreseurope.13843.1) 55 | 56 | If you use this algorithm, please cite LingRex and this paper. 57 | 58 | In addition to the paper in which the correspondence pattern inference algorithm was first introduced, LingRex also offers the code to compute the workflow described in the following paper: 59 | 60 | > Wu, M.-S., N. Schweikhard, T. Bodt, N. Hill, and J.-M. List (2020): Computer-Assisted Language Comparison. State of the Art. Journal of Open Humanities Data 6.2. 1-14. [DOI: 10.5334/johd.12](https://doi.org/10.5334/johd.12) 61 | 62 | To test this workflow, please check the workflow code example in `tests/workflows/wu-2020`. 63 | 64 | If you use this workflow in your work, please cite this paper as well. 65 | 66 | In addition, our experiment (with T. Bodt) on predicting words with the help of sound correspondence patterns also made use of the LingRex package: 67 | 68 | > Bodt, T. and J.-M. List (2021): Reflex prediction. A case study of Western Kho-Bwa. Diachronica 0.0. 1-38. [DOI: 10.1075/dia.20009.bod](https://doi.org/10.1075/dia.20009.bod) 69 | 70 | To test this workflow, please check the workflow code example in `tests/workflows/bodt-2019`.
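
## Usage Example

The core of the package is the `CoPaR` class (`src/lingrex/copar.py`), which implements the correspondence pattern inference described in List (2019). The following minimal sketch outlines the basic workflow; the file name, the column names, and the keyword values are placeholders that have to be adapted to your own data:

```python
from lingrex.copar import CoPaR

# Hypothetical input: a TSV wordlist with segmented forms ("tokens"),
# a prosodic "structure" column, an "alignment" column, and partial
# cognate sets in "cogids"; additional keywords such as fuzzy=True
# are passed on to LingPy's Alignments class.
cop = CoPaR("wordlist.tsv", ref="cogids", structure="structure", minrefs=2, fuzzy=True)

cop.get_sites()         # collect alignment sites with enough reflexes
cop.cluster_sites()     # group compatible sites into correspondence patterns
cop.sites_to_pattern()  # assign each site to the patterns it is compatible with
cop.add_patterns()      # add pattern identifiers to the wordlist
cop.output("tsv", filename="wordlist-with-patterns")
```

The resulting wordlist contains an additional `patterns` column that links every position of an alignment to the correspondence pattern it has been assigned to.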
71 | -------------------------------------------------------------------------------- /RELEASING.md: -------------------------------------------------------------------------------- 1 | 2 | # Releasing lingrex 3 | 4 | - Do platform tests via tox: 5 | ```shell 6 | tox -r 7 | ``` 8 | - Test if the workflow scripts still work: 9 | ```shell 10 | pytest -m"workflow" 11 | ``` 12 | 13 | - Make sure statement coverage >= 99% 14 | - Use black and flake8 to keep the code style consistent: 15 | ```shell 16 | flake8 src 17 | black src/lingrex/*.py 18 | ``` 19 | 20 | - Update the version number by removing the trailing `.dev0` in: 21 | - `setup.cfg` 22 | - `src/lingrex/__init__.py` 23 | 24 | - Check metadata in `.zenodo.json` 25 | 26 | - Create the release commit: 27 | ```shell 28 | git commit -a -m "release <version>" 29 | ``` 30 | 31 | - Create a release tag: 32 | ```shell 33 | git tag -a v<version> -m "<version> release" 34 | ``` 35 | 36 | - Release to PyPI: 37 | ```shell 38 | rm dist/* 39 | python setup.py sdist bdist_wheel 40 | twine upload dist/* 41 | ``` 42 | 43 | - Push to GitHub: 44 | ```shell 45 | git push origin 46 | git push --tags 47 | ``` 48 | 49 | - Change the version for the next release cycle, i.e. increment it and add the trailing `.dev0` again in: 50 | - `setup.cfg` 51 | - `src/lingrex/__init__.py` 52 | 53 | - Commit/push the version change: 54 | ```shell 55 | git commit -a -m "bump version for development" 56 | git push origin 57 | ``` 58 | -------------------------------------------------------------------------------- /pip-requirements.txt: -------------------------------------------------------------------------------- 1 | lingpy >= 2.6.8 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = lingrex 3 | version = 1.4.3.dev0 4 | author = Johann-Mattis List 5 | author_email = mattis.list@uni-passau.de 6 | description = Linguistic Reconstruction with LingPy 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | keywords = 10 | linguistics 11 | computational linguistics 12 | linguistic reconstruction 13 | cognate detection 14 | license = MIT 15 | license_files = LICENSE 16 | url = https://pypi.org/project/lingrex 17 | platforms = any 18 | classifiers = 19 | Development Status :: 5 - Production/Stable 20 | Intended Audience :: Developers 21 | Intended Audience :: Science/Research 22 | Natural Language :: English 23 | Operating System :: OS Independent 24 | Programming Language :: Python :: 3 25 | Programming Language :: Python :: 3.8 26 | Programming Language :: Python :: 3.9 27 | Programming Language :: Python :: 3.10 28 | Programming Language :: Python :: 3.11 29 | Programming Language :: Python :: 3.12 30 | Programming Language :: Python :: Implementation :: CPython 31 | Programming Language :: Python :: Implementation :: PyPy 32 | License :: OSI Approved :: MIT License 33 | 34 | [options] 35 | zip_safe = False 36 | packages = find: 37 | package_dir = 38 | = src 39 | python_requires = >=3.8 40 | install_requires = 41 | lingpy>=2.6.13 42 | include_package_data = True 43 | 44 | [options.packages.find] 45 | where = src 46 | 47 | [options.package_data] 48 | 49 | [options.entry_points] 50 | 51 | [options.extras_require] 52 | dev = 53 | build 54 | wheel 55 | twine 56 | tox 57 | black 58 | flake8 59 | 60 | test = 61 | pytest 62 | pytest-cov 63 | pytest-mock 64 | coverage 65 | 66 | 67 | 68 | [bdist_wheel] 69 | universal =
1 70 | 71 | [flake8] 72 | ignore = E711,E712,D100,D101,D103,D102,D301,E731 73 | max-line-length = 100 74 | exclude = .tox,cython 75 | 76 | [tool:pytest] 77 | minversion = 5 78 | testpaths = tests 79 | addopts = --cov 80 | 81 | [easy_install] 82 | zip_ok = false 83 | 84 | [coverage:run] 85 | source = 86 | lingrex 87 | tests 88 | 89 | [coverage:report] 90 | show_missing = true 91 | skip_covered = true 92 | 93 | 94 | [tox:tox] 95 | envlist = py38, py39, py310, py311 96 | isolated_build = true 97 | skip_missing_interpreter = true 98 | 99 | [testenv] 100 | deps = .[test] 101 | commands = pytest {posargs} 102 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup for LingRex 3 | """ 4 | from setuptools import setup 5 | setup() 6 | 7 | -------------------------------------------------------------------------------- /src/lingrex/__init__.py: -------------------------------------------------------------------------------- 1 | from lingrex.copar import CoPaR, density 2 | 3 | assert CoPaR and density 4 | __version__ = "1.4.3.dev0" 5 | -------------------------------------------------------------------------------- /src/lingrex/align.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various phonetic alignment functions. 3 | """ 4 | from lingpy import basictypes as bt 5 | 6 | 7 | def gap_free_pairwise(seqA, seqB, syllables=None, gap="-"): 8 | """ 9 | Carry out a gap-free alignment in which segments are merged instead of gapped. 10 | """ 11 | syllables = [] if syllables is None else syllables 12 | start = True 13 | merge = False 14 | outA, outB = [], [] 15 | for i, (charA, charB) in enumerate(zip(seqA, seqB)): 16 | if i in syllables: 17 | start = True 18 | if start and charB == gap: 19 | outA.append(charA + ">") 20 | merge = True 21 | elif not merge and charB == gap: 22 | outA[-1] += "<" + charA 23 | elif merge: 24 | if charB == gap: 25 | outA[-1] += charA + ">" 26 | else: 27 | outA[-1] += charA 28 | outB.append(charB) 29 | merge = False 30 | else: 31 | outA.append(charA) 32 | outB.append(charB) 33 | start = False 34 | return outA, outB 35 | 36 | 37 | def align_to_template(sequence, structures, template, gap="-"): 38 | """ 39 | Align a sequence to a template. 40 | """ 41 | if (len(sequence) != len(structures)) or (len(template) < len(sequence)): 42 | raise ValueError( 43 | "sequence {0} and structure {1} have different length".format( 44 | repr(sequence), repr(structures) 45 | ) 46 | ) 47 | if len([x for x in structures if x not in template]) != 0: 48 | raise ValueError( 49 | "{0} items in the structure {1} is not in the template".format( 50 | len([x for x in structures if x not in template]), repr(structures) 51 | ) 52 | ) 53 | 54 | out = [] 55 | idxA, idxB = 0, 0 56 | while idxB < len(template): 57 | if idxA < len(sequence): 58 | segment, structure = sequence[idxA], structures[idxA] 59 | else: 60 | segment, structure = gap, "" 61 | current_structure = template[idxB] 62 | if current_structure == structure: 63 | out.append(segment) 64 | idxA += 1 65 | else: 66 | out.append(gap) 67 | idxB += 1 68 | 69 | return out 70 | 71 | 72 | def shrink_alignments(alignments): 73 | """ 74 | Remove columns from alignment which all consist of gaps. 
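
Example
-------
A minimal, hypothetical case: the second column consists only of gaps
and is therefore removed.

>>> shrink_alignments([["a", "-", "b"], ["c", "-", "-"]])
[['a', 'b'], ['c', '-']]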
75 | """ 76 | excludes = [] 77 | for i in range(len(alignments[0])): 78 | col = set([line[i] for line in alignments]) 79 | if "-" in col and len(col) == 1: 80 | excludes.append(i) 81 | return [ 82 | [site for i, site in enumerate(alignment) if i not in excludes] 83 | for alignment in alignments 84 | ] 85 | 86 | 87 | def shrink(tokens, structures, converter): 88 | """ 89 | Shrink tokens according to the converter. 90 | 91 | .. note:: Works only for shrinking two structure elements so far. 92 | """ 93 | outt, outs = [], [] 94 | sm, merge = None, False 95 | for i in range(len(tokens)): 96 | if i > 0: 97 | sm = " ".join([structures[i - 1], structures[i]]) 98 | if sm in converter: 99 | outt += [tokens[i - 1] + tokens[i]] 100 | outs += [converter[sm]] 101 | merge = True 102 | elif not merge: 103 | outt += [tokens[i - 1]] 104 | outs += [converter.get(structures[i - 1], structures[i - 1])] 105 | else: 106 | merge = False 107 | if sm not in converter: 108 | outt += [tokens[i]] 109 | outs += [converter.get(structures[i], structures[i])] 110 | return outt, outs 111 | 112 | 113 | def shrink_template( 114 | wordlist, 115 | structure="structure", 116 | segments="tokens", 117 | converter={"i m": "I", "i": "I", "n c": "R", "n": "R", "c": "R"}, 118 | new_structure="structure2", 119 | new_tokens="tokens2", 120 | override=False, 121 | ): 122 | """ 123 | Reduce a template by merging certain parts of the structure. 124 | """ 125 | D = {} 126 | for idx, strucs, tokens in wordlist.iter_rows(structure, segments): 127 | D[idx] = shrink(tokens, strucs, converter) 128 | wordlist.add_entries(new_structure, D, lambda x: bt.lists(x[1]), override=override) 129 | wordlist.add_entries(new_tokens, D, lambda x: bt.lists(x[0]), override=override) 130 | 131 | 132 | def template_alignment( 133 | wordlist, 134 | ref="cogid", 135 | template="CCCCVVccccT_CCCCVVccccT_CCCCVVccccT_CCCCVVccccT_CCCCvvT", 136 | structure="structure", 137 | fuzzy=False, 138 | segments="tokens", 139 | gap="-", 140 | alignment="alignment", 141 | override=True, 142 | ): 143 | """ 144 | Function aligns the cognate sets in a wordlist to a template. 145 | 146 | Note 147 | ---- 148 | This function was first introduced in Wu et al. (2020). 149 | 150 | > Wu, M.-S., N. Schweikhard, T. Bodt, N. Hill, and J.-M. List (2020): 151 | > Computer-Assisted Language Comparison. State of the Art. Journal of Open 152 | > Humanities Data 6.2. 1-14. 
DOI: https://doi.org/10.5334/johd.12 153 | """ 154 | 155 | for idx, tokens, structures in wordlist.iter_rows(segments, structure): 156 | wordlist[idx, segments], wordlist[idx, structure] = bt.lists(tokens), bt.lists( 157 | structures 158 | ) 159 | 160 | etd = wordlist.get_etymdict(ref) 161 | A = {} 162 | if not fuzzy: 163 | for cogid, vals in etd.items(): 164 | idxs = [] 165 | for val in vals: 166 | if val: 167 | idxs += val 168 | alignments = shrink_alignments( 169 | [ 170 | align_to_template( 171 | wordlist[idx, segments], 172 | wordlist[idx, structure], 173 | template, 174 | gap=gap, 175 | ) 176 | for idx in idxs 177 | ] 178 | ) 179 | for idx, alm in zip(idxs, alignments): 180 | A[idx] = alm 181 | if fuzzy: 182 | cogid2alm = {} 183 | # only align the first item 184 | for cogid, vals in etd.items(): 185 | idxs, alms, strucs = [], [], [] 186 | for val in vals: 187 | if val: 188 | idxs += val 189 | alms += [ 190 | wordlist[idx, segments].n[wordlist[idx, ref].index(cogid)] 191 | for idx in val 192 | ] 193 | strucs += [ 194 | wordlist[idx, structure].n[wordlist[idx, ref].index(cogid)] 195 | for idx in val 196 | ] 197 | alignments = shrink_alignments( 198 | [ 199 | align_to_template(alm, struc, template, gap=gap) 200 | for alm, struc in zip(alms, strucs) 201 | ] 202 | ) 203 | for idx, alm in zip(idxs, alignments): 204 | cogid2alm[cogid, idx] = " ".join(alm) 205 | # second iteration, add the alignments per cogid 206 | for idx, cogids in wordlist.iter_rows(ref): 207 | A[idx] = bt.lists( 208 | " + ".join([cogid2alm.get((cogid, idx), "?") for cogid in cogids]) 209 | ) 210 | wordlist.add_entries(alignment, A, lambda x: x, override=override) 211 | -------------------------------------------------------------------------------- /src/lingrex/borrowing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic code for borrowing detection. 3 | """ 4 | import itertools 5 | import collections 6 | 7 | from lingpy import Pairwise 8 | from lingpy.compare.partial import Partial 9 | from lingpy.compare.lexstat import LexStat 10 | 11 | import networkx as nx 12 | 13 | from lingpy.util import pb 14 | 15 | 16 | def internal_cognates( 17 | wordlist, 18 | family="family", 19 | partial=True, 20 | method="lexstat", 21 | runs=10000, 22 | threshold=0.50, 23 | smooth=1, 24 | ratio=(2, 1), 25 | vscale=0.5, 26 | restricted_chars="_", 27 | modes=[("global", -1, 0.5), ("overlap", -1, 0.5)], 28 | ref="autocogids", 29 | cluster_method="upgma", 30 | model="sca", 31 | ): 32 | """ 33 | Cluster the data into cognate sets, but only inside each family. 34 | 35 | :param family: name of the column in which language family information can 36 | be found (defaults="family") 37 | 38 | Note 39 | ---- 40 | This method was first introduced by List and Forkel (2022). 41 | 42 | > List, J.-M. and R. Forkel (2022): Automated identification of borrowings 43 | > in multilingual wordlists [version 3; peer review: 4 approved]. Open 44 | > Research Europe 1.79. 1-11. 
DOI: https://doi.org/10.12688/openreseurope.13843.3 45 | """ 46 | families = {wordlist[k, family] for k in wordlist} 47 | 48 | # split data into parts 49 | D = {k: {} for k in sorted(families)} 50 | for idx, fam in wordlist.iter_rows(family): 51 | D[fam][idx] = [cell for cell in wordlist[idx]] 52 | 53 | gcogid = 0 54 | G = {} 55 | for fam, data in D.items(): 56 | data[0] = [h for h in wordlist.columns] 57 | if partial: 58 | lex = Partial(data, model=model) 59 | if method == "lexstat": 60 | lex.get_partial_scorer( 61 | runs=runs, 62 | smooth=smooth, 63 | ratio=ratio, 64 | vscale=vscale, 65 | restricted_chars=restricted_chars, 66 | modes=modes, 67 | ) 68 | lex.partial_cluster( 69 | ref=ref, 70 | method=method, 71 | cluster_method=cluster_method, 72 | threshold=threshold, 73 | ) 74 | else: 75 | lex = LexStat(data, model=model) 76 | if method == "lexstat": 77 | lex.get_scorer( 78 | runs=runs, 79 | smooth=smooth, 80 | ratio=ratio, 81 | vscale=vscale, 82 | restricted_chars=restricted_chars, 83 | modes=modes, 84 | ) 85 | lex.cluster( 86 | ref=ref, 87 | method=method, 88 | cluster_method=cluster_method, 89 | threshold=threshold, 90 | ) 91 | 92 | # prepare global cognate indicies 93 | if partial: 94 | C = {idx: len(lex[idx, ref]) * [0] for idx in lex} 95 | etd = lex.get_etymdict(ref=ref) 96 | for cogid, idxs in etd.items(): 97 | for idx_ in idxs: 98 | if idx_: 99 | for idx in idx_: 100 | cogids = lex[idx, ref] 101 | C[idx][cogids.index(cogid)] = cogid + gcogid 102 | else: 103 | C = {idx: 0 for idx in lex} 104 | etd = lex.get_etymdict(ref=ref) 105 | for cogid, idxs in etd.items(): 106 | for idx_ in idxs: 107 | if idx_: 108 | for idx in idx_: 109 | C[idx] = cogid + gcogid 110 | for idx in lex: 111 | G[idx] = C[idx] 112 | gcogid += max(etd) + 1 113 | 114 | renumber = {} 115 | cogid = 1 116 | if partial: 117 | for idx, vals in G.items(): 118 | f = wordlist[idx, family] 119 | new_cogids = [] 120 | for v in vals: 121 | if (f, v) in renumber: 122 | new_cogids += [renumber[f, v]] 123 | else: 124 | renumber[f, v] = cogid 125 | new_cogids += [cogid] 126 | cogid += 1 127 | G[idx] = new_cogids 128 | else: 129 | for idx, val in G.items(): 130 | f = wordlist[idx, family] 131 | if (f, val) not in renumber: 132 | renumber[f, val] = cogid 133 | cogid += 1 134 | G[idx] = renumber[f, val] 135 | 136 | wordlist.add_entries(ref, G, lambda x: x) 137 | 138 | 139 | def external_cognates( 140 | wordlist, 141 | cognates="autocogid", 142 | ref="autoborid", 143 | threshold=0.3, 144 | segments="tokens", 145 | gop=-1, 146 | family="family", 147 | doculect="doculect", 148 | concept="concept", 149 | align_mode="overlap", 150 | ): 151 | """ 152 | Compute language-external cognates and assign them to cognate sets. 153 | 154 | :param cognates: The column which holds previously calculated cognates. 155 | :param ref: The column which will store the new borrowing identifiers. 156 | :param family: The column storing family information. 157 | :param doculect: The column storing doculect information. 158 | 159 | Note 160 | ---- 161 | This method was first introduced by List and Forkel (2022). 162 | 163 | > List, J.-M. and R. Forkel (2022): Automated identification of borrowings 164 | > in multilingual wordlists [version 3; peer review: 4 approved]. Open 165 | > Research Europe 1.79. 1-11. 
DOI: https://doi.org/10.12688/openreseurope.13843.3 166 | """ 167 | 168 | B = {} 169 | borid = 1 170 | # iterate over the concepts 171 | for concept in pb(wordlist.rows): 172 | idxs = wordlist.get_list(row=concept, flat=True) 173 | for idx in idxs: 174 | B[idx] = 0 175 | taxa = [wordlist[idx, doculect] for idx in idxs] 176 | famis = [wordlist[idx, family] for idx in idxs] 177 | if len(set(famis)) > 1: 178 | G = nx.Graph() 179 | tokens = [wordlist[idx, segments] for idx in idxs] 180 | cogids = [wordlist[idx, cognates] for idx in idxs] 181 | 182 | # assemble cogids to groups 183 | groups = collections.defaultdict(list) 184 | for i, d, t, c in zip(idxs, taxa, tokens, cogids): 185 | groups[c] += [(i, d, t)] 186 | 187 | for group, items in groups.items(): 188 | G.add_node( 189 | str(group), 190 | concept=concept, 191 | taxa=", ".join([t[1] for t in items]), 192 | idxs=", ".join([str(t[0]) for t in items]), 193 | family=wordlist[[t[0] for t in items][0], family], 194 | ) 195 | 196 | # compare groups 197 | for (gA, iA), (gB, iB) in itertools.combinations(list(groups.items()), r=2): 198 | if G.nodes[str(gA)]["family"] != G.nodes[str(gB)]["family"]: 199 | wpairs = [ 200 | (" ".join(a[2]), " ".join(b[2])) 201 | for a, b in itertools.product(iA, iB) 202 | ] 203 | 204 | pairs = Pairwise(wpairs) 205 | pairs.align(distance=True, gop=gop, mode=align_mode) 206 | dst = [] 207 | for i, p in enumerate(pairs._alignments): 208 | dst += [p[2]] 209 | 210 | dst = sum(dst) / len(dst) 211 | if dst <= threshold: 212 | G.add_edge(str(gA), str(gB), distance=dst) 213 | 214 | # components 215 | for i, comp in enumerate(nx.connected_components(G)): 216 | if len(comp) > 1: 217 | table = [] 218 | for cogid in comp: 219 | idxs = [int(x) for x in G.nodes[cogid]["idxs"].split(", ")] 220 | for idx in idxs: 221 | table += [ 222 | [ 223 | wordlist[idx, doculect], 224 | wordlist[idx, concept], 225 | str(wordlist[idx, segments]), 226 | wordlist[idx, family], 227 | cogid, 228 | ] 229 | ] 230 | for idx in idxs: 231 | B[idx] = borid 232 | borid += 1 233 | wordlist.add_entries(ref, B, lambda x: x) 234 | -------------------------------------------------------------------------------- /src/lingrex/cognates.py: -------------------------------------------------------------------------------- 1 | """ 2 | Operations with cognate sets. 3 | """ 4 | import collections 5 | 6 | from clldutils.text import strip_brackets, split_text 7 | import lingpy 8 | 9 | 10 | def common_morpheme_cognates( 11 | wordlist, cognates="cogids", ref="autoid", morphemes="automorphemes", override=True 12 | ): 13 | """ 14 | Convert partial cognates to full cognates. 15 | 16 | Note 17 | ---- 18 | This method was first introduced by Wu and List (to appear). 19 | 20 | > Wu, Mei-Shin and List, Johann-Mattis (to appear): Annotating cognates in 21 | > phylogenetic studies of South-East Asian languages. Language Dynamics and 22 | > Change. 
Preprint: https://doi.org/10.17613/rabq-7z45 23 | """ 24 | 25 | C, M = {}, {} 26 | current = 1 27 | for concept in wordlist.rows: 28 | base = split_text(strip_brackets(concept))[0].upper().replace(" ", "_") 29 | idxs = wordlist.get_list(row=concept, flat=True) 30 | cogids = collections.defaultdict(list) 31 | for idx in idxs: 32 | M[idx] = [c for c in wordlist[idx, cognates]] 33 | for cogid in lingpy.basictypes.ints(wordlist[idx, cognates]): 34 | cogids[cogid] += [idx] 35 | for i, (cogid, idxs) in enumerate( 36 | sorted(cogids.items(), key=lambda x: len(x[1]), reverse=True) 37 | ): 38 | for idx in idxs: 39 | if idx not in C: 40 | C[idx] = current 41 | M[idx][M[idx].index(cogid)] = base 42 | else: 43 | M[idx][M[idx].index(cogid)] = "_" + base.lower() 44 | current += 1 45 | wordlist.add_entries(ref, C, lambda x: x) 46 | if morphemes: 47 | wordlist.add_entries(morphemes, M, lambda x: x, override=override) 48 | 49 | 50 | def salient_cognates( 51 | wordlist, cognates="cogids", ref="newcogid", morphemes="morphemes", override=True 52 | ): 53 | """ 54 | Convert partial cognates to full cognates ignoring non-salient cognate sets. 55 | 56 | Note 57 | ---- 58 | This method was first introduced by Wu and List (to appear). 59 | 60 | > Wu, Mei-Shin and List, Johann-Mattis (to appear): Annotating cognates in 61 | > phylogenetic studies of South-East Asian languages. Language Dynamics and 62 | > Change. Preprint: https://doi.org/10.17613/rabq-7z45 63 | """ 64 | 65 | lookup, D = {}, {} 66 | for idx, cogids, morphemes in wordlist.iter_rows(cognates, morphemes): 67 | selected_cogids = [] 68 | for cogid, morpheme in zip(cogids, morphemes): 69 | if not morpheme.startswith("_"): 70 | selected_cogids += [cogid] 71 | salient = tuple(selected_cogids) 72 | if salient in lookup: 73 | D[idx] = lookup[salient] 74 | elif D.values(): 75 | next_cogid = max(D.values()) + 1 76 | lookup[salient] = next_cogid 77 | D[idx] = next_cogid 78 | else: 79 | lookup[salient] = 1 80 | D[idx] = 1 81 | 82 | wordlist.add_entries(ref, D, lambda x: x, override=override) 83 | -------------------------------------------------------------------------------- /src/lingrex/colex.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for partial colexification manipulations. 3 | """ 4 | import collections 5 | 6 | 7 | def find_bad_internal_alignments(alignments, ref="cogids"): 8 | """ 9 | Helper function discards wrongly assigned cross-semantic cognates. 10 | 11 | Note 12 | ---- 13 | The function essentially iterates over the alignments and picks 14 | out those in which the same language has the same cognate ID, and if 15 | the alignment itself differs, it assigns it a new cognate ID. It 16 | presupposes that the data has not been analyzed in search for 17 | cross-semantic cognates. 
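
Example
-------
A rough usage sketch (assuming an ``Alignments`` object ``alms`` whose
partial cognate sets are stored in the column ``cogids``)::

    find_bad_internal_alignments(alms, ref="cogids")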
18 | """ 19 | newIDs = {} 20 | 21 | def get_all_indices(lst): 22 | idxs = collections.defaultdict(list) 23 | for i, l in enumerate(lst): 24 | idxs[l] += [i] 25 | return idxs 26 | 27 | new_cogid = max(alignments.msa[ref]) + 1 28 | for cogid, msa in alignments.msa[ref].items(): 29 | idxs = [i for t, i in get_all_indices(msa["taxa"]).items() if len(i) > 1] 30 | for idx in idxs: 31 | tups = [tuple(msa["alignment"][x]) for x in idx] 32 | if len(set(tups)) > 1: 33 | bestt = sorted(tups, key=lambda x: tups.count(x), reverse=True)[0] 34 | for x in idx: 35 | if tuple(msa["alignment"][x]) != bestt: 36 | newIDs[msa["ID"][x]] = (cogid, new_cogid) 37 | new_cogid += 1 38 | 39 | for idx, (cogid, new_cogid) in newIDs.items(): 40 | this_idx = alignments[idx, ref].index(cogid) 41 | alignments[idx, ref][this_idx] = new_cogid 42 | 43 | 44 | def expand_alignment(msa, taxa, missing="Ø"): 45 | """ 46 | Expand an alignment by adding a symbol for missing taxa. 47 | """ 48 | out = [] 49 | for taxon in taxa: 50 | if taxon in msa["taxa"]: 51 | tidx = msa["taxa"].index(taxon) 52 | out.append( 53 | [x.split("/")[1] if "/" in x else x for x in msa["alignment"][tidx]] 54 | ) 55 | else: 56 | out.append(len(msa["alignment"][0]) * [missing]) 57 | return out 58 | 59 | 60 | def compatible(msa1, msa2, missing="Ø", gap="-"): 61 | """ 62 | Compare two alignments and check whether they colexify. 63 | """ 64 | matches = 0 65 | for line1, line2 in zip(msa1, msa2): 66 | if [x for x in line1 if x != gap] == [ 67 | x for x in line2 if x != gap 68 | ] and missing not in line1 + line2: 69 | matches += 1 70 | else: 71 | if list(set(line1))[0] != missing and list(set(line2))[0] != missing: 72 | return False 73 | return matches 74 | 75 | 76 | def merge_alignments(almA, almB, missing="Ø", gap="-"): 77 | """ 78 | Merge two alignments which are compatible. 
79 | """ 80 | out = [] 81 | missing_taxa = [] 82 | for k, (a, b) in enumerate(zip(almA, almB)): 83 | if ( 84 | len(set(a)) == 1 85 | and list(set(a))[0] == missing # noqa: W503 86 | and len(set(b)) == 1 # noqa: W503 87 | and list(set(b))[0] == missing # noqa: W503 88 | ): 89 | missing_taxa += [k] 90 | i, j = 0, 0 91 | while i < len(almA[0]) and j < len(almB[0]): 92 | colA, colB = [row[i] for row in almA], [row[j] for row in almB] 93 | if colA == colB: 94 | out += [colA] 95 | i += 1 96 | j += 1 97 | else: 98 | col = [] 99 | for a, b in zip(colA, colB): 100 | if a == gap and a != b and b != missing: 101 | ncol = [] 102 | for k, c in enumerate(colA): 103 | if c == missing and k not in missing_taxa: 104 | ncol += [gap] 105 | else: 106 | ncol += [c] 107 | out += [ncol] 108 | i += 1 109 | col = [] 110 | break 111 | if b == gap and a != b and a != missing: 112 | ncol = [] 113 | for k, c in enumerate(colB): 114 | if c == missing and k not in missing_taxa: 115 | ncol += [gap] 116 | else: 117 | ncol += [c] 118 | out += [ncol] 119 | j += 1 120 | col = [] 121 | break 122 | 123 | col.append(b if a == missing else a) 124 | if col: 125 | out += [col] 126 | i += 1 127 | j += 1 128 | if i < len(almA[0]): 129 | ncol = [] 130 | for k, c in enumerate([row[i] for row in almA]): 131 | if c == missing and k not in missing_taxa: 132 | ncol += [gap] 133 | else: 134 | ncol += [c] 135 | out += [ncol] 136 | elif j < len(almB[0]): 137 | ncol = [] 138 | for k, c in enumerate([row[j] for row in almB]): 139 | if c == missing and k not in missing_taxa: 140 | ncol += [gap] 141 | else: 142 | ncol += [c] 143 | out += [ncol] 144 | 145 | nalm = [] 146 | for i in range(len(out[0])): 147 | nalm += [[row[i] for row in out]] 148 | return nalm 149 | 150 | 151 | def find_colexified_alignments( 152 | alignments, cognates="cogids", missing="Ø", ref="crossids" 153 | ): 154 | """ 155 | Identify identical alignments in a dataset and label them as homophones. 156 | 157 | Note 158 | ---- 159 | This function was first introduced in Wu et al. (2020). 160 | 161 | > Wu, M.-S., N. Schweikhard, T. Bodt, N. Hill, and J.-M. List (2020): 162 | > Computer-Assisted Language Comparison. State of the Art. Journal of Open 163 | > Humanities Data 6.2. 1-14. 
DOI: https://doi.org/10.5334/johd.12 164 | """ 165 | 166 | queue = [] 167 | for cogid, msa in sorted( 168 | alignments.msa[cognates].items(), 169 | key=lambda x: len(set(x[1]["taxa"])), 170 | reverse=True, 171 | ): 172 | queue += [(cogid, expand_alignment(msa, alignments.taxa, missing=missing))] 173 | 174 | merged = {} 175 | 176 | while queue: 177 | this_cogid, this_msa = queue.pop(0) 178 | deletes = [] 179 | merged[this_cogid] = this_cogid 180 | for i, (other_cogid, other_msa) in enumerate(queue): 181 | if compatible(this_msa, other_msa) >= 1: 182 | this_msa = merge_alignments(this_msa, other_msa) 183 | merged[other_cogid] = this_cogid 184 | deletes += [i] 185 | 186 | for i in deletes[::-1]: 187 | del queue[i] 188 | 189 | # assemble the clusters now 190 | if alignments._mode == "fuzzy": 191 | alignments.add_entries(ref, cognates, lambda x: [merged.get(y, y) for y in x]) 192 | else: 193 | alignments.add_entries(ref, cognates, lambda x: merged.get(x, x)) 194 | -------------------------------------------------------------------------------- /src/lingrex/copar.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pathlib 3 | import itertools 4 | import collections 5 | 6 | from lingpy.sequence.sound_classes import class2tokens 7 | from lingpy.settings import rc 8 | from lingpy.align.sca import get_consensus, Alignments 9 | from lingpy.util import pb 10 | from lingpy import log 11 | from lingpy import basictypes as bt 12 | 13 | import networkx as nx 14 | 15 | 16 | def consensus_pattern(patterns, missing="Ø"): 17 | """ 18 | Return consensus pattern of multiple patterns. 19 | 20 | :param patterns: list of patterns 21 | :param missing: the character used to represent missing values 22 | 23 | .. note:: This consensus method raises an error if the patterns contain incompatible 24 | columns (non-identical values apart from the missing data character in the same 25 | column). 26 | """ 27 | out = [] 28 | for i in range(len(patterns[0])): 29 | col = [line[i] for line in patterns] 30 | no_gaps = [x for x in col if x != missing] 31 | if len(set(no_gaps)) > 1: 32 | raise ValueError("Your patterns are incompatible") 33 | out += [no_gaps[0] if no_gaps else missing] 34 | return tuple(out) 35 | 36 | 37 | def incompatible_columns(patterns, missing="Ø"): 38 | """ 39 | Compute whether a pattern has incompatible columns. 40 | """ 41 | columns = [] 42 | for i in range(len(patterns[0])): 43 | col = [ 44 | patterns[j][i] for j in range(len(patterns)) if patterns[j][i] != missing 45 | ] 46 | columns.append("*" if len(set(col)) > 1 else "") 47 | return columns 48 | 49 | 50 | def score_patterns(patterns, missing="Ø", mode="coverage"): 51 | """ 52 | Function gives a score for the overall number of reflexes. 53 | 54 | .. note:: This score tells simply to which degree a pattern is filled. It divides the 55 | number of cells not containing missing data by the number of cells in the 56 | matrix. 
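
Example
-------
Two small, hypothetical patterns: incompatible columns yield -1, and
otherwise the score reflects how densely the pattern matrix is filled.

>>> score_patterns([("t", "a"), ("t", "b")])
-1
>>> round(score_patterns([("t", "t", "Ø"), ("t", "Ø", "t")], mode="coverage"), 2)
0.67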
57 | """ 58 | # return -1 if the patterns are not compatible 59 | for i in range(len(patterns[0])): 60 | if len(set([row[i] for row in patterns if row[i] != missing])) > 1: 61 | return -1 62 | if len(patterns) <= 1: 63 | return -1 64 | 65 | if mode not in ["ranked", "pairs", "squared", "coverage"]: 66 | raise ValueError("you must select an appropriate mode") 67 | 68 | # we rank the columns by sorting them first 69 | if mode == "ranked": 70 | cols = [] 71 | for i in range(len(patterns[0])): 72 | cols += [sum([0 if row[i] == missing else 1 for row in patterns])] 73 | # sort the columns 74 | ranks, cols = list(range(1, len(cols) + 1))[::-1], sorted(cols, reverse=True) 75 | scores = [] 76 | for rank, col in zip(ranks, cols): 77 | scores += [rank * col] 78 | return sum(scores) / sum(ranks) / len(patterns) 79 | 80 | if mode == "squared": 81 | psize = len(patterns[0]) 82 | scores = [((psize - row.count(missing)) / psize) ** 2 for row in patterns] 83 | return sum(scores) / len(scores) 84 | 85 | if mode == "pairs": 86 | # count the number of pairs in the data 87 | pairs = 0 88 | covered = 0 89 | m, n = len(patterns[0]), len(patterns) 90 | for i in range(n): 91 | vals = m - patterns[i].count(missing) 92 | pairs += (vals**2 - vals) / 2 93 | for i in range(m): 94 | vals = n - [p[i] for p in patterns].count(missing) 95 | pairs += (vals**2 - vals) / 2 96 | if vals != 0: 97 | covered += 1 98 | return ((pairs / n) / covered) / m 99 | 100 | if mode == "coverage": 101 | cols = [] 102 | for i in range(len(patterns[0])): 103 | col = [row[i] for row in patterns] 104 | cols += [len(patterns) - col.count(missing)] 105 | return (sum(cols) / len(patterns[0])) / len(patterns) # * len(patterns[0])) 106 | 107 | 108 | def compatible_columns(colA, colB, missing="Ø", gap="-"): 109 | """Check for column compatibility. 110 | 111 | Parameters 112 | ---------- 113 | colA, colB = list 114 | Lists (sequence type) containing a given pattern. 115 | missing : str (default="Ø") 116 | A gap in the sense of "missing data", that is, a cognate set for which 117 | a value in a given language is absent. 118 | 119 | Returns 120 | ------- 121 | matches, mismatches : tuple 122 | The score for matches gives zero if there is no conflict but also no 123 | match. For mismatches it is accordingly. So if you seek for 124 | compatibility, a mismatch greater 0 means the patterns are not 125 | compatible. 126 | """ 127 | matches, mismatches = 0, 0 128 | for a, b in zip(colA, colB): 129 | if missing not in [a, b]: 130 | if a != b: 131 | mismatches += 1 132 | else: 133 | if a != gap: 134 | matches += 1 135 | return matches, mismatches 136 | 137 | 138 | def density(wordlist, ref="cogid"): 139 | """Compute the density of a wordlist. 140 | 141 | Note 142 | ---- 143 | We define the density of a wordlist by measuring how many words can be 144 | explained by the same cognate set. 145 | """ 146 | scores = [] 147 | for concept in wordlist.rows: 148 | idxs = wordlist.get_list(row=concept, flat=True) 149 | cogids = [wordlist[idx, ref] for idx in idxs] 150 | sums = [1 / cogids.count(cogid) for idx, cogid in zip(idxs, cogids)] 151 | scores.append(sum(sums) / len(sums)) 152 | return 1 - sum(scores) / len(scores) 153 | 154 | 155 | class CoPaR(Alignments): 156 | """Correspondence Pattern Recognition class 157 | 158 | Parameters 159 | ---------- 160 | wordlist : ~lingpy.basic.wordlist.Wordlist 161 | A wordlist object which should have a column for segments and a column 162 | for cognate sets. 
Since the class inherits from LingPy's 163 | Alignments-class, the same kind of data should be submitted. 164 | ref : str (default="cogid") 165 | The column which stores the cognate sets. 166 | segments : str (default="tokens") 167 | The column which stores the segmented transcriptions. 168 | alignment : str (default="alignment") 169 | The column which stores the alignments (or will store the alignments if 170 | they have not yet been computed). 171 | 172 | Note 173 | ---- 174 | This method was first introduced in List (2019). 175 | 176 | > List, J.-M. (2019): Automatic inference of sound correspondence patterns 177 | > across multiple languages. Computational Linguistics 45.1. 137-161. DOI: 178 | > http://doi.org/10.1162/coli_a_00344 179 | """ 180 | 181 | def __init__( 182 | self, 183 | wordlist, 184 | minrefs=3, 185 | ref="cogids", 186 | structure="structure", 187 | missing="Ø", 188 | gap="-", 189 | irregular="!?", 190 | **keywords 191 | ): 192 | Alignments.__init__(self, wordlist, ref=ref, **keywords) 193 | self.ref = ref 194 | self._structure = structure 195 | self.minrefs = minrefs 196 | self.missing = missing 197 | self.gap = gap 198 | self.irregular = irregular 199 | if structure not in self.columns: 200 | raise ValueError("no column {0} for structure was found".format(structure)) 201 | 202 | def positions_from_prostrings(self, cogid, indices, alignment, structures): 203 | """ 204 | Return positions matching from an alignment and user-defined prosodic strings 205 | """ 206 | if self._mode == "fuzzy": 207 | strucs = [] 208 | for idx, struc, alm in zip(indices, structures, alignment): 209 | pos_ = self[idx, self._ref].index(cogid) 210 | strucs += [class2tokens(struc.n[pos_], alm)] 211 | else: 212 | strucs = [ 213 | class2tokens(struc, alm) for struc, alm in zip(structures, alignment) 214 | ] 215 | get_consensus(alignment, gaps=True) 216 | prostring = [] 217 | for i in range(len(strucs[0])): 218 | row = [x[i] for x in strucs if x[i] != "-"] 219 | prostring += [row[0] if row else "+"] 220 | return [(i, p) for i, p in enumerate(prostring)] 221 | 222 | def reflexes_from_pos( 223 | self, position, taxa, current_taxa, alignment, missing, irregular 224 | ): 225 | reflexes = [] 226 | for t in taxa: 227 | if t not in current_taxa: 228 | reflexes += [missing] 229 | else: 230 | reflex = alignment[current_taxa.index(t)][position] 231 | if "/" in reflex: 232 | reflex = reflex.split("/")[1] 233 | elif reflex[0] in irregular: 234 | reflex = missing 235 | reflexes += [reflex] 236 | return reflexes 237 | 238 | def _check(self): 239 | """ 240 | Check for problematic patterns in the data. 241 | """ 242 | errors = [] 243 | for idx, struc, alm in self.iter_rows(self._structure, self._alignment): 244 | self[idx, self._structure] = self._str_type(struc) 245 | self[idx, self._alignment] = self._str_type(alm) 246 | if not len(self[idx, self._structure]) == len( 247 | [x for x in self[idx, self._alignment] if x != "-"] 248 | ): 249 | print( 250 | idx, 251 | self[idx, self._structure], 252 | "|", 253 | self[idx, self._alignment], 254 | "|", 255 | self[idx, "tokens"], 256 | ) 257 | log.warning("alignment and structure do not match in {0}".format(idx)) 258 | errors += [idx] 259 | return errors 260 | 261 | def get_sites(self): 262 | """ 263 | Retrieve the alignment sites of interest for initial analysis. 
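
Example
-------
A rough usage sketch, assuming a ``CoPaR`` instance ``cop`` set up with
aligned data::

    cop.get_sites()
    # cop.sites now maps (cogid, position) to the structure symbol and
    # the tuple of reflexes at that alignment site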
264 | """ 265 | sites, all_sites, taxa = ( 266 | collections.OrderedDict(), 267 | collections.OrderedDict(), 268 | self.cols, 269 | ) 270 | errors = self._check() 271 | if errors: 272 | raise ValueError("found {0} problems in the data".format(len(errors))) 273 | 274 | # iterate over all sites in the alignment 275 | visited = [] 276 | for cogid, msa in pb( 277 | sorted(self.msa[self.ref].items()), 278 | desc="CoPaR: get_patterns()", 279 | total=len(self.msa[self.ref]), 280 | ): 281 | # get essential data: taxa, alignment, etc. 282 | _taxa = [t for t in taxa if t in msa["taxa"]] 283 | _idxs = {t: msa["taxa"].index(t) for t in _taxa} 284 | _alms = [msa["alignment"][_idxs[t]] for t in _taxa] 285 | _wlid = [msa["ID"][_idxs[t]] for t in _taxa] 286 | 287 | # store visited entries 288 | visited += msa["ID"] 289 | if len(_taxa) >= self.minrefs: 290 | if self._mode == "fuzzy": 291 | _strucs = [] 292 | for _widx in _wlid: 293 | _these_strucs = self[_widx, self._structure] 294 | _strucs += [_these_strucs] 295 | else: 296 | _strucs = [self[idx, self._structure] for idx in _wlid] 297 | positions = self.positions_from_prostrings(cogid, _wlid, _alms, _strucs) 298 | for pidx, pos in positions: 299 | reflexes = self.reflexes_from_pos( 300 | pidx, taxa, _taxa, _alms, self.missing, self.irregular 301 | ) 302 | sites[cogid, pidx] = [pos, tuple(reflexes)] 303 | for pidx in range(len(_alms[0])): 304 | reflexes = self.reflexes_from_pos( 305 | pidx, taxa, _taxa, _alms, self.missing, self.irregular 306 | ) 307 | all_sites[cogid, pidx] = reflexes 308 | 309 | # add non-visited segments 310 | for idx in [i for i in self if i not in visited]: 311 | if self._mode == "fuzzy": 312 | for tt, ss, cogid in zip( 313 | self[idx, self._segments].n, 314 | self[idx, self._structure].n, 315 | self[idx, self._ref], 316 | ): 317 | for i, (t, s) in enumerate(zip(tt, ss)): 318 | all_sites[cogid, i] = [ 319 | self.missing if tax != self[idx][self._colIdx] else t 320 | for tax in self.cols 321 | ] 322 | else: 323 | for i, (t, s) in enumerate( 324 | zip(self[idx, self._segments], self[idx, self._structure]) 325 | ): 326 | all_sites[self[idx, self.ref], i] = [ 327 | self.missing if tax != self[idx][self._colIdx] else t 328 | for tax in self.cols 329 | ] 330 | 331 | self.sites = sites 332 | self.all_sites = all_sites 333 | 334 | def cluster_sites(self, match_threshold=1, score_mode="pairs"): 335 | """Cluster alignment sites using greedy clique cover. 336 | :param match_threshold: The threshold of matches for accepting two 337 | compatible columns. 338 | :param score_mode: select between "pairs", "coverage" 339 | 340 | .. note:: This algorithm follows the spirit of the Welsh-Powell algorithm for 341 | graph coloring. Since graph coloring is the inverse of clique 342 | partitioning, we can use the algorithm in the same spirit. 
343 | 344 | """ 345 | if not hasattr(self, "clusters"): 346 | self.clusters = collections.defaultdict(list) 347 | for (cogid, idx), (pos, ptn) in self.sites.items(): 348 | self.clusters[pos, ptn] += [(cogid, idx)] 349 | clusters = self.clusters 350 | while True: 351 | prog = 0 352 | with pb( 353 | desc="CoPaR: cluster_sites()", total=len(self.clusters) 354 | ) as progress: 355 | sorted_clusters = sorted( 356 | clusters.items(), 357 | key=lambda x: ( 358 | score_patterns( 359 | [self.sites[y][1] for y in x[1]], mode=score_mode 360 | ), 361 | len(x[1]), 362 | ), 363 | reverse=True, 364 | ) 365 | out = [] 366 | while sorted_clusters: 367 | ((this_pos, this_cluster), these_vals), remaining_clusters = ( 368 | sorted_clusters[0], 369 | sorted_clusters[1:], 370 | ) 371 | queue = [] 372 | for (next_pos, next_cluster), next_vals in remaining_clusters: 373 | match, mism = compatible_columns( 374 | this_cluster, 375 | next_cluster, 376 | missing=self.missing, 377 | gap=self.gap, 378 | ) 379 | if ( 380 | this_pos == next_pos 381 | and match >= match_threshold # noqa: W503 382 | and mism == 0 # noqa: W503 383 | ): 384 | this_cluster = consensus_pattern( 385 | [this_cluster, next_cluster] 386 | ) 387 | these_vals += next_vals 388 | else: 389 | queue += [((next_pos, next_cluster), next_vals)] 390 | sorted_clusters = queue 391 | out += [((this_pos, this_cluster), these_vals)] 392 | progress.update(len(self.sites) - len(queue) - prog) 393 | prog = len(self.sites) - len(queue) 394 | clusters = {tuple(a): b for a, b in out} 395 | alls = [c for c in clusters] 396 | match = 0 397 | for i, (_a, a) in enumerate(alls): 398 | for j, (_b, b) in enumerate(alls): 399 | if i < j and _a == _b: 400 | ma, mi = compatible_columns( 401 | a, b, missing=self.missing, gap=self.gap 402 | ) 403 | if ma and not mi: 404 | match += 1 405 | if not match: 406 | break 407 | else: 408 | log.warning( 409 | "iterating, since {0} clusters can further be merged".format( 410 | match 411 | ) 412 | ) 413 | self.clusters = clusters 414 | self.ordered_clusters = sorted(clusters, key=lambda x: len(x[1])) 415 | 416 | def sites_to_pattern(self, threshold=1): 417 | """Algorithm assigns alignment sites to patterns. 418 | 419 | Notes 420 | ----- 421 | We rank according to general compatibility. 422 | """ 423 | asites = collections.defaultdict(list) 424 | for consensus in pb( 425 | self.clusters, desc="CoPaR: sites_to_pattern()", total=len(self.clusters) 426 | ): 427 | sites = self.clusters[consensus] 428 | for cog, pos in sites: 429 | struc, pattern = self.sites[cog, pos] 430 | for strucB, consensusB in self.clusters: 431 | ma, mi = compatible_columns(pattern, consensusB) 432 | if struc == strucB and not mi and ma >= threshold: 433 | asites[cog, pos] += [(ma, struc, consensusB)] 434 | self.patterns = asites 435 | 436 | def fuzziness(self): 437 | return sum([len(b) for a, b in self.patterns.items()]) / len(self.patterns) 438 | 439 | def irregular_patterns(self, accepted=2, matches=1, irregular_prefix="!"): 440 | """ 441 | Try to assign irregular patterns to accepted patterns. 442 | 443 | Parameters 444 | ---------- 445 | accepted : int (default=2) 446 | Minimal size of clusters that we regard as regular. 
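
Example
-------
A usage sketch, assuming sites have been clustered on a ``CoPaR``
instance ``cop`` before::

    cop.irregular_patterns(accepted=2)
    # cop.ipatterns now maps regular patterns to the singleton patterns
    # that could be attached to them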
447 | 448 | """ 449 | bad_clusters = [ 450 | (clr, pts[0]) for clr, pts in self.clusters.items() if len(pts) == 1 451 | ] 452 | good_clusters = sorted( 453 | [(clr, pts) for clr, pts in self.clusters.items() if len(pts) >= accepted], 454 | key=lambda x: len(x[1]), 455 | reverse=True, 456 | ) 457 | new_clusters = {clr: [] for clr, pts in good_clusters} 458 | irregular_patterns = [] 459 | for clr, ptn in bad_clusters: 460 | if ptn.count(self.missing) <= 2: 461 | for clrB, pts in good_clusters: 462 | match, mism = compatible_columns(clr[1], clrB[1]) 463 | if mism <= matches and match > matches: 464 | new_clusters[clrB] += [clr] 465 | irregular_patterns += [clr] 466 | break 467 | # re-assign alignments to the data by adding the irregular character 468 | for key, value in sorted( 469 | new_clusters.items(), key=lambda x: len(x[1]), reverse=True 470 | ): 471 | if len(value) > 0: 472 | for i, pattern in enumerate(value): 473 | pt = [] 474 | for lid, (a, b) in enumerate(zip(key[1], pattern[1])): 475 | if a != b and self.missing not in [a, b]: 476 | pt += [irregular_prefix + b] 477 | # assign pattern to the corresponding alignments 478 | for cogid, position in self.clusters[pattern]: 479 | if self._mode == "fuzzy": 480 | word_indices = self.etd[self.ref][cogid][lid] 481 | if word_indices: 482 | for widx in word_indices: 483 | # get the position in the alignment 484 | alms = self[widx, self._alignment].n 485 | cog_pos = self[widx, self.ref].index(cogid) 486 | new_alm = alms[cog_pos] 487 | new_alm[position] = "{0}{1}/{2}".format( 488 | irregular_prefix, b, a 489 | ) 490 | alms[cog_pos] = new_alm 491 | self[ 492 | widx, self._alignment 493 | ] = self._str_type( 494 | " + ".join( 495 | [" ".join(x) for x in alms] 496 | ).split() 497 | ) 498 | else: 499 | word_indices = self.etd[self.ref][cogid][lid] 500 | if word_indices: 501 | for widx in word_indices: 502 | alm = self._str_type( 503 | self[widx, self._alignment] 504 | ) 505 | alm[position] = "{0}{1}/{2}".format( 506 | irregular_prefix, b, a 507 | ) 508 | self[ 509 | widx, self._alignment 510 | ] = self._str_type(" ".join(alm)) 511 | else: 512 | pt += [b] 513 | 514 | self.ipatterns = new_clusters 515 | for pattern, data in [ 516 | (a, b) for a, b in bad_clusters if a not in irregular_patterns 517 | ]: 518 | cogid, position = data 519 | if self._mode == "fuzzy": 520 | for indices in [idx for idx in self.etd[self.ref][cogid] if idx]: 521 | for widx in indices: 522 | cog_pos = self[widx, self.ref].index(cogid) 523 | alms = self[widx, self._alignment].n 524 | new_alm = alms[cog_pos] 525 | new_alm[position] = "{0}{1}".format( 526 | irregular_prefix, new_alm[position] 527 | ) 528 | alms[cog_pos] = new_alm 529 | self[widx, self._alignment] = self._str_type( 530 | " + ".join([" ".join(x) for x in alms]).split() 531 | ) 532 | 533 | return new_clusters 534 | 535 | def load_patterns(self, patterns="patterns"): 536 | self.id2ptn = collections.OrderedDict() 537 | self.clusters = collections.OrderedDict() 538 | self.id2pos = collections.defaultdict(set) 539 | self.sites = collections.OrderedDict() 540 | # get the template 541 | template = [self.missing for m in self.cols] 542 | tidx = {self.cols[i]: i for i in range(self.width)} 543 | for idx, ptn, alm, struc, doc, cogs in self.iter_rows( 544 | patterns, self._alignment, self._structure, "doculect", self._ref 545 | ): 546 | if self._mode == "fuzzy": 547 | ptn = bt.lists(ptn) 548 | for i in range(len(alm.n)): 549 | for j, (p, a) in enumerate(zip(ptn.n[i], alm.n[i])): 550 | if not p == "0/n": 551 | this_pattern = 
self.id2ptn.get(p, [t for t in template]) 552 | if this_pattern[tidx[doc]] == "Ø": 553 | this_pattern[tidx[doc]] = a 554 | self.id2ptn[p] = this_pattern 555 | self.id2pos[p].add((cogs[i], j)) 556 | else: 557 | for j, (p, a) in enumerate(zip(ptn, alm)): 558 | if not p == "0/n": 559 | this_pattern = self.id2ptn.get(p, [t for t in template]) 560 | if this_pattern[tidx[doc]] == "Ø": 561 | this_pattern[tidx[doc]] = a 562 | self.id2ptn[p] = this_pattern 563 | self.id2pos[p].add((cogs, j)) 564 | 565 | self.ptn2id = {tuple(v): k for k, v in self.id2ptn.items()} 566 | for k, v in self.id2ptn.items(): 567 | self.clusters[tuple(v)] = list(self.id2pos[k]) 568 | self.id2pos[k] = list(self.id2pos[k]) 569 | for s in self.id2pos[k]: 570 | self.sites[s] = [(len(self.id2pos[k]), tuple(v))] 571 | 572 | def add_patterns( 573 | self, ref="patterns", irregular_patterns=False, proto=False, override=True 574 | ): 575 | """Assign patterns to a new column in the word list.""" 576 | if not hasattr(self, "id2ptn"): 577 | self.id2ptn = {} 578 | if not hasattr(self, "pattern2id"): 579 | self.ptn2id = {} 580 | if proto: 581 | pidx = self.cols.index(proto) 582 | else: 583 | pidx = 0 584 | 585 | if irregular_patterns: 586 | new_clusters = collections.defaultdict(list) 587 | for reg, iregs in self.ipatterns.items(): 588 | for cogid, position in self.clusters[reg]: 589 | new_clusters[reg] += [(cogid, position)] 590 | for ireg in iregs: 591 | for cogid, position in self.clusters[ireg]: 592 | new_clusters[reg] += [(cogid, position)] 593 | else: 594 | new_clusters = self.clusters 595 | for pattern, rest in self.clusters.items(): 596 | for cogid, position in rest: 597 | if (cogid, position) not in new_clusters[pattern]: 598 | new_clusters[pattern] += [(cogid, position)] 599 | 600 | P = { 601 | idx: bt.lists( 602 | [ 603 | "0" if x not in rc("morpheme_separators") else "+" 604 | for x in self[idx, self._alignment] 605 | ] 606 | ) 607 | for idx in self 608 | } 609 | for i, ((struc, pattern), data) in enumerate( 610 | sorted(new_clusters.items(), key=lambda x: len(x), reverse=True) 611 | ): 612 | pattern_id = "{0}".format( 613 | i + 1 #, len(self.clusters[struc, pattern]), pattern[pidx] 614 | ) 615 | self.id2ptn[pattern_id] = pattern 616 | self.ptn2id[pattern] = pattern_id 617 | for cogid, position in data: 618 | word_indices = [c for c in self.etd[self.ref][cogid] if c] 619 | for idxs in word_indices: 620 | for idx in idxs: 621 | if self._mode == "fuzzy": 622 | pattern_position = self[idx, self.ref].index(cogid) 623 | this_pattern = P[idx].n[pattern_position] 624 | try: 625 | this_pattern[position] = pattern_id 626 | P[idx].change(pattern_position, this_pattern) 627 | except: # noqa: E722 628 | log.warning("error in {0}".format(cogid)) 629 | 630 | else: 631 | P[idx][position] = pattern_id 632 | self.add_entries(ref, P, lambda x: x, override=override) 633 | 634 | def write_patterns(self, filename, proto=False, irregular_patterns=False): 635 | if proto: 636 | pidx = self.cols.index(proto) 637 | else: 638 | pidx = 0 639 | 640 | if not hasattr(self, "id2ptn"): 641 | raise ValueError("You should run CoPaR.add_patterns first!") 642 | 643 | if irregular_patterns: 644 | new_clusters = collections.defaultdict(list) 645 | for (pos, reg), iregs in self.ipatterns.items(): 646 | for cogid, position in self.clusters[pos, reg]: 647 | new_clusters[pos, reg] += [(cogid, position)] 648 | for _, ireg in iregs: 649 | ireg_ = list(ireg) 650 | print(ireg_) 651 | for i, (a, b) in enumerate(zip(reg, ireg)): 652 | print(i, a, b) 653 | if a != b and b != 
self.missing: 654 | ireg_[i] = a + "/" + b 655 | ireg_ = tuple(ireg_) 656 | self.ptn2id[ireg_] = self.ptn2id[reg] 657 | for cogid, position in self.clusters[pos, ireg]: 658 | new_clusters[pos, ireg_] += [(cogid, position)] 659 | else: 660 | new_clusters = self.clusters 661 | for (struc, pattern), rest in self.clusters.items(): 662 | for cogid, position in rest: 663 | if (cogid, position) not in new_clusters[struc, pattern]: 664 | new_clusters[struc, pattern] += [(cogid, position)] 665 | text = "ID\tSTRUCTURE\tFREQUENCY\t{0}\t{1}\tCOGNATESETS\tCONCEPTS\n".format( 666 | self.cols[pidx], "\t".join([c for c in self.cols if c != self.cols[pidx]]) 667 | ) 668 | 669 | sound = "" 670 | idx = 0 671 | for (struc, pattern), entries in sorted( 672 | new_clusters.items(), 673 | key=lambda x: (x[0][0], x[0][1][pidx], len(x[1])), 674 | reverse=True, 675 | ): 676 | if sound != pattern[pidx]: 677 | sound = pattern[pidx] 678 | idx = 0 679 | concepts = [] 680 | for x, y in entries: 681 | for entry in self.etd[self.ref][x]: 682 | if entry: 683 | for value in entry: 684 | concepts += [self[value, "concept"]] 685 | concepts = " / ".join(sorted(set(concepts))) 686 | 687 | idx += 1 688 | text += "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format( 689 | self.ptn2id[pattern].split("/")[0], 690 | struc, 691 | len(entries), 692 | pattern[pidx], 693 | "\t".join([p for i, p in enumerate(pattern) if i != pidx]), 694 | ", ".join(["{0}:{1}".format(x, y) for x, y in entries]), 695 | concepts, 696 | ) 697 | pathlib.Path(filename).write_text(text, encoding="utf8") 698 | 699 | def purity(self): 700 | """ 701 | Compute the purity of the cluster analysis. 702 | 703 | .. note:: The purity is here interpreted as the degree to which 704 | patterns are filled with non-missing values. In this sense, it 705 | indicates to which degree information is computed and to which 706 | degree information is already provided by the data itself. 707 | """ 708 | 709 | def get_purity(patterns): 710 | all_sums = [] 711 | for i in range(len(patterns[0])): 712 | col = [line[i] for line in patterns] 713 | subset = set(col) 714 | sums = [] 715 | for itm in subset: 716 | if itm != self.missing: 717 | sums += [col.count(itm) ** 2] 718 | if sums: 719 | sums = math.sqrt(sum(sums)) / len(col) 720 | else: 721 | sums = 0 722 | all_sums += [sums] 723 | return sum(all_sums) / len(all_sums) 724 | 725 | graph = self.get_cluster_graph() 726 | purities = [] 727 | for node, data in graph.nodes(data=True): 728 | patterns = [] 729 | for neighbor in graph[node]: 730 | patterns += [graph.nodes[neighbor]["pattern"].split()] 731 | if patterns: 732 | purities += [get_purity(patterns)] 733 | else: 734 | purities += [0] 735 | return sum(purities) / len(purities) 736 | 737 | def get_cluster_graph(self): 738 | """ 739 | Compute a graph of the clusters. 740 | 741 | .. note:: In the cluster graph, the sites in the alignments are the 742 | nodes and the edges are drawn between nodes assigned to the same 743 | pattern. 
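
        A minimal usage sketch (``cop`` is assumed to be a CoPaR object on
        which cluster_sites() has already been run); the result is a plain
        networkx graph and can be inspected with the usual networkx tools:

        >>> graph = cop.get_cluster_graph()
        >>> nodes = list(graph.nodes(data=True))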
744 | """ 745 | 746 | graph = nx.Graph() 747 | for (pos, ptn), sites in self.clusters.items(): 748 | for site in sites: 749 | graph.add_node( 750 | "{0[0]}-{0[1]}".format(site), 751 | pattern=" ".join(ptn), 752 | site=" ".join(self.sites[site][1]), 753 | ) 754 | 755 | for ((s1, p1), ptn1), ((s2, p2), ptn2) in itertools.combinations( 756 | self.sites.items(), r=2 757 | ): 758 | if ptn1[0] == ptn2[0]: 759 | m, mm = compatible_columns(ptn1[1], ptn2[1]) 760 | if m and not mm: 761 | graph.add_edge("{0}-{1}".format(s1, p1), "{0}-{1}".format(s2, p2)) 762 | return graph 763 | 764 | def upper_bound(self): 765 | """ 766 | Compute upper bound for clique partitioning following Bhasker 1991. 767 | """ 768 | degs = {s: 0 for s in self.sites} 769 | sings = {s: 0 for s in self.sites} 770 | for (nA, (posA, ptnA)), (nB, (posB, ptnB)) in itertools.combinations( 771 | self.sites.items(), r=2 772 | ): 773 | if posA == posB: 774 | m, n = compatible_columns(ptnA, ptnB) 775 | if n > 0: 776 | degs[nA] += 1 777 | degs[nB] += 1 778 | else: 779 | sings[nA] += 1 780 | sings[nB] += 1 781 | else: 782 | degs[nA] += 1 783 | degs[nB] += 1 784 | 785 | return max([b for a, b in degs.items() if sings[a] > 0]) 786 | 787 | def predict_words(self, **kw): 788 | """ 789 | Predict patterns for those cognate sets where we have missing data. 790 | 791 | .. note:: 792 | 793 | Purity (one of the return values) measures how well a given sound 794 | for a given site is reflected by one single sound (rather than 795 | multiple patterns pointing to different sounds) for a given 796 | doculect. It may be seen as a control case for the purity of a given 797 | prediction: if there are many alternative possibilities, this means 798 | that there is more uncertainty regarding the reconstructions or 799 | predictions. 
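
        The method returns a triple: a dictionary mapping cognate set
        identifiers to predicted word forms for the doculects missing from the
        respective alignment, a dictionary with purity scores per alignment
        site, and a dictionary with average purity scores per doculect. A
        minimal usage sketch (assuming ``cop`` is a CoPaR object for which
        alignment sites and clusters have been computed):

        >>> predictions, site_purity, doculect_purity = cop.predict_words()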
800 | 801 | """ 802 | if not hasattr(self, "sites"): 803 | raise ValueError("You need to compute alignment sites first") 804 | 805 | minrefs = self.minrefs 806 | missing = self.missing 807 | samples = kw.get("samples", 3) 808 | 809 | # pre-analyse the data to get for each site the best patterns in ranked 810 | # form 811 | ranked_sites = {} 812 | ranked_clusters = sorted( 813 | [(s, p, len(f)) for (s, p), f in self.clusters.items()], 814 | key=lambda x: x[2], 815 | reverse=True, 816 | ) 817 | for (cogid, pos), ptns in self.patterns.items(): 818 | struc, ptn = self.sites[cogid, pos] 819 | missings = [i for i in range(self.width) if ptn[i] == missing] 820 | if (struc, ptn) in self.clusters: 821 | ranked_sites[cogid, pos] = [ 822 | (len(self.clusters[struc, ptn]), struc, ptn) 823 | ] 824 | else: 825 | ranked_sites[cogid, pos] = [(1, struc, ptn)] 826 | for strucB, ptnB, freq in ranked_clusters: 827 | m, mm = compatible_columns(ptn, ptnB) 828 | if struc == strucB and m >= 1 and mm == 0: 829 | if len(missings) > len( 830 | [ptnB[i] for i in missings if ptnB[i] == missing] 831 | ): 832 | ranked_sites[cogid, pos] += [(freq, strucB, ptnB)] 833 | 834 | purity = {site: {} for site in ranked_sites} 835 | 836 | preds = {} 837 | for cogid, msa in self.msa[self._ref].items(): 838 | missings = [t for t in self.cols if t not in msa["taxa"]] 839 | if len(set(msa["taxa"])) >= minrefs: 840 | words = [bt.strings("") for m in missings] 841 | for i, m in enumerate(missings): 842 | tidx = self.cols.index(m) 843 | for j in range(len(msa["alignment"][0])): 844 | segments = collections.defaultdict(int) 845 | sidx = 0 846 | if (cogid, j) in ranked_sites: 847 | while True: 848 | this_segment = ranked_sites[cogid, j][sidx][2][tidx] 849 | score = ranked_sites[cogid, j][sidx][0] 850 | if this_segment != missing: 851 | segments[this_segment] += score 852 | sidx += 1 853 | if sidx == len(ranked_sites[cogid, j]): 854 | break 855 | 856 | if not (cogid, j) in purity: 857 | purity[cogid, j] = {} 858 | 859 | if not segments: 860 | words[i] += ["Ø"] 861 | purity[cogid, j][m] = 0 862 | else: 863 | purity[cogid, j][m] = math.sqrt( 864 | sum( 865 | [ 866 | (s / sum(segments.values())) ** 2 867 | for s in segments.values() 868 | ] 869 | ) 870 | ) 871 | words[i] += [ 872 | "|".join( 873 | [ 874 | s 875 | for s in sorted( 876 | segments, 877 | key=lambda x: segments[x], 878 | reverse=True, 879 | ) 880 | ][:samples] 881 | ) 882 | ] 883 | if words: 884 | preds[cogid] = dict(zip(missings, words)) 885 | 886 | pudity = {doc: [] for doc in self.cols} 887 | for site, docs in purity.items(): 888 | for doc in docs: 889 | pudity[doc] += [purity[site][doc]] 890 | for doc, purs in pudity.items(): 891 | if purs: 892 | pudity[doc] = sum(purs) / len(purs) 893 | else: 894 | pudity[doc] = 0 895 | 896 | return preds, purity, pudity 897 | -------------------------------------------------------------------------------- /src/lingrex/evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous evaluation functions. 3 | """ 4 | import statistics 5 | from lingpy.evaluate.acd import _get_bcubed_score as bcs 6 | import lingpy 7 | 8 | 9 | def compare_cognate_sets(wordlist, refA, refB): 10 | """ 11 | Compute cognate set comparison statistics by computing B-Cubed Scores. 12 | 13 | Note 14 | ---- 15 | This check was first described in Wu and List (2023). 16 | 17 | > Wu, M.-S. and J.-M. List (2023): Annotating cognates in phylogenetic studies 18 | > of South-East Asian languages. 
Language Dynamics and Change. 161-197. 19 | > DOI: https://doi.org/10.1163/22105832-bja10023 20 | """ 21 | ranks = [] 22 | for concept in wordlist.rows: 23 | cogsA = wordlist.get_list(row=concept, flat=True, entry=refA) 24 | cogsB = wordlist.get_list(row=concept, flat=True, entry=refB) 25 | p, r = bcs(cogsA, cogsB), bcs(cogsB, cogsA) 26 | f = 2 * (p * r) / (p + r) 27 | ranks += [[concept, p, r, f]] 28 | return ranks 29 | 30 | 31 | def cross_semantic_cognate_statistics( 32 | wordlist, 33 | ref="cogids", 34 | concept="concept", 35 | morpheme_glosses="morphemes", 36 | ignore_affixes=True, 37 | affixes=("suf", "suffix", "SUF", "SUFFIX"), 38 | ): 39 | """ 40 | Calculate colexification statistics for partial colexifications. 41 | 42 | :param wordlist: A LingPy wordlist. 43 | :param ref: Reference to the column with cognate identifiers. 44 | :param concept: Reference to the concept column. 45 | :param morpheme_glosses: Reference to the morpheme glosses. 46 | :param ignore_affixes: If set to True, will ignore morphemes flagged as affixes. 47 | :param affixes: List of strings that trigger that a morpheme gloss is 48 | ignored if it contains one of them as a substring. 49 | 50 | Note 51 | ---- 52 | This check was first described in Wu and List (2023). 53 | 54 | > Wu, M.-S. and J.-M. List (2023): Annotating cognates in phylogenetic studies 55 | > of South-East Asian languages. Language Dynamics and Change. 161-197. 56 | > DOI: https://doi.org/10.1163/22105832-bja10023 57 | """ 58 | 59 | # type check for basic types if they are not there 60 | for idx, cogids, morphemes in wordlist.iter_rows(ref, morpheme_glosses): 61 | wordlist[idx, ref] = lingpy.basictypes.ints(cogids) 62 | wordlist[idx, morpheme_glosses] = lingpy.basictypes.strings(morphemes) 63 | 64 | if ignore_affixes: 65 | D = {} 66 | for idx, cogids, morphemes in wordlist.iter_rows(ref, morpheme_glosses): 67 | new_cogids = [] 68 | for cogid, morpheme in zip(cogids, morphemes): 69 | if not sum([1 if s in morpheme else 0 for s in affixes]): 70 | new_cogids += [cogid] 71 | D[idx] = lingpy.basictypes.ints(new_cogids) 72 | wordlist.add_entries(ref + "_derived", D, lambda x: x) 73 | new_ref = ref + "_derived" 74 | else: 75 | new_ref = ref 76 | 77 | etd = wordlist.get_etymdict(ref=new_ref) 78 | indices = {ln: {} for ln in wordlist.cols} 79 | for i, ln in enumerate(wordlist.cols): 80 | for cogid, reflexes in etd.items(): 81 | if reflexes[i]: 82 | concepts = [wordlist[idx, concept] for idx in reflexes[i]] 83 | indices[ln][cogid] = len(set(concepts)) - 1 84 | 85 | all_scores = [] 86 | for cnc in wordlist.rows: 87 | # Loop through all the concepts in the data 88 | reflexes = wordlist.get_list( 89 | row=cnc, flat=True 90 | ) # The lexical entries of the concept. 91 | scores = [] 92 | for idx in reflexes: 93 | doculect, cogids = wordlist[idx, "doculect"], wordlist[idx, new_ref] 94 | scores += [statistics.mean([indices[doculect][cogid] for cogid in cogids])] 95 | all_scores += [[cnc, statistics.mean(scores)]] 96 | return sorted(all_scores, key=lambda x: (x[1], x[0])) 97 | -------------------------------------------------------------------------------- /src/lingrex/fuzzy.py: -------------------------------------------------------------------------------- 1 | """Create fuzzy reconstructions.""" 2 | from lingrex.reconstruct import PatternReconstructor 3 | import random 4 | from lingpy.util import pb as progressbar 5 | import lingpy 6 | 7 | 8 | def ntile(words, n=5, gap="-", missing="Ø"): 9 | """ 10 | Represent aligned words in form of n-tiles. 
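
    Example (a minimal illustration: with the default of n=5, each alignment
    column is represented by five weighted candidate sounds):

    >>> ntile([list("ta"), list("ta")])
    't|t|t|t|t a|a|a|a|a'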
11 | """ 12 | if len(words) == 1: 13 | return ' '.join([x for x in words[0] if x != gap]) 14 | 15 | # start counting the occurrences 16 | cols = [] 17 | for i in range(len(words[0])): 18 | col = [line[i] for line in words] 19 | cols += [col] 20 | 21 | ntile = len(words) / n 22 | 23 | sounds = [] 24 | for col in cols: 25 | col = [x for x in col if x != missing] 26 | if not col: 27 | sounds += ['?'] 28 | else: 29 | ntile = len(col) / n 30 | dist = {} 31 | sounds += [[]] 32 | for s in set(col): 33 | dist[s] = int(col.count(s) / ntile + 0.5) 34 | for s, t in sorted(dist.items(), key=lambda x: x[1], reverse=True): 35 | for i in range(t): 36 | sounds[-1] += [s] 37 | iterated = 0 38 | while len(sounds[-1]) < n: 39 | sounds[-1] += sounds[-1] 40 | iterated += 1 41 | if iterated >= n: 42 | sounds[-1] += n * ["Ø"] 43 | sounds[-1] = sorted(sounds[-1][:n], key=lambda x: 44 | sounds[-1].count(x), reverse=True) 45 | sounds[-1] = '|'.join(sounds[-1]) 46 | 47 | return ' '.join([s for s in sounds if s.split('|').count(gap) != 48 | len(s.split('|'))-1]) 49 | 50 | 51 | class FuzzyReconstructor: 52 | """ 53 | Carry out fuzzy reconstructions by running reconstructions from different parts of the data. 54 | 55 | Note 56 | ---- 57 | This method was introduced in the study by List et al. (forthcoming): 58 | 59 | > List, J.-M.; Hill, N. W.; Blum, F.; and Forkel, R. (forthcoming): A New Framework for the 60 | > Representation and Computation of Uncertainty in Phonological Reconstruction. To appear in: 61 | > Proceedings of the 4th Workshop on Computational Approaches to Historical Language Change. 62 | """ 63 | 64 | def __init__(self, infile, target, ref="cogid", fuzzy=False, transcription="form"): 65 | if isinstance(infile, (str, dict)): 66 | wordlist = lingpy.align.sca.Alignments( 67 | infile, ref=ref, transcription=transcription 68 | ) 69 | elif isinstance( 70 | infile, (lingpy.align.sca.Alignments, lingpy.basic.wordlist.Wordlist) 71 | ): 72 | wordlist = infile 73 | else: 74 | raise ValueError("Argument for infile must be a string or a wordlist.") 75 | self.wordlist = wordlist 76 | self.target = target or self.wordlist.cols[0] 77 | self.ref = ref 78 | self.fuzzy = fuzzy 79 | 80 | def random_splits(self, splits=10, retain=0.9): 81 | idxs = [ 82 | idx 83 | for idx in self.wordlist 84 | if self.wordlist[idx, "doculect"] != self.target 85 | ] 86 | 87 | tidxs = self.wordlist.get_list(col=self.target, flat=True) 88 | cogids = [self.wordlist[idx, self.ref] for idx in tidxs] 89 | 90 | self.samples = [] 91 | for i in range(splits): 92 | self.samples += [random.sample(idxs, int(retain * len(idxs) + 0.5))] 93 | 94 | self.wordlists = {} 95 | for i, sample in enumerate(self.samples): 96 | D = {0: [c for c in self.wordlist.columns]} 97 | for idx in sample: 98 | D[idx] = [self.wordlist[idx, c] for c in D[0]] 99 | selected_cogids = [self.wordlist[idx, self.ref] for idx in sample] 100 | for cogid, tidx in zip(cogids, tidxs): 101 | if cogid in selected_cogids: 102 | D[tidx] = [self.wordlist[tidx, c] for c in D[0]] 103 | self.wordlists[i] = PatternReconstructor( 104 | D, ref=self.ref, target=self.target, fuzzy=self.fuzzy 105 | ) 106 | 107 | 108 | def fit_samples(self, clf, onehot=False, func=None, aligned=False, pb=False): 109 | pb = progressbar if pb else lambda x, desc: x 110 | for i, wordlist in pb(self.wordlists.items(), desc="fitting data"): 111 | wordlist.fit(clf=clf(), onehot=onehot, func=func, aligned=aligned) 112 | 113 | 114 | def predict( 115 | self, 116 | alignment, 117 | languages, 118 | desegment=True, 119 | 
orchar="¦", 120 | scorechar=":", 121 | output="percentiles", 122 | ): 123 | words = [] 124 | for i, wordlist in self.wordlists.items(): 125 | word = wordlist.predict(alignment, languages, desegment=False) 126 | 127 | words += [word] 128 | # transform to dictionary 129 | counts = {i: [] for i in range(len(words[0]))} 130 | for word in words: 131 | for i, sound in enumerate(word): 132 | counts[i] += [sound] 133 | # get percentiles 134 | if output in ["percentiles", "wp"]: 135 | out = [] 136 | for i, sounds in sorted(counts.items(), key=lambda x: x[0]): 137 | distinct = {s: sounds.count(s) / len(sounds) for s in set(sounds)} 138 | distinct_s = [ 139 | "{0}{1}{2}".format(k, scorechar, int(100 * v + 0.5)) 140 | for k, v in sorted( 141 | distinct.items(), key=lambda x: x[1], reverse=True 142 | ) 143 | ] 144 | 145 | out += [orchar.join(distinct_s)] 146 | if output == "percentiles": 147 | return out 148 | return words, out 149 | elif output == "words": 150 | return words 151 | 152 | -------------------------------------------------------------------------------- /src/lingrex/reconstruct.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module provides methods for linguistic reconstruction. 3 | """ 4 | 5 | import itertools 6 | import collections 7 | 8 | from lingpy.align.sca import Alignments, get_consensus 9 | from lingpy.sequence.sound_classes import prosodic_string, class2tokens 10 | from lingpy.align.multiple import Multiple 11 | from lingpy.align.pairwise import edit_dist, nw_align 12 | from lingpy.evaluate.acd import _get_bcubed_score as get_bcubed_score 13 | from lingpy.align.sca import normalize_alignment 14 | import networkx as nx 15 | from networkx.algorithms.clique import find_cliques 16 | from lingpy import log 17 | 18 | from lingrex.util import clean_sound, ungap, alm2tok 19 | 20 | 21 | class CorPaRClassifier(object): 22 | """ 23 | A classifier for word prediction based on correspondence patterns. 24 | 25 | Note 26 | ---- 27 | This classifier was first used in List et al. (2022). 28 | 29 | > List, J.-M., N. Hill, and R. Forkel (2022): A new framework for fast 30 | > automated phonological reconstruction using trimmed alignments and sound 31 | > correspondence patterns. In: Proceedings of the 3rd Workshop on 32 | > Computational Approaches to Historical Language Change. Association for 33 | > Computational Linguistics 89-96. URL: https://aclanthology.org/2022.lchange-1.9 34 | """ 35 | 36 | def __init__(self, minrefs=2, missing=0, threshold=1): 37 | self.G = nx.Graph() 38 | self.missing = 0 39 | self.threshold = threshold 40 | 41 | def compatible(self, ptA, ptB): 42 | """ 43 | Check for compatibility of two patterns. 44 | """ 45 | res = {True: 0, False: 0} 46 | for a, b in zip(ptA, ptB): 47 | if a and b: 48 | res[a == b] += 1 49 | return res[True], res[False] 50 | 51 | def consensus(self, nodes): 52 | """ 53 | Create a consensus pattern of multiple alignment sites. 54 | """ 55 | cons = [] 56 | for i in range(len(nodes[0])): 57 | nocons = True 58 | for node in nodes: 59 | if node[i] != self.missing: 60 | cons += [node[i]] 61 | nocons = False 62 | break 63 | if nocons: 64 | cons += [self.missing] 65 | return tuple(cons) 66 | 67 | def fit(self, X, y): 68 | """ 69 | Train the prediction of data in y with data in X. 70 | 71 | :param X: Two-dimensional array with observations. 72 | :param y: One-dimensional array with results. 
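
        A minimal sketch with toy integer-coded data (0 encodes missing
        values):

        >>> clf = CorPaRClassifier()
        >>> clf.fit([[1, 2, 0], [1, 2, 3]], [4, 4])
        >>> clf.predict([[1, 2, 0]])
        [4]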
73 | """ 74 | # get identical patterns 75 | P = collections.defaultdict(list) 76 | for i, row in enumerate(X): 77 | P[tuple(row + [y[i]])] += [i] 78 | # make graph 79 | for (pA, vA), (pB, vB) in itertools.combinations(P.items(), r=2): 80 | match_, mismatch = self.compatible(pA, pB) 81 | if not mismatch and match_ >= self.threshold: 82 | if pA not in self.G: 83 | self.G.add_node(pA, freq=len(vA)) 84 | if pB not in self.G: 85 | self.G.add_node(pB, freq=len(vB)) 86 | self.G.add_edge(pA, pB, weight=match_) 87 | self.patterns = collections.defaultdict(collections.Counter) 88 | self.lookup = collections.defaultdict(collections.Counter) 89 | # get cliques 90 | for nodes in find_cliques(self.G): 91 | cons = self.consensus(list(nodes)) 92 | self.patterns[cons[:-1]][cons[-1]] = len(nodes) 93 | for node in nodes: 94 | self.lookup[node[:-1]][cons[:-1]] += len(nodes) 95 | self.predictions = { 96 | ptn: counts.most_common(1)[0][0] for ptn, counts in self.patterns.items() 97 | } 98 | for ptn, counts in self.lookup.items(): 99 | self.predictions[ptn] = self.predictions[counts.most_common(1)[0][0]] 100 | 101 | # make index of data points for quick search based on attested data 102 | self.ptnlkp = collections.defaultdict(list) 103 | for ptn in self.patterns: 104 | for i in range(len(ptn)): 105 | if ptn[i] != self.missing: 106 | self.ptnlkp[i, ptn[i]] += [ptn] 107 | 108 | def predict(self, matrix): 109 | out = [] 110 | for row in matrix: 111 | ptn = tuple(row) 112 | if ptn in self.predictions: 113 | out.append(self.predictions[ptn]) 114 | else: 115 | candidates = collections.Counter() 116 | for i in range(len(ptn) - 1): 117 | if ptn[i] != self.missing: 118 | for ptnB in self.ptnlkp[i, ptn[i]]: 119 | if ptnB not in candidates: 120 | match_, mismatch = self.compatible(ptn, ptnB) 121 | if match_ and not mismatch: 122 | candidates[ptnB] = match_ + len(ptn) 123 | elif match_ - mismatch: 124 | candidates[ptnB] = match_ - mismatch 125 | if candidates: 126 | self.predictions[tuple(row)] = self.predictions[ 127 | candidates.most_common(1)[0][0] 128 | ] 129 | out += [self.predictions[tuple(row)]] 130 | else: 131 | out += [self.missing] 132 | return out 133 | 134 | 135 | class ReconstructionBase(Alignments): 136 | """ 137 | Basic class for the phonological reconstruction. 138 | """ 139 | 140 | def __init__( 141 | self, 142 | infile, 143 | target, 144 | ref="cogids", 145 | fuzzy=True, 146 | transcription="form", 147 | missing="Ø", 148 | gap="-", 149 | ): 150 | Alignments.__init__( 151 | self, infile, fuzzy=fuzzy, ref=ref, transcription=transcription 152 | ) 153 | self.target = target 154 | self.missing = missing 155 | self.gap = gap 156 | self.languages = [t for t in self.cols if t != target] 157 | self.target = target 158 | self.tgtidx = self.cols.index(target) 159 | self.lngidx = {t: self.cols.index(t) for t in self.languages} 160 | 161 | def iter_sequences(self, aligned=False): 162 | """ 163 | Iterate over aligned or unaligned sequences with or without the target \ 164 | sequence. 
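
        Yields triples of cognate identifier, list of sequences, and list of
        language names; only cognate sets attested in the target language are
        considered, and the target language and its sequence are always listed
        last.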
165 | """ 166 | seq_ref = self._alignments if aligned else self._segments 167 | for cogid, idxs in self.etd[self._ref].items(): 168 | if idxs[self.tgtidx]: 169 | if self._mode == "fuzzy": 170 | target = self[idxs[self.tgtidx][0], seq_ref].n[ 171 | self[idxs[self.tgtidx][0], self._ref].index(cogid) 172 | ] 173 | else: 174 | target = self[idxs[self.tgtidx][0], seq_ref] 175 | alignment, languages = [], [] 176 | for j, lng in enumerate(self.languages): 177 | lidx = self.lngidx[lng] 178 | if idxs[lidx]: 179 | languages += [lng] 180 | idx = idxs[lidx][0] 181 | if self._mode == "fuzzy": 182 | alm = self[idx, seq_ref].n[ 183 | self[idx, self._ref].index(cogid) 184 | ] 185 | else: 186 | alm = self[idx, seq_ref] 187 | alignment.append([clean_sound(x) for x in alm]) 188 | alignment.append([clean_sound(x) for x in target]) 189 | if aligned: 190 | alignment = normalize_alignment(alignment) 191 | languages.append(self.target) 192 | yield cogid, alignment, languages 193 | 194 | 195 | class OneHot(object): 196 | """ 197 | Create a one-hot-encoder from a matrix. 198 | """ 199 | 200 | def __init__(self, matrix): 201 | self.vals = [] 202 | for i in range(len(matrix[0])): 203 | cols = [row[i] for row in matrix] 204 | self.vals += [sorted(set(cols)) + ["?"]] 205 | 206 | def __call__(self, matrix): 207 | out = [[] for row in matrix] 208 | for i, vals in enumerate(self.vals): 209 | for j in range(len(matrix)): 210 | template = [0 for k in vals] 211 | try: 212 | template[matrix[j][i]] = 1 213 | except IndexError: 214 | template[-1] = 1 215 | out[j] += template 216 | return out 217 | 218 | 219 | def transform_alignment( 220 | seqs, 221 | languages, 222 | all_languages, 223 | align=True, 224 | training=True, 225 | missing="Ø", 226 | gap="-", 227 | startend=False, 228 | prosody=False, 229 | position=False, 230 | firstlast=False, 231 | ): 232 | """ 233 | Basic alignment function used for phonological reconstruction. 
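
    :param seqs: List of tokenized (and possibly aligned) sequences.
    :param languages: Languages corresponding to the rows in `seqs`.
    :param all_languages: All languages in the data, defining the columns of
        the resulting matrix.
    :param align: If True, align the sequences first; if False, they are
        assumed to be aligned already and are only normalized.
    :param training: If True, the last sequence is treated as the target and
        alignment sites gapped in all other languages are merged (see `ungap`).

    The remaining flags (`startend`, `prosody`, `position`, `firstlast`) add
    extra feature columns to the matrix (start/end markers, prosodic
    structure, site index, and first/last sounds, respectively).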
234 | """ 235 | if align: 236 | seqs = [[s for s in seq if s != gap] for seq in seqs] 237 | msa = Multiple([[s for s in seq if s != gap] for seq in seqs]) 238 | msa.prog_align() 239 | alms = [alm for alm in msa.alm_matrix] 240 | else: 241 | alms = normalize_alignment([s for s in seqs]) 242 | seqs = [[s for s in seq if s != gap] for seq in seqs] 243 | if training: 244 | alms = ungap(alms, languages, languages[-1]) 245 | these_seqs = seqs[:-1] 246 | else: 247 | these_seqs = seqs 248 | 249 | matrix = [[missing for x in all_languages] for y in alms[0]] 250 | for i in range(len(alms[0])): 251 | for j, lng in enumerate(languages): 252 | lidx = all_languages.index(lng) 253 | snd = clean_sound(alms[j][i]) 254 | matrix[i][lidx] = snd 255 | if position: 256 | for i in range(len(matrix)): 257 | matrix[i] += [i] 258 | if startend: 259 | matrix[0] += [0] 260 | for i in range(1, len(matrix) - 1): 261 | matrix[i] += [1] 262 | if len(matrix) > 1: 263 | matrix[-1] += [2] 264 | if prosody: 265 | for i, c in enumerate( 266 | get_consensus( 267 | [ 268 | class2tokens(prosodic_string(these_seqs[j], _output="CcV"), alms[j]) 269 | for j in range(len(these_seqs)) 270 | ], 271 | gaps=True, 272 | ) 273 | ): 274 | matrix[i] += [c] 275 | if firstlast: 276 | if training: 277 | all_seqs = len(all_languages) - 1 278 | else: 279 | all_seqs = len(all_languages) 280 | for i, row in enumerate(matrix): 281 | for j in range(all_seqs): 282 | matrix[i] += [matrix[0][j], matrix[-1][j]] 283 | 284 | # for debugging 285 | for row in matrix: 286 | assert len(row) == len(matrix[0]) 287 | return matrix 288 | 289 | 290 | class PatternReconstructor(ReconstructionBase): 291 | """ 292 | Automatic reconstruction with correspondence patterns. 293 | 294 | Note 295 | ---- 296 | This classifier was first used in List et al. (2022). 297 | 298 | > List, J.-M., N. Hill, and R. Forkel (2022): A new framework for fast 299 | > automated phonological reconstruction using trimmed alignments and sound 300 | > correspondence patterns. In: Proceedings of the 3rd Workshop on 301 | > Computational Approaches to Historical Language Change. Association for 302 | > Computational Linguistics 89-96. URL: https://aclanthology.org/2022.lchange-1.9 303 | """ 304 | 305 | def fit(self, clf=None, onehot=False, func=None, aligned=False): 306 | """ 307 | Fit a classifier to the data. 308 | 309 | :param clf: a classifier with a predict function. 
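
        A hypothetical usage sketch (the file name and the target doculect are
        assumptions, not data shipped with the package):

        >>> pt = PatternReconstructor("wordlist.tsv", target="ProtoLanguage", ref="cogids")
        >>> pt.fit()  # defaults to the CorPaRClassifier defined above
        >>> proto = pt.predict([list("tao"), list("teo")], ["LanguageA", "LanguageB"])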
310 | """ 311 | self.patterns = collections.defaultdict(lambda: collections.defaultdict(list)) 312 | self.occurrences = collections.defaultdict(list) 313 | self.func = func or transform_alignment 314 | 315 | for cogid, alignment, languages in self.iter_sequences(): 316 | if len(alignment) >= 2: 317 | matrix = self.func( 318 | alignment, languages, self.languages + [self.target], training=True 319 | ) 320 | for i, row in enumerate(matrix): 321 | ptn = tuple( 322 | row[: len(self.languages)] + row[len(self.languages) + 1 :] 323 | ) 324 | self.patterns[ptn][row[len(self.languages)]] += [(cogid, i)] 325 | for j, lng in enumerate(self.languages): 326 | if row[j] not in [self.missing]: 327 | self.occurrences[lng, j, row[j]] += [(cogid, i)] 328 | for j in range(len(self.languages) + 1, len(row)): 329 | self.occurrences[ 330 | "feature-{0}".format(j - 1), j - 1, row[j] 331 | ] += [(cogid, i)] 332 | 333 | self.snd2idx = {(i, self.missing): 0 for i in range(len(matrix[0]))} 334 | for i in range(len(matrix[0])): 335 | self.snd2idx[i, self.gap] = 1 336 | 337 | idxtracker = {i: 2 for i in range(len(matrix[0]))} 338 | for lng, lidx, sound in self.occurrences: 339 | last_idx = idxtracker[lidx] 340 | if (lidx, sound) not in self.snd2idx: 341 | self.snd2idx[lidx, sound] = last_idx 342 | idxtracker[lidx] += 1 343 | 344 | self.tgt2idx = {} 345 | idx = 1 346 | for pattern in self.patterns: 347 | for sound in self.patterns[pattern]: 348 | if sound not in self.tgt2idx: 349 | self.tgt2idx[sound] = idx 350 | idx += 1 351 | 352 | self.matrix = [] 353 | self.solutions = [] 354 | for pattern, sounds in self.patterns.items(): 355 | for sound, vals in sounds.items(): 356 | tidx = self.tgt2idx[sound] 357 | row = [] 358 | for i in range(len(pattern)): 359 | sidx = self.snd2idx[i, pattern[i]] 360 | row += [sidx] 361 | for cogid, idx in vals: 362 | self.matrix += [row] 363 | self.solutions += [tidx] 364 | self.dim = len(self.matrix[0]) 365 | if clf is not None: 366 | self.clf = clf 367 | else: 368 | self.clf = CorPaRClassifier() 369 | log.info("fitting classifier") 370 | if onehot: 371 | self.onehot = OneHot(self.matrix) 372 | self.clf.fit(self.onehot(self.matrix), self.solutions) 373 | else: 374 | self.clf.fit(self.matrix, self.solutions) 375 | self.idx2tgt = {v: k for k, v in self.tgt2idx.items()} 376 | log.info("fitted the classifier") 377 | 378 | def predict(self, alignment, languages, unknown="?", onehot=False, desegment=True): 379 | """ 380 | Predict a word form from an alignment. 381 | 382 | :param desegment: Return the form without gaps and ungapped tokens. 383 | """ 384 | matrix = self.func(alignment, languages, self.languages, training=False) 385 | for row in matrix: 386 | assert len(row) == self.dim 387 | new_matrix = [[0 for char in row] for row in matrix] 388 | for i, row in enumerate(matrix): 389 | for j, char in enumerate(row): 390 | new_matrix[i][j] = self.snd2idx.get((j, char), 0) 391 | if hasattr(self, "onehot"): 392 | new_matrix = self.onehot(new_matrix) 393 | out = [self.idx2tgt.get(idx, unknown) for idx in self.clf.predict(new_matrix)] 394 | return alm2tok(out) if desegment else out 395 | 396 | 397 | def eval_by_dist(data, func=None, **kw): 398 | """ 399 | Evaluate by measuring distances between sequences. 400 | 401 | :param data: List of tuples with prediction and attested sequence. 402 | :param func: Alignment function (defaults to edit distance) 403 | 404 | :note: Defaults to the unnormalized edit distance. 
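
    Example (mean edit distance over two word pairs, one identical and one
    differing in a single segment):

    >>> eval_by_dist([(["t", "a"], ["t", "a"]), (["t", "a"], ["t", "o"])])
    0.5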
405 | """ 406 | func = func or edit_dist 407 | scores = [] 408 | for seqA, seqB in data: 409 | if not seqA: 410 | seqA = ["?"] 411 | if not seqB: 412 | seqB = ["?"] 413 | scores += [func(seqA, seqB, **kw)] 414 | return sum(scores) / len(scores) 415 | 416 | 417 | def eval_by_bcubes(data, func=None, **kw): 418 | """ 419 | Evaluate by measuring B-Cubed F-scores. 420 | 421 | :param data: List of tuples with prediction and attested sequence. 422 | :param func: Alignment function (defaults to Needleman-Wunsch) 423 | 424 | Note 425 | ---- 426 | This evaluation was first introduced in List (2019). 427 | 428 | > List, J.-M. (2019): Beyond Edit Distances: Comparing linguistic 429 | > reconstruction systems. Theoretical Linguistics 45.3-4. 1-10. DOI: 430 | > https://doi.org/10.1515/tl-2019-0016 431 | """ 432 | numsA, numsB = {"": 0}, {"": 0} 433 | func = func or nw_align 434 | almsA, almsB = [], [] 435 | for seqA, seqB in data: 436 | if not seqA: 437 | seqA = ["?"] 438 | if not seqB: 439 | seqB = ["?"] 440 | almA, almB, score = func(seqA, seqB, **kw) 441 | for a, b in zip(almA, almB): 442 | if a not in numsA: 443 | numsA[a] = max(numsA.values()) + 1 444 | if b not in numsB: 445 | numsB[b] = max(numsB.values()) + 1 446 | almsA += [numsA[a]] 447 | almsB += [numsB[b]] 448 | p, r = get_bcubed_score(almsA, almsB), get_bcubed_score(almsB, almsA) 449 | return 2 * (p * r) / (p + r) 450 | -------------------------------------------------------------------------------- /src/lingrex/regularity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Calculate regularity metrics on dataset. 3 | """ 4 | import statistics 5 | 6 | from lingpy import log 7 | 8 | 9 | def regularity(wordlist, threshold=3, ref="cogid", min_refs=3, 10 | word_threshold=0.75, sound_classes="cv"): 11 | """ 12 | Check regularity in three flavors. 13 | 14 | - regularity based on the number of correspondence patterns that have more 15 | or the same number of sites as threshold 16 | - the proportion of correspondence patterns identified as regular via 17 | threshold counting all alignment sites 18 | - the proportion of words that we judge regular, judging words to be 19 | regular when more than the proportion word_threshold of sites are judged 20 | to be regular since they can be assigned to patterns that are covered by 21 | more than threshol sites 22 | 23 | :param wordlist: A lingpy Wordlist. 24 | :type wordlist: :class:lingpy.Wordlist 25 | :param threshold: The minimum number of alignment sites for a cognate set 26 | to be considered in the computation of regular words. Defaults to '3'. 27 | :type threshold: int 28 | :param ref: The column which stores the cognate sets, defaults to 'cogid' 29 | :type ref: str 30 | :param min_refs: The minimum number of occurrences a correspondence pattern 31 | to be considered recurring. Defaults to '3'. 32 | :type min_refs: int 33 | :param word_threshold: The relative threshold of patterns that need to be regular 34 | in order for a word to be considered regular as well. Defaults to '0.75'. 35 | :type word_threshold: float 36 | :param sound_classes: A string of characters or a list or a set of strings 37 | that contain the sound classes that the regularity should concentrate on. 38 | :type sound_clasess: str, list, set, tuple 39 | :return: Different scores of regularity. 40 | :rtype: tuple 41 | 42 | 43 | Note 44 | ---- 45 | These regularity checks were first introduced in a study by Blum and List (2023): 46 | 47 | > Blum, F. and J.-M. 
List (2023): Trimming phonetic alignments improves the inference of 48 | > sound correspondence patterns from multilingual wordlists. 49 | > In: Proceedings of the 5th Workshop on Computational Typology and Multilingual NLP. 50 | > Association for Computational Linguistics 52-64. https://aclanthology.org/2023.sigtyp-1.6 51 | """ 52 | if not hasattr(wordlist, "clusters"): 53 | raise ValueError("need a CoPaR object with clusters") 54 | patterns = len({p: len(vals) for p, vals in wordlist.clusters.items() \ 55 | if p[0] in sound_classes}) 56 | regular_patterns = len( 57 | [p for p, vals in wordlist.clusters.items() \ 58 | if len(vals) >= threshold and p[0] in sound_classes]) 59 | regular_proportion = sum( 60 | [len(vals) for p, vals in wordlist.clusters.items() \ 61 | if len(vals) >= threshold and p[0] in sound_classes] 62 | ) 63 | full_proportion = sum([len(vals) for p, vals in wordlist.clusters.items() \ 64 | if p[0] in sound_classes]) 65 | 66 | # get the proportion of words 67 | regular_words, irregular_words = 0, 0 68 | for cogid, msa in filter( 69 | lambda x: len(set(x[1]["taxa"])) >= min_refs, wordlist.msa[ref].items() 70 | ): 71 | scores = [] 72 | for idx in range(len(msa["alignment"][0])): 73 | if (cogid, idx) not in wordlist.patterns: # pragma: no cover 74 | log.warning("duplicate cognate in {0} / {1}".format(cogid, idx)) 75 | else: 76 | if wordlist.patterns[cogid, idx][0][1] in sound_classes: 77 | if ( 78 | max( 79 | [ 80 | len(wordlist.clusters[b, c]) 81 | for a, b, c in wordlist.patterns[cogid, idx] 82 | ] 83 | ) 84 | >= threshold 85 | ): 86 | scores.append(1) 87 | else: 88 | scores.append(0) 89 | if scores: 90 | if statistics.mean(scores) >= word_threshold: 91 | regular_words += len(set(msa["taxa"])) 92 | else: 93 | irregular_words += len(set(msa["taxa"])) 94 | 95 | return ( 96 | regular_patterns, 97 | patterns - regular_patterns, 98 | patterns, 99 | round((regular_patterns / patterns), 2), 100 | regular_proportion, 101 | full_proportion - regular_proportion, 102 | full_proportion, 103 | round((regular_proportion / full_proportion), 2), 104 | regular_words, 105 | irregular_words, 106 | regular_words + irregular_words, 107 | round((regular_words / (regular_words + irregular_words)), 2), 108 | ) 109 | -------------------------------------------------------------------------------- /src/lingrex/trimming.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functionality to trim alignments by removing sites. 3 | """ 4 | import random 5 | import typing 6 | import functools 7 | import itertools 8 | import collections 9 | 10 | from lingpy.sequence.sound_classes import token2class 11 | 12 | from lingrex.util import subsequence_of 13 | 14 | __all__ = ["GAP", "Site", "Sites", "prep_alignments"] 15 | GAP = "-" 16 | 17 | 18 | class Site(list): 19 | """ 20 | A site in an alignment is a "column", i.e. a list of the n-th sound in the aligned words. 21 | """ 22 | 23 | def gap_ratio(self, gap: str = GAP) -> float: 24 | return self.count(gap) / len(self) 25 | 26 | def first_sound(self, gap=GAP): 27 | for s in itertools.dropwhile(lambda c: c == gap, self): 28 | return s 29 | 30 | def soundclass(self, gap: str = GAP) -> str: 31 | return token2class(self.first_sound(gap=gap) or "+", "cv") 32 | 33 | 34 | class Sites(list): 35 | """ 36 | A Sites object represents an alignment in the orthogonal view, i.e. listing columns rather 37 | than rows. 38 | 39 | .. 
code-block:: python 40 | 41 | >>> s = Sites([list('s-terb-'), list('mete---'), list('-ate-bu'), list('--te-b-')]) 42 | >>> print(s) 43 | s - t e r b - 44 | m e t e - - - 45 | - a t e - b u 46 | - - t e - b - 47 | >>> print(s.trimmed(strategy='gap-oriented')) 48 | t e b 49 | t e - 50 | t e b 51 | t e b 52 | >>> print(s.trimmed(strategy='core-oriented')) 53 | t e r b 54 | t e - - 55 | t e - b 56 | t e - b 57 | >>> print(s.trimmed(strategy='core-oriented', threshold=0.6)) 58 | s - t e r b 59 | m e t e - - 60 | - a t e - b 61 | - - t e - b 62 | 63 | Note 64 | ---- 65 | Trimming of sites in an alignment was first introduced in a study by Blum and List (2023): 66 | 67 | > Blum, F. and J.-M. List (2023): Trimming phonetic alignments improves the inference of 68 | > sound correspondence patterns from multilingual wordlists. 69 | > In: Proceedings of the 5th Workshop on Computational Typology and Multilingual NLP. 70 | > Association for Computational Linguistics 52-64. https://aclanthology.org/2023.sigtyp-1.6 71 | """ 72 | 73 | def __init__( 74 | self, 75 | alms: typing.Optional[typing.List[typing.List[str]]] = None, 76 | sites: typing.Optional[typing.List[Site]] = None, 77 | gap: str = GAP, 78 | ): 79 | """ 80 | :parameter alms: List of aligned sequences. 81 | :parameter gap: String that codes gaps in alignment sites. 82 | """ 83 | assert (alms or sites) and not (alms and sites) 84 | assert alms is None or ( 85 | isinstance(alms[0], list) and isinstance(alms[0][0], str) 86 | ), "Expected list of lists of str, got {}".format(alms) 87 | self.gap = gap 88 | super().__init__( 89 | sites 90 | if sites 91 | else (Site([row[i] for row in alms]) for i in range(len(alms[0]))) 92 | ) 93 | 94 | @property 95 | def gap_ratios(self) -> typing.List[float]: 96 | return [s.gap_ratio(gap=self.gap) for s in self] 97 | 98 | @property 99 | def soundclasses(self) -> typing.List[str]: 100 | return [s.soundclass(gap=self.gap) for s in self] 101 | 102 | def _trimmed(self, idxs: typing.Iterable[int]) -> "Sites": 103 | """ 104 | Trim by removing the sites specified by index in `idxs`. 105 | """ 106 | idxs = set(idxs) 107 | return Sites(sites=[s for idx, s in enumerate(self) if idx not in idxs]) 108 | 109 | def to_alignment(self) -> typing.List[typing.List[str]]: 110 | return [[s[i] for s in self] for i in range(len(self[0]))] 111 | 112 | def __str__(self): 113 | return "\n".join("\t".join(w) for w in self.to_alignment()) 114 | 115 | def trimmed( 116 | self, 117 | strategy: str = "gap-oriented", 118 | threshold: float = 0.5, 119 | skeletons: typing.Iterable[str] = ("CV", "VC"), 120 | strict_ratio: bool = True, 121 | exclude="_+", 122 | ) -> "Sites": 123 | """ 124 | Trim by removing candidate sites as long as this leaves an alignment containing at least 125 | one of the cv-patterns from `skeletons`. 126 | 127 | Candidates are identified using `strategy`: 128 | - `'gap-oriented'`: Trim alignment sites by gaps. Candidates are groups of sites with the \ 129 | same gap ratio. Candidate groups are tried in order of decreasing gap ratio, and the \ 130 | trimming stops when not all sites in a group could be trimmed. 131 | - `'core-oriented'`: Trim alignment sites by gaps, preserving a core of sites. Candidates \ 132 | are tried from start and end inwards. 133 | 134 | :parameter threshold: Threshold for gap ratio to qualify sites for trimming. 135 | :param skeletons: Iterable of syllable-skeletons at least one of which should be preserved \ 136 | for further processing. 
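        :param strict_ratio: If True (the default), stop trimming as soon as a \
            candidate group of sites cannot be removed completely; otherwise \
            continue with the remaining candidate groups.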
137 | :param exclude: Sequence of strings that should be excluded from further processing,\ 138 | e.g. morpheme boundaries. 139 | """ 140 | if strategy in {"gap-oriented", "gap"}: 141 | # Sites with big enough gap ratio, grouped by ratio, ordered by decreasing ratio. 142 | candidates = [ 143 | [i[0] for i in idxs] 144 | for score, idxs in 145 | # Note that the sort order must be a total ordering to make trimming reproducible. 146 | itertools.groupby( 147 | sorted( 148 | enumerate(self.gap_ratios), 149 | key=lambda x: (x[1], -x[0]), 150 | reverse=True, 151 | ), 152 | lambda i: i[1], 153 | ) 154 | if score >= threshold 155 | ] 156 | elif strategy in {"core-oriented", "core"}: 157 | gap_or_not = [ 158 | self.gap if ratio >= threshold else "S" for ratio in self.gap_ratios 159 | ] 160 | takewhile_gap = functools.partial( 161 | itertools.takewhile, lambda c: c[1] == self.gap 162 | ) 163 | leading_gaps = [i for i, _ in takewhile_gap(enumerate(gap_or_not))] 164 | trailing_gaps = [ 165 | len(gap_or_not) - 1 - i 166 | for i, _ in takewhile_gap(enumerate(reversed(gap_or_not))) 167 | ] 168 | candidates = trailing_gaps + leading_gaps 169 | else: 170 | raise ValueError( 171 | "Unknown strategy: {}".format(strategy) 172 | ) # pragma: no cover 173 | 174 | skeleton = list(enumerate(self.soundclasses)) 175 | idxs = {i for i, c in skeleton if c in exclude} # Exclude markers. 176 | for idxss in candidates: 177 | if not isinstance(idxss, list): 178 | idxss = [idxss] 179 | trimmed = [] 180 | for idx in idxss: 181 | current_skeleton = [c for i, c in skeleton if i not in idxs | {idx}] 182 | if any(subsequence_of(s, current_skeleton) for s in skeletons): 183 | # Trimming this site leaves a "big enough" remainder. 184 | idxs.add(idx) 185 | trimmed.append(True) 186 | else: 187 | trimmed.append(False) 188 | if strict_ratio and not all(trimmed): 189 | break 190 | return self._trimmed(idxs) 191 | 192 | def trimmed_random( 193 | self, 194 | strategy: str = "gap-oriented", 195 | threshold: float = 0.5, 196 | skeletons: typing.Iterable[str] = ("CV", "VC"), 197 | exclude="_+", 198 | ) -> "Sites": 199 | """ 200 | For a base trim function, return a random version with a similar CV distribution. 201 | 202 | :parameter method: Trimming function that should be applied to compute the CV distribution.\ 203 | Specified as name of a suitable method of `Sites`, or as callable. 204 | :parameter threshold: Threshold by which sites with gaps should be trimmed. 205 | :param skeletons: Tuple of syllable-skeletons that should be preserved 206 | for further processing. Defaults to '("CV", "VC")'. 207 | """ 208 | reference_skeleton = ( 209 | Sites(self.to_alignment(), gap=self.gap) 210 | .trimmed( 211 | strategy=strategy, 212 | threshold=threshold, 213 | skeletons=skeletons, 214 | exclude=exclude, 215 | ) 216 | .soundclasses 217 | ) 218 | # create a freq dict of ref skel 219 | rs_freqs = collections.Counter(reference_skeleton) 220 | # get a dictionary of indices by position 221 | indices = { # soundclass mapped to list of indices in cv template. 
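            # e.g. {"C": [0, 3], "V": [1, 2]} for a C V V C sound-class skeleton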
222 | sc: [i[0] for i in items] 223 | for sc, items in itertools.groupby( 224 | sorted(enumerate(self.soundclasses), key=lambda ii: ii[1]), 225 | lambda ii: ii[1], 226 | ) 227 | } 228 | # random sample indices to be retained 229 | retain = [random.sample(indices[c], rs_freqs[c]) for c, _ in rs_freqs.items()] 230 | retain = set(itertools.chain(*retain)) 231 | return self._trimmed([i for i in range(len(self)) if i not in retain]) 232 | 233 | 234 | def prep_alignments(aligned_wl, skeletons=("CV", "VC"), ref="cogid"): 235 | """ " 236 | Preparing the alignments assures that the structure is correctly 237 | added to the wordlist. 238 | 239 | :param wordlist: A lingpy Alignments. 240 | :type wordlist: :class:lingpy.Alignments 241 | :param skeletons: Tuple of syllable-skeletons that should be preserved 242 | for further processing. Defaults to '("CV", "VC")'. 243 | :type skeletons: tuple 244 | :param ref: The column which stores the cognate sets, defaults to 'cogid' 245 | :type ref: str 246 | :return: Pre-processed alignments. 247 | :rtype: :class:lingpy.Alignments 248 | """ 249 | whitelist = [] 250 | for _, msa in aligned_wl.msa[ref].items(): 251 | skel = Sites(msa["alignment"]).soundclasses 252 | if any([subsequence_of(s, skel) for s in skeletons]): 253 | whitelist += msa["ID"] 254 | aligned_wl.add_entries( 255 | "structure", "tokens", lambda x: " ".join(Sites([x]).soundclasses) 256 | ) 257 | dct = {0: aligned_wl.columns} 258 | for idx in whitelist: 259 | dct[idx] = aligned_wl[idx] 260 | return aligned_wl.__class__(dct, transcription="form") 261 | -------------------------------------------------------------------------------- /src/lingrex/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for the lingrex package. 3 | """ 4 | import math 5 | import pathlib 6 | 7 | from lingpy import tokens2class, prosodic_string 8 | from lingpy.align.sca import get_consensus 9 | from lingpy import basictypes as bt 10 | from lingpy.sequence.ngrams import get_n_ngrams 11 | 12 | 13 | def subsequence_of(source, target): 14 | """ 15 | Check if all items of source appear in target in order, but not necessarily consecutively. 16 | """ 17 | i = 0 18 | for c in source: 19 | try: 20 | i += target[i:].index(c) + 1 21 | except ValueError: # c is not in the remainder of target. 22 | return False 23 | return True 24 | 25 | 26 | def lingrex_path(*comps): 27 | return str(pathlib.Path(__file__).parent.joinpath(*comps)) 28 | 29 | 30 | def bleu_score(word, reference, n=4, weights=None, trim=True): 31 | """ 32 | Compute the BLEU score for predicted word and reference. 
33 | 34 | :param word: the predicted word 35 | :param reference: the predicted reference 36 | :param n: the order of ngrams 37 | :param weights: list of weights, should be the same size as n 38 | :param trim: bool, decide to trim n-grams or not 39 | """ 40 | weights = [1 / n for x in range(n)] if weights is None else weights 41 | 42 | scores = [] 43 | for i in range(1, n + 1): 44 | new_wrd = list(get_n_ngrams(word, i)) 45 | new_ref = list(get_n_ngrams(reference, i)) 46 | if trim and i > 1: 47 | new_wrd = new_wrd[i - 1 : -(i - 1)] 48 | new_ref = new_ref[i - 1 : -(i - 1)] 49 | 50 | clipped, divide = [], [] 51 | for itm in set(new_wrd): 52 | clipped += [new_ref.count(itm)] 53 | divide += [new_wrd.count(itm)] 54 | scores += [sum(clipped) / sum(divide)] 55 | 56 | # calculate arithmetic mean 57 | out_score = 1 58 | for weight, score in zip(weights, scores): 59 | out_score = out_score * (score**weight) 60 | 61 | bp = ( 62 | 1 63 | if len(word) > len(reference) 64 | else math.e ** (1 - (len(reference) / len(word))) 65 | ) 66 | return bp * (out_score ** (1 / sum(weights))) 67 | 68 | 69 | def clean_sound(sound): 70 | """ 71 | Get rid of "a/b" notation for sound segments. 72 | """ 73 | return ".".join([s.split("/")[1] if "/" in s else s for s in sound.split(".")]) 74 | 75 | 76 | def alm2tok(seq, gap="-"): 77 | """ 78 | Turn an alignment into a sequence. 79 | """ 80 | return [clean_sound(x) for x in unjoin(seq) if x != gap] 81 | 82 | 83 | def unjoin(seq): 84 | """ 85 | Turn segments joined by a dot into unjoined segments. 86 | """ 87 | out = [] 88 | for itm in seq: 89 | out += itm.split(".") 90 | return out 91 | 92 | 93 | def ungap(alignment, languages, proto): 94 | """ 95 | Trim an MSA to remove all gaps in the target sequence. 96 | :examples: 97 | >>> ungap([['a', 'b'], ['x', '-'], ['y', '-']], ['proto', 'l1', 'l2'], 'proto') 98 | ... [['a.b'], ['x'], ['y']] 99 | >>> ungap([['a', 'b'], ['x', '-'], ['y', 'h']], ['proto', 'l1', 'l2'], 'proto') 100 | ... [['a', 'b'], ['x', '-'], ['y', 'h']] 101 | 102 | Note 103 | ---- 104 | This procedure for multiple alignments was first introduced in List et al. 105 | (2022). 106 | 107 | > List, J.-M., N. Hill, and R. Forkel (2022): A new framework for fast 108 | > automated phonological reconstruction using trimmed alignments and sound 109 | > correspondence patterns. In: Proceedings of the 3rd Workshop on 110 | > Computational Approaches to Historical Language Change. Association for 111 | > Computational Linguistics 89-96. URL: https://aclanthology.org/2022.lchange-1.9 112 | """ 113 | pidxs = [i for i, taxon in enumerate(languages) if taxon == proto] 114 | merges = [] 115 | for i in range(len(alignment[0])): # go through the rows of the alignment ... 116 | col = [row[i] for row in alignment] 117 | # ... looking for gap-only alignments (in non-proto languages): 118 | if {site for j, site in enumerate(col) if j not in pidxs} == {"-"}: 119 | merges += [i] 120 | if not merges: 121 | return alignment 122 | new_alms = [] 123 | for i, row in enumerate(alignment): 124 | new_alm, mergeit, started = [], False, True 125 | for j, cell in enumerate(row): 126 | if j in merges or mergeit: 127 | mergeit = False 128 | if not started: # j != 0: 129 | if cell != "-": 130 | new_alm[-1] += "." 
+ cell if new_alm[-1] else cell 131 | else: 132 | mergeit = True 133 | new_alm.append("" if cell == "-" else cell) 134 | else: 135 | started = False 136 | new_alm.append(cell) 137 | new_alms.append([cell or "-" for cell in new_alm]) 138 | return new_alms 139 | 140 | 141 | def add_structure( 142 | wordlist, model="cv", segments="tokens", structure="structure", ref="cogid", gap="-" 143 | ): 144 | """ 145 | Add structure to a wordlist to make sure correspondence patterns can be inferred. 146 | """ 147 | if model not in ["cv", "c", "CcV", "ps", "nogap"]: 148 | raise ValueError("[i] you need to select a valid model") 149 | D = {} 150 | if model == "cv": 151 | for idx, tks in wordlist.iter_rows(segments): 152 | D[idx] = " ".join(tokens2class(tks, "cv")).lower() 153 | 154 | if model == "c": 155 | for idx, tks in wordlist.iter_rows(segments): 156 | D[idx] = ( 157 | " ".join(tokens2class(tks, "cv")) 158 | .lower() 159 | .replace("v", "c") 160 | .replace("t", "c") 161 | ) 162 | if model == "nogap": 163 | assert hasattr(wordlist, "msa") 164 | for cogid, msa in wordlist.msa[ref].items(): 165 | cons = [ 166 | "c" if c != gap else gap 167 | for c in get_consensus(msa["alignment"], gaps=True) 168 | ] 169 | for idx, alm in zip(msa["ID"], msa["alignment"]): 170 | struc = [] 171 | for a, b in zip(cons, alm): 172 | if b != "-": 173 | struc += [a] 174 | D[idx] = " ".join(struc) 175 | for idx, tks in wordlist.iter_rows(segments): 176 | if idx not in D: 177 | D[idx] = " ".join(["c" if c != "+" else c for c in tks]) 178 | if model == "CcV": 179 | for idx, tks in wordlist.iter_rows(segments): 180 | D[idx] = " ".join( 181 | list(prosodic_string(tks, _output="CcV").replace("_", "+")) 182 | ) 183 | if model == "ps": 184 | for idx, tks in wordlist.iter_rows(segments): 185 | D[idx] = " ".join(list(prosodic_string(tks))) 186 | 187 | if hasattr(wordlist, "_mode") and wordlist._mode == "fuzzy": 188 | struc_ = bt.lists 189 | else: 190 | struc_ = bt.strings 191 | wordlist.add_entries(structure, D, lambda x: struc_(x)) 192 | 193 | 194 | def prep_wordlist(wordlist, min_refs=3, exclude="_+"): 195 | """ 196 | Preprocessing will make sure that the data are unified. 197 | 198 | - delete markers of morpheme boundaries (often inconsistently applied), as 199 | indicated by exclude 200 | - only consider cognate sets with size > min_refs (unique taxa), as identified by 201 | - delete duplicate words in the same cognate set 202 | 203 | :param wordlist: A lingpy Wordlist. 204 | :type wordlist: :class:lingpy.Wordlist 205 | :param min_ref: The minimun number of words in a cognate set. 206 | Defaults to '3'. 207 | :type min_ref: int 208 | :param exclude: Sequence of strings that should be excluded from further processing, 209 | e.g. morpheme boundaries. Defaults to '_+'. 210 | :param exclude: str 211 | :return: Pre-processed wordlist. 
212 | :rtype: :class:lingpy.Wordlist 213 | """ 214 | whitelist = [] 215 | for _, idxs in wordlist.get_etymdict(ref="cogid").items(): 216 | visited, all_indices = set(), [] 217 | for idx in map(lambda x: x[0], filter(lambda x: x, idxs)): 218 | if wordlist[idx, "doculect"] not in visited: 219 | visited.add(wordlist[idx, "doculect"]) 220 | all_indices += [idx] 221 | if len(visited) >= min_refs: 222 | whitelist += all_indices 223 | for idx, tokens in wordlist.iter_rows("tokens"): 224 | wordlist[idx, "tokens"] = [t for t in tokens if t not in exclude] 225 | 226 | dct = {0: wordlist.columns} 227 | for idx in whitelist: 228 | dct[idx] = wordlist[idx] 229 | return wordlist.__class__(dct) 230 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def data(): 8 | return pathlib.Path(__file__).parent / 'data' 9 | 10 | 11 | @pytest.fixture 12 | def wl_with_alignments(): 13 | return { 14 | 0: ["doculect", "concept", "form", "tokens", "alignment", "cogid"], 15 | 1: ["A", "one", "atawu", "a t a w u", "a t a w u", 1], 16 | 2: ["B", "one", "a_twu", "a _ t w u", "a t - w u", 1], 17 | 3: ["C", "one", "tawu", "t a w u", "- t a w u", 1], 18 | 4: ["D", "one", "tefu", "tʲ e f u", "- t e f u", 1], 19 | 5: ["A", "two", "satu", "s a t u", "s a t u", 2], 20 | 6: ["A", "two", "seram", "s e r a m", "s e r a m", 2] 21 | } 22 | -------------------------------------------------------------------------------- /tests/data/hillburmish.tsv: -------------------------------------------------------------------------------- 1 | ID DOCULECT CONCEPT VALUE FORM TOKENS NOTE COGIDS 2 | 1 OldBurmese I ṅa ṅa ṅ/ŋ a 665 3 | 4 Atsi I ŋo⁵¹ ŋo⁵¹ ŋ o ⁵¹ 665 4 | 6 Lashi I ŋo³¹ ŋo³¹ ŋ o ³¹ 665 5 | 9 ProtoBurmish I *ŋa ŋa¹ ŋ a ¹ 665 6 | 147 Atsi banana (plantain) ŋoʔ⁵⁵ mjuʔ²¹ ŋoʔ⁵⁵+mjuʔ²¹ ŋ o ʔ ⁵⁵ + m j u ʔ ²¹ 681 3302 7 | 149 Lashi banana (plantain) ŋɔʔ⁵⁵ mju̱k⁵⁵ ŋɔʔ⁵⁵+mju̱k⁵⁵ ŋ ɔ ʔ ⁵⁵ + m j u̱ k ⁵⁵ 681 3304 8 | 151 ProtoBurmish banana (plantain) *ŋak ŋak⁴ ŋ a k ⁴ 681 9 | 167 ProtoBurmish be (in the house) *ŋji ŋji¹ ŋ j i ¹ 488 10 | 283 ProtoBurmish blue *ŋjuŋ ŋjuŋ¹ ŋ j u ŋ ¹ 698 11 | 284 ProtoBurmish blue *ŋju ŋju¹ ŋ j u ¹ 699 12 | 665 Atsi cooked, be (rice) / done ŋjoʔ²¹ ŋjoʔ²¹ ŋ j o ʔ ²¹ 684 13 | 667 Lashi cooked, be (rice) / done ŋjɔːʔ³¹ ŋjɔːʔ³¹ ŋ j ɔː ʔ ³¹ 684 14 | 670 ProtoBurmish cooked, be (rice) / done *ŋjak ŋjak⁴ ŋ j a k ⁴ 684 15 | 733 OldBurmese cry ṅuiw ṅui̯ ṅ/ŋ ui̯ 693 16 | 736 Atsi cry ŋau⁵¹ ŋau⁵¹ ŋ au ⁵¹ 693 17 | 738 Lashi cry ŋaːu³¹ ŋaːu³¹ ŋ aːu ³¹ 693 18 | 741 ProtoBurmish cry *ŋu ŋu¹ ŋ u ¹ 693 19 | 799 ProtoBurmish day (time) *ŋjiX ŋji³ ŋ j i ³ 491 20 | 1230 Lashi fifteen tshĕ³³ ŋ³³ tshĕ³³+ŋ³³ tsʰ ĕ ³³ + ŋ ³³ 3295 666 21 | 1232 ProtoBurmish fifteen *ŋaX ŋa³ ŋ a ³ 667 22 | 1275 OldBurmese five ṅaḥ ṅaḥ ṅ/ŋ a ḥ/⁵ 666 23 | 1278 Atsi five ŋo²¹ ŋo²¹ ŋ o ²¹ 666 24 | 1283 ProtoBurmish five *ŋaH ŋa² ŋ a ² 666 25 | 2888 OldBurmese salty ṅan ṅan ṅ/ŋ a n 683 26 | 2895 ProtoBurmish salty *ŋan ŋan¹ ŋ a n ¹ 683 27 | 3109 OldBurmese silver ṅuy ṅuj ṅ/ŋ u j 696 28 | 3112 Atsi silver ŋun⁵¹ ŋun⁵¹ ŋ u n ⁵¹ 696 29 | 3114 Lashi silver ŋə³¹ ŋə³¹ ŋ ə ³¹ 696 30 | 3117 ProtoBurmish silver *ŋui ŋui¹ ŋ ui ¹ 696 31 | 3170 OldBurmese small ṅay ṅai ṅ/ŋ ai 668 32 | 3174 Lashi small ŋɛː³¹ ŋɛː³¹ ŋ ɛː ³¹ 668 33 | 3177 ProtoBurmish small *ŋai ŋai¹ ŋ ai ¹ 668 34 | 3619 Atsi tongs (fire) ŋjap²¹ ŋjap²¹ ŋ j a p ²¹ 686 35 | 3621 Lashi tongs (fire) ŋjap³¹ tsei⁵⁵ 
ŋjap³¹+tsei⁵⁵ ŋ j a p ³¹ + ts ei ⁵⁵ 686 3310 36 | 3623 ProtoBurmish tongs (fire) *ŋjat ŋjat⁴ ŋ j a t ⁴ 686 37 | 4030 OldBurmese young ṅay ṅai ṅ/ŋ ai 668 38 | -------------------------------------------------------------------------------- /tests/test_align.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from lingrex.align import ( 3 | gap_free_pairwise, 4 | align_to_template, 5 | shrink_alignments, 6 | template_alignment, 7 | shrink_template, 8 | ) 9 | from lingpy import Wordlist 10 | 11 | 12 | def test_gap_free_pairwise(): 13 | 14 | seqA, seqB = list("andra"), list("an-ra") 15 | 16 | almA, almB = gap_free_pairwise(seqA, seqB) 17 | assert almA[1] == "nr" 21 | 22 | seqA, seqB = list("este"), list("-ste") 23 | almA, almB = gap_free_pairwise(seqA, seqB) 24 | assert almA[0] == "e>s" 25 | 26 | seqA, seqB = list("euste"), list("--ste") 27 | almA, almB = gap_free_pairwise(seqA, seqB) 28 | assert almA[0] == "e>u>s" 29 | 30 | 31 | def test_align_to_template(): 32 | out = align_to_template("ka", "Cv", "Cvc") 33 | assert out[-1] == "-" 34 | 35 | with pytest.raises(ValueError): 36 | align_to_template("ka", "c", "Cvc") 37 | with pytest.raises(ValueError): 38 | align_to_template("ka", "cv", "Cv") 39 | 40 | 41 | def test_shrink_alignments(): 42 | out = shrink_alignments([["a", "b", "-"], ["a", "b", "-"]]) 43 | assert len(out[0]) == 2 44 | 45 | 46 | @pytest.fixture 47 | def wldata(): 48 | return { 49 | 0: ["doculect", "concept", "tokens", "structure", "cogid"], 50 | 1: ["a", "a", "b au".split(), "i n".split(), 1], 51 | 2: ["b", "a", "b o k".split(), "i n c".split(), 1], 52 | 3: ["c", "a", "b w a k".split(), "i m n c".split(), 1], 53 | } 54 | 55 | 56 | @pytest.fixture 57 | def wldata_listvalued_cogid(wldata): 58 | return {k: v if k == 0 else v[:-1] + [[v[-1]]] for k, v in wldata.items()} 59 | 60 | 61 | def test_template_alignment(wldata, wldata_listvalued_cogid): 62 | wl = Wordlist(wldata) 63 | template_alignment(wl, fuzzy=False, template="imnc") 64 | assert "alignment" in wl.columns 65 | wl = Wordlist(wldata_listvalued_cogid) 66 | template_alignment(wl, fuzzy=True, template="imnc") 67 | assert "alignment" in wl.columns 68 | 69 | 70 | def test_shrink_template(wldata_listvalued_cogid): 71 | wl = Wordlist(wldata_listvalued_cogid) 72 | shrink_template(wl) 73 | assert wl[2, "tokens2"][-1] == "ok" 74 | -------------------------------------------------------------------------------- /tests/test_borrowing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from lingrex.borrowing import internal_cognates, external_cognates 3 | from lingpy import Wordlist 4 | 5 | 6 | @pytest.fixture 7 | def wl(data): 8 | return Wordlist(str(data / 'wordlist.tsv')) 9 | 10 | 11 | @pytest.mark.parametrize( 12 | 'kw,success', 13 | [ 14 | ( 15 | dict(ref="autocogids", partial=True, method="lexstat"), 16 | lambda wl: "autocogids" in wl.columns), 17 | ( 18 | dict(ref="autocogid", partial=False, method="lexstat"), 19 | lambda wl: "autocogid" in wl.columns), 20 | ( 21 | dict(ref="autocogids", partial=True, method="sca"), 22 | lambda wl: "autocogids" in wl.columns), 23 | ] 24 | ) 25 | def test_internal_cognates(kw, success, wl): 26 | internal_cognates(wl, runs=10, **kw) 27 | assert success(wl) 28 | 29 | etd = wl.get_etymdict(ref=kw["ref"]) 30 | 31 | for cogid, vals in etd.items(): 32 | concepts = [] 33 | for idx_ in vals: 34 | if idx_: 35 | for idx in idx_: 36 | concepts += [wl[idx, "concept"]] 37 | assert len(set(concepts)) == 1 
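    # get_etymdict(ref=...) maps every cognate ID to one slot per doculect (0 where a
    # doculect has no reflex), so the nested loops above visit each word index of a
    # cognate set; the assertion checks that the detected cognate sets never group
    # words belonging to different concepts.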
38 | 39 | 40 | 41 | def test_external_cognates(wl): 42 | external_cognates(wl, cognates="cogid", ref="borrids") 43 | assert "borrids" in wl.columns 44 | -------------------------------------------------------------------------------- /tests/test_cognates.py: -------------------------------------------------------------------------------- 1 | from lingrex.cognates import common_morpheme_cognates, salient_cognates 2 | from lingpy import Wordlist 3 | 4 | 5 | def test_common_morpheme_cognates(): 6 | wl = Wordlist({ 7 | 0: ["doculect", "concept", "ipa", "tokens", "cogids"], 8 | 1: ["a", "a", "pla", "p l a + p u", [1, 2]], 9 | 2: ["b", "a", "pla", "p l a t + k i", [1, 3]], 10 | 3: ["c", "a", "pla", "k i + p l u p", [4, 1]], 11 | 4: ["d", "a", "pla", "p l a k", [1]], 12 | 5: ["a", "b", "pla", "t r a", [2]], 13 | 6: ["b", "b", "pla", "t a t", [2]], 14 | 7: ["c", "b", "pla", "d r ə p", [2]], 15 | }) 16 | common_morpheme_cognates(wl) 17 | assert wl[1, "autocogid"] == wl[2, "autocogid"] 18 | 19 | 20 | def test_salient_cognates(): 21 | wl = Wordlist({ 22 | 0: ["doculect", "concept", "morphemes", "tokens", "cogids",], 23 | 1: ["a", "a", "pla _pi".split(), "p l a + p u".split(), [1, 2]], 24 | 2: ["b", "a", "pla _po".split(), "p l a t + k i".split(), [1, 3]], 25 | 3: ["c", "a", "_po pla".split(), "k i + p l u p".split(), [4, 1]], 26 | 4: ["d", "a", "pla".split(), "p l a k".split(), [1]], 27 | 5: ["a", "b", "pla".split(), "t r a".split(), [2]], 28 | 6: ["b", "b", "pla".split(), "t a t".split(), [2]], 29 | 7: ["c", "b", "pla".split(), "d r ə p".split(), [2]], 30 | }) 31 | salient_cognates(wl) 32 | assert wl[1, "newcogid"] == wl[2, "newcogid"] 33 | -------------------------------------------------------------------------------- /tests/test_colex.py: -------------------------------------------------------------------------------- 1 | from lingrex.colex import ( 2 | expand_alignment, 3 | find_bad_internal_alignments, 4 | compatible, 5 | merge_alignments, 6 | find_colexified_alignments, 7 | ) 8 | from lingpy import Alignments 9 | 10 | 11 | def test_find_bad_internal_alignments(): 12 | wl = Alignments( 13 | { 14 | 0: ["doculect", "concept", "ipa", "tokens", "alignment", "cogids"], 15 | 1: ["a", "a", "bla", "b l a", "b l a -".split(), [1]], 16 | 2: ["b", "a", "bla", "b l a k", "b l a k".split(), [1]], 17 | 3: ["c", "a", "bla", "b a k", "b - a k".split(), [1]], 18 | 4: ["a", "b", "bla", "b l a k", "b l a k".split(), [1]], 19 | 5: ["b", "b", "bla", "b l a k", "b l a k".split(), [1]], 20 | 6: ["a", "c", "bla", "b l a", "b l a -".split(), [1]], 21 | }, 22 | ref="cogids", 23 | ) 24 | find_bad_internal_alignments(wl) 25 | assert wl[4, "cogids"][0] != 1 26 | 27 | 28 | def test_expand_alignment(): 29 | missing = "?" 30 | out = expand_alignment( 31 | {"taxa": ["a", "b", "c"], "alignment": [["t", "a"], ["t/p", "u"], ["t", "-"]]}, 32 | ["a", "d", "b", "c"], 33 | missing=missing, 34 | ) 35 | assert out[1][1] == missing 36 | assert out[2][0] == "p" 37 | 38 | 39 | def test_compatible(): 40 | missing = "?" 
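    # compatible() compares two blocks of aligned sites row by row; judging from the
    # assertions below, it returns the number of rows that can be reconciled (rows
    # consisting only of missing symbols are skipped) and a falsy value as soon as
    # two non-missing rows conflict.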
41 | matches = compatible( 42 | [["a", "b"], [missing, missing], ["a", "c"], ["a", "d"]], 43 | [ 44 | ["a", "-", "b"], 45 | ["a", "x", "b"], 46 | ["a", "-", "c"], 47 | [missing, missing, missing], 48 | ], 49 | missing=missing, 50 | ) 51 | assert matches == 2 52 | 53 | matches = compatible( 54 | [["a", "b"], [missing, missing], ["a", "c"], ["a", "d"]], 55 | [ 56 | ["a", "-", "c"], 57 | ["a", "x", "b"], 58 | ["a", "-", "c"], 59 | [missing, missing, missing], 60 | ], 61 | missing=missing, 62 | ) 63 | assert not matches 64 | 65 | 66 | def test_merge_alignments(): 67 | missing = "?" 68 | matches = merge_alignments( 69 | [ 70 | ["-", "a", "b"], 71 | [missing, missing, missing], 72 | ["-", "a", "c"], 73 | ["x", "a", "d"], 74 | ], 75 | [ 76 | ["a", "-", "b"], 77 | ["a", "x", "b"], 78 | ["a", "-", "c"], 79 | [missing, missing, missing], 80 | ], 81 | missing=missing, 82 | ) 83 | assert len(matches[0]) == 4 84 | 85 | missing = "?" 86 | matches = merge_alignments( 87 | [["a", "b"], ["a", "c"], ["a", "d"]], 88 | [ 89 | ["a", "-", "b"], 90 | ["a", "x", "b"], 91 | ["a", "-", "c"], 92 | ], 93 | missing=missing, 94 | ) 95 | assert len(matches[0]) == 3 96 | 97 | missing = "?" 98 | matches = merge_alignments( 99 | [["a", "b"], ["a", "c"], [missing, missing], ["a", "d"]], 100 | [ 101 | ["a", "-", "b", "-"], 102 | ["a", "x", "b", "-"], 103 | ["a", "-", "c", "e"], 104 | [missing, missing, missing, missing], 105 | ], 106 | missing=missing, 107 | ) 108 | assert len(matches[0]) == 4 109 | 110 | 111 | def test_find_colexified_alignments(): 112 | wl = Alignments( 113 | { 114 | 0: ["doculect", "concept", "ipa", "tokens", "alignment", "cogids"], 115 | 1: ["a", "a", "bla", "b l a", "b l a -".split(), [1]], 116 | 2: ["b", "a", "bla", "b l a k", "b l a k".split(), [1]], 117 | 3: ["c", "a", "bla", "b a k", "b - a k".split(), [1]], 118 | 4: ["a", "b", "bla", "b l a k", "b l a -".split(), [2]], 119 | 5: ["b", "b", "bla", "b l a k", "b l a k".split(), [2]], 120 | 6: ["a", "c", "bla", "b l a", "- b l a".split(), [3]], 121 | 7: ["d", "c", "bla", "a b l", "a b l -".split(), [3]], 122 | }, 123 | ref="cogids", 124 | ) 125 | 126 | find_colexified_alignments(wl) 127 | assert wl[1, "crossids"][0] == 1 128 | 129 | wl = Alignments( 130 | { 131 | 0: ["doculect", "concept", "ipa", "tokens", "alignment", "cogid"], 132 | 1: ["a", "a", "bla", "b l a", "b l a -".split(), 1], 133 | 2: ["b", "a", "bla", "b l a k", "b l a k".split(), 1], 134 | 3: ["c", "a", "bla", "b a k", "b - a k".split(), 1], 135 | 4: ["a", "b", "bla", "b l a k", "b l a -".split(), 2], 136 | 5: ["b", "b", "bla", "b l a k", "b l a k".split(), 2], 137 | 6: ["a", "c", "bla", "b l a", "- b l a".split(), 3], 138 | 7: ["d", "c", "bla", "a b l", "a b l -".split(), 3], 139 | }, 140 | ref="cogid", 141 | ) 142 | 143 | find_colexified_alignments(wl, cognates="cogid", ref="crossid") 144 | assert wl[1, "crossid"] == 1 145 | -------------------------------------------------------------------------------- /tests/test_copar.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from lingrex.copar import ( 3 | CoPaR, 4 | consensus_pattern, 5 | incompatible_columns, 6 | score_patterns, 7 | density, 8 | ) 9 | from lingpy import Wordlist, Alignments 10 | from lingrex.util import add_structure 11 | 12 | 13 | def test_consensus_pattern(): 14 | missing = "?" 
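    # consensus_pattern() collapses several correspondence-pattern rows into a single
    # tuple, with attested values filling slots that are missing elsewhere; rows that
    # disagree on non-missing values are expected to raise a ValueError.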
15 | out = consensus_pattern( 16 | [["a", "b", "c"], ["a", "b", missing], [missing, missing, "c"]], missing=missing 17 | ) 18 | assert out == ("a", "b", "c") 19 | with pytest.raises(ValueError): 20 | consensus_pattern([["a", "b"], ["a", "c"]]) 21 | 22 | 23 | def test_incompatible_columns(): 24 | missing = "?" 25 | out = incompatible_columns( 26 | [ 27 | ["a", "b", "c"], 28 | ["I", "b", "c"], 29 | ["a", "b", missing], 30 | [missing, missing, "c"], 31 | ], 32 | missing=missing, 33 | ) 34 | assert out[0] == "*" 35 | 36 | 37 | @pytest.mark.parametrize( 38 | 'patterns,mode,result', 39 | [ 40 | ([["a", "b", "c"], ["a", "b", "d"], ["a", "b", "?"], ["?", "?", "c"]], 'coverage', -1), 41 | (["a", "b", "c"], 'coverage', -1), 42 | ([["a", "b", "c"], ["a", "b", "c"], ["a", "b", "?"], ["?", "?", "c"]], 'ranked', 0.75), 43 | ([["a", "b", "c"], ["a", "b", "c"], ["a", "b", "?"], ["?", "?", "c"]], 'squared', 0.64), 44 | ([["a", "b", "c"], ["a", "b", "c"], ["a", "b", "?"], ["?", "?", "c"]], 'pairs', 0.44), 45 | ([["a", "b", "c"], ["a", "b", "c"], ["a", "b", "?"], ["?", "?", "c"]], 'coverage', 0.75), 46 | ] 47 | ) 48 | def test_score_patterns(patterns, mode, result): 49 | assert result == pytest.approx(score_patterns(patterns, missing='?', mode=mode), abs=1e-2) 50 | 51 | 52 | def test_score_patterns_error(): 53 | with pytest.raises(ValueError): 54 | score_patterns([["a", "b"], ["a", "b"]], mode="bla") 55 | 56 | 57 | def test_density(): 58 | D = { 59 | 0: ["doculect", "concept", "tokens", "ipa", "cogid"], 60 | 1: ["a", "b", "t o x t ə".split(), "tochter", 1], 61 | 2: ["b", "b", "t o x t ə".split(), "tochter", 1], 62 | 3: ["c", "b", "t o x t ə".split(), "tochter", 1], 63 | 4: ["a", "c", "t o x t ə".split(), "tochter", 2], 64 | 5: ["b", "c", "t o x t ə".split(), "tochter", 2], 65 | 6: ["c", "c", "t o x t ə".split(), "tochter", 2], 66 | } 67 | assert 0.67 == pytest.approx(density(Wordlist(D), ref="cogid"), abs=1e-2) 68 | 69 | 70 | def test_CoPaR_fuzzy(): 71 | D = { 72 | 0: ["doculect", "concept", "ipa", "tokens", "cogids", "alignment"], 73 | 1: ["a", "a", "pla", "p l a", [1], "p l a -".split()], 74 | 2: ["b", "a", "pla", "p l a t", [1], "p l a t".split()], 75 | 3: ["c", "a", "pla", "p l u p", [1], "p l u p".split()], 76 | 4: ["d", "a", "pla", "p l a k", [1], "p l a k".split()], 77 | 5: ["a", "b", "pla", "t r a", [2], "t r a -".split()], 78 | 6: ["b", "b", "pla", "t a t", [2], "t - a t".split()], 79 | 7: ["c", "b", "pla", "d r ə p", [2], "d r ə p".split()], 80 | # 8: ["a", "b", "pla", "p l a k", [1], "d x a k".split()], 81 | 9: ["a", "c", "pla", "k l a", [3], "k r a -".split()], 82 | # 10: ["a", "c", "pla", "p l a t", [1], "k a t".split()], 83 | 11: ["c", "c", "pla", "k l ə p", [3], "k l ə p".split()], 84 | 12: ["d", "c", "pla", "g l a k", [3], "g l a k".split()], 85 | 13: ["d", "f", "buk", "b u k", [4], "b u k".split()], 86 | } 87 | alms = Alignments(D, ref="cogids", transcription="ipa") 88 | with pytest.raises(ValueError): 89 | CoPaR(alms, ref="cogids", structure="structure", minrefs=2) 90 | add_structure(alms, model="cv", structure="structure") 91 | cop = CoPaR(alms, ref="cogids", structure="structure", minrefs=1) 92 | cop.get_sites() 93 | assert len(cop.sites) == 12 94 | cop.cluster_sites() 95 | assert len(cop.clusters) == 9 96 | cop.sites_to_pattern() 97 | cop.add_patterns() 98 | cop.irregular_patterns() 99 | cop.fuzziness() 100 | # get the cluster graph 101 | G = cop.get_cluster_graph() 102 | assert len(G.nodes) == len(cop.sites) 103 | 104 | # compute the purity of the cluster graph 105 | assert 
round(cop.purity(), 2) == 0.42 106 | cop.load_patterns() 107 | 108 | 109 | def test_CoPaR_plain(tmp_path): 110 | D = { 111 | 0: ["doculect", "concept", "ipa", "tokens", "cogid", "alignment"], 112 | 1: ["a", "a", "pla", "p l a", 1, "p l a -".split()], 113 | 2: ["b", "a", "pla", "p l a t", 1, "p l a t".split()], 114 | 3: ["c", "a", "pla", "p l u p", 1, "p l u p".split()], 115 | 4: ["d", "a", "pla", "p l a k", 1, "p l a k".split()], 116 | 5: ["a", "b", "pla", "t r a", 2, "t r a -".split()], 117 | 6: ["b", "b", "pla", "t a t", 2, "t - a t".split()], 118 | 7: ["c", "b", "pla", "d r ə p", 2, "d r ə p".split()], 119 | # 8: ["a", "b", "pla", "p l a k", [1], "d x a k".split()], 120 | 9: ["a", "c", "pla", "k l a", 3, "k r a -".split()], 121 | # 10: ["a", "c", "pla", "p l a t", [1], "k a t".split()], 122 | 11: ["c", "c", "pla", "k l ə p", 3, "k l ə p".split()], 123 | 12: ["d", "c", "pla", "g l a k", 3, "g l a k".split()], 124 | 13: ["d", "f", "buk", "b u k", 4, "b u k".split()], 125 | } 126 | alms = Alignments(D, ref="cogid", transcription="ipa") 127 | add_structure(alms, model="cv", structure="structure") 128 | cop = CoPaR(alms, ref="cogid", structure="structure", minrefs=1) 129 | 130 | with pytest.raises(ValueError): 131 | cop.write_patterns("f") 132 | with pytest.raises(ValueError): 133 | cop.predict_words() 134 | 135 | cop.get_sites() 136 | assert len(cop.sites) == 12 137 | cop.cluster_sites() 138 | assert len(cop.clusters) == 9 139 | cop.sites_to_pattern() 140 | cop.irregular_patterns() 141 | cop.add_patterns(proto="a", irregular_patterns=True) 142 | cop.fuzziness() 143 | # get the cluster graph 144 | G = cop.get_cluster_graph() 145 | assert len(G.nodes) == len(cop.sites) 146 | 147 | # compute the purity of the cluster graph 148 | assert round(cop.purity(), 2) == 0.42 149 | 150 | assert cop.upper_bound() > 1 151 | 152 | cop.write_patterns(tmp_path / 'test') 153 | cop.write_patterns(tmp_path / 'test', proto="a", irregular_patterns=True) 154 | cop.predict_words() 155 | cop.load_patterns() 156 | 157 | 158 | def test_polynesian(data): 159 | cop = CoPaR(str(data / "east-polynesian.tsv"), ref="cogid", segments="segments") 160 | cop.align() 161 | cop.get_sites() 162 | cop.cluster_sites() 163 | cop.add_patterns() 164 | 165 | 166 | def test_warnings(): 167 | D = { 168 | 0: ["doculect", "concept", "ipa", "tokens", "cogids", "alignment", "structure"], 169 | 1: ["a", "a", "pla", "p l a", [1], "p l a -".split(), "i m n c".split()], 170 | 2: ["b", "a", "pla", "p l a t", [1], "p l a t".split(), "i n c".split()], 171 | 3: ["c", "a", "pla", "p l u p", [1], "p l u p".split(), "i m n c".split()], 172 | 4: ["d", "a", "pla", "p l a k", [1], "p l a k".split(), "i m n c".split()], 173 | } 174 | alms = Alignments(D, ref="cogids") 175 | cop = CoPaR(alms, structure="structure") 176 | with pytest.raises(ValueError): 177 | cop.get_sites() 178 | D = { 179 | 0: ["doculect", "concept", "ipa", "tokens", "cogids", "alignment", "structure"], 180 | 1: ["a", "a", "pla", "p l a", [1], "p !l a -".split(), "i m n".split()], 181 | 2: ["b", "a", "pla", "p l a t", [1], "p f/l a t".split(), "i m n c".split()], 182 | 3: ["c", "a", "pla", "p l u p", [1], "p l u p".split(), "i m n c".split()], 183 | 4: ["d", "a", "pla", "p l a k", [1], "p l a k".split(), "i m n c".split()], 184 | } 185 | cop = CoPaR(D) 186 | cop.get_sites() 187 | -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test evaluate 
module of lingrex. 3 | """ 4 | from lingrex.evaluate import ( 5 | compare_cognate_sets, 6 | cross_semantic_cognate_statistics 7 | ) 8 | from lingpy import Wordlist 9 | 10 | 11 | def test_compare_cognate_sets(): 12 | 13 | wordlist = Wordlist({ 14 | 0: ["doculect", "concept", "form", "looseid", "strictid"], 15 | 1: ["a", "a", "b", "1", "2"], 16 | 2: ["b", "a", "c", "1", "3"], 17 | 3: ["c", "a", "c", "1", "2"], 18 | 4: ["d", "a", "d", "1", "4"] 19 | }) 20 | ranks = compare_cognate_sets( 21 | wordlist, "strictid", "looseid") 22 | assert len(ranks) == 1 23 | assert ranks[0][0] == "a" 24 | assert ranks[0][1] == 1 25 | assert ranks[0][2] == 0.375 26 | 27 | 28 | def test_cross_semantic_cognate_statistics(): 29 | 30 | wordlist = Wordlist({ 31 | 0: ["doculect", "concept", "form", "cogids", "morphemes"], 32 | 1: ["a", "A", "a + b", "1 2", "a _suf"], 33 | 2: ["b", "A", "a + b", "1 2", "a _suf"], 34 | 3: ["c", "A", "c + d + a", "3 4 1", "_suf d a"], 35 | 4: ["d", "A", "d + e", "4 5", "d e"], 36 | 5: ["a", "B", "a + f", "1 6", "a f"], 37 | 6: ["b", "B", "a + f", "1 6", "a f"], 38 | 7: ["c", "C", "g + h + a", "7 8 1", "g h a"], 39 | 8: ["d", "C", "h + i", "8 9", "h i"], 40 | }) 41 | ranks = cross_semantic_cognate_statistics( 42 | wordlist, 43 | concept="concept", 44 | morpheme_glosses="morphemes", 45 | ignore_affixes=True 46 | ) 47 | assert len(ranks) == 3 48 | assert ranks[0][0] == "C" 49 | assert ranks[2][1] == 0.625 50 | wordlist = Wordlist({ 51 | 0: ["doculect", "concept", "form", "cogids", "morphemes"], 52 | 1: ["a", "A", "a + b", "1 2", "a _suf"], 53 | 2: ["b", "A", "a + b", "1 2", "a _suf"], 54 | 3: ["c", "A", "c + d + a", "3 4 1", "_suf d a"], 55 | 4: ["d", "A", "d + e", "4 5", "d e"], 56 | 5: ["a", "B", "a + f", "1 6", "a f"], 57 | 6: ["b", "B", "a + f", "1 6", "a f"], 58 | 7: ["c", "C", "g + h + a", "7 8 1", "g h a"], 59 | 8: ["d", "C", "h + i", "8 9", "h i"], 60 | }) 61 | ranks2 = cross_semantic_cognate_statistics( 62 | wordlist, 63 | concept="concept", 64 | morpheme_glosses="morphemes", 65 | ignore_affixes=False 66 | ) 67 | assert ranks2[2][1] != ranks[2][1] 68 | assert ranks2[2][1] == 0.5 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /tests/test_fuzzy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test fuzzy reconstruction. 3 | """ 4 | import pytest 5 | from lingrex.fuzzy import FuzzyReconstructor, ntile 6 | from lingrex.reconstruct import CorPaRClassifier 7 | import random 8 | import lingpy 9 | 10 | 11 | 12 | def test_ntile(): 13 | assert set( 14 | ntile( 15 | ["kap", "kap", "kup", "kup" 16 | ], 2).split(" ")[1].split("|")) == set(["a", "u"]) 17 | # counting is not the same for missing data! 
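    # Judging from the expected output, the missing-data marker "Ø" is set aside when
    # the percentiles are computed, so a column whose only attested value is "a"
    # yields "a|a" rather than a bin containing "Ø".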
18 | assert ntile(["kap", "kØp", "kØp"], n=2) == 'k|k a|a p|p' 19 | 20 | def test_FuzzyReconstructor(data): 21 | random.seed(1234) 22 | 23 | pytest.raises(ValueError, FuzzyReconstructor, 1, "ProtoBurmish") 24 | pt = FuzzyReconstructor(str(data / "hillburmish.tsv"), "ProtoBurmish", ref="cogids", 25 | fuzzy=False) 26 | alms = lingpy.align.sca.Alignments( 27 | str(data / "hillburmish.tsv"), 28 | transcription="form", ref="cogids") 29 | pt = FuzzyReconstructor(alms, "ProtoBurmish", ref="cogids", 30 | fuzzy=False) 31 | pt.random_splits() 32 | assert hasattr(pt, "wordlists") 33 | 34 | clf = lambda: CorPaRClassifier() 35 | pt.fit_samples(clf) 36 | predis = pt.predict( 37 | pt.wordlist.msa["cogids"][665]["seqs"][:3], 38 | ["Atsi", "Lashi", "OldBurmese"], 39 | desegment=True 40 | ) 41 | assert predis[0] == "ŋ:100" 42 | predis = pt.predict( 43 | pt.wordlist.msa["cogids"][666]["seqs"][:3], 44 | ["Atsi", "Lashi", "OldBurmese"], 45 | desegment=True 46 | ) 47 | assert predis[-1] == "?:90¦⁴:10" 48 | 49 | predis = pt.predict( 50 | pt.wordlist.msa["cogids"][665]["seqs"][:3], 51 | ["Atsi", "Lashi", "OldBurmese"], 52 | desegment=True, 53 | output="percentiles" 54 | ) 55 | assert predis[0] == "ŋ:100" 56 | 57 | words, predis = pt.predict( 58 | pt.wordlist.msa["cogids"][665]["seqs"][:3], 59 | ["Atsi", "Lashi", "OldBurmese"], 60 | desegment=True, 61 | output="wp" 62 | ) 63 | assert predis[0] == "ŋ:100" 64 | 65 | words = pt.predict( 66 | pt.wordlist.msa["cogids"][665]["seqs"][:3], 67 | ["Atsi", "Lashi", "OldBurmese"], 68 | desegment=True, 69 | output="words" 70 | ) 71 | assert words[0][0] == "ŋ" 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /tests/test_reconstruct.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the reconstruction module of lingrex. 
3 | """ 4 | import pytest 5 | from lingrex.reconstruct import ( 6 | CorPaRClassifier, 7 | OneHot, 8 | ReconstructionBase, 9 | PatternReconstructor, 10 | transform_alignment, 11 | eval_by_dist, 12 | eval_by_bcubes 13 | ) 14 | from functools import partial 15 | 16 | 17 | 18 | def test_transform_alignment(): 19 | 20 | out = transform_alignment( 21 | [["b", "a", "k"], ["b", "a"]], 22 | ["a", "b"], 23 | ["a", "b", "u"], 24 | training=False 25 | ) 26 | assert len(out) == 3 27 | 28 | out = transform_alignment( 29 | [["b", "k"], ["b", "a", "k"]], 30 | ["a", "b"], 31 | ["a", "b", "u"], 32 | training=True, 33 | 34 | ) 35 | assert len(out) == 2 36 | 37 | out = transform_alignment( 38 | [["b", "k"], ["b", "a", "k"]], 39 | ["a", "b"], 40 | ["a", "b", "u"], 41 | training=True, 42 | firstlast=True 43 | 44 | ) 45 | assert out[0][-1] == "k" 46 | 47 | out = transform_alignment( 48 | [["b", "k"], ["b", "a", "k"]], 49 | ["a", "b"], 50 | ["a", "b", "u"], 51 | training=True, 52 | startend=True 53 | ) 54 | assert out[0][-1] == 0 55 | 56 | 57 | def test_PatternReconstructor(data): 58 | 59 | pt = PatternReconstructor(str(data / "hillburmish.tsv"), "ProtoBurmish", ref="cogids", 60 | ) 61 | t1 = partial(transform_alignment, align=True, position=False, 62 | prosody=False, startend=False, firstlast=False) 63 | t2 = partial(transform_alignment, align=True, position=True, 64 | prosody=True, startend=True, firstlast=True) 65 | pt.fit(func=t1) 66 | assert pt.predict( 67 | pt.msa["cogids"][665]["seqs"][:3], 68 | ["Atsi", "Lashi", "OldBurmese"], 69 | desegment=True 70 | ) == ['ŋ', 'a', '¹'] 71 | pt.fit(func=t2) 72 | assert pt.predict( 73 | pt.msa["cogids"][665]["seqs"][:3], 74 | ["Atsi", "Lashi", "OldBurmese"], 75 | desegment=True 76 | ) == ['ŋ', 'a', '¹'] 77 | 78 | pt.fit(func=t1, onehot=True) 79 | assert pt.predict( 80 | pt.msa["cogids"][665]["seqs"][:3], 81 | ["Atsi", "Lashi", "OldBurmese"], 82 | desegment=True 83 | ) == ['ŋ', 'a', '¹'] 84 | 85 | def test_eval_by_dist(): 86 | assert eval_by_dist([[["t", "a"], ["t", "o"]]]) == 1 87 | assert eval_by_dist([[["t", "a"], []]]) == 2 88 | 89 | assert eval_by_dist([[["t", "a"], ["t", "o"]]], normalized=True) == 0.5 90 | 91 | def test_eval_by_bcubes(): 92 | assert eval_by_bcubes([[["t", "a"], ["t", "a"]]]) == 1 93 | assert eval_by_bcubes([ 94 | [["t", "a"], ["t", "o"]] 95 | ]) == 1.0 96 | assert eval_by_bcubes([ 97 | [["t", "a"], []] 98 | ]) == 1 99 | 100 | 101 | -------------------------------------------------------------------------------- /tests/test_regularity.py: -------------------------------------------------------------------------------- 1 | from pytest import raises 2 | from lingpy import Wordlist, Alignments 3 | from lingrex.copar import CoPaR 4 | from lingrex.util import add_structure 5 | from lingrex.regularity import regularity 6 | 7 | 8 | dummy_wl = { 9 | 0: ["doculect", "concept", "form", "ipa", "alignment", "cogid"], 10 | 1: ["A", "one", "atawu", "atawu", "a t a w u", 1], 11 | 2: ["B", "one", "atwu", "atwu", "a t - w u", 1], 12 | 3: ["C", "one", "tawu", "tawu", "- t a w u", 1], 13 | 4: ["D", "one", "tefu", "tefu", "- t e f u", 1], 14 | 5: ["A", "two", "satu", "satu", "s a t u", 2], 15 | 6: ["B", "two", "setu", "setu", "s e t u", 2], 16 | 7: ["C", "two", "situ", "situ", "s i t u", 2] 17 | } 18 | 19 | 20 | def test_regularity(): 21 | test_wl = Wordlist(dummy_wl) 22 | with raises(ValueError): 23 | regularity(test_wl) 24 | 25 | test_alg = Alignments(test_wl) 26 | add_structure(test_alg, model="cv", structure="structure") 27 | print(test_alg.structure) 28 | test_alg 
= CoPaR(test_alg, ref="cogid") 29 | test_alg.get_sites() 30 | test_alg.cluster_sites() 31 | test_alg.sites_to_pattern() 32 | output = regularity(test_alg, threshold=2, word_threshold=0.5, 33 | sound_classes="cv") 34 | 35 | assert output == (2, 5, 7, 0.29, 4, 5, 9, 0.44, 3, 4, 7, 0.43) 36 | -------------------------------------------------------------------------------- /tests/test_trimming.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lingpy import Alignments 4 | 5 | from lingrex.trimming import * 6 | 7 | 8 | def test_Site(): 9 | site = Site([GAP, 'a', GAP, 't']) 10 | assert site.gap_ratio() == pytest.approx(0.5) 11 | assert site.gap_ratio(gap='#') == pytest.approx(0.0) 12 | assert site.soundclass() == 'V' 13 | assert site.soundclass(gap='a') == '0' 14 | 15 | 16 | @pytest.mark.parametrize( 17 | 'alms,gap,ratios', 18 | [ 19 | (["aaa", "aa-"], '-', [0.0, 0.0, 0.5]), 20 | (["aa", "a#"], '#', [0.0, 0.5]), 21 | ] 22 | ) 23 | def test_gap_ratio(alms, gap, ratios): 24 | assert Sites([list(w) for w in alms], gap=gap).gap_ratios == ratios 25 | 26 | 27 | def test_trimmed(): 28 | alm = [list("toxta-"), list("to-tir"), list("to-t-r"), list("do--ar")] 29 | assert " ".join(Sites(alm)._trimmed([2, 5]).to_alignment()[0]) == "t o t a" 30 | 31 | 32 | def test_soundclasses(): 33 | assert Sites([list("-bc"), list("ab-")], gap="-").soundclasses == ["V", "C", "C"] 34 | 35 | 36 | @pytest.mark.parametrize( 37 | 'alms,kw,result', 38 | [ 39 | (["abc", "a-c", "--c"], {}, list('ac')), 40 | (["abc", "a-c", "--c"], dict(skeletons=['VCC']), list('abc')), 41 | (["a+bco", "-+cco", "-+cco"], {}, list('bco')), 42 | (["a+b", "-+c", "-+c"], dict(exclude=""), list('a+b')), 43 | ([ 44 | #"- - n u - - 'b/b a".split(), 45 | '- - - - d ù/u - -'.split(), 46 | '- - - - d ú/u - -'.split(), 47 | '- - - - d ù/u - -'.split(), 48 | "ɾ u 'w/w a s i ɾ a".split(), 49 | '- - - - s u - e'.split(), 50 | "- - n u - - 'b/b a".split(), 51 | '- - - - d u l -'.split(), 52 | '- - n u k - w ɔ'.split(), 53 | ], {}, ['d', 'ù/u']), 54 | ([ 55 | "- - n u - - 'b/b a".split(), 56 | '- - - - d ù/u - -'.split(), 57 | '- - - - d ú/u - -'.split(), 58 | '- - - - d ù/u - -'.split(), 59 | "ɾ u 'w/w a s i ɾ a".split(), 60 | '- - - - s u - e'.split(), 61 | #"- - n u - - 'b/b a".split(), 62 | '- - - - d u l -'.split(), 63 | '- - n u k - w ɔ'.split(), 64 | ], {}, ['-', '-']), 65 | # Non-overlapping alignments: 66 | (['- - a b'.split(), 'a b - -'.split(), 'a b - -'.split()], {}, ['-', '-']), 67 | # 68 | (['- a b'.split(), 'b a -'.split(), 'b a -'.split()], {}, ['-', 'a']), 69 | (['- a b'.split(), 'b - -'.split(), 'b - -'.split()], {}, ['-', 'a']), 70 | ([ 71 | '- a b c'.split(), 72 | 'b - - -'.split(), 73 | 'b - - -'.split(), 74 | 'b - - d'.split() 75 | ], {}, ['-', 'a', 'c']), 76 | ([list('bbabb'), list('bb-bb'), list('-b-b-'), list('-b-b-')], {}, list('bbabb')), 77 | ([list('bbabb'), list('bb-bb'), list('-b-b-'), list('-b-b-')], {'strict_ratio': False}, list('bab')), 78 | ] 79 | ) 80 | def test_trim_by_gap(alms, kw, result): 81 | assert Sites([list(w) for w in alms]).trimmed(**kw).to_alignment()[0] == result 82 | 83 | 84 | @pytest.mark.parametrize( 85 | 'alms,kw,result', 86 | [ 87 | (["--mat", "-xmut", "--mit", "m-xit"], {}, list('mat')), 88 | (["--mat--", "-xmut--", "--mitx-", "m-xit-x"], {}, list('mat')), 89 | ([ 90 | "- - n u - - 'b/b a".split(), 91 | '- - - - d ù/u - -'.split(), 92 | '- - - - d ú/u - -'.split(), 93 | '- - - - d ù/u - -'.split(), 94 | "ɾ u 'w/w a s i ɾ a".split(), 95 | '- - - - 
s u - e'.split(), 96 | "- - n u - - 'b/b a".split(), 97 | '- - - - d u l -'.split(), 98 | '- - n u k - w ɔ'.split(), 99 | ], {}, ['-', '-', "'b/b", 'a']), 100 | ([list('bbabb'), list('bb-bb'), list('-b-b-'), list('-b-b-')], {}, list('bab')), 101 | ] 102 | ) 103 | def test_trim_by_core(alms, kw, result): 104 | sites = Sites([list(w) for w in alms]) 105 | assert sites.trimmed(strategy='core', **kw).to_alignment()[0] == result 106 | assert str(sites) 107 | 108 | 109 | def test_trim_random(mocker): 110 | mocker.patch('lingrex.trimming.random', mocker.Mock(sample=lambda pop, k: list(pop)[:k])) 111 | alms = [list(w) for w in ["--mat", "-xmut", "m-xut", "--xit"]] 112 | assert len(Sites(alms).trimmed()) == len(Sites(alms).trimmed_random()) 113 | assert set(Sites(alms).trimmed().soundclasses) == \ 114 | set(Sites(alms).trimmed_random().soundclasses) 115 | assert Sites(alms).trimmed_random(strategy='core') 116 | 117 | 118 | def test_prep_alignments(wl_with_alignments): 119 | test_wl = prep_alignments(Alignments(wl_with_alignments, transcription="form")) 120 | assert test_wl[4, "structure"] == "C V C V" 121 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from lingpy import Wordlist, Alignments 3 | from lingrex.util import lingrex_path, add_structure 4 | from lingrex.util import ungap, clean_sound, unjoin, alm2tok, bleu_score 5 | from lingrex.util import prep_wordlist, subsequence_of 6 | 7 | 8 | @pytest.mark.parametrize( 9 | 'source,target,result', 10 | [ 11 | ('cvc', 'cvcvc', True), 12 | ('cvc', 'cxcvc', True), 13 | ('vcc', 'vc', False), 14 | ('vcc', 'vcc', True), 15 | ('bla', 'bla', True), 16 | ('bla', 'bxlyaz', True), 17 | ('bla', 'abxlyaz', True), 18 | ('bla', 'abxalyz', False), 19 | ('abc', 'ab', False), 20 | ] 21 | ) 22 | def test_subsequence_of(source, target, result): 23 | assert subsequence_of(source, target) == result 24 | 25 | 26 | def test_bleu_score(): 27 | candidate = "this is a test".split() 28 | reference = "this is a small test".split() 29 | 30 | assert round( 31 | bleu_score( 32 | candidate, 33 | reference, 34 | weights=[0.5, 0.5], 35 | n=2, 36 | trim=True 37 | ), 38 | 2) == 0.64 39 | 40 | assert round( 41 | bleu_score( 42 | candidate, 43 | reference, 44 | weights=[0.5, 0.5], 45 | n=2, 46 | trim=False), 47 | 2) == 0.70 48 | 49 | assert round( 50 | bleu_score( 51 | candidate, 52 | reference, 53 | n=2, 54 | trim=False), 55 | 2) == 0.70 56 | 57 | 58 | def test_ungap(): 59 | matrix = ungap([['a', 'b'], ['x', '-'], ['y', '-']], ['proto', 'l1', 'l2'], 'proto') 60 | assert matrix[0][0] == 'a.b' 61 | assert matrix[1][0] == 'x' 62 | assert matrix[2][0] == "y" 63 | matrix2 = ungap([['a', 'b'], ['x', '-'], ['y', 'h']], ['proto', 'l1', 'l2'], 'proto') 64 | assert matrix2[0][1] == ["a", "b"][1] 65 | assert matrix2[1][1] == ["x", "-"][1] 66 | assert matrix2[2][1] == ["y", "h"][1] 67 | 68 | out = ungap([["p", "-", "a"], ["p", "j", "a"]], ["German", "E"], "E") 69 | assert out[1][0] == "p.j" 70 | 71 | alm = [['a', 'b'], ['-', '-'], ['-', '-']] 72 | assert ungap(alm, ['p', 'l1', 'l2'], 'p') == alm 73 | 74 | 75 | def test_clean_sound(): 76 | assert clean_sound("a/b") == "b" 77 | assert clean_sound("a") == "a" 78 | assert clean_sound("a/b.c/d") == "b.d" 79 | 80 | 81 | def test_unjoin(): 82 | assert unjoin("k.p a p u k.a/b".split())[0] == "k" 83 | 84 | 85 | def test_lingrex_path(): 86 | lingrex_path("test") 87 | 88 | 89 | def test_add_structure(): 
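    # add_structure() (defined in src/lingrex/util.py) accepts the models "cv", "c",
    # "CcV", "nogap" and "ps" for both plain cogids and partial (fuzzy) cogids and
    # raises a ValueError for any other model string; a typical call is
    # add_structure(wl, "cv"), which adds a "structure" column to the wordlist.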
90 | 91 | with pytest.raises(ValueError): 92 | add_structure( 93 | Wordlist( 94 | { 95 | 0: ["doculect", "concept", "tokens", "cogid"], 96 | 1: ["a", "b", "b l a".split(), 1], 97 | 2: ["b", "b", "b l a x".split(), 1], 98 | 3: ["c", "b", "b l i k u s".split(), 1], 99 | } 100 | ), 101 | model="bla", 102 | ) 103 | 104 | for m in ["cv", "c", "CcV", "nogap", "ps"]: 105 | D = { 106 | 0: ["doculect", "concept", "tokens", "cogid"], 107 | 1: ["a", "b", "b l a".split(), 1], 108 | 2: ["b", "b", "b l a x".split(), 1], 109 | 3: ["c", "b", "b l i k u s".split(), 1], 110 | 4: ["d", "b", "b l u k", 2], 111 | } 112 | wl = Alignments(D, transcription="tokens") 113 | add_structure(wl, m) 114 | 115 | for m in ["cv", "c", "CcV", "nogap", "ps"]: 116 | D = { 117 | 0: ["doculect", "concept", "tokens", "cogids"], 118 | 1: ["a", "b", "b l a".split(), [1]], 119 | 2: ["b", "b", "b l a x".split(), [1]], 120 | 3: ["c", "b", "b l i k u s".split(), [1]], 121 | } 122 | wl = Alignments(D, ref="cogids", transcription="tokens") 123 | add_structure(wl, m, ref="cogids") 124 | 125 | 126 | def test_prep_wordlist(wl_with_alignments): 127 | test_wl = prep_wordlist(Wordlist(wl_with_alignments)) 128 | 129 | assert len(test_wl) == 4 130 | assert "+" not in test_wl[1, "tokens"] 131 | assert "_" not in test_wl[2, "tokens"] 132 | -------------------------------------------------------------------------------- /tests/test_workflows.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import pathlib 3 | import subprocess 4 | 5 | import pytest 6 | 7 | 8 | @pytest.fixture 9 | def clean_dir(tmp_path): # pragma: no cover 10 | def _clean_dir(d): 11 | shutil.copytree(pathlib.Path(__file__).parent / 'workflows' / d, tmp_path / d) 12 | return tmp_path / d 13 | return _clean_dir 14 | 15 | 16 | def _run(wd, *cmds): # pragma: no cover 17 | for cmd in cmds: 18 | try: 19 | subprocess.check_call(cmd, cwd=wd, shell=True) 20 | except subprocess.CalledProcessError as e: # pragma: no cover 21 | print(e) 22 | print(e.output) 23 | raise 24 | 25 | 26 | @pytest.mark.workflow 27 | def test_bodt(clean_dir): # pragma: no cover 28 | _run( 29 | clean_dir('bodt-2019'), 30 | 'python predict.py', 31 | 'python test-prediction.py bodt-khobwa-cleaned.tsv -r 0.5', 32 | ) 33 | 34 | 35 | @pytest.mark.workflow 36 | def test_list(clean_dir): # pragma: no cover 37 | _run( 38 | clean_dir('list-2019'), 39 | 'python general.py', 40 | 'python predict.py data/burmish-240-8.tsv -r 0.75 --runs 2', 41 | 'python predict.py data/chinese-623-14.tsv -r 0.75 --runs 2', 42 | 'python predict.py data/polynesian-210-10.tsv -r 0.75 --runs 2', 43 | 'python predict.py data/japanese-200-10.tsv -c crossid -r 0.75 --runs 2', 44 | ) 45 | 46 | 47 | @pytest.mark.workflow 48 | def test_wu(clean_dir): # pragma: no cover 49 | _run( 50 | clean_dir('wu-2020'), 51 | 'python 4_crosssemantic.py', 52 | 'python 5_correspondence.py', 53 | ) 54 | -------------------------------------------------------------------------------- /tests/workflows/bodt-2019/predict.py: -------------------------------------------------------------------------------- 1 | from lingpy import * 2 | from sys import argv 3 | from lingrex.copar import CoPaR 4 | from sys import argv 5 | 6 | cp = CoPaR('bodt-khobwa-cleaned.tsv', ref='crossids', fuzzy=True, 7 | minrefs=2, structure='structure', transcription="tokens") 8 | 9 | # make function to extract correspondence patterns 10 | cp.get_sites() 11 | cp.cluster_sites() 12 | cp.sites_to_pattern() 13 | 14 | preds, purity, pudity = 
cp.predict_words() 15 | goods = 0 16 | with open('predictions-automatic.tsv', 'w') as f: 17 | f.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\n'.format( 18 | 'NUMBER', 'GOOD_PREDICTION', 'COGNATESET', 'LANGUAGE', 'CONCEPT', 'MORPHEME', 'WORD1', 19 | 'WORD2', 'WORD3' 20 | )) 21 | num = 1 22 | for key, vals in sorted(preds.items(), key=lambda x: x[0]): 23 | # get the morphemes 24 | idx = cp.msa['crossids'][key]['ID'][0] 25 | cidx = cp[idx, 'crossids'].index(key) 26 | try: 27 | morph = cp[idx, 'morphemes'][cidx] 28 | except: 29 | morph = '?' 30 | for doc in vals: 31 | val1 = ' '.join([x.split('|')[0] for x in vals[doc]]) 32 | if "Ø" in val1: 33 | no = '?' 34 | else: 35 | no = '' 36 | goods += 1 37 | val2 = ' '.join(['|'.join(x.split('|')[0:2]) for x in vals[doc]]) 38 | val3 = ' '.join(vals[doc]) 39 | 40 | f.write('\t'.join([str(num), no, str(key), doc, cp[idx, 'concept'], 41 | morph, val1, val2, val3])+'\n') 42 | num += 1 43 | print('useful predictions', goods) 44 | -------------------------------------------------------------------------------- /tests/workflows/bodt-2019/results/README.md: -------------------------------------------------------------------------------- 1 | bla 2 | -------------------------------------------------------------------------------- /tests/workflows/bodt-2019/test-prediction.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import random 3 | import collections 4 | 5 | from lingrex.copar import * 6 | from lingpy.basictypes import * 7 | from tabulate import tabulate 8 | 9 | def run_experiments( 10 | f, 11 | ref, 12 | ratio, 13 | subset=None, 14 | runs=100, 15 | verbose=False, 16 | fuzzy=True, 17 | samples=1, 18 | noout=False, 19 | score_mode='pairs' 20 | ): 21 | 22 | if not noout: 23 | outfile = codecs.open( 24 | 'results/'+f.split('/')[-1][:-4]+'-'+str(int(ratio*100+0.5))+'.txt', 25 | 'w', 'utf-8') 26 | outfile.write('\t'.join([ 27 | 'accuracy', 'proportion', 'density', 'fuzziness', 'coverage', 28 | 'purity', 'sounds', 'missing', 'csetsize', 'clusters', 'props', 29 | 'patterns', 'predicted', 'predictable', 'removed', 'regular', 30 | 'purityx'])+'\n') 31 | 32 | cpb = CoPaR(f, ref=ref, fuzzy=fuzzy, split_on_tones=False, 33 | segments='segments', transcription="segments") 34 | 35 | if not noout: 36 | inout = codecs.open( 37 | 'results/'+f.split('/')[-1][:-4]+'-individual-'+str(int(ratio*100+0.5))+'.tsv', 38 | 'w', 'utf-8') 39 | inout.write('\t'.join(['run', 'doculect','accuracy', 'purity', 'words', 'sounds'])+'\n') 40 | 41 | # define the scores 42 | all_scores = [] 43 | all_samples = set() 44 | all_pscores = {d: [] for d in cpb.cols} 45 | all_pud = {d: [] for d in cpb.cols} 46 | all_words = {d: [] for d in cpb.cols} 47 | all_sounds = {d: [] for d in cpb.cols} 48 | for key, msa in cpb.msa[ref].items(): 49 | for alm, t in zip(msa['alignment'], msa['taxa']): 50 | all_samples.add((key, ' '.join(alm), t)) 51 | 52 | for run in range(runs): 53 | remove_idxs = random.sample(all_samples, int(len(all_samples)*ratio+0.5)) 54 | D = {0: cpb.columns} 55 | for idx, cogid, alm, tax, tokens, structures in cpb.iter_rows( 56 | ref, 'alignment', 'doculect', 'segments', 'structure'): 57 | if fuzzy: 58 | cogids, alms, toks, strucs = [], [], [], [] 59 | for c, a, t, s in zip(cogid, lists(alm).n, lists(tokens).n, 60 | lists(structures).n): 61 | if (c, str(a), tax) not in remove_idxs: 62 | cogids += [c] 63 | alms += [str(a)] 64 | toks += [str(t)] 65 | strucs += [str(s)] 66 | if not cogids: 67 | pass 68 | else: 69 | D[idx] = 
cpb[idx] 70 | D[idx][cpb.header[ref]] = ints(cogids) 71 | D[idx][cpb.header['segments']] = ' + '.join(toks) 72 | D[idx][cpb.header['structure']] = ' + '.join(strucs) 73 | D[idx][cpb.header['alignment']] = ' + '.join(alms) 74 | else: 75 | if (cogid, str(alm), tax) in remove_idxs: 76 | pass 77 | else: 78 | D[idx] = cpb[idx] 79 | 80 | cp = CoPaR(D, ref=ref, fuzzy=fuzzy, split_on_tones=False, 81 | segments='segments', transcription="segments", minrefs=2, 82 | structure="structure") 83 | if 'l' in argv: 84 | cp.load_patterns() 85 | else: 86 | cp.get_sites() 87 | cp.cluster_sites(score_mode=score_mode) 88 | cp.sites_to_pattern() 89 | 90 | # compute size of alphabets 91 | sounds = {d: collections.defaultdict(int) for d in cp.cols} 92 | for idx, doc, tks in cp.iter_rows('doculect', 'segments'): 93 | for t in tks: 94 | if t != '+': 95 | sounds[doc][t.split('/')[1] if '/' in t else t] += 1 96 | ave = sum([len(s) for s in sounds.values()]) / cp.width 97 | 98 | # good words 99 | our_sample = {} 100 | for cogid, alm, doc in remove_idxs: 101 | our_sample[cogid, doc] = strings(alm) 102 | pscores = {d: [] for d in cp.cols} 103 | 104 | regs = sum([len(a[1]) for a in cp.clusters.items() if len(a[1]) > 1]) / len(cp.sites) 105 | 106 | predicted, purity, pudity = cp.predict_words(minrefs=2, samples=samples) 107 | scores = [] 108 | unknown, all_segs, predictable, cogsize = 0, 0, 0, 0 109 | for k, v in predicted.items(): 110 | for doc in v: 111 | if (k, doc) in our_sample and (doc == subset or not subset): 112 | predictable += 1 113 | cogsize += len(cp.msa[ref][k]['ID']) 114 | 115 | # check for different alignments 116 | msaA = cp.msa[ref][k] 117 | msaB = cpb.msa[ref][k] 118 | if len(msaA['alignment'][0]) != len(msaB['alignment'][0]): 119 | # carve out the taxa which are still existent to find which 120 | # column to delete 121 | new_alm = [msaB['alignment'][i] for i in 122 | range(len(msaB['alignment'])) if msaB['taxa'][i] in \ 123 | msaA['taxa']] 124 | almA, almB = [], [] 125 | for i in range(len(msaA['alignment'][0])): 126 | almA += [tuple([line[i] for line in msaA['alignment']])] 127 | for i in range(len(msaB['alignment'][0])): 128 | almB += [tuple([line[i] for line in new_alm])] 129 | out = [] 130 | for i, col in enumerate(almB): 131 | if col not in almA: 132 | out += [i] 133 | else: 134 | out = [] 135 | 136 | wA, wB = v[doc], our_sample[k, doc] 137 | ms = 0 138 | wB = strings([x for i, x in enumerate(wB) if i not in out]) 139 | for a, b in zip(wA, wB): 140 | b = b.split('/')[1] if '/' in b else b 141 | a = a.split('|') 142 | for i, a_ in enumerate(a): 143 | if b == a_: 144 | ms += 1 * (1/(i+1)) 145 | if a[0] == 'Ø': 146 | unknown += 1 147 | all_segs += 1 148 | 149 | score = ms / len(wA) 150 | pscores[doc] += [score] 151 | if verbose: 152 | print('{0:5}\t{1:15}\t{2:20}\t{3:20}\t{4:.2f}\t{5}'.format( 153 | str(k), doc, str(wA), str(wB), score, len(set(msaA['taxa'])))) 154 | if verbose and score != 1.0: 155 | purs = [] 156 | for i, elm in enumerate(wA): 157 | if (k, i) in purity: 158 | purs += ['{0:.2f}'.format(purity[k, i][doc])] 159 | else: 160 | purs += ['?'] 161 | print((cogid, i) in cp.sites) 162 | print([_s for _s in cp.sites if _s[0] == cogid], 163 | cogid) 164 | print('<---') 165 | print('\t'.join([x for x in wA])) 166 | print('\t'.join([x for x in wB])) 167 | print('\t'.join(purs)) 168 | print('--->') 169 | scores += [score] 170 | ubound = cp.upper_bound() 171 | all_scores += [( 172 | sum(scores) / len(scores), 173 | len(cp) / len(cpb), 174 | density(cp, ref=ref), 175 | cp.fuzziness(), 176 | 
sum(pudity.values()) / len(pudity.values()), 177 | ave, 178 | unknown/all_segs, 179 | cogsize / predictable, 180 | len(cp.clusters), 181 | len(cp.clusters) / ubound, 182 | len(cp.sites), 183 | predictable / len(remove_idxs), 184 | predictable, 185 | len(remove_idxs), 186 | regs, 187 | cp.purity() 188 | )] 189 | if verbose: 190 | print('{0:.2f}'.format(all_scores[-1][0])) 191 | 192 | cov = cp.coverage() 193 | for p in pscores: 194 | all_pscores[p] += [sum(pscores[p]) / len(pscores[p])] 195 | all_pud[p] += [pudity[p]] 196 | all_words[p] += [cov[p]] 197 | all_sounds[p] += [len(sounds[p])] 198 | 199 | if not noout: 200 | inout.write('\t'.join([ 201 | str(run+1), 202 | p, 203 | str(all_pscores[p][-1]), 204 | str(pudity[p]), 205 | str(cov[p]), 206 | str(len(sounds[p])) 207 | ])+'\n') 208 | if not noout: 209 | outfile.write(str(run+1)+'\t'+'\t'.join(['{0:.4f}'.format(x) for x in 210 | all_scores[-1]])+'\n') 211 | print('{0:.2f} / {1:.2f}'.format(sum(scores) / len(scores), len(cp) / 212 | len(cpb))) 213 | 214 | 215 | new_scores = [[ 216 | 'accuracy', 'proportion', 'density' 217 | ]] 218 | new_scores += [[ 219 | round(sum([x[0] for x in all_scores]) / len(all_scores), 4), 220 | round(sum([x[1] for x in all_scores]) / len(all_scores), 4), 221 | round(sum([x[2] for x in all_scores]) / len(all_scores), 4), 222 | #round(sum([x[3] for x in all_scores]) / len(all_scores), 4), 223 | #round(sum([x[4] for x in all_scores]) / len(all_scores), 4), 224 | #round(sum([x[5] for x in all_scores]) / len(all_scores), 4), 225 | #round(sum([x[6] for x in all_scores]) / len(all_scores), 4), 226 | #round(sum([x[7] for x in all_scores]) / len(all_scores), 4), 227 | #round(sum([x[8] for x in all_scores]) / len(all_scores), 4), 228 | #round(sum([x[9] for x in all_scores]) / len(all_scores), 4), 229 | #round(sum([x[10] for x in all_scores]) / len(all_scores), 4), 230 | #round(sum([x[11] for x in all_scores]) / len(all_scores), 4), 231 | #round(sum([x[12] for x in all_scores]) / len(all_scores), 4), 232 | #round(sum([x[13] for x in all_scores]) / len(all_scores), 4), 233 | #round(sum([x[14] for x in all_scores]) / len(all_scores), 4), 234 | #round(sum([x[15] for x in all_scores]) / len(all_scores), 4), 235 | #round(sum([x[16] for x in all_scores]) / len(all_scores), 4), 236 | ]] 237 | if not noout: 238 | outfile.close() 239 | inout.close() 240 | 241 | 242 | if noout: 243 | print(tabulate(new_scores, headers='firstrow')) 244 | 245 | return purity, pudity, sounds, cp 246 | 247 | if __name__ == '__main__': 248 | from sys import argv 249 | 250 | # defaults 251 | f = argv[1] 252 | ref = 'crossids' 253 | ratio = 0.5 254 | proto = None 255 | verbose = False 256 | runs = 2 257 | samples = 1 258 | noout = False 259 | 260 | # parse arguments 261 | if '-r' in argv: 262 | ratio = float(argv[argv.index('-r')+1]) 263 | if '-c' in argv: 264 | ref = argv[argv.index('-c')+1] 265 | if '-v' in argv or '--verbose' in argv: 266 | verbose = True 267 | if '--runs' in argv: 268 | runs = int(argv[argv.index('--runs')+1]) 269 | if ref in ['crossid', 'cogid']: 270 | fuzzy = False 271 | else: 272 | fuzzy = True 273 | if '--samples' in argv: 274 | samples = int(argv[argv.index('--samples')+1]) 275 | if '--noout' in argv: 276 | noout = True 277 | 278 | if '--seed' in argv: 279 | random.seed(1) 280 | 281 | p1, p2, p3, cop = run_experiments( 282 | f, 283 | ref, 284 | ratio, 285 | fuzzy=fuzzy, 286 | verbose=verbose, 287 | runs=runs, 288 | samples=samples, 289 | noout=noout, 290 | ) 291 | if verbose: 292 | cop.add_patterns() 293 | cop.output( 294 | 'tsv', 
295 | filename='results/'+f.split('/')[1].split('-')[0]+str(int(100*ratio+0.5)), 296 | ignore='all', 297 | prettify=False 298 | ) 299 | 300 | 301 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/general.py: -------------------------------------------------------------------------------- 1 | from lingpy import * 2 | from lingrex.copar import * 3 | from glob import glob 4 | from tabulate import tabulate 5 | import numpy as np 6 | 7 | data = [ 8 | ('burmish-240-8', 'crossids'), 9 | ('chinese-623-14', 'crossids'), 10 | ('polynesian-210-10', 'crossids'), 11 | ('japanese-200-10', 'crossid') 12 | ] 13 | 14 | table = [[ 15 | 'dataset', 16 | 'sites', 17 | 'patterns', 18 | 'singletons', 19 | 'coverage', 20 | 'irregulars', 21 | 'purity' 22 | ]] 23 | for f, c in data: 24 | name = f.split('-')[0] 25 | print(name) 26 | cp = CoPaR('data/'+f+'.tsv', ref=c, fuzzy=(c=='crossids'), 27 | transcription="segments", segments='segments', 28 | minrefs=2, structure="structure") 29 | cp.get_sites() 30 | cp.cluster_sites() 31 | cp.sites_to_pattern() 32 | cp.add_patterns() 33 | singletons = len([a for a in cp.clusters.items() if len(a[1]) == 1]) 34 | cp.irregular_patterns() 35 | iregs = sum([len(a) for a in cp.ipatterns.values()]) 36 | table += [[ 37 | name, 38 | len(cp.sites), 39 | len(cp.clusters), 40 | singletons, 41 | '{0:.2f}'.format( 42 | (len(cp.sites)-singletons) / len(cp.sites)), 43 | iregs, 44 | cp.purity() 45 | ]] 46 | cp.output('tsv', filename='results/out-'+name) 47 | print(tabulate(table, headers='firstrow', tablefmt='latex')) 48 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/predict.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | from lingpy import * 4 | from lingrex.copar import * 5 | from sys import argv 6 | import random 7 | from lingpy.basictypes import * 8 | from tabulate import tabulate 9 | from lingpy.compare.sanity import average_coverage 10 | import codecs 11 | 12 | 13 | class CustomCoPaR(CoPaR): 14 | def stats(self, score_mode='pairs'): 15 | rest = 0 16 | if self._mode == 'fuzzy': 17 | for idx, cogids, alms in self.iter_rows(self._ref, self._alignment): 18 | for alm, cogid in zip(alms, cogids): 19 | if cogid not in self.msa[self._ref]: 20 | rest += len(alm) 21 | else: 22 | pass 23 | else: 24 | for idx, cogid, alm in self.iter_rows(self._ref, self._alignment): 25 | if cogid not in self.msa[self._ref]: 26 | rest += len(alm) 27 | scores = [0 for i in range(rest)] 28 | for (p, ptn), sites in self.clusters.items(): 29 | scores += len(sites) * [score_patterns( 30 | [ 31 | self.sites[site][1] for site in sites 32 | ], mode=score_mode)] 33 | return sum(scores) / len(scores) 34 | 35 | 36 | def run_experiments( 37 | f, 38 | ref, 39 | ratio, 40 | subset=None, 41 | runs=100, 42 | verbose=False, 43 | fuzzy=True, 44 | samples=1, 45 | noout=False, 46 | score_mode='pairs' 47 | ): 48 | 49 | if not noout: 50 | outfile = codecs.open( 51 | 'results/'+f.split('/')[-1][:-4]+'-'+str(int(ratio*100+0.5))+'.txt', 52 | 'w', 'utf-8') 53 | outfile.write('\t'.join([ 54 | 'accuracy', 'proportion', 'density', 'fuzziness', 'coverage', 55 | 'purity', 'sounds', 'missing', 'csetsize', 'clusters', 'props', 56 | 'patterns', 'predicted', 'predictable', 'removed', 'regular', 57 | 'purityx'])+'\n') 58 | 59 | cpb = CustomCoPaR(f, ref=ref, fuzzy=fuzzy, split_on_tones=False, 60 | segments='segments', minrefs=2, structure="structure", 61 | 
transcription="segments") 62 | 63 | if not noout: 64 | inout = codecs.open( 65 | 'results/'+f.split('/')[-1][:-4]+'-individual-'+str(int(ratio*100+0.5))+'.tsv', 66 | 'w', 'utf-8') 67 | inout.write('\t'.join(['run', 'doculect','accuracy', 'purity', 'words', 'sounds'])+'\n') 68 | 69 | # define the scores 70 | all_scores = [] 71 | all_samples = set() 72 | all_pscores = {d: [] for d in cpb.cols} 73 | all_pud = {d: [] for d in cpb.cols} 74 | all_words = {d: [] for d in cpb.cols} 75 | all_sounds = {d: [] for d in cpb.cols} 76 | for key, msa in cpb.msa[ref].items(): 77 | for alm, t in zip(msa['alignment'], msa['taxa']): 78 | all_samples.add((key, ' '.join(alm), t)) 79 | 80 | for run in range(runs): 81 | remove_idxs = random.sample(list(all_samples), int(len(all_samples)*ratio+0.5)) 82 | D = {0: cpb.columns} 83 | for idx, cogid, alm, tax, tokens, structures in cpb.iter_rows( 84 | ref, 'alignment', 'doculect', 'segments', 'structure'): 85 | if fuzzy: 86 | cogids, alms, toks, strucs = [], [], [], [] 87 | for c, a, t, s in zip(cogid, lists(alm).n, lists(tokens).n, 88 | lists(structures).n): 89 | if (c, str(a), tax) not in remove_idxs: 90 | cogids += [c] 91 | alms += [str(a)] 92 | toks += [str(t)] 93 | strucs += [str(s)] 94 | if not cogids: 95 | pass 96 | else: 97 | D[idx] = cpb[idx] 98 | D[idx][cpb.header[ref]] = ints(cogids) 99 | D[idx][cpb.header['segments']] = ' + '.join(toks) 100 | D[idx][cpb.header['structure']] = ' + '.join(strucs) 101 | D[idx][cpb.header['alignment']] = ' + '.join(alms) 102 | else: 103 | if (cogid, str(alm), tax) in remove_idxs: 104 | pass 105 | else: 106 | D[idx] = cpb[idx] 107 | 108 | cp = CustomCoPaR(D, ref=ref, fuzzy=fuzzy, split_on_tones=False, 109 | segments='segments', minrefs=2, structure="structure", 110 | transcription="segments") 111 | if 'l' in argv: 112 | cp.load_patterns() 113 | else: 114 | cp.get_sites() 115 | cp.cluster_sites(score_mode=score_mode) 116 | cp.sites_to_pattern() 117 | 118 | # compute size of alphabets 119 | sounds = {d: collections.defaultdict(int) for d in cp.cols} 120 | for idx, doc, tks in cp.iter_rows('doculect', 'segments'): 121 | for t in tks: 122 | if t != '+': 123 | sounds[doc][t.split('/')[1] if '/' in t else t] += 1 124 | ave = sum([len(s) for s in sounds.values()]) / cp.width 125 | 126 | # good words 127 | our_sample = {} 128 | for cogid, alm, doc in remove_idxs: 129 | our_sample[cogid, doc] = strings(alm) 130 | pscores = {d: [] for d in cp.cols} 131 | 132 | regs = sum([len(a[1]) for a in cp.clusters.items() if len(a[1]) > 1]) / len(cp.sites) 133 | 134 | predicted, purity, pudity = cp.predict_words(minrefs=2, samples=samples) 135 | scores = [] 136 | unknown, all_segs, predictable, cogsize = 0, 0, 0, 0 137 | for k, v in predicted.items(): 138 | for doc in v: 139 | if (k, doc) in our_sample and (doc == subset or not subset): 140 | predictable += 1 141 | cogsize += len(cp.msa[ref][k]['ID']) 142 | 143 | # check for different alignments 144 | msaA = cp.msa[ref][k] 145 | msaB = cpb.msa[ref][k] 146 | if len(msaA['alignment'][0]) != len(msaB['alignment'][0]): 147 | # carve out the taxa which are still existent to find which 148 | # column to delete 149 | new_alm = [msaB['alignment'][i] for i in 150 | range(len(msaB['alignment'])) if msaB['taxa'][i] in \ 151 | msaA['taxa']] 152 | almA, almB = [], [] 153 | for i in range(len(msaA['alignment'][0])): 154 | almA += [tuple([line[i] for line in msaA['alignment']])] 155 | for i in range(len(msaB['alignment'][0])): 156 | almB += [tuple([line[i] for line in new_alm])] 157 | out = [] 158 | for i, col in 
enumerate(almB): 159 | if col not in almA: 160 | out += [i] 161 | else: 162 | out = [] 163 | 164 | wA, wB = v[doc], our_sample[k, doc] 165 | ms = 0 166 | wB = strings([x for i, x in enumerate(wB) if i not in out]) 167 | for a, b in zip(wA, wB): 168 | b = b.split('/')[1] if '/' in b else b 169 | a = a.split('|') 170 | for i, a_ in enumerate(a): 171 | if b == a_: 172 | ms += 1 * (1/(i+1)) 173 | if a[0] == 'Ø': 174 | unknown += 1 175 | all_segs += 1 176 | 177 | score = ms / len(wA) 178 | pscores[doc] += [score] 179 | if verbose: 180 | print('{0:5}\t{1:15}\t{2:20}\t{3:20}\t{4:.2f}\t{5}'.format( 181 | str(k), doc, str(wA), str(wB), score, len(set(msaA['taxa'])))) 182 | if verbose and score != 1.0: 183 | purs = [] 184 | for i, elm in enumerate(wA): 185 | if (k, i) in purity: 186 | purs += ['{0:.2f}'.format(purity[k, i][doc])] 187 | else: 188 | purs += ['?'] 189 | print((cogid, i) in cp.sites) 190 | print([_s for _s in cp.sites if _s[0] == cogid], 191 | cogid) 192 | print('<---') 193 | print('\t'.join([x for x in wA])) 194 | print('\t'.join([x for x in wB])) 195 | print('\t'.join(purs)) 196 | print('--->') 197 | scores += [score] 198 | ubound = cp.upper_bound() 199 | all_scores += [( 200 | sum(scores) / len(scores), 201 | len(cp) / len(cpb), 202 | density(cp, ref=ref), 203 | cp.fuzziness(), 204 | cp.stats(score_mode=score_mode), 205 | sum(pudity.values()) / len(pudity.values()), 206 | ave, 207 | unknown/all_segs, 208 | cogsize / predictable, 209 | len(cp.clusters), 210 | len(cp.clusters) / ubound, 211 | len(cp.sites), 212 | predictable / len(remove_idxs), 213 | predictable, 214 | len(remove_idxs), 215 | regs, 216 | cp.purity() 217 | )] 218 | if verbose: 219 | print('{0:.2f}'.format(all_scores[-1][0])) 220 | 221 | cov = cp.coverage() 222 | for p in pscores: 223 | all_pscores[p] += [sum(pscores[p]) / len(pscores[p])] 224 | all_pud[p] += [pudity[p]] 225 | all_words[p] += [cov[p]] 226 | all_sounds[p] += [len(sounds[p])] 227 | 228 | if not noout: 229 | inout.write('\t'.join([ 230 | str(run+1), 231 | p, 232 | str(all_pscores[p][-1]), 233 | str(pudity[p]), 234 | str(cov[p]), 235 | str(len(sounds[p])) 236 | ])+'\n') 237 | if not noout: 238 | outfile.write(str(run+1)+'\t'+'\t'.join(['{0:.4f}'.format(x) for x in 239 | all_scores[-1]])+'\n') 240 | print('{0:.2f} / {1:.2f}'.format(sum(scores) / len(scores), len(cp) / 241 | len(cpb))) 242 | 243 | 244 | new_scores = [[ 245 | 'accuracy', 'proportion', 'density', 'fuzziness', 'coverage', 246 | 'purity', 'sounds', 'missing', 'csetsize', 'clusters', 'props', 247 | 'patterns', 'predicted', 'predictable', 'removed', 'regs', 'purityx']] 248 | new_scores += [[ 249 | round(sum([x[0] for x in all_scores]) / len(all_scores), 4), 250 | round(sum([x[1] for x in all_scores]) / len(all_scores), 4), 251 | round(sum([x[2] for x in all_scores]) / len(all_scores), 4), 252 | round(sum([x[3] for x in all_scores]) / len(all_scores), 4), 253 | round(sum([x[4] for x in all_scores]) / len(all_scores), 4), 254 | round(sum([x[5] for x in all_scores]) / len(all_scores), 4), 255 | round(sum([x[6] for x in all_scores]) / len(all_scores), 4), 256 | round(sum([x[7] for x in all_scores]) / len(all_scores), 4), 257 | round(sum([x[8] for x in all_scores]) / len(all_scores), 4), 258 | round(sum([x[9] for x in all_scores]) / len(all_scores), 4), 259 | round(sum([x[10] for x in all_scores]) / len(all_scores), 4), 260 | round(sum([x[11] for x in all_scores]) / len(all_scores), 4), 261 | round(sum([x[12] for x in all_scores]) / len(all_scores), 4), 262 | round(sum([x[13] for x in all_scores]) / 
len(all_scores), 4), 263 | round(sum([x[14] for x in all_scores]) / len(all_scores), 4), 264 | round(sum([x[15] for x in all_scores]) / len(all_scores), 4), 265 | round(sum([x[16] for x in all_scores]) / len(all_scores), 4), 266 | ]] 267 | if not noout: 268 | outfile.close() 269 | inout.close() 270 | 271 | 272 | if noout: 273 | print(tabulate(new_scores, headers='firstrow')) 274 | 275 | return purity, pudity, sounds, cp 276 | 277 | if __name__ == '__main__': 278 | from sys import argv 279 | 280 | # defaults 281 | f = argv[1] 282 | ref = 'crossids' 283 | ratio = 0.5 284 | proto = None 285 | verbose = False 286 | runs = 100 287 | samples = 1 288 | noout = False 289 | 290 | # parse arguments 291 | if '-r' in argv: 292 | ratio = float(argv[argv.index('-r')+1]) 293 | if '-c' in argv: 294 | ref = argv[argv.index('-c')+1] 295 | if '-v' in argv or '--verbose' in argv: 296 | verbose = True 297 | if '--runs' in argv: 298 | runs = int(argv[argv.index('--runs')+1]) 299 | if ref in ['crossid', 'cogid']: 300 | fuzzy = False 301 | else: 302 | fuzzy = True 303 | if '--samples' in argv: 304 | samples = int(argv[argv.index('--samples')+1]) 305 | if '--noout' in argv: 306 | noout = True 307 | 308 | if '--seed' in argv: 309 | random.seed(1) 310 | 311 | p1, p2, p3, cop = run_experiments( 312 | f, 313 | ref, 314 | ratio, 315 | fuzzy=fuzzy, 316 | verbose=verbose, 317 | runs=runs, 318 | samples=samples, 319 | noout=noout, 320 | ) 321 | if verbose: 322 | cop.add_patterns() 323 | cop.output( 324 | 'tsv', 325 | filename='results/'+f.split('/')[1].split('-')[0]+str(int(100*ratio+0.5)), 326 | ignore='all', 327 | prettify=False 328 | ) 329 | 330 | 331 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/burmish-240-8-75.txt: -------------------------------------------------------------------------------- 1 | accuracy proportion density fuzziness coverage purity sounds missing csetsize clusters props patterns predicted predictable removed regular purityx 2 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/burmish-240-8-individual-75.tsv: -------------------------------------------------------------------------------- 1 | run doculect accuracy purity words sounds 2 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/chinese-623-14-75.txt: -------------------------------------------------------------------------------- 1 | accuracy proportion density fuzziness coverage purity sounds missing csetsize clusters props patterns predicted predictable removed regular purityx 2 | 1 0.6463 0.2982 0.6337 3.1740 0.0812 0.8311 38.6429 0.0673 3.5227 395.0000 0.1665 2374.0000 0.7893 4955.0000 6278.0000 0.9629 0.7026 3 | 2 0.6188 0.2981 0.6419 3.2229 0.0735 0.8218 38.3571 0.0742 3.5347 411.0000 0.1717 2396.0000 0.7983 5012.0000 6278.0000 0.9604 0.6931 4 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/chinese-623-14-individual-75.tsv: -------------------------------------------------------------------------------- 1 | run doculect accuracy purity words sounds 2 | 1 Beijing 0.6744631185807659 0.8506857410315988 189 39 3 | 1 Changsha 0.6591755319148934 0.8400152536420613 173 41 4 | 1 Fuzhou 0.6043639740018567 0.8470977566718286 193 35 5 | 1 Guangzhou 0.6362084456424079 0.8374048533808206 181 39 6 | 1 Jinan 0.6690212373037854 0.8379081874636615 193 42 7 | 1 Meixian 
0.67212962962963 0.8434349335736812 185 29 8 | 1 Nanchang 0.6397363465160077 0.8268322390279454 177 39 9 | 1 Nanjing 0.6633239171374766 0.8294375350540676 188 41 10 | 1 Shanghai 0.6420765027322405 0.8397458588789625 183 49 11 | 1 Suzhou 0.655978835978836 0.8226080732036224 172 47 12 | 1 Taibei 0.6005606523955145 0.7923705147078326 152 32 13 | 1 Taoyuan 0.6374999999999997 0.8149330026110324 157 31 14 | 1 Wenzhou 0.6068965517241384 0.8134687675211292 169 47 15 | 1 Xiamen 0.6850368324125231 0.8392717648505066 184 30 16 | 2 Beijing 0.6341463414634145 0.8277661174300526 179 38 17 | 2 Changsha 0.6031168831168832 0.82247660983077 171 39 18 | 2 Fuzhou 0.5586936936936938 0.8183899161818443 186 36 19 | 2 Guangzhou 0.614628623188406 0.8246327090967067 188 39 20 | 2 Jinan 0.637148047229791 0.8232004499357977 192 40 21 | 2 Meixian 0.6361261261261262 0.824309193838572 175 29 22 | 2 Nanchang 0.6289552238805969 0.8172120514596539 189 37 23 | 2 Nanjing 0.6422586520947178 0.8384229507738024 180 42 24 | 2 Shanghai 0.6264075067024129 0.8295751313354329 179 47 25 | 2 Suzhou 0.6202416918429002 0.8051516609454498 168 48 26 | 2 Taibei 0.576380042462845 0.8095576704338178 168 33 27 | 2 Taoyuan 0.6294585987261145 0.8073248812775506 169 32 28 | 2 Wenzhou 0.5803894927536231 0.8261543247139483 183 46 29 | 2 Xiamen 0.6708115183246076 0.8314737379495394 168 31 30 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/japanese-200-10-75.txt: -------------------------------------------------------------------------------- 1 | accuracy proportion density fuzziness coverage purity sounds missing csetsize clusters props patterns predicted predictable removed regular purityx 2 | 1 0.5297 0.3182 0.3381 2.3938 0.0005 0.7356 32.3000 0.1836 2.7977 149.0000 0.2358 645.0000 0.5227 702.0000 1343.0000 0.9256 0.6899 3 | 2 0.4828 0.3177 0.3377 2.3785 0.0012 0.7263 32.8000 0.1971 2.7854 144.0000 0.2326 642.0000 0.5309 713.0000 1343.0000 0.9097 0.7167 4 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/japanese-200-10-individual-75.tsv: -------------------------------------------------------------------------------- 1 | run doculect accuracy purity words sounds 2 | 1 Amami 0.38053221288515415 0.7248868060731588 57 37 3 | 1 Hachijō 0.5685611879160267 0.7477516322783029 80 33 4 | 1 Kagoshima 0.4537838139950816 0.7140323216889822 70 33 5 | 1 Kyōto 0.5509259259259259 0.6733292869047514 46 29 6 | 1 Kōchi 0.5926393728222995 0.7242764951529171 57 26 7 | 1 Miyako 0.42766884531590404 0.7591016034249201 78 42 8 | 1 Oki 0.547769066976384 0.7448634608988914 55 30 9 | 1 Sado 0.6281055900621118 0.7545201944366466 69 30 10 | 1 Shuri 0.46807181889149113 0.7354638428741851 69 32 11 | 1 Tōkyō 0.6226190476190475 0.7776557302734873 51 31 12 | 2 Amami 0.416017316017316 0.6837437518510925 56 35 13 | 2 Hachijō 0.4747655122655123 0.7243134412799231 79 35 14 | 2 Kagoshima 0.4744912494912495 0.7571340080753896 55 32 15 | 2 Kyōto 0.5557037674507556 0.7372087276595939 51 31 16 | 2 Kōchi 0.6043197278911564 0.7710841857961863 65 30 17 | 2 Miyako 0.29142195767195767 0.7082720008598198 80 41 18 | 2 Oki 0.43308217801888677 0.6922538976524312 55 27 19 | 2 Sado 0.5647407407407408 0.7331145433445981 67 34 20 | 2 Shuri 0.40829554043839755 0.7336815929986455 62 34 21 | 2 Tōkyō 0.5507012393998695 0.7225082960724509 61 29 22 | -------------------------------------------------------------------------------- 
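
The individual result files above all follow the per-doculect layout written by the prediction script (run, doculect, accuracy, purity, words, sounds). As a quick way to inspect them — a minimal sketch, assuming it is run from the repository root and using only the standard library — the mean accuracy per doculect can be aggregated like this:

import csv
import collections
import statistics

# Collect accuracy values per doculect across runs from one result file.
path = 'tests/workflows/list-2019/results/japanese-200-10-individual-75.tsv'
scores = collections.defaultdict(list)
with open(path, encoding='utf-8') as handle:
    for row in csv.DictReader(handle, delimiter='\t'):
        scores[row['doculect']].append(float(row['accuracy']))

# Report the mean accuracy for each doculect, sorted alphabetically.
for doculect, values in sorted(scores.items()):
    print('{0:20}{1:.4f}'.format(doculect, statistics.mean(values)))
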
/tests/workflows/list-2019/results/polynesian-210-10-75.txt: -------------------------------------------------------------------------------- 1 | accuracy proportion density fuzziness coverage purity sounds missing csetsize clusters props patterns predicted predictable removed regular purityx 2 | 1 0.7424 0.4545 0.1935 2.2027 0.0306 0.8429 19.0000 0.1039 3.2306 69.0000 0.1405 528.0000 0.3649 620.0000 1699.0000 0.9564 0.8270 3 | 2 0.7377 0.4429 0.1843 2.2170 0.0282 0.8357 19.0000 0.1135 2.9928 61.0000 0.1525 470.0000 0.3284 558.0000 1699.0000 0.9596 0.8412 4 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/polynesian-210-10-individual-75.tsv: -------------------------------------------------------------------------------- 1 | run doculect accuracy purity words sounds 2 | 1 Austral_1213 0.7098360655737704 0.8369797720278489 86 16 3 | 1 Austral_128 0.8045893719806763 0.8336284193571929 93 18 4 | 1 Hawaiian_52 0.6782738095238096 0.8515142513002715 111 17 5 | 1 Mangareva_239 0.7875661375661377 0.853256880850012 104 22 6 | 1 Maori_85 0.6363095238095239 0.8017538854798525 95 20 7 | 1 NorthMarquesan_38 0.6984848484848484 0.8621553400918279 115 20 8 | 1 Rapanui_264 0.7744252873563217 0.8713437633013035 104 19 9 | 1 Sikaiana_243 0.7531446540880504 0.8360935254027674 91 20 10 | 1 Tahitian_173 0.7721264367816093 0.8183473427878986 74 19 11 | 1 Tuamotuan_246 0.8179687499999999 0.8635059484169247 91 19 12 | 2 Austral_1213 0.6935374149659864 0.7930747188193066 91 17 13 | 2 Austral_128 0.7558974358974357 0.8525730691600037 86 17 14 | 2 Hawaiian_52 0.7429824561403509 0.8765556454861049 101 18 15 | 2 Mangareva_239 0.7440251572327042 0.8626147997883004 96 22 16 | 2 Maori_85 0.7645502645502644 0.8442533194274631 94 19 17 | 2 NorthMarquesan_38 0.7689244663382592 0.8642062532709653 117 20 18 | 2 Rapanui_264 0.6323529411764706 0.8183746212518835 104 18 19 | 2 Sikaiana_243 0.6937984496124031 0.8316400309821942 102 20 20 | 2 Tahitian_173 0.7957627118644067 0.8161348483260537 69 19 21 | 2 Tuamotuan_246 0.7492460317460317 0.7972076549006404 94 20 22 | -------------------------------------------------------------------------------- /tests/workflows/wu-2020/4_crosssemantic.py: -------------------------------------------------------------------------------- 1 | from lingpy import * 2 | from lingrex.colex import find_colexified_alignments, find_bad_internal_alignments 3 | from lingrex.align import template_alignment 4 | from sys import argv 5 | 6 | if 'all' in argv: 7 | fname='A_Chen_' 8 | else: 9 | fname='D_Chen_' 10 | 11 | 12 | alms = Alignments(fname+'aligned.tsv', ref='cogids') 13 | print('[i] search for bad internal alignments') 14 | find_bad_internal_alignments(alms) 15 | 16 | print('[i] search for colexified alignments') 17 | find_colexified_alignments( 18 | alms, 19 | cognates='cogids', 20 | ref='crossids' 21 | ) 22 | 23 | # re-align the data 24 | print('[i] re-align the data') 25 | template_alignment(alms, 26 | ref='crossids', 27 | template='imnct+imnct+imnct+imnct+imnct+imnct', 28 | structure = 'structure', 29 | fuzzy=True, 30 | segments='tokens') 31 | 32 | alms.output('tsv', filename=fname+'crossids', prettify=False) 33 | -------------------------------------------------------------------------------- /tests/workflows/wu-2020/5_correspondence.py: -------------------------------------------------------------------------------- 1 | from lingrex.copar import CoPaR 2 | from sys import argv 3 | 4 | if 'all' in argv: 5 | fname='A_Chen_' 6 | else: 
7 | fname='D_Chen_' 8 | 9 | cop = CoPaR( 10 | fname+'crossids.tsv', 11 | ref='crossids', 12 | fuzzy=True, 13 | segments='tokens', 14 | minrefs=3, 15 | structure="structure" 16 | ) 17 | cop.get_sites() 18 | cop.cluster_sites() 19 | cop.sites_to_pattern() 20 | cop.add_patterns() 21 | cop.write_patterns(fname+'all_patterns.tsv') 22 | cop.output('tsv', filename=fname+'patterns', prettify=False) 23 | 24 | # statistics 25 | sps = ['i', 'm', 'n', 'c', 't'] 26 | 27 | total_correspondence_sets = len(cop.clusters) 28 | print('{0}: {1}'.format('Total number of sound correspondence sets', total_correspondence_sets)) 29 | 30 | print('The number of regular correspondence sets in each position') 31 | for sp in sps: 32 | t = [x[1] for x, y in cop.clusters.items() if len(y) > 1 and x[0] == sp] 33 | print('{0}: {1}'.format(sp, len(t))) 34 | 35 | print('The number of singleton correspondence sets in each position') 36 | for sp in sps: 37 | t = [x[1] for x, y in cop.clusters.items() if len(y) == 1 and x[0] == sp] 38 | print('{0}: {1}'.format(sp, len(t))) 39 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{38,39,310} 3 | skip_missing_interpreters = true 4 | 5 | [testenv] 6 | extras = test 7 | commands = pytest {posargs} 8 | --------------------------------------------------------------------------------
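
Taken together, the two wu-2020 scripts form a small pipeline: 4_crosssemantic.py detects bad internal and colexified alignments, builds cross-semantic cognate sets ('crossids'), and re-aligns the data, while 5_correspondence.py clusters the resulting alignment sites into correspondence patterns and reports how many of them are regular. A condensed end-to-end sketch of that pipeline — assuming it is run inside tests/workflows/wu-2020/ with the D_Chen_aligned.tsv input present — looks like this:

from lingpy import Alignments
from lingrex.align import template_alignment
from lingrex.colex import find_bad_internal_alignments, find_colexified_alignments
from lingrex.copar import CoPaR

# Step 1: mark bad internal alignments and colexifications, then re-align
# across semantic slots (as in 4_crosssemantic.py).
alms = Alignments('D_Chen_aligned.tsv', ref='cogids')
find_bad_internal_alignments(alms)
find_colexified_alignments(alms, cognates='cogids', ref='crossids')
template_alignment(
    alms,
    ref='crossids',
    template='imnct+imnct+imnct+imnct+imnct+imnct',
    structure='structure',
    fuzzy=True,
    segments='tokens')
alms.output('tsv', filename='D_Chen_crossids', prettify=False)

# Step 2: cluster alignment sites into correspondence patterns
# (as in 5_correspondence.py) and count the regular ones.
cop = CoPaR('D_Chen_crossids.tsv', ref='crossids', fuzzy=True,
            segments='tokens', minrefs=3, structure='structure')
cop.get_sites()
cop.cluster_sites()
cop.sites_to_pattern()
regular = [x for x, y in cop.clusters.items() if len(y) > 1]
print('regular correspondence sets: {0}'.format(len(regular)))
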