├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── .zenodo.json ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASING.md ├── pip-requirements.txt ├── setup.cfg ├── setup.py ├── src └── lingrex │ ├── __init__.py │ ├── align.py │ ├── borrowing.py │ ├── cognates.py │ ├── colex.py │ ├── copar.py │ ├── evaluate.py │ ├── fuzzy.py │ ├── reconstruct.py │ ├── regularity.py │ ├── trimming.py │ └── util.py ├── tests ├── conftest.py ├── data │ ├── east-polynesian.tsv │ ├── hillburmish.tsv │ └── wordlist.tsv ├── test_align.py ├── test_borrowing.py ├── test_cognates.py ├── test_colex.py ├── test_copar.py ├── test_evaluate.py ├── test_fuzzy.py ├── test_reconstruct.py ├── test_regularity.py ├── test_trimming.py ├── test_util.py ├── test_workflows.py └── workflows │ ├── bodt-2019 │ ├── bodt-khobwa-cleaned.tsv │ ├── predict.py │ ├── results │ │ └── README.md │ └── test-prediction.py │ ├── list-2019 │ ├── data │ │ ├── burmish-240-8.tsv │ │ ├── chinese-203-19.tsv │ │ ├── chinese-623-14.tsv │ │ ├── east-polynesian.tsv │ │ ├── japanese-200-10.tsv │ │ └── polynesian-210-10.tsv │ ├── general.py │ ├── predict.py │ └── results │ │ ├── burmish-240-8-75.txt │ │ ├── burmish-240-8-individual-75.tsv │ │ ├── chinese-623-14-75.txt │ │ ├── chinese-623-14-individual-75.tsv │ │ ├── japanese-200-10-75.txt │ │ ├── japanese-200-10-individual-75.tsv │ │ ├── out-burmish.tsv │ │ ├── out-chinese.tsv │ │ ├── out-japanese.tsv │ │ ├── out-polynesian.tsv │ │ ├── polynesian-210-10-75.txt │ │ └── polynesian-210-10-individual-75.tsv │ └── wu-2020 │ ├── 4_crosssemantic.py │ ├── 5_correspondence.py │ ├── D_Chen_aligned.tsv │ ├── D_Chen_all_patterns.tsv │ ├── D_Chen_crossids.tsv │ └── D_Chen_patterns.tsv └── tox.ini /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: [3.8, 3.9, "3.10"] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install .[test] 27 | - name: Test with pytest 28 | run: | 29 | pytest 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .info 6 | tests/bak/* 7 | dev/ 8 | # C extensions 9 | *.so 10 | *.swp 11 | .idea 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # MacOS 107 | *.DS_Store 108 | -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "LingRex. Linguistic Reconstruction with LingPy", 3 | "creators": [ 4 | { 5 | "name": "Johann-Mattis List" 6 | }, 7 | { 8 | "name": "Robert Forkel" 9 | } 10 | ], 11 | "access_right": "open", 12 | "keywords": [ 13 | "linguistics" 14 | ], 15 | "license": { 16 | "id": "CC-BY-4.0" 17 | }, 18 | "upload_type": "software", 19 | "communities": [ 20 | { 21 | "identifier": "digling" 22 | }, 23 | { 24 | "identifier": "calc" 25 | } 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 LingPy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE 2 | graft src 3 | global-exclude *.py[co] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LingRex: Linguistic Reconstruction with LingPy 2 | 3 | [![Build Status](https://github.com/lingpy/lingrex/workflows/tests/badge.svg)](https://github.com/lingpy/lingrex/actions?query=workflow%3Atests) 4 | [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.1544943.svg)](https://doi.org/10.5281/zenodo.1544943) 5 | [![PyPI version](https://badge.fury.io/py/lingrex.png)](https://badge.fury.io/py/lingrex) 6 | 7 | LingRex offers the code needed for the automatic inference of sound correspondence patterns as described in the following paper: 8 | 9 | > List, J.-M. (2019): Automatic inference of sound correspondence patterns across multiple languages. Computational Linguistics 45.1. 137-161. [DOI: 10.1162/coli_a_00344](https://doi.org/10.1162/coli_a_00344) 10 | 11 | To test this workflow, please check the workflow code example in [`tests/workflows/list-2019`](tests/workflows/list-2019). 12 | 13 | LingRex also offers the code for a baseline algorithm for automatic word prediction, that is, for supervised automatic phonological reconstruction, described in the following paper: 14 | 15 | > List, J.-M., R. Forkel, and N. W. Hill (2022): A New Framework for Fast Automated Phonological Reconstruction Using Trimmed Alignments and Sound Correspondence Patterns. Proceedings of the 3rd International Workshop on Computational Approaches to Historical Language Change (LChange 2022). Dublin, Ireland. https://aclanthology.org/2022.lchange-1.9 16 | 17 | This algorithm is also used as a baseline for a Shared Task on the Prediction of Cognate Reflexes (https://sigtyp.github.io/st2022.html), organized as part of the SIGTYP Workshop at NAACL 2022. 18 | 19 | > List, J.-M., E. Vylomova, R. Forkel, N. Hill, and R. Cotterell (2022): The SIGTYP shared task on the prediction of cognate reflexes. In: Proceedings of the 4th Workshop on Computational Typology and Multilingual NLP. Association for Computational Linguistics. 52-62. https://aclanthology.org/2022.sigtyp-1.7 20 | 21 | Methods for the handling of partial cognates were introduced in a study by Wu and List (2023): 22 | 23 | > Wu, M.-S. and J.-M. List (2023): Annotating cognates in phylogenetic studies of South-East Asian languages. Language Dynamics and Change. https://doi.org/10.1163/22105832-bja10023 24 | 25 | Methods for the trimming of phonetic alignments were introduced in a study by Blum and List (2023): 26 | 27 | > Blum, F. and J.-M. List (2023): Trimming phonetic alignments improves the inference of sound correspondence patterns from multilingual wordlists. In: Proceedings of the 5th Workshop on Computational Typology and Multilingual NLP. Association for Computational Linguistics. 52-64. https://aclanthology.org/2023.sigtyp-1.6.pdf 28 | 29 | Methods for the handling and creation of fuzzy / uncertain phonological reconstructions were introduced in a study by List et al. (2023): 30 | 31 | > List, J.-M., N. W. Hill, F. Blum, and R. Forkel (2023): A New Framework for the Representation and Computation of Uncertainty in Phonological Reconstruction. Proceedings of the 4th Workshop on Computational Approaches to Historical Language Change. 22-32.
https://aclanthology.org/2023.lchange-1.3 32 | 33 | When using this package in your research, please make sure to cite the respective papers, depending on the algorithms you use, and cite the software package as follows: 34 | 35 | > List, J.-M. and R. Forkel (2023): LingRex: Linguistic Reconstruction with LingPy. [Computer software, Version 1.4.0]. With contributions by Frederic Blum and Mei-Shin Wu. Leipzig: Max Planck Institute for Evolutionary Anthropology. https://pypi.org/project/lingrex 36 | 37 | Since this software package itself makes use of LingPy's alignment algorithms, you should also cite the LingPy package itself: 38 | 39 | > List, J.-M. and R. Forkel (2023): LingPy. A Python library for quantitative tasks in historical linguistics. Version 2.6.10. Max Planck Institute for Evolutionary Anthropology: Leipzig. https://lingpy.org 40 | 41 | ## Installation 42 | 43 | Install the package via `pip`: 44 | 45 | ```shell 46 | pip install lingrex 47 | ``` 48 | 49 | ## Further Examples 50 | 51 | The borrowing detection algorithm implemented in LingRex is introduced in the 52 | following paper: 53 | 54 | > List, J.-M. and R. Forkel (2021): Automated identification of borrowings in multilingual wordlists [version 1; peer review: 3 approved, 1 approved with reservations]. Open Research Europe 1.79. 1-11. [DOI: 10.12688/openreseurope.13843.1](https://doi.org/10.12688/openreseurope.13843.1) 55 | 56 | If you use this algorithm, please cite LingRex and this paper. 57 | 58 | In addition to the paper in which the correspondence pattern inference algorithm was first introduced, LingRex also offers the code to compute the workflow described in the following paper: 59 | 60 | > Wu, M.-S., N. Schweikhard, T. Bodt, N. Hill, and J.-M. List (2020): Computer-Assisted Language Comparison. State of the Art. Journal of Open Humanities Data 6.2. 1-14. [DOI: 10.5334/johd.12](https://doi.org/10.5334/johd.12) 61 | 62 | To test this workflow, please check the workflow code example in `tests/workflows/wu-2020`. 63 | 64 | If you use this workflow in your work, please cite this paper as well. 65 | 66 | In addition, our experiment (with T. Bodt) on predicting words with the help of sound correspondence patterns also made use of the LingRex package: 67 | 68 | > Bodt, T. and J.-M. List (2021): Reflex prediction. A case study of Western Kho-Bwa. Diachronica 0.0. 1-38. [DOI: 10.1075/dia.20009.bod](https://doi.org/10.1075/dia.20009.bod) 69 | 70 | To test this workflow, please check the workflow code example in `tests/workflows/bodt-2019`.
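
## Usage Example

The core of the package is the `CoPaR` class (`src/lingrex/copar.py`), which implements the correspondence pattern inference described in List (2019). The following minimal sketch outlines the basic workflow; the file name, the column names, and the keyword values are placeholders that have to be adapted to your own data:

```python
from lingrex.copar import CoPaR

# Hypothetical input: a TSV wordlist with segmented forms ("tokens"),
# a prosodic "structure" column, an "alignment" column, and partial
# cognate sets in "cogids"; additional keywords such as fuzzy=True
# are passed on to LingPy's Alignments class.
cop = CoPaR("wordlist.tsv", ref="cogids", structure="structure", minrefs=2, fuzzy=True)

cop.get_sites()         # collect alignment sites with enough reflexes
cop.cluster_sites()     # group compatible sites into correspondence patterns
cop.sites_to_pattern()  # assign each site to the patterns it is compatible with
cop.add_patterns()      # add pattern identifiers to the wordlist
cop.output("tsv", filename="wordlist-with-patterns")
```

The resulting wordlist contains an additional `patterns` column that links every position of an alignment to the correspondence pattern it has been assigned to.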
71 | -------------------------------------------------------------------------------- /RELEASING.md: -------------------------------------------------------------------------------- 1 | 2 | # Releasing lingrex 3 | 4 | - Do platform tests via tox: 5 | ```shell 6 | tox -r 7 | ``` 8 | - Test if the workflow scripts still work: 9 | ```shell 10 | pytest -m"workflow" 11 | ``` 12 | 13 | - Make sure statement coverage >= 99% 14 | - Use black and flake8 to keep the code style consistent: 15 | ```shell 16 | flake8 src 17 | black src/lingrex/*.py 18 | ``` 19 | 20 | - Update the version number by removing the trailing `.dev0` in: 21 | - `setup.cfg` 22 | - `src/lingrex/__init__.py` 23 | 24 | - Check metadata in `.zenodo.json` 25 | 26 | - Create the release commit: 27 | ```shell 28 | git commit -a -m "release <version>" 29 | ``` 30 | 31 | - Create a release tag: 32 | ```shell 33 | git tag -a v<version> -m "<version> release" 34 | ``` 35 | 36 | - Release to PyPI: 37 | ```shell 38 | rm dist/* 39 | python setup.py sdist bdist_wheel 40 | twine upload dist/* 41 | ``` 42 | 43 | - Push to GitHub: 44 | ```shell 45 | git push origin 46 | git push --tags 47 | ``` 48 | 49 | - Change the version for the next release cycle, i.e. increment it and add the trailing `.dev0` again in: 50 | - `setup.cfg` 51 | - `src/lingrex/__init__.py` 52 | 53 | - Commit/push the version change: 54 | ```shell 55 | git commit -a -m "bump version for development" 56 | git push origin 57 | ``` 58 | -------------------------------------------------------------------------------- /pip-requirements.txt: -------------------------------------------------------------------------------- 1 | lingpy >= 2.6.8 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = lingrex 3 | version = 1.4.3.dev0 4 | author = Johann-Mattis List 5 | author_email = mattis.list@uni-passau.de 6 | description = Linguistic Reconstruction with LingPy 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | keywords = 10 | linguistics 11 | computational linguistics 12 | linguistic reconstruction 13 | cognate detection 14 | license = MIT 15 | license_files = LICENSE 16 | url = https://pypi.org/project/lingrex 17 | platforms = any 18 | classifiers = 19 | Development Status :: 5 - Production/Stable 20 | Intended Audience :: Developers 21 | Intended Audience :: Science/Research 22 | Natural Language :: English 23 | Operating System :: OS Independent 24 | Programming Language :: Python :: 3 25 | Programming Language :: Python :: 3.8 26 | Programming Language :: Python :: 3.9 27 | Programming Language :: Python :: 3.10 28 | Programming Language :: Python :: 3.11 29 | Programming Language :: Python :: 3.12 30 | Programming Language :: Python :: Implementation :: CPython 31 | Programming Language :: Python :: Implementation :: PyPy 32 | License :: OSI Approved :: MIT License 33 | 34 | [options] 35 | zip_safe = False 36 | packages = find: 37 | package_dir = 38 | = src 39 | python_requires = >=3.8 40 | install_requires = 41 | lingpy>=2.6.13 42 | include_package_data = True 43 | 44 | [options.packages.find] 45 | where = src 46 | 47 | [options.package_data] 48 | 49 | [options.entry_points] 50 | 51 | [options.extras_require] 52 | dev = 53 | build 54 | wheel 55 | twine 56 | tox 57 | black 58 | flake8 59 | 60 | test = 61 | pytest 62 | pytest-cov 63 | pytest-mock 64 | coverage 65 | 66 | 67 | 68 | [bdist_wheel] 69 | universal =
1 70 | 71 | [flake8] 72 | ignore = E711,E712,D100,D101,D103,D102,D301,E731 73 | max-line-length = 100 74 | exclude = .tox,cython 75 | 76 | [tool:pytest] 77 | minversion = 5 78 | testpaths = tests 79 | addopts = --cov 80 | 81 | [easy_install] 82 | zip_ok = false 83 | 84 | [coverage:run] 85 | source = 86 | lingrex 87 | tests 88 | 89 | [coverage:report] 90 | show_missing = true 91 | skip_covered = true 92 | 93 | 94 | [tox:tox] 95 | envlist = py38, py39, py310, py311 96 | isolated_build = true 97 | skip_missing_interpreter = true 98 | 99 | [testenv] 100 | deps = .[test] 101 | commands = pytest {posargs} 102 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup for LingRex 3 | """ 4 | from setuptools import setup 5 | setup() 6 | 7 | -------------------------------------------------------------------------------- /src/lingrex/__init__.py: -------------------------------------------------------------------------------- 1 | from lingrex.copar import CoPaR, density 2 | 3 | assert CoPaR and density 4 | __version__ = "1.4.3.dev0" 5 | -------------------------------------------------------------------------------- /src/lingrex/align.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various phonetic alignment functions. 3 | """ 4 | from lingpy import basictypes as bt 5 | 6 | 7 | def gap_free_pairwise(seqA, seqB, syllables=None, gap="-"): 8 | """ 9 | Carry out a gap-free alignment in which segments are merged instead of gapped. 10 | """ 11 | syllables = [] if syllables is None else syllables 12 | start = True 13 | merge = False 14 | outA, outB = [], [] 15 | for i, (charA, charB) in enumerate(zip(seqA, seqB)): 16 | if i in syllables: 17 | start = True 18 | if start and charB == gap: 19 | outA.append(charA + ">") 20 | merge = True 21 | elif not merge and charB == gap: 22 | outA[-1] += "<" + charA 23 | elif merge: 24 | if charB == gap: 25 | outA[-1] += charA + ">" 26 | else: 27 | outA[-1] += charA 28 | outB.append(charB) 29 | merge = False 30 | else: 31 | outA.append(charA) 32 | outB.append(charB) 33 | start = False 34 | return outA, outB 35 | 36 | 37 | def align_to_template(sequence, structures, template, gap="-"): 38 | """ 39 | Align a sequence to a template. 40 | """ 41 | if (len(sequence) != len(structures)) or (len(template) < len(sequence)): 42 | raise ValueError( 43 | "sequence {0} and structure {1} have different length".format( 44 | repr(sequence), repr(structures) 45 | ) 46 | ) 47 | if len([x for x in structures if x not in template]) != 0: 48 | raise ValueError( 49 | "{0} items in the structure {1} is not in the template".format( 50 | len([x for x in structures if x not in template]), repr(structures) 51 | ) 52 | ) 53 | 54 | out = [] 55 | idxA, idxB = 0, 0 56 | while idxB < len(template): 57 | if idxA < len(sequence): 58 | segment, structure = sequence[idxA], structures[idxA] 59 | else: 60 | segment, structure = gap, "" 61 | current_structure = template[idxB] 62 | if current_structure == structure: 63 | out.append(segment) 64 | idxA += 1 65 | else: 66 | out.append(gap) 67 | idxB += 1 68 | 69 | return out 70 | 71 | 72 | def shrink_alignments(alignments): 73 | """ 74 | Remove columns from alignment which all consist of gaps. 
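
Example
-------
A minimal, hypothetical case: the second column consists only of gaps
and is therefore removed.

>>> shrink_alignments([["a", "-", "b"], ["c", "-", "-"]])
[['a', 'b'], ['c', '-']]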
75 | """ 76 | excludes = [] 77 | for i in range(len(alignments[0])): 78 | col = set([line[i] for line in alignments]) 79 | if "-" in col and len(col) == 1: 80 | excludes.append(i) 81 | return [ 82 | [site for i, site in enumerate(alignment) if i not in excludes] 83 | for alignment in alignments 84 | ] 85 | 86 | 87 | def shrink(tokens, structures, converter): 88 | """ 89 | Shrink tokens according to the converter. 90 | 91 | .. note:: Works only for shrinking two structure elements so far. 92 | """ 93 | outt, outs = [], [] 94 | sm, merge = None, False 95 | for i in range(len(tokens)): 96 | if i > 0: 97 | sm = " ".join([structures[i - 1], structures[i]]) 98 | if sm in converter: 99 | outt += [tokens[i - 1] + tokens[i]] 100 | outs += [converter[sm]] 101 | merge = True 102 | elif not merge: 103 | outt += [tokens[i - 1]] 104 | outs += [converter.get(structures[i - 1], structures[i - 1])] 105 | else: 106 | merge = False 107 | if sm not in converter: 108 | outt += [tokens[i]] 109 | outs += [converter.get(structures[i], structures[i])] 110 | return outt, outs 111 | 112 | 113 | def shrink_template( 114 | wordlist, 115 | structure="structure", 116 | segments="tokens", 117 | converter={"i m": "I", "i": "I", "n c": "R", "n": "R", "c": "R"}, 118 | new_structure="structure2", 119 | new_tokens="tokens2", 120 | override=False, 121 | ): 122 | """ 123 | Reduce a template by merging certain parts of the structure. 124 | """ 125 | D = {} 126 | for idx, strucs, tokens in wordlist.iter_rows(structure, segments): 127 | D[idx] = shrink(tokens, strucs, converter) 128 | wordlist.add_entries(new_structure, D, lambda x: bt.lists(x[1]), override=override) 129 | wordlist.add_entries(new_tokens, D, lambda x: bt.lists(x[0]), override=override) 130 | 131 | 132 | def template_alignment( 133 | wordlist, 134 | ref="cogid", 135 | template="CCCCVVccccT_CCCCVVccccT_CCCCVVccccT_CCCCVVccccT_CCCCvvT", 136 | structure="structure", 137 | fuzzy=False, 138 | segments="tokens", 139 | gap="-", 140 | alignment="alignment", 141 | override=True, 142 | ): 143 | """ 144 | Function aligns the cognate sets in a wordlist to a template. 145 | 146 | Note 147 | ---- 148 | This function was first introduced in Wu et al. (2020). 149 | 150 | > Wu, M.-S., N. Schweikhard, T. Bodt, N. Hill, and J.-M. List (2020): 151 | > Computer-Assisted Language Comparison. State of the Art. Journal of Open 152 | > Humanities Data 6.2. 1-14. 
DOI: https://doi.org/10.5334/johd.12 153 | """ 154 | 155 | for idx, tokens, structures in wordlist.iter_rows(segments, structure): 156 | wordlist[idx, segments], wordlist[idx, structure] = bt.lists(tokens), bt.lists( 157 | structures 158 | ) 159 | 160 | etd = wordlist.get_etymdict(ref) 161 | A = {} 162 | if not fuzzy: 163 | for cogid, vals in etd.items(): 164 | idxs = [] 165 | for val in vals: 166 | if val: 167 | idxs += val 168 | alignments = shrink_alignments( 169 | [ 170 | align_to_template( 171 | wordlist[idx, segments], 172 | wordlist[idx, structure], 173 | template, 174 | gap=gap, 175 | ) 176 | for idx in idxs 177 | ] 178 | ) 179 | for idx, alm in zip(idxs, alignments): 180 | A[idx] = alm 181 | if fuzzy: 182 | cogid2alm = {} 183 | # only align the first item 184 | for cogid, vals in etd.items(): 185 | idxs, alms, strucs = [], [], [] 186 | for val in vals: 187 | if val: 188 | idxs += val 189 | alms += [ 190 | wordlist[idx, segments].n[wordlist[idx, ref].index(cogid)] 191 | for idx in val 192 | ] 193 | strucs += [ 194 | wordlist[idx, structure].n[wordlist[idx, ref].index(cogid)] 195 | for idx in val 196 | ] 197 | alignments = shrink_alignments( 198 | [ 199 | align_to_template(alm, struc, template, gap=gap) 200 | for alm, struc in zip(alms, strucs) 201 | ] 202 | ) 203 | for idx, alm in zip(idxs, alignments): 204 | cogid2alm[cogid, idx] = " ".join(alm) 205 | # second iteration, add the alignments per cogid 206 | for idx, cogids in wordlist.iter_rows(ref): 207 | A[idx] = bt.lists( 208 | " + ".join([cogid2alm.get((cogid, idx), "?") for cogid in cogids]) 209 | ) 210 | wordlist.add_entries(alignment, A, lambda x: x, override=override) 211 | -------------------------------------------------------------------------------- /src/lingrex/borrowing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic code for borrowing detection. 3 | """ 4 | import itertools 5 | import collections 6 | 7 | from lingpy import Pairwise 8 | from lingpy.compare.partial import Partial 9 | from lingpy.compare.lexstat import LexStat 10 | 11 | import networkx as nx 12 | 13 | from lingpy.util import pb 14 | 15 | 16 | def internal_cognates( 17 | wordlist, 18 | family="family", 19 | partial=True, 20 | method="lexstat", 21 | runs=10000, 22 | threshold=0.50, 23 | smooth=1, 24 | ratio=(2, 1), 25 | vscale=0.5, 26 | restricted_chars="_", 27 | modes=[("global", -1, 0.5), ("overlap", -1, 0.5)], 28 | ref="autocogids", 29 | cluster_method="upgma", 30 | model="sca", 31 | ): 32 | """ 33 | Cluster the data into cognate sets, but only inside each family. 34 | 35 | :param family: name of the column in which language family information can 36 | be found (defaults="family") 37 | 38 | Note 39 | ---- 40 | This method was first introduced by List and Forkel (2022). 41 | 42 | > List, J.-M. and R. Forkel (2022): Automated identification of borrowings 43 | > in multilingual wordlists [version 3; peer review: 4 approved]. Open 44 | > Research Europe 1.79. 1-11. 
DOI: https://doi.org/10.12688/openreseurope.13843.3 45 | """ 46 | families = {wordlist[k, family] for k in wordlist} 47 | 48 | # split data into parts 49 | D = {k: {} for k in sorted(families)} 50 | for idx, fam in wordlist.iter_rows(family): 51 | D[fam][idx] = [cell for cell in wordlist[idx]] 52 | 53 | gcogid = 0 54 | G = {} 55 | for fam, data in D.items(): 56 | data[0] = [h for h in wordlist.columns] 57 | if partial: 58 | lex = Partial(data, model=model) 59 | if method == "lexstat": 60 | lex.get_partial_scorer( 61 | runs=runs, 62 | smooth=smooth, 63 | ratio=ratio, 64 | vscale=vscale, 65 | restricted_chars=restricted_chars, 66 | modes=modes, 67 | ) 68 | lex.partial_cluster( 69 | ref=ref, 70 | method=method, 71 | cluster_method=cluster_method, 72 | threshold=threshold, 73 | ) 74 | else: 75 | lex = LexStat(data, model=model) 76 | if method == "lexstat": 77 | lex.get_scorer( 78 | runs=runs, 79 | smooth=smooth, 80 | ratio=ratio, 81 | vscale=vscale, 82 | restricted_chars=restricted_chars, 83 | modes=modes, 84 | ) 85 | lex.cluster( 86 | ref=ref, 87 | method=method, 88 | cluster_method=cluster_method, 89 | threshold=threshold, 90 | ) 91 | 92 | # prepare global cognate indicies 93 | if partial: 94 | C = {idx: len(lex[idx, ref]) * [0] for idx in lex} 95 | etd = lex.get_etymdict(ref=ref) 96 | for cogid, idxs in etd.items(): 97 | for idx_ in idxs: 98 | if idx_: 99 | for idx in idx_: 100 | cogids = lex[idx, ref] 101 | C[idx][cogids.index(cogid)] = cogid + gcogid 102 | else: 103 | C = {idx: 0 for idx in lex} 104 | etd = lex.get_etymdict(ref=ref) 105 | for cogid, idxs in etd.items(): 106 | for idx_ in idxs: 107 | if idx_: 108 | for idx in idx_: 109 | C[idx] = cogid + gcogid 110 | for idx in lex: 111 | G[idx] = C[idx] 112 | gcogid += max(etd) + 1 113 | 114 | renumber = {} 115 | cogid = 1 116 | if partial: 117 | for idx, vals in G.items(): 118 | f = wordlist[idx, family] 119 | new_cogids = [] 120 | for v in vals: 121 | if (f, v) in renumber: 122 | new_cogids += [renumber[f, v]] 123 | else: 124 | renumber[f, v] = cogid 125 | new_cogids += [cogid] 126 | cogid += 1 127 | G[idx] = new_cogids 128 | else: 129 | for idx, val in G.items(): 130 | f = wordlist[idx, family] 131 | if (f, val) not in renumber: 132 | renumber[f, val] = cogid 133 | cogid += 1 134 | G[idx] = renumber[f, val] 135 | 136 | wordlist.add_entries(ref, G, lambda x: x) 137 | 138 | 139 | def external_cognates( 140 | wordlist, 141 | cognates="autocogid", 142 | ref="autoborid", 143 | threshold=0.3, 144 | segments="tokens", 145 | gop=-1, 146 | family="family", 147 | doculect="doculect", 148 | concept="concept", 149 | align_mode="overlap", 150 | ): 151 | """ 152 | Compute language-external cognates and assign them to cognate sets. 153 | 154 | :param cognates: The column which holds previously calculated cognates. 155 | :param ref: The column which will store the new borrowing identifiers. 156 | :param family: The column storing family information. 157 | :param doculect: The column storing doculect information. 158 | 159 | Note 160 | ---- 161 | This method was first introduced by List and Forkel (2022). 162 | 163 | > List, J.-M. and R. Forkel (2022): Automated identification of borrowings 164 | > in multilingual wordlists [version 3; peer review: 4 approved]. Open 165 | > Research Europe 1.79. 1-11. 
DOI: https://doi.org/10.12688/openreseurope.13843.3 166 | """ 167 | 168 | B = {} 169 | borid = 1 170 | # iterate over the concepts 171 | for concept in pb(wordlist.rows): 172 | idxs = wordlist.get_list(row=concept, flat=True) 173 | for idx in idxs: 174 | B[idx] = 0 175 | taxa = [wordlist[idx, doculect] for idx in idxs] 176 | famis = [wordlist[idx, family] for idx in idxs] 177 | if len(set(famis)) > 1: 178 | G = nx.Graph() 179 | tokens = [wordlist[idx, segments] for idx in idxs] 180 | cogids = [wordlist[idx, cognates] for idx in idxs] 181 | 182 | # assemble cogids to groups 183 | groups = collections.defaultdict(list) 184 | for i, d, t, c in zip(idxs, taxa, tokens, cogids): 185 | groups[c] += [(i, d, t)] 186 | 187 | for group, items in groups.items(): 188 | G.add_node( 189 | str(group), 190 | concept=concept, 191 | taxa=", ".join([t[1] for t in items]), 192 | idxs=", ".join([str(t[0]) for t in items]), 193 | family=wordlist[[t[0] for t in items][0], family], 194 | ) 195 | 196 | # compare groups 197 | for (gA, iA), (gB, iB) in itertools.combinations(list(groups.items()), r=2): 198 | if G.nodes[str(gA)]["family"] != G.nodes[str(gB)]["family"]: 199 | wpairs = [ 200 | (" ".join(a[2]), " ".join(b[2])) 201 | for a, b in itertools.product(iA, iB) 202 | ] 203 | 204 | pairs = Pairwise(wpairs) 205 | pairs.align(distance=True, gop=gop, mode=align_mode) 206 | dst = [] 207 | for i, p in enumerate(pairs._alignments): 208 | dst += [p[2]] 209 | 210 | dst = sum(dst) / len(dst) 211 | if dst <= threshold: 212 | G.add_edge(str(gA), str(gB), distance=dst) 213 | 214 | # components 215 | for i, comp in enumerate(nx.connected_components(G)): 216 | if len(comp) > 1: 217 | table = [] 218 | for cogid in comp: 219 | idxs = [int(x) for x in G.nodes[cogid]["idxs"].split(", ")] 220 | for idx in idxs: 221 | table += [ 222 | [ 223 | wordlist[idx, doculect], 224 | wordlist[idx, concept], 225 | str(wordlist[idx, segments]), 226 | wordlist[idx, family], 227 | cogid, 228 | ] 229 | ] 230 | for idx in idxs: 231 | B[idx] = borid 232 | borid += 1 233 | wordlist.add_entries(ref, B, lambda x: x) 234 | -------------------------------------------------------------------------------- /src/lingrex/cognates.py: -------------------------------------------------------------------------------- 1 | """ 2 | Operations with cognate sets. 3 | """ 4 | import collections 5 | 6 | from clldutils.text import strip_brackets, split_text 7 | import lingpy 8 | 9 | 10 | def common_morpheme_cognates( 11 | wordlist, cognates="cogids", ref="autoid", morphemes="automorphemes", override=True 12 | ): 13 | """ 14 | Convert partial cognates to full cognates. 15 | 16 | Note 17 | ---- 18 | This method was first introduced by Wu and List (to appear). 19 | 20 | > Wu, Mei-Shin and List, Johann-Mattis (to appear): Annotating cognates in 21 | > phylogenetic studies of South-East Asian languages. Language Dynamics and 22 | > Change. 
Preprint: https://doi.org/10.17613/rabq-7z45 23 | """ 24 | 25 | C, M = {}, {} 26 | current = 1 27 | for concept in wordlist.rows: 28 | base = split_text(strip_brackets(concept))[0].upper().replace(" ", "_") 29 | idxs = wordlist.get_list(row=concept, flat=True) 30 | cogids = collections.defaultdict(list) 31 | for idx in idxs: 32 | M[idx] = [c for c in wordlist[idx, cognates]] 33 | for cogid in lingpy.basictypes.ints(wordlist[idx, cognates]): 34 | cogids[cogid] += [idx] 35 | for i, (cogid, idxs) in enumerate( 36 | sorted(cogids.items(), key=lambda x: len(x[1]), reverse=True) 37 | ): 38 | for idx in idxs: 39 | if idx not in C: 40 | C[idx] = current 41 | M[idx][M[idx].index(cogid)] = base 42 | else: 43 | M[idx][M[idx].index(cogid)] = "_" + base.lower() 44 | current += 1 45 | wordlist.add_entries(ref, C, lambda x: x) 46 | if morphemes: 47 | wordlist.add_entries(morphemes, M, lambda x: x, override=override) 48 | 49 | 50 | def salient_cognates( 51 | wordlist, cognates="cogids", ref="newcogid", morphemes="morphemes", override=True 52 | ): 53 | """ 54 | Convert partial cognates to full cognates ignoring non-salient cognate sets. 55 | 56 | Note 57 | ---- 58 | This method was first introduced by Wu and List (to appear). 59 | 60 | > Wu, Mei-Shin and List, Johann-Mattis (to appear): Annotating cognates in 61 | > phylogenetic studies of South-East Asian languages. Language Dynamics and 62 | > Change. Preprint: https://doi.org/10.17613/rabq-7z45 63 | """ 64 | 65 | lookup, D = {}, {} 66 | for idx, cogids, morphemes in wordlist.iter_rows(cognates, morphemes): 67 | selected_cogids = [] 68 | for cogid, morpheme in zip(cogids, morphemes): 69 | if not morpheme.startswith("_"): 70 | selected_cogids += [cogid] 71 | salient = tuple(selected_cogids) 72 | if salient in lookup: 73 | D[idx] = lookup[salient] 74 | elif D.values(): 75 | next_cogid = max(D.values()) + 1 76 | lookup[salient] = next_cogid 77 | D[idx] = next_cogid 78 | else: 79 | lookup[salient] = 1 80 | D[idx] = 1 81 | 82 | wordlist.add_entries(ref, D, lambda x: x, override=override) 83 | -------------------------------------------------------------------------------- /src/lingrex/colex.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for partial colexification manipulations. 3 | """ 4 | import collections 5 | 6 | 7 | def find_bad_internal_alignments(alignments, ref="cogids"): 8 | """ 9 | Helper function discards wrongly assigned cross-semantic cognates. 10 | 11 | Note 12 | ---- 13 | The function essentially iterates over the alignments and picks 14 | out those in which the same language has the same cognate ID, and if 15 | the alignment itself differs, it assigns it a new cognate ID. It 16 | presupposes that the data has not been analyzed in search for 17 | cross-semantic cognates. 
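
Example
-------
A rough usage sketch (assuming an ``Alignments`` object ``alms`` whose
partial cognate sets are stored in the column ``cogids``)::

    find_bad_internal_alignments(alms, ref="cogids")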
18 | """ 19 | newIDs = {} 20 | 21 | def get_all_indices(lst): 22 | idxs = collections.defaultdict(list) 23 | for i, l in enumerate(lst): 24 | idxs[l] += [i] 25 | return idxs 26 | 27 | new_cogid = max(alignments.msa[ref]) + 1 28 | for cogid, msa in alignments.msa[ref].items(): 29 | idxs = [i for t, i in get_all_indices(msa["taxa"]).items() if len(i) > 1] 30 | for idx in idxs: 31 | tups = [tuple(msa["alignment"][x]) for x in idx] 32 | if len(set(tups)) > 1: 33 | bestt = sorted(tups, key=lambda x: tups.count(x), reverse=True)[0] 34 | for x in idx: 35 | if tuple(msa["alignment"][x]) != bestt: 36 | newIDs[msa["ID"][x]] = (cogid, new_cogid) 37 | new_cogid += 1 38 | 39 | for idx, (cogid, new_cogid) in newIDs.items(): 40 | this_idx = alignments[idx, ref].index(cogid) 41 | alignments[idx, ref][this_idx] = new_cogid 42 | 43 | 44 | def expand_alignment(msa, taxa, missing="Ø"): 45 | """ 46 | Expand an alignment by adding a symbol for missing taxa. 47 | """ 48 | out = [] 49 | for taxon in taxa: 50 | if taxon in msa["taxa"]: 51 | tidx = msa["taxa"].index(taxon) 52 | out.append( 53 | [x.split("/")[1] if "/" in x else x for x in msa["alignment"][tidx]] 54 | ) 55 | else: 56 | out.append(len(msa["alignment"][0]) * [missing]) 57 | return out 58 | 59 | 60 | def compatible(msa1, msa2, missing="Ø", gap="-"): 61 | """ 62 | Compare two alignments and check whether they colexify. 63 | """ 64 | matches = 0 65 | for line1, line2 in zip(msa1, msa2): 66 | if [x for x in line1 if x != gap] == [ 67 | x for x in line2 if x != gap 68 | ] and missing not in line1 + line2: 69 | matches += 1 70 | else: 71 | if list(set(line1))[0] != missing and list(set(line2))[0] != missing: 72 | return False 73 | return matches 74 | 75 | 76 | def merge_alignments(almA, almB, missing="Ø", gap="-"): 77 | """ 78 | Merge two alignments which are compatible. 
79 | """ 80 | out = [] 81 | missing_taxa = [] 82 | for k, (a, b) in enumerate(zip(almA, almB)): 83 | if ( 84 | len(set(a)) == 1 85 | and list(set(a))[0] == missing # noqa: W503 86 | and len(set(b)) == 1 # noqa: W503 87 | and list(set(b))[0] == missing # noqa: W503 88 | ): 89 | missing_taxa += [k] 90 | i, j = 0, 0 91 | while i < len(almA[0]) and j < len(almB[0]): 92 | colA, colB = [row[i] for row in almA], [row[j] for row in almB] 93 | if colA == colB: 94 | out += [colA] 95 | i += 1 96 | j += 1 97 | else: 98 | col = [] 99 | for a, b in zip(colA, colB): 100 | if a == gap and a != b and b != missing: 101 | ncol = [] 102 | for k, c in enumerate(colA): 103 | if c == missing and k not in missing_taxa: 104 | ncol += [gap] 105 | else: 106 | ncol += [c] 107 | out += [ncol] 108 | i += 1 109 | col = [] 110 | break 111 | if b == gap and a != b and a != missing: 112 | ncol = [] 113 | for k, c in enumerate(colB): 114 | if c == missing and k not in missing_taxa: 115 | ncol += [gap] 116 | else: 117 | ncol += [c] 118 | out += [ncol] 119 | j += 1 120 | col = [] 121 | break 122 | 123 | col.append(b if a == missing else a) 124 | if col: 125 | out += [col] 126 | i += 1 127 | j += 1 128 | if i < len(almA[0]): 129 | ncol = [] 130 | for k, c in enumerate([row[i] for row in almA]): 131 | if c == missing and k not in missing_taxa: 132 | ncol += [gap] 133 | else: 134 | ncol += [c] 135 | out += [ncol] 136 | elif j < len(almB[0]): 137 | ncol = [] 138 | for k, c in enumerate([row[j] for row in almB]): 139 | if c == missing and k not in missing_taxa: 140 | ncol += [gap] 141 | else: 142 | ncol += [c] 143 | out += [ncol] 144 | 145 | nalm = [] 146 | for i in range(len(out[0])): 147 | nalm += [[row[i] for row in out]] 148 | return nalm 149 | 150 | 151 | def find_colexified_alignments( 152 | alignments, cognates="cogids", missing="Ø", ref="crossids" 153 | ): 154 | """ 155 | Identify identical alignments in a dataset and label them as homophones. 156 | 157 | Note 158 | ---- 159 | This function was first introduced in Wu et al. (2020). 160 | 161 | > Wu, M.-S., N. Schweikhard, T. Bodt, N. Hill, and J.-M. List (2020): 162 | > Computer-Assisted Language Comparison. State of the Art. Journal of Open 163 | > Humanities Data 6.2. 1-14. 
DOI: https://doi.org/10.5334/johd.12 164 | """ 165 | 166 | queue = [] 167 | for cogid, msa in sorted( 168 | alignments.msa[cognates].items(), 169 | key=lambda x: len(set(x[1]["taxa"])), 170 | reverse=True, 171 | ): 172 | queue += [(cogid, expand_alignment(msa, alignments.taxa, missing=missing))] 173 | 174 | merged = {} 175 | 176 | while queue: 177 | this_cogid, this_msa = queue.pop(0) 178 | deletes = [] 179 | merged[this_cogid] = this_cogid 180 | for i, (other_cogid, other_msa) in enumerate(queue): 181 | if compatible(this_msa, other_msa) >= 1: 182 | this_msa = merge_alignments(this_msa, other_msa) 183 | merged[other_cogid] = this_cogid 184 | deletes += [i] 185 | 186 | for i in deletes[::-1]: 187 | del queue[i] 188 | 189 | # assemble the clusters now 190 | if alignments._mode == "fuzzy": 191 | alignments.add_entries(ref, cognates, lambda x: [merged.get(y, y) for y in x]) 192 | else: 193 | alignments.add_entries(ref, cognates, lambda x: merged.get(x, x)) 194 | -------------------------------------------------------------------------------- /src/lingrex/copar.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pathlib 3 | import itertools 4 | import collections 5 | 6 | from lingpy.sequence.sound_classes import class2tokens 7 | from lingpy.settings import rc 8 | from lingpy.align.sca import get_consensus, Alignments 9 | from lingpy.util import pb 10 | from lingpy import log 11 | from lingpy import basictypes as bt 12 | 13 | import networkx as nx 14 | 15 | 16 | def consensus_pattern(patterns, missing="Ø"): 17 | """ 18 | Return consensus pattern of multiple patterns. 19 | 20 | :param patterns: list of patterns 21 | :param missing: the character used to represent missing values 22 | 23 | .. note:: This consensus method raises an error if the patterns contain incompatible 24 | columns (non-identical values apart from the missing data character in the same 25 | column). 26 | """ 27 | out = [] 28 | for i in range(len(patterns[0])): 29 | col = [line[i] for line in patterns] 30 | no_gaps = [x for x in col if x != missing] 31 | if len(set(no_gaps)) > 1: 32 | raise ValueError("Your patterns are incompatible") 33 | out += [no_gaps[0] if no_gaps else missing] 34 | return tuple(out) 35 | 36 | 37 | def incompatible_columns(patterns, missing="Ø"): 38 | """ 39 | Compute whether a pattern has incompatible columns. 40 | """ 41 | columns = [] 42 | for i in range(len(patterns[0])): 43 | col = [ 44 | patterns[j][i] for j in range(len(patterns)) if patterns[j][i] != missing 45 | ] 46 | columns.append("*" if len(set(col)) > 1 else "") 47 | return columns 48 | 49 | 50 | def score_patterns(patterns, missing="Ø", mode="coverage"): 51 | """ 52 | Function gives a score for the overall number of reflexes. 53 | 54 | .. note:: This score tells simply to which degree a pattern is filled. It divides the 55 | number of cells not containing missing data by the number of cells in the 56 | matrix. 
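
Example
-------
Two small, hypothetical patterns: incompatible columns yield -1, and
otherwise the score reflects how densely the pattern matrix is filled.

>>> score_patterns([("t", "a"), ("t", "b")])
-1
>>> round(score_patterns([("t", "t", "Ø"), ("t", "Ø", "t")], mode="coverage"), 2)
0.67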
57 | """ 58 | # return -1 if the patterns are not compatible 59 | for i in range(len(patterns[0])): 60 | if len(set([row[i] for row in patterns if row[i] != missing])) > 1: 61 | return -1 62 | if len(patterns) <= 1: 63 | return -1 64 | 65 | if mode not in ["ranked", "pairs", "squared", "coverage"]: 66 | raise ValueError("you must select an appropriate mode") 67 | 68 | # we rank the columns by sorting them first 69 | if mode == "ranked": 70 | cols = [] 71 | for i in range(len(patterns[0])): 72 | cols += [sum([0 if row[i] == missing else 1 for row in patterns])] 73 | # sort the columns 74 | ranks, cols = list(range(1, len(cols) + 1))[::-1], sorted(cols, reverse=True) 75 | scores = [] 76 | for rank, col in zip(ranks, cols): 77 | scores += [rank * col] 78 | return sum(scores) / sum(ranks) / len(patterns) 79 | 80 | if mode == "squared": 81 | psize = len(patterns[0]) 82 | scores = [((psize - row.count(missing)) / psize) ** 2 for row in patterns] 83 | return sum(scores) / len(scores) 84 | 85 | if mode == "pairs": 86 | # count the number of pairs in the data 87 | pairs = 0 88 | covered = 0 89 | m, n = len(patterns[0]), len(patterns) 90 | for i in range(n): 91 | vals = m - patterns[i].count(missing) 92 | pairs += (vals**2 - vals) / 2 93 | for i in range(m): 94 | vals = n - [p[i] for p in patterns].count(missing) 95 | pairs += (vals**2 - vals) / 2 96 | if vals != 0: 97 | covered += 1 98 | return ((pairs / n) / covered) / m 99 | 100 | if mode == "coverage": 101 | cols = [] 102 | for i in range(len(patterns[0])): 103 | col = [row[i] for row in patterns] 104 | cols += [len(patterns) - col.count(missing)] 105 | return (sum(cols) / len(patterns[0])) / len(patterns) # * len(patterns[0])) 106 | 107 | 108 | def compatible_columns(colA, colB, missing="Ø", gap="-"): 109 | """Check for column compatibility. 110 | 111 | Parameters 112 | ---------- 113 | colA, colB = list 114 | Lists (sequence type) containing a given pattern. 115 | missing : str (default="Ø") 116 | A gap in the sense of "missing data", that is, a cognate set for which 117 | a value in a given language is absent. 118 | 119 | Returns 120 | ------- 121 | matches, mismatches : tuple 122 | The score for matches gives zero if there is no conflict but also no 123 | match. For mismatches it is accordingly. So if you seek for 124 | compatibility, a mismatch greater 0 means the patterns are not 125 | compatible. 126 | """ 127 | matches, mismatches = 0, 0 128 | for a, b in zip(colA, colB): 129 | if missing not in [a, b]: 130 | if a != b: 131 | mismatches += 1 132 | else: 133 | if a != gap: 134 | matches += 1 135 | return matches, mismatches 136 | 137 | 138 | def density(wordlist, ref="cogid"): 139 | """Compute the density of a wordlist. 140 | 141 | Note 142 | ---- 143 | We define the density of a wordlist by measuring how many words can be 144 | explained by the same cognate set. 145 | """ 146 | scores = [] 147 | for concept in wordlist.rows: 148 | idxs = wordlist.get_list(row=concept, flat=True) 149 | cogids = [wordlist[idx, ref] for idx in idxs] 150 | sums = [1 / cogids.count(cogid) for idx, cogid in zip(idxs, cogids)] 151 | scores.append(sum(sums) / len(sums)) 152 | return 1 - sum(scores) / len(scores) 153 | 154 | 155 | class CoPaR(Alignments): 156 | """Correspondence Pattern Recognition class 157 | 158 | Parameters 159 | ---------- 160 | wordlist : ~lingpy.basic.wordlist.Wordlist 161 | A wordlist object which should have a column for segments and a column 162 | for cognate sets. 
Since the class inherits from LingPy's 163 | Alignments-class, the same kind of data should be submitted. 164 | ref : str (default="cogid") 165 | The column which stores the cognate sets. 166 | segments : str (default="tokens") 167 | The column which stores the segmented transcriptions. 168 | alignment : str (default="alignment") 169 | The column which stores the alignments (or will store the alignments if 170 | they have not yet been computed). 171 | 172 | Note 173 | ---- 174 | This method was first introduced in List (2019). 175 | 176 | > List, J.-M. (2019): Automatic inference of sound correspondence patterns 177 | > across multiple languages. Computational Linguistics 45.1. 137-161. DOI: 178 | > http://doi.org/10.1162/coli_a_00344 179 | """ 180 | 181 | def __init__( 182 | self, 183 | wordlist, 184 | minrefs=3, 185 | ref="cogids", 186 | structure="structure", 187 | missing="Ø", 188 | gap="-", 189 | irregular="!?", 190 | **keywords 191 | ): 192 | Alignments.__init__(self, wordlist, ref=ref, **keywords) 193 | self.ref = ref 194 | self._structure = structure 195 | self.minrefs = minrefs 196 | self.missing = missing 197 | self.gap = gap 198 | self.irregular = irregular 199 | if structure not in self.columns: 200 | raise ValueError("no column {0} for structure was found".format(structure)) 201 | 202 | def positions_from_prostrings(self, cogid, indices, alignment, structures): 203 | """ 204 | Return positions matching from an alignment and user-defined prosodic strings 205 | """ 206 | if self._mode == "fuzzy": 207 | strucs = [] 208 | for idx, struc, alm in zip(indices, structures, alignment): 209 | pos_ = self[idx, self._ref].index(cogid) 210 | strucs += [class2tokens(struc.n[pos_], alm)] 211 | else: 212 | strucs = [ 213 | class2tokens(struc, alm) for struc, alm in zip(structures, alignment) 214 | ] 215 | get_consensus(alignment, gaps=True) 216 | prostring = [] 217 | for i in range(len(strucs[0])): 218 | row = [x[i] for x in strucs if x[i] != "-"] 219 | prostring += [row[0] if row else "+"] 220 | return [(i, p) for i, p in enumerate(prostring)] 221 | 222 | def reflexes_from_pos( 223 | self, position, taxa, current_taxa, alignment, missing, irregular 224 | ): 225 | reflexes = [] 226 | for t in taxa: 227 | if t not in current_taxa: 228 | reflexes += [missing] 229 | else: 230 | reflex = alignment[current_taxa.index(t)][position] 231 | if "/" in reflex: 232 | reflex = reflex.split("/")[1] 233 | elif reflex[0] in irregular: 234 | reflex = missing 235 | reflexes += [reflex] 236 | return reflexes 237 | 238 | def _check(self): 239 | """ 240 | Check for problematic patterns in the data. 241 | """ 242 | errors = [] 243 | for idx, struc, alm in self.iter_rows(self._structure, self._alignment): 244 | self[idx, self._structure] = self._str_type(struc) 245 | self[idx, self._alignment] = self._str_type(alm) 246 | if not len(self[idx, self._structure]) == len( 247 | [x for x in self[idx, self._alignment] if x != "-"] 248 | ): 249 | print( 250 | idx, 251 | self[idx, self._structure], 252 | "|", 253 | self[idx, self._alignment], 254 | "|", 255 | self[idx, "tokens"], 256 | ) 257 | log.warning("alignment and structure do not match in {0}".format(idx)) 258 | errors += [idx] 259 | return errors 260 | 261 | def get_sites(self): 262 | """ 263 | Retrieve the alignment sites of interest for initial analysis. 
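
Example
-------
A rough usage sketch, assuming a ``CoPaR`` instance ``cop`` set up with
aligned data::

    cop.get_sites()
    # cop.sites now maps (cogid, position) to the structure symbol and
    # the tuple of reflexes at that alignment site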
264 | """ 265 | sites, all_sites, taxa = ( 266 | collections.OrderedDict(), 267 | collections.OrderedDict(), 268 | self.cols, 269 | ) 270 | errors = self._check() 271 | if errors: 272 | raise ValueError("found {0} problems in the data".format(len(errors))) 273 | 274 | # iterate over all sites in the alignment 275 | visited = [] 276 | for cogid, msa in pb( 277 | sorted(self.msa[self.ref].items()), 278 | desc="CoPaR: get_patterns()", 279 | total=len(self.msa[self.ref]), 280 | ): 281 | # get essential data: taxa, alignment, etc. 282 | _taxa = [t for t in taxa if t in msa["taxa"]] 283 | _idxs = {t: msa["taxa"].index(t) for t in _taxa} 284 | _alms = [msa["alignment"][_idxs[t]] for t in _taxa] 285 | _wlid = [msa["ID"][_idxs[t]] for t in _taxa] 286 | 287 | # store visited entries 288 | visited += msa["ID"] 289 | if len(_taxa) >= self.minrefs: 290 | if self._mode == "fuzzy": 291 | _strucs = [] 292 | for _widx in _wlid: 293 | _these_strucs = self[_widx, self._structure] 294 | _strucs += [_these_strucs] 295 | else: 296 | _strucs = [self[idx, self._structure] for idx in _wlid] 297 | positions = self.positions_from_prostrings(cogid, _wlid, _alms, _strucs) 298 | for pidx, pos in positions: 299 | reflexes = self.reflexes_from_pos( 300 | pidx, taxa, _taxa, _alms, self.missing, self.irregular 301 | ) 302 | sites[cogid, pidx] = [pos, tuple(reflexes)] 303 | for pidx in range(len(_alms[0])): 304 | reflexes = self.reflexes_from_pos( 305 | pidx, taxa, _taxa, _alms, self.missing, self.irregular 306 | ) 307 | all_sites[cogid, pidx] = reflexes 308 | 309 | # add non-visited segments 310 | for idx in [i for i in self if i not in visited]: 311 | if self._mode == "fuzzy": 312 | for tt, ss, cogid in zip( 313 | self[idx, self._segments].n, 314 | self[idx, self._structure].n, 315 | self[idx, self._ref], 316 | ): 317 | for i, (t, s) in enumerate(zip(tt, ss)): 318 | all_sites[cogid, i] = [ 319 | self.missing if tax != self[idx][self._colIdx] else t 320 | for tax in self.cols 321 | ] 322 | else: 323 | for i, (t, s) in enumerate( 324 | zip(self[idx, self._segments], self[idx, self._structure]) 325 | ): 326 | all_sites[self[idx, self.ref], i] = [ 327 | self.missing if tax != self[idx][self._colIdx] else t 328 | for tax in self.cols 329 | ] 330 | 331 | self.sites = sites 332 | self.all_sites = all_sites 333 | 334 | def cluster_sites(self, match_threshold=1, score_mode="pairs"): 335 | """Cluster alignment sites using greedy clique cover. 336 | :param match_threshold: The threshold of matches for accepting two 337 | compatible columns. 338 | :param score_mode: select between "pairs", "coverage" 339 | 340 | .. note:: This algorithm follows the spirit of the Welsh-Powell algorithm for 341 | graph coloring. Since graph coloring is the inverse of clique 342 | partitioning, we can use the algorithm in the same spirit. 
343 | 344 | """ 345 | if not hasattr(self, "clusters"): 346 | self.clusters = collections.defaultdict(list) 347 | for (cogid, idx), (pos, ptn) in self.sites.items(): 348 | self.clusters[pos, ptn] += [(cogid, idx)] 349 | clusters = self.clusters 350 | while True: 351 | prog = 0 352 | with pb( 353 | desc="CoPaR: cluster_sites()", total=len(self.clusters) 354 | ) as progress: 355 | sorted_clusters = sorted( 356 | clusters.items(), 357 | key=lambda x: ( 358 | score_patterns( 359 | [self.sites[y][1] for y in x[1]], mode=score_mode 360 | ), 361 | len(x[1]), 362 | ), 363 | reverse=True, 364 | ) 365 | out = [] 366 | while sorted_clusters: 367 | ((this_pos, this_cluster), these_vals), remaining_clusters = ( 368 | sorted_clusters[0], 369 | sorted_clusters[1:], 370 | ) 371 | queue = [] 372 | for (next_pos, next_cluster), next_vals in remaining_clusters: 373 | match, mism = compatible_columns( 374 | this_cluster, 375 | next_cluster, 376 | missing=self.missing, 377 | gap=self.gap, 378 | ) 379 | if ( 380 | this_pos == next_pos 381 | and match >= match_threshold # noqa: W503 382 | and mism == 0 # noqa: W503 383 | ): 384 | this_cluster = consensus_pattern( 385 | [this_cluster, next_cluster] 386 | ) 387 | these_vals += next_vals 388 | else: 389 | queue += [((next_pos, next_cluster), next_vals)] 390 | sorted_clusters = queue 391 | out += [((this_pos, this_cluster), these_vals)] 392 | progress.update(len(self.sites) - len(queue) - prog) 393 | prog = len(self.sites) - len(queue) 394 | clusters = {tuple(a): b for a, b in out} 395 | alls = [c for c in clusters] 396 | match = 0 397 | for i, (_a, a) in enumerate(alls): 398 | for j, (_b, b) in enumerate(alls): 399 | if i < j and _a == _b: 400 | ma, mi = compatible_columns( 401 | a, b, missing=self.missing, gap=self.gap 402 | ) 403 | if ma and not mi: 404 | match += 1 405 | if not match: 406 | break 407 | else: 408 | log.warning( 409 | "iterating, since {0} clusters can further be merged".format( 410 | match 411 | ) 412 | ) 413 | self.clusters = clusters 414 | self.ordered_clusters = sorted(clusters, key=lambda x: len(x[1])) 415 | 416 | def sites_to_pattern(self, threshold=1): 417 | """Algorithm assigns alignment sites to patterns. 418 | 419 | Notes 420 | ----- 421 | We rank according to general compatibility. 422 | """ 423 | asites = collections.defaultdict(list) 424 | for consensus in pb( 425 | self.clusters, desc="CoPaR: sites_to_pattern()", total=len(self.clusters) 426 | ): 427 | sites = self.clusters[consensus] 428 | for cog, pos in sites: 429 | struc, pattern = self.sites[cog, pos] 430 | for strucB, consensusB in self.clusters: 431 | ma, mi = compatible_columns(pattern, consensusB) 432 | if struc == strucB and not mi and ma >= threshold: 433 | asites[cog, pos] += [(ma, struc, consensusB)] 434 | self.patterns = asites 435 | 436 | def fuzziness(self): 437 | return sum([len(b) for a, b in self.patterns.items()]) / len(self.patterns) 438 | 439 | def irregular_patterns(self, accepted=2, matches=1, irregular_prefix="!"): 440 | """ 441 | Try to assign irregular patterns to accepted patterns. 442 | 443 | Parameters 444 | ---------- 445 | accepted : int (default=2) 446 | Minimal size of clusters that we regard as regular. 
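
Example
-------
A usage sketch, assuming sites have been clustered on a ``CoPaR``
instance ``cop`` before::

    cop.irregular_patterns(accepted=2)
    # cop.ipatterns now maps regular patterns to the singleton patterns
    # that could be attached to them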
447 | 448 | """ 449 | bad_clusters = [ 450 | (clr, pts[0]) for clr, pts in self.clusters.items() if len(pts) == 1 451 | ] 452 | good_clusters = sorted( 453 | [(clr, pts) for clr, pts in self.clusters.items() if len(pts) >= accepted], 454 | key=lambda x: len(x[1]), 455 | reverse=True, 456 | ) 457 | new_clusters = {clr: [] for clr, pts in good_clusters} 458 | irregular_patterns = [] 459 | for clr, ptn in bad_clusters: 460 | if ptn.count(self.missing) <= 2: 461 | for clrB, pts in good_clusters: 462 | match, mism = compatible_columns(clr[1], clrB[1]) 463 | if mism <= matches and match > matches: 464 | new_clusters[clrB] += [clr] 465 | irregular_patterns += [clr] 466 | break 467 | # re-assign alignments to the data by adding the irregular character 468 | for key, value in sorted( 469 | new_clusters.items(), key=lambda x: len(x[1]), reverse=True 470 | ): 471 | if len(value) > 0: 472 | for i, pattern in enumerate(value): 473 | pt = [] 474 | for lid, (a, b) in enumerate(zip(key[1], pattern[1])): 475 | if a != b and self.missing not in [a, b]: 476 | pt += [irregular_prefix + b] 477 | # assign pattern to the corresponding alignments 478 | for cogid, position in self.clusters[pattern]: 479 | if self._mode == "fuzzy": 480 | word_indices = self.etd[self.ref][cogid][lid] 481 | if word_indices: 482 | for widx in word_indices: 483 | # get the position in the alignment 484 | alms = self[widx, self._alignment].n 485 | cog_pos = self[widx, self.ref].index(cogid) 486 | new_alm = alms[cog_pos] 487 | new_alm[position] = "{0}{1}/{2}".format( 488 | irregular_prefix, b, a 489 | ) 490 | alms[cog_pos] = new_alm 491 | self[ 492 | widx, self._alignment 493 | ] = self._str_type( 494 | " + ".join( 495 | [" ".join(x) for x in alms] 496 | ).split() 497 | ) 498 | else: 499 | word_indices = self.etd[self.ref][cogid][lid] 500 | if word_indices: 501 | for widx in word_indices: 502 | alm = self._str_type( 503 | self[widx, self._alignment] 504 | ) 505 | alm[position] = "{0}{1}/{2}".format( 506 | irregular_prefix, b, a 507 | ) 508 | self[ 509 | widx, self._alignment 510 | ] = self._str_type(" ".join(alm)) 511 | else: 512 | pt += [b] 513 | 514 | self.ipatterns = new_clusters 515 | for pattern, data in [ 516 | (a, b) for a, b in bad_clusters if a not in irregular_patterns 517 | ]: 518 | cogid, position = data 519 | if self._mode == "fuzzy": 520 | for indices in [idx for idx in self.etd[self.ref][cogid] if idx]: 521 | for widx in indices: 522 | cog_pos = self[widx, self.ref].index(cogid) 523 | alms = self[widx, self._alignment].n 524 | new_alm = alms[cog_pos] 525 | new_alm[position] = "{0}{1}".format( 526 | irregular_prefix, new_alm[position] 527 | ) 528 | alms[cog_pos] = new_alm 529 | self[widx, self._alignment] = self._str_type( 530 | " + ".join([" ".join(x) for x in alms]).split() 531 | ) 532 | 533 | return new_clusters 534 | 535 | def load_patterns(self, patterns="patterns"): 536 | self.id2ptn = collections.OrderedDict() 537 | self.clusters = collections.OrderedDict() 538 | self.id2pos = collections.defaultdict(set) 539 | self.sites = collections.OrderedDict() 540 | # get the template 541 | template = [self.missing for m in self.cols] 542 | tidx = {self.cols[i]: i for i in range(self.width)} 543 | for idx, ptn, alm, struc, doc, cogs in self.iter_rows( 544 | patterns, self._alignment, self._structure, "doculect", self._ref 545 | ): 546 | if self._mode == "fuzzy": 547 | ptn = bt.lists(ptn) 548 | for i in range(len(alm.n)): 549 | for j, (p, a) in enumerate(zip(ptn.n[i], alm.n[i])): 550 | if not p == "0/n": 551 | this_pattern = 
self.id2ptn.get(p, [t for t in template]) 552 | if this_pattern[tidx[doc]] == "Ø": 553 | this_pattern[tidx[doc]] = a 554 | self.id2ptn[p] = this_pattern 555 | self.id2pos[p].add((cogs[i], j)) 556 | else: 557 | for j, (p, a) in enumerate(zip(ptn, alm)): 558 | if not p == "0/n": 559 | this_pattern = self.id2ptn.get(p, [t for t in template]) 560 | if this_pattern[tidx[doc]] == "Ø": 561 | this_pattern[tidx[doc]] = a 562 | self.id2ptn[p] = this_pattern 563 | self.id2pos[p].add((cogs, j)) 564 | 565 | self.ptn2id = {tuple(v): k for k, v in self.id2ptn.items()} 566 | for k, v in self.id2ptn.items(): 567 | self.clusters[tuple(v)] = list(self.id2pos[k]) 568 | self.id2pos[k] = list(self.id2pos[k]) 569 | for s in self.id2pos[k]: 570 | self.sites[s] = [(len(self.id2pos[k]), tuple(v))] 571 | 572 | def add_patterns( 573 | self, ref="patterns", irregular_patterns=False, proto=False, override=True 574 | ): 575 | """Assign patterns to a new column in the word list.""" 576 | if not hasattr(self, "id2ptn"): 577 | self.id2ptn = {} 578 | if not hasattr(self, "pattern2id"): 579 | self.ptn2id = {} 580 | if proto: 581 | pidx = self.cols.index(proto) 582 | else: 583 | pidx = 0 584 | 585 | if irregular_patterns: 586 | new_clusters = collections.defaultdict(list) 587 | for reg, iregs in self.ipatterns.items(): 588 | for cogid, position in self.clusters[reg]: 589 | new_clusters[reg] += [(cogid, position)] 590 | for ireg in iregs: 591 | for cogid, position in self.clusters[ireg]: 592 | new_clusters[reg] += [(cogid, position)] 593 | else: 594 | new_clusters = self.clusters 595 | for pattern, rest in self.clusters.items(): 596 | for cogid, position in rest: 597 | if (cogid, position) not in new_clusters[pattern]: 598 | new_clusters[pattern] += [(cogid, position)] 599 | 600 | P = { 601 | idx: bt.lists( 602 | [ 603 | "0" if x not in rc("morpheme_separators") else "+" 604 | for x in self[idx, self._alignment] 605 | ] 606 | ) 607 | for idx in self 608 | } 609 | for i, ((struc, pattern), data) in enumerate( 610 | sorted(new_clusters.items(), key=lambda x: len(x), reverse=True) 611 | ): 612 | pattern_id = "{0}".format( 613 | i + 1 #, len(self.clusters[struc, pattern]), pattern[pidx] 614 | ) 615 | self.id2ptn[pattern_id] = pattern 616 | self.ptn2id[pattern] = pattern_id 617 | for cogid, position in data: 618 | word_indices = [c for c in self.etd[self.ref][cogid] if c] 619 | for idxs in word_indices: 620 | for idx in idxs: 621 | if self._mode == "fuzzy": 622 | pattern_position = self[idx, self.ref].index(cogid) 623 | this_pattern = P[idx].n[pattern_position] 624 | try: 625 | this_pattern[position] = pattern_id 626 | P[idx].change(pattern_position, this_pattern) 627 | except: # noqa: E722 628 | log.warning("error in {0}".format(cogid)) 629 | 630 | else: 631 | P[idx][position] = pattern_id 632 | self.add_entries(ref, P, lambda x: x, override=override) 633 | 634 | def write_patterns(self, filename, proto=False, irregular_patterns=False): 635 | if proto: 636 | pidx = self.cols.index(proto) 637 | else: 638 | pidx = 0 639 | 640 | if not hasattr(self, "id2ptn"): 641 | raise ValueError("You should run CoPaR.add_patterns first!") 642 | 643 | if irregular_patterns: 644 | new_clusters = collections.defaultdict(list) 645 | for (pos, reg), iregs in self.ipatterns.items(): 646 | for cogid, position in self.clusters[pos, reg]: 647 | new_clusters[pos, reg] += [(cogid, position)] 648 | for _, ireg in iregs: 649 | ireg_ = list(ireg) 650 | print(ireg_) 651 | for i, (a, b) in enumerate(zip(reg, ireg)): 652 | print(i, a, b) 653 | if a != b and b != 
self.missing: 654 | ireg_[i] = a + "/" + b 655 | ireg_ = tuple(ireg_) 656 | self.ptn2id[ireg_] = self.ptn2id[reg] 657 | for cogid, position in self.clusters[pos, ireg]: 658 | new_clusters[pos, ireg_] += [(cogid, position)] 659 | else: 660 | new_clusters = self.clusters 661 | for (struc, pattern), rest in self.clusters.items(): 662 | for cogid, position in rest: 663 | if (cogid, position) not in new_clusters[struc, pattern]: 664 | new_clusters[struc, pattern] += [(cogid, position)] 665 | text = "ID\tSTRUCTURE\tFREQUENCY\t{0}\t{1}\tCOGNATESETS\tCONCEPTS\n".format( 666 | self.cols[pidx], "\t".join([c for c in self.cols if c != self.cols[pidx]]) 667 | ) 668 | 669 | sound = "" 670 | idx = 0 671 | for (struc, pattern), entries in sorted( 672 | new_clusters.items(), 673 | key=lambda x: (x[0][0], x[0][1][pidx], len(x[1])), 674 | reverse=True, 675 | ): 676 | if sound != pattern[pidx]: 677 | sound = pattern[pidx] 678 | idx = 0 679 | concepts = [] 680 | for x, y in entries: 681 | for entry in self.etd[self.ref][x]: 682 | if entry: 683 | for value in entry: 684 | concepts += [self[value, "concept"]] 685 | concepts = " / ".join(sorted(set(concepts))) 686 | 687 | idx += 1 688 | text += "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format( 689 | self.ptn2id[pattern].split("/")[0], 690 | struc, 691 | len(entries), 692 | pattern[pidx], 693 | "\t".join([p for i, p in enumerate(pattern) if i != pidx]), 694 | ", ".join(["{0}:{1}".format(x, y) for x, y in entries]), 695 | concepts, 696 | ) 697 | pathlib.Path(filename).write_text(text, encoding="utf8") 698 | 699 | def purity(self): 700 | """ 701 | Compute the purity of the cluster analysis. 702 | 703 | .. note:: The purity is here interpreted as the degree to which 704 | patterns are filled with non-missing values. In this sense, it 705 | indicates to which degree information is computed and to which 706 | degree information is already provided by the data itself. 707 | """ 708 | 709 | def get_purity(patterns): 710 | all_sums = [] 711 | for i in range(len(patterns[0])): 712 | col = [line[i] for line in patterns] 713 | subset = set(col) 714 | sums = [] 715 | for itm in subset: 716 | if itm != self.missing: 717 | sums += [col.count(itm) ** 2] 718 | if sums: 719 | sums = math.sqrt(sum(sums)) / len(col) 720 | else: 721 | sums = 0 722 | all_sums += [sums] 723 | return sum(all_sums) / len(all_sums) 724 | 725 | graph = self.get_cluster_graph() 726 | purities = [] 727 | for node, data in graph.nodes(data=True): 728 | patterns = [] 729 | for neighbor in graph[node]: 730 | patterns += [graph.nodes[neighbor]["pattern"].split()] 731 | if patterns: 732 | purities += [get_purity(patterns)] 733 | else: 734 | purities += [0] 735 | return sum(purities) / len(purities) 736 | 737 | def get_cluster_graph(self): 738 | """ 739 | Compute a graph of the clusters. 740 | 741 | .. note:: In the cluster graph, the sites in the alignments are the 742 | nodes and the edges are drawn between nodes assigned to the same 743 | pattern. 
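
        A minimal usage sketch (``cop`` is assumed to be a CoPaR object on
        which cluster_sites() has already been run); the result is a plain
        networkx graph and can be inspected with the usual networkx tools:

        >>> graph = cop.get_cluster_graph()
        >>> nodes = list(graph.nodes(data=True))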
744 | """ 745 | 746 | graph = nx.Graph() 747 | for (pos, ptn), sites in self.clusters.items(): 748 | for site in sites: 749 | graph.add_node( 750 | "{0[0]}-{0[1]}".format(site), 751 | pattern=" ".join(ptn), 752 | site=" ".join(self.sites[site][1]), 753 | ) 754 | 755 | for ((s1, p1), ptn1), ((s2, p2), ptn2) in itertools.combinations( 756 | self.sites.items(), r=2 757 | ): 758 | if ptn1[0] == ptn2[0]: 759 | m, mm = compatible_columns(ptn1[1], ptn2[1]) 760 | if m and not mm: 761 | graph.add_edge("{0}-{1}".format(s1, p1), "{0}-{1}".format(s2, p2)) 762 | return graph 763 | 764 | def upper_bound(self): 765 | """ 766 | Compute upper bound for clique partitioning following Bhasker 1991. 767 | """ 768 | degs = {s: 0 for s in self.sites} 769 | sings = {s: 0 for s in self.sites} 770 | for (nA, (posA, ptnA)), (nB, (posB, ptnB)) in itertools.combinations( 771 | self.sites.items(), r=2 772 | ): 773 | if posA == posB: 774 | m, n = compatible_columns(ptnA, ptnB) 775 | if n > 0: 776 | degs[nA] += 1 777 | degs[nB] += 1 778 | else: 779 | sings[nA] += 1 780 | sings[nB] += 1 781 | else: 782 | degs[nA] += 1 783 | degs[nB] += 1 784 | 785 | return max([b for a, b in degs.items() if sings[a] > 0]) 786 | 787 | def predict_words(self, **kw): 788 | """ 789 | Predict patterns for those cognate sets where we have missing data. 790 | 791 | .. note:: 792 | 793 | Purity (one of the return values) measures how well a given sound 794 | for a given site is reflected by one single sound (rather than 795 | multiple patterns pointing to different sounds) for a given 796 | doculect. It may be seen as a control case for the purity of a given 797 | prediction: if there are many alternative possibilities, this means 798 | that there is more uncertainty regarding the reconstructions or 799 | predictions. 
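
        The method returns a triple: a dictionary mapping cognate set
        identifiers to predicted word forms for the doculects missing from the
        respective alignment, a dictionary with purity scores per alignment
        site, and a dictionary with average purity scores per doculect. A
        minimal usage sketch (assuming ``cop`` is a CoPaR object for which
        alignment sites and clusters have been computed):

        >>> predictions, site_purity, doculect_purity = cop.predict_words()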
800 | 801 | """ 802 | if not hasattr(self, "sites"): 803 | raise ValueError("You need to compute alignment sites first") 804 | 805 | minrefs = self.minrefs 806 | missing = self.missing 807 | samples = kw.get("samples", 3) 808 | 809 | # pre-analyse the data to get for each site the best patterns in ranked 810 | # form 811 | ranked_sites = {} 812 | ranked_clusters = sorted( 813 | [(s, p, len(f)) for (s, p), f in self.clusters.items()], 814 | key=lambda x: x[2], 815 | reverse=True, 816 | ) 817 | for (cogid, pos), ptns in self.patterns.items(): 818 | struc, ptn = self.sites[cogid, pos] 819 | missings = [i for i in range(self.width) if ptn[i] == missing] 820 | if (struc, ptn) in self.clusters: 821 | ranked_sites[cogid, pos] = [ 822 | (len(self.clusters[struc, ptn]), struc, ptn) 823 | ] 824 | else: 825 | ranked_sites[cogid, pos] = [(1, struc, ptn)] 826 | for strucB, ptnB, freq in ranked_clusters: 827 | m, mm = compatible_columns(ptn, ptnB) 828 | if struc == strucB and m >= 1 and mm == 0: 829 | if len(missings) > len( 830 | [ptnB[i] for i in missings if ptnB[i] == missing] 831 | ): 832 | ranked_sites[cogid, pos] += [(freq, strucB, ptnB)] 833 | 834 | purity = {site: {} for site in ranked_sites} 835 | 836 | preds = {} 837 | for cogid, msa in self.msa[self._ref].items(): 838 | missings = [t for t in self.cols if t not in msa["taxa"]] 839 | if len(set(msa["taxa"])) >= minrefs: 840 | words = [bt.strings("") for m in missings] 841 | for i, m in enumerate(missings): 842 | tidx = self.cols.index(m) 843 | for j in range(len(msa["alignment"][0])): 844 | segments = collections.defaultdict(int) 845 | sidx = 0 846 | if (cogid, j) in ranked_sites: 847 | while True: 848 | this_segment = ranked_sites[cogid, j][sidx][2][tidx] 849 | score = ranked_sites[cogid, j][sidx][0] 850 | if this_segment != missing: 851 | segments[this_segment] += score 852 | sidx += 1 853 | if sidx == len(ranked_sites[cogid, j]): 854 | break 855 | 856 | if not (cogid, j) in purity: 857 | purity[cogid, j] = {} 858 | 859 | if not segments: 860 | words[i] += ["Ø"] 861 | purity[cogid, j][m] = 0 862 | else: 863 | purity[cogid, j][m] = math.sqrt( 864 | sum( 865 | [ 866 | (s / sum(segments.values())) ** 2 867 | for s in segments.values() 868 | ] 869 | ) 870 | ) 871 | words[i] += [ 872 | "|".join( 873 | [ 874 | s 875 | for s in sorted( 876 | segments, 877 | key=lambda x: segments[x], 878 | reverse=True, 879 | ) 880 | ][:samples] 881 | ) 882 | ] 883 | if words: 884 | preds[cogid] = dict(zip(missings, words)) 885 | 886 | pudity = {doc: [] for doc in self.cols} 887 | for site, docs in purity.items(): 888 | for doc in docs: 889 | pudity[doc] += [purity[site][doc]] 890 | for doc, purs in pudity.items(): 891 | if purs: 892 | pudity[doc] = sum(purs) / len(purs) 893 | else: 894 | pudity[doc] = 0 895 | 896 | return preds, purity, pudity 897 | -------------------------------------------------------------------------------- /src/lingrex/evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous evaluation functions. 3 | """ 4 | import statistics 5 | from lingpy.evaluate.acd import _get_bcubed_score as bcs 6 | import lingpy 7 | 8 | 9 | def compare_cognate_sets(wordlist, refA, refB): 10 | """ 11 | Compute cognate set comparison statistics by computing B-Cubed Scores. 12 | 13 | Note 14 | ---- 15 | This check was first described in Wu and List (2023). 16 | 17 | > Wu, M.-S. and J.-M. List (2023): Annotating cognates in phylogenetic studies 18 | > of South-East Asian languages. 
Language Dynamics and Change. 161-197. 19 | > DOI: https://doi.org/10.1163/22105832-bja10023 20 | """ 21 | ranks = [] 22 | for concept in wordlist.rows: 23 | cogsA = wordlist.get_list(row=concept, flat=True, entry=refA) 24 | cogsB = wordlist.get_list(row=concept, flat=True, entry=refB) 25 | p, r = bcs(cogsA, cogsB), bcs(cogsB, cogsA) 26 | f = 2 * (p * r) / (p + r) 27 | ranks += [[concept, p, r, f]] 28 | return ranks 29 | 30 | 31 | def cross_semantic_cognate_statistics( 32 | wordlist, 33 | ref="cogids", 34 | concept="concept", 35 | morpheme_glosses="morphemes", 36 | ignore_affixes=True, 37 | affixes=("suf", "suffix", "SUF", "SUFFIX"), 38 | ): 39 | """ 40 | Calculate colexification statistics for partial colexifications. 41 | 42 | :param wordlist: A LingPy wordlist. 43 | :param ref: Reference to the column with cognate identifiers. 44 | :param concept: Reference to the concept column. 45 | :param morpheme_glosses: Reference to the morpheme glosses. 46 | :param ignore_affixes: If set to True, will ignore morphemes flagged as affixes. 47 | :param affixes: List of strings that trigger that a morpheme gloss is 48 | ignored if it contains one of them as a substring. 49 | 50 | Note 51 | ---- 52 | This check was first described in Wu and List (2023). 53 | 54 | > Wu, M.-S. and J.-M. List (2023): Annotating cognates in phylogenetic studies 55 | > of South-East Asian languages. Language Dynamics and Change. 161-197. 56 | > DOI: https://doi.org/10.1163/22105832-bja10023 57 | """ 58 | 59 | # type check for basic types if they are not there 60 | for idx, cogids, morphemes in wordlist.iter_rows(ref, morpheme_glosses): 61 | wordlist[idx, ref] = lingpy.basictypes.ints(cogids) 62 | wordlist[idx, morpheme_glosses] = lingpy.basictypes.strings(morphemes) 63 | 64 | if ignore_affixes: 65 | D = {} 66 | for idx, cogids, morphemes in wordlist.iter_rows(ref, morpheme_glosses): 67 | new_cogids = [] 68 | for cogid, morpheme in zip(cogids, morphemes): 69 | if not sum([1 if s in morpheme else 0 for s in affixes]): 70 | new_cogids += [cogid] 71 | D[idx] = lingpy.basictypes.ints(new_cogids) 72 | wordlist.add_entries(ref + "_derived", D, lambda x: x) 73 | new_ref = ref + "_derived" 74 | else: 75 | new_ref = ref 76 | 77 | etd = wordlist.get_etymdict(ref=new_ref) 78 | indices = {ln: {} for ln in wordlist.cols} 79 | for i, ln in enumerate(wordlist.cols): 80 | for cogid, reflexes in etd.items(): 81 | if reflexes[i]: 82 | concepts = [wordlist[idx, concept] for idx in reflexes[i]] 83 | indices[ln][cogid] = len(set(concepts)) - 1 84 | 85 | all_scores = [] 86 | for cnc in wordlist.rows: 87 | # Loop through all the concepts in the data 88 | reflexes = wordlist.get_list( 89 | row=cnc, flat=True 90 | ) # The lexical entries of the concept. 91 | scores = [] 92 | for idx in reflexes: 93 | doculect, cogids = wordlist[idx, "doculect"], wordlist[idx, new_ref] 94 | scores += [statistics.mean([indices[doculect][cogid] for cogid in cogids])] 95 | all_scores += [[cnc, statistics.mean(scores)]] 96 | return sorted(all_scores, key=lambda x: (x[1], x[0])) 97 | -------------------------------------------------------------------------------- /src/lingrex/fuzzy.py: -------------------------------------------------------------------------------- 1 | """Create fuzzy reconstructions.""" 2 | from lingrex.reconstruct import PatternReconstructor 3 | import random 4 | from lingpy.util import pb as progressbar 5 | import lingpy 6 | 7 | 8 | def ntile(words, n=5, gap="-", missing="Ø"): 9 | """ 10 | Represent aligned words in form of n-tiles. 
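
    Example (a minimal illustration: with the default of n=5, each alignment
    column is represented by five weighted candidate sounds):

    >>> ntile([list("ta"), list("ta")])
    't|t|t|t|t a|a|a|a|a'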
11 | """ 12 | if len(words) == 1: 13 | return ' '.join([x for x in words[0] if x != gap]) 14 | 15 | # start counting the occurrences 16 | cols = [] 17 | for i in range(len(words[0])): 18 | col = [line[i] for line in words] 19 | cols += [col] 20 | 21 | ntile = len(words) / n 22 | 23 | sounds = [] 24 | for col in cols: 25 | col = [x for x in col if x != missing] 26 | if not col: 27 | sounds += ['?'] 28 | else: 29 | ntile = len(col) / n 30 | dist = {} 31 | sounds += [[]] 32 | for s in set(col): 33 | dist[s] = int(col.count(s) / ntile + 0.5) 34 | for s, t in sorted(dist.items(), key=lambda x: x[1], reverse=True): 35 | for i in range(t): 36 | sounds[-1] += [s] 37 | iterated = 0 38 | while len(sounds[-1]) < n: 39 | sounds[-1] += sounds[-1] 40 | iterated += 1 41 | if iterated >= n: 42 | sounds[-1] += n * ["Ø"] 43 | sounds[-1] = sorted(sounds[-1][:n], key=lambda x: 44 | sounds[-1].count(x), reverse=True) 45 | sounds[-1] = '|'.join(sounds[-1]) 46 | 47 | return ' '.join([s for s in sounds if s.split('|').count(gap) != 48 | len(s.split('|'))-1]) 49 | 50 | 51 | class FuzzyReconstructor: 52 | """ 53 | Carry out fuzzy reconstructions by running reconstructions from different parts of the data. 54 | 55 | Note 56 | ---- 57 | This method was introduced in the study by List et al. (forthcoming): 58 | 59 | > List, J.-M.; Hill, N. W.; Blum, F.; and Forkel, R. (forthcoming): A New Framework for the 60 | > Representation and Computation of Uncertainty in Phonological Reconstruction. To appear in: 61 | > Proceedings of the 4th Workshop on Computational Approaches to Historical Language Change. 62 | """ 63 | 64 | def __init__(self, infile, target, ref="cogid", fuzzy=False, transcription="form"): 65 | if isinstance(infile, (str, dict)): 66 | wordlist = lingpy.align.sca.Alignments( 67 | infile, ref=ref, transcription=transcription 68 | ) 69 | elif isinstance( 70 | infile, (lingpy.align.sca.Alignments, lingpy.basic.wordlist.Wordlist) 71 | ): 72 | wordlist = infile 73 | else: 74 | raise ValueError("Argument for infile must be a string or a wordlist.") 75 | self.wordlist = wordlist 76 | self.target = target or self.wordlist.cols[0] 77 | self.ref = ref 78 | self.fuzzy = fuzzy 79 | 80 | def random_splits(self, splits=10, retain=0.9): 81 | idxs = [ 82 | idx 83 | for idx in self.wordlist 84 | if self.wordlist[idx, "doculect"] != self.target 85 | ] 86 | 87 | tidxs = self.wordlist.get_list(col=self.target, flat=True) 88 | cogids = [self.wordlist[idx, self.ref] for idx in tidxs] 89 | 90 | self.samples = [] 91 | for i in range(splits): 92 | self.samples += [random.sample(idxs, int(retain * len(idxs) + 0.5))] 93 | 94 | self.wordlists = {} 95 | for i, sample in enumerate(self.samples): 96 | D = {0: [c for c in self.wordlist.columns]} 97 | for idx in sample: 98 | D[idx] = [self.wordlist[idx, c] for c in D[0]] 99 | selected_cogids = [self.wordlist[idx, self.ref] for idx in sample] 100 | for cogid, tidx in zip(cogids, tidxs): 101 | if cogid in selected_cogids: 102 | D[tidx] = [self.wordlist[tidx, c] for c in D[0]] 103 | self.wordlists[i] = PatternReconstructor( 104 | D, ref=self.ref, target=self.target, fuzzy=self.fuzzy 105 | ) 106 | 107 | 108 | def fit_samples(self, clf, onehot=False, func=None, aligned=False, pb=False): 109 | pb = progressbar if pb else lambda x, desc: x 110 | for i, wordlist in pb(self.wordlists.items(), desc="fitting data"): 111 | wordlist.fit(clf=clf(), onehot=onehot, func=func, aligned=aligned) 112 | 113 | 114 | def predict( 115 | self, 116 | alignment, 117 | languages, 118 | desegment=True, 119 | 
orchar="¦", 120 | scorechar=":", 121 | output="percentiles", 122 | ): 123 | words = [] 124 | for i, wordlist in self.wordlists.items(): 125 | word = wordlist.predict(alignment, languages, desegment=False) 126 | 127 | words += [word] 128 | # transform to dictionary 129 | counts = {i: [] for i in range(len(words[0]))} 130 | for word in words: 131 | for i, sound in enumerate(word): 132 | counts[i] += [sound] 133 | # get percentiles 134 | if output in ["percentiles", "wp"]: 135 | out = [] 136 | for i, sounds in sorted(counts.items(), key=lambda x: x[0]): 137 | distinct = {s: sounds.count(s) / len(sounds) for s in set(sounds)} 138 | distinct_s = [ 139 | "{0}{1}{2}".format(k, scorechar, int(100 * v + 0.5)) 140 | for k, v in sorted( 141 | distinct.items(), key=lambda x: x[1], reverse=True 142 | ) 143 | ] 144 | 145 | out += [orchar.join(distinct_s)] 146 | if output == "percentiles": 147 | return out 148 | return words, out 149 | elif output == "words": 150 | return words 151 | 152 | -------------------------------------------------------------------------------- /src/lingrex/reconstruct.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module provides methods for linguistic reconstruction. 3 | """ 4 | 5 | import itertools 6 | import collections 7 | 8 | from lingpy.align.sca import Alignments, get_consensus 9 | from lingpy.sequence.sound_classes import prosodic_string, class2tokens 10 | from lingpy.align.multiple import Multiple 11 | from lingpy.align.pairwise import edit_dist, nw_align 12 | from lingpy.evaluate.acd import _get_bcubed_score as get_bcubed_score 13 | from lingpy.align.sca import normalize_alignment 14 | import networkx as nx 15 | from networkx.algorithms.clique import find_cliques 16 | from lingpy import log 17 | 18 | from lingrex.util import clean_sound, ungap, alm2tok 19 | 20 | 21 | class CorPaRClassifier(object): 22 | """ 23 | A classifier for word prediction based on correspondence patterns. 24 | 25 | Note 26 | ---- 27 | This classifier was first used in List et al. (2022). 28 | 29 | > List, J.-M., N. Hill, and R. Forkel (2022): A new framework for fast 30 | > automated phonological reconstruction using trimmed alignments and sound 31 | > correspondence patterns. In: Proceedings of the 3rd Workshop on 32 | > Computational Approaches to Historical Language Change. Association for 33 | > Computational Linguistics 89-96. URL: https://aclanthology.org/2022.lchange-1.9 34 | """ 35 | 36 | def __init__(self, minrefs=2, missing=0, threshold=1): 37 | self.G = nx.Graph() 38 | self.missing = 0 39 | self.threshold = threshold 40 | 41 | def compatible(self, ptA, ptB): 42 | """ 43 | Check for compatibility of two patterns. 44 | """ 45 | res = {True: 0, False: 0} 46 | for a, b in zip(ptA, ptB): 47 | if a and b: 48 | res[a == b] += 1 49 | return res[True], res[False] 50 | 51 | def consensus(self, nodes): 52 | """ 53 | Create a consensus pattern of multiple alignment sites. 54 | """ 55 | cons = [] 56 | for i in range(len(nodes[0])): 57 | nocons = True 58 | for node in nodes: 59 | if node[i] != self.missing: 60 | cons += [node[i]] 61 | nocons = False 62 | break 63 | if nocons: 64 | cons += [self.missing] 65 | return tuple(cons) 66 | 67 | def fit(self, X, y): 68 | """ 69 | Train the prediction of data in y with data in X. 70 | 71 | :param X: Two-dimensional array with observations. 72 | :param y: One-dimensional array with results. 
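
        A minimal sketch with toy integer-coded data (0 encodes missing
        values):

        >>> clf = CorPaRClassifier()
        >>> clf.fit([[1, 2, 0], [1, 2, 3]], [4, 4])
        >>> clf.predict([[1, 2, 0]])
        [4]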
73 | """ 74 | # get identical patterns 75 | P = collections.defaultdict(list) 76 | for i, row in enumerate(X): 77 | P[tuple(row + [y[i]])] += [i] 78 | # make graph 79 | for (pA, vA), (pB, vB) in itertools.combinations(P.items(), r=2): 80 | match_, mismatch = self.compatible(pA, pB) 81 | if not mismatch and match_ >= self.threshold: 82 | if pA not in self.G: 83 | self.G.add_node(pA, freq=len(vA)) 84 | if pB not in self.G: 85 | self.G.add_node(pB, freq=len(vB)) 86 | self.G.add_edge(pA, pB, weight=match_) 87 | self.patterns = collections.defaultdict(collections.Counter) 88 | self.lookup = collections.defaultdict(collections.Counter) 89 | # get cliques 90 | for nodes in find_cliques(self.G): 91 | cons = self.consensus(list(nodes)) 92 | self.patterns[cons[:-1]][cons[-1]] = len(nodes) 93 | for node in nodes: 94 | self.lookup[node[:-1]][cons[:-1]] += len(nodes) 95 | self.predictions = { 96 | ptn: counts.most_common(1)[0][0] for ptn, counts in self.patterns.items() 97 | } 98 | for ptn, counts in self.lookup.items(): 99 | self.predictions[ptn] = self.predictions[counts.most_common(1)[0][0]] 100 | 101 | # make index of data points for quick search based on attested data 102 | self.ptnlkp = collections.defaultdict(list) 103 | for ptn in self.patterns: 104 | for i in range(len(ptn)): 105 | if ptn[i] != self.missing: 106 | self.ptnlkp[i, ptn[i]] += [ptn] 107 | 108 | def predict(self, matrix): 109 | out = [] 110 | for row in matrix: 111 | ptn = tuple(row) 112 | if ptn in self.predictions: 113 | out.append(self.predictions[ptn]) 114 | else: 115 | candidates = collections.Counter() 116 | for i in range(len(ptn) - 1): 117 | if ptn[i] != self.missing: 118 | for ptnB in self.ptnlkp[i, ptn[i]]: 119 | if ptnB not in candidates: 120 | match_, mismatch = self.compatible(ptn, ptnB) 121 | if match_ and not mismatch: 122 | candidates[ptnB] = match_ + len(ptn) 123 | elif match_ - mismatch: 124 | candidates[ptnB] = match_ - mismatch 125 | if candidates: 126 | self.predictions[tuple(row)] = self.predictions[ 127 | candidates.most_common(1)[0][0] 128 | ] 129 | out += [self.predictions[tuple(row)]] 130 | else: 131 | out += [self.missing] 132 | return out 133 | 134 | 135 | class ReconstructionBase(Alignments): 136 | """ 137 | Basic class for the phonological reconstruction. 138 | """ 139 | 140 | def __init__( 141 | self, 142 | infile, 143 | target, 144 | ref="cogids", 145 | fuzzy=True, 146 | transcription="form", 147 | missing="Ø", 148 | gap="-", 149 | ): 150 | Alignments.__init__( 151 | self, infile, fuzzy=fuzzy, ref=ref, transcription=transcription 152 | ) 153 | self.target = target 154 | self.missing = missing 155 | self.gap = gap 156 | self.languages = [t for t in self.cols if t != target] 157 | self.target = target 158 | self.tgtidx = self.cols.index(target) 159 | self.lngidx = {t: self.cols.index(t) for t in self.languages} 160 | 161 | def iter_sequences(self, aligned=False): 162 | """ 163 | Iterate over aligned or unaligned sequences with or without the target \ 164 | sequence. 
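
        Yields triples of cognate identifier, list of sequences, and list of
        language names; only cognate sets attested in the target language are
        considered, and the target language and its sequence are always listed
        last.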
165 | """ 166 | seq_ref = self._alignments if aligned else self._segments 167 | for cogid, idxs in self.etd[self._ref].items(): 168 | if idxs[self.tgtidx]: 169 | if self._mode == "fuzzy": 170 | target = self[idxs[self.tgtidx][0], seq_ref].n[ 171 | self[idxs[self.tgtidx][0], self._ref].index(cogid) 172 | ] 173 | else: 174 | target = self[idxs[self.tgtidx][0], seq_ref] 175 | alignment, languages = [], [] 176 | for j, lng in enumerate(self.languages): 177 | lidx = self.lngidx[lng] 178 | if idxs[lidx]: 179 | languages += [lng] 180 | idx = idxs[lidx][0] 181 | if self._mode == "fuzzy": 182 | alm = self[idx, seq_ref].n[ 183 | self[idx, self._ref].index(cogid) 184 | ] 185 | else: 186 | alm = self[idx, seq_ref] 187 | alignment.append([clean_sound(x) for x in alm]) 188 | alignment.append([clean_sound(x) for x in target]) 189 | if aligned: 190 | alignment = normalize_alignment(alignment) 191 | languages.append(self.target) 192 | yield cogid, alignment, languages 193 | 194 | 195 | class OneHot(object): 196 | """ 197 | Create a one-hot-encoder from a matrix. 198 | """ 199 | 200 | def __init__(self, matrix): 201 | self.vals = [] 202 | for i in range(len(matrix[0])): 203 | cols = [row[i] for row in matrix] 204 | self.vals += [sorted(set(cols)) + ["?"]] 205 | 206 | def __call__(self, matrix): 207 | out = [[] for row in matrix] 208 | for i, vals in enumerate(self.vals): 209 | for j in range(len(matrix)): 210 | template = [0 for k in vals] 211 | try: 212 | template[matrix[j][i]] = 1 213 | except IndexError: 214 | template[-1] = 1 215 | out[j] += template 216 | return out 217 | 218 | 219 | def transform_alignment( 220 | seqs, 221 | languages, 222 | all_languages, 223 | align=True, 224 | training=True, 225 | missing="Ø", 226 | gap="-", 227 | startend=False, 228 | prosody=False, 229 | position=False, 230 | firstlast=False, 231 | ): 232 | """ 233 | Basic alignment function used for phonological reconstruction. 
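
    :param seqs: List of tokenized (and possibly aligned) sequences.
    :param languages: Languages corresponding to the rows in `seqs`.
    :param all_languages: All languages in the data, defining the columns of
        the resulting matrix.
    :param align: If True, align the sequences first; if False, they are
        assumed to be aligned already and are only normalized.
    :param training: If True, the last sequence is treated as the target and
        alignment sites gapped in all other languages are merged (see `ungap`).

    The remaining flags (`startend`, `prosody`, `position`, `firstlast`) add
    extra feature columns to the matrix (start/end markers, prosodic
    structure, site index, and first/last sounds, respectively).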
234 | """ 235 | if align: 236 | seqs = [[s for s in seq if s != gap] for seq in seqs] 237 | msa = Multiple([[s for s in seq if s != gap] for seq in seqs]) 238 | msa.prog_align() 239 | alms = [alm for alm in msa.alm_matrix] 240 | else: 241 | alms = normalize_alignment([s for s in seqs]) 242 | seqs = [[s for s in seq if s != gap] for seq in seqs] 243 | if training: 244 | alms = ungap(alms, languages, languages[-1]) 245 | these_seqs = seqs[:-1] 246 | else: 247 | these_seqs = seqs 248 | 249 | matrix = [[missing for x in all_languages] for y in alms[0]] 250 | for i in range(len(alms[0])): 251 | for j, lng in enumerate(languages): 252 | lidx = all_languages.index(lng) 253 | snd = clean_sound(alms[j][i]) 254 | matrix[i][lidx] = snd 255 | if position: 256 | for i in range(len(matrix)): 257 | matrix[i] += [i] 258 | if startend: 259 | matrix[0] += [0] 260 | for i in range(1, len(matrix) - 1): 261 | matrix[i] += [1] 262 | if len(matrix) > 1: 263 | matrix[-1] += [2] 264 | if prosody: 265 | for i, c in enumerate( 266 | get_consensus( 267 | [ 268 | class2tokens(prosodic_string(these_seqs[j], _output="CcV"), alms[j]) 269 | for j in range(len(these_seqs)) 270 | ], 271 | gaps=True, 272 | ) 273 | ): 274 | matrix[i] += [c] 275 | if firstlast: 276 | if training: 277 | all_seqs = len(all_languages) - 1 278 | else: 279 | all_seqs = len(all_languages) 280 | for i, row in enumerate(matrix): 281 | for j in range(all_seqs): 282 | matrix[i] += [matrix[0][j], matrix[-1][j]] 283 | 284 | # for debugging 285 | for row in matrix: 286 | assert len(row) == len(matrix[0]) 287 | return matrix 288 | 289 | 290 | class PatternReconstructor(ReconstructionBase): 291 | """ 292 | Automatic reconstruction with correspondence patterns. 293 | 294 | Note 295 | ---- 296 | This classifier was first used in List et al. (2022). 297 | 298 | > List, J.-M., N. Hill, and R. Forkel (2022): A new framework for fast 299 | > automated phonological reconstruction using trimmed alignments and sound 300 | > correspondence patterns. In: Proceedings of the 3rd Workshop on 301 | > Computational Approaches to Historical Language Change. Association for 302 | > Computational Linguistics 89-96. URL: https://aclanthology.org/2022.lchange-1.9 303 | """ 304 | 305 | def fit(self, clf=None, onehot=False, func=None, aligned=False): 306 | """ 307 | Fit a classifier to the data. 308 | 309 | :param clf: a classifier with a predict function. 
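
        A hypothetical usage sketch (the file name and the target doculect are
        assumptions, not data shipped with the package):

        >>> pt = PatternReconstructor("wordlist.tsv", target="ProtoLanguage", ref="cogids")
        >>> pt.fit()  # defaults to the CorPaRClassifier defined above
        >>> proto = pt.predict([list("tao"), list("teo")], ["LanguageA", "LanguageB"])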
310 | """ 311 | self.patterns = collections.defaultdict(lambda: collections.defaultdict(list)) 312 | self.occurrences = collections.defaultdict(list) 313 | self.func = func or transform_alignment 314 | 315 | for cogid, alignment, languages in self.iter_sequences(): 316 | if len(alignment) >= 2: 317 | matrix = self.func( 318 | alignment, languages, self.languages + [self.target], training=True 319 | ) 320 | for i, row in enumerate(matrix): 321 | ptn = tuple( 322 | row[: len(self.languages)] + row[len(self.languages) + 1 :] 323 | ) 324 | self.patterns[ptn][row[len(self.languages)]] += [(cogid, i)] 325 | for j, lng in enumerate(self.languages): 326 | if row[j] not in [self.missing]: 327 | self.occurrences[lng, j, row[j]] += [(cogid, i)] 328 | for j in range(len(self.languages) + 1, len(row)): 329 | self.occurrences[ 330 | "feature-{0}".format(j - 1), j - 1, row[j] 331 | ] += [(cogid, i)] 332 | 333 | self.snd2idx = {(i, self.missing): 0 for i in range(len(matrix[0]))} 334 | for i in range(len(matrix[0])): 335 | self.snd2idx[i, self.gap] = 1 336 | 337 | idxtracker = {i: 2 for i in range(len(matrix[0]))} 338 | for lng, lidx, sound in self.occurrences: 339 | last_idx = idxtracker[lidx] 340 | if (lidx, sound) not in self.snd2idx: 341 | self.snd2idx[lidx, sound] = last_idx 342 | idxtracker[lidx] += 1 343 | 344 | self.tgt2idx = {} 345 | idx = 1 346 | for pattern in self.patterns: 347 | for sound in self.patterns[pattern]: 348 | if sound not in self.tgt2idx: 349 | self.tgt2idx[sound] = idx 350 | idx += 1 351 | 352 | self.matrix = [] 353 | self.solutions = [] 354 | for pattern, sounds in self.patterns.items(): 355 | for sound, vals in sounds.items(): 356 | tidx = self.tgt2idx[sound] 357 | row = [] 358 | for i in range(len(pattern)): 359 | sidx = self.snd2idx[i, pattern[i]] 360 | row += [sidx] 361 | for cogid, idx in vals: 362 | self.matrix += [row] 363 | self.solutions += [tidx] 364 | self.dim = len(self.matrix[0]) 365 | if clf is not None: 366 | self.clf = clf 367 | else: 368 | self.clf = CorPaRClassifier() 369 | log.info("fitting classifier") 370 | if onehot: 371 | self.onehot = OneHot(self.matrix) 372 | self.clf.fit(self.onehot(self.matrix), self.solutions) 373 | else: 374 | self.clf.fit(self.matrix, self.solutions) 375 | self.idx2tgt = {v: k for k, v in self.tgt2idx.items()} 376 | log.info("fitted the classifier") 377 | 378 | def predict(self, alignment, languages, unknown="?", onehot=False, desegment=True): 379 | """ 380 | Predict a word form from an alignment. 381 | 382 | :param desegment: Return the form without gaps and ungapped tokens. 383 | """ 384 | matrix = self.func(alignment, languages, self.languages, training=False) 385 | for row in matrix: 386 | assert len(row) == self.dim 387 | new_matrix = [[0 for char in row] for row in matrix] 388 | for i, row in enumerate(matrix): 389 | for j, char in enumerate(row): 390 | new_matrix[i][j] = self.snd2idx.get((j, char), 0) 391 | if hasattr(self, "onehot"): 392 | new_matrix = self.onehot(new_matrix) 393 | out = [self.idx2tgt.get(idx, unknown) for idx in self.clf.predict(new_matrix)] 394 | return alm2tok(out) if desegment else out 395 | 396 | 397 | def eval_by_dist(data, func=None, **kw): 398 | """ 399 | Evaluate by measuring distances between sequences. 400 | 401 | :param data: List of tuples with prediction and attested sequence. 402 | :param func: Alignment function (defaults to edit distance) 403 | 404 | :note: Defaults to the unnormalized edit distance. 
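
    Example (mean edit distance over two word pairs, one identical and one
    differing in a single segment):

    >>> eval_by_dist([(["t", "a"], ["t", "a"]), (["t", "a"], ["t", "o"])])
    0.5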
405 | """ 406 | func = func or edit_dist 407 | scores = [] 408 | for seqA, seqB in data: 409 | if not seqA: 410 | seqA = ["?"] 411 | if not seqB: 412 | seqB = ["?"] 413 | scores += [func(seqA, seqB, **kw)] 414 | return sum(scores) / len(scores) 415 | 416 | 417 | def eval_by_bcubes(data, func=None, **kw): 418 | """ 419 | Evaluate by measuring B-Cubed F-scores. 420 | 421 | :param data: List of tuples with prediction and attested sequence. 422 | :param func: Alignment function (defaults to Needleman-Wunsch) 423 | 424 | Note 425 | ---- 426 | This evaluation was first introduced in List (2019). 427 | 428 | > List, J.-M. (2019): Beyond Edit Distances: Comparing linguistic 429 | > reconstruction systems. Theoretical Linguistics 45.3-4. 1-10. DOI: 430 | > https://doi.org/10.1515/tl-2019-0016 431 | """ 432 | numsA, numsB = {"": 0}, {"": 0} 433 | func = func or nw_align 434 | almsA, almsB = [], [] 435 | for seqA, seqB in data: 436 | if not seqA: 437 | seqA = ["?"] 438 | if not seqB: 439 | seqB = ["?"] 440 | almA, almB, score = func(seqA, seqB, **kw) 441 | for a, b in zip(almA, almB): 442 | if a not in numsA: 443 | numsA[a] = max(numsA.values()) + 1 444 | if b not in numsB: 445 | numsB[b] = max(numsB.values()) + 1 446 | almsA += [numsA[a]] 447 | almsB += [numsB[b]] 448 | p, r = get_bcubed_score(almsA, almsB), get_bcubed_score(almsB, almsA) 449 | return 2 * (p * r) / (p + r) 450 | -------------------------------------------------------------------------------- /src/lingrex/regularity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Calculate regularity metrics on dataset. 3 | """ 4 | import statistics 5 | 6 | from lingpy import log 7 | 8 | 9 | def regularity(wordlist, threshold=3, ref="cogid", min_refs=3, 10 | word_threshold=0.75, sound_classes="cv"): 11 | """ 12 | Check regularity in three flavors. 13 | 14 | - regularity based on the number of correspondence patterns that have more 15 | or the same number of sites as threshold 16 | - the proportion of correspondence patterns identified as regular via 17 | threshold counting all alignment sites 18 | - the proportion of words that we judge regular, judging words to be 19 | regular when more than the proportion word_threshold of sites are judged 20 | to be regular since they can be assigned to patterns that are covered by 21 | more than threshol sites 22 | 23 | :param wordlist: A lingpy Wordlist. 24 | :type wordlist: :class:lingpy.Wordlist 25 | :param threshold: The minimum number of alignment sites for a cognate set 26 | to be considered in the computation of regular words. Defaults to '3'. 27 | :type threshold: int 28 | :param ref: The column which stores the cognate sets, defaults to 'cogid' 29 | :type ref: str 30 | :param min_refs: The minimum number of occurrences a correspondence pattern 31 | to be considered recurring. Defaults to '3'. 32 | :type min_refs: int 33 | :param word_threshold: The relative threshold of patterns that need to be regular 34 | in order for a word to be considered regular as well. Defaults to '0.75'. 35 | :type word_threshold: float 36 | :param sound_classes: A string of characters or a list or a set of strings 37 | that contain the sound classes that the regularity should concentrate on. 38 | :type sound_clasess: str, list, set, tuple 39 | :return: Different scores of regularity. 40 | :rtype: tuple 41 | 42 | 43 | Note 44 | ---- 45 | These regularity checks were first introduced in a study by Blum and List (2023): 46 | 47 | > Blum, F. and J.-M. 
List (2023): Trimming phonetic alignments improves the inference of 48 | > sound correspondence patterns from multilingual wordlists. 49 | > In: Proceedings of the 5th Workshop on Computational Typology and Multilingual NLP. 50 | > Association for Computational Linguistics 52-64. https://aclanthology.org/2023.sigtyp-1.6 51 | """ 52 | if not hasattr(wordlist, "clusters"): 53 | raise ValueError("need a CoPaR object with clusters") 54 | patterns = len({p: len(vals) for p, vals in wordlist.clusters.items() \ 55 | if p[0] in sound_classes}) 56 | regular_patterns = len( 57 | [p for p, vals in wordlist.clusters.items() \ 58 | if len(vals) >= threshold and p[0] in sound_classes]) 59 | regular_proportion = sum( 60 | [len(vals) for p, vals in wordlist.clusters.items() \ 61 | if len(vals) >= threshold and p[0] in sound_classes] 62 | ) 63 | full_proportion = sum([len(vals) for p, vals in wordlist.clusters.items() \ 64 | if p[0] in sound_classes]) 65 | 66 | # get the proportion of words 67 | regular_words, irregular_words = 0, 0 68 | for cogid, msa in filter( 69 | lambda x: len(set(x[1]["taxa"])) >= min_refs, wordlist.msa[ref].items() 70 | ): 71 | scores = [] 72 | for idx in range(len(msa["alignment"][0])): 73 | if (cogid, idx) not in wordlist.patterns: # pragma: no cover 74 | log.warning("duplicate cognate in {0} / {1}".format(cogid, idx)) 75 | else: 76 | if wordlist.patterns[cogid, idx][0][1] in sound_classes: 77 | if ( 78 | max( 79 | [ 80 | len(wordlist.clusters[b, c]) 81 | for a, b, c in wordlist.patterns[cogid, idx] 82 | ] 83 | ) 84 | >= threshold 85 | ): 86 | scores.append(1) 87 | else: 88 | scores.append(0) 89 | if scores: 90 | if statistics.mean(scores) >= word_threshold: 91 | regular_words += len(set(msa["taxa"])) 92 | else: 93 | irregular_words += len(set(msa["taxa"])) 94 | 95 | return ( 96 | regular_patterns, 97 | patterns - regular_patterns, 98 | patterns, 99 | round((regular_patterns / patterns), 2), 100 | regular_proportion, 101 | full_proportion - regular_proportion, 102 | full_proportion, 103 | round((regular_proportion / full_proportion), 2), 104 | regular_words, 105 | irregular_words, 106 | regular_words + irregular_words, 107 | round((regular_words / (regular_words + irregular_words)), 2), 108 | ) 109 | -------------------------------------------------------------------------------- /src/lingrex/trimming.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functionality to trim alignments by removing sites. 3 | """ 4 | import random 5 | import typing 6 | import functools 7 | import itertools 8 | import collections 9 | 10 | from lingpy.sequence.sound_classes import token2class 11 | 12 | from lingrex.util import subsequence_of 13 | 14 | __all__ = ["GAP", "Site", "Sites", "prep_alignments"] 15 | GAP = "-" 16 | 17 | 18 | class Site(list): 19 | """ 20 | A site in an alignment is a "column", i.e. a list of the n-th sound in the aligned words. 21 | """ 22 | 23 | def gap_ratio(self, gap: str = GAP) -> float: 24 | return self.count(gap) / len(self) 25 | 26 | def first_sound(self, gap=GAP): 27 | for s in itertools.dropwhile(lambda c: c == gap, self): 28 | return s 29 | 30 | def soundclass(self, gap: str = GAP) -> str: 31 | return token2class(self.first_sound(gap=gap) or "+", "cv") 32 | 33 | 34 | class Sites(list): 35 | """ 36 | A Sites object represents an alignment in the orthogonal view, i.e. listing columns rather 37 | than rows. 38 | 39 | .. 
code-block:: python 40 | 41 | >>> s = Sites([list('s-terb-'), list('mete---'), list('-ate-bu'), list('--te-b-')]) 42 | >>> print(s) 43 | s - t e r b - 44 | m e t e - - - 45 | - a t e - b u 46 | - - t e - b - 47 | >>> print(s.trimmed(strategy='gap-oriented')) 48 | t e b 49 | t e - 50 | t e b 51 | t e b 52 | >>> print(s.trimmed(strategy='core-oriented')) 53 | t e r b 54 | t e - - 55 | t e - b 56 | t e - b 57 | >>> print(s.trimmed(strategy='core-oriented', threshold=0.6)) 58 | s - t e r b 59 | m e t e - - 60 | - a t e - b 61 | - - t e - b 62 | 63 | Note 64 | ---- 65 | Trimming of sites in an alignment was first introduced in a study by Blum and List (2023): 66 | 67 | > Blum, F. and J.-M. List (2023): Trimming phonetic alignments improves the inference of 68 | > sound correspondence patterns from multilingual wordlists. 69 | > In: Proceedings of the 5th Workshop on Computational Typology and Multilingual NLP. 70 | > Association for Computational Linguistics 52-64. https://aclanthology.org/2023.sigtyp-1.6 71 | """ 72 | 73 | def __init__( 74 | self, 75 | alms: typing.Optional[typing.List[typing.List[str]]] = None, 76 | sites: typing.Optional[typing.List[Site]] = None, 77 | gap: str = GAP, 78 | ): 79 | """ 80 | :parameter alms: List of aligned sequences. 81 | :parameter gap: String that codes gaps in alignment sites. 82 | """ 83 | assert (alms or sites) and not (alms and sites) 84 | assert alms is None or ( 85 | isinstance(alms[0], list) and isinstance(alms[0][0], str) 86 | ), "Expected list of lists of str, got {}".format(alms) 87 | self.gap = gap 88 | super().__init__( 89 | sites 90 | if sites 91 | else (Site([row[i] for row in alms]) for i in range(len(alms[0]))) 92 | ) 93 | 94 | @property 95 | def gap_ratios(self) -> typing.List[float]: 96 | return [s.gap_ratio(gap=self.gap) for s in self] 97 | 98 | @property 99 | def soundclasses(self) -> typing.List[str]: 100 | return [s.soundclass(gap=self.gap) for s in self] 101 | 102 | def _trimmed(self, idxs: typing.Iterable[int]) -> "Sites": 103 | """ 104 | Trim by removing the sites specified by index in `idxs`. 105 | """ 106 | idxs = set(idxs) 107 | return Sites(sites=[s for idx, s in enumerate(self) if idx not in idxs]) 108 | 109 | def to_alignment(self) -> typing.List[typing.List[str]]: 110 | return [[s[i] for s in self] for i in range(len(self[0]))] 111 | 112 | def __str__(self): 113 | return "\n".join("\t".join(w) for w in self.to_alignment()) 114 | 115 | def trimmed( 116 | self, 117 | strategy: str = "gap-oriented", 118 | threshold: float = 0.5, 119 | skeletons: typing.Iterable[str] = ("CV", "VC"), 120 | strict_ratio: bool = True, 121 | exclude="_+", 122 | ) -> "Sites": 123 | """ 124 | Trim by removing candidate sites as long as this leaves an alignment containing at least 125 | one of the cv-patterns from `skeletons`. 126 | 127 | Candidates are identified using `strategy`: 128 | - `'gap-oriented'`: Trim alignment sites by gaps. Candidates are groups of sites with the \ 129 | same gap ratio. Candidate groups are tried in order of decreasing gap ratio, and the \ 130 | trimming stops when not all sites in a group could be trimmed. 131 | - `'core-oriented'`: Trim alignment sites by gaps, preserving a core of sites. Candidates \ 132 | are tried from start and end inwards. 133 | 134 | :parameter threshold: Threshold for gap ratio to qualify sites for trimming. 135 | :param skeletons: Iterable of syllable-skeletons at least one of which should be preserved \ 136 | for further processing. 
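        :param strict_ratio: If True (the default), stop trimming as soon as a \
            candidate group of sites cannot be removed completely; otherwise \
            continue with the remaining candidate groups.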
137 | :param exclude: Sequence of strings that should be excluded from further processing,\ 138 | e.g. morpheme boundaries. 139 | """ 140 | if strategy in {"gap-oriented", "gap"}: 141 | # Sites with big enough gap ratio, grouped by ratio, ordered by decreasing ratio. 142 | candidates = [ 143 | [i[0] for i in idxs] 144 | for score, idxs in 145 | # Note that the sort order must be a total ordering to make trimming reproducible. 146 | itertools.groupby( 147 | sorted( 148 | enumerate(self.gap_ratios), 149 | key=lambda x: (x[1], -x[0]), 150 | reverse=True, 151 | ), 152 | lambda i: i[1], 153 | ) 154 | if score >= threshold 155 | ] 156 | elif strategy in {"core-oriented", "core"}: 157 | gap_or_not = [ 158 | self.gap if ratio >= threshold else "S" for ratio in self.gap_ratios 159 | ] 160 | takewhile_gap = functools.partial( 161 | itertools.takewhile, lambda c: c[1] == self.gap 162 | ) 163 | leading_gaps = [i for i, _ in takewhile_gap(enumerate(gap_or_not))] 164 | trailing_gaps = [ 165 | len(gap_or_not) - 1 - i 166 | for i, _ in takewhile_gap(enumerate(reversed(gap_or_not))) 167 | ] 168 | candidates = trailing_gaps + leading_gaps 169 | else: 170 | raise ValueError( 171 | "Unknown strategy: {}".format(strategy) 172 | ) # pragma: no cover 173 | 174 | skeleton = list(enumerate(self.soundclasses)) 175 | idxs = {i for i, c in skeleton if c in exclude} # Exclude markers. 176 | for idxss in candidates: 177 | if not isinstance(idxss, list): 178 | idxss = [idxss] 179 | trimmed = [] 180 | for idx in idxss: 181 | current_skeleton = [c for i, c in skeleton if i not in idxs | {idx}] 182 | if any(subsequence_of(s, current_skeleton) for s in skeletons): 183 | # Trimming this site leaves a "big enough" remainder. 184 | idxs.add(idx) 185 | trimmed.append(True) 186 | else: 187 | trimmed.append(False) 188 | if strict_ratio and not all(trimmed): 189 | break 190 | return self._trimmed(idxs) 191 | 192 | def trimmed_random( 193 | self, 194 | strategy: str = "gap-oriented", 195 | threshold: float = 0.5, 196 | skeletons: typing.Iterable[str] = ("CV", "VC"), 197 | exclude="_+", 198 | ) -> "Sites": 199 | """ 200 | For a base trim function, return a random version with a similar CV distribution. 201 | 202 | :parameter method: Trimming function that should be applied to compute the CV distribution.\ 203 | Specified as name of a suitable method of `Sites`, or as callable. 204 | :parameter threshold: Threshold by which sites with gaps should be trimmed. 205 | :param skeletons: Tuple of syllable-skeletons that should be preserved 206 | for further processing. Defaults to '("CV", "VC")'. 207 | """ 208 | reference_skeleton = ( 209 | Sites(self.to_alignment(), gap=self.gap) 210 | .trimmed( 211 | strategy=strategy, 212 | threshold=threshold, 213 | skeletons=skeletons, 214 | exclude=exclude, 215 | ) 216 | .soundclasses 217 | ) 218 | # create a freq dict of ref skel 219 | rs_freqs = collections.Counter(reference_skeleton) 220 | # get a dictionary of indices by position 221 | indices = { # soundclass mapped to list of indices in cv template. 
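            # e.g. {"C": [0, 3], "V": [1, 2]} for a C V V C sound-class skeleton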
222 | sc: [i[0] for i in items] 223 | for sc, items in itertools.groupby( 224 | sorted(enumerate(self.soundclasses), key=lambda ii: ii[1]), 225 | lambda ii: ii[1], 226 | ) 227 | } 228 | # random sample indices to be retained 229 | retain = [random.sample(indices[c], rs_freqs[c]) for c, _ in rs_freqs.items()] 230 | retain = set(itertools.chain(*retain)) 231 | return self._trimmed([i for i in range(len(self)) if i not in retain]) 232 | 233 | 234 | def prep_alignments(aligned_wl, skeletons=("CV", "VC"), ref="cogid"): 235 | """ " 236 | Preparing the alignments assures that the structure is correctly 237 | added to the wordlist. 238 | 239 | :param wordlist: A lingpy Alignments. 240 | :type wordlist: :class:lingpy.Alignments 241 | :param skeletons: Tuple of syllable-skeletons that should be preserved 242 | for further processing. Defaults to '("CV", "VC")'. 243 | :type skeletons: tuple 244 | :param ref: The column which stores the cognate sets, defaults to 'cogid' 245 | :type ref: str 246 | :return: Pre-processed alignments. 247 | :rtype: :class:lingpy.Alignments 248 | """ 249 | whitelist = [] 250 | for _, msa in aligned_wl.msa[ref].items(): 251 | skel = Sites(msa["alignment"]).soundclasses 252 | if any([subsequence_of(s, skel) for s in skeletons]): 253 | whitelist += msa["ID"] 254 | aligned_wl.add_entries( 255 | "structure", "tokens", lambda x: " ".join(Sites([x]).soundclasses) 256 | ) 257 | dct = {0: aligned_wl.columns} 258 | for idx in whitelist: 259 | dct[idx] = aligned_wl[idx] 260 | return aligned_wl.__class__(dct, transcription="form") 261 | -------------------------------------------------------------------------------- /src/lingrex/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for the lingrex package. 3 | """ 4 | import math 5 | import pathlib 6 | 7 | from lingpy import tokens2class, prosodic_string 8 | from lingpy.align.sca import get_consensus 9 | from lingpy import basictypes as bt 10 | from lingpy.sequence.ngrams import get_n_ngrams 11 | 12 | 13 | def subsequence_of(source, target): 14 | """ 15 | Check if all items of source appear in target in order, but not necessarily consecutively. 16 | """ 17 | i = 0 18 | for c in source: 19 | try: 20 | i += target[i:].index(c) + 1 21 | except ValueError: # c is not in the remainder of target. 22 | return False 23 | return True 24 | 25 | 26 | def lingrex_path(*comps): 27 | return str(pathlib.Path(__file__).parent.joinpath(*comps)) 28 | 29 | 30 | def bleu_score(word, reference, n=4, weights=None, trim=True): 31 | """ 32 | Compute the BLEU score for predicted word and reference. 
33 | 34 | :param word: the predicted word 35 | :param reference: the predicted reference 36 | :param n: the order of ngrams 37 | :param weights: list of weights, should be the same size as n 38 | :param trim: bool, decide to trim n-grams or not 39 | """ 40 | weights = [1 / n for x in range(n)] if weights is None else weights 41 | 42 | scores = [] 43 | for i in range(1, n + 1): 44 | new_wrd = list(get_n_ngrams(word, i)) 45 | new_ref = list(get_n_ngrams(reference, i)) 46 | if trim and i > 1: 47 | new_wrd = new_wrd[i - 1 : -(i - 1)] 48 | new_ref = new_ref[i - 1 : -(i - 1)] 49 | 50 | clipped, divide = [], [] 51 | for itm in set(new_wrd): 52 | clipped += [new_ref.count(itm)] 53 | divide += [new_wrd.count(itm)] 54 | scores += [sum(clipped) / sum(divide)] 55 | 56 | # calculate arithmetic mean 57 | out_score = 1 58 | for weight, score in zip(weights, scores): 59 | out_score = out_score * (score**weight) 60 | 61 | bp = ( 62 | 1 63 | if len(word) > len(reference) 64 | else math.e ** (1 - (len(reference) / len(word))) 65 | ) 66 | return bp * (out_score ** (1 / sum(weights))) 67 | 68 | 69 | def clean_sound(sound): 70 | """ 71 | Get rid of "a/b" notation for sound segments. 72 | """ 73 | return ".".join([s.split("/")[1] if "/" in s else s for s in sound.split(".")]) 74 | 75 | 76 | def alm2tok(seq, gap="-"): 77 | """ 78 | Turn an alignment into a sequence. 79 | """ 80 | return [clean_sound(x) for x in unjoin(seq) if x != gap] 81 | 82 | 83 | def unjoin(seq): 84 | """ 85 | Turn segments joined by a dot into unjoined segments. 86 | """ 87 | out = [] 88 | for itm in seq: 89 | out += itm.split(".") 90 | return out 91 | 92 | 93 | def ungap(alignment, languages, proto): 94 | """ 95 | Trim an MSA to remove all gaps in the target sequence. 96 | :examples: 97 | >>> ungap([['a', 'b'], ['x', '-'], ['y', '-']], ['proto', 'l1', 'l2'], 'proto') 98 | ... [['a.b'], ['x'], ['y']] 99 | >>> ungap([['a', 'b'], ['x', '-'], ['y', 'h']], ['proto', 'l1', 'l2'], 'proto') 100 | ... [['a', 'b'], ['x', '-'], ['y', 'h']] 101 | 102 | Note 103 | ---- 104 | This procedure for multiple alignments was first introduced in List et al. 105 | (2022). 106 | 107 | > List, J.-M., N. Hill, and R. Forkel (2022): A new framework for fast 108 | > automated phonological reconstruction using trimmed alignments and sound 109 | > correspondence patterns. In: Proceedings of the 3rd Workshop on 110 | > Computational Approaches to Historical Language Change. Association for 111 | > Computational Linguistics 89-96. URL: https://aclanthology.org/2022.lchange-1.9 112 | """ 113 | pidxs = [i for i, taxon in enumerate(languages) if taxon == proto] 114 | merges = [] 115 | for i in range(len(alignment[0])): # go through the rows of the alignment ... 116 | col = [row[i] for row in alignment] 117 | # ... looking for gap-only alignments (in non-proto languages): 118 | if {site for j, site in enumerate(col) if j not in pidxs} == {"-"}: 119 | merges += [i] 120 | if not merges: 121 | return alignment 122 | new_alms = [] 123 | for i, row in enumerate(alignment): 124 | new_alm, mergeit, started = [], False, True 125 | for j, cell in enumerate(row): 126 | if j in merges or mergeit: 127 | mergeit = False 128 | if not started: # j != 0: 129 | if cell != "-": 130 | new_alm[-1] += "." 
+ cell if new_alm[-1] else cell 131 | else: 132 | mergeit = True 133 | new_alm.append("" if cell == "-" else cell) 134 | else: 135 | started = False 136 | new_alm.append(cell) 137 | new_alms.append([cell or "-" for cell in new_alm]) 138 | return new_alms 139 | 140 | 141 | def add_structure( 142 | wordlist, model="cv", segments="tokens", structure="structure", ref="cogid", gap="-" 143 | ): 144 | """ 145 | Add structure to a wordlist to make sure correspondence patterns can be inferred. 146 | """ 147 | if model not in ["cv", "c", "CcV", "ps", "nogap"]: 148 | raise ValueError("[i] you need to select a valid model") 149 | D = {} 150 | if model == "cv": 151 | for idx, tks in wordlist.iter_rows(segments): 152 | D[idx] = " ".join(tokens2class(tks, "cv")).lower() 153 | 154 | if model == "c": 155 | for idx, tks in wordlist.iter_rows(segments): 156 | D[idx] = ( 157 | " ".join(tokens2class(tks, "cv")) 158 | .lower() 159 | .replace("v", "c") 160 | .replace("t", "c") 161 | ) 162 | if model == "nogap": 163 | assert hasattr(wordlist, "msa") 164 | for cogid, msa in wordlist.msa[ref].items(): 165 | cons = [ 166 | "c" if c != gap else gap 167 | for c in get_consensus(msa["alignment"], gaps=True) 168 | ] 169 | for idx, alm in zip(msa["ID"], msa["alignment"]): 170 | struc = [] 171 | for a, b in zip(cons, alm): 172 | if b != "-": 173 | struc += [a] 174 | D[idx] = " ".join(struc) 175 | for idx, tks in wordlist.iter_rows(segments): 176 | if idx not in D: 177 | D[idx] = " ".join(["c" if c != "+" else c for c in tks]) 178 | if model == "CcV": 179 | for idx, tks in wordlist.iter_rows(segments): 180 | D[idx] = " ".join( 181 | list(prosodic_string(tks, _output="CcV").replace("_", "+")) 182 | ) 183 | if model == "ps": 184 | for idx, tks in wordlist.iter_rows(segments): 185 | D[idx] = " ".join(list(prosodic_string(tks))) 186 | 187 | if hasattr(wordlist, "_mode") and wordlist._mode == "fuzzy": 188 | struc_ = bt.lists 189 | else: 190 | struc_ = bt.strings 191 | wordlist.add_entries(structure, D, lambda x: struc_(x)) 192 | 193 | 194 | def prep_wordlist(wordlist, min_refs=3, exclude="_+"): 195 | """ 196 | Preprocessing will make sure that the data are unified. 197 | 198 | - delete markers of morpheme boundaries (often inconsistently applied), as 199 | indicated by exclude 200 | - only consider cognate sets with size > min_refs (unique taxa), as identified by 201 | - delete duplicate words in the same cognate set 202 | 203 | :param wordlist: A lingpy Wordlist. 204 | :type wordlist: :class:lingpy.Wordlist 205 | :param min_ref: The minimun number of words in a cognate set. 206 | Defaults to '3'. 207 | :type min_ref: int 208 | :param exclude: Sequence of strings that should be excluded from further processing, 209 | e.g. morpheme boundaries. Defaults to '_+'. 210 | :param exclude: str 211 | :return: Pre-processed wordlist. 
212 | :rtype: :class:lingpy.Wordlist 213 | """ 214 | whitelist = [] 215 | for _, idxs in wordlist.get_etymdict(ref="cogid").items(): 216 | visited, all_indices = set(), [] 217 | for idx in map(lambda x: x[0], filter(lambda x: x, idxs)): 218 | if wordlist[idx, "doculect"] not in visited: 219 | visited.add(wordlist[idx, "doculect"]) 220 | all_indices += [idx] 221 | if len(visited) >= min_refs: 222 | whitelist += all_indices 223 | for idx, tokens in wordlist.iter_rows("tokens"): 224 | wordlist[idx, "tokens"] = [t for t in tokens if t not in exclude] 225 | 226 | dct = {0: wordlist.columns} 227 | for idx in whitelist: 228 | dct[idx] = wordlist[idx] 229 | return wordlist.__class__(dct) 230 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def data(): 8 | return pathlib.Path(__file__).parent / 'data' 9 | 10 | 11 | @pytest.fixture 12 | def wl_with_alignments(): 13 | return { 14 | 0: ["doculect", "concept", "form", "tokens", "alignment", "cogid"], 15 | 1: ["A", "one", "atawu", "a t a w u", "a t a w u", 1], 16 | 2: ["B", "one", "a_twu", "a _ t w u", "a t - w u", 1], 17 | 3: ["C", "one", "tawu", "t a w u", "- t a w u", 1], 18 | 4: ["D", "one", "tefu", "tʲ e f u", "- t e f u", 1], 19 | 5: ["A", "two", "satu", "s a t u", "s a t u", 2], 20 | 6: ["A", "two", "seram", "s e r a m", "s e r a m", 2] 21 | } 22 | -------------------------------------------------------------------------------- /tests/data/hillburmish.tsv: -------------------------------------------------------------------------------- 1 | ID DOCULECT CONCEPT VALUE FORM TOKENS NOTE COGIDS 2 | 1 OldBurmese I ṅa ṅa ṅ/ŋ a 665 3 | 4 Atsi I ŋo⁵¹ ŋo⁵¹ ŋ o ⁵¹ 665 4 | 6 Lashi I ŋo³¹ ŋo³¹ ŋ o ³¹ 665 5 | 9 ProtoBurmish I *ŋa ŋa¹ ŋ a ¹ 665 6 | 147 Atsi banana (plantain) ŋoʔ⁵⁵ mjuʔ²¹ ŋoʔ⁵⁵+mjuʔ²¹ ŋ o ʔ ⁵⁵ + m j u ʔ ²¹ 681 3302 7 | 149 Lashi banana (plantain) ŋɔʔ⁵⁵ mju̱k⁵⁵ ŋɔʔ⁵⁵+mju̱k⁵⁵ ŋ ɔ ʔ ⁵⁵ + m j u̱ k ⁵⁵ 681 3304 8 | 151 ProtoBurmish banana (plantain) *ŋak ŋak⁴ ŋ a k ⁴ 681 9 | 167 ProtoBurmish be (in the house) *ŋji ŋji¹ ŋ j i ¹ 488 10 | 283 ProtoBurmish blue *ŋjuŋ ŋjuŋ¹ ŋ j u ŋ ¹ 698 11 | 284 ProtoBurmish blue *ŋju ŋju¹ ŋ j u ¹ 699 12 | 665 Atsi cooked, be (rice) / done ŋjoʔ²¹ ŋjoʔ²¹ ŋ j o ʔ ²¹ 684 13 | 667 Lashi cooked, be (rice) / done ŋjɔːʔ³¹ ŋjɔːʔ³¹ ŋ j ɔː ʔ ³¹ 684 14 | 670 ProtoBurmish cooked, be (rice) / done *ŋjak ŋjak⁴ ŋ j a k ⁴ 684 15 | 733 OldBurmese cry ṅuiw ṅui̯ ṅ/ŋ ui̯ 693 16 | 736 Atsi cry ŋau⁵¹ ŋau⁵¹ ŋ au ⁵¹ 693 17 | 738 Lashi cry ŋaːu³¹ ŋaːu³¹ ŋ aːu ³¹ 693 18 | 741 ProtoBurmish cry *ŋu ŋu¹ ŋ u ¹ 693 19 | 799 ProtoBurmish day (time) *ŋjiX ŋji³ ŋ j i ³ 491 20 | 1230 Lashi fifteen tshĕ³³ ŋ³³ tshĕ³³+ŋ³³ tsʰ ĕ ³³ + ŋ ³³ 3295 666 21 | 1232 ProtoBurmish fifteen *ŋaX ŋa³ ŋ a ³ 667 22 | 1275 OldBurmese five ṅaḥ ṅaḥ ṅ/ŋ a ḥ/⁵ 666 23 | 1278 Atsi five ŋo²¹ ŋo²¹ ŋ o ²¹ 666 24 | 1283 ProtoBurmish five *ŋaH ŋa² ŋ a ² 666 25 | 2888 OldBurmese salty ṅan ṅan ṅ/ŋ a n 683 26 | 2895 ProtoBurmish salty *ŋan ŋan¹ ŋ a n ¹ 683 27 | 3109 OldBurmese silver ṅuy ṅuj ṅ/ŋ u j 696 28 | 3112 Atsi silver ŋun⁵¹ ŋun⁵¹ ŋ u n ⁵¹ 696 29 | 3114 Lashi silver ŋə³¹ ŋə³¹ ŋ ə ³¹ 696 30 | 3117 ProtoBurmish silver *ŋui ŋui¹ ŋ ui ¹ 696 31 | 3170 OldBurmese small ṅay ṅai ṅ/ŋ ai 668 32 | 3174 Lashi small ŋɛː³¹ ŋɛː³¹ ŋ ɛː ³¹ 668 33 | 3177 ProtoBurmish small *ŋai ŋai¹ ŋ ai ¹ 668 34 | 3619 Atsi tongs (fire) ŋjap²¹ ŋjap²¹ ŋ j a p ²¹ 686 35 | 3621 Lashi tongs (fire) ŋjap³¹ tsei⁵⁵ 
ŋjap³¹+tsei⁵⁵ ŋ j a p ³¹ + ts ei ⁵⁵ 686 3310 36 | 3623 ProtoBurmish tongs (fire) *ŋjat ŋjat⁴ ŋ j a t ⁴ 686 37 | 4030 OldBurmese young ṅay ṅai ṅ/ŋ ai 668 38 | -------------------------------------------------------------------------------- /tests/test_align.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from lingrex.align import ( 3 | gap_free_pairwise, 4 | align_to_template, 5 | shrink_alignments, 6 | template_alignment, 7 | shrink_template, 8 | ) 9 | from lingpy import Wordlist 10 | 11 | 12 | def test_gap_free_pairwise(): 13 | 14 | seqA, seqB = list("andra"), list("an-ra") 15 | 16 | almA, almB = gap_free_pairwise(seqA, seqB) 17 | assert almA[1] == "nr" 21 | 22 | seqA, seqB = list("este"), list("-ste") 23 | almA, almB = gap_free_pairwise(seqA, seqB) 24 | assert almA[0] == "e>s" 25 | 26 | seqA, seqB = list("euste"), list("--ste") 27 | almA, almB = gap_free_pairwise(seqA, seqB) 28 | assert almA[0] == "e>u>s" 29 | 30 | 31 | def test_align_to_template(): 32 | out = align_to_template("ka", "Cv", "Cvc") 33 | assert out[-1] == "-" 34 | 35 | with pytest.raises(ValueError): 36 | align_to_template("ka", "c", "Cvc") 37 | with pytest.raises(ValueError): 38 | align_to_template("ka", "cv", "Cv") 39 | 40 | 41 | def test_shrink_alignments(): 42 | out = shrink_alignments([["a", "b", "-"], ["a", "b", "-"]]) 43 | assert len(out[0]) == 2 44 | 45 | 46 | @pytest.fixture 47 | def wldata(): 48 | return { 49 | 0: ["doculect", "concept", "tokens", "structure", "cogid"], 50 | 1: ["a", "a", "b au".split(), "i n".split(), 1], 51 | 2: ["b", "a", "b o k".split(), "i n c".split(), 1], 52 | 3: ["c", "a", "b w a k".split(), "i m n c".split(), 1], 53 | } 54 | 55 | 56 | @pytest.fixture 57 | def wldata_listvalued_cogid(wldata): 58 | return {k: v if k == 0 else v[:-1] + [[v[-1]]] for k, v in wldata.items()} 59 | 60 | 61 | def test_template_alignment(wldata, wldata_listvalued_cogid): 62 | wl = Wordlist(wldata) 63 | template_alignment(wl, fuzzy=False, template="imnc") 64 | assert "alignment" in wl.columns 65 | wl = Wordlist(wldata_listvalued_cogid) 66 | template_alignment(wl, fuzzy=True, template="imnc") 67 | assert "alignment" in wl.columns 68 | 69 | 70 | def test_shrink_template(wldata_listvalued_cogid): 71 | wl = Wordlist(wldata_listvalued_cogid) 72 | shrink_template(wl) 73 | assert wl[2, "tokens2"][-1] == "ok" 74 | -------------------------------------------------------------------------------- /tests/test_borrowing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from lingrex.borrowing import internal_cognates, external_cognates 3 | from lingpy import Wordlist 4 | 5 | 6 | @pytest.fixture 7 | def wl(data): 8 | return Wordlist(str(data / 'wordlist.tsv')) 9 | 10 | 11 | @pytest.mark.parametrize( 12 | 'kw,success', 13 | [ 14 | ( 15 | dict(ref="autocogids", partial=True, method="lexstat"), 16 | lambda wl: "autocogids" in wl.columns), 17 | ( 18 | dict(ref="autocogid", partial=False, method="lexstat"), 19 | lambda wl: "autocogid" in wl.columns), 20 | ( 21 | dict(ref="autocogids", partial=True, method="sca"), 22 | lambda wl: "autocogids" in wl.columns), 23 | ] 24 | ) 25 | def test_internal_cognates(kw, success, wl): 26 | internal_cognates(wl, runs=10, **kw) 27 | assert success(wl) 28 | 29 | etd = wl.get_etymdict(ref=kw["ref"]) 30 | 31 | for cogid, vals in etd.items(): 32 | concepts = [] 33 | for idx_ in vals: 34 | if idx_: 35 | for idx in idx_: 36 | concepts += [wl[idx, "concept"]] 37 | assert len(set(concepts)) == 1 
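    # get_etymdict(ref=...) maps every cognate ID to one slot per doculect (0 where a
    # doculect has no reflex), so the nested loops above visit each word index of a
    # cognate set; the assertion checks that the detected cognate sets never group
    # words belonging to different concepts.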
38 | 39 | 40 | 41 | def test_external_cognates(wl): 42 | external_cognates(wl, cognates="cogid", ref="borrids") 43 | assert "borrids" in wl.columns 44 | -------------------------------------------------------------------------------- /tests/test_cognates.py: -------------------------------------------------------------------------------- 1 | from lingrex.cognates import common_morpheme_cognates, salient_cognates 2 | from lingpy import Wordlist 3 | 4 | 5 | def test_common_morpheme_cognates(): 6 | wl = Wordlist({ 7 | 0: ["doculect", "concept", "ipa", "tokens", "cogids"], 8 | 1: ["a", "a", "pla", "p l a + p u", [1, 2]], 9 | 2: ["b", "a", "pla", "p l a t + k i", [1, 3]], 10 | 3: ["c", "a", "pla", "k i + p l u p", [4, 1]], 11 | 4: ["d", "a", "pla", "p l a k", [1]], 12 | 5: ["a", "b", "pla", "t r a", [2]], 13 | 6: ["b", "b", "pla", "t a t", [2]], 14 | 7: ["c", "b", "pla", "d r ə p", [2]], 15 | }) 16 | common_morpheme_cognates(wl) 17 | assert wl[1, "autocogid"] == wl[2, "autocogid"] 18 | 19 | 20 | def test_salient_cognates(): 21 | wl = Wordlist({ 22 | 0: ["doculect", "concept", "morphemes", "tokens", "cogids",], 23 | 1: ["a", "a", "pla _pi".split(), "p l a + p u".split(), [1, 2]], 24 | 2: ["b", "a", "pla _po".split(), "p l a t + k i".split(), [1, 3]], 25 | 3: ["c", "a", "_po pla".split(), "k i + p l u p".split(), [4, 1]], 26 | 4: ["d", "a", "pla".split(), "p l a k".split(), [1]], 27 | 5: ["a", "b", "pla".split(), "t r a".split(), [2]], 28 | 6: ["b", "b", "pla".split(), "t a t".split(), [2]], 29 | 7: ["c", "b", "pla".split(), "d r ə p".split(), [2]], 30 | }) 31 | salient_cognates(wl) 32 | assert wl[1, "newcogid"] == wl[2, "newcogid"] 33 | -------------------------------------------------------------------------------- /tests/test_colex.py: -------------------------------------------------------------------------------- 1 | from lingrex.colex import ( 2 | expand_alignment, 3 | find_bad_internal_alignments, 4 | compatible, 5 | merge_alignments, 6 | find_colexified_alignments, 7 | ) 8 | from lingpy import Alignments 9 | 10 | 11 | def test_find_bad_internal_alignments(): 12 | wl = Alignments( 13 | { 14 | 0: ["doculect", "concept", "ipa", "tokens", "alignment", "cogids"], 15 | 1: ["a", "a", "bla", "b l a", "b l a -".split(), [1]], 16 | 2: ["b", "a", "bla", "b l a k", "b l a k".split(), [1]], 17 | 3: ["c", "a", "bla", "b a k", "b - a k".split(), [1]], 18 | 4: ["a", "b", "bla", "b l a k", "b l a k".split(), [1]], 19 | 5: ["b", "b", "bla", "b l a k", "b l a k".split(), [1]], 20 | 6: ["a", "c", "bla", "b l a", "b l a -".split(), [1]], 21 | }, 22 | ref="cogids", 23 | ) 24 | find_bad_internal_alignments(wl) 25 | assert wl[4, "cogids"][0] != 1 26 | 27 | 28 | def test_expand_alignment(): 29 | missing = "?" 30 | out = expand_alignment( 31 | {"taxa": ["a", "b", "c"], "alignment": [["t", "a"], ["t/p", "u"], ["t", "-"]]}, 32 | ["a", "d", "b", "c"], 33 | missing=missing, 34 | ) 35 | assert out[1][1] == missing 36 | assert out[2][0] == "p" 37 | 38 | 39 | def test_compatible(): 40 | missing = "?" 
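    # compatible() compares two blocks of aligned sites row by row; judging from the
    # assertions below, it returns the number of rows that can be reconciled (rows
    # consisting only of missing symbols are skipped) and a falsy value as soon as
    # two non-missing rows conflict.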
41 | matches = compatible( 42 | [["a", "b"], [missing, missing], ["a", "c"], ["a", "d"]], 43 | [ 44 | ["a", "-", "b"], 45 | ["a", "x", "b"], 46 | ["a", "-", "c"], 47 | [missing, missing, missing], 48 | ], 49 | missing=missing, 50 | ) 51 | assert matches == 2 52 | 53 | matches = compatible( 54 | [["a", "b"], [missing, missing], ["a", "c"], ["a", "d"]], 55 | [ 56 | ["a", "-", "c"], 57 | ["a", "x", "b"], 58 | ["a", "-", "c"], 59 | [missing, missing, missing], 60 | ], 61 | missing=missing, 62 | ) 63 | assert not matches 64 | 65 | 66 | def test_merge_alignments(): 67 | missing = "?" 68 | matches = merge_alignments( 69 | [ 70 | ["-", "a", "b"], 71 | [missing, missing, missing], 72 | ["-", "a", "c"], 73 | ["x", "a", "d"], 74 | ], 75 | [ 76 | ["a", "-", "b"], 77 | ["a", "x", "b"], 78 | ["a", "-", "c"], 79 | [missing, missing, missing], 80 | ], 81 | missing=missing, 82 | ) 83 | assert len(matches[0]) == 4 84 | 85 | missing = "?" 86 | matches = merge_alignments( 87 | [["a", "b"], ["a", "c"], ["a", "d"]], 88 | [ 89 | ["a", "-", "b"], 90 | ["a", "x", "b"], 91 | ["a", "-", "c"], 92 | ], 93 | missing=missing, 94 | ) 95 | assert len(matches[0]) == 3 96 | 97 | missing = "?" 98 | matches = merge_alignments( 99 | [["a", "b"], ["a", "c"], [missing, missing], ["a", "d"]], 100 | [ 101 | ["a", "-", "b", "-"], 102 | ["a", "x", "b", "-"], 103 | ["a", "-", "c", "e"], 104 | [missing, missing, missing, missing], 105 | ], 106 | missing=missing, 107 | ) 108 | assert len(matches[0]) == 4 109 | 110 | 111 | def test_find_colexified_alignments(): 112 | wl = Alignments( 113 | { 114 | 0: ["doculect", "concept", "ipa", "tokens", "alignment", "cogids"], 115 | 1: ["a", "a", "bla", "b l a", "b l a -".split(), [1]], 116 | 2: ["b", "a", "bla", "b l a k", "b l a k".split(), [1]], 117 | 3: ["c", "a", "bla", "b a k", "b - a k".split(), [1]], 118 | 4: ["a", "b", "bla", "b l a k", "b l a -".split(), [2]], 119 | 5: ["b", "b", "bla", "b l a k", "b l a k".split(), [2]], 120 | 6: ["a", "c", "bla", "b l a", "- b l a".split(), [3]], 121 | 7: ["d", "c", "bla", "a b l", "a b l -".split(), [3]], 122 | }, 123 | ref="cogids", 124 | ) 125 | 126 | find_colexified_alignments(wl) 127 | assert wl[1, "crossids"][0] == 1 128 | 129 | wl = Alignments( 130 | { 131 | 0: ["doculect", "concept", "ipa", "tokens", "alignment", "cogid"], 132 | 1: ["a", "a", "bla", "b l a", "b l a -".split(), 1], 133 | 2: ["b", "a", "bla", "b l a k", "b l a k".split(), 1], 134 | 3: ["c", "a", "bla", "b a k", "b - a k".split(), 1], 135 | 4: ["a", "b", "bla", "b l a k", "b l a -".split(), 2], 136 | 5: ["b", "b", "bla", "b l a k", "b l a k".split(), 2], 137 | 6: ["a", "c", "bla", "b l a", "- b l a".split(), 3], 138 | 7: ["d", "c", "bla", "a b l", "a b l -".split(), 3], 139 | }, 140 | ref="cogid", 141 | ) 142 | 143 | find_colexified_alignments(wl, cognates="cogid", ref="crossid") 144 | assert wl[1, "crossid"] == 1 145 | -------------------------------------------------------------------------------- /tests/test_copar.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from lingrex.copar import ( 3 | CoPaR, 4 | consensus_pattern, 5 | incompatible_columns, 6 | score_patterns, 7 | density, 8 | ) 9 | from lingpy import Wordlist, Alignments 10 | from lingrex.util import add_structure 11 | 12 | 13 | def test_consensus_pattern(): 14 | missing = "?" 
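    # consensus_pattern() collapses several correspondence-pattern rows into a single
    # tuple, with attested values filling slots that are missing elsewhere; rows that
    # disagree on non-missing values are expected to raise a ValueError.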
15 | out = consensus_pattern( 16 | [["a", "b", "c"], ["a", "b", missing], [missing, missing, "c"]], missing=missing 17 | ) 18 | assert out == ("a", "b", "c") 19 | with pytest.raises(ValueError): 20 | consensus_pattern([["a", "b"], ["a", "c"]]) 21 | 22 | 23 | def test_incompatible_columns(): 24 | missing = "?" 25 | out = incompatible_columns( 26 | [ 27 | ["a", "b", "c"], 28 | ["I", "b", "c"], 29 | ["a", "b", missing], 30 | [missing, missing, "c"], 31 | ], 32 | missing=missing, 33 | ) 34 | assert out[0] == "*" 35 | 36 | 37 | @pytest.mark.parametrize( 38 | 'patterns,mode,result', 39 | [ 40 | ([["a", "b", "c"], ["a", "b", "d"], ["a", "b", "?"], ["?", "?", "c"]], 'coverage', -1), 41 | (["a", "b", "c"], 'coverage', -1), 42 | ([["a", "b", "c"], ["a", "b", "c"], ["a", "b", "?"], ["?", "?", "c"]], 'ranked', 0.75), 43 | ([["a", "b", "c"], ["a", "b", "c"], ["a", "b", "?"], ["?", "?", "c"]], 'squared', 0.64), 44 | ([["a", "b", "c"], ["a", "b", "c"], ["a", "b", "?"], ["?", "?", "c"]], 'pairs', 0.44), 45 | ([["a", "b", "c"], ["a", "b", "c"], ["a", "b", "?"], ["?", "?", "c"]], 'coverage', 0.75), 46 | ] 47 | ) 48 | def test_score_patterns(patterns, mode, result): 49 | assert result == pytest.approx(score_patterns(patterns, missing='?', mode=mode), abs=1e-2) 50 | 51 | 52 | def test_score_patterns_error(): 53 | with pytest.raises(ValueError): 54 | score_patterns([["a", "b"], ["a", "b"]], mode="bla") 55 | 56 | 57 | def test_density(): 58 | D = { 59 | 0: ["doculect", "concept", "tokens", "ipa", "cogid"], 60 | 1: ["a", "b", "t o x t ə".split(), "tochter", 1], 61 | 2: ["b", "b", "t o x t ə".split(), "tochter", 1], 62 | 3: ["c", "b", "t o x t ə".split(), "tochter", 1], 63 | 4: ["a", "c", "t o x t ə".split(), "tochter", 2], 64 | 5: ["b", "c", "t o x t ə".split(), "tochter", 2], 65 | 6: ["c", "c", "t o x t ə".split(), "tochter", 2], 66 | } 67 | assert 0.67 == pytest.approx(density(Wordlist(D), ref="cogid"), abs=1e-2) 68 | 69 | 70 | def test_CoPaR_fuzzy(): 71 | D = { 72 | 0: ["doculect", "concept", "ipa", "tokens", "cogids", "alignment"], 73 | 1: ["a", "a", "pla", "p l a", [1], "p l a -".split()], 74 | 2: ["b", "a", "pla", "p l a t", [1], "p l a t".split()], 75 | 3: ["c", "a", "pla", "p l u p", [1], "p l u p".split()], 76 | 4: ["d", "a", "pla", "p l a k", [1], "p l a k".split()], 77 | 5: ["a", "b", "pla", "t r a", [2], "t r a -".split()], 78 | 6: ["b", "b", "pla", "t a t", [2], "t - a t".split()], 79 | 7: ["c", "b", "pla", "d r ə p", [2], "d r ə p".split()], 80 | # 8: ["a", "b", "pla", "p l a k", [1], "d x a k".split()], 81 | 9: ["a", "c", "pla", "k l a", [3], "k r a -".split()], 82 | # 10: ["a", "c", "pla", "p l a t", [1], "k a t".split()], 83 | 11: ["c", "c", "pla", "k l ə p", [3], "k l ə p".split()], 84 | 12: ["d", "c", "pla", "g l a k", [3], "g l a k".split()], 85 | 13: ["d", "f", "buk", "b u k", [4], "b u k".split()], 86 | } 87 | alms = Alignments(D, ref="cogids", transcription="ipa") 88 | with pytest.raises(ValueError): 89 | CoPaR(alms, ref="cogids", structure="structure", minrefs=2) 90 | add_structure(alms, model="cv", structure="structure") 91 | cop = CoPaR(alms, ref="cogids", structure="structure", minrefs=1) 92 | cop.get_sites() 93 | assert len(cop.sites) == 12 94 | cop.cluster_sites() 95 | assert len(cop.clusters) == 9 96 | cop.sites_to_pattern() 97 | cop.add_patterns() 98 | cop.irregular_patterns() 99 | cop.fuzziness() 100 | # get the cluster graph 101 | G = cop.get_cluster_graph() 102 | assert len(G.nodes) == len(cop.sites) 103 | 104 | # compute the purity of the cluster graph 105 | assert 
round(cop.purity(), 2) == 0.42 106 | cop.load_patterns() 107 | 108 | 109 | def test_CoPaR_plain(tmp_path): 110 | D = { 111 | 0: ["doculect", "concept", "ipa", "tokens", "cogid", "alignment"], 112 | 1: ["a", "a", "pla", "p l a", 1, "p l a -".split()], 113 | 2: ["b", "a", "pla", "p l a t", 1, "p l a t".split()], 114 | 3: ["c", "a", "pla", "p l u p", 1, "p l u p".split()], 115 | 4: ["d", "a", "pla", "p l a k", 1, "p l a k".split()], 116 | 5: ["a", "b", "pla", "t r a", 2, "t r a -".split()], 117 | 6: ["b", "b", "pla", "t a t", 2, "t - a t".split()], 118 | 7: ["c", "b", "pla", "d r ə p", 2, "d r ə p".split()], 119 | # 8: ["a", "b", "pla", "p l a k", [1], "d x a k".split()], 120 | 9: ["a", "c", "pla", "k l a", 3, "k r a -".split()], 121 | # 10: ["a", "c", "pla", "p l a t", [1], "k a t".split()], 122 | 11: ["c", "c", "pla", "k l ə p", 3, "k l ə p".split()], 123 | 12: ["d", "c", "pla", "g l a k", 3, "g l a k".split()], 124 | 13: ["d", "f", "buk", "b u k", 4, "b u k".split()], 125 | } 126 | alms = Alignments(D, ref="cogid", transcription="ipa") 127 | add_structure(alms, model="cv", structure="structure") 128 | cop = CoPaR(alms, ref="cogid", structure="structure", minrefs=1) 129 | 130 | with pytest.raises(ValueError): 131 | cop.write_patterns("f") 132 | with pytest.raises(ValueError): 133 | cop.predict_words() 134 | 135 | cop.get_sites() 136 | assert len(cop.sites) == 12 137 | cop.cluster_sites() 138 | assert len(cop.clusters) == 9 139 | cop.sites_to_pattern() 140 | cop.irregular_patterns() 141 | cop.add_patterns(proto="a", irregular_patterns=True) 142 | cop.fuzziness() 143 | # get the cluster graph 144 | G = cop.get_cluster_graph() 145 | assert len(G.nodes) == len(cop.sites) 146 | 147 | # compute the purity of the cluster graph 148 | assert round(cop.purity(), 2) == 0.42 149 | 150 | assert cop.upper_bound() > 1 151 | 152 | cop.write_patterns(tmp_path / 'test') 153 | cop.write_patterns(tmp_path / 'test', proto="a", irregular_patterns=True) 154 | cop.predict_words() 155 | cop.load_patterns() 156 | 157 | 158 | def test_polynesian(data): 159 | cop = CoPaR(str(data / "east-polynesian.tsv"), ref="cogid", segments="segments") 160 | cop.align() 161 | cop.get_sites() 162 | cop.cluster_sites() 163 | cop.add_patterns() 164 | 165 | 166 | def test_warnings(): 167 | D = { 168 | 0: ["doculect", "concept", "ipa", "tokens", "cogids", "alignment", "structure"], 169 | 1: ["a", "a", "pla", "p l a", [1], "p l a -".split(), "i m n c".split()], 170 | 2: ["b", "a", "pla", "p l a t", [1], "p l a t".split(), "i n c".split()], 171 | 3: ["c", "a", "pla", "p l u p", [1], "p l u p".split(), "i m n c".split()], 172 | 4: ["d", "a", "pla", "p l a k", [1], "p l a k".split(), "i m n c".split()], 173 | } 174 | alms = Alignments(D, ref="cogids") 175 | cop = CoPaR(alms, structure="structure") 176 | with pytest.raises(ValueError): 177 | cop.get_sites() 178 | D = { 179 | 0: ["doculect", "concept", "ipa", "tokens", "cogids", "alignment", "structure"], 180 | 1: ["a", "a", "pla", "p l a", [1], "p !l a -".split(), "i m n".split()], 181 | 2: ["b", "a", "pla", "p l a t", [1], "p f/l a t".split(), "i m n c".split()], 182 | 3: ["c", "a", "pla", "p l u p", [1], "p l u p".split(), "i m n c".split()], 183 | 4: ["d", "a", "pla", "p l a k", [1], "p l a k".split(), "i m n c".split()], 184 | } 185 | cop = CoPaR(D) 186 | cop.get_sites() 187 | -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test evaluate 
module of lingrex. 3 | """ 4 | from lingrex.evaluate import ( 5 | compare_cognate_sets, 6 | cross_semantic_cognate_statistics 7 | ) 8 | from lingpy import Wordlist 9 | 10 | 11 | def test_compare_cognate_sets(): 12 | 13 | wordlist = Wordlist({ 14 | 0: ["doculect", "concept", "form", "looseid", "strictid"], 15 | 1: ["a", "a", "b", "1", "2"], 16 | 2: ["b", "a", "c", "1", "3"], 17 | 3: ["c", "a", "c", "1", "2"], 18 | 4: ["d", "a", "d", "1", "4"] 19 | }) 20 | ranks = compare_cognate_sets( 21 | wordlist, "strictid", "looseid") 22 | assert len(ranks) == 1 23 | assert ranks[0][0] == "a" 24 | assert ranks[0][1] == 1 25 | assert ranks[0][2] == 0.375 26 | 27 | 28 | def test_cross_semantic_cognate_statistics(): 29 | 30 | wordlist = Wordlist({ 31 | 0: ["doculect", "concept", "form", "cogids", "morphemes"], 32 | 1: ["a", "A", "a + b", "1 2", "a _suf"], 33 | 2: ["b", "A", "a + b", "1 2", "a _suf"], 34 | 3: ["c", "A", "c + d + a", "3 4 1", "_suf d a"], 35 | 4: ["d", "A", "d + e", "4 5", "d e"], 36 | 5: ["a", "B", "a + f", "1 6", "a f"], 37 | 6: ["b", "B", "a + f", "1 6", "a f"], 38 | 7: ["c", "C", "g + h + a", "7 8 1", "g h a"], 39 | 8: ["d", "C", "h + i", "8 9", "h i"], 40 | }) 41 | ranks = cross_semantic_cognate_statistics( 42 | wordlist, 43 | concept="concept", 44 | morpheme_glosses="morphemes", 45 | ignore_affixes=True 46 | ) 47 | assert len(ranks) == 3 48 | assert ranks[0][0] == "C" 49 | assert ranks[2][1] == 0.625 50 | wordlist = Wordlist({ 51 | 0: ["doculect", "concept", "form", "cogids", "morphemes"], 52 | 1: ["a", "A", "a + b", "1 2", "a _suf"], 53 | 2: ["b", "A", "a + b", "1 2", "a _suf"], 54 | 3: ["c", "A", "c + d + a", "3 4 1", "_suf d a"], 55 | 4: ["d", "A", "d + e", "4 5", "d e"], 56 | 5: ["a", "B", "a + f", "1 6", "a f"], 57 | 6: ["b", "B", "a + f", "1 6", "a f"], 58 | 7: ["c", "C", "g + h + a", "7 8 1", "g h a"], 59 | 8: ["d", "C", "h + i", "8 9", "h i"], 60 | }) 61 | ranks2 = cross_semantic_cognate_statistics( 62 | wordlist, 63 | concept="concept", 64 | morpheme_glosses="morphemes", 65 | ignore_affixes=False 66 | ) 67 | assert ranks2[2][1] != ranks[2][1] 68 | assert ranks2[2][1] == 0.5 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /tests/test_fuzzy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test fuzzy reconstruction. 3 | """ 4 | import pytest 5 | from lingrex.fuzzy import FuzzyReconstructor, ntile 6 | from lingrex.reconstruct import CorPaRClassifier 7 | import random 8 | import lingpy 9 | 10 | 11 | 12 | def test_ntile(): 13 | assert set( 14 | ntile( 15 | ["kap", "kap", "kup", "kup" 16 | ], 2).split(" ")[1].split("|")) == set(["a", "u"]) 17 | # counting is not the same for missing data! 
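    # Judging from the expected output, the missing-data marker "Ø" is set aside when
    # the percentiles are computed, so a column whose only attested value is "a"
    # yields "a|a" rather than a bin containing "Ø".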
18 | assert ntile(["kap", "kØp", "kØp"], n=2) == 'k|k a|a p|p' 19 | 20 | def test_FuzzyReconstructor(data): 21 | random.seed(1234) 22 | 23 | pytest.raises(ValueError, FuzzyReconstructor, 1, "ProtoBurmish") 24 | pt = FuzzyReconstructor(str(data / "hillburmish.tsv"), "ProtoBurmish", ref="cogids", 25 | fuzzy=False) 26 | alms = lingpy.align.sca.Alignments( 27 | str(data / "hillburmish.tsv"), 28 | transcription="form", ref="cogids") 29 | pt = FuzzyReconstructor(alms, "ProtoBurmish", ref="cogids", 30 | fuzzy=False) 31 | pt.random_splits() 32 | assert hasattr(pt, "wordlists") 33 | 34 | clf = lambda: CorPaRClassifier() 35 | pt.fit_samples(clf) 36 | predis = pt.predict( 37 | pt.wordlist.msa["cogids"][665]["seqs"][:3], 38 | ["Atsi", "Lashi", "OldBurmese"], 39 | desegment=True 40 | ) 41 | assert predis[0] == "ŋ:100" 42 | predis = pt.predict( 43 | pt.wordlist.msa["cogids"][666]["seqs"][:3], 44 | ["Atsi", "Lashi", "OldBurmese"], 45 | desegment=True 46 | ) 47 | assert predis[-1] == "?:90¦⁴:10" 48 | 49 | predis = pt.predict( 50 | pt.wordlist.msa["cogids"][665]["seqs"][:3], 51 | ["Atsi", "Lashi", "OldBurmese"], 52 | desegment=True, 53 | output="percentiles" 54 | ) 55 | assert predis[0] == "ŋ:100" 56 | 57 | words, predis = pt.predict( 58 | pt.wordlist.msa["cogids"][665]["seqs"][:3], 59 | ["Atsi", "Lashi", "OldBurmese"], 60 | desegment=True, 61 | output="wp" 62 | ) 63 | assert predis[0] == "ŋ:100" 64 | 65 | words = pt.predict( 66 | pt.wordlist.msa["cogids"][665]["seqs"][:3], 67 | ["Atsi", "Lashi", "OldBurmese"], 68 | desegment=True, 69 | output="words" 70 | ) 71 | assert words[0][0] == "ŋ" 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /tests/test_reconstruct.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the reconstruction module of lingrex. 
3 | """ 4 | import pytest 5 | from lingrex.reconstruct import ( 6 | CorPaRClassifier, 7 | OneHot, 8 | ReconstructionBase, 9 | PatternReconstructor, 10 | transform_alignment, 11 | eval_by_dist, 12 | eval_by_bcubes 13 | ) 14 | from functools import partial 15 | 16 | 17 | 18 | def test_transform_alignment(): 19 | 20 | out = transform_alignment( 21 | [["b", "a", "k"], ["b", "a"]], 22 | ["a", "b"], 23 | ["a", "b", "u"], 24 | training=False 25 | ) 26 | assert len(out) == 3 27 | 28 | out = transform_alignment( 29 | [["b", "k"], ["b", "a", "k"]], 30 | ["a", "b"], 31 | ["a", "b", "u"], 32 | training=True, 33 | 34 | ) 35 | assert len(out) == 2 36 | 37 | out = transform_alignment( 38 | [["b", "k"], ["b", "a", "k"]], 39 | ["a", "b"], 40 | ["a", "b", "u"], 41 | training=True, 42 | firstlast=True 43 | 44 | ) 45 | assert out[0][-1] == "k" 46 | 47 | out = transform_alignment( 48 | [["b", "k"], ["b", "a", "k"]], 49 | ["a", "b"], 50 | ["a", "b", "u"], 51 | training=True, 52 | startend=True 53 | ) 54 | assert out[0][-1] == 0 55 | 56 | 57 | def test_PatternReconstructor(data): 58 | 59 | pt = PatternReconstructor(str(data / "hillburmish.tsv"), "ProtoBurmish", ref="cogids", 60 | ) 61 | t1 = partial(transform_alignment, align=True, position=False, 62 | prosody=False, startend=False, firstlast=False) 63 | t2 = partial(transform_alignment, align=True, position=True, 64 | prosody=True, startend=True, firstlast=True) 65 | pt.fit(func=t1) 66 | assert pt.predict( 67 | pt.msa["cogids"][665]["seqs"][:3], 68 | ["Atsi", "Lashi", "OldBurmese"], 69 | desegment=True 70 | ) == ['ŋ', 'a', '¹'] 71 | pt.fit(func=t2) 72 | assert pt.predict( 73 | pt.msa["cogids"][665]["seqs"][:3], 74 | ["Atsi", "Lashi", "OldBurmese"], 75 | desegment=True 76 | ) == ['ŋ', 'a', '¹'] 77 | 78 | pt.fit(func=t1, onehot=True) 79 | assert pt.predict( 80 | pt.msa["cogids"][665]["seqs"][:3], 81 | ["Atsi", "Lashi", "OldBurmese"], 82 | desegment=True 83 | ) == ['ŋ', 'a', '¹'] 84 | 85 | def test_eval_by_dist(): 86 | assert eval_by_dist([[["t", "a"], ["t", "o"]]]) == 1 87 | assert eval_by_dist([[["t", "a"], []]]) == 2 88 | 89 | assert eval_by_dist([[["t", "a"], ["t", "o"]]], normalized=True) == 0.5 90 | 91 | def test_eval_by_bcubes(): 92 | assert eval_by_bcubes([[["t", "a"], ["t", "a"]]]) == 1 93 | assert eval_by_bcubes([ 94 | [["t", "a"], ["t", "o"]] 95 | ]) == 1.0 96 | assert eval_by_bcubes([ 97 | [["t", "a"], []] 98 | ]) == 1 99 | 100 | 101 | -------------------------------------------------------------------------------- /tests/test_regularity.py: -------------------------------------------------------------------------------- 1 | from pytest import raises 2 | from lingpy import Wordlist, Alignments 3 | from lingrex.copar import CoPaR 4 | from lingrex.util import add_structure 5 | from lingrex.regularity import regularity 6 | 7 | 8 | dummy_wl = { 9 | 0: ["doculect", "concept", "form", "ipa", "alignment", "cogid"], 10 | 1: ["A", "one", "atawu", "atawu", "a t a w u", 1], 11 | 2: ["B", "one", "atwu", "atwu", "a t - w u", 1], 12 | 3: ["C", "one", "tawu", "tawu", "- t a w u", 1], 13 | 4: ["D", "one", "tefu", "tefu", "- t e f u", 1], 14 | 5: ["A", "two", "satu", "satu", "s a t u", 2], 15 | 6: ["B", "two", "setu", "setu", "s e t u", 2], 16 | 7: ["C", "two", "situ", "situ", "s i t u", 2] 17 | } 18 | 19 | 20 | def test_regularity(): 21 | test_wl = Wordlist(dummy_wl) 22 | with raises(ValueError): 23 | regularity(test_wl) 24 | 25 | test_alg = Alignments(test_wl) 26 | add_structure(test_alg, model="cv", structure="structure") 27 | print(test_alg.structure) 28 | test_alg 
= CoPaR(test_alg, ref="cogid") 29 | test_alg.get_sites() 30 | test_alg.cluster_sites() 31 | test_alg.sites_to_pattern() 32 | output = regularity(test_alg, threshold=2, word_threshold=0.5, 33 | sound_classes="cv") 34 | 35 | assert output == (2, 5, 7, 0.29, 4, 5, 9, 0.44, 3, 4, 7, 0.43) 36 | -------------------------------------------------------------------------------- /tests/test_trimming.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lingpy import Alignments 4 | 5 | from lingrex.trimming import * 6 | 7 | 8 | def test_Site(): 9 | site = Site([GAP, 'a', GAP, 't']) 10 | assert site.gap_ratio() == pytest.approx(0.5) 11 | assert site.gap_ratio(gap='#') == pytest.approx(0.0) 12 | assert site.soundclass() == 'V' 13 | assert site.soundclass(gap='a') == '0' 14 | 15 | 16 | @pytest.mark.parametrize( 17 | 'alms,gap,ratios', 18 | [ 19 | (["aaa", "aa-"], '-', [0.0, 0.0, 0.5]), 20 | (["aa", "a#"], '#', [0.0, 0.5]), 21 | ] 22 | ) 23 | def test_gap_ratio(alms, gap, ratios): 24 | assert Sites([list(w) for w in alms], gap=gap).gap_ratios == ratios 25 | 26 | 27 | def test_trimmed(): 28 | alm = [list("toxta-"), list("to-tir"), list("to-t-r"), list("do--ar")] 29 | assert " ".join(Sites(alm)._trimmed([2, 5]).to_alignment()[0]) == "t o t a" 30 | 31 | 32 | def test_soundclasses(): 33 | assert Sites([list("-bc"), list("ab-")], gap="-").soundclasses == ["V", "C", "C"] 34 | 35 | 36 | @pytest.mark.parametrize( 37 | 'alms,kw,result', 38 | [ 39 | (["abc", "a-c", "--c"], {}, list('ac')), 40 | (["abc", "a-c", "--c"], dict(skeletons=['VCC']), list('abc')), 41 | (["a+bco", "-+cco", "-+cco"], {}, list('bco')), 42 | (["a+b", "-+c", "-+c"], dict(exclude=""), list('a+b')), 43 | ([ 44 | #"- - n u - - 'b/b a".split(), 45 | '- - - - d ù/u - -'.split(), 46 | '- - - - d ú/u - -'.split(), 47 | '- - - - d ù/u - -'.split(), 48 | "ɾ u 'w/w a s i ɾ a".split(), 49 | '- - - - s u - e'.split(), 50 | "- - n u - - 'b/b a".split(), 51 | '- - - - d u l -'.split(), 52 | '- - n u k - w ɔ'.split(), 53 | ], {}, ['d', 'ù/u']), 54 | ([ 55 | "- - n u - - 'b/b a".split(), 56 | '- - - - d ù/u - -'.split(), 57 | '- - - - d ú/u - -'.split(), 58 | '- - - - d ù/u - -'.split(), 59 | "ɾ u 'w/w a s i ɾ a".split(), 60 | '- - - - s u - e'.split(), 61 | #"- - n u - - 'b/b a".split(), 62 | '- - - - d u l -'.split(), 63 | '- - n u k - w ɔ'.split(), 64 | ], {}, ['-', '-']), 65 | # Non-overlapping alignments: 66 | (['- - a b'.split(), 'a b - -'.split(), 'a b - -'.split()], {}, ['-', '-']), 67 | # 68 | (['- a b'.split(), 'b a -'.split(), 'b a -'.split()], {}, ['-', 'a']), 69 | (['- a b'.split(), 'b - -'.split(), 'b - -'.split()], {}, ['-', 'a']), 70 | ([ 71 | '- a b c'.split(), 72 | 'b - - -'.split(), 73 | 'b - - -'.split(), 74 | 'b - - d'.split() 75 | ], {}, ['-', 'a', 'c']), 76 | ([list('bbabb'), list('bb-bb'), list('-b-b-'), list('-b-b-')], {}, list('bbabb')), 77 | ([list('bbabb'), list('bb-bb'), list('-b-b-'), list('-b-b-')], {'strict_ratio': False}, list('bab')), 78 | ] 79 | ) 80 | def test_trim_by_gap(alms, kw, result): 81 | assert Sites([list(w) for w in alms]).trimmed(**kw).to_alignment()[0] == result 82 | 83 | 84 | @pytest.mark.parametrize( 85 | 'alms,kw,result', 86 | [ 87 | (["--mat", "-xmut", "--mit", "m-xit"], {}, list('mat')), 88 | (["--mat--", "-xmut--", "--mitx-", "m-xit-x"], {}, list('mat')), 89 | ([ 90 | "- - n u - - 'b/b a".split(), 91 | '- - - - d ù/u - -'.split(), 92 | '- - - - d ú/u - -'.split(), 93 | '- - - - d ù/u - -'.split(), 94 | "ɾ u 'w/w a s i ɾ a".split(), 95 | '- - - - 
s u - e'.split(), 96 | "- - n u - - 'b/b a".split(), 97 | '- - - - d u l -'.split(), 98 | '- - n u k - w ɔ'.split(), 99 | ], {}, ['-', '-', "'b/b", 'a']), 100 | ([list('bbabb'), list('bb-bb'), list('-b-b-'), list('-b-b-')], {}, list('bab')), 101 | ] 102 | ) 103 | def test_trim_by_core(alms, kw, result): 104 | sites = Sites([list(w) for w in alms]) 105 | assert sites.trimmed(strategy='core', **kw).to_alignment()[0] == result 106 | assert str(sites) 107 | 108 | 109 | def test_trim_random(mocker): 110 | mocker.patch('lingrex.trimming.random', mocker.Mock(sample=lambda pop, k: list(pop)[:k])) 111 | alms = [list(w) for w in ["--mat", "-xmut", "m-xut", "--xit"]] 112 | assert len(Sites(alms).trimmed()) == len(Sites(alms).trimmed_random()) 113 | assert set(Sites(alms).trimmed().soundclasses) == \ 114 | set(Sites(alms).trimmed_random().soundclasses) 115 | assert Sites(alms).trimmed_random(strategy='core') 116 | 117 | 118 | def test_prep_alignments(wl_with_alignments): 119 | test_wl = prep_alignments(Alignments(wl_with_alignments, transcription="form")) 120 | assert test_wl[4, "structure"] == "C V C V" 121 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from lingpy import Wordlist, Alignments 3 | from lingrex.util import lingrex_path, add_structure 4 | from lingrex.util import ungap, clean_sound, unjoin, alm2tok, bleu_score 5 | from lingrex.util import prep_wordlist, subsequence_of 6 | 7 | 8 | @pytest.mark.parametrize( 9 | 'source,target,result', 10 | [ 11 | ('cvc', 'cvcvc', True), 12 | ('cvc', 'cxcvc', True), 13 | ('vcc', 'vc', False), 14 | ('vcc', 'vcc', True), 15 | ('bla', 'bla', True), 16 | ('bla', 'bxlyaz', True), 17 | ('bla', 'abxlyaz', True), 18 | ('bla', 'abxalyz', False), 19 | ('abc', 'ab', False), 20 | ] 21 | ) 22 | def test_subsequence_of(source, target, result): 23 | assert subsequence_of(source, target) == result 24 | 25 | 26 | def test_bleu_score(): 27 | candidate = "this is a test".split() 28 | reference = "this is a small test".split() 29 | 30 | assert round( 31 | bleu_score( 32 | candidate, 33 | reference, 34 | weights=[0.5, 0.5], 35 | n=2, 36 | trim=True 37 | ), 38 | 2) == 0.64 39 | 40 | assert round( 41 | bleu_score( 42 | candidate, 43 | reference, 44 | weights=[0.5, 0.5], 45 | n=2, 46 | trim=False), 47 | 2) == 0.70 48 | 49 | assert round( 50 | bleu_score( 51 | candidate, 52 | reference, 53 | n=2, 54 | trim=False), 55 | 2) == 0.70 56 | 57 | 58 | def test_ungap(): 59 | matrix = ungap([['a', 'b'], ['x', '-'], ['y', '-']], ['proto', 'l1', 'l2'], 'proto') 60 | assert matrix[0][0] == 'a.b' 61 | assert matrix[1][0] == 'x' 62 | assert matrix[2][0] == "y" 63 | matrix2 = ungap([['a', 'b'], ['x', '-'], ['y', 'h']], ['proto', 'l1', 'l2'], 'proto') 64 | assert matrix2[0][1] == ["a", "b"][1] 65 | assert matrix2[1][1] == ["x", "-"][1] 66 | assert matrix2[2][1] == ["y", "h"][1] 67 | 68 | out = ungap([["p", "-", "a"], ["p", "j", "a"]], ["German", "E"], "E") 69 | assert out[1][0] == "p.j" 70 | 71 | alm = [['a', 'b'], ['-', '-'], ['-', '-']] 72 | assert ungap(alm, ['p', 'l1', 'l2'], 'p') == alm 73 | 74 | 75 | def test_clean_sound(): 76 | assert clean_sound("a/b") == "b" 77 | assert clean_sound("a") == "a" 78 | assert clean_sound("a/b.c/d") == "b.d" 79 | 80 | 81 | def test_unjoin(): 82 | assert unjoin("k.p a p u k.a/b".split())[0] == "k" 83 | 84 | 85 | def test_lingrex_path(): 86 | lingrex_path("test") 87 | 88 | 89 | def test_add_structure(): 
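    # add_structure() (defined in src/lingrex/util.py) accepts the models "cv", "c",
    # "CcV", "nogap" and "ps" for both plain cogids and partial (fuzzy) cogids and
    # raises a ValueError for any other model string; a typical call is
    # add_structure(wl, "cv"), which adds a "structure" column to the wordlist.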
90 | 91 | with pytest.raises(ValueError): 92 | add_structure( 93 | Wordlist( 94 | { 95 | 0: ["doculect", "concept", "tokens", "cogid"], 96 | 1: ["a", "b", "b l a".split(), 1], 97 | 2: ["b", "b", "b l a x".split(), 1], 98 | 3: ["c", "b", "b l i k u s".split(), 1], 99 | } 100 | ), 101 | model="bla", 102 | ) 103 | 104 | for m in ["cv", "c", "CcV", "nogap", "ps"]: 105 | D = { 106 | 0: ["doculect", "concept", "tokens", "cogid"], 107 | 1: ["a", "b", "b l a".split(), 1], 108 | 2: ["b", "b", "b l a x".split(), 1], 109 | 3: ["c", "b", "b l i k u s".split(), 1], 110 | 4: ["d", "b", "b l u k", 2], 111 | } 112 | wl = Alignments(D, transcription="tokens") 113 | add_structure(wl, m) 114 | 115 | for m in ["cv", "c", "CcV", "nogap", "ps"]: 116 | D = { 117 | 0: ["doculect", "concept", "tokens", "cogids"], 118 | 1: ["a", "b", "b l a".split(), [1]], 119 | 2: ["b", "b", "b l a x".split(), [1]], 120 | 3: ["c", "b", "b l i k u s".split(), [1]], 121 | } 122 | wl = Alignments(D, ref="cogids", transcription="tokens") 123 | add_structure(wl, m, ref="cogids") 124 | 125 | 126 | def test_prep_wordlist(wl_with_alignments): 127 | test_wl = prep_wordlist(Wordlist(wl_with_alignments)) 128 | 129 | assert len(test_wl) == 4 130 | assert "+" not in test_wl[1, "tokens"] 131 | assert "_" not in test_wl[2, "tokens"] 132 | -------------------------------------------------------------------------------- /tests/test_workflows.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import pathlib 3 | import subprocess 4 | 5 | import pytest 6 | 7 | 8 | @pytest.fixture 9 | def clean_dir(tmp_path): # pragma: no cover 10 | def _clean_dir(d): 11 | shutil.copytree(pathlib.Path(__file__).parent / 'workflows' / d, tmp_path / d) 12 | return tmp_path / d 13 | return _clean_dir 14 | 15 | 16 | def _run(wd, *cmds): # pragma: no cover 17 | for cmd in cmds: 18 | try: 19 | subprocess.check_call(cmd, cwd=wd, shell=True) 20 | except subprocess.CalledProcessError as e: # pragma: no cover 21 | print(e) 22 | print(e.output) 23 | raise 24 | 25 | 26 | @pytest.mark.workflow 27 | def test_bodt(clean_dir): # pragma: no cover 28 | _run( 29 | clean_dir('bodt-2019'), 30 | 'python predict.py', 31 | 'python test-prediction.py bodt-khobwa-cleaned.tsv -r 0.5', 32 | ) 33 | 34 | 35 | @pytest.mark.workflow 36 | def test_list(clean_dir): # pragma: no cover 37 | _run( 38 | clean_dir('list-2019'), 39 | 'python general.py', 40 | 'python predict.py data/burmish-240-8.tsv -r 0.75 --runs 2', 41 | 'python predict.py data/chinese-623-14.tsv -r 0.75 --runs 2', 42 | 'python predict.py data/polynesian-210-10.tsv -r 0.75 --runs 2', 43 | 'python predict.py data/japanese-200-10.tsv -c crossid -r 0.75 --runs 2', 44 | ) 45 | 46 | 47 | @pytest.mark.workflow 48 | def test_wu(clean_dir): # pragma: no cover 49 | _run( 50 | clean_dir('wu-2020'), 51 | 'python 4_crosssemantic.py', 52 | 'python 5_correspondence.py', 53 | ) 54 | -------------------------------------------------------------------------------- /tests/workflows/bodt-2019/predict.py: -------------------------------------------------------------------------------- 1 | from lingpy import * 2 | from sys import argv 3 | from lingrex.copar import CoPaR 4 | from sys import argv 5 | 6 | cp = CoPaR('bodt-khobwa-cleaned.tsv', ref='crossids', fuzzy=True, 7 | minrefs=2, structure='structure', transcription="tokens") 8 | 9 | # make function to extract correspondence patterns 10 | cp.get_sites() 11 | cp.cluster_sites() 12 | cp.sites_to_pattern() 13 | 14 | preds, purity, pudity = 
cp.predict_words() 15 | goods = 0 16 | with open('predictions-automatic.tsv', 'w') as f: 17 | f.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\n'.format( 18 | 'NUMBER', 'GOOD_PREDICTION', 'COGNATESET', 'LANGUAGE', 'CONCEPT', 'MORPHEME', 'WORD1', 19 | 'WORD2', 'WORD3' 20 | )) 21 | num = 1 22 | for key, vals in sorted(preds.items(), key=lambda x: x[0]): 23 | # get the morphemes 24 | idx = cp.msa['crossids'][key]['ID'][0] 25 | cidx = cp[idx, 'crossids'].index(key) 26 | try: 27 | morph = cp[idx, 'morphemes'][cidx] 28 | except: 29 | morph = '?' 30 | for doc in vals: 31 | val1 = ' '.join([x.split('|')[0] for x in vals[doc]]) 32 | if "Ø" in val1: 33 | no = '?' 34 | else: 35 | no = '' 36 | goods += 1 37 | val2 = ' '.join(['|'.join(x.split('|')[0:2]) for x in vals[doc]]) 38 | val3 = ' '.join(vals[doc]) 39 | 40 | f.write('\t'.join([str(num), no, str(key), doc, cp[idx, 'concept'], 41 | morph, val1, val2, val3])+'\n') 42 | num += 1 43 | print('useful predictions', goods) 44 | -------------------------------------------------------------------------------- /tests/workflows/bodt-2019/results/README.md: -------------------------------------------------------------------------------- 1 | bla 2 | -------------------------------------------------------------------------------- /tests/workflows/bodt-2019/test-prediction.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import random 3 | import collections 4 | 5 | from lingrex.copar import * 6 | from lingpy.basictypes import * 7 | from tabulate import tabulate 8 | 9 | def run_experiments( 10 | f, 11 | ref, 12 | ratio, 13 | subset=None, 14 | runs=100, 15 | verbose=False, 16 | fuzzy=True, 17 | samples=1, 18 | noout=False, 19 | score_mode='pairs' 20 | ): 21 | 22 | if not noout: 23 | outfile = codecs.open( 24 | 'results/'+f.split('/')[-1][:-4]+'-'+str(int(ratio*100+0.5))+'.txt', 25 | 'w', 'utf-8') 26 | outfile.write('\t'.join([ 27 | 'accuracy', 'proportion', 'density', 'fuzziness', 'coverage', 28 | 'purity', 'sounds', 'missing', 'csetsize', 'clusters', 'props', 29 | 'patterns', 'predicted', 'predictable', 'removed', 'regular', 30 | 'purityx'])+'\n') 31 | 32 | cpb = CoPaR(f, ref=ref, fuzzy=fuzzy, split_on_tones=False, 33 | segments='segments', transcription="segments") 34 | 35 | if not noout: 36 | inout = codecs.open( 37 | 'results/'+f.split('/')[-1][:-4]+'-individual-'+str(int(ratio*100+0.5))+'.tsv', 38 | 'w', 'utf-8') 39 | inout.write('\t'.join(['run', 'doculect','accuracy', 'purity', 'words', 'sounds'])+'\n') 40 | 41 | # define the scores 42 | all_scores = [] 43 | all_samples = set() 44 | all_pscores = {d: [] for d in cpb.cols} 45 | all_pud = {d: [] for d in cpb.cols} 46 | all_words = {d: [] for d in cpb.cols} 47 | all_sounds = {d: [] for d in cpb.cols} 48 | for key, msa in cpb.msa[ref].items(): 49 | for alm, t in zip(msa['alignment'], msa['taxa']): 50 | all_samples.add((key, ' '.join(alm), t)) 51 | 52 | for run in range(runs): 53 | remove_idxs = random.sample(all_samples, int(len(all_samples)*ratio+0.5)) 54 | D = {0: cpb.columns} 55 | for idx, cogid, alm, tax, tokens, structures in cpb.iter_rows( 56 | ref, 'alignment', 'doculect', 'segments', 'structure'): 57 | if fuzzy: 58 | cogids, alms, toks, strucs = [], [], [], [] 59 | for c, a, t, s in zip(cogid, lists(alm).n, lists(tokens).n, 60 | lists(structures).n): 61 | if (c, str(a), tax) not in remove_idxs: 62 | cogids += [c] 63 | alms += [str(a)] 64 | toks += [str(t)] 65 | strucs += [str(s)] 66 | if not cogids: 67 | pass 68 | else: 69 | D[idx] = 
cpb[idx] 70 | D[idx][cpb.header[ref]] = ints(cogids) 71 | D[idx][cpb.header['segments']] = ' + '.join(toks) 72 | D[idx][cpb.header['structure']] = ' + '.join(strucs) 73 | D[idx][cpb.header['alignment']] = ' + '.join(alms) 74 | else: 75 | if (cogid, str(alm), tax) in remove_idxs: 76 | pass 77 | else: 78 | D[idx] = cpb[idx] 79 | 80 | cp = CoPaR(D, ref=ref, fuzzy=fuzzy, split_on_tones=False, 81 | segments='segments', transcription="segments", minrefs=2, 82 | structure="structure") 83 | if 'l' in argv: 84 | cp.load_patterns() 85 | else: 86 | cp.get_sites() 87 | cp.cluster_sites(score_mode=score_mode) 88 | cp.sites_to_pattern() 89 | 90 | # compute size of alphabets 91 | sounds = {d: collections.defaultdict(int) for d in cp.cols} 92 | for idx, doc, tks in cp.iter_rows('doculect', 'segments'): 93 | for t in tks: 94 | if t != '+': 95 | sounds[doc][t.split('/')[1] if '/' in t else t] += 1 96 | ave = sum([len(s) for s in sounds.values()]) / cp.width 97 | 98 | # good words 99 | our_sample = {} 100 | for cogid, alm, doc in remove_idxs: 101 | our_sample[cogid, doc] = strings(alm) 102 | pscores = {d: [] for d in cp.cols} 103 | 104 | regs = sum([len(a[1]) for a in cp.clusters.items() if len(a[1]) > 1]) / len(cp.sites) 105 | 106 | predicted, purity, pudity = cp.predict_words(minrefs=2, samples=samples) 107 | scores = [] 108 | unknown, all_segs, predictable, cogsize = 0, 0, 0, 0 109 | for k, v in predicted.items(): 110 | for doc in v: 111 | if (k, doc) in our_sample and (doc == subset or not subset): 112 | predictable += 1 113 | cogsize += len(cp.msa[ref][k]['ID']) 114 | 115 | # check for different alignments 116 | msaA = cp.msa[ref][k] 117 | msaB = cpb.msa[ref][k] 118 | if len(msaA['alignment'][0]) != len(msaB['alignment'][0]): 119 | # carve out the taxa which are still existent to find which 120 | # column to delete 121 | new_alm = [msaB['alignment'][i] for i in 122 | range(len(msaB['alignment'])) if msaB['taxa'][i] in \ 123 | msaA['taxa']] 124 | almA, almB = [], [] 125 | for i in range(len(msaA['alignment'][0])): 126 | almA += [tuple([line[i] for line in msaA['alignment']])] 127 | for i in range(len(msaB['alignment'][0])): 128 | almB += [tuple([line[i] for line in new_alm])] 129 | out = [] 130 | for i, col in enumerate(almB): 131 | if col not in almA: 132 | out += [i] 133 | else: 134 | out = [] 135 | 136 | wA, wB = v[doc], our_sample[k, doc] 137 | ms = 0 138 | wB = strings([x for i, x in enumerate(wB) if i not in out]) 139 | for a, b in zip(wA, wB): 140 | b = b.split('/')[1] if '/' in b else b 141 | a = a.split('|') 142 | for i, a_ in enumerate(a): 143 | if b == a_: 144 | ms += 1 * (1/(i+1)) 145 | if a[0] == 'Ø': 146 | unknown += 1 147 | all_segs += 1 148 | 149 | score = ms / len(wA) 150 | pscores[doc] += [score] 151 | if verbose: 152 | print('{0:5}\t{1:15}\t{2:20}\t{3:20}\t{4:.2f}\t{5}'.format( 153 | str(k), doc, str(wA), str(wB), score, len(set(msaA['taxa'])))) 154 | if verbose and score != 1.0: 155 | purs = [] 156 | for i, elm in enumerate(wA): 157 | if (k, i) in purity: 158 | purs += ['{0:.2f}'.format(purity[k, i][doc])] 159 | else: 160 | purs += ['?'] 161 | print((cogid, i) in cp.sites) 162 | print([_s for _s in cp.sites if _s[0] == cogid], 163 | cogid) 164 | print('<---') 165 | print('\t'.join([x for x in wA])) 166 | print('\t'.join([x for x in wB])) 167 | print('\t'.join(purs)) 168 | print('--->') 169 | scores += [score] 170 | ubound = cp.upper_bound() 171 | all_scores += [( 172 | sum(scores) / len(scores), 173 | len(cp) / len(cpb), 174 | density(cp, ref=ref), 175 | cp.fuzziness(), 176 | 
sum(pudity.values()) / len(pudity.values()), 177 | ave, 178 | unknown/all_segs, 179 | cogsize / predictable, 180 | len(cp.clusters), 181 | len(cp.clusters) / ubound, 182 | len(cp.sites), 183 | predictable / len(remove_idxs), 184 | predictable, 185 | len(remove_idxs), 186 | regs, 187 | cp.purity() 188 | )] 189 | if verbose: 190 | print('{0:.2f}'.format(all_scores[-1][0])) 191 | 192 | cov = cp.coverage() 193 | for p in pscores: 194 | all_pscores[p] += [sum(pscores[p]) / len(pscores[p])] 195 | all_pud[p] += [pudity[p]] 196 | all_words[p] += [cov[p]] 197 | all_sounds[p] += [len(sounds[p])] 198 | 199 | if not noout: 200 | inout.write('\t'.join([ 201 | str(run+1), 202 | p, 203 | str(all_pscores[p][-1]), 204 | str(pudity[p]), 205 | str(cov[p]), 206 | str(len(sounds[p])) 207 | ])+'\n') 208 | if not noout: 209 | outfile.write(str(run+1)+'\t'+'\t'.join(['{0:.4f}'.format(x) for x in 210 | all_scores[-1]])+'\n') 211 | print('{0:.2f} / {1:.2f}'.format(sum(scores) / len(scores), len(cp) / 212 | len(cpb))) 213 | 214 | 215 | new_scores = [[ 216 | 'accuracy', 'proportion', 'density' 217 | ]] 218 | new_scores += [[ 219 | round(sum([x[0] for x in all_scores]) / len(all_scores), 4), 220 | round(sum([x[1] for x in all_scores]) / len(all_scores), 4), 221 | round(sum([x[2] for x in all_scores]) / len(all_scores), 4), 222 | #round(sum([x[3] for x in all_scores]) / len(all_scores), 4), 223 | #round(sum([x[4] for x in all_scores]) / len(all_scores), 4), 224 | #round(sum([x[5] for x in all_scores]) / len(all_scores), 4), 225 | #round(sum([x[6] for x in all_scores]) / len(all_scores), 4), 226 | #round(sum([x[7] for x in all_scores]) / len(all_scores), 4), 227 | #round(sum([x[8] for x in all_scores]) / len(all_scores), 4), 228 | #round(sum([x[9] for x in all_scores]) / len(all_scores), 4), 229 | #round(sum([x[10] for x in all_scores]) / len(all_scores), 4), 230 | #round(sum([x[11] for x in all_scores]) / len(all_scores), 4), 231 | #round(sum([x[12] for x in all_scores]) / len(all_scores), 4), 232 | #round(sum([x[13] for x in all_scores]) / len(all_scores), 4), 233 | #round(sum([x[14] for x in all_scores]) / len(all_scores), 4), 234 | #round(sum([x[15] for x in all_scores]) / len(all_scores), 4), 235 | #round(sum([x[16] for x in all_scores]) / len(all_scores), 4), 236 | ]] 237 | if not noout: 238 | outfile.close() 239 | inout.close() 240 | 241 | 242 | if noout: 243 | print(tabulate(new_scores, headers='firstrow')) 244 | 245 | return purity, pudity, sounds, cp 246 | 247 | if __name__ == '__main__': 248 | from sys import argv 249 | 250 | # defaults 251 | f = argv[1] 252 | ref = 'crossids' 253 | ratio = 0.5 254 | proto = None 255 | verbose = False 256 | runs = 2 257 | samples = 1 258 | noout = False 259 | 260 | # parse arguments 261 | if '-r' in argv: 262 | ratio = float(argv[argv.index('-r')+1]) 263 | if '-c' in argv: 264 | ref = argv[argv.index('-c')+1] 265 | if '-v' in argv or '--verbose' in argv: 266 | verbose = True 267 | if '--runs' in argv: 268 | runs = int(argv[argv.index('--runs')+1]) 269 | if ref in ['crossid', 'cogid']: 270 | fuzzy = False 271 | else: 272 | fuzzy = True 273 | if '--samples' in argv: 274 | samples = int(argv[argv.index('--samples')+1]) 275 | if '--noout' in argv: 276 | noout = True 277 | 278 | if '--seed' in argv: 279 | random.seed(1) 280 | 281 | p1, p2, p3, cop = run_experiments( 282 | f, 283 | ref, 284 | ratio, 285 | fuzzy=fuzzy, 286 | verbose=verbose, 287 | runs=runs, 288 | samples=samples, 289 | noout=noout, 290 | ) 291 | if verbose: 292 | cop.add_patterns() 293 | cop.output( 294 | 'tsv', 
295 | filename='results/'+f.split('/')[1].split('-')[0]+str(int(100*ratio+0.5)), 296 | ignore='all', 297 | prettify=False 298 | ) 299 | 300 | 301 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/general.py: -------------------------------------------------------------------------------- 1 | from lingpy import * 2 | from lingrex.copar import * 3 | from glob import glob 4 | from tabulate import tabulate 5 | import numpy as np 6 | 7 | data = [ 8 | ('burmish-240-8', 'crossids'), 9 | ('chinese-623-14', 'crossids'), 10 | ('polynesian-210-10', 'crossids'), 11 | ('japanese-200-10', 'crossid') 12 | ] 13 | 14 | table = [[ 15 | 'dataset', 16 | 'sites', 17 | 'patterns', 18 | 'singletons', 19 | 'coverage', 20 | 'irregulars', 21 | 'purity' 22 | ]] 23 | for f, c in data: 24 | name = f.split('-')[0] 25 | print(name) 26 | cp = CoPaR('data/'+f+'.tsv', ref=c, fuzzy=(c=='crossids'), 27 | transcription="segments", segments='segments', 28 | minrefs=2, structure="structure") 29 | cp.get_sites() 30 | cp.cluster_sites() 31 | cp.sites_to_pattern() 32 | cp.add_patterns() 33 | singletons = len([a for a in cp.clusters.items() if len(a[1]) == 1]) 34 | cp.irregular_patterns() 35 | iregs = sum([len(a) for a in cp.ipatterns.values()]) 36 | table += [[ 37 | name, 38 | len(cp.sites), 39 | len(cp.clusters), 40 | singletons, 41 | '{0:.2f}'.format( 42 | (len(cp.sites)-singletons) / len(cp.sites)), 43 | iregs, 44 | cp.purity() 45 | ]] 46 | cp.output('tsv', filename='results/out-'+name) 47 | print(tabulate(table, headers='firstrow', tablefmt='latex')) 48 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/predict.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | from lingpy import * 4 | from lingrex.copar import * 5 | from sys import argv 6 | import random 7 | from lingpy.basictypes import * 8 | from tabulate import tabulate 9 | from lingpy.compare.sanity import average_coverage 10 | import codecs 11 | 12 | 13 | class CustomCoPaR(CoPaR): 14 | def stats(self, score_mode='pairs'): 15 | rest = 0 16 | if self._mode == 'fuzzy': 17 | for idx, cogids, alms in self.iter_rows(self._ref, self._alignment): 18 | for alm, cogid in zip(alms, cogids): 19 | if cogid not in self.msa[self._ref]: 20 | rest += len(alm) 21 | else: 22 | pass 23 | else: 24 | for idx, cogid, alm in self.iter_rows(self._ref, self._alignment): 25 | if cogid not in self.msa[self._ref]: 26 | rest += len(alm) 27 | scores = [0 for i in range(rest)] 28 | for (p, ptn), sites in self.clusters.items(): 29 | scores += len(sites) * [score_patterns( 30 | [ 31 | self.sites[site][1] for site in sites 32 | ], mode=score_mode)] 33 | return sum(scores) / len(scores) 34 | 35 | 36 | def run_experiments( 37 | f, 38 | ref, 39 | ratio, 40 | subset=None, 41 | runs=100, 42 | verbose=False, 43 | fuzzy=True, 44 | samples=1, 45 | noout=False, 46 | score_mode='pairs' 47 | ): 48 | 49 | if not noout: 50 | outfile = codecs.open( 51 | 'results/'+f.split('/')[-1][:-4]+'-'+str(int(ratio*100+0.5))+'.txt', 52 | 'w', 'utf-8') 53 | outfile.write('\t'.join([ 54 | 'accuracy', 'proportion', 'density', 'fuzziness', 'coverage', 55 | 'purity', 'sounds', 'missing', 'csetsize', 'clusters', 'props', 56 | 'patterns', 'predicted', 'predictable', 'removed', 'regular', 57 | 'purityx'])+'\n') 58 | 59 | cpb = CustomCoPaR(f, ref=ref, fuzzy=fuzzy, split_on_tones=False, 60 | segments='segments', minrefs=2, structure="structure", 61 | 
transcription="segments") 62 | 63 | if not noout: 64 | inout = codecs.open( 65 | 'results/'+f.split('/')[-1][:-4]+'-individual-'+str(int(ratio*100+0.5))+'.tsv', 66 | 'w', 'utf-8') 67 | inout.write('\t'.join(['run', 'doculect','accuracy', 'purity', 'words', 'sounds'])+'\n') 68 | 69 | # define the scores 70 | all_scores = [] 71 | all_samples = set() 72 | all_pscores = {d: [] for d in cpb.cols} 73 | all_pud = {d: [] for d in cpb.cols} 74 | all_words = {d: [] for d in cpb.cols} 75 | all_sounds = {d: [] for d in cpb.cols} 76 | for key, msa in cpb.msa[ref].items(): 77 | for alm, t in zip(msa['alignment'], msa['taxa']): 78 | all_samples.add((key, ' '.join(alm), t)) 79 | 80 | for run in range(runs): 81 | remove_idxs = random.sample(list(all_samples), int(len(all_samples)*ratio+0.5)) 82 | D = {0: cpb.columns} 83 | for idx, cogid, alm, tax, tokens, structures in cpb.iter_rows( 84 | ref, 'alignment', 'doculect', 'segments', 'structure'): 85 | if fuzzy: 86 | cogids, alms, toks, strucs = [], [], [], [] 87 | for c, a, t, s in zip(cogid, lists(alm).n, lists(tokens).n, 88 | lists(structures).n): 89 | if (c, str(a), tax) not in remove_idxs: 90 | cogids += [c] 91 | alms += [str(a)] 92 | toks += [str(t)] 93 | strucs += [str(s)] 94 | if not cogids: 95 | pass 96 | else: 97 | D[idx] = cpb[idx] 98 | D[idx][cpb.header[ref]] = ints(cogids) 99 | D[idx][cpb.header['segments']] = ' + '.join(toks) 100 | D[idx][cpb.header['structure']] = ' + '.join(strucs) 101 | D[idx][cpb.header['alignment']] = ' + '.join(alms) 102 | else: 103 | if (cogid, str(alm), tax) in remove_idxs: 104 | pass 105 | else: 106 | D[idx] = cpb[idx] 107 | 108 | cp = CustomCoPaR(D, ref=ref, fuzzy=fuzzy, split_on_tones=False, 109 | segments='segments', minrefs=2, structure="structure", 110 | transcription="segments") 111 | if 'l' in argv: 112 | cp.load_patterns() 113 | else: 114 | cp.get_sites() 115 | cp.cluster_sites(score_mode=score_mode) 116 | cp.sites_to_pattern() 117 | 118 | # compute size of alphabets 119 | sounds = {d: collections.defaultdict(int) for d in cp.cols} 120 | for idx, doc, tks in cp.iter_rows('doculect', 'segments'): 121 | for t in tks: 122 | if t != '+': 123 | sounds[doc][t.split('/')[1] if '/' in t else t] += 1 124 | ave = sum([len(s) for s in sounds.values()]) / cp.width 125 | 126 | # good words 127 | our_sample = {} 128 | for cogid, alm, doc in remove_idxs: 129 | our_sample[cogid, doc] = strings(alm) 130 | pscores = {d: [] for d in cp.cols} 131 | 132 | regs = sum([len(a[1]) for a in cp.clusters.items() if len(a[1]) > 1]) / len(cp.sites) 133 | 134 | predicted, purity, pudity = cp.predict_words(minrefs=2, samples=samples) 135 | scores = [] 136 | unknown, all_segs, predictable, cogsize = 0, 0, 0, 0 137 | for k, v in predicted.items(): 138 | for doc in v: 139 | if (k, doc) in our_sample and (doc == subset or not subset): 140 | predictable += 1 141 | cogsize += len(cp.msa[ref][k]['ID']) 142 | 143 | # check for different alignments 144 | msaA = cp.msa[ref][k] 145 | msaB = cpb.msa[ref][k] 146 | if len(msaA['alignment'][0]) != len(msaB['alignment'][0]): 147 | # carve out the taxa which are still existent to find which 148 | # column to delete 149 | new_alm = [msaB['alignment'][i] for i in 150 | range(len(msaB['alignment'])) if msaB['taxa'][i] in \ 151 | msaA['taxa']] 152 | almA, almB = [], [] 153 | for i in range(len(msaA['alignment'][0])): 154 | almA += [tuple([line[i] for line in msaA['alignment']])] 155 | for i in range(len(msaB['alignment'][0])): 156 | almB += [tuple([line[i] for line in new_alm])] 157 | out = [] 158 | for i, col in 
enumerate(almB): 159 | if col not in almA: 160 | out += [i] 161 | else: 162 | out = [] 163 | 164 | wA, wB = v[doc], our_sample[k, doc] 165 | ms = 0 166 | wB = strings([x for i, x in enumerate(wB) if i not in out]) 167 | for a, b in zip(wA, wB): 168 | b = b.split('/')[1] if '/' in b else b 169 | a = a.split('|') 170 | for i, a_ in enumerate(a): 171 | if b == a_: 172 | ms += 1 * (1/(i+1)) 173 | if a[0] == 'Ø': 174 | unknown += 1 175 | all_segs += 1 176 | 177 | score = ms / len(wA) 178 | pscores[doc] += [score] 179 | if verbose: 180 | print('{0:5}\t{1:15}\t{2:20}\t{3:20}\t{4:.2f}\t{5}'.format( 181 | str(k), doc, str(wA), str(wB), score, len(set(msaA['taxa'])))) 182 | if verbose and score != 1.0: 183 | purs = [] 184 | for i, elm in enumerate(wA): 185 | if (k, i) in purity: 186 | purs += ['{0:.2f}'.format(purity[k, i][doc])] 187 | else: 188 | purs += ['?'] 189 | print((cogid, i) in cp.sites) 190 | print([_s for _s in cp.sites if _s[0] == cogid], 191 | cogid) 192 | print('<---') 193 | print('\t'.join([x for x in wA])) 194 | print('\t'.join([x for x in wB])) 195 | print('\t'.join(purs)) 196 | print('--->') 197 | scores += [score] 198 | ubound = cp.upper_bound() 199 | all_scores += [( 200 | sum(scores) / len(scores), 201 | len(cp) / len(cpb), 202 | density(cp, ref=ref), 203 | cp.fuzziness(), 204 | cp.stats(score_mode=score_mode), 205 | sum(pudity.values()) / len(pudity.values()), 206 | ave, 207 | unknown/all_segs, 208 | cogsize / predictable, 209 | len(cp.clusters), 210 | len(cp.clusters) / ubound, 211 | len(cp.sites), 212 | predictable / len(remove_idxs), 213 | predictable, 214 | len(remove_idxs), 215 | regs, 216 | cp.purity() 217 | )] 218 | if verbose: 219 | print('{0:.2f}'.format(all_scores[-1][0])) 220 | 221 | cov = cp.coverage() 222 | for p in pscores: 223 | all_pscores[p] += [sum(pscores[p]) / len(pscores[p])] 224 | all_pud[p] += [pudity[p]] 225 | all_words[p] += [cov[p]] 226 | all_sounds[p] += [len(sounds[p])] 227 | 228 | if not noout: 229 | inout.write('\t'.join([ 230 | str(run+1), 231 | p, 232 | str(all_pscores[p][-1]), 233 | str(pudity[p]), 234 | str(cov[p]), 235 | str(len(sounds[p])) 236 | ])+'\n') 237 | if not noout: 238 | outfile.write(str(run+1)+'\t'+'\t'.join(['{0:.4f}'.format(x) for x in 239 | all_scores[-1]])+'\n') 240 | print('{0:.2f} / {1:.2f}'.format(sum(scores) / len(scores), len(cp) / 241 | len(cpb))) 242 | 243 | 244 | new_scores = [[ 245 | 'accuracy', 'proportion', 'density', 'fuzziness', 'coverage', 246 | 'purity', 'sounds', 'missing', 'csetsize', 'clusters', 'props', 247 | 'patterns', 'predicted', 'predictable', 'removed', 'regs', 'purityx']] 248 | new_scores += [[ 249 | round(sum([x[0] for x in all_scores]) / len(all_scores), 4), 250 | round(sum([x[1] for x in all_scores]) / len(all_scores), 4), 251 | round(sum([x[2] for x in all_scores]) / len(all_scores), 4), 252 | round(sum([x[3] for x in all_scores]) / len(all_scores), 4), 253 | round(sum([x[4] for x in all_scores]) / len(all_scores), 4), 254 | round(sum([x[5] for x in all_scores]) / len(all_scores), 4), 255 | round(sum([x[6] for x in all_scores]) / len(all_scores), 4), 256 | round(sum([x[7] for x in all_scores]) / len(all_scores), 4), 257 | round(sum([x[8] for x in all_scores]) / len(all_scores), 4), 258 | round(sum([x[9] for x in all_scores]) / len(all_scores), 4), 259 | round(sum([x[10] for x in all_scores]) / len(all_scores), 4), 260 | round(sum([x[11] for x in all_scores]) / len(all_scores), 4), 261 | round(sum([x[12] for x in all_scores]) / len(all_scores), 4), 262 | round(sum([x[13] for x in all_scores]) / 
len(all_scores), 4), 263 | round(sum([x[14] for x in all_scores]) / len(all_scores), 4), 264 | round(sum([x[15] for x in all_scores]) / len(all_scores), 4), 265 | round(sum([x[16] for x in all_scores]) / len(all_scores), 4), 266 | ]] 267 | if not noout: 268 | outfile.close() 269 | inout.close() 270 | 271 | 272 | if noout: 273 | print(tabulate(new_scores, headers='firstrow')) 274 | 275 | return purity, pudity, sounds, cp 276 | 277 | if __name__ == '__main__': 278 | from sys import argv 279 | 280 | # defaults 281 | f = argv[1] 282 | ref = 'crossids' 283 | ratio = 0.5 284 | proto = None 285 | verbose = False 286 | runs = 100 287 | samples = 1 288 | noout = False 289 | 290 | # parse arguments 291 | if '-r' in argv: 292 | ratio = float(argv[argv.index('-r')+1]) 293 | if '-c' in argv: 294 | ref = argv[argv.index('-c')+1] 295 | if '-v' in argv or '--verbose' in argv: 296 | verbose = True 297 | if '--runs' in argv: 298 | runs = int(argv[argv.index('--runs')+1]) 299 | if ref in ['crossid', 'cogid']: 300 | fuzzy = False 301 | else: 302 | fuzzy = True 303 | if '--samples' in argv: 304 | samples = int(argv[argv.index('--samples')+1]) 305 | if '--noout' in argv: 306 | noout = True 307 | 308 | if '--seed' in argv: 309 | random.seed(1) 310 | 311 | p1, p2, p3, cop = run_experiments( 312 | f, 313 | ref, 314 | ratio, 315 | fuzzy=fuzzy, 316 | verbose=verbose, 317 | runs=runs, 318 | samples=samples, 319 | noout=noout, 320 | ) 321 | if verbose: 322 | cop.add_patterns() 323 | cop.output( 324 | 'tsv', 325 | filename='results/'+f.split('/')[1].split('-')[0]+str(int(100*ratio+0.5)), 326 | ignore='all', 327 | prettify=False 328 | ) 329 | 330 | 331 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/burmish-240-8-75.txt: -------------------------------------------------------------------------------- 1 | accuracy proportion density fuzziness coverage purity sounds missing csetsize clusters props patterns predicted predictable removed regular purityx 2 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/burmish-240-8-individual-75.tsv: -------------------------------------------------------------------------------- 1 | run doculect accuracy purity words sounds 2 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/chinese-623-14-75.txt: -------------------------------------------------------------------------------- 1 | accuracy proportion density fuzziness coverage purity sounds missing csetsize clusters props patterns predicted predictable removed regular purityx 2 | 1 0.6463 0.2982 0.6337 3.1740 0.0812 0.8311 38.6429 0.0673 3.5227 395.0000 0.1665 2374.0000 0.7893 4955.0000 6278.0000 0.9629 0.7026 3 | 2 0.6188 0.2981 0.6419 3.2229 0.0735 0.8218 38.3571 0.0742 3.5347 411.0000 0.1717 2396.0000 0.7983 5012.0000 6278.0000 0.9604 0.6931 4 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/chinese-623-14-individual-75.tsv: -------------------------------------------------------------------------------- 1 | run doculect accuracy purity words sounds 2 | 1 Beijing 0.6744631185807659 0.8506857410315988 189 39 3 | 1 Changsha 0.6591755319148934 0.8400152536420613 173 41 4 | 1 Fuzhou 0.6043639740018567 0.8470977566718286 193 35 5 | 1 Guangzhou 0.6362084456424079 0.8374048533808206 181 39 6 | 1 Jinan 0.6690212373037854 0.8379081874636615 193 42 7 | 1 Meixian 
0.67212962962963 0.8434349335736812 185 29 8 | 1 Nanchang 0.6397363465160077 0.8268322390279454 177 39 9 | 1 Nanjing 0.6633239171374766 0.8294375350540676 188 41 10 | 1 Shanghai 0.6420765027322405 0.8397458588789625 183 49 11 | 1 Suzhou 0.655978835978836 0.8226080732036224 172 47 12 | 1 Taibei 0.6005606523955145 0.7923705147078326 152 32 13 | 1 Taoyuan 0.6374999999999997 0.8149330026110324 157 31 14 | 1 Wenzhou 0.6068965517241384 0.8134687675211292 169 47 15 | 1 Xiamen 0.6850368324125231 0.8392717648505066 184 30 16 | 2 Beijing 0.6341463414634145 0.8277661174300526 179 38 17 | 2 Changsha 0.6031168831168832 0.82247660983077 171 39 18 | 2 Fuzhou 0.5586936936936938 0.8183899161818443 186 36 19 | 2 Guangzhou 0.614628623188406 0.8246327090967067 188 39 20 | 2 Jinan 0.637148047229791 0.8232004499357977 192 40 21 | 2 Meixian 0.6361261261261262 0.824309193838572 175 29 22 | 2 Nanchang 0.6289552238805969 0.8172120514596539 189 37 23 | 2 Nanjing 0.6422586520947178 0.8384229507738024 180 42 24 | 2 Shanghai 0.6264075067024129 0.8295751313354329 179 47 25 | 2 Suzhou 0.6202416918429002 0.8051516609454498 168 48 26 | 2 Taibei 0.576380042462845 0.8095576704338178 168 33 27 | 2 Taoyuan 0.6294585987261145 0.8073248812775506 169 32 28 | 2 Wenzhou 0.5803894927536231 0.8261543247139483 183 46 29 | 2 Xiamen 0.6708115183246076 0.8314737379495394 168 31 30 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/japanese-200-10-75.txt: -------------------------------------------------------------------------------- 1 | accuracy proportion density fuzziness coverage purity sounds missing csetsize clusters props patterns predicted predictable removed regular purityx 2 | 1 0.5297 0.3182 0.3381 2.3938 0.0005 0.7356 32.3000 0.1836 2.7977 149.0000 0.2358 645.0000 0.5227 702.0000 1343.0000 0.9256 0.6899 3 | 2 0.4828 0.3177 0.3377 2.3785 0.0012 0.7263 32.8000 0.1971 2.7854 144.0000 0.2326 642.0000 0.5309 713.0000 1343.0000 0.9097 0.7167 4 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/japanese-200-10-individual-75.tsv: -------------------------------------------------------------------------------- 1 | run doculect accuracy purity words sounds 2 | 1 Amami 0.38053221288515415 0.7248868060731588 57 37 3 | 1 Hachijō 0.5685611879160267 0.7477516322783029 80 33 4 | 1 Kagoshima 0.4537838139950816 0.7140323216889822 70 33 5 | 1 Kyōto 0.5509259259259259 0.6733292869047514 46 29 6 | 1 Kōchi 0.5926393728222995 0.7242764951529171 57 26 7 | 1 Miyako 0.42766884531590404 0.7591016034249201 78 42 8 | 1 Oki 0.547769066976384 0.7448634608988914 55 30 9 | 1 Sado 0.6281055900621118 0.7545201944366466 69 30 10 | 1 Shuri 0.46807181889149113 0.7354638428741851 69 32 11 | 1 Tōkyō 0.6226190476190475 0.7776557302734873 51 31 12 | 2 Amami 0.416017316017316 0.6837437518510925 56 35 13 | 2 Hachijō 0.4747655122655123 0.7243134412799231 79 35 14 | 2 Kagoshima 0.4744912494912495 0.7571340080753896 55 32 15 | 2 Kyōto 0.5557037674507556 0.7372087276595939 51 31 16 | 2 Kōchi 0.6043197278911564 0.7710841857961863 65 30 17 | 2 Miyako 0.29142195767195767 0.7082720008598198 80 41 18 | 2 Oki 0.43308217801888677 0.6922538976524312 55 27 19 | 2 Sado 0.5647407407407408 0.7331145433445981 67 34 20 | 2 Shuri 0.40829554043839755 0.7336815929986455 62 34 21 | 2 Tōkyō 0.5507012393998695 0.7225082960724509 61 29 22 | -------------------------------------------------------------------------------- 
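
The individual result files above all follow the per-doculect layout written by the prediction script (run, doculect, accuracy, purity, words, sounds). As a quick way to inspect them — a minimal sketch, assuming it is run from the repository root and using only the standard library — the mean accuracy per doculect can be aggregated like this:

import csv
import collections
import statistics

# Collect accuracy values per doculect across runs from one result file.
path = 'tests/workflows/list-2019/results/japanese-200-10-individual-75.tsv'
scores = collections.defaultdict(list)
with open(path, encoding='utf-8') as handle:
    for row in csv.DictReader(handle, delimiter='\t'):
        scores[row['doculect']].append(float(row['accuracy']))

# Report the mean accuracy for each doculect, sorted alphabetically.
for doculect, values in sorted(scores.items()):
    print('{0:20}{1:.4f}'.format(doculect, statistics.mean(values)))
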
/tests/workflows/list-2019/results/polynesian-210-10-75.txt: -------------------------------------------------------------------------------- 1 | accuracy proportion density fuzziness coverage purity sounds missing csetsize clusters props patterns predicted predictable removed regular purityx 2 | 1 0.7424 0.4545 0.1935 2.2027 0.0306 0.8429 19.0000 0.1039 3.2306 69.0000 0.1405 528.0000 0.3649 620.0000 1699.0000 0.9564 0.8270 3 | 2 0.7377 0.4429 0.1843 2.2170 0.0282 0.8357 19.0000 0.1135 2.9928 61.0000 0.1525 470.0000 0.3284 558.0000 1699.0000 0.9596 0.8412 4 | -------------------------------------------------------------------------------- /tests/workflows/list-2019/results/polynesian-210-10-individual-75.tsv: -------------------------------------------------------------------------------- 1 | run doculect accuracy purity words sounds 2 | 1 Austral_1213 0.7098360655737704 0.8369797720278489 86 16 3 | 1 Austral_128 0.8045893719806763 0.8336284193571929 93 18 4 | 1 Hawaiian_52 0.6782738095238096 0.8515142513002715 111 17 5 | 1 Mangareva_239 0.7875661375661377 0.853256880850012 104 22 6 | 1 Maori_85 0.6363095238095239 0.8017538854798525 95 20 7 | 1 NorthMarquesan_38 0.6984848484848484 0.8621553400918279 115 20 8 | 1 Rapanui_264 0.7744252873563217 0.8713437633013035 104 19 9 | 1 Sikaiana_243 0.7531446540880504 0.8360935254027674 91 20 10 | 1 Tahitian_173 0.7721264367816093 0.8183473427878986 74 19 11 | 1 Tuamotuan_246 0.8179687499999999 0.8635059484169247 91 19 12 | 2 Austral_1213 0.6935374149659864 0.7930747188193066 91 17 13 | 2 Austral_128 0.7558974358974357 0.8525730691600037 86 17 14 | 2 Hawaiian_52 0.7429824561403509 0.8765556454861049 101 18 15 | 2 Mangareva_239 0.7440251572327042 0.8626147997883004 96 22 16 | 2 Maori_85 0.7645502645502644 0.8442533194274631 94 19 17 | 2 NorthMarquesan_38 0.7689244663382592 0.8642062532709653 117 20 18 | 2 Rapanui_264 0.6323529411764706 0.8183746212518835 104 18 19 | 2 Sikaiana_243 0.6937984496124031 0.8316400309821942 102 20 20 | 2 Tahitian_173 0.7957627118644067 0.8161348483260537 69 19 21 | 2 Tuamotuan_246 0.7492460317460317 0.7972076549006404 94 20 22 | -------------------------------------------------------------------------------- /tests/workflows/wu-2020/4_crosssemantic.py: -------------------------------------------------------------------------------- 1 | from lingpy import * 2 | from lingrex.colex import find_colexified_alignments, find_bad_internal_alignments 3 | from lingrex.align import template_alignment 4 | from sys import argv 5 | 6 | if 'all' in argv: 7 | fname='A_Chen_' 8 | else: 9 | fname='D_Chen_' 10 | 11 | 12 | alms = Alignments(fname+'aligned.tsv', ref='cogids') 13 | print('[i] search for bad internal alignments') 14 | find_bad_internal_alignments(alms) 15 | 16 | print('[i] search for colexified alignments') 17 | find_colexified_alignments( 18 | alms, 19 | cognates='cogids', 20 | ref='crossids' 21 | ) 22 | 23 | # re-align the data 24 | print('[i] re-align the data') 25 | template_alignment(alms, 26 | ref='crossids', 27 | template='imnct+imnct+imnct+imnct+imnct+imnct', 28 | structure = 'structure', 29 | fuzzy=True, 30 | segments='tokens') 31 | 32 | alms.output('tsv', filename=fname+'crossids', prettify=False) 33 | -------------------------------------------------------------------------------- /tests/workflows/wu-2020/5_correspondence.py: -------------------------------------------------------------------------------- 1 | from lingrex.copar import CoPaR 2 | from sys import argv 3 | 4 | if 'all' in argv: 5 | fname='A_Chen_' 6 | else: 
7 | fname='D_Chen_' 8 | 9 | cop = CoPaR( 10 | fname+'crossids.tsv', 11 | ref='crossids', 12 | fuzzy=True, 13 | segments='tokens', 14 | minrefs=3, 15 | structure="structure" 16 | ) 17 | cop.get_sites() 18 | cop.cluster_sites() 19 | cop.sites_to_pattern() 20 | cop.add_patterns() 21 | cop.write_patterns(fname+'all_patterns.tsv') 22 | cop.output('tsv', filename=fname+'patterns', prettify=False) 23 | 24 | # statistics 25 | sps = ['i', 'm', 'n', 'c', 't'] 26 | 27 | total_correspondence_sets = len(cop.clusters) 28 | print('{0}: {1}'.format('Total number of sound correspondence sets', total_correspondence_sets)) 29 | 30 | print('The number of regular correspondence sets in each position') 31 | for sp in sps: 32 | t = [x[1] for x, y in cop.clusters.items() if len(y) > 1 and x[0] == sp] 33 | print('{0}: {1}'.format(sp, len(t))) 34 | 35 | print('The number of singleton correspondence sets in each position') 36 | for sp in sps: 37 | t = [x[1] for x, y in cop.clusters.items() if len(y) == 1 and x[0] == sp] 38 | print('{0}: {1}'.format(sp, len(t))) 39 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{38,39,310} 3 | skip_missing_interpreters = true 4 | 5 | [testenv] 6 | extras = test 7 | commands = pytest {posargs} 8 | --------------------------------------------------------------------------------
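
Taken together, the two wu-2020 scripts form a small pipeline: 4_crosssemantic.py detects bad internal and colexified alignments, builds cross-semantic cognate sets ('crossids'), and re-aligns the data, while 5_correspondence.py clusters the resulting alignment sites into correspondence patterns and reports how many of them are regular. A condensed end-to-end sketch of that pipeline — assuming it is run inside tests/workflows/wu-2020/ with the D_Chen_aligned.tsv input present — looks like this:

from lingpy import Alignments
from lingrex.align import template_alignment
from lingrex.colex import find_bad_internal_alignments, find_colexified_alignments
from lingrex.copar import CoPaR

# Step 1: mark bad internal alignments and colexifications, then re-align
# across semantic slots (as in 4_crosssemantic.py).
alms = Alignments('D_Chen_aligned.tsv', ref='cogids')
find_bad_internal_alignments(alms)
find_colexified_alignments(alms, cognates='cogids', ref='crossids')
template_alignment(
    alms,
    ref='crossids',
    template='imnct+imnct+imnct+imnct+imnct+imnct',
    structure='structure',
    fuzzy=True,
    segments='tokens')
alms.output('tsv', filename='D_Chen_crossids', prettify=False)

# Step 2: cluster alignment sites into correspondence patterns
# (as in 5_correspondence.py) and count the regular ones.
cop = CoPaR('D_Chen_crossids.tsv', ref='crossids', fuzzy=True,
            segments='tokens', minrefs=3, structure='structure')
cop.get_sites()
cop.cluster_sites()
cop.sites_to_pattern()
regular = [x for x, y in cop.clusters.items() if len(y) > 1]
print('regular correspondence sets: {0}'.format(len(regular)))
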