├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── LICENSE.md ├── Makefile ├── README.md ├── examples ├── ordinal │ ├── ordinal.diagnose │ ├── ordinal.kl │ └── ordinal.tex └── preservation │ ├── preservation.diagnose │ └── preservation.kl ├── img ├── preservation-after.png └── preservation-before.png ├── knowledge_clustering ├── __init__.py ├── _version.py ├── add_anchor.py ├── add_quotes.py ├── autofinder.py ├── check_update.py ├── clustering.py ├── config.py ├── cst.py ├── data │ ├── english.ini │ └── french.ini ├── diagnose.py ├── distance.py ├── file_updater.py ├── knowledges.py ├── misc.py ├── scope_meaning.py ├── scripts │ ├── __init__.py │ └── app.py └── tex_document.py ├── pyproject.toml ├── setup.cfg └── tests ├── .ordinal.diagnose.original ├── .ordinal.kl.original ├── .ordinal.kl.solution ├── .ordinal.tex.original ├── test_addquotes.py ├── test_anchor.py ├── test_autofinder.py └── test_clustering.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | deploy: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: '3.x' 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install build 30 | - name: Build package 31 | run: python -m build 32 | - name: Publish package 33 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 34 | with: 35 | user: __token__ 36 | password: ${{ secrets.PYPI_TOKEN }} 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | *.pyc 3 | *.egg-info* 4 | *.whl 5 | *.tar.gz 6 | dist/* 7 | build/* 8 | .vim/* 9 | kw-devel/* 10 | venv-black/* 11 | *.venv 12 | .vscode 13 | .coverage 14 | kl3.11/* -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Rémi Morvan, Thomas Colcombet and Aliaume Lopez 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | VENV-BLACK=black.venv
2 |
3 | .PHONY: black check test coverage build deploy-test
4 |
5 | black:
6 | 	source ./$(VENV-BLACK)/bin/activate && black .
7 |
8 | check:
9 | 	mypy knowledge_clustering/*.py --check-untyped-defs # Check typing
10 | 	pylint knowledge_clustering/*.py # Linter
11 |
12 | test:
13 | 	python -m pytest tests/ -v
14 |
15 | coverage:
16 | 	python -m pytest tests/ --cov
17 |
18 | build:
19 | 	python -m build .
20 |
21 | deploy-test: knowledge_clustering/_version.py
22 | 	python -m twine upload --repository testpypi dist/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # knowledge-clustering
2 |
3 | [![PyPI](https://img.shields.io/pypi/v/knowledge-clustering.svg)](https://pypi.python.org/pypi/knowledge-clustering)
4 |
5 | Command-line tool to help with the use of the [knowledge LaTeX package](https://ctan.org/pkg/knowledge).
6 | A tutorial on how to use both `knowledge` and `knowledge-clustering` can be found [here](https://github.com/remimorvan/knowledge-examples).
7 |
8 | ## Principle
9 |
10 | The goal of `knowledge-clustering` is to help the user write a LaTeX document with
11 | the [knowledge package](https://ctan.org/pkg/knowledge).
12 | It has three features:
13 |
14 | - **Clustering**: provide suggestions to the user of what notions should be grouped together.
15 | - **Add quotes**: find where you might have missed some quotes in your document.
16 | - **Anchor points**: find where you might have missed anchor points in your document.
17 |
18 | The **clustering** algorithm is meant to be used while writing your document, while the last two tools
19 | should be used when your document is (nearly) ready to be published, to check if everything is right.
20 |
21 | ## Installation
22 |
23 | To install (or upgrade) `knowledge-clustering`, you need to have Python 3.9 (or a more recent version), and then run
24 |
25 |     python3 -m pip install --upgrade knowledge-clustering
26 |
27 | and then
28 |
29 |     knowledge init
30 |
31 | To check if you have the latest version of `knowledge-clustering`, you can run
32 |
33 |     knowledge --version
34 |
35 | ## Clustering notions
36 |
37 | ### Syntax
38 |
39 | ```
40 | Usage: knowledge cluster [OPTIONS]
41 |
42 | Defines, as comments in the knowledge files, all the knowledges
43 | occurring in the file.
44 |
45 | Options:
46 | -k, --knowledge FILE File containing the knowledges that are already
47 | defined. Multiple files are allowed; new
48 | knowledges will be written in the last one. If
49 | the option is not specified, all .kl files in the
50 | current directory (and subdirectories,
51 | recursively) will be taken. If there are
52 | multiple files, exactly one of them must end
53 | with `default.kl`.
54 | -d, --diagnose FILE Diagnose file produced by LaTeX. If the option
55 | is not specified, the unique .diagnose file in
56 | the current directory (and subdirectories,
57 | recursively) is taken instead.
58 | -l, --lang [en|fr] Language of your TeX document.
59 | -S, --scope / --no-scope Print the scopes defined in the knowledge file
60 | and print the possible meanings of those scopes
61 | inferred by knowledge-clustering.
62 | -P, --print / --no-print Print all new knowledges.
63 | -N, --no-update / --update Don't look on PyPI if a newer version of
64 | knowledge-clustering is available.
65 | -c, --config-file TEXT Specify the configuration file. By default the
66 | configuration file in the folder
67 | /Users/rmorvan/knowledge-
68 | clustering/knowledge_clustering/data
69 | corresponding to your language is used.
70 | --help Show this message and exit.
71 | ```
72 |
73 | ### Example
74 |
75 | Example files can be found in the `examples/` folder.
76 |
77 | While writing some document, you have defined some knowledges in a file called `preservation.kl` (distinct
78 | from your main `LaTeX` file).
79 | You continued writing your `LaTeX` document (not provided in the `examples/` folder)
80 | for some time, and used some knowledges that were undefined.
81 | When compiling, `LaTeX` and the [`knowledge package`](https://ctan.org/pkg/knowledge) give you a warning
82 | and write in a `.diagnose` file some information explaining what went wrong. This `.diagnose` file contains
83 | a section called "Undefined knowledges" containing all knowledges used in your main `LaTeX` file but not
84 | defined in `preservation.kl`. We reproduced this section
85 | in the `preservation.diagnose` file.
86 |
87 | ![Screenshot of the `preservation.kl` and `preservation.diagnose` files before running knowledge-clustering. `preservation.kl` contains three knowledges, while `preservation.diagnose` contains five undefined knowledges.](img/preservation-before.png "Files `preservation.kl` and `preservation.diagnose` before running knowledge-clustering")
88 |
89 | Normally, you would add every undefined knowledge, one after the other, in your
90 | `preservation.kl`. This is quite burdensome and can
91 | largely be automated. This is precisely what `knowledge-clustering` does: after running
92 |
93 |     knowledge cluster -k preservation.kl -d preservation.diagnose
94 |
95 | your file `preservation.diagnose` is left unchanged
96 | but `preservation.kl` is updated with comments.
97 |
98 | The `cluster` subcommand is optional: you can also write `knowledge -k preservation.kl -d preservation.diagnose`.
99 |
100 | ![After running knowledge-clustering, the five undefined knowledges are included in the `preservation.kl` file as comments.](img/preservation-after.png "Files `preservation.kl` and `preservation.diagnose` after running knowledge-clustering")
101 |
102 | Now you simply have to check that the recommendations of `knowledge-clustering` are
103 | correct, and uncomment those lines.
104 |
105 | ### Autofinder
106 |
107 | If the current directory (and its recursive subdirectories) contains
108 | a unique `.diagnose` file and a unique `.kl` file,
109 | you can simply write `knowledge cluster` (or `knowledge`): the files will be automatically found.
110 |
111 | ### Multiple knowledge files
112 |
113 | If you have **multiple knowledge files**, you can use the `-k` option multiple times.
114 | For instance, you could write:
115 |
116 |     knowledge cluster -k 1.kl -k 2.kl -d ordinal.diagnose
117 |
118 | Synonyms of knowledges defined in `1.kl` (resp. `2.kl`) will be defined, as comments,
119 | in `1.kl` (resp. `2.kl`). New knowledges will always be added, as comments, to the last
120 | file, which is `2.kl` in the example.
121 |
122 | You can also use the autofinder in this case, using `knowledge cluster`
123 | or `knowledge`: if multiple `.kl` files are present in the current directory (and
124 | its recursive subdirectories), exactly one of them must end with `default.kl`: this is
125 | where new knowledges will be put.
126 |
127 | ## Adding quotes
128 |
129 | /!\ This feature is somewhat experimental.
130 |
131 | ```
132 | Usage: knowledge addquotes [OPTIONS]
133 |
134 | Finds knowledges defined in the knowledge files that appear in the tex file
135 | without quote symbols. Proposes to add quotes around them.
136 |
137 | Options:
138 | -t, --tex FILE Your TeX file. [required]
139 | -k, --knowledge FILE File containing the knowledges that are already
140 | defined. Multiple files are allowed; new
141 | knowledges will be written in the last one. If
142 | the option is not specified, all .kl files in the
143 | current directory (and subdirectories,
144 | recursively) will be taken. If there are
145 | multiple files, exactly one of them must end
146 | with `default.kl`.
147 | -p, --print INTEGER When finding a match, number of lines (preceding
148 | the match) that are printed in the prompt to the
149 | user.
150 | -N, --no-update / --update
151 | --help Show this message and exit.
152 | ```
153 |
154 | After running
155 |
156 |     knowledge addquotes -t mydocument.tex -k knowledges1.kl -k knowledges2.kl
157 |
158 | your prompt will propose to add quotes around defined knowledges,
159 | and to define synonyms of knowledges that occur in your TeX file. For instance, if
160 | "algorithm" is a defined knowledge and "algorithms" occurs in your TeX file, then
161 | it will propose to define "algorithms" as a synonym of the knowledge "algorithm",
162 | and to add a pair of quotes around the string "algorithms" that occurs in your TeX file.
163 |
164 | Whenever the algorithm finds a match for a knowledge, it will print the line of
165 | the document where it found the match, and emphasize the string corresponding to the knowledge.
166 | If you want to see more context around each match, you can use the `-p` (or `--print`) option
167 | to print more than one line.
168 |
169 | ## Finding missing anchor points
170 |
171 | ```
172 | Usage: knowledge anchor [OPTIONS]
173 |
174 | Prints a warning when a knowledge is introduced but is not preceded by an
175 | anchor point.
176 |
177 | Options:
178 | -t, --tex FILE Your TeX file. [required]
179 | -s, --space INTEGER Number of characters tolerated between an anchor
180 | point and the introduction of a knowledge.
181 | (Default value: 200)
182 | -N, --no-update / --update
183 | --help Show this message and exit.
184 | ```
185 |
186 | When one runs
187 |
188 |     knowledge anchor -t mydocument.tex
189 |
190 | the tool will print the lines of the document containing the
191 | introduction of a knowledge that is not preceded by an anchor point.
192 | The tolerance on how far away the anchor point can be from the
193 | introduction of a knowledge can be changed with the `-s` (or `--space`)
194 | option. The default value is 200 characters (corresponding to 2-3 lines in a
195 | TeX document).
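As an illustration, consider the following hypothetical TeX snippet (`\AP` and `\itemAP` are the anchor-point macros the tool looks for; knowledges are introduced with `""..."" ` or `\intro{...}`). The first introduction is fine, since an anchor point immediately precedes it; the second one would be reported, assuming the comment stands for more than 200 characters of text:

```tex
\AP A ""semigroup"" is a set equipped with an associative binary operation.
% ... more than 200 characters of text without any \AP or \itemAP ...
A ""monoid"" is a semigroup with a unit element. % `knowledge anchor` reports this line
```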
196 |
197 | ## Devel using virtualenv
198 |
199 | Using `venv` and the `--editable` option from `pip` allows for an easy
200 | setup of a development environment that will match a future user install without
201 | the hassle.
202 |
203 | For bash and zsh users
204 |
205 | ```bash
206 | python3 -m venv kl.venv
207 | source ./kl.venv/bin/activate
208 | python3 -m pip install --editable .
209 | ```
210 |
211 | For fish users
212 |
213 | ```fish
214 | python3 -m venv kl.venv
215 | source ./kl.venv/bin/activate.fish
216 | python3 -m pip install --editable .
217 | ```
218 |
219 | ## FAQ
220 |
221 | - `knowledge: command not found` after installing `knowledge-clustering`
222 |   > Make sure you have Python>=3.9.
223 |
224 | - When running `knowledge`, I obtain a long error message indicating "Resource punkt not found."
225 |   > Run `knowledge init`.
226 |
227 | - My shell doesn't autocomplete the command `knowledge`.
228 |   > Depending on whether you use `zsh` or `bash` write
229 |   >
230 |   >     eval "`pip completion --<shell>`"
231 |   >
232 |   > (where `<shell>` is either `zsh` or `bash`)
233 |   > in your `.zshrc` (or `.bashrc`) file and then,
234 |   > either launch a new terminal or run `source ~/.zshrc`
235 |   > (or `source ~/.bashrc`).
236 |
237 | - `Error: Got unexpected extra argument` when using multiple knowledge files.
238 |   > You should use the option `-k` before **every** knowledge file, like in
239 |   >
240 |   >     knowledge cluster -k 1.kl -k 2.kl -d blabla.diagnose
241 |
242 | - I've updated `knowledge-clustering` but I still don't have the latest version (which can be checked using `knowledge --version`):
243 |   This can happen if you have multiple versions of `python` (and multiple versions
244 |   of `knowledge-clustering`).
245 |   > Type `where python3`, and uninstall `knowledge-clustering`
246 |   from everywhere (using `<path>/python3 -m pip uninstall knowledge-clustering`).
247 |   Then try to reinstall `knowledge-clustering`
248 |   by running `python3 -m pip install --upgrade knowledge-clustering`.
249 |
--------------------------------------------------------------------------------
/examples/ordinal/ordinal.diagnose:
--------------------------------------------------------------------------------
1 | ***********
2 | * Summary *
3 | ***********
4 |
5 | 181 undefined knowledge(s).
6 | 1 autoreference(s) are introduced twice.
7 | 1 autoreference(s) are used but not introduced.
8 |
9 | 44 autoreference(s) are properly used.
10 | 1 autoreference(s) are defined but not used.
11 |
12 |
13 | ********
14 | * Help *
15 | ********
16 |
17 | \knowledgeconfigure{diagnose bar=false} deactivate `|'-notation in diagnose file.
18 | \knowledgeconfigure{diagnose help=false} deactivate long help in the diagnose file.
19 | \knowledgeconfigure{diagnose line=true} add line numbers to diagnose file.
20 | 21 | ************************ 22 | * Undefined knowledges * 23 | ************************ 24 | 25 | \knowledge{ignore} 26 | % introduction.tex:5 27 | | \FO -separability 28 | | \FO -formula 29 | | countable ordinal words 30 | % introduction.tex:6 31 | | separation problem 32 | % introduction.tex:7 33 | | regular languages of countable ordinal words 34 | % introduction.tex:9 35 | | yes 36 | | \FO -separator 37 | % introduction.tex:10 38 | | separates 39 | % introduction.tex:11 40 | | ie 41 | % introduction.tex:12 42 | | no 43 | | witness function 44 | % introduction.tex:13 45 | | \FO -sentence 46 | % introduction.tex:23 47 | | Countable ordinal words 48 | % introduction.tex:24 49 | | regular languages@COW 50 | % introduction.tex:31 51 | | countable ordinals 52 | | ordinal monoids 53 | % introduction.tex:35 54 | | \FO -pointlike sets 55 | | ordinal monoid 56 | % introduction.tex:36 57 | | \FO -definable@lang 58 | | saturation 59 | | \FO -approximant 60 | % introduction.tex:41 61 | | aperiodic 62 | | syntactic monoid 63 | | $\Jeq $-trivial 64 | % introduction.tex:42 65 | | aperiodic pointlike sets 66 | | covering problem 67 | % introduction.tex:46 68 | | covering problems 69 | % introduction.tex:49 70 | | scattered@linord 71 | % introduction.tex:53 72 | | \FO -separation 73 | % introduction.tex:57 74 | | first-order logic 75 | | first-order definable maps 76 | % introduction.tex:58 77 | | algorithm 78 | % introduction.tex:60 79 | | pointlikes 80 | % preliminaries.tex:10 81 | | linear ordering 82 | % preliminaries.tex:11 83 | | countable@linord 84 | | finite@linord 85 | % preliminaries.tex:12 86 | | linear orderings 87 | % preliminaries.tex:13 88 | | morphism@linord 89 | % preliminaries.tex:15 90 | | isomorphism@linord 91 | % preliminaries.tex:16 92 | | morphism@linord 93 | % preliminaries.tex:18 94 | | sum@linord 95 | | product@linord 96 | % preliminaries.tex:42 97 | | well-founded 98 | % preliminaries.tex:44 99 | | ordinal 100 | % preliminaries.tex:45 101 | | isomorphism@linord 102 | % preliminaries.tex:48 103 | | ordinals 104 | % preliminaries.tex:50 105 | | embedding@linord 106 | % preliminaries.tex:53 107 | | successor ordinal 108 | % preliminaries.tex:54 109 | | limit ordinal 110 | % preliminaries.tex:94 111 | | word 112 | % preliminaries.tex:95 113 | | domain 114 | % preliminaries.tex:97 115 | | countable@word 116 | | finite@word 117 | | scattered@word 118 | | $\omega $-word 119 | % preliminaries.tex:98 120 | | countable@linord 121 | | finite@linord 122 | | scattered@linord 123 | % preliminaries.tex:99 124 | | countable ordinal word 125 | | countable@linord 126 | | ordinal@linord 127 | % preliminaries.tex:102 128 | | finite words 129 | % preliminaries.tex:112 130 | | omega iteration 131 | % preliminaries.tex:132 132 | | semigroup 133 | % preliminaries.tex:133 134 | | monoid 135 | % preliminaries.tex:137 136 | | idempotent 137 | % preliminaries.tex:139 138 | | idempotent power 139 | % preliminaries.tex:147 140 | | group-trivial 141 | % preliminaries.tex:150 142 | | countable ordinal 143 | % preliminaries.tex:152 144 | | words 145 | % preliminaries.tex:168 146 | | generalised product 147 | % preliminaries.tex:177 148 | | generalised associativity 149 | % preliminaries.tex:186 150 | | ordinal monoid morphism 151 | % preliminaries.tex:192 152 | | ordered ordinal monoid 153 | % preliminaries.tex:196 154 | | alphabet 155 | | recognised@OM 156 | % preliminaries.tex:200 157 | | recognisable@OM 158 | % preliminaries.tex:201 159 | | recognisable@OM 160 | % preliminaries.tex:202 161 | | 
regular@cow 162 | % preliminaries.tex:236 163 | | presentation@OM 164 | % preliminaries.tex:244 165 | | power ordinal monoid 166 | % preliminaries.tex:299 167 | | Free variables 168 | % preliminaries.tex:300 169 | | free variables 170 | % preliminaries.tex:303 171 | | valuation 172 | % preliminaries.tex:304 173 | | word@ord 174 | % preliminaries.tex:310 175 | | word@ord 176 | % preliminaries.tex:315 177 | | satisfies 178 | | accepts 179 | % preliminaries.tex:319 180 | | \FO -definable@lang 181 | % preliminaries.tex:321 182 | | words@ord 183 | % preliminaries.tex:339 184 | | Bedon's theorem 185 | % preliminaries.tex:341 186 | | \FO -definable@lang 187 | % preliminaries.tex:342 188 | | recognised@OM 189 | % preliminaries.tex:348 190 | | \FO -definable@map 191 | % preliminaries.tex:350 192 | | \FO -definable language 193 | % preliminaries.tex:352 194 | | \FO -definable@map 195 | % preliminaries.tex:375 196 | | \FO -definable@map 197 | % preliminaries.tex:378 198 | | \FO -definable@map 199 | % preliminaries.tex:397 200 | | condensation 201 | % preliminaries.tex:407 202 | | condensation formula 203 | % preliminaries.tex:425 204 | | finite condensation 205 | % preliminaries.tex:432 206 | | word@ord 207 | % preliminaries.tex:437 208 | | \FO -definable functions 209 | % preliminaries.tex:438 210 | | \FO -definable function 211 | % preliminaries.tex:444 212 | | condensation \FO -formula 213 | % preliminaries.tex:452 214 | | \FO -definable@map 215 | % algorithm.tex:84 216 | | recognised@OM 217 | % algorithm.tex:122 218 | | $\omega $-iteration 219 | % algorithm.tex:160 220 | | words@ord 221 | | words@ord 222 | | words@ord 223 | | words@ord 224 | | words@ord 225 | | words@ord 226 | | words@ord 227 | | words@ord 228 | | words@ord 229 | | words@ord 230 | | words@ord 231 | | words@ord 232 | | words@ord 233 | | words@ord 234 | | words@ord 235 | | words@ord 236 | | words@ord 237 | | words@ord 238 | | words@ord 239 | | words@ord 240 | | words@ord 241 | | words@ord 242 | | words@ord 243 | | words@ord 244 | % algorithm.tex:182 245 | | \FO -definable@lang 246 | | recognising@OM 247 | % algorithm.tex:186 248 | | presentation@OM 249 | % algorithm.tex:193 250 | | \FO -separable 251 | % algorithm.tex:206 252 | | \FO -separator sentence 253 | % algorithm.tex:207 254 | | pointlike sets 255 | % answer-no.tex:10 256 | | quantifier depth 257 | % answer-no.tex:12 258 | | \FOk -equivalent 259 | % answer-no.tex:61 260 | | \FO -separated 261 | % answer-no.tex:164 262 | | \FO -inseparability 263 | % answer-yes.tex:8 264 | | ordinal monoids with merge 265 | % answer-yes.tex:9 266 | | \FO -approximants 267 | | \FO -definable@map 268 | % answer-yes.tex:10 269 | | ordinal monoid with merge 270 | % answer-yes.tex:11 271 | | $\omega $-words 272 | % answer-yes.tex:13 273 | | Merge operators 274 | | {\FO }-approximants 275 | % answer-yes.tex:19 276 | | presentation@OM 277 | % answer-yes.tex:20 278 | | merge operator 279 | % answer-yes.tex:33 280 | | \FO -definable@map 281 | % answer-yes.tex:35 282 | | \FO -definable map 283 | % answer-yes.tex:67 284 | | \FO -definable@map 285 | | \FO -definable@lang 286 | % answer-yes.tex:89 287 | | regular language@cow 288 | | recognised@OM 289 | % answer-yes.tex:91 290 | | aperiodicity 291 | % answer-yes.tex:106 292 | | words@ord 293 | % answer-yes.tex:219 294 | | \FO -definable@lang 295 | % answer-yes.tex:261 296 | | Green's relations 297 | % answer-yes.tex:347 298 | | \FO -definable@lang 299 | % answer-yes.tex:348 300 | | \FO -definable@map 301 | % related.tex:10 302 | | \FO -covering problem 
303 | % related.tex:11 304 | | regular languages@cow 305 | | \FO -definable languages 306 | % related.tex:12 307 | | separation problems 308 | % related.tex:13 309 | | separable 310 | % related.tex:20 311 | | \FOk -closure 312 | % related.tex:38 313 | | semigroups 314 | % related.tex:53 315 | | \FO -covering 316 | % related.tex:55 317 | | aperiodic pointlikes 318 | % related.tex:60 319 | | finite words 320 | % conclusion.tex:5 321 | | words of countable ordinal length 322 | | words of length~$\omega $ 323 | % conclusion.tex:8 324 | | $\omega $-words 325 | % conclusion.tex:12 326 | | $\omega $-iterations 327 | % conclusion.tex:15 328 | | finite word 329 | % conclusion.tex:18 330 | | scattered@word 331 | | countable words 332 | | scattered@word 333 | | inseparability 334 | | semigroup 335 | | words 336 | | semigroups 337 | | countable ordinal word 338 | 339 | **************************** 340 | * autoref-introduced-twice * 341 | **************************** 342 | 343 | % answer-yes.tex:66 344 | answer-yes.tex:66: {\singordmap }{default}{base} 345 | answer-yes.tex:66: {\singordmap }{default}{base} 346 | 347 | 348 | ****************************** 349 | * Autoref used without intro * 350 | ****************************** 351 | 352 | % macros.tex:303 353 | \nointro{default}{base}{\Jeq } 354 | 355 | 356 | *********************************** 357 | * Autoref introduced but not used * 358 | *********************************** 359 | 360 | % macros.tex:157 361 | macros.tex:157: {\lessord }{default}{base} 362 | 363 | 364 | -------------------------------------------------------------------------------- /examples/ordinal/ordinal.kl: -------------------------------------------------------------------------------- 1 | %%%%% NEW KNOWLEDGES 2 | % 3 | %\knowledge{notion} 4 | % | \FO -separability 5 | % | \FO -separator 6 | % | separates 7 | % | \FO -separation 8 | % | \FO -separable 9 | % | \FO -separated 10 | % | separable 11 | % | inseparability 12 | % | \FO -inseparability 13 | % 14 | %\knowledge{notion} 15 | % | \FO -formula 16 | % 17 | %\knowledge{notion} 18 | % | countable ordinal words 19 | % | Countable ordinal words 20 | % | countable ordinal word 21 | % 22 | %\knowledge{notion} 23 | % | separation problem 24 | % | separation problems 25 | % 26 | %\knowledge{notion} 27 | % | regular languages of countable ordinal words 28 | % 29 | %\knowledge{notion} 30 | % | yes 31 | % 32 | %\knowledge{notion} 33 | % | ie 34 | % 35 | %\knowledge{notion} 36 | % | no 37 | % 38 | %\knowledge{notion} 39 | % | witness function 40 | % 41 | %\knowledge{notion} 42 | % | \FO -sentence 43 | % 44 | %\knowledge{notion} 45 | % | regular languages@COW 46 | % 47 | %\knowledge{notion} 48 | % | countable ordinals 49 | % | countable ordinal 50 | % 51 | %\knowledge{notion} 52 | % | ordinal monoids 53 | % | ordinal monoid 54 | % 55 | %\knowledge{notion} 56 | % | \FO -pointlike sets 57 | % | pointlike sets 58 | % 59 | %\knowledge{notion} 60 | % | \FO -definable@lang 61 | % 62 | %\knowledge{notion} 63 | % | saturation 64 | % 65 | %\knowledge{notion} 66 | % | \FO -approximant 67 | % | \FO -approximants 68 | % | {\FO }-approximants 69 | % 70 | %\knowledge{notion} 71 | % | aperiodic 72 | % | aperiodicity 73 | % 74 | %\knowledge{notion} 75 | % | syntactic monoid 76 | % 77 | %\knowledge{notion} 78 | % | $\Jeq $-trivial 79 | % 80 | %\knowledge{notion} 81 | % | aperiodic pointlike sets 82 | % 83 | %\knowledge{notion} 84 | % | covering problem 85 | % | covering problems 86 | % | \FO -covering problem 87 | % 88 | %\knowledge{notion} 89 | % | 
scattered@linord 90 | % 91 | %\knowledge{notion} 92 | % | first-order logic 93 | % 94 | %\knowledge{notion} 95 | % | first-order definable maps 96 | % 97 | %\knowledge{notion} 98 | % | algorithm 99 | % 100 | %\knowledge{notion} 101 | % | pointlikes 102 | % 103 | %\knowledge{notion} 104 | % | linear ordering 105 | % | linear orderings 106 | % 107 | %\knowledge{notion} 108 | % | countable@linord 109 | % 110 | %\knowledge{notion} 111 | % | finite@linord 112 | % 113 | %\knowledge{notion} 114 | % | morphism@linord 115 | % 116 | %\knowledge{notion} 117 | % | isomorphism@linord 118 | % 119 | %\knowledge{notion} 120 | % | sum@linord 121 | % 122 | %\knowledge{notion} 123 | % | product@linord 124 | % 125 | %\knowledge{notion} 126 | % | well-founded 127 | % 128 | %\knowledge{notion} 129 | % | ordinal 130 | % | ordinals 131 | % 132 | %\knowledge{notion} 133 | % | embedding@linord 134 | % 135 | %\knowledge{notion} 136 | % | successor ordinal 137 | % 138 | %\knowledge{notion} 139 | % | limit ordinal 140 | % 141 | %\knowledge{notion} 142 | % | word 143 | % | $\omega $-word 144 | % | words 145 | % | $\omega $-words 146 | % 147 | %\knowledge{notion} 148 | % | domain 149 | % 150 | %\knowledge{notion} 151 | % | countable@word 152 | % | countable words 153 | % 154 | %\knowledge{notion} 155 | % | finite@word 156 | % | finite words 157 | % | finite words 158 | % | finite word 159 | % 160 | %\knowledge{notion} 161 | % | scattered@word 162 | % 163 | %\knowledge{notion} 164 | % | ordinal@linord 165 | % 166 | %\knowledge{notion} 167 | % | omega iteration 168 | % 169 | %\knowledge{notion} 170 | % | semigroup 171 | % | semigroups 172 | % 173 | %\knowledge{notion} 174 | % | monoid 175 | % 176 | %\knowledge{notion} 177 | % | idempotent 178 | % 179 | %\knowledge{notion} 180 | % | idempotent power 181 | % 182 | %\knowledge{notion} 183 | % | group-trivial 184 | % 185 | %\knowledge{notion} 186 | % | generalised product 187 | % 188 | %\knowledge{notion} 189 | % | generalised associativity 190 | % 191 | %\knowledge{notion} 192 | % | ordinal monoid morphism 193 | % 194 | %\knowledge{notion} 195 | % | ordered ordinal monoid 196 | % 197 | %\knowledge{notion} 198 | % | alphabet 199 | % 200 | %\knowledge{notion} 201 | % | recognised@OM 202 | % | recognisable@OM 203 | % | recognising@OM 204 | % 205 | %\knowledge{notion} 206 | % | regular@cow 207 | % 208 | %\knowledge{notion} 209 | % | presentation@OM 210 | % 211 | %\knowledge{notion} 212 | % | power ordinal monoid 213 | % 214 | %\knowledge{notion} 215 | % | Free variables 216 | % | free variables 217 | % 218 | %\knowledge{notion} 219 | % | valuation 220 | % 221 | %\knowledge{notion} 222 | % | word@ord 223 | % | words@ord 224 | % 225 | %\knowledge{notion} 226 | % | satisfies 227 | % 228 | %\knowledge{notion} 229 | % | accepts 230 | % 231 | %\knowledge{notion} 232 | % | Bedon's theorem 233 | % 234 | %\knowledge{notion} 235 | % | \FO -definable@map 236 | % | \FO -definable map 237 | % 238 | %\knowledge{notion} 239 | % | \FO -definable language 240 | % | \FO -definable languages 241 | % 242 | %\knowledge{notion} 243 | % | condensation 244 | % 245 | %\knowledge{notion} 246 | % | condensation formula 247 | % | condensation \FO -formula 248 | % 249 | %\knowledge{notion} 250 | % | finite condensation 251 | % 252 | %\knowledge{notion} 253 | % | \FO -definable functions 254 | % | \FO -definable function 255 | % 256 | %\knowledge{notion} 257 | % | $\omega $-iteration 258 | % | $\omega $-iterations 259 | % 260 | %\knowledge{notion} 261 | % | \FO -separator sentence 262 | % 263 | 
%\knowledge{notion} 264 | % | quantifier depth 265 | % 266 | %\knowledge{notion} 267 | % | \FOk -equivalent 268 | % 269 | %\knowledge{notion} 270 | % | ordinal monoids with merge 271 | % | ordinal monoid with merge 272 | % 273 | %\knowledge{notion} 274 | % | Merge operators 275 | % | merge operator 276 | % 277 | %\knowledge{notion} 278 | % | regular language@cow 279 | % | regular languages@cow 280 | % 281 | %\knowledge{notion} 282 | % | Green's relations 283 | % 284 | %\knowledge{notion} 285 | % | \FOk -closure 286 | % 287 | %\knowledge{notion} 288 | % | \FO -covering 289 | % 290 | %\knowledge{notion} 291 | % | aperiodic pointlikes 292 | % 293 | %\knowledge{notion} 294 | % | words of countable ordinal length 295 | % 296 | %\knowledge{notion} 297 | % | words of length~$\omega $ 298 | -------------------------------------------------------------------------------- /examples/ordinal/ordinal.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | \usepackage[utf8]{inputenc} 4 | \usepackage[T1]{fontenc} 5 | \pdfoutput = 1 6 | 7 | \usepackage[breaklinks,hidelinks]{hyperref} 8 | \usepackage{xcolor} 9 | 10 | \usepackage{knowledge} 11 | \knowledgeconfigure{notion} 12 | \knowledgeconfigure{quotation} 13 | \input{ordinal-kl.tex} 14 | 15 | \title{Blabla} 16 | \date{\today} 17 | \author{Charles-Édouard} 18 | 19 | 20 | \begin{document} 21 | 22 | \maketitle 23 | 24 | \AP ""word"" 25 | "words" 26 | ""word@@ord"" 27 | "countable ordinal word" 28 | ""regular language over countable ordinals"" 29 | \kl[ord]{regular languages} 30 | \intro{separation} 31 | "inseparability" 32 | ""semigroup"" 33 | \kl{semigroups} 34 | 35 | 36 | \end{document} -------------------------------------------------------------------------------- /examples/preservation/preservation.diagnose: -------------------------------------------------------------------------------- 1 | ************************ 2 | * Undefined knowledges * 3 | ************************ 4 | 5 | \knowledge{ignore} 6 | | preserved under extension 7 | | preservation under extension 8 | | preservation under substructures 9 | | substructures 10 | | homomorphisms 11 | -------------------------------------------------------------------------------- /examples/preservation/preservation.kl: -------------------------------------------------------------------------------- 1 | \knowledge{notion} 2 | | extensions 3 | 4 | \knowledge{notion} 5 | | preservation under extensions 6 | % | preservation under extension 7 | 8 | \knowledge{notion} 9 | | preserved under extensions 10 | % | preserved under extension 11 | %%%%% NEW KNOWLEDGES 12 | % 13 | %\knowledge{notion} 14 | % | preservation under substructures 15 | % 16 | %\knowledge{notion} 17 | % | substructures 18 | % 19 | %\knowledge{notion} 20 | % | homomorphisms 21 | -------------------------------------------------------------------------------- /img/preservation-after.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remimorvan/knowledge-clustering/4ab80f28b0a1796d682eaf365828580f05366ba6/img/preservation-after.png -------------------------------------------------------------------------------- /img/preservation-before.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remimorvan/knowledge-clustering/4ab80f28b0a1796d682eaf365828580f05366ba6/img/preservation-before.png 
--------------------------------------------------------------------------------
/knowledge_clustering/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/remimorvan/knowledge-clustering/4ab80f28b0a1796d682eaf365828580f05366ba6/knowledge_clustering/__init__.py
--------------------------------------------------------------------------------
/knowledge_clustering/_version.py:
--------------------------------------------------------------------------------
1 | """Version of knowledge-clustering."""
2 | VERSION = "0.7.4"
3 |
--------------------------------------------------------------------------------
/knowledge_clustering/add_anchor.py:
--------------------------------------------------------------------------------
1 | """
2 | Adding anchor points to a document.
3 | """
4 |
5 | from __future__ import annotations  # Support of `|` for type union in Python 3.9
6 |
7 | import re  # Regular expressions
8 | from typing import TextIO
9 | import sys
10 |
11 | from knowledge_clustering.tex_document import TexDocument
12 | from knowledge_clustering import misc, cst
13 |
14 |
15 | def app(tex_filename: str, space: int, out: TextIO = sys.stdout) -> None:
16 |     """
17 |     Prints a warning when a knowledge is introduced but is not preceded by an anchor point.
18 |     Args:
19 |         tex_filename: the name of the tex file.
20 |         space: an integer specifying the maximal number of characters allowed between the
21 |             introduction of a knowledge and an anchor point.
22 |         out: an output stream.
23 |     """
24 |     with open(tex_filename, "r", encoding="utf-8") as f:
25 |         tex_doc = TexDocument(f.read())
26 |     return missing_anchor(tex_doc, space, out)
27 |
28 |
29 | def missing_anchor(tex_doc: TexDocument, space: int, out: TextIO) -> None:
30 |     """
31 |     Prints line numbers containing the introduction of a knowledge which
32 |     is further away from an anchor point than the integer given as input.
33 |
34 |     Args:
35 |         tex_doc: a TeX document.
36 |         space: the maximal distance between the introduction of a
37 |             knowledge and the anchor point preceding it.
38 |         out: an output stream.
39 |     """
40 |     # First, compute the list of pairs (i1,i2,i3,i4) corresponding to
41 |     # the indices in s = tex_doc.tex_cleaned of some pair in cst.INTRO_DELIMITERS, i.e.
42 |     # (s[i1:i2], s[i3:i4]) is in cst.INTRO_DELIMITERS
43 |     matches: list[tuple[int, int, int, int]] = []
44 |     is_end_of_match = [False for _ in range(len(tex_doc.tex_cleaned))]
45 |     for beg_str, end_str in cst.INTRO_DELIMITERS:
46 |         for i_match in re.finditer(re.escape(beg_str), tex_doc.tex_cleaned):
47 |             i1: int = i_match.start()
48 |             i2: int = i_match.end()
49 |             if not is_end_of_match[i1]:
50 |                 i3: int = i2 + tex_doc.tex_cleaned[i2:].find(end_str)
51 |                 i4: int = i3 + len(end_str)
52 |                 if i3 >= i2:  # i3 < i2 exactly when find() returned -1 (no closing delimiter)
53 |                     matches.append((i1, i2, i3, i4))
54 |                     is_end_of_match[i3] = True
55 |     matches.sort(key=lambda x: x[0])
56 |     for i1, i2, i3, _ in matches:
57 |         beg: int = max(0, i1 - space)
58 |         if not any(ap_str in tex_doc.tex_cleaned[beg:i1] for ap_str in cst.AP_STRING):
59 |             start_pt: int | None = tex_doc.pointer[i1]
60 |             if start_pt is not None:
61 |                 message: str = f"Missing anchor point at line {tex_doc.find_line[start_pt]} (knowledge: {misc.emph(tex_doc.tex_cleaned[i2:i3])})."
62 |                 print(message, file=out)
63 |             else:
64 |                 raise IndexError("Undefined pointer", tex_doc.pointer, i1)
65 |
--------------------------------------------------------------------------------
/knowledge_clustering/add_quotes.py:
--------------------------------------------------------------------------------
1 | """
2 | Add missing quotes around knowledges occurring in a TeX document.
3 | """
4 |
5 | from __future__ import annotations  # Support of `|` for type union in Python 3.9
6 |
7 | import re  # Regular expressions
8 | from typing import NamedTuple, TextIO
9 | import sys
10 |
11 | from knowledge_clustering.knowledges import KnowledgesList, remove_redundant_files
12 | from knowledge_clustering.tex_document import TexDocument
13 | from knowledge_clustering import file_updater, misc, cst
14 |
15 |
16 | class NewKL(NamedTuple):
17 |     """
18 |     Object storing a new knowledge, together with its starting and ending points in some TeX
19 |     document, and a smaller knowledge that is already known and is a substring of
20 |     the new knowledge.
21 |     """
22 |
23 |     kl_origin: str
24 |     start_origin: int
25 |     end_origin: int
26 |     kl: str
27 |     start: int
28 |     end: int
29 |
30 |
31 | class AddQuote(NamedTuple):
32 |     """
33 |     Stores the starting and ending indexes of the occurrence of some knowledge in a TeX document.
34 |     """
35 |
36 |     kl: str
37 |     start: int
38 |     end: int
39 |
40 |
41 | def ask_consent(message: str, inp: TextIO, out: TextIO):
42 |     """
43 |     Asks whether the user wants to do an action, after printing the string `message`.
44 |     Returns a boolean.
45 |     """
46 |     print(message, file=out)
47 |     ans = inp.readline().rstrip("\n")
48 |     return ans.lower() in ["y", "yes"]
49 |
50 |
51 | def app(
52 |     tex_filename: str,
53 |     kl_filenames: list[str],
54 |     print_line: int,
55 |     inp: TextIO = sys.stdin,
56 |     out: TextIO = sys.stdout,
57 | ) -> None:
58 |     """
59 |     Finds knowledges defined in the knowledge files that appear in the tex file without quote
60 |     symbols. Proposes to add quotes around them.
61 |     Args:
62 |         tex_filename: the name of the tex file.
63 |         kl_filenames: the names of the knowledge files.
64 |         print_line: an integer specifying how many lines of the tex file should be printed.
65 |         inp: input stream.
66 |         out: output stream.
67 |     """
68 |     tex_hash = file_updater.hash_file(tex_filename)
69 |     with open(tex_filename, "r", encoding="utf-8") as f:
70 |         tex_doc = TexDocument(f.read())
71 |         f.close()
72 |     kls = KnowledgesList(remove_redundant_files(kl_filenames))
73 |     tex_document_new, new_knowledges = quote_maximal_substrings(
74 |         tex_doc, kls, print_line, inp, out
75 |     )
76 |     with file_updater.AtomicUpdate(tex_filename, original_hash=tex_hash) as f:
77 |         f.write(tex_document_new)
78 |         f.close()
79 |     for known_kl, new_kl in new_knowledges:
80 |         kls.define_synonym_of(new_kl, known_kl)
81 |     kls.write_knowledges_in_file(nocomment=True)
82 |
83 |
84 | def add_quote(
85 |     tex_doc: TexDocument,
86 |     operations: list[NewKL | AddQuote],
87 |     print_line: int,
88 |     inp: TextIO,
89 |     out: TextIO,
90 | ) -> tuple[str, list[tuple[str, str]]]:
91 |     """
92 |     In the TeX document, for every operation of type AddQuote, proposes to add quotes before
93 |     and after the match with the knowledge.
94 |     For every operation of type NewKL, proposes to define a new knowledge, and to add
95 |     quotes before and after the match.
96 |
97 |     Args:
98 |         tex_doc: a TeX document.
99 |         operations: a list of operations, whose type is either NewKL or AddQuote.
100 |         print_line: an integer specifying how many lines of the tex file should be printed.
101 |         inp: an input stream.
102 |         out: an output stream.
103 |     Asks the user, for each operation, whether to perform it; before each
104 |     question, prints the print_line lines of the document preceding the match.
105 |     Returns the new TeX code together with the list of pairs
106 |     (original knowledge, synonym) that the user chose to define.
107 |     """
108 |     result: str = ""
109 |     new_knowledges: list[tuple[str, str]] = []
110 |     ignore_synonym = []
111 |     ignore_subknowledge = []
112 |     operations.sort(key=lambda x: x.start)
113 |     operations_addquote: list[AddQuote] = []
114 |     for op in operations:
115 |         if isinstance(op, NewKL):
116 |             if op.kl not in ignore_synonym:
117 |                 if op.kl not in [k for (_, k) in new_knowledges]:
118 |                     # Propose to the user to define a synonym
119 |                     tex_doc.print(op.start, op.end, print_line, out)
120 |                     message = (
121 |                         f"Do you want to add `{misc.emph_alt(op.kl)}` as a synonym "
122 |                         f"of `{misc.emph_alt(op.kl_origin)}` and add quotes? [y/n] "
123 |                     )
124 |                     if ask_consent(message, inp, out):
125 |                         # Adds op.kl as a new knowledge, defined as a synonym of op.kl_origin
126 |                         new_knowledges.append((op.kl_origin, op.kl))
127 |                         operations_addquote.append(AddQuote(op.kl, op.start, op.end))
128 |                         # Removes any operations occurring on a substring of our new knowledge
129 |                         for op2 in operations:
130 |                             if isinstance(op2, AddQuote):
131 |                                 if op.start <= op2.start and op2.end <= op.end:
132 |                                     operations.remove(op2)
133 |                     else:
134 |                         # From this point, do not propose again to define op.kl as a new knowledge.
135 |                         ignore_synonym.append(op.kl)
136 |                         if (
137 |                             op.kl_origin
138 |                             == tex_doc.tex_code[op.start_origin : op.end_origin + 1]
139 |                         ):
140 |                             # Propose to the user to add quotes around the original knowledge
141 |                             # instead, if we have a precise match.
142 |                             if ask_consent(
143 |                                 f"Add quotes around `{misc.emph(op.kl_origin)}` instead? [y/n] ",
144 |                                 inp,
145 |                                 out,
146 |                             ):
147 |                                 operations_addquote.append(
148 |                                     AddQuote(
149 |                                         op.kl_origin, op.start_origin, op.end_origin
150 |                                     )
151 |                                 )
152 |                             else:
153 |                                 ignore_subknowledge.append(op.kl)
154 |                         print("", file=out)
155 |                 else:
156 |                     # If op.kl was already accepted as a synonym earlier, treat it
157 |                     # as a regular knowledge
158 |                     op = AddQuote(op.kl, op.start, op.end)
159 |             elif op.kl not in ignore_subknowledge:
160 |                 # If the user doesn't want op.kl as a synonym but might want
161 |                 # to add quotes around op.kl_origin
162 |                 op = AddQuote(op.kl_origin, op.start_origin, op.end_origin)
163 |         if isinstance(op, AddQuote):  # `if`, not `elif`: op may have been rebound above
164 |             tex_doc.print(op.start, op.end, print_line, out)
165 |             if ask_consent("Add quotes? [y/n] ", inp, out):
166 |                 operations_addquote.append(op)
167 |             print("", file=out)
168 |     add_quote_before = [tex_doc.pointer[op.start] for op in operations_addquote]
169 |     add_quote_after = [tex_doc.pointer[op.end] for op in operations_addquote]
170 |     # Simply add quotes before and after every position corresponding to the beginning / end of
171 |     # a match with a knowledge.
172 |     for i, char in enumerate(tex_doc.tex_code):
173 |         if i in add_quote_before:
174 |             result += '"'
175 |         result += char
176 |         if i in add_quote_after:
177 |             result += '"'
178 |     print(
179 |         f"Added {len(operations_addquote)} pair"
180 |         + ("s" if len(operations_addquote) > 1 else "")
181 |         + f" of quotes. Defined {len(new_knowledges)} synonym"
182 |         + ("s." if len(new_knowledges) > 1 else "."),
183 |         file=out,
184 |     )
185 |     return result, new_knowledges
186 |
187 |
188 | def quote_maximal_substrings(
189 |     tex_doc: TexDocument,
190 |     kls: KnowledgesList,
191 |     print_line: int,
192 |     inp: TextIO,
193 |     out: TextIO,
194 | ) -> tuple[str, list[tuple[str, str]]]:
195 |     """
196 |     Finds knowledges defined in the knowledge files that appear in the tex file without quote
197 |     symbols. Proposes to add quotes around them.
198 |
199 |     Args:
200 |         tex_doc: a TeX document.
201 |         kls: list of knowledges.
202 |         print_line: an integer specifying how many lines of the tex file should be printed.
203 |         inp: input stream.
204 |         out: output stream.
205 |     """
206 |
207 |     def stop_expanding(char):
208 |         return not char.isalpha()
209 |
210 |     ignore_position = [False] * tex_doc.length
211 |     add_quote_location: list[NewKL | AddQuote] = []
212 |     for ignore_case in [False, True]:
213 |         # First run the algorithm case-sensitively, then run it case-insensitively.
214 |         for s1 in kls.get_sorted_knowledges():
215 |             match_list = (
216 |                 re.finditer(re.escape(s1), tex_doc.tex_cleaned, re.IGNORECASE)
217 |                 if ignore_case
218 |                 else re.finditer(re.escape(s1), tex_doc.tex_cleaned)
219 |             )
220 |             for match in match_list:
221 |                 start, end = match.start(), match.end() - 1
222 |                 if not ignore_position[start]:
223 |                     # Ignore every infix of s1 that is also a substring of the list
224 |                     for i in range(start, end + 1):
225 |                         ignore_position[i] = True
226 |                     for s2 in kls.dependency[s1]:
227 |                         for submatch in re.finditer(
228 |                             re.escape(s2), tex_doc.tex_cleaned[start : end + 1]
229 |                         ):
230 |                             ignore_position[start + submatch.start()] = True
231 |                     # Check if s1 is preceded by quotes; if not, either check
232 |                     # if we can define a new knowledge, or add the match to the
33 |                     # list of quotes to add.
234 |                     if not any(
235 |                         tex_doc.tex_cleaned.endswith(beg_kl, 0, start)
236 |                         and tex_doc.tex_cleaned.startswith(end_kl, end + 1)
237 |                         for (beg_kl, end_kl) in cst.KL_DELIMITERS
238 |                     ):
239 |                         start2, end2 = start, end
240 |                         while start2 > 0 and not stop_expanding(
241 |                             tex_doc.tex_cleaned[start2 - 1]
242 |                         ):
243 |                             start2 -= 1
244 |                         while end2 + 1 < len(
245 |                             tex_doc.tex_cleaned
246 |                         ) and not stop_expanding(tex_doc.tex_cleaned[end2 + 1]):
247 |                             end2 += 1
248 |                         # text_cleaned[start2 : end2 + 1] is the maximal substring
249 |                         # containing text_cleaned[start : end + 1] = s1 as a factor,
250 |                         # and obtained by only adding letters (no space).
251 |                         new_kl = tex_doc.tex_cleaned[start2 : end2 + 1]
252 |                         if s1 != new_kl:
253 |                             # Propose to add new_kl as a new knowledge
254 |                             add_quote_location.append(
255 |                                 NewKL(s1, start, end, new_kl, start2, end2)
256 |                             )
257 |                         else:
258 |                             add_quote_location.append(AddQuote(s1, start, end))
259 |     return add_quote(tex_doc, add_quote_location, print_line, inp, out)
260 |
--------------------------------------------------------------------------------
/knowledge_clustering/autofinder.py:
--------------------------------------------------------------------------------
1 | """Automatically finds files in the current directory."""
2 |
3 | from __future__ import annotations  # Support of `|` for type union in Python 3.9
4 |
5 | from pathlib import Path
6 |
7 |
8 | class NoFile(Exception):
9 |     """When no file is found."""
10 |
11 |
12 | class TooManyFiles(Exception):
13 |     """When too many files are found compared to what was expected."""
14 |
15 |
16 | def find_ext(dr: Path, ext: str) -> list[Path]:
17 |     """
18 |     Lists all files present in a directory (and its subdirectories, recursively)
19 |     that end with a given extension.
20 |     """
21 |     return list(dr.glob(f"**/*.{ext}"))
22 |
23 |
24 | def get_unique_diagnose_file(dr: Path) -> Path:
25 |     """
26 |     Returns the unique .diagnose file present in a directory (and its subdirectories, recursively),
27 |     fails otherwise.
28 |     """
29 |     dg_files = find_ext(dr, "diagnose")
30 |     if len(dg_files) == 0:
31 |         raise NoFile("No .diagnose file present in the directory.")
32 |     if len(dg_files) > 1:
33 |         raise TooManyFiles(
34 |             f"Multiple .diagnose files present in the directory: \
35 | {dg_files[0]} and {dg_files[1]}."
36 |         )
37 |     return dg_files[0]
38 |
39 |
40 | def get_knowledge_files(dr: Path) -> list[Path]:
41 |     """
42 |     Returns the list of all .kl files present in a directory (and its subdirectories, recursively).
43 |     Fails if there is no .kl file. Fails if there are multiple .kl files but no unique one
44 |     ending with `default.kl`.
45 |     """
46 |     kl_files = find_ext(dr, "kl")
47 |     if len(kl_files) == 0:
48 |         raise NoFile("No .kl file present in the directory.")
49 |     if len(kl_files) == 1:
50 |         return kl_files
51 |     list_default = []
52 |     for i, p in enumerate(kl_files):
53 |         if str(p).endswith("default.kl"):
54 |             list_default.append(i)
55 |     if len(list_default) == 0:
56 |         raise NoFile("No file ending with `default.kl` present in the directory.")
57 |     if len(list_default) > 1:
58 |         raise TooManyFiles(
59 |             f"Multiple files ending with `default.kl` present in the directory: \
60 | {kl_files[list_default[0]]} and {kl_files[list_default[1]]}."
61 |         )
62 |     idx_default = list_default[0]
63 |     idx_last = len(kl_files) - 1
64 |     kl_files[idx_last], kl_files[idx_default] = (
65 |         kl_files[idx_default],
66 |         kl_files[idx_last],
67 |     )
68 |     return kl_files
69 |
--------------------------------------------------------------------------------
/knowledge_clustering/check_update.py:
--------------------------------------------------------------------------------
1 | """
2 | Checks if there is a newer version of knowledge-clustering available on PyPI.
3 | """
4 |
5 | import requests
6 |
7 | from knowledge_clustering import _version
8 | from knowledge_clustering.misc import add_bold, add_red, add_orange, add_green
9 | from knowledge_clustering.cst import TIMEOUT_REQUEST
10 |
11 |
12 | def check_update() -> None:
13 |     """
14 |     Checks if an update is available and, if so, prints a message
15 |     suggesting how to upgrade.
16 |     """
17 |     # From https://stackoverflow.com/a/62571316/19340201
18 |     try:
19 |         package = "knowledge-clustering"
20 |         response = requests.get(
21 |             f"https://pypi.org/pypi/{package}/json", timeout=TIMEOUT_REQUEST
22 |         )
23 |         latest_version: str = response.json()["info"]["version"]
24 |         is_available: bool = latest_version != _version.VERSION
25 |     except requests.exceptions.RequestException:
26 |         is_available = False
27 |         latest_version = ""
28 |     # If available, print message
29 |     msg = ""
30 |     if is_available:
31 |         msg += (
32 |             "\n"
33 |             + add_bold(add_orange("[notice]"))
34 |             + " A new release of knowledge-clustering is available: "
35 |             + add_red(_version.VERSION)
36 |             + " -> "
37 |             + add_green(latest_version)
38 |         )
39 |         msg += (
40 |             "\n"
41 |             + add_bold(add_orange("[notice]"))
42 |             + " To update, run: "
43 |             + add_green("python3 -m pip install --upgrade knowledge-clustering")
44 |         )
45 |     print(msg)
46 |
--------------------------------------------------------------------------------
/knowledge_clustering/clustering.py:
--------------------------------------------------------------------------------
1 | """Clustering algorithm."""
2 |
3 | from __future__ import annotations  # Support of `|` for type union in Python 3.9
4 | from pathlib import Path
5 |
6 | import copy
7 |
8 | from knowledge_clustering import distance, config, scope_meaning, diagnose, cst
9 | from knowledge_clustering.knowledges import KnowledgesList, remove_redundant_files
10 | from knowledge_clustering.misc import emph
11 |
12 |
13 | def app(
14 |     kl_filename: list[str],
15 |     dg_filename: str,
16 |     scope: bool,
17 |     print_kl: bool,
18 |     lang: str,
19 |     config_filename: None | Path,
20 | ):
21 |     """
22 |     Defines, as comments in the knowledge files, all the knowledges occurring
23 |     in the diagnose file.
24 |     Args:
25 |         kl_filename: the list of names of the knowledge files.
26 |         dg_filename: the name of the diagnose file.
27 |         scope: a boolean specifying whether the scope meanings should be printed.
28 |         lang: the language of the document.
29 |         config_filename: a configuration file, specifying prefixes to ignore.
30 |     """
31 |     kls = KnowledgesList(remove_redundant_files(kl_filename))
32 |
33 |     if config_filename is None:
34 |         config_filename = cst.CONFIG_FILE[lang]
35 |
36 |     list_prefixes = config.parse(config_filename)
37 |
38 |     scopes_meaning = scope_meaning.infer_all_scopes(
39 |         kls.get_all_bags(), cst.NLTK_LANG[lang]
40 |     )
41 |     if scope:
42 |         scope_meaning.print_scopes(scopes_meaning, print_meaning=True)
43 |     unknown_knowledges = diagnose.parse(dg_filename)
44 |
45 |     if len(unknown_knowledges) == 0:
46 |         return
47 |
48 |     # update `kls` using the clustering algorithm
49 |     clustering(
50 |         kls,
51 |         unknown_knowledges,
52 |         cst.ALPHA,
53 |         list_prefixes,
54 |         scopes_meaning,
55 |         cst.NLTK_LANG[lang],
56 |     )
57 |     print(
58 |         f"Found a solution by adding {len(kls.get_new_bags())} new bag"
59 |         + ("s" if len(kls.get_new_bags()) >= 2 else "")
60 |         + ".\n"
61 |     )
62 |     changed_filenames = [
63 |         kl.filename for kl in kls.get_all_kls_struct() if kl.was_changed()
64 |     ]
65 |     if len(changed_filenames) == 0:
66 |         msg = "No file was changed."
67 |     elif not print_kl:
68 |         msg = "The following files were changed:"
69 |         for i, fn in enumerate(changed_filenames):
70 |             msg += emph(f" {fn}")
71 |             msg += "," if i < len(changed_filenames) - 1 else "."
72 |     else:
73 |         msg = ""
74 |         for i, fn in enumerate(changed_filenames):
75 |             msg += "Added in file " + emph(f" {fn}") + ":\n"
76 |             for kl in kls.get_new_knowledges_in_file(fn):
77 |                 msg += f"\t{kl}\n"
78 |     print(msg)
79 |     kls.write_knowledges_in_file()
80 |
81 |
82 | def clustering(
83 |     kls: KnowledgesList,
84 |     unknown_kl: list[str],
85 |     alpha: float,
86 |     list_prefixes: list[str],
87 |     scopes_meaning: dict[str, list[list[str]]],
88 |     lang: str,
89 | ):
90 |     """
91 |     Adds all knowledges in unknown_kl to the structure kls.
92 |
93 |     The invariant satisfied by the algorithm is the following:
94 |     any two notions in the same bag are near, where near either means:
95 |     - both in the same bag of knowledges at the beginning of the algorithm;
96 |     - at distance (from module "distance") at most alpha if at least one of
97 |       the two notions initially belongs to unknown_kl.
98 |
99 |     Args:
100 |         kls: known knowledges.
101 |         unknown_kl: a list of unknown knowledges.
102 |         alpha: a threshold indicating the maximal distance allowed for clustering
103 |             two knowledges together.
104 |         list_prefixes: a list of prefixes that are ignored when computing the
105 |             distance between two knowledges.
106 |         scopes_meaning: a dictionary, assigning to every scope a list of
107 |             its possible meanings, each possible meaning being a list of words;
108 |             used to compute the distance.
109 |         lang: a string describing the language of the document;
110 |             a value from the dictionary knowledge_clustering.cst.NLTK_LANG;
111 |             used to compute the distance.
112 |     """
113 |     kl_processed_old = []
114 |     kl_processed_new = kls.get_all_knowledges()
115 |     while unknown_kl:
116 |         # If there is no newly processed knowledge, pick an unknown knowledge
117 |         # and add it to a new bag.
118 |         if not kl_processed_new:
119 |             kl = unknown_kl[0]
120 |             unknown_kl = unknown_kl[1:]
121 |             kls.add_new_bag(kl)
122 |             kl_processed_new = [kl]
123 |         size_kl_processed_new = len(kl_processed_new)
124 |         # Tries to add every unknown knowledge to a bag
125 |         unknown_kl_copy = copy.copy(unknown_kl)
126 |         for kl in unknown_kl_copy:
127 |             dist_min = None
128 |             kl2_min_list = []
129 |             # Finds the processed notion that is at a minimal distance from kl
130 |             for kl2 in kl_processed_new:
131 |                 d = distance.distance(kl, kl2, list_prefixes, scopes_meaning, lang)
132 |                 if dist_min is None or d < dist_min:
133 |                     dist_min = d
134 |                     kl2_min_list = [kl2]
135 |                 elif d == dist_min:
136 |                     kl2_min_list.append(kl2)
137 |             # If this minimal distance is at most the threshold alpha, add kl to the bag
138 |             if dist_min is not None and dist_min <= alpha:
139 |                 # Choose kl2_min in kl2_min_list minimising the edit distance
140 |                 kl2_min = distance.minimise_levenshtein_distance(kl, kl2_min_list)
141 |                 # Add kl to the bag of kl2_min
142 |                 kls.define_synonym_of(kl, kl2_min)
143 |                 unknown_kl.remove(kl)
144 |                 kl_processed_new.append(kl)
145 |         # Every "new processed knowledge" that was known at the beginning of the while iteration
146 |         # becomes an "old processed knowledge"
147 |         kl_processed_old += kl_processed_new[:size_kl_processed_new]
148 |         kl_processed_new = kl_processed_new[size_kl_processed_new:]
149 |
--------------------------------------------------------------------------------
/knowledge_clustering/config.py:
--------------------------------------------------------------------------------
1 | """Parse a configuration file."""
2 |
3 | from __future__ import annotations  # Support of `|` for type union in Python 3.9
4 | from pathlib import Path
5 |
6 | import configparser
configparser 7 | 8 | 9 | class ListConfigParser(configparser.ConfigParser): 10 | """Extended Config Parser to handle lists.""" 11 | 12 | def getlist(self, section, option): 13 | """Return list in some config file.""" 14 | value = self.get(section, option) 15 | return list(x.split("#")[0].strip() for x in value.splitlines()) 16 | 17 | # def getlistint(self, section, option): 18 | # return [int(x) for x in self.getlist(section, option)] 19 | 20 | 21 | def parse(filename: Path) -> list[str]: 22 | """ 23 | Reads a config file and returns the list of words occuring 24 | under the keyphrase `[DEFAULT] PREFIXES_SIMILAR=`. 25 | 26 | Args: 27 | filename: the name of a config file. 28 | 29 | Returns: 30 | a list of prefixes that should be ignored by the clustering algorithm. 31 | """ 32 | p = ListConfigParser() 33 | p.read(filename) 34 | return p.getlist("DEFAULT", "PREFIXES_SIMILAR") 35 | -------------------------------------------------------------------------------- /knowledge_clustering/cst.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constants used throughout knowledge-clustering. 3 | """ 4 | 5 | from __future__ import annotations # Support of `|` for type union in Python 3.9 6 | 7 | from pathlib import Path 8 | from importlib import resources 9 | 10 | ALPHA = 0 11 | 12 | CONFIG_FILENAME: dict[str, str] = {"en": "english.ini", "fr": "french.ini"} 13 | ref = resources.files("knowledge_clustering") / "data" 14 | with resources.as_file(ref) as path: 15 | CONFIG_DIR: Path = path 16 | CONFIG_FILE: dict[str, Path] = dict() 17 | for lan, filename in CONFIG_FILENAME.items(): 18 | ref_file = resources.files("knowledge_clustering") / f"data/{filename}" 19 | with resources.as_file(ref_file) as path_file: 20 | CONFIG_FILE[lan] = path_file 21 | NLTK_LANG: dict[str, str] = {"en": "english", "fr": "french"} 22 | 23 | INTRO_DELIMITERS: list[tuple[str, str]] = [ 24 | ('""', '""'), 25 | ("\\intro{", "}"), 26 | ("\\reintro{", "}"), 27 | ("\\phantomintro{", "}"), 28 | ("\\intro[", "]"), 29 | ("\\reintro[", "]"), 30 | ("\\phantomintro[", "]"), 31 | ] 32 | AP_STRING: list[str] = ["\\AP", "\\itemAP"] 33 | 34 | KL_DELIMITERS: list[tuple[str, str]] = [ 35 | ('"', '"'), 36 | ('"', "@"), 37 | ("@", '"'), 38 | ("@", "@"), 39 | ("\\kl{", "}"), 40 | ("\\intro{", "}"), 41 | ("\\reintro{", "}"), 42 | ("\\phantomintro{", "}"), 43 | ("\\kl[", "]"), 44 | ("\\intro[", "]"), 45 | ("\\reintro[", "]"), 46 | ("\\phantomintro[", "]"), 47 | ] 48 | 49 | SEPARATION_HEADING_KL_BLOCK = "************************" 50 | 51 | IMPORTANT_POS = [ 52 | "CD", 53 | "JJ", 54 | "JJR", 55 | "JJS", 56 | "NN", 57 | "NNP", 58 | "NNS", 59 | "PDT", 60 | "RB", 61 | "RBR", 62 | "RBS", 63 | "VB", 64 | "VBD", 65 | "VBG", 66 | "VBN", 67 | "VBP", 68 | "VBZ", 69 | ] 70 | IGNORE_SUFFIXES = ["", "s"] 71 | INFINITY = 10000 72 | IGNORE_CHAR_BACKSLASH = [ 73 | # LaTeX accents defined using non-alphanumerical commands 74 | "\\`", 75 | "\\'", 76 | "\\^", 77 | '\\"', 78 | "\\~", 79 | "\\=", 80 | "\\.", 81 | "\\-", # Hyphen 82 | ] 83 | IGNORE_CHAR_NO_BACKSLASH = ["{", "}"] 84 | SPACE_CHAR = ["~", "\\\\"] 85 | 86 | DISCARD_LINE = "%%%%% NEW KNOWLEDGES " 87 | 88 | TIMEOUT_REQUEST: float = ( 89 | 0.25 # Timeout to resquest the latest version 90 | # of knowledge-clustering (in seconds) 91 | ) 92 | -------------------------------------------------------------------------------- /knowledge_clustering/data/english.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | 
PREFIXES_SIMILAR= 3 | # Empty string 4 | - # ignore dashes 5 | a # (a)chromatic 6 | il 7 | im 8 | in # (in)separable 9 | ir 10 | non 11 | non- # (non-)atomic 12 | un # (un)ambiguous 13 | -------------------------------------------------------------------------------- /knowledge_clustering/data/french.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | PREFIXES_SIMILAR= 3 | # Empty string 4 | - # ignore dashes 5 | a 6 | il 7 | im 8 | in 9 | ir 10 | non 11 | non- 12 | -------------------------------------------------------------------------------- /knowledge_clustering/diagnose.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for handling the .diagnose files. 3 | """ 4 | 5 | from __future__ import annotations # Support of `|` for type union in Python 3.9 6 | 7 | from typing import Callable, Generator 8 | from knowledge_clustering import cst 9 | 10 | 11 | def automata_line(state: int, line: str) -> tuple[int, str | None]: 12 | """ 13 | Transition function of a transducers parsing knowledges from a diagnose file, 14 | which is read line by line. 15 | 16 | Args: 17 | state: the curring state of the automata, with the following semantic: 18 | 0: waiting for knowledge block; 19 | 1: seen the heading of a knowledge block; 20 | 2: we are in a knowledge block. 21 | line: a line of the .diagnose document. 22 | 23 | Returns: 24 | a pair (state, kl) where state is the new state of the automaton, 25 | and kl is either None, or a knowledge parsed while reading the line given as input. 26 | """ 27 | if state == 0 and "Undefined knowledges" in line: 28 | return 1, None 29 | if state == 1 and cst.SEPARATION_HEADING_KL_BLOCK in line: 30 | return 2, None 31 | if (state in {0, 2}) and cst.SEPARATION_HEADING_KL_BLOCK in line: 32 | return 0, None 33 | if state == 2 and "| " in line: 34 | s = (line.split("| ", 1)[1]).split("\n", 1)[0] 35 | return 2, s 36 | return state, None 37 | 38 | 39 | def unroll( 40 | automata: Callable[[int, str], tuple[int, str | None]], 41 | initial_state: int, 42 | str_input: list[str], 43 | ) -> Generator[str | None, None, None]: 44 | """Builds a generator object from the transition function of a transducer.""" 45 | state: int = initial_state 46 | z: str | None 47 | for y in str_input: 48 | state, z = automata(state, y) 49 | yield z 50 | 51 | 52 | def parse(filename: str) -> list[str]: 53 | """ 54 | Parses a diagnose file and returns the knowledges it contains. 55 | 56 | Args: 57 | filename: the name of the .diagnose file. 58 | 59 | Returns: 60 | a list of knowledges. 
61 | """ 62 | with open(filename, encoding="utf-8") as f: 63 | list_notions = [] 64 | for notion in unroll(automata_line, 0, f.readlines()): 65 | if notion is not None and notion not in list_notions: 66 | list_notions.append(notion) 67 | return list(list_notions) 68 | -------------------------------------------------------------------------------- /knowledge_clustering/distance.py: -------------------------------------------------------------------------------- 1 | """Compute the distance between two knowledges.""" 2 | 3 | from __future__ import annotations # Support of `|` for type union in Python 3.9 4 | 5 | import copy 6 | import nltk # type: ignore 7 | import nltk.stem.snowball as nss # type: ignore 8 | from unidecode import unidecode 9 | 10 | from knowledge_clustering import cst 11 | from knowledge_clustering.misc import emph 12 | 13 | # --- 14 | # Edit distance 15 | # --- 16 | 17 | 18 | def levenshtein_distance(s: str, t: str) -> int: 19 | """ 20 | Computes the Levenshtein (insertions, deletions or substitutions are allowed) 21 | edit distance between two strings. 22 | """ 23 | # Implementation of the Wagner–Fischer algorithm 24 | # https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm 25 | m, n = len(s), len(t) 26 | dist = [[0 for _ in range(n + 1)] for _ in range(m + 1)] 27 | for i in range(1, m + 1): 28 | dist[i][0] = i 29 | for j in range(1, n + 1): 30 | dist[0][j] = j 31 | for j in range(1, n + 1): 32 | for i in range(1, m + 1): 33 | substitution_cost = 0 if s[i - 1] == t[j - 1] else 1 34 | dist[i][j] = min( 35 | dist[i - 1][j] + 1, 36 | dist[i][j - 1] + 1, 37 | dist[i - 1][j - 1] + substitution_cost, 38 | ) 39 | return dist[m][n] 40 | 41 | 42 | def minimise_levenshtein_distance(s: str, t_list: list[str]) -> str: 43 | """ 44 | Given a string s, and a non-empty list of strings, returns an element of t_list 45 | minimising the edit distance with s. 46 | """ 47 | t_min = t_list[0] 48 | dist_min = levenshtein_distance(s, t_min) 49 | for t in t_list[1:]: 50 | dist = levenshtein_distance(s, t) 51 | if dist < dist_min: 52 | t_min = t 53 | dist_min = dist 54 | return t_min 55 | 56 | 57 | # --- 58 | # Functions to extract content from strings 59 | # --- 60 | 61 | 62 | def extract_scope(notion: str) -> tuple[str, str]: 63 | """ 64 | Given a notion of the form "knowledge@scope" or "knowledge", 65 | returns a pair consisting of the knowledge and the (possibly empty) scope. 66 | """ 67 | if "@" in notion: 68 | s = notion.split("@", 1) 69 | return s[0], s[1] 70 | return notion, "" 71 | 72 | 73 | def normalise_notion(notion: str) -> str: 74 | """ 75 | Returns the substring of a notion obtained by removing math, commands, accents 76 | and non-brekable spaces. 77 | """ 78 | notion_norm = notion.lower() # to lowercase 79 | while "$" in notion_norm: 80 | sp = notion_norm.split("$", 2) 81 | if len(sp) <= 1: 82 | break 83 | notion_norm = sp[0] + sp[2] 84 | for remove_char in cst.IGNORE_CHAR_BACKSLASH: 85 | while remove_char in notion_norm: 86 | # If the notion contains remove_char, remove it. 87 | sp = notion_norm.split(remove_char, 1) 88 | notion_norm = sp[0] + sp[1] 89 | for space_char in cst.SPACE_CHAR: 90 | while space_char in notion_norm: 91 | # If the notion contains remove_char, replace it with a space. 
92 | sp = notion_norm.split(space_char, 1) 93 | notion_norm = sp[0] + " " + sp[1] 94 | while "\\" in notion_norm: 95 | # If the notion contains a backslash, remove every letter following the backslash 96 | # see https://tex.stackexchange.com/a/34381/206008 for naming conventions of TeX commands 97 | sp = notion_norm.split("\\", 1) 98 | pref, suff = sp[0], sp[1] 99 | i = 0 100 | while i < len(suff) and suff[i].isalpha(): 101 | i += 1 102 | notion_norm = pref + suff[i:] 103 | for remove_char in cst.IGNORE_CHAR_NO_BACKSLASH: 104 | while remove_char in notion_norm: 105 | # If the notion contains remove_char, remove it. 106 | sp = notion_norm.split(remove_char, 1) 107 | notion_norm = sp[0] + sp[1] 108 | return unidecode(notion_norm) # Ascii-fy (in particular, remove accents) the result 109 | 110 | 111 | def breakup_notion(notion: str, lang: str) -> tuple[list[str], str]: 112 | """ 113 | Takes a notion, and a language, and returns 114 | a set of words contained in the notion. 115 | 116 | If the language is `english`, remove unimportant words. 117 | Important words are: cardinals, preposition or conjunction, subordinating, 118 | adjectives, nouns, pre-determiners, adverbs, verbs (list defined in cst.IMPORTANT_POS). 119 | 120 | """ 121 | kl, scope = extract_scope(normalise_notion(notion)) 122 | try: 123 | if lang == "english": 124 | words_with_POStag = nltk.pos_tag( # pylint: disable=invalid-name 125 | nltk.word_tokenize(kl, language="english") 126 | ) 127 | important_words = { 128 | w for (w, pos) in words_with_POStag if pos in cst.IMPORTANT_POS 129 | } 130 | return (list(important_words), scope) 131 | return (list(set(nltk.word_tokenize(kl, language=lang))), scope) 132 | except LookupError as e: 133 | raise LookupError( 134 | f"Missing NLTK data. Run `" 135 | + emph("knowledge init") 136 | + "` before using the cluster command." 137 | ) from e 138 | 139 | 140 | # --- 141 | # Computing the distance between two notions 142 | # --- 143 | 144 | 145 | def similar_words(w1: str, w2: str, list_prefixes: list[str], stemmer) -> bool: 146 | """ 147 | Checks if two words w1 and w2 are similar up to taking their stem (removing a suffix) 148 | and removing a prefix in the list `list_prefixes`. 149 | """ 150 | if w1 == w2: 151 | return True 152 | for s1 in [w1, stemmer.stem(w1)]: 153 | for s2 in [w2, stemmer.stem(w2)]: 154 | for p in list_prefixes: 155 | for s in cst.IGNORE_SUFFIXES: 156 | if p + s1 + s == s2 or s1 == p + s2 + s: 157 | return True 158 | return False 159 | 160 | 161 | def __semi_distance_sets_of_words( 162 | set_words1: list[str], set_words2: list[str], list_prefixes: list[str], stemmer 163 | ) -> tuple[int, int]: 164 | """ 165 | Given two sets of words (considered up to permutation), computes the 166 | numbers of words of w1 that aren't close to a word of w2 and reciprocally. 167 | """ 168 | for w1 in set_words1: 169 | similar_to_w1 = [ 170 | w2 for w2 in set_words2 if similar_words(w1, w2, list_prefixes, stemmer) 171 | ] 172 | # If you find a pair of similar words, remove them. 
173 | if len(similar_to_w1) > 0: 174 | w2 = similar_to_w1[0] 175 | set_words1.remove(w1) 176 | set_words2.remove(w2) 177 | return __semi_distance_sets_of_words( 178 | set_words1, set_words2, list_prefixes, stemmer 179 | ) 180 | return (len(set_words1), len(set_words2)) 181 | 182 | 183 | def inclusion_sets_of_words( 184 | set_words1: list[str], set_words2: list[str], list_prefixes: list[str], stemmer 185 | ) -> bool: 186 | """ 187 | Given two sets of words (considered up to permutation), are 188 | all words of the first set similar to words of the second set? 189 | """ 190 | d1, _ = __semi_distance_sets_of_words( 191 | set_words1, set_words2, list_prefixes, stemmer 192 | ) 193 | return d1 == 0 194 | 195 | 196 | def distance_sets_of_words( 197 | set_words1: list[str], set_words2: list[str], list_prefixes: list[str], stemmer 198 | ) -> int: 199 | """ 200 | Given two sets of words (considered up to permutation), computes the distance between them. 201 | """ 202 | d1, d2 = __semi_distance_sets_of_words( 203 | set_words1, set_words2, list_prefixes, stemmer 204 | ) 205 | return d1 + d2 206 | 207 | 208 | def new_stemmer(lang: str): 209 | """Returns a stemmer.""" 210 | return nss.SnowballStemmer(lang) 211 | 212 | 213 | def distance( 214 | notion1: str, 215 | notion2: str, 216 | list_prefixes: list[str], 217 | scopes_meaning: dict[str, list[list[str]]], 218 | lang: str, 219 | ) -> int: 220 | """ 221 | Measures the distance between two notions, given a list of prefixes to ignore and 222 | a list of possible meaning for each scope. 223 | Args: 224 | notion1: first notion 225 | notion2: second notion 226 | list_prefixes: a list of prefixes that will be ignored 227 | scope_meaning: a dictionnary, assigning to every scope a list of 228 | its possible meanings, each possible meaning being a list of words 229 | lang: the identifier of some language (e.g. "english") 230 | 231 | Returns: 232 | The distance between notion1 and notion2. 233 | """ 234 | kl1_words, sc1 = breakup_notion(notion1, lang) 235 | kl2_words, sc2 = breakup_notion(notion2, lang) 236 | stemmer = new_stemmer(lang) 237 | if sc1 != "" and sc2 != "" and sc1 != sc2: 238 | return cst.INFINITY 239 | if len(kl1_words) == 0 and len(kl2_words) == 0: 240 | # Can happen if the notion is a command 241 | return 0 242 | if len(kl1_words) == 0 or len(kl2_words) == 0: 243 | # Can happen if the notion is a command 244 | return cst.INFINITY 245 | if sc1 == sc2: 246 | return distance_sets_of_words(kl1_words, kl2_words, list_prefixes, stemmer) 247 | if sc1 == "": 248 | kl1_words, sc1, kl2_words, sc2 = kl2_words, sc2, kl1_words, sc1 249 | # sc2 is empty and sc1 isn't 250 | # return the minimal distance obtained by replacing sc1 by one of its possible meanings 251 | dist = cst.INFINITY 252 | if sc1 in scopes_meaning: 253 | sc1_meaning = scopes_meaning[sc1] 254 | else: 255 | sc1_meaning = [[sc1]] 256 | for meaning in sc1_meaning: 257 | kl1_with_meaning = list(copy.copy(kl1_words)) 258 | kl1_with_meaning.extend([w for w in meaning if w not in kl1_with_meaning]) 259 | dist = min( 260 | dist, 261 | distance_sets_of_words(kl1_with_meaning, kl2_words, list_prefixes, stemmer), 262 | ) 263 | return dist 264 | -------------------------------------------------------------------------------- /knowledge_clustering/file_updater.py: -------------------------------------------------------------------------------- 1 | """ 2 | Allow to atomically update a file by writing to a temporary 3 | file and comparing hashes. 
4 | In case of conflicting uses, the user has to manually merge 5 | and a prompt is offered using click. 6 | """ 7 | 8 | from __future__ import annotations # Support of `|` for type union in Python 3.9 9 | 10 | from pathlib import Path 11 | 12 | import hashlib 13 | import tempfile 14 | import click 15 | 16 | 17 | def hash_file(filepath: str): 18 | """ 19 | Compute a hash of the content of the given filepath 20 | """ 21 | with open(filepath, "rb") as f: 22 | file_hash = hashlib.blake2b() 23 | chunk: bytes = f.read(8192) 24 | while chunk: 25 | file_hash.update(chunk) 26 | chunk = f.read(8192) 27 | return file_hash 28 | 29 | 30 | class AtomicUpdate: 31 | """ 32 | A small class using a temporary file to ensure that we have 33 | properly replaced the content. Prompts the user if we detect 34 | a change in the hash of the file given as input. 35 | """ 36 | 37 | def __init__(self, filename: str, original_hash=None): 38 | self.filename: str = filename 39 | self.hash = hash_file(filename) 40 | self.ctx = tempfile.NamedTemporaryFile(mode="w", dir=Path.cwd(), delete=False) 41 | self.tmp = None 42 | if ( 43 | original_hash is not None 44 | and original_hash.hexdigest() != self.hash.hexdigest() 45 | ): 46 | click.confirm( 47 | f"File {self.filename} has been modified during the run of the program, \ 48 | erase anyway?", 49 | default=None, 50 | abort=True, 51 | prompt_suffix=": ", 52 | show_default=True, 53 | err=False, 54 | ) 55 | 56 | def __enter__(self): 57 | self.tmp = self.ctx.__enter__() # type: ignore 58 | return self.tmp 59 | 60 | def __exit__(self, typ, value, traceback): 61 | new_hash = hash_file(self.filename) 62 | if self.tmp is not None: 63 | if new_hash.hexdigest() != self.hash.hexdigest(): 64 | print(f"{new_hash.hexdigest()} ≠ {self.hash.hexdigest()}") 65 | confirm = click.confirm( 66 | f"File {self.filename} has been modified\ 67 | during the run of \ 68 | the program, erase anyway?", 69 | default=None, 70 | abort=False, 71 | prompt_suffix=": ", 72 | show_default=True, 73 | err=False, 74 | ) 75 | if confirm is False: 76 | print(f"Temporary file accessible at {self.tmp.name}") 77 | return self.ctx.__exit__(typ, value, traceback) 78 | _ = Path(self.tmp.name).replace(self.filename) 79 | return self.ctx.__exit__(typ, value, traceback) 80 | -------------------------------------------------------------------------------- /knowledge_clustering/knowledges.py: -------------------------------------------------------------------------------- 1 | """Manipulating known knowledges.""" 2 | 3 | from __future__ import annotations # Support of `|` for type union in Python 3.9 4 | 5 | from typing import NamedTuple 6 | import toposort # Topological sort pylint: disable=import-error 7 | 8 | import knowledge_clustering.file_updater as fu 9 | from knowledge_clustering import cst 10 | from knowledge_clustering.misc import add_orange, add_bold 11 | 12 | 13 | class DocInfoTex(NamedTuple): 14 | """Lines of a TeX document.""" 15 | 16 | lines: list[str] 17 | 18 | 19 | class DocInfoKnowledge(NamedTuple): 20 | """Lines of TeX document corresponding to the definition of a knowledge.""" 21 | 22 | lines: list[str] 23 | command: str 24 | number: int 25 | 26 | 27 | def flat(list_of_list): 28 | """Flattens a list of list into a single list.""" 29 | return [x for y in list_of_list for x in y] 30 | 31 | 32 | class Knowledges: 33 | def __init__(self, filename): 34 | """ 35 | Reads a knowledge file from a file descriptor f. 36 | 37 | Args: 38 | filename: the name of a file containing knowledges. 
39 | 40 | Computes: 41 | self.original_hash: the hash of the document ; 42 | self.document: a list of records, either of the form: 43 | { 44 | "type"="tex", 45 | "lines"= list of strings (the lines) 46 | } 47 | or { 48 | "type"="knowledge", 49 | "lines"= list of strings (the lines) 50 | "command" = string representing the line introducing the knowledge, 51 | "number" = the number of the knowledge 52 | } 53 | self.known_knowledges: a list of list of strings. 54 | Each list of strings contains strings corresponding to the same knowledge. 55 | The position in the string corresponds to the "number" field in the above 56 | document description. 57 | """ 58 | self.bags: list[list[str]] = [] # Lists of lists, containing knowledges. 59 | self.filename: str = filename 60 | self.original_hash = fu.hash_file(filename) 61 | with open(filename, encoding="utf-8") as file: 62 | lines: list[str] = file.readlines() 63 | 64 | document: list[DocInfoTex | DocInfoKnowledge] = [] 65 | knowledges: list[list[str]] = [] 66 | 67 | reading_mode: str = "tex" 68 | current_block: list[str] = [] 69 | current_kl_cmd: str = "" 70 | current_kl_strs: list[str] = [] 71 | 72 | def push_block(): 73 | nonlocal reading_mode 74 | nonlocal document 75 | nonlocal current_block 76 | nonlocal current_kl_cmd 77 | nonlocal current_kl_strs 78 | nonlocal knowledges 79 | nonlocal current_kl_strs 80 | if reading_mode == "tex" and len(current_block) > 0: 81 | document.append(DocInfoTex(lines=current_block)) 82 | current_block = [] 83 | elif reading_mode == "knowledge": 84 | document.append( 85 | DocInfoKnowledge( 86 | lines=current_block, 87 | command=current_kl_cmd, 88 | number=len(knowledges), 89 | ) 90 | ) 91 | current_block = [] 92 | current_kl_cmd = "" 93 | knowledges.append(current_kl_strs) 94 | current_kl_strs = [] 95 | 96 | def line_is_discard(line): 97 | return line.strip() == cst.DISCARD_LINE.strip() 98 | 99 | def line_is_comment(line): 100 | return line.strip().startswith("%") 101 | 102 | def line_is_knowledge(line): 103 | return line.strip().startswith("\\knowledge{") 104 | 105 | def bar_knowledge_from_line(line): 106 | line = line.strip() 107 | if line.startswith("|"): 108 | return line[1:].strip() 109 | return None 110 | 111 | def line_is_comment_bar_knowledge_from_line(line): 112 | line = line.strip() 113 | if line.startswith("%"): 114 | return (line[1:].strip()).startswith("|") 115 | return False 116 | 117 | for line in lines: 118 | if line[-1] == "\n": 119 | line = line[:-1] 120 | if reading_mode == "discard" and not line_is_comment(line): 121 | reading_mode = "tex" 122 | if line_is_discard(line): 123 | push_block() 124 | reading_mode = "discard" 125 | elif line_is_knowledge(line): 126 | push_block() 127 | reading_mode = "knowledge" 128 | current_kl_cmd = line 129 | current_block = [line] 130 | current_kl_strs = [] 131 | elif reading_mode == "knowledge": 132 | kl = bar_knowledge_from_line(line) 133 | if kl is not None: 134 | current_block.append(line) 135 | current_kl_strs.append(kl) 136 | elif line_is_comment_bar_knowledge_from_line(line): 137 | pass 138 | else: 139 | push_block() 140 | reading_mode = "tex" 141 | current_block = [line] 142 | elif reading_mode == "tex": 143 | current_block.append(line) 144 | push_block() 145 | self.document = document 146 | self.bags = knowledges 147 | self.nb_known_bags: int = len(self.bags) 148 | self.length_known_bags: list[int] = [len(bag) for bag in self.bags] 149 | 150 | def get_all_bags(self) -> list[list[str]]: 151 | """Returns all bags as a list of lists of strings.""" 152 | return 
self.bags 153 | 154 | def get_old_bags(self) -> list[list[str]]: 155 | """Returns all bags that were present at the last checkpoint, 156 | as a list of lists of strings.""" 157 | return self.bags[: self.nb_known_bags] 158 | 159 | def get_new_bags(self) -> list[list[str]]: 160 | """Returns all bags that were not added since the last checkpoint, 161 | as a list of lists of strings.""" 162 | return self.bags[self.nb_known_bags :] 163 | 164 | def get_all_knowledges(self) -> list[str]: 165 | """Returns all knowledges, as a list of strings.""" 166 | return flat(self.bags) 167 | 168 | def get_known_knowledges_in_bag(self, b_id: int) -> list[str]: 169 | """Returns the list of knowledges contained in the `b_id`-th bag 170 | during the last checkpoint, as a list of strings.""" 171 | if b_id < self.nb_known_bags: 172 | return self.bags[b_id][: self.length_known_bags[b_id]] 173 | return [] 174 | 175 | def get_new_knowledges_in_bag(self, b_id: int) -> list[str]: 176 | """Returns the list of knowledges contained in the `id`-th bag 177 | that were added since the last checkpoint, as a list of strings.""" 178 | if b_id < self.nb_known_bags: 179 | return self.bags[b_id][self.length_known_bags[b_id] :] 180 | return self.bags[b_id] 181 | 182 | def add_new_bag(self, kl: str) -> None: 183 | """Adds a new bag that contains only the string `kl`.""" 184 | self.bags.append([kl]) 185 | 186 | def define_synonym_of(self, kl1: str, kl2: str) -> None: 187 | """ 188 | Defines a new knowledge (string) `kl1` as a new synonym of the already 189 | existing knowledge (string) `kl2`. 190 | """ 191 | for b_id, bag in enumerate(self.bags): 192 | if kl2 in bag: 193 | self.bags[b_id].append(kl1) 194 | return 195 | raise KeyError(f"Error: {kl2} is not a knowledge.") 196 | 197 | def was_changed(self) -> bool: 198 | """ 199 | Returns whether kl has new bags or new synonyms. 200 | """ 201 | if len(self.get_new_bags()) > 0: 202 | return True 203 | for b_id in range(len(self.get_old_bags())): 204 | if len(self.get_new_knowledges_in_bag(b_id)) > 0: 205 | return True 206 | return False 207 | 208 | def write_knowledges_in_file(self, nocomment: bool = False) -> None: 209 | """ 210 | Writes the new synonyms and new knowledges in the file containing the knowledges. 211 | """ 212 | with fu.AtomicUpdate(self.filename, original_hash=self.original_hash) as file: 213 | for b in self.document: 214 | if isinstance(b, DocInfoTex): 215 | for line in b.lines: 216 | file.write(line + "\n") 217 | elif isinstance(b, DocInfoKnowledge): 218 | for line in b.lines: 219 | file.write(line + "\n") 220 | if b.number < self.nb_known_bags: 221 | for kl in self.get_new_knowledges_in_bag(b.number): 222 | file.write((f" | {kl}\n" if nocomment else f"% | {kl}\n")) 223 | if len(self.get_new_bags()) > 0: 224 | file.write(cst.DISCARD_LINE + "\n") 225 | for bag in self.get_new_bags(): 226 | if len(bag) > 0: 227 | file.write("%\n") 228 | file.write("%\\knowledge{notion}\n") 229 | for kl in bag: 230 | file.write((f" | {kl}\n" if nocomment else f"% | {kl}\n")) 231 | 232 | 233 | class KnowledgesList: 234 | def __init__(self, kls_filenames: list[str]): 235 | """ 236 | Reads a list of knowledge files. 237 | 238 | Args: 239 | kls_list: the list of filenames containing knowledges. 
240 | """ 241 | self.nb_file: int = len(kls_filenames) 242 | self.kls_list: dict[str, Knowledges] = { 243 | fn: Knowledges(fn) for fn in kls_filenames 244 | } 245 | self.default_fn: str = kls_filenames[self.nb_file - 1] 246 | self.compute_dependency_graph() 247 | 248 | def get_all_kls_struct(self) -> list[Knowledges]: 249 | """Returns the list of all knowledge structures""" 250 | return list(self.kls_list.values()) 251 | 252 | def default_kls(self) -> Knowledges: 253 | """Returns the default kls.""" 254 | return self.kls_list[self.default_fn] 255 | 256 | def get_all_bags(self) -> list[list[str]]: 257 | """Returns all bags as a list of lists of strings.""" 258 | return flat([kls.get_all_bags() for kls in self.kls_list.values()]) 259 | 260 | def get_all_knowledges(self) -> list[str]: 261 | """Returns all knowledges, as a list of strings.""" 262 | return flat([kls.get_all_knowledges() for kls in self.kls_list.values()]) 263 | 264 | def get_sorted_knowledges(self) -> list[str]: 265 | """Returns all knowledges, sorted by topological sort.""" 266 | return self.all_knowledges_sorted 267 | 268 | def add_new_bag(self, kl: str) -> None: 269 | """Adds a new bag that contains only the string `kl`.""" 270 | self.default_kls().add_new_bag(kl) 271 | 272 | def define_synonym_of(self, kl1: str, kl2: str) -> None: 273 | """ 274 | Defines a new knowledge (string) `kl1` as a new synonym of the already 275 | existing knowledge (string) `kl2`. 276 | """ 277 | for kls in self.kls_list.values(): 278 | for b_id, bag in enumerate(kls.bags): 279 | if kl2 in bag: 280 | kls.bags[b_id].append(kl1) 281 | return 282 | raise KeyError(f"Error: {kl2} is not a knowledge.") 283 | 284 | def write_knowledges_in_file(self, nocomment: bool = False) -> None: 285 | """ 286 | Writes the new synonyms and new knowledges in the file containing the knowledges. 287 | """ 288 | for kls in self.kls_list.values(): 289 | kls.write_knowledges_in_file(nocomment) 290 | 291 | def get_new_bags(self) -> list[list[str]]: 292 | """Returns all bags that were added since the last checkpoint, 293 | as a list of lists of strings.""" 294 | return self.default_kls().get_new_bags() 295 | 296 | def get_new_knowledges_in_file(self, fn: str) -> list[str]: 297 | """Returns all new knowledges that were added in some file since the last 298 | checkpoint, as a list of strings.""" 299 | if fn not in self.kls_list: 300 | raise KeyError(f"No knowledge file named {fn}.") 301 | return flat( 302 | [ 303 | self.kls_list[fn].get_new_knowledges_in_bag(bag_id) 304 | for bag_id in range(len(self.kls_list[fn].get_all_bags())) 305 | ] 306 | ) 307 | 308 | def compute_dependency_graph(self) -> None: 309 | """ 310 | Computes the dependency graph of all knowledges, for the substring relation. 311 | Then, sort all knowledges using topological sorting. 312 | Result are stored in self.dependency and self.all_knowledges_sorted. 
313 | """ 314 | dependency: dict[str, set[str]] = {} 315 | dependency_reversed: dict[str, set[str]] = {} 316 | for s1 in self.get_all_knowledges(): 317 | dependency[s1] = { 318 | s2 for s2 in self.get_all_knowledges() if s2 in s1 and s1 != s2 319 | } 320 | dependency_reversed[s1] = { 321 | s2 for s2 in self.get_all_knowledges() if s1 in s2 and s1 != s2 322 | } 323 | self.dependency: dict[str, set[str]] = dependency 324 | self.all_knowledges_sorted: list[str] = list( 325 | toposort.toposort_flatten(dependency_reversed) 326 | ) 327 | 328 | 329 | def remove_redundant_files(list_filenames: list[str]) -> list[str]: 330 | """ 331 | Given a list of filenames, return the same list without duplicates, and output a warning 332 | if there is such a duplicate. 333 | """ 334 | output: list[str] = [] 335 | for fn in list_filenames: 336 | if fn in output: 337 | print( 338 | add_bold(add_orange("[Warning]")) 339 | + f" same knowledge file given twice ({fn}), second occurrence is ignored." 340 | ) 341 | else: 342 | output.append(fn) 343 | return output 344 | -------------------------------------------------------------------------------- /knowledge_clustering/misc.py: -------------------------------------------------------------------------------- 1 | """Misc functions, for emphasizing a string.""" 2 | 3 | from __future__ import annotations # Support of `|` for type union in Python 3.9 4 | 5 | 6 | BEGIN_EMPH: str = "\033[1m\033[95m" 7 | BEGIN_EMPH_ALT: str = "\033[1m\033[92m" 8 | BEGIN_BOLD: str = "\033[1m" 9 | BEGIN_RED: str = "\033[31m" 10 | BEGIN_ORANGE: str = "\033[33m" 11 | BEGIN_GREEN: str = "\033[32m" 12 | END_EMPH: str = "\033[0m" 13 | 14 | 15 | def emph(string: str) -> str: 16 | """Emphasizes a string.""" 17 | return BEGIN_EMPH + string + END_EMPH 18 | 19 | 20 | def emph_alt(string: str) -> str: 21 | """Alternative emphasis of a string.""" 22 | return BEGIN_EMPH_ALT + string + END_EMPH 23 | 24 | 25 | def add_red(string: str) -> str: 26 | """Puts a string in red.""" 27 | return BEGIN_RED + string + END_EMPH 28 | 29 | 30 | def add_orange(string: str) -> str: 31 | """Puts a string in orange.""" 32 | return BEGIN_ORANGE + string + END_EMPH 33 | 34 | 35 | def add_green(string: str) -> str: 36 | """Puts a string in green.""" 37 | return BEGIN_GREEN + string + END_EMPH 38 | 39 | 40 | def add_bold(string: str) -> str: 41 | """Puts a string in bold.""" 42 | return BEGIN_BOLD + string + END_EMPH 43 | -------------------------------------------------------------------------------- /knowledge_clustering/scope_meaning.py: -------------------------------------------------------------------------------- 1 | """Infer the scope from known knowledges.""" 2 | 3 | from __future__ import annotations # Support of `|` for type union in Python 3.9 4 | from typing import TypeVar 5 | 6 | import copy 7 | 8 | import knowledge_clustering.distance as dist 9 | 10 | T = TypeVar("T") # Generic type 11 | 12 | 13 | def union_list_of_lists(l1: list[T], l2: list[T]) -> list[T]: 14 | """Returns the union (without repetition) of two lists of lists.""" 15 | s = copy.copy(l1) 16 | for sublist in l2: 17 | if sublist not in s: 18 | s.append(sublist) 19 | return s 20 | 21 | 22 | def infer_scope(list_kl: list[str], scope: str, lang: str, stemmer) -> list[list[str]]: 23 | """ 24 | Takes a list of knowledges that all belong to the same bag and a scope. 25 | 26 | If the list contains a knowledge with this scope, we try to infer the meaning of the scope 27 | by looking at similar knowledges. 
28 | 29 | Example: 30 | Running the algorithm on ["word@some-scope", "countable ordinal word", 31 | "ordinal word", "scattered language"] for the scope `some-scope` will return 32 | the list [["countable", "ordinal"], ["ordinal"]]. 33 | """ 34 | result: list[list[str]] = [] 35 | list_kl_broke: list[tuple[list[str], str]] = [ 36 | dist.breakup_notion(kl, lang) for kl in list_kl 37 | ] 38 | for kl1_words, sc1 in list_kl_broke: 39 | if sc1 == scope: 40 | for kl2_words, sc2 in list_kl_broke: 41 | if sc2 == "": 42 | if dist.inclusion_sets_of_words( 43 | kl1_words, kl2_words, [""], stemmer 44 | ): 45 | # If every word of kl1 appears in kl2 and kl2 has an empty scope, 46 | # return the words in kl2 not appearing in kl1 47 | result.append([w for w in kl2_words if w not in kl1_words]) 48 | return result 49 | 50 | 51 | def infer_all_scopes( 52 | known_knowledges: list[list[str]], lang: str 53 | ) -> dict[str, list[list[str]]]: 54 | """ 55 | Given known knowledges and a langage, returns the infer meaning of scopes occuring 56 | in said these knowledges. 57 | """ 58 | list_scopes: set[str] = { 59 | sc for bag in known_knowledges for (_, sc) in map(dist.extract_scope, bag) 60 | } 61 | if "" in list_scopes: 62 | list_scopes.remove("") 63 | scopes_meaning: dict[str, list[list[str]]] = {sc: [] for sc in list_scopes} 64 | stemmer = dist.new_stemmer(lang) 65 | for scope in list_scopes: 66 | for bag in known_knowledges: 67 | scopes_meaning[scope] = union_list_of_lists( 68 | scopes_meaning[scope], infer_scope(bag, scope, lang, stemmer) 69 | ) 70 | if [scope] not in scopes_meaning[scope]: 71 | scopes_meaning[scope].append([scope]) 72 | return scopes_meaning 73 | 74 | 75 | def print_scopes( 76 | scopes_meaning: dict[str, list[list[str]]], print_meaning: bool = False 77 | ) -> None: 78 | """Prints the infered meaning of scopes.""" 79 | print("Defined scopes:") 80 | if not print_meaning: 81 | print("\t", list(scopes_meaning.keys())) 82 | else: 83 | for sc in scopes_meaning: 84 | print(f"\t@{sc}:{scopes_meaning[sc]}") 85 | -------------------------------------------------------------------------------- /knowledge_clustering/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remimorvan/knowledge-clustering/4ab80f28b0a1796d682eaf365828580f05366ba6/knowledge_clustering/scripts/__init__.py -------------------------------------------------------------------------------- /knowledge_clustering/scripts/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | Launching knowledge commands (init, cluster, addquotes, anchor). 
3 | """ 4 | 5 | from __future__ import annotations # Support of `|` for type union in Python 3.9 6 | from pathlib import Path 7 | 8 | import os 9 | import sys 10 | import click 11 | from click_default_group import DefaultGroup # type: ignore 12 | import nltk # type: ignore 13 | 14 | from knowledge_clustering import ( 15 | add_anchor, 16 | add_quotes, 17 | clustering, 18 | cst, 19 | _version, 20 | autofinder, 21 | ) 22 | from knowledge_clustering.check_update import check_update 23 | from knowledge_clustering.misc import add_red, add_bold 24 | 25 | 26 | # https://stackoverflow.com/a/67324391/19340201 27 | class AliasedGroup(DefaultGroup): 28 | """Group where `AP` is a synonym for `anchor`.""" 29 | 30 | def get_command(self, ctx, cmd_name): 31 | if cmd_name in ["anchor", "AP"]: 32 | return DefaultGroup.get_command(self, ctx, "anchor") 33 | return DefaultGroup.get_command(self, ctx, cmd_name) 34 | 35 | 36 | @click.group(cls=AliasedGroup, default="cluster", default_if_no_args=True) 37 | @click.version_option(_version.VERSION) 38 | def cli(): 39 | """Automated notion clustering for the knowledge LaTeX package""" 40 | 41 | 42 | @cli.command() 43 | def init(): 44 | """Downloads the required NLTK packages.""" 45 | nltk.download("punkt") 46 | nltk.download("punkt_tab") 47 | nltk.download("averaged_perceptron_tagger") 48 | nltk.download("averaged_perceptron_tagger_eng") 49 | 50 | 51 | @cli.command() 52 | @click.option( 53 | "--knowledge", 54 | "-k", 55 | "kl_filename", 56 | multiple=True, 57 | type=click.Path( 58 | exists=True, file_okay=True, dir_okay=False, writable=True, readable=True 59 | ), 60 | help="File containing the knowledges that are already defined. \ 61 | Multiple files are allowed; new knowledges will be written in the last one. \ 62 | If the option is not specified, all .kl file in the current directory (and subdirectory, \ 63 | recursively) will be taken. If there are multiple files, exactly one of them must end \ 64 | with `default.kl`.", 65 | required=False, 66 | ) 67 | @click.option( 68 | "--diagnose", 69 | "-d", 70 | "dg_filename", 71 | type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True), 72 | help="Diagnose file produced by LaTeX. If the option is not specified, the unique \ 73 | .diagnose file in the current directory (and subdirectory, recursively) is taken instead.", 74 | required=False, 75 | ) 76 | @click.option( 77 | "--lang", 78 | "-l", 79 | default="en", 80 | type=click.Choice(["en", "fr"]), 81 | help="Language of your TeX document.", 82 | ) 83 | @click.option( 84 | "--scope/--no-scope", 85 | "-S/ ", 86 | default=False, 87 | help="Print the scopes defined in the knowledge file and print \ 88 | the possible meaning of those scope inferred by knowledge-clustering.", 89 | ) 90 | @click.option( 91 | "--print/--no-print", 92 | "-P/ ", 93 | "print_kl", 94 | default=False, 95 | help="Print all new knowledges.", 96 | ) 97 | @click.option( 98 | "--no-update/--update", 99 | "-N/ ", 100 | "noupdate", 101 | default=False, 102 | help="Don't look on PyPI if a newer version of knowledge-clustering is available.", 103 | ) 104 | @click.option( 105 | "--config-file", 106 | "-c", 107 | "config_filename", 108 | default=None, 109 | help=f"Specify the configuration file. 
By default the configuration file \ 110 | in the folder {cst.CONFIG_DIR} corresponding to your language is used.", 111 | ) 112 | def cluster( 113 | kl_filename: tuple[str], 114 | dg_filename: str, 115 | lang: str, 116 | scope: bool, 117 | print_kl: bool, 118 | noupdate: bool, 119 | config_filename: None | str, 120 | ): 121 | """ 122 | Defines, as a comment and in the knowledge files, all the knowledges occuring in the file. 123 | """ 124 | try: 125 | if not dg_filename: 126 | dg_filename = autofinder.get_unique_diagnose_file(Path(".")) 127 | kl_filename = list(kl_filename) 128 | if not kl_filename: 129 | kl_filename = autofinder.get_knowledge_files(Path(".")) 130 | clustering.app(kl_filename, dg_filename, scope, print_kl, lang, config_filename) 131 | if not noupdate: 132 | check_update() 133 | except (autofinder.NoFile, autofinder.TooManyFiles) as e: 134 | print(add_bold(add_red("\n[Error] ")) + e.args[0]) 135 | 136 | 137 | @cli.command() 138 | @click.option( 139 | "--tex", 140 | "-t", 141 | "tex_filename", 142 | type=click.Path( 143 | exists=True, file_okay=True, dir_okay=False, writable=True, readable=True 144 | ), 145 | help="Your TeX file.", 146 | required=True, 147 | ) 148 | @click.option( 149 | "--knowledge", 150 | "-k", 151 | "kl_filename", 152 | multiple=True, 153 | type=click.Path( 154 | exists=True, file_okay=True, dir_okay=False, writable=True, readable=True 155 | ), 156 | help="File containing the knowledges that are already defined. \ 157 | Multiple files are allowed; new knowledges will be written in the last one. \ 158 | If the option is not specified, all .kl file in the current directory (and subdirectory, \ 159 | recursively) will be taken. If there are multiple files, exactly one of them must end \ 160 | with `default.kl`.", 161 | required=False, 162 | ) 163 | @click.option( 164 | "--print", 165 | "-p", 166 | "print_line", 167 | type=int, 168 | default=1, 169 | help="When finding a match, number of lines (preceding the match) that are printed \ 170 | in the prompt to the user.", 171 | ) 172 | @click.option( 173 | "--no-update/--update", 174 | "-N/ ", 175 | "noupdate", 176 | default=False, 177 | ) 178 | def addquotes(tex_filename: str, kl_filename: str, print_line: int, noupdate: bool): 179 | """ 180 | Finds knowledges defined in the knowledge files that appear in tex file without quote 181 | symbols. Proposes to add quotes around them. 182 | """ 183 | try: 184 | kl_filename = list(kl_filename) 185 | if not kl_filename: 186 | kl_filename = autofinder.get_knowledge_files(Path(".")) 187 | add_quotes.app(tex_filename, kl_filename, print_line) 188 | if not noupdate: 189 | check_update() 190 | except (autofinder.NoFile, autofinder.TooManyFiles) as e: 191 | print(add_bold(add_red("\n[Error] ")) + e.args[0]) 192 | 193 | 194 | @cli.command() 195 | @click.option( 196 | "--tex", 197 | "-t", 198 | "tex_filename", 199 | type=click.Path( 200 | exists=True, file_okay=True, dir_okay=False, writable=True, readable=True 201 | ), 202 | help="Your TeX file.", 203 | required=True, 204 | ) 205 | @click.option( 206 | "--space", 207 | "-s", 208 | type=int, 209 | default=200, 210 | help="Number of characters tolerated between an anchor point and the introduction \ 211 | of a knowledge. (Default value: 200)", 212 | ) 213 | @click.option( 214 | "--no-update/--update", 215 | "-N/ ", 216 | "noupdate", 217 | default=False, 218 | ) 219 | def anchor(tex_filename: str, space: int, noupdate: bool): 220 | """ 221 | Prints warning when a knowledge is introduced but is not preceded by an anchor point. 
222 | """ 223 | add_anchor.app(tex_filename, space) 224 | if not noupdate: 225 | check_update() 226 | 227 | 228 | if __name__ == "__main__": 229 | cli() 230 | -------------------------------------------------------------------------------- /knowledge_clustering/tex_document.py: -------------------------------------------------------------------------------- 1 | """Handling a Tex document.""" 2 | 3 | from __future__ import annotations # Support of `|` for type union in Python 3.9 4 | from typing import TextIO 5 | 6 | from knowledge_clustering import misc 7 | 8 | 9 | class TexDocument: 10 | """Class for handling a tex document.""" 11 | 12 | def __init__(self, tex_code: str) -> None: 13 | self.tex_code: str = tex_code 14 | self.lines: list[str] = self.tex_code.split("\n") 15 | self.__update_col_line() 16 | self.__clean() 17 | self.length: int = len(self.tex_cleaned) 18 | 19 | def __update_col_line(self) -> None: 20 | """ 21 | Compute two arrays, saying for each index i of self.text, at what column and 22 | what line of the text this index is located. 23 | """ 24 | self.find_line: list[int] = [0] * len(self.tex_code) 25 | self.find_col: list[int] = [0] * len(self.tex_code) 26 | line: int = 1 27 | col: int = 1 28 | for position, letter in enumerate(self.tex_code): 29 | self.find_line[position] = line 30 | self.find_col[position] = col 31 | if letter == "\n": 32 | line += 1 33 | col = 1 34 | else: 35 | col += 1 36 | 37 | def __clean(self): 38 | """ 39 | Reads self.tex_code (the original tex file), given as a single string. 40 | Converts spaces, tabulations and new lines into a single space, except 41 | if there is two consecutive new lines. Removes commented lines. 42 | The cleaned file is stored in self.tex_cleaned. A pointer 43 | from tex_cleaned to tex_code, in the form of an array, is produced in self.pointer. 44 | """ 45 | 46 | # Essentially, the algorithm is a deterministic transducer with five states 47 | # 0: the last character is `normal` (not a space, a tab, nor a new line) ; initial state 48 | # 1: the last character is not normal, 49 | # and no new line was read since the last normal character 50 | # 2: the last character is not normal, 51 | # and exactly one new line was read since the last normal character 52 | # 3: the last character is not normal, 53 | # and at least two new lines were read since the last normal character 54 | # 4: the line is commented. 55 | def is_normal(letter: str) -> bool: 56 | return letter not in [" ", "\t", "\n", "%"] 57 | 58 | def transition( 59 | state: int, letter: str, counter: int 60 | ) -> tuple[int, str, int | None]: 61 | """ 62 | Input: curent state, input letter and the size of produced output so far 63 | Output: returns the new state, the output, and the pointer of the input letter. 
64 | """ 65 | if is_normal(letter): 66 | if state == 4: 67 | return (4, "", None) 68 | return (0, letter, counter) 69 | if letter == "%": 70 | return (4, "", None) 71 | if letter == "\n": 72 | if state == 4: 73 | return (0, "", None) 74 | if state == 0: 75 | return (2, " ", None) 76 | if state == 1: 77 | return (2, "", None) 78 | if state == 2: 79 | return (3, "\\par ", counter) 80 | return (3, "", None) 81 | if letter in [" ", "\t"]: 82 | if state == 0: 83 | return (1, " ", counter) 84 | return (state, "", None) 85 | raise KeyError("Transition not defined", state, letter) 86 | 87 | state: int = 0 88 | tex_cleaned: str = "" 89 | m: int = 0 90 | pointer: list[None | int] = [] 91 | for position, letter in enumerate(self.tex_code): 92 | state, output, input_pointer = transition(state, letter, m) 93 | tex_cleaned += output 94 | m += len(output) 95 | # Put position at index input_pointer 96 | if input_pointer is not None: 97 | pointer += [None] * (input_pointer - len(pointer)) + [position] 98 | self.tex_cleaned: str = tex_cleaned 99 | self.pointer: list[None | int] = pointer 100 | 101 | def print(self, start: int, end: int, n: int, out: TextIO): 102 | """ 103 | Prints the lines between positions (in the clean tex) `start` and `end` 104 | together with `n`-1 lines preceding `start`. 105 | Emphasize the part between `start` and `end`. 106 | """ 107 | start_p = self.pointer[start] 108 | end_p = self.pointer[end] 109 | if isinstance(start_p, int) and isinstance(end_p, int): 110 | l_start: int = self.find_line[start_p] 111 | c_start: int = self.find_col[start_p] 112 | l_end: int = self.find_line[end_p] 113 | c_end: int = self.find_col[end_p] 114 | for i in range(max(0, l_start - n), l_end): 115 | if i + 1 == l_start and i + 1 == l_end: 116 | print( 117 | f"l{i+1}: \t{self.lines[i][:c_start-1]}" 118 | + misc.emph(self.lines[i][c_start - 1 : c_end]) 119 | + self.lines[i][c_end:], 120 | file=out, 121 | ) 122 | elif i + 1 == l_start: 123 | print( 124 | f"l{i+1}: \t{self.lines[i][:c_start-1]}" 125 | + misc.emph(self.lines[i][c_start - 1 :]), 126 | file=out, 127 | ) 128 | elif i + 1 == l_end: 129 | print( 130 | f"l{i+1}: \t" 131 | + misc.emph(self.lines[i][:c_end]) 132 | + self.lines[i][c_end:], 133 | file=out, 134 | ) 135 | elif l_start < i + 1 and i + 1 < l_end: 136 | print(f"l{i+1}: \t" + misc.emph(self.lines[i]), file=out) 137 | else: 138 | print(f"l{i+1}: \t{self.lines[i]}", file=out) 139 | else: 140 | raise IndexError("Undefined pointer", self.pointer, (start, end)) 141 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel", 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = knowledge-clustering 3 | version = attr: knowledge_clustering._version.VERSION 4 | author = Rémi Morvan 5 | author_email = remi@morvan.xyz 6 | description = Automated notion clustering for the knowledge LaTeX package 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/remimorvan/knowledge-clustering 10 | project_urls = 11 | Bug Tracker = https://github.com/remimorvan/knowledge-clustering/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI 
Approved :: MIT License 15 | Operating System :: OS Independent 16 | keywords = 17 | knowledge :: latex :: clustering 18 | 19 | [options] 20 | packages = find: 21 | python_requires = >=3.9 22 | install_requires = 23 | click 24 | click_default_group 25 | nltk 26 | spacy 27 | toposort 28 | unidecode 29 | requests 30 | 31 | [options.package_data] 32 | * = data/* 33 | 34 | [options.entry_points] 35 | console_scripts = 36 | knowledge = knowledge_clustering.scripts.app:cli 37 | 38 | [options.extras_require] 39 | tests = 40 | pytest 41 | filecmp -------------------------------------------------------------------------------- /tests/.ordinal.diagnose.original: -------------------------------------------------------------------------------- 1 | ************************ 2 | * Undefined knowledges * 3 | ************************ 4 | 5 | \knowledge{ignore} 6 | | inseparability 7 | | semigroup 8 | | words 9 | | semigroups 10 | | countable ordinal word -------------------------------------------------------------------------------- /tests/.ordinal.kl.original: -------------------------------------------------------------------------------- 1 | \knowledge{notion} 2 | | word 3 | 4 | \knowledge{notion} 5 | | word@ord 6 | 7 | \knowledge{notion} 8 | | regular language over countable ordinals 9 | | regular languages@ord 10 | 11 | \knowledge{notion} 12 | | separation 13 | -------------------------------------------------------------------------------- /tests/.ordinal.kl.solution: -------------------------------------------------------------------------------- 1 | \knowledge{notion} 2 | | word 3 | % | words 4 | 5 | \knowledge{notion} 6 | | word@ord 7 | % | countable ordinal word 8 | 9 | \knowledge{notion} 10 | | regular language over countable ordinals 11 | | regular languages@ord 12 | 13 | \knowledge{notion} 14 | | separation 15 | % | inseparability 16 | %%%%% NEW KNOWLEDGES 17 | % 18 | %\knowledge{notion} 19 | % | semigroup 20 | % | semigroups 21 | -------------------------------------------------------------------------------- /tests/.ordinal.tex.original: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | \usepackage[utf8]{inputenc} 4 | \usepackage[T1]{fontenc} 5 | \pdfoutput = 1 6 | 7 | \usepackage[breaklinks,hidelinks]{hyperref} 8 | \usepackage{xcolor} 9 | 10 | \usepackage{knowledge} 11 | \knowledgeconfigure{notion} 12 | \knowledgeconfigure{quotation} 13 | \input{ordinal-kl.tex} 14 | 15 | \title{Blabla} 16 | \date{\today} 17 | \author{Charles-Édouard} 18 | 19 | 20 | \begin{document} 21 | 22 | \maketitle 23 | 24 | \AP ""word"" 25 | "words" 26 | ""word@@ord"" 27 | "countable ordinal word" 28 | 29 | blablablablablablablablablablablablablablablablablablablabla 30 | blablablablablablablablablablablablablablablablablablablabla 31 | blablablablablablablablablablablablablablablablablablablabla 32 | 33 | ""regular language over countable ordinals"" 34 | \kl[ord]{regular languages} 35 | \intro{separation} 36 | 37 | "inseparability" 38 | ""semigroup"" 39 | \kl{semigroups} 40 | 41 | 42 | \end{document} -------------------------------------------------------------------------------- /tests/test_addquotes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the modules of knowledge_clustering on which the addquotes command is based. 
3 | """ 4 | 5 | from pathlib import Path 6 | import shutil 7 | 8 | from knowledge_clustering.add_quotes import app as app_addquotes 9 | 10 | 11 | def test_app_addquotes() -> None: 12 | """Tests the addquotes command.""" 13 | shutil.copy("tests/.ordinal.tex.original", "tests/ordinal.tex") 14 | shutil.copy("tests/.ordinal.kl.original", "tests/ordinal.kl") 15 | with open("tests/yes.txt", "w", encoding="utf-8") as yes: 16 | yes.write("y\n" * 100) 17 | with open("tests/yes.txt", "r", encoding="utf-8") as inp: 18 | with open("tests/output_addquotes.txt", "w", encoding="utf-8") as out: 19 | app_addquotes("tests/ordinal.tex", ["tests/ordinal.kl"], 1, inp, out) 20 | with open("tests/output_addquotes.txt", "r", encoding="utf-8") as out: 21 | nb_line_output = sum(1 for _ in out) 22 | b: bool = nb_line_output == 7 23 | p = Path("tests/") 24 | for filename in ["yes.txt", "ordinal.tex", "ordinal.kl", "output_addquotes.txt"]: 25 | (p / filename).unlink() 26 | assert b 27 | -------------------------------------------------------------------------------- /tests/test_anchor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the modules of knowledge_clustering on which the anchor command is based. 3 | """ 4 | 5 | from pathlib import Path 6 | import shutil 7 | 8 | from knowledge_clustering.add_anchor import app as app_anchor 9 | 10 | 11 | def test_app_anchor() -> None: 12 | """Tests the anchor command.""" 13 | shutil.copy("tests/.ordinal.tex.original", "tests/ordinal.tex") 14 | with open("tests/output_anchor.txt", "w", encoding="utf-8") as out: 15 | app_anchor("tests/ordinal.tex", 200, out) 16 | nb_line_output = sum( 17 | 1 for line in open("tests/output_anchor.txt", encoding="utf-8") 18 | ) 19 | b1: bool = nb_line_output == 3 20 | with open("tests/output_anchor.txt", "w", encoding="utf-8") as out: 21 | app_anchor("tests/ordinal.tex", 5, out) 22 | with open("tests/output_anchor.txt", "r", encoding="utf-8") as out: 23 | nb_line_output = sum(1 for _ in out) 24 | b2: bool = nb_line_output == 4 25 | p = Path("tests/") 26 | for filename in ["ordinal.tex", "output_anchor.txt"]: 27 | (p / filename).unlink() 28 | assert b1 and b2 29 | -------------------------------------------------------------------------------- /tests/test_autofinder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the autofinder module. 
3 | """ 4 | 5 | from pathlib import Path 6 | 7 | from knowledge_clustering.autofinder import ( 8 | NoFile, 9 | TooManyFiles, 10 | get_unique_diagnose_file, 11 | get_knowledge_files, 12 | ) 13 | 14 | 15 | def test_autofinder() -> None: 16 | """Test function for the functions get_unique_diagnose_file, get_knowledge_files 17 | from the module autofinder.""" 18 | p = Path("tests/testaf/") 19 | p.mkdir() 20 | test_results = [False] * 6 21 | # 0th test with 1 diagnose file and 3 .kl with a unique default file (OK) 22 | (p / "subdir1").mkdir() 23 | (p / "subdir2").mkdir() 24 | (p / "subdir3").mkdir() 25 | (p / "subdir1/coolproject.diagnose").touch() 26 | (p / "subdir2/abbreviations.kl").touch() 27 | (p / "subdir2/main-default.kl").touch() 28 | (p / "subdir3/omega-automata.kl").touch() 29 | # Content of testaf directory: 30 | # - subdir1 31 | # |-- coolproject.diagnose 32 | # - subdir2 33 | # |-- abbreviations.kl 34 | # |-- main-default.kl 35 | # - subdir3 36 | # |-- omega-automata.kl 37 | try: 38 | dg_file = get_unique_diagnose_file(p) 39 | kl_files = get_knowledge_files(p) 40 | if ( 41 | str(dg_file) == "tests/testaf/subdir1/coolproject.diagnose" 42 | and len(kl_files) == 3 43 | and str(kl_files[2]) == "tests/testaf/subdir2/main-default.kl" 44 | ): 45 | test_results[0] = True 46 | except (NoFile, TooManyFiles): 47 | pass 48 | # 1st test with 1 diagnose file and 4 .kl with a two default files (not OK) 49 | (p / "subdir3/secondary-default.kl").touch() 50 | # Content of testaf directory: 51 | # - subdir1 52 | # |-- coolproject.diagnose 53 | # - subdir2 54 | # |-- abbreviations.kl 55 | # |-- main-default.kl 56 | # - subdir3 57 | # |-- omega-automata.kl 58 | # |-- secondary-default.kl 59 | try: 60 | _ = get_knowledge_files(p) 61 | except TooManyFiles: 62 | test_results[1] = True 63 | # 2nd test with 1 diagnose file and 2 .kl with no default files (not OK) 64 | (p / "subdir2/main-default.kl").unlink() 65 | (p / "subdir3/secondary-default.kl").unlink() 66 | # Content of testaf directory: 67 | # - subdir1 68 | # |-- coolproject.diagnose 69 | # - subdir2 70 | # |-- abbreviations.kl 71 | # - subdir3 72 | # |-- omega-automata.kl 73 | try: 74 | _ = get_knowledge_files(p) 75 | except NoFile: 76 | test_results[2] = True 77 | # 3rd test with 1 diagnose file and 1 .kl with no default files (OK) 78 | (p / "subdir2/abbreviations.kl").unlink() 79 | # Content of testaf directory: 80 | # - subdir1 81 | # |-- coolproject.diagnose 82 | # - subdir2 83 | # - subdir3 84 | # |-- omega-automata.kl 85 | try: 86 | _ = get_knowledge_files(p) 87 | test_results[3] = True 88 | except (NoFile, TooManyFiles): 89 | pass 90 | # 4th test with 2 diagnose file and 1 .kl with no default files (not OK) 91 | (p / "subdir2/another-file.diagnose").touch() 92 | # Content of testaf directory: 93 | # - subdir1 94 | # |-- coolproject.diagnose 95 | # - subdir2 96 | # |-- another-file.diagnose 97 | # - subdir3 98 | # |-- omega-automata.kl 99 | try: 100 | _ = get_unique_diagnose_file(p) 101 | except TooManyFiles: 102 | test_results[4] = True 103 | # 5th test with no diagnose file and 1 .kl with no default files (not OK) 104 | (p / "subdir1/coolproject.diagnose").unlink() 105 | (p / "subdir2/another-file.diagnose").unlink() 106 | # Content of testaf directory: 107 | # - subdir1 108 | # - subdir2 109 | # - subdir3 110 | # |-- omega-automata.kl 111 | try: 112 | _ = get_unique_diagnose_file(p) 113 | except NoFile: 114 | test_results[5] = True 115 | # Remove all files and directory created for the test 116 | (p / "subdir3/omega-automata.kl").unlink() 
118 |     for dirname in ["subdir3", "subdir2", "subdir1", ""]:
119 |         (p / dirname).rmdir()
120 |     assert all(test_results)
121 |
--------------------------------------------------------------------------------
/tests/test_clustering.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for the modules of knowledge_clustering on which the cluster command is based.
3 | """
4 |
5 | from typing import TypeVar
6 | from pathlib import Path
7 | import filecmp
8 | import shutil
9 |
10 | from knowledge_clustering.distance import distance, new_stemmer, normalise_notion
11 | from knowledge_clustering.scope_meaning import infer_scope, infer_all_scopes
12 | from knowledge_clustering.clustering import clustering
13 | from knowledge_clustering.knowledges import Knowledges
14 | from knowledge_clustering.diagnose import parse as parse_diagnose
15 | from knowledge_clustering.config import parse as parse_config
16 | from knowledge_clustering.clustering import app as app_clustering
17 |
18 | T = TypeVar("T")  # Generic type
19 |
20 |
21 | def test_normalise() -> None:
22 |     """Tests the normalise_notion function from the distance module."""
23 |     assert (
24 |         normalise_notion("two-way\\\\rational~relation") == "two-way rational relation"
25 |     )
26 |
27 |
28 | def test_distance() -> None:
29 |     """Tests functions from the distance module."""
30 |     assert distance("", "", [""], {}, "english") == 0
31 |     # Tests where only the empty word is allowed as a prefix. No prior scope meaning is known.
32 |     assert distance("ordinal semigroup", "ordinal semigroups", [""], {}, "english") == 0
33 |     assert distance("cheval", "chevaux", [""], {}, "french") == 0
34 |     assert distance("cheval", "chevaux", [""], {}, "english") > 0
35 |     # Tests with a scope
36 |     assert distance("ordinal semigroup", "semigroups@ordinal", [""], {}, "english") == 0
37 |     assert distance("semigroup", "semigroups@ordinal", [""], {}, "english") > 0
38 |     # Tests with prefixes
39 |     assert distance("foo", "turbofoo", ["", "turbo"], {}, "english") == 0
40 |     assert distance("foo", "turbofoo", [""], {}, "english") > 0
41 |     assert distance("foo", "megafoo", ["", "turbo"], {}, "english") > 0
42 |     assert distance("full", "non-full", ["", "non-"], {}, "english") == 0
43 |     # Tests with accents and math
44 |     assert distance("Büchi", 'B\\"uchi', [""], {}, "english") == 0
45 |     assert (
46 |         distance("Büchi", '\\textsf{$\\omega$-B\\"{u}chi}', ["", "-"], {}, "english")
47 |         == 0
48 |     )
49 |     # Tests with known scope meanings
50 |     assert (
51 |         distance("word@ord", "ordinal word", [""], {"ord": [["ordinal"]]}, "english")
52 |         == 0
53 |     )
54 |     assert distance("word@ord", "ordinal word", [""], {}, "english") > 0
55 |     # Tests with scope (should be case-insensitive)
56 |     assert distance("foo@BaR", "foo@bar", [""], {}, "english") == 0
57 |     # Tests with spaces
58 |     assert distance("foo~bar", "foo bar", [""], {}, "english") == 0
59 |     assert distance("foo\\\\bar", "foo bar", [""], {}, "english") == 0
60 |     assert distance("foo\\\\ bar", "foo bar", [""], {}, "english") == 0
61 |     assert (
62 |         distance("two-way\\\\rational@rel", "two-way rational@rel", [""], {}, "english")
63 |         == 0
64 |     )
65 |
66 |
67 | def compare(l1: list[list[T]], l2: list[list[T]]) -> bool:
68 |     """Checks whether two lists of lists contain the same elements."""
69 |
70 |     def compare_lists(t1: list[T], t2: list[T]) -> bool:
71 |         return set(t1) == set(t2)
72 |
73 |     for t1 in l1:
74 |         if not any(compare_lists(t1, t2) for t2 in l2):
75 |             return False
76 |     for t2 in l2:
77 |         if not any(compare_lists(t1, t2) for t1 in l1):
78 |             return False
79 |     return True
80 |
81 |
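82 | # Illustrative examples for `compare`: inner lists are matched up to set
83 | # equality, in both directions, so compare([[1, 2], [3]], [[2, 1], [3]]) is
84 | # True, while compare([[1, 2]], [[1], [2]]) is False.
85 |
86 |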
87 | def test_scope_meaning() -> None:
88 |     """Tests functions from the scope_meaning module."""
89 |     # Test infer_scope
90 |     assert compare(
91 |         infer_scope(
92 |             ["regular language over countable ordinals", "regular languages@ord"],
93 |             "ord",
94 |             "english",
95 |             new_stemmer("english"),
96 |         ),
97 |         [["ordinals", "countable"]],
98 |     )
99 |     # Test infer_all_scopes
100 |     assert compare(
101 |         infer_all_scopes(
102 |             [
103 |                 [
104 |                     "word@some-scope",
105 |                     "foo word",
106 |                 ],
107 |                 ["langage@some-scope", "bar langage"],
108 |             ],
109 |             "english",
110 |         )["some-scope"],
111 |         [["foo"], ["bar"], ["some-scope"]],
112 |     )
113 |
114 |
115 | def test_clustering() -> None:
116 |     """Tests functions from the clustering module."""
117 |     kls = Knowledges("tests/.ordinal.kl.original")
118 |     unknown_kl = parse_diagnose("tests/.ordinal.diagnose.original")
119 |     list_prefixes = parse_config("knowledge_clustering/data/english.ini")
120 |     scopes_meaning = infer_all_scopes(kls.get_all_bags(), "english")
121 |     clustering(kls, unknown_kl, 0, list_prefixes, scopes_meaning, "english")
122 |     solution = [
123 |         ["word", "words"],
124 |         ["word@ord", "countable ordinal word"],
125 |         ["regular language over countable ordinals", "regular languages@ord"],
126 |         ["separation", "inseparability"],
127 |         ["semigroup", "semigroups"],
128 |     ]
129 |     assert compare(kls.get_all_bags(), solution)
130 |
131 |
132 | def test_app_clustering() -> None:
133 |     """Tests the cluster command."""
134 |     for filename in ["ordinal.kl", "ordinal.diagnose"]:
135 |         shutil.copy(f"tests/.{filename}.original", f"tests/{filename}")
136 |     app_clustering(
137 |         ["tests/ordinal.kl"], "tests/ordinal.diagnose", False, False, "en", None
138 |     )
139 |     # The diagnose file should be left unchanged…
140 |     assert filecmp.cmp(
141 |         "tests/ordinal.diagnose", "tests/.ordinal.diagnose.original", shallow=False
142 |     )
143 |     # … and the knowledge file should have the expected content.
144 |     assert filecmp.cmp("tests/ordinal.kl", "tests/.ordinal.kl.solution", shallow=False)
145 |     p = Path("tests/")
146 |     for filename in ["ordinal.kl", "ordinal.diagnose"]:
147 |         (p / filename).unlink()
148 |
--------------------------------------------------------------------------------