├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── LICENSE.md ├── Makefile ├── README.md ├── examples ├── ordinal │ ├── ordinal.diagnose │ ├── ordinal.kl │ └── ordinal.tex └── preservation │ ├── preservation.diagnose │ └── preservation.kl ├── img ├── preservation-after.png └── preservation-before.png ├── knowledge_clustering ├── __init__.py ├── _version.py ├── add_anchor.py ├── add_quotes.py ├── autofinder.py ├── check_update.py ├── clustering.py ├── config.py ├── cst.py ├── data │ ├── english.ini │ └── french.ini ├── diagnose.py ├── distance.py ├── file_updater.py ├── knowledges.py ├── misc.py ├── scope_meaning.py ├── scripts │ ├── __init__.py │ └── app.py └── tex_document.py ├── pyproject.toml ├── setup.cfg └── tests ├── .ordinal.diagnose.original ├── .ordinal.kl.original ├── .ordinal.kl.solution ├── .ordinal.tex.original ├── test_addquotes.py ├── test_anchor.py ├── test_autofinder.py └── test_clustering.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | deploy: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: '3.x' 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install build 30 | - name: Build package 31 | run: python -m build 32 | - name: Publish package 33 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 34 | with: 35 | user: __token__ 36 | password: ${{ secrets.PYPI_TOKEN }} 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | *.pyc 3 | *.egg-info* 4 | *.whl 5 | *.tar.gz 6 | dist/* 7 | build/* 8 | .vim/* 9 | kw-devel/* 10 | venv-black/* 11 | *.venv 12 | .vscode 13 | .coverage 14 | kl3.11/* -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Rémi Morvan, Thomas Colcombet and Aliaume Lopez 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | VENV-BLACK=black.venv
2 |
3 | .PHONY: black check test coverage build deploy-test
4 |
5 | black:
6 | 	source ./$(VENV-BLACK)/bin/activate && black .
7 |
8 | check:
9 | 	mypy knowledge_clustering/*.py --check-untyped-defs # Check typing
10 | 	pylint knowledge_clustering/*.py # Linter
11 |
12 | test:
13 | 	python -m pytest tests/ -v
14 |
15 | coverage:
16 | 	python -m pytest tests/ --cov
17 |
18 | build:
19 | 	python -m build .
20 |
21 | deploy-test: knowledge_clustering/_version.py
22 | 	python -m twine upload --repository testpypi dist/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # knowledge-clustering
2 |
3 | [![PyPI](https://img.shields.io/pypi/v/knowledge-clustering.svg)](https://pypi.python.org/pypi/knowledge-clustering)
4 |
5 | Command-line tool to help with the use of the [knowledge LaTeX package](https://ctan.org/pkg/knowledge).
6 | A tutorial on how to use both `knowledge` and `knowledge-clustering` can be found [here](https://github.com/remimorvan/knowledge-examples).
7 |
8 | ## Principle
9 |
10 | The goal of `knowledge-clustering` is to help the user write a LaTeX document with
11 | the [knowledge package](https://ctan.org/pkg/knowledge).
12 | It has three features:
13 |
14 | - **Clustering**: provide suggestions to the user of what notions should be grouped together.
15 | - **Add quotes**: find where you might have missed some quotes in your document.
16 | - **Anchor points**: find where you might have missed anchor points in your document.
17 |
18 | The **clustering** algorithm is meant to be used while writing your document, while the last two tools
19 | should be used when your document is (nearly) ready to be published, to check if everything is right.
20 |
21 | ## Installation
22 |
23 | To install (or upgrade) `knowledge-clustering`, you need to have Python 3.9 (or a more recent version), and then run
24 |
25 |     python3 -m pip install --upgrade knowledge-clustering
26 |
27 | and then
28 |
29 |     knowledge init
30 |
31 | To check if you have the latest version of `knowledge-clustering`, you can run
32 |
33 |     knowledge --version
34 |
35 | ## Clustering notions
36 |
37 | ### Syntax
38 |
39 | ```
40 | Usage: knowledge cluster [OPTIONS]
41 |
42 | Defines, as comments in the knowledge files, all the knowledges
43 | occurring in the file.
44 |
45 | Options:
46 | -k, --knowledge FILE File containing the knowledges that are already
47 | defined. Multiple files are allowed; new
48 | knowledges will be written in the last one. If
49 | the option is not specified, all .kl files in the
50 | current directory (and subdirectories,
51 | recursively) will be taken. If there are
52 | multiple files, exactly one of them must end
53 | with `default.kl`.
54 | -d, --diagnose FILE Diagnose file produced by LaTeX. If the option
55 | is not specified, the unique .diagnose file in
56 | the current directory (and subdirectories,
57 | recursively) is taken instead.
58 | -l, --lang [en|fr] Language of your TeX document.
59 | -S, --scope / --no-scope Print the scopes defined in the knowledge file
60 | and print the possible meanings of those scopes
61 | inferred by knowledge-clustering.
62 | -P, --print / --no-print Print all new knowledges.
63 | -N, --no-update / --update Don't look on PyPI if a newer version of
64 | knowledge-clustering is available.
65 | -c, --config-file TEXT Specify the configuration file. By default the
66 | configuration file in the folder
67 | /Users/rmorvan/knowledge-
68 | clustering/knowledge_clustering/data
69 | corresponding to your language is used.
70 | --help Show this message and exit.
71 | ```
72 |
73 | ### Example
74 |
75 | Example files can be found in the `examples/` folder.
76 |
77 | While writing some document, you have defined some knowledges in a file called `preservation.kl` (distinct
78 | from your main `LaTeX` file).
79 | You continued writing your `LaTeX` document (not provided in the `examples/` folder)
80 | for some time, and used some knowledges that were undefined.
81 | When compiling, `LaTeX` and the [`knowledge package`](https://ctan.org/pkg/knowledge) give you a warning
82 | and write in a `.diagnose` file some information explaining what went wrong. This `.diagnose` file contains
83 | a section called "Undefined knowledges" containing all knowledges used in your main `LaTeX` file but not
84 | defined in `preservation.kl`. We reproduced this section
85 | in the `preservation.diagnose` file.
86 |
87 | ![Screenshot of the `preservation.kl` and `preservation.diagnose` files before running knowledge-clustering. `preservation.kl` contains three knowledges, while `preservation.diagnose` contains five undefined knowledges.](img/preservation-before.png "Files `preservation.kl` and `preservation.diagnose` before running knowledge-clustering")
88 |
89 | Normally, you would add every undefined knowledge, one after the other, in your
90 | `preservation.kl`. This is quite burdensome and can
91 | largely be automated. This is precisely what `knowledge-clustering` does: after running
92 |
93 |     knowledge cluster -k preservation.kl -d preservation.diagnose
94 |
95 | your file `preservation.diagnose` is left unchanged
96 | but `preservation.kl` is updated with comments.
97 |
98 | The `cluster` subcommand is optional: you can also write `knowledge -k preservation.kl -d preservation.diagnose`.
99 |
100 | ![After running knowledge-clustering, the five undefined knowledges are included in the `preservation.kl` file as comments.](img/preservation-after.png "Files `preservation.kl` and `preservation.diagnose` after running knowledge-clustering")
101 |
102 | Now you simply have to check that the recommendations of `knowledge-clustering` are
103 | correct, and uncomment those lines.
104 |
105 | ### Autofinder
106 |
107 | If the current directory (and its recursive subdirectories) contains
108 | a unique `.diagnose` file and a unique `.kl` file,
109 | you can simply write `knowledge cluster` (or `knowledge`): the files will be automatically found.
110 |
111 | ### Multiple knowledge files
112 |
113 | If you have **multiple knowledge files**, you can use the `-k` option multiple times.
114 | For instance, you could write:
115 |
116 |     knowledge cluster -k 1.kl -k 2.kl -d ordinal.diagnose
117 |
118 | Synonyms of knowledges defined in `1.kl` (resp. `2.kl`) will be defined, as comments,
119 | in `1.kl` (resp. `2.kl`). New knowledges will always be added, as comments, to the last
120 | file, which is `2.kl` in the example.
121 |
122 | You can also use the autofinder in this case, using `knowledge cluster`
123 | or `knowledge`: if multiple `.kl` files are present in the current directory (and
124 | its recursive subdirectories), exactly one of them must end with `default.kl`: this is
125 | where new knowledges will be put.
126 |
127 | ## Adding quotes
128 |
129 | /!\ This feature is somewhat experimental.
130 |
131 | ```
132 | Usage: knowledge addquotes [OPTIONS]
133 |
134 | Finds knowledges defined in the knowledge files that appear in the tex file
135 | without quote symbols. Proposes to add quotes around them.
136 |
137 | Options:
138 | -t, --tex FILE Your TeX file. [required]
139 | -k, --knowledge FILE File containing the knowledges that are already
140 | defined. Multiple files are allowed; new
141 | knowledges will be written in the last one. If
142 | the option is not specified, all .kl files in the
143 | current directory (and subdirectories,
144 | recursively) will be taken. If there are
145 | multiple files, exactly one of them must end
146 | with `default.kl`.
147 | -p, --print INTEGER When finding a match, number of lines (preceding
148 | the match) that are printed in the prompt to the
149 | user.
150 | -N, --no-update / --update
151 | --help Show this message and exit.
152 | ```
153 |
154 | After running
155 |
156 |     knowledge addquotes -t mydocument.tex -k knowledges1.kl -k knowledges2.kl
157 |
158 | your prompt will propose to add quotes around defined knowledges,
159 | and to define synonyms of knowledges that occur in your TeX file. For instance, if
160 | "algorithm" is a defined knowledge and "algorithms" occurs in your TeX file, then
161 | it will propose to define "algorithms" as a synonym of the knowledge "algorithm",
162 | and to add a pair of quotes around the string "algorithms" that occurs in your TeX file.
163 |
164 | Whenever the algorithm finds a match for a knowledge, it will print the line of
165 | the document where it found the match, and emphasize the string corresponding to the knowledge.
166 | If you want to see more context around each match, you can use the `-p` (or `--print`) option
167 | to print more than one line.
168 |
169 | ## Finding missing anchor points
170 |
171 | ```
172 | Usage: knowledge anchor [OPTIONS]
173 |
174 | Prints a warning when a knowledge is introduced but is not preceded by an
175 | anchor point.
176 |
177 | Options:
178 | -t, --tex FILE Your TeX file. [required]
179 | -s, --space INTEGER Number of characters tolerated between an anchor
180 | point and the introduction of a knowledge.
181 | (Default value: 200)
182 | -N, --no-update / --update
183 | --help Show this message and exit.
184 | ```
185 |
186 | When one runs
187 |
188 |     knowledge anchor -t mydocument.tex
189 |
190 | the tool will print the lines of the document containing the
191 | introduction of a knowledge that is not preceded by an anchor point.
192 | The tolerance on how far away the anchor point can be from the
193 | introduction of a knowledge can be changed with the `-s` (or `--space`)
194 | option. The default value is 200 characters (corresponding to 2-3 lines in a
195 | TeX document).
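As an illustration, consider the following hypothetical TeX snippet (`\AP` and `\itemAP` are the anchor-point macros the tool looks for; knowledges are introduced with `""..."" ` or `\intro{...}`). The first introduction is fine, since an anchor point immediately precedes it; the second one would be reported, assuming the comment stands for more than 200 characters of text:

```tex
\AP A ""semigroup"" is a set equipped with an associative binary operation.
% ... more than 200 characters of text without any \AP or \itemAP ...
A ""monoid"" is a semigroup with a unit element. % `knowledge anchor` reports this line
```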
196 |
197 | ## Devel using virtualenv
198 |
199 | Using `venv` and the `--editable` option from `pip` allows for an easy
200 | setup of a development environment that will match a future user install without
201 | the hassle.
202 |
203 | For bash and zsh users
204 |
205 | ```bash
206 | python3 -m venv kl.venv
207 | source ./kl.venv/bin/activate
208 | python3 -m pip install --editable .
209 | ```
210 |
211 | For fish users
212 |
213 | ```fish
214 | python3 -m venv kl.venv
215 | source ./kl.venv/bin/activate.fish
216 | python3 -m pip install --editable .
217 | ```
218 |
219 | ## FAQ
220 |
221 | - `knowledge: command not found` after installing `knowledge-clustering`
222 |   > Make sure you have Python>=3.9.
223 |
224 | - When running `knowledge`, I obtain a long error message indicating "Resource punkt not found."
225 |   > Run `knowledge init`.
226 |
227 | - My shell doesn't autocomplete the command `knowledge`.
228 |   > Depending on whether you use `zsh` or `bash` write
229 |   >
230 |   >     eval "`pip completion --<shell>`"
231 |   >
232 |   > (where `<shell>` is either `zsh` or `bash`)
233 |   > in your `.zshrc` (or `.bashrc`) file and then,
234 |   > either launch a new terminal or run `source ~/.zshrc`
235 |   > (or `source ~/.bashrc`).
236 |
237 | - `Error: Got unexpected extra argument` when using multiple knowledge files.
238 |   > You should use the option `-k` before **every** knowledge file, like in
239 |   >
240 |   >     knowledge cluster -k 1.kl -k 2.kl -d blabla.diagnose
241 |
242 | - I've updated `knowledge-clustering` but I still don't have the latest version (which can be checked using `knowledge --version`):
243 |   This can happen if you have multiple versions of `python` (and multiple versions
244 |   of `knowledge-clustering`).
245 |   > Type `where python3`, and uninstall `knowledge-clustering`
246 |   from everywhere (using `<path>/python3 -m pip uninstall knowledge-clustering`).
247 |   Then try to reinstall `knowledge-clustering`
248 |   by running `python3 -m pip install --upgrade knowledge-clustering`.
249 |
--------------------------------------------------------------------------------
/examples/ordinal/ordinal.diagnose:
--------------------------------------------------------------------------------
1 | ***********
2 | * Summary *
3 | ***********
4 |
5 | 181 undefined knowledge(s).
6 | 1 autoreference(s) are introduced twice.
7 | 1 autoreference(s) are used but not introduced.
8 |
9 | 44 autoreference(s) are properly used.
10 | 1 autoreference(s) are defined but not used.
11 |
12 |
13 | ********
14 | * Help *
15 | ********
16 |
17 | \knowledgeconfigure{diagnose bar=false} deactivate `|'-notation in diagnose file.
18 | \knowledgeconfigure{diagnose help=false} deactivate long help in the diagnose file.
19 | \knowledgeconfigure{diagnose line=true} add line numbers to diagnose file.
20 | 21 | ************************ 22 | * Undefined knowledges * 23 | ************************ 24 | 25 | \knowledge{ignore} 26 | % introduction.tex:5 27 | | \FO -separability 28 | | \FO -formula 29 | | countable ordinal words 30 | % introduction.tex:6 31 | | separation problem 32 | % introduction.tex:7 33 | | regular languages of countable ordinal words 34 | % introduction.tex:9 35 | | yes 36 | | \FO -separator 37 | % introduction.tex:10 38 | | separates 39 | % introduction.tex:11 40 | | ie 41 | % introduction.tex:12 42 | | no 43 | | witness function 44 | % introduction.tex:13 45 | | \FO -sentence 46 | % introduction.tex:23 47 | | Countable ordinal words 48 | % introduction.tex:24 49 | | regular languages@COW 50 | % introduction.tex:31 51 | | countable ordinals 52 | | ordinal monoids 53 | % introduction.tex:35 54 | | \FO -pointlike sets 55 | | ordinal monoid 56 | % introduction.tex:36 57 | | \FO -definable@lang 58 | | saturation 59 | | \FO -approximant 60 | % introduction.tex:41 61 | | aperiodic 62 | | syntactic monoid 63 | | $\Jeq $-trivial 64 | % introduction.tex:42 65 | | aperiodic pointlike sets 66 | | covering problem 67 | % introduction.tex:46 68 | | covering problems 69 | % introduction.tex:49 70 | | scattered@linord 71 | % introduction.tex:53 72 | | \FO -separation 73 | % introduction.tex:57 74 | | first-order logic 75 | | first-order definable maps 76 | % introduction.tex:58 77 | | algorithm 78 | % introduction.tex:60 79 | | pointlikes 80 | % preliminaries.tex:10 81 | | linear ordering 82 | % preliminaries.tex:11 83 | | countable@linord 84 | | finite@linord 85 | % preliminaries.tex:12 86 | | linear orderings 87 | % preliminaries.tex:13 88 | | morphism@linord 89 | % preliminaries.tex:15 90 | | isomorphism@linord 91 | % preliminaries.tex:16 92 | | morphism@linord 93 | % preliminaries.tex:18 94 | | sum@linord 95 | | product@linord 96 | % preliminaries.tex:42 97 | | well-founded 98 | % preliminaries.tex:44 99 | | ordinal 100 | % preliminaries.tex:45 101 | | isomorphism@linord 102 | % preliminaries.tex:48 103 | | ordinals 104 | % preliminaries.tex:50 105 | | embedding@linord 106 | % preliminaries.tex:53 107 | | successor ordinal 108 | % preliminaries.tex:54 109 | | limit ordinal 110 | % preliminaries.tex:94 111 | | word 112 | % preliminaries.tex:95 113 | | domain 114 | % preliminaries.tex:97 115 | | countable@word 116 | | finite@word 117 | | scattered@word 118 | | $\omega $-word 119 | % preliminaries.tex:98 120 | | countable@linord 121 | | finite@linord 122 | | scattered@linord 123 | % preliminaries.tex:99 124 | | countable ordinal word 125 | | countable@linord 126 | | ordinal@linord 127 | % preliminaries.tex:102 128 | | finite words 129 | % preliminaries.tex:112 130 | | omega iteration 131 | % preliminaries.tex:132 132 | | semigroup 133 | % preliminaries.tex:133 134 | | monoid 135 | % preliminaries.tex:137 136 | | idempotent 137 | % preliminaries.tex:139 138 | | idempotent power 139 | % preliminaries.tex:147 140 | | group-trivial 141 | % preliminaries.tex:150 142 | | countable ordinal 143 | % preliminaries.tex:152 144 | | words 145 | % preliminaries.tex:168 146 | | generalised product 147 | % preliminaries.tex:177 148 | | generalised associativity 149 | % preliminaries.tex:186 150 | | ordinal monoid morphism 151 | % preliminaries.tex:192 152 | | ordered ordinal monoid 153 | % preliminaries.tex:196 154 | | alphabet 155 | | recognised@OM 156 | % preliminaries.tex:200 157 | | recognisable@OM 158 | % preliminaries.tex:201 159 | | recognisable@OM 160 | % preliminaries.tex:202 161 | | 
regular@cow 162 | % preliminaries.tex:236 163 | | presentation@OM 164 | % preliminaries.tex:244 165 | | power ordinal monoid 166 | % preliminaries.tex:299 167 | | Free variables 168 | % preliminaries.tex:300 169 | | free variables 170 | % preliminaries.tex:303 171 | | valuation 172 | % preliminaries.tex:304 173 | | word@ord 174 | % preliminaries.tex:310 175 | | word@ord 176 | % preliminaries.tex:315 177 | | satisfies 178 | | accepts 179 | % preliminaries.tex:319 180 | | \FO -definable@lang 181 | % preliminaries.tex:321 182 | | words@ord 183 | % preliminaries.tex:339 184 | | Bedon's theorem 185 | % preliminaries.tex:341 186 | | \FO -definable@lang 187 | % preliminaries.tex:342 188 | | recognised@OM 189 | % preliminaries.tex:348 190 | | \FO -definable@map 191 | % preliminaries.tex:350 192 | | \FO -definable language 193 | % preliminaries.tex:352 194 | | \FO -definable@map 195 | % preliminaries.tex:375 196 | | \FO -definable@map 197 | % preliminaries.tex:378 198 | | \FO -definable@map 199 | % preliminaries.tex:397 200 | | condensation 201 | % preliminaries.tex:407 202 | | condensation formula 203 | % preliminaries.tex:425 204 | | finite condensation 205 | % preliminaries.tex:432 206 | | word@ord 207 | % preliminaries.tex:437 208 | | \FO -definable functions 209 | % preliminaries.tex:438 210 | | \FO -definable function 211 | % preliminaries.tex:444 212 | | condensation \FO -formula 213 | % preliminaries.tex:452 214 | | \FO -definable@map 215 | % algorithm.tex:84 216 | | recognised@OM 217 | % algorithm.tex:122 218 | | $\omega $-iteration 219 | % algorithm.tex:160 220 | | words@ord 221 | | words@ord 222 | | words@ord 223 | | words@ord 224 | | words@ord 225 | | words@ord 226 | | words@ord 227 | | words@ord 228 | | words@ord 229 | | words@ord 230 | | words@ord 231 | | words@ord 232 | | words@ord 233 | | words@ord 234 | | words@ord 235 | | words@ord 236 | | words@ord 237 | | words@ord 238 | | words@ord 239 | | words@ord 240 | | words@ord 241 | | words@ord 242 | | words@ord 243 | | words@ord 244 | % algorithm.tex:182 245 | | \FO -definable@lang 246 | | recognising@OM 247 | % algorithm.tex:186 248 | | presentation@OM 249 | % algorithm.tex:193 250 | | \FO -separable 251 | % algorithm.tex:206 252 | | \FO -separator sentence 253 | % algorithm.tex:207 254 | | pointlike sets 255 | % answer-no.tex:10 256 | | quantifier depth 257 | % answer-no.tex:12 258 | | \FOk -equivalent 259 | % answer-no.tex:61 260 | | \FO -separated 261 | % answer-no.tex:164 262 | | \FO -inseparability 263 | % answer-yes.tex:8 264 | | ordinal monoids with merge 265 | % answer-yes.tex:9 266 | | \FO -approximants 267 | | \FO -definable@map 268 | % answer-yes.tex:10 269 | | ordinal monoid with merge 270 | % answer-yes.tex:11 271 | | $\omega $-words 272 | % answer-yes.tex:13 273 | | Merge operators 274 | | {\FO }-approximants 275 | % answer-yes.tex:19 276 | | presentation@OM 277 | % answer-yes.tex:20 278 | | merge operator 279 | % answer-yes.tex:33 280 | | \FO -definable@map 281 | % answer-yes.tex:35 282 | | \FO -definable map 283 | % answer-yes.tex:67 284 | | \FO -definable@map 285 | | \FO -definable@lang 286 | % answer-yes.tex:89 287 | | regular language@cow 288 | | recognised@OM 289 | % answer-yes.tex:91 290 | | aperiodicity 291 | % answer-yes.tex:106 292 | | words@ord 293 | % answer-yes.tex:219 294 | | \FO -definable@lang 295 | % answer-yes.tex:261 296 | | Green's relations 297 | % answer-yes.tex:347 298 | | \FO -definable@lang 299 | % answer-yes.tex:348 300 | | \FO -definable@map 301 | % related.tex:10 302 | | \FO -covering problem 
303 | % related.tex:11 304 | | regular languages@cow 305 | | \FO -definable languages 306 | % related.tex:12 307 | | separation problems 308 | % related.tex:13 309 | | separable 310 | % related.tex:20 311 | | \FOk -closure 312 | % related.tex:38 313 | | semigroups 314 | % related.tex:53 315 | | \FO -covering 316 | % related.tex:55 317 | | aperiodic pointlikes 318 | % related.tex:60 319 | | finite words 320 | % conclusion.tex:5 321 | | words of countable ordinal length 322 | | words of length~$\omega $ 323 | % conclusion.tex:8 324 | | $\omega $-words 325 | % conclusion.tex:12 326 | | $\omega $-iterations 327 | % conclusion.tex:15 328 | | finite word 329 | % conclusion.tex:18 330 | | scattered@word 331 | | countable words 332 | | scattered@word 333 | | inseparability 334 | | semigroup 335 | | words 336 | | semigroups 337 | | countable ordinal word 338 | 339 | **************************** 340 | * autoref-introduced-twice * 341 | **************************** 342 | 343 | % answer-yes.tex:66 344 | answer-yes.tex:66: {\singordmap }{default}{base} 345 | answer-yes.tex:66: {\singordmap }{default}{base} 346 | 347 | 348 | ****************************** 349 | * Autoref used without intro * 350 | ****************************** 351 | 352 | % macros.tex:303 353 | \nointro{default}{base}{\Jeq } 354 | 355 | 356 | *********************************** 357 | * Autoref introduced but not used * 358 | *********************************** 359 | 360 | % macros.tex:157 361 | macros.tex:157: {\lessord }{default}{base} 362 | 363 | 364 | -------------------------------------------------------------------------------- /examples/ordinal/ordinal.kl: -------------------------------------------------------------------------------- 1 | %%%%% NEW KNOWLEDGES 2 | % 3 | %\knowledge{notion} 4 | % | \FO -separability 5 | % | \FO -separator 6 | % | separates 7 | % | \FO -separation 8 | % | \FO -separable 9 | % | \FO -separated 10 | % | separable 11 | % | inseparability 12 | % | \FO -inseparability 13 | % 14 | %\knowledge{notion} 15 | % | \FO -formula 16 | % 17 | %\knowledge{notion} 18 | % | countable ordinal words 19 | % | Countable ordinal words 20 | % | countable ordinal word 21 | % 22 | %\knowledge{notion} 23 | % | separation problem 24 | % | separation problems 25 | % 26 | %\knowledge{notion} 27 | % | regular languages of countable ordinal words 28 | % 29 | %\knowledge{notion} 30 | % | yes 31 | % 32 | %\knowledge{notion} 33 | % | ie 34 | % 35 | %\knowledge{notion} 36 | % | no 37 | % 38 | %\knowledge{notion} 39 | % | witness function 40 | % 41 | %\knowledge{notion} 42 | % | \FO -sentence 43 | % 44 | %\knowledge{notion} 45 | % | regular languages@COW 46 | % 47 | %\knowledge{notion} 48 | % | countable ordinals 49 | % | countable ordinal 50 | % 51 | %\knowledge{notion} 52 | % | ordinal monoids 53 | % | ordinal monoid 54 | % 55 | %\knowledge{notion} 56 | % | \FO -pointlike sets 57 | % | pointlike sets 58 | % 59 | %\knowledge{notion} 60 | % | \FO -definable@lang 61 | % 62 | %\knowledge{notion} 63 | % | saturation 64 | % 65 | %\knowledge{notion} 66 | % | \FO -approximant 67 | % | \FO -approximants 68 | % | {\FO }-approximants 69 | % 70 | %\knowledge{notion} 71 | % | aperiodic 72 | % | aperiodicity 73 | % 74 | %\knowledge{notion} 75 | % | syntactic monoid 76 | % 77 | %\knowledge{notion} 78 | % | $\Jeq $-trivial 79 | % 80 | %\knowledge{notion} 81 | % | aperiodic pointlike sets 82 | % 83 | %\knowledge{notion} 84 | % | covering problem 85 | % | covering problems 86 | % | \FO -covering problem 87 | % 88 | %\knowledge{notion} 89 | % | 
scattered@linord 90 | % 91 | %\knowledge{notion} 92 | % | first-order logic 93 | % 94 | %\knowledge{notion} 95 | % | first-order definable maps 96 | % 97 | %\knowledge{notion} 98 | % | algorithm 99 | % 100 | %\knowledge{notion} 101 | % | pointlikes 102 | % 103 | %\knowledge{notion} 104 | % | linear ordering 105 | % | linear orderings 106 | % 107 | %\knowledge{notion} 108 | % | countable@linord 109 | % 110 | %\knowledge{notion} 111 | % | finite@linord 112 | % 113 | %\knowledge{notion} 114 | % | morphism@linord 115 | % 116 | %\knowledge{notion} 117 | % | isomorphism@linord 118 | % 119 | %\knowledge{notion} 120 | % | sum@linord 121 | % 122 | %\knowledge{notion} 123 | % | product@linord 124 | % 125 | %\knowledge{notion} 126 | % | well-founded 127 | % 128 | %\knowledge{notion} 129 | % | ordinal 130 | % | ordinals 131 | % 132 | %\knowledge{notion} 133 | % | embedding@linord 134 | % 135 | %\knowledge{notion} 136 | % | successor ordinal 137 | % 138 | %\knowledge{notion} 139 | % | limit ordinal 140 | % 141 | %\knowledge{notion} 142 | % | word 143 | % | $\omega $-word 144 | % | words 145 | % | $\omega $-words 146 | % 147 | %\knowledge{notion} 148 | % | domain 149 | % 150 | %\knowledge{notion} 151 | % | countable@word 152 | % | countable words 153 | % 154 | %\knowledge{notion} 155 | % | finite@word 156 | % | finite words 157 | % | finite words 158 | % | finite word 159 | % 160 | %\knowledge{notion} 161 | % | scattered@word 162 | % 163 | %\knowledge{notion} 164 | % | ordinal@linord 165 | % 166 | %\knowledge{notion} 167 | % | omega iteration 168 | % 169 | %\knowledge{notion} 170 | % | semigroup 171 | % | semigroups 172 | % 173 | %\knowledge{notion} 174 | % | monoid 175 | % 176 | %\knowledge{notion} 177 | % | idempotent 178 | % 179 | %\knowledge{notion} 180 | % | idempotent power 181 | % 182 | %\knowledge{notion} 183 | % | group-trivial 184 | % 185 | %\knowledge{notion} 186 | % | generalised product 187 | % 188 | %\knowledge{notion} 189 | % | generalised associativity 190 | % 191 | %\knowledge{notion} 192 | % | ordinal monoid morphism 193 | % 194 | %\knowledge{notion} 195 | % | ordered ordinal monoid 196 | % 197 | %\knowledge{notion} 198 | % | alphabet 199 | % 200 | %\knowledge{notion} 201 | % | recognised@OM 202 | % | recognisable@OM 203 | % | recognising@OM 204 | % 205 | %\knowledge{notion} 206 | % | regular@cow 207 | % 208 | %\knowledge{notion} 209 | % | presentation@OM 210 | % 211 | %\knowledge{notion} 212 | % | power ordinal monoid 213 | % 214 | %\knowledge{notion} 215 | % | Free variables 216 | % | free variables 217 | % 218 | %\knowledge{notion} 219 | % | valuation 220 | % 221 | %\knowledge{notion} 222 | % | word@ord 223 | % | words@ord 224 | % 225 | %\knowledge{notion} 226 | % | satisfies 227 | % 228 | %\knowledge{notion} 229 | % | accepts 230 | % 231 | %\knowledge{notion} 232 | % | Bedon's theorem 233 | % 234 | %\knowledge{notion} 235 | % | \FO -definable@map 236 | % | \FO -definable map 237 | % 238 | %\knowledge{notion} 239 | % | \FO -definable language 240 | % | \FO -definable languages 241 | % 242 | %\knowledge{notion} 243 | % | condensation 244 | % 245 | %\knowledge{notion} 246 | % | condensation formula 247 | % | condensation \FO -formula 248 | % 249 | %\knowledge{notion} 250 | % | finite condensation 251 | % 252 | %\knowledge{notion} 253 | % | \FO -definable functions 254 | % | \FO -definable function 255 | % 256 | %\knowledge{notion} 257 | % | $\omega $-iteration 258 | % | $\omega $-iterations 259 | % 260 | %\knowledge{notion} 261 | % | \FO -separator sentence 262 | % 263 | 
%\knowledge{notion} 264 | % | quantifier depth 265 | % 266 | %\knowledge{notion} 267 | % | \FOk -equivalent 268 | % 269 | %\knowledge{notion} 270 | % | ordinal monoids with merge 271 | % | ordinal monoid with merge 272 | % 273 | %\knowledge{notion} 274 | % | Merge operators 275 | % | merge operator 276 | % 277 | %\knowledge{notion} 278 | % | regular language@cow 279 | % | regular languages@cow 280 | % 281 | %\knowledge{notion} 282 | % | Green's relations 283 | % 284 | %\knowledge{notion} 285 | % | \FOk -closure 286 | % 287 | %\knowledge{notion} 288 | % | \FO -covering 289 | % 290 | %\knowledge{notion} 291 | % | aperiodic pointlikes 292 | % 293 | %\knowledge{notion} 294 | % | words of countable ordinal length 295 | % 296 | %\knowledge{notion} 297 | % | words of length~$\omega $ 298 | -------------------------------------------------------------------------------- /examples/ordinal/ordinal.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | \usepackage[utf8]{inputenc} 4 | \usepackage[T1]{fontenc} 5 | \pdfoutput = 1 6 | 7 | \usepackage[breaklinks,hidelinks]{hyperref} 8 | \usepackage{xcolor} 9 | 10 | \usepackage{knowledge} 11 | \knowledgeconfigure{notion} 12 | \knowledgeconfigure{quotation} 13 | \input{ordinal-kl.tex} 14 | 15 | \title{Blabla} 16 | \date{\today} 17 | \author{Charles-Édouard} 18 | 19 | 20 | \begin{document} 21 | 22 | \maketitle 23 | 24 | \AP ""word"" 25 | "words" 26 | ""word@@ord"" 27 | "countable ordinal word" 28 | ""regular language over countable ordinals"" 29 | \kl[ord]{regular languages} 30 | \intro{separation} 31 | "inseparability" 32 | ""semigroup"" 33 | \kl{semigroups} 34 | 35 | 36 | \end{document} -------------------------------------------------------------------------------- /examples/preservation/preservation.diagnose: -------------------------------------------------------------------------------- 1 | ************************ 2 | * Undefined knowledges * 3 | ************************ 4 | 5 | \knowledge{ignore} 6 | | preserved under extension 7 | | preservation under extension 8 | | preservation under substructures 9 | | substructures 10 | | homomorphisms 11 | -------------------------------------------------------------------------------- /examples/preservation/preservation.kl: -------------------------------------------------------------------------------- 1 | \knowledge{notion} 2 | | extensions 3 | 4 | \knowledge{notion} 5 | | preservation under extensions 6 | % | preservation under extension 7 | 8 | \knowledge{notion} 9 | | preserved under extensions 10 | % | preserved under extension 11 | %%%%% NEW KNOWLEDGES 12 | % 13 | %\knowledge{notion} 14 | % | preservation under substructures 15 | % 16 | %\knowledge{notion} 17 | % | substructures 18 | % 19 | %\knowledge{notion} 20 | % | homomorphisms 21 | -------------------------------------------------------------------------------- /img/preservation-after.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remimorvan/knowledge-clustering/4ab80f28b0a1796d682eaf365828580f05366ba6/img/preservation-after.png -------------------------------------------------------------------------------- /img/preservation-before.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remimorvan/knowledge-clustering/4ab80f28b0a1796d682eaf365828580f05366ba6/img/preservation-before.png 
--------------------------------------------------------------------------------
/knowledge_clustering/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/remimorvan/knowledge-clustering/4ab80f28b0a1796d682eaf365828580f05366ba6/knowledge_clustering/__init__.py
--------------------------------------------------------------------------------
/knowledge_clustering/_version.py:
--------------------------------------------------------------------------------
1 | """Version of knowledge-clustering."""
2 | VERSION = "0.7.4"
3 |
--------------------------------------------------------------------------------
/knowledge_clustering/add_anchor.py:
--------------------------------------------------------------------------------
1 | """
2 | Adding anchor points to a document.
3 | """
4 |
5 | from __future__ import annotations  # Support of `|` for type union in Python 3.9
6 |
7 | import re  # Regular expressions
8 | from typing import TextIO
9 | import sys
10 |
11 | from knowledge_clustering.tex_document import TexDocument
12 | from knowledge_clustering import misc, cst
13 |
14 |
15 | def app(tex_filename: str, space: int, out: TextIO = sys.stdout) -> None:
16 |     """
17 |     Prints a warning when a knowledge is introduced but is not preceded by an anchor point.
18 |     Args:
19 |         tex_filename: the name of the tex file.
20 |         space: an integer specifying the maximal number of characters allowed between the
21 |             introduction of a knowledge and an anchor point.
22 |         out: an output stream.
23 |     """
24 |     with open(tex_filename, "r", encoding="utf-8") as f:
25 |         tex_doc = TexDocument(f.read())
26 |     return missing_anchor(tex_doc, space, out)
27 |
28 |
29 | def missing_anchor(tex_doc: TexDocument, space: int, out: TextIO) -> None:
30 |     """
31 |     Prints line numbers containing the introduction of a knowledge which
32 |     is further away from an anchor point than the integer given as input.
33 |
34 |     Args:
35 |         tex_doc: a TeX document.
36 |         space: the maximal distance between the introduction of a
37 |             knowledge and the anchor point preceding it.
38 |         out: an output stream.
39 |     """
40 |     # First, compute the list of pairs (i1,i2,i3,i4) corresponding to
41 |     # the indices in s = tex_doc.tex_cleaned of some pair in cst.INTRO_DELIMITERS, i.e.
42 |     # (s[i1:i2], s[i3:i4]) is in cst.INTRO_DELIMITERS
43 |     matches: list[tuple[int, int, int, int]] = []
44 |     is_end_of_match = [False for _ in range(len(tex_doc.tex_cleaned))]
45 |     for beg_str, end_str in cst.INTRO_DELIMITERS:
46 |         for i_match in re.finditer(re.escape(beg_str), tex_doc.tex_cleaned):
47 |             i1: int = i_match.start()
48 |             i2: int = i_match.end()
49 |             if not is_end_of_match[i1]:
50 |                 i3: int = i2 + tex_doc.tex_cleaned[i2:].find(end_str)
51 |                 i4: int = i3 + len(end_str)
52 |                 if i3 >= i2:  # i3 < i2 exactly when find() returned -1 (no closing delimiter)
53 |                     matches.append((i1, i2, i3, i4))
54 |                     is_end_of_match[i3] = True
55 |     matches.sort(key=lambda x: x[0])
56 |     for i1, i2, i3, _ in matches:
57 |         beg: int = max(0, i1 - space)
58 |         if not any(ap_str in tex_doc.tex_cleaned[beg:i1] for ap_str in cst.AP_STRING):
59 |             start_pt: int | None = tex_doc.pointer[i1]
60 |             if start_pt is not None:
61 |                 message: str = f"Missing anchor point at line {tex_doc.find_line[start_pt]} (knowledge: {misc.emph(tex_doc.tex_cleaned[i2:i3])})."
62 |                 print(message, file=out)
63 |             else:
64 |                 raise IndexError("Undefined pointer", tex_doc.pointer, i1)
65 |
--------------------------------------------------------------------------------
/knowledge_clustering/add_quotes.py:
--------------------------------------------------------------------------------
1 | """
2 | Add missing quotes around knowledges occurring in a TeX document.
3 | """
4 |
5 | from __future__ import annotations  # Support of `|` for type union in Python 3.9
6 |
7 | import re  # Regular expressions
8 | from typing import NamedTuple, TextIO
9 | import sys
10 |
11 | from knowledge_clustering.knowledges import KnowledgesList, remove_redundant_files
12 | from knowledge_clustering.tex_document import TexDocument
13 | from knowledge_clustering import file_updater, misc, cst
14 |
15 |
16 | class NewKL(NamedTuple):
17 |     """
18 |     Object storing a new knowledge, together with its starting and ending points in some TeX
19 |     document, and a smaller knowledge that is already known and is a substring of
20 |     the new knowledge.
21 |     """
22 |
23 |     kl_origin: str
24 |     start_origin: int
25 |     end_origin: int
26 |     kl: str
27 |     start: int
28 |     end: int
29 |
30 |
31 | class AddQuote(NamedTuple):
32 |     """
33 |     Stores the starting and ending indexes of the occurrence of some knowledge in a TeX document.
34 |     """
35 |
36 |     kl: str
37 |     start: int
38 |     end: int
39 |
40 |
41 | def ask_consent(message: str, inp: TextIO, out: TextIO):
42 |     """
43 |     Asks whether the user wants to do an action, after printing the string `message`.
44 |     Returns a boolean.
45 |     """
46 |     print(message, file=out)
47 |     ans = inp.readline().rstrip("\n")
48 |     return ans.lower() in ["y", "yes"]
49 |
50 |
51 | def app(
52 |     tex_filename: str,
53 |     kl_filenames: list[str],
54 |     print_line: int,
55 |     inp: TextIO = sys.stdin,
56 |     out: TextIO = sys.stdout,
57 | ) -> None:
58 |     """
59 |     Finds knowledges defined in the knowledge files that appear in the tex file without quote
60 |     symbols. Proposes to add quotes around them.
61 |     Args:
62 |         tex_filename: the name of the tex file.
63 |         kl_filenames: the names of the knowledge files.
64 |         print_line: an integer specifying how many lines of the tex file should be printed.
65 |         inp: input stream.
66 |         out: output stream.
67 |     """
68 |     tex_hash = file_updater.hash_file(tex_filename)
69 |     with open(tex_filename, "r", encoding="utf-8") as f:
70 |         tex_doc = TexDocument(f.read())
71 |         f.close()
72 |     kls = KnowledgesList(remove_redundant_files(kl_filenames))
73 |     tex_document_new, new_knowledges = quote_maximal_substrings(
74 |         tex_doc, kls, print_line, inp, out
75 |     )
76 |     with file_updater.AtomicUpdate(tex_filename, original_hash=tex_hash) as f:
77 |         f.write(tex_document_new)
78 |         f.close()
79 |     for known_kl, new_kl in new_knowledges:
80 |         kls.define_synonym_of(new_kl, known_kl)
81 |     kls.write_knowledges_in_file(nocomment=True)
82 |
83 |
84 | def add_quote(
85 |     tex_doc: TexDocument,
86 |     operations: list[NewKL | AddQuote],
87 |     print_line: int,
88 |     inp: TextIO,
89 |     out: TextIO,
90 | ) -> tuple[str, list[tuple[str, str]]]:
91 |     """
92 |     In the TeX document, for every operation of type AddQuote, proposes to add quotes before
93 |     and after the match with the knowledge.
94 |     For every operation of type NewKL, proposes to define a new knowledge, and to add
95 |     quotes before and after the match.
96 |
97 |     Args:
98 |         tex_doc: a TeX document.
99 |         operations: a list of operations, whose type is either NewKL or AddQuote.
100 |         print_line: an integer specifying how many lines of the tex file should be printed.
101 |         inp: an input stream.
102 |         out: an output stream.
103 |     Asks the user, for each operation, whether to perform it; before each
104 |     question, prints the print_line lines of the document preceding the match.
105 |     Returns the new TeX code together with the list of pairs
106 |     (original knowledge, synonym) that the user chose to define.
107 |     """
108 |     result: str = ""
109 |     new_knowledges: list[tuple[str, str]] = []
110 |     ignore_synonym = []
111 |     ignore_subknowledge = []
112 |     operations.sort(key=lambda x: x.start)
113 |     operations_addquote: list[AddQuote] = []
114 |     for op in operations:
115 |         if isinstance(op, NewKL):
116 |             if op.kl not in ignore_synonym:
117 |                 if op.kl not in [k for (_, k) in new_knowledges]:
118 |                     # Propose to the user to define a synonym
119 |                     tex_doc.print(op.start, op.end, print_line, out)
120 |                     message = (
121 |                         f"Do you want to add `{misc.emph_alt(op.kl)}` as a synonym "
122 |                         f"of `{misc.emph_alt(op.kl_origin)}` and add quotes? [y/n] "
123 |                     )
124 |                     if ask_consent(message, inp, out):
125 |                         # Adds op.kl as a new knowledge, defined as a synonym of op.kl_origin
126 |                         new_knowledges.append((op.kl_origin, op.kl))
127 |                         operations_addquote.append(AddQuote(op.kl, op.start, op.end))
128 |                         # Removes any operations occurring on a substring of our new knowledge
129 |                         for op2 in operations:
130 |                             if isinstance(op2, AddQuote):
131 |                                 if op.start <= op2.start and op2.end <= op.end:
132 |                                     operations.remove(op2)
133 |                     else:
134 |                         # From this point, do not propose again to define op.kl as a new knowledge.
135 |                         ignore_synonym.append(op.kl)
136 |                         if (
137 |                             op.kl_origin
138 |                             == tex_doc.tex_code[op.start_origin : op.end_origin + 1]
139 |                         ):
140 |                             # Propose to the user to add quotes around the original knowledge
141 |                             # instead, if we have a precise match.
142 |                             if ask_consent(
143 |                                 f"Add quotes around `{misc.emph(op.kl_origin)}` instead? [y/n] ",
144 |                                 inp,
145 |                                 out,
146 |                             ):
147 |                                 operations_addquote.append(
148 |                                     AddQuote(
149 |                                         op.kl_origin, op.start_origin, op.end_origin
150 |                                     )
151 |                                 )
152 |                             else:
153 |                                 ignore_subknowledge.append(op.kl)
154 |                         print("", file=out)
155 |                 else:
156 |                     # If op.kl was already accepted as a synonym earlier, treat it
157 |                     # as a regular knowledge
158 |                     op = AddQuote(op.kl, op.start, op.end)
159 |             elif op.kl not in ignore_subknowledge:
160 |                 # If the user doesn't want op.kl as a synonym but might want
161 |                 # to add quotes around op.kl_origin
162 |                 op = AddQuote(op.kl_origin, op.start_origin, op.end_origin)
163 |         if isinstance(op, AddQuote):  # `if`, not `elif`: op may have been rebound above
164 |             tex_doc.print(op.start, op.end, print_line, out)
165 |             if ask_consent("Add quotes? [y/n] ", inp, out):
166 |                 operations_addquote.append(op)
167 |             print("", file=out)
168 |     add_quote_before = [tex_doc.pointer[op.start] for op in operations_addquote]
169 |     add_quote_after = [tex_doc.pointer[op.end] for op in operations_addquote]
170 |     # Simply add quotes before and after every position corresponding to the beginning / end of
171 |     # a match with a knowledge.
172 |     for i, char in enumerate(tex_doc.tex_code):
173 |         if i in add_quote_before:
174 |             result += '"'
175 |         result += char
176 |         if i in add_quote_after:
177 |             result += '"'
178 |     print(
179 |         f"Added {len(operations_addquote)} pair"
180 |         + ("s" if len(operations_addquote) > 1 else "")
181 |         + f" of quotes. Defined {len(new_knowledges)} synonym"
182 |         + ("s." if len(new_knowledges) > 1 else "."),
183 |         file=out,
184 |     )
185 |     return result, new_knowledges
186 |
187 |
188 | def quote_maximal_substrings(
189 |     tex_doc: TexDocument,
190 |     kls: KnowledgesList,
191 |     print_line: int,
192 |     inp: TextIO,
193 |     out: TextIO,
194 | ) -> tuple[str, list[tuple[str, str]]]:
195 |     """
196 |     Finds knowledges defined in the knowledge files that appear in the tex file without quote
197 |     symbols. Proposes to add quotes around them.
198 |
199 |     Args:
200 |         tex_doc: a TeX document.
201 |         kls: list of knowledges.
202 |         print_line: an integer specifying how many lines of the tex file should be printed.
203 |         inp: input stream.
204 |         out: output stream.
205 |     """
206 |
207 |     def stop_expanding(char):
208 |         return not char.isalpha()
209 |
210 |     ignore_position = [False] * tex_doc.length
211 |     add_quote_location: list[NewKL | AddQuote] = []
212 |     for ignore_case in [False, True]:
213 |         # First run the algorithm case-sensitively, then run it case-insensitively.
214 |         for s1 in kls.get_sorted_knowledges():
215 |             match_list = (
216 |                 re.finditer(re.escape(s1), tex_doc.tex_cleaned, re.IGNORECASE)
217 |                 if ignore_case
218 |                 else re.finditer(re.escape(s1), tex_doc.tex_cleaned)
219 |             )
220 |             for match in match_list:
221 |                 start, end = match.start(), match.end() - 1
222 |                 if not ignore_position[start]:
223 |                     # Ignore every infix of s1 that is also a substring of the list
224 |                     for i in range(start, end + 1):
225 |                         ignore_position[i] = True
226 |                     for s2 in kls.dependency[s1]:
227 |                         for submatch in re.finditer(
228 |                             re.escape(s2), tex_doc.tex_cleaned[start : end + 1]
229 |                         ):
230 |                             ignore_position[start + submatch.start()] = True
231 |                     # Check if s1 is preceded by quotes; if not, either check
232 |                     # if we can define a new knowledge, or add the match to the
33 |                     # list of quotes to add.
234 |                     if not any(
235 |                         tex_doc.tex_cleaned.endswith(beg_kl, 0, start)
236 |                         and tex_doc.tex_cleaned.startswith(end_kl, end + 1)
237 |                         for (beg_kl, end_kl) in cst.KL_DELIMITERS
238 |                     ):
239 |                         start2, end2 = start, end
240 |                         while start2 > 0 and not stop_expanding(
241 |                             tex_doc.tex_cleaned[start2 - 1]
242 |                         ):
243 |                             start2 -= 1
244 |                         while end2 + 1 < len(
245 |                             tex_doc.tex_cleaned
246 |                         ) and not stop_expanding(tex_doc.tex_cleaned[end2 + 1]):
247 |                             end2 += 1
248 |                         # text_cleaned[start2 : end2 + 1] is the maximal substring
249 |                         # containing text_cleaned[start : end + 1] = s1 as a factor,
250 |                         # and obtained by only adding letters (no space).
251 |                         new_kl = tex_doc.tex_cleaned[start2 : end2 + 1]
252 |                         if s1 != new_kl:
253 |                             # Propose to add new_kl as a new knowledge
254 |                             add_quote_location.append(
255 |                                 NewKL(s1, start, end, new_kl, start2, end2)
256 |                             )
257 |                         else:
258 |                             add_quote_location.append(AddQuote(s1, start, end))
259 |     return add_quote(tex_doc, add_quote_location, print_line, inp, out)
260 |
--------------------------------------------------------------------------------
/knowledge_clustering/autofinder.py:
--------------------------------------------------------------------------------
1 | """Automatically finds files in the current directory."""
2 |
3 | from __future__ import annotations  # Support of `|` for type union in Python 3.9
4 |
5 | from pathlib import Path
6 |
7 |
8 | class NoFile(Exception):
9 |     """When no file is found."""
10 |
11 |
12 | class TooManyFiles(Exception):
13 |     """When too many files are found compared to what was expected."""
14 |
15 |
16 | def find_ext(dr: Path, ext: str) -> list[Path]:
17 |     """
18 |     Lists all files present in a directory (and its subdirectories, recursively)
19 |     that end with a given extension.
20 |     """
21 |     return list(dr.glob(f"**/*.{ext}"))
22 |
23 |
24 | def get_unique_diagnose_file(dr: Path) -> Path:
25 |     """
26 |     Returns the unique .diagnose file present in a directory (and its subdirectories, recursively),
27 |     fails otherwise.
28 |     """
29 |     dg_files = find_ext(dr, "diagnose")
30 |     if len(dg_files) == 0:
31 |         raise NoFile("No .diagnose file present in the directory.")
32 |     if len(dg_files) > 1:
33 |         raise TooManyFiles(
34 |             f"Multiple .diagnose files present in the directory: \
35 | {dg_files[0]} and {dg_files[1]}."
36 |         )
37 |     return dg_files[0]
38 |
39 |
40 | def get_knowledge_files(dr: Path) -> list[Path]:
41 |     """
42 |     Returns the list of all .kl files present in a directory (and its subdirectories, recursively).
43 |     Fails if there is no .kl file. Fails if there are multiple .kl files but no unique one
44 |     ending with `default.kl`.
45 |     """
46 |     kl_files = find_ext(dr, "kl")
47 |     if len(kl_files) == 0:
48 |         raise NoFile("No .kl file present in the directory.")
49 |     if len(kl_files) == 1:
50 |         return kl_files
51 |     list_default = []
52 |     for i, p in enumerate(kl_files):
53 |         if str(p).endswith("default.kl"):
54 |             list_default.append(i)
55 |     if len(list_default) == 0:
56 |         raise NoFile("No file ending with `default.kl` present in the directory.")
57 |     if len(list_default) > 1:
58 |         raise TooManyFiles(
59 |             f"Multiple files ending with `default.kl` present in the directory: \
60 | {kl_files[list_default[0]]} and {kl_files[list_default[1]]}."
61 |         )
62 |     idx_default = list_default[0]
63 |     idx_last = len(kl_files) - 1
64 |     kl_files[idx_last], kl_files[idx_default] = (
65 |         kl_files[idx_default],
66 |         kl_files[idx_last],
67 |     )
68 |     return kl_files
69 |
--------------------------------------------------------------------------------
/knowledge_clustering/check_update.py:
--------------------------------------------------------------------------------
1 | """
2 | Checks if there is a newer version of knowledge-clustering available on PyPI.
3 | """
4 |
5 | import requests
6 |
7 | from knowledge_clustering import _version
8 | from knowledge_clustering.misc import add_bold, add_red, add_orange, add_green
9 | from knowledge_clustering.cst import TIMEOUT_REQUEST
10 |
11 |
12 | def check_update() -> None:
13 |     """
14 |     Checks if an update is available and, if so, prints a message
15 |     suggesting how to upgrade.
16 |     """
17 |     # From https://stackoverflow.com/a/62571316/19340201
18 |     try:
19 |         package = "knowledge-clustering"
20 |         response = requests.get(
21 |             f"https://pypi.org/pypi/{package}/json", timeout=TIMEOUT_REQUEST
22 |         )
23 |         latest_version: str = response.json()["info"]["version"]
24 |         is_available: bool = latest_version != _version.VERSION
25 |     except requests.exceptions.RequestException:
26 |         is_available = False
27 |         latest_version = ""
28 |     # If available, print message
29 |     msg = ""
30 |     if is_available:
31 |         msg += (
32 |             "\n"
33 |             + add_bold(add_orange("[notice]"))
34 |             + " A new release of knowledge-clustering is available: "
35 |             + add_red(_version.VERSION)
36 |             + " -> "
37 |             + add_green(latest_version)
38 |         )
39 |         msg += (
40 |             "\n"
41 |             + add_bold(add_orange("[notice]"))
42 |             + " To update, run: "
43 |             + add_green("python3 -m pip install --upgrade knowledge-clustering")
44 |         )
45 |     print(msg)
46 |
--------------------------------------------------------------------------------
/knowledge_clustering/clustering.py:
--------------------------------------------------------------------------------
1 | """Clustering algorithm."""
2 |
3 | from __future__ import annotations  # Support of `|` for type union in Python 3.9
4 | from pathlib import Path
5 |
6 | import copy
7 |
8 | from knowledge_clustering import distance, config, scope_meaning, diagnose, cst
9 | from knowledge_clustering.knowledges import KnowledgesList, remove_redundant_files
10 | from knowledge_clustering.misc import emph
11 |
12 |
13 | def app(
14 |     kl_filename: list[str],
15 |     dg_filename: str,
16 |     scope: bool,
17 |     print_kl: bool,
18 |     lang: str,
19 |     config_filename: None | Path,
20 | ):
21 |     """
22 |     Defines, as comments in the knowledge files, all the knowledges occurring
23 |     in the diagnose file.
24 |     Args:
25 |         kl_filename: the list of names of the knowledge files.
26 |         dg_filename: the name of the diagnose file.
27 |         scope: a boolean specifying whether the scope meanings should be printed.
28 |         lang: the language of the document.
29 |         config_filename: a configuration file, specifying prefixes to ignore.
30 |     """
31 |     kls = KnowledgesList(remove_redundant_files(kl_filename))
32 |
33 |     if config_filename is None:
34 |         config_filename = cst.CONFIG_FILE[lang]
35 |
36 |     list_prefixes = config.parse(config_filename)
37 |
38 |     scopes_meaning = scope_meaning.infer_all_scopes(
39 |         kls.get_all_bags(), cst.NLTK_LANG[lang]
40 |     )
41 |     if scope:
42 |         scope_meaning.print_scopes(scopes_meaning, print_meaning=True)
43 |     unknown_knowledges = diagnose.parse(dg_filename)
44 |
45 |     if len(unknown_knowledges) == 0:
46 |         return
47 |
48 |     # update `kls` using the clustering algorithm
49 |     clustering(
50 |         kls,
51 |         unknown_knowledges,
52 |         cst.ALPHA,
53 |         list_prefixes,
54 |         scopes_meaning,
55 |         cst.NLTK_LANG[lang],
56 |     )
57 |     print(
58 |         f"Found a solution by adding {len(kls.get_new_bags())} new bag"
59 |         + ("s" if len(kls.get_new_bags()) >= 2 else "")
60 |         + ".\n"
61 |     )
62 |     changed_filenames = [
63 |         kl.filename for kl in kls.get_all_kls_struct() if kl.was_changed()
64 |     ]
65 |     if len(changed_filenames) == 0:
66 |         msg = "No file was changed."
67 |     elif not print_kl:
68 |         msg = "The following files were changed:"
69 |         for i, fn in enumerate(changed_filenames):
70 |             msg += emph(f" {fn}")
71 |             msg += "," if i < len(changed_filenames) - 1 else "."
72 |     else:
73 |         msg = ""
74 |         for i, fn in enumerate(changed_filenames):
75 |             msg += "Added in file " + emph(f" {fn}") + ":\n"
76 |             for kl in kls.get_new_knowledges_in_file(fn):
77 |                 msg += f"\t{kl}\n"
78 |     print(msg)
79 |     kls.write_knowledges_in_file()
80 |
81 |
82 | def clustering(
83 |     kls: KnowledgesList,
84 |     unknown_kl: list[str],
85 |     alpha: float,
86 |     list_prefixes: list[str],
87 |     scopes_meaning: dict[str, list[list[str]]],
88 |     lang: str,
89 | ):
90 |     """
91 |     Adds all knowledges in unknown_kl to the structure kls.
92 |
93 |     The invariant satisfied by the algorithm is the following:
94 |     any two notions in the same bag are near, where near either means:
95 |     - both in the same bag of knowledges at the beginning of the algorithm;
96 |     - at distance (from module "distance") at most alpha if at least one of
97 |       the two notions initially belongs to unknown_kl.
98 |
99 |     Args:
100 |         kls: known knowledges.
101 |         unknown_kl: a list of unknown knowledges.
102 |         alpha: a threshold indicating the maximal distance allowed for clustering
103 |             two knowledges together.
104 |         list_prefixes: a list of prefixes that are ignored when computing the
105 |             distance between two knowledges.
106 |         scopes_meaning: a dictionary, assigning to every scope a list of
107 |             its possible meanings, each possible meaning being a list of words;
108 |             used to compute the distance.
109 |         lang: a string describing the language of the document;
110 |             a value from the dictionary knowledge_clustering.cst.NLTK_LANG;
111 |             used to compute the distance.
112 |     """
113 |     kl_processed_old = []
114 |     kl_processed_new = kls.get_all_knowledges()
115 |     while unknown_kl:
116 |         # If there is no newly processed knowledge, pick an unknown knowledge
117 |         # and add it to a new bag.
118 |         if not kl_processed_new:
119 |             kl = unknown_kl[0]
120 |             unknown_kl = unknown_kl[1:]
121 |             kls.add_new_bag(kl)
122 |             kl_processed_new = [kl]
123 |         size_kl_processed_new = len(kl_processed_new)
124 |         # Tries to add every unknown knowledge to a bag
125 |         unknown_kl_copy = copy.copy(unknown_kl)
126 |         for kl in unknown_kl_copy:
127 |             dist_min = None
128 |             kl2_min_list = []
129 |             # Finds the processed notion that is at a minimal distance from kl
130 |             for kl2 in kl_processed_new:
131 |                 d = distance.distance(kl, kl2, list_prefixes, scopes_meaning, lang)
132 |                 if dist_min is None or d < dist_min:
133 |                     dist_min = d
134 |                     kl2_min_list = [kl2]
135 |                 elif d == dist_min:
136 |                     kl2_min_list.append(kl2)
137 |             # If this minimal distance is at most the threshold alpha, add kl to the bag
138 |             if dist_min is not None and dist_min <= alpha:
139 |                 # Choose kl2_min in kl2_min_list minimising the edit distance
140 |                 kl2_min = distance.minimise_levenshtein_distance(kl, kl2_min_list)
141 |                 # Add kl to the bag of kl2_min
142 |                 kls.define_synonym_of(kl, kl2_min)
143 |                 unknown_kl.remove(kl)
144 |                 kl_processed_new.append(kl)
145 |         # Every "new processed knowledge" that was known at the beginning of the while iteration
146 |         # becomes an "old processed knowledge"
147 |         kl_processed_old += kl_processed_new[:size_kl_processed_new]
148 |         kl_processed_new = kl_processed_new[size_kl_processed_new:]
149 |
--------------------------------------------------------------------------------
/knowledge_clustering/config.py:
--------------------------------------------------------------------------------
1 | """Parse a configuration file."""
2 |
3 | from __future__ import annotations  # Support of `|` for type union in Python 3.9
4 | from pathlib import Path
5 |
6 | import configparser
configparser 7 | 8 | 9 | class ListConfigParser(configparser.ConfigParser): 10 | """Extended Config Parser to handle lists.""" 11 | 12 | def getlist(self, section, option): 13 | """Return list in some config file.""" 14 | value = self.get(section, option) 15 | return list(x.split("#")[0].strip() for x in value.splitlines()) 16 | 17 | # def getlistint(self, section, option): 18 | # return [int(x) for x in self.getlist(section, option)] 19 | 20 | 21 | def parse(filename: Path) -> list[str]: 22 | """ 23 | Reads a config file and returns the list of words occuring 24 | under the keyphrase `[DEFAULT] PREFIXES_SIMILAR=`. 25 | 26 | Args: 27 | filename: the name of a config file. 28 | 29 | Returns: 30 | a list of prefixes that should be ignored by the clustering algorithm. 31 | """ 32 | p = ListConfigParser() 33 | p.read(filename) 34 | return p.getlist("DEFAULT", "PREFIXES_SIMILAR") 35 | -------------------------------------------------------------------------------- /knowledge_clustering/cst.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constants used throughout knowledge-clustering. 3 | """ 4 | 5 | from __future__ import annotations # Support of `|` for type union in Python 3.9 6 | 7 | from pathlib import Path 8 | from importlib import resources 9 | 10 | ALPHA = 0 11 | 12 | CONFIG_FILENAME: dict[str, str] = {"en": "english.ini", "fr": "french.ini"} 13 | ref = resources.files("knowledge_clustering") / "data" 14 | with resources.as_file(ref) as path: 15 | CONFIG_DIR: Path = path 16 | CONFIG_FILE: dict[str, Path] = dict() 17 | for lan, filename in CONFIG_FILENAME.items(): 18 | ref_file = resources.files("knowledge_clustering") / f"data/{filename}" 19 | with resources.as_file(ref_file) as path_file: 20 | CONFIG_FILE[lan] = path_file 21 | NLTK_LANG: dict[str, str] = {"en": "english", "fr": "french"} 22 | 23 | INTRO_DELIMITERS: list[tuple[str, str]] = [ 24 | ('""', '""'), 25 | ("\\intro{", "}"), 26 | ("\\reintro{", "}"), 27 | ("\\phantomintro{", "}"), 28 | ("\\intro[", "]"), 29 | ("\\reintro[", "]"), 30 | ("\\phantomintro[", "]"), 31 | ] 32 | AP_STRING: list[str] = ["\\AP", "\\itemAP"] 33 | 34 | KL_DELIMITERS: list[tuple[str, str]] = [ 35 | ('"', '"'), 36 | ('"', "@"), 37 | ("@", '"'), 38 | ("@", "@"), 39 | ("\\kl{", "}"), 40 | ("\\intro{", "}"), 41 | ("\\reintro{", "}"), 42 | ("\\phantomintro{", "}"), 43 | ("\\kl[", "]"), 44 | ("\\intro[", "]"), 45 | ("\\reintro[", "]"), 46 | ("\\phantomintro[", "]"), 47 | ] 48 | 49 | SEPARATION_HEADING_KL_BLOCK = "************************" 50 | 51 | IMPORTANT_POS = [ 52 | "CD", 53 | "JJ", 54 | "JJR", 55 | "JJS", 56 | "NN", 57 | "NNP", 58 | "NNS", 59 | "PDT", 60 | "RB", 61 | "RBR", 62 | "RBS", 63 | "VB", 64 | "VBD", 65 | "VBG", 66 | "VBN", 67 | "VBP", 68 | "VBZ", 69 | ] 70 | IGNORE_SUFFIXES = ["", "s"] 71 | INFINITY = 10000 72 | IGNORE_CHAR_BACKSLASH = [ 73 | # LaTeX accents defined using non-alphanumerical commands 74 | "\\`", 75 | "\\'", 76 | "\\^", 77 | '\\"', 78 | "\\~", 79 | "\\=", 80 | "\\.", 81 | "\\-", # Hyphen 82 | ] 83 | IGNORE_CHAR_NO_BACKSLASH = ["{", "}"] 84 | SPACE_CHAR = ["~", "\\\\"] 85 | 86 | DISCARD_LINE = "%%%%% NEW KNOWLEDGES " 87 | 88 | TIMEOUT_REQUEST: float = ( 89 | 0.25 # Timeout to resquest the latest version 90 | # of knowledge-clustering (in seconds) 91 | ) 92 | -------------------------------------------------------------------------------- /knowledge_clustering/data/english.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | 
PREFIXES_SIMILAR= 3 | # Empty string 4 | - # ignore dashes 5 | a # (a)chromatic 6 | il 7 | im 8 | in # (in)separable 9 | ir 10 | non 11 | non- # (non-)atomic 12 | un # (un)ambiguous 13 | -------------------------------------------------------------------------------- /knowledge_clustering/data/french.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | PREFIXES_SIMILAR= 3 | # Empty string 4 | - # ignore dashes 5 | a 6 | il 7 | im 8 | in 9 | ir 10 | non 11 | non- 12 | -------------------------------------------------------------------------------- /knowledge_clustering/diagnose.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for handling the .diagnose files. 3 | """ 4 | 5 | from __future__ import annotations # Support of `|` for type union in Python 3.9 6 | 7 | from typing import Callable, Generator 8 | from knowledge_clustering import cst 9 | 10 | 11 | def automata_line(state: int, line: str) -> tuple[int, str | None]: 12 | """ 13 | Transition function of a transducers parsing knowledges from a diagnose file, 14 | which is read line by line. 15 | 16 | Args: 17 | state: the curring state of the automata, with the following semantic: 18 | 0: waiting for knowledge block; 19 | 1: seen the heading of a knowledge block; 20 | 2: we are in a knowledge block. 21 | line: a line of the .diagnose document. 22 | 23 | Returns: 24 | a pair (state, kl) where state is the new state of the automaton, 25 | and kl is either None, or a knowledge parsed while reading the line given as input. 26 | """ 27 | if state == 0 and "Undefined knowledges" in line: 28 | return 1, None 29 | if state == 1 and cst.SEPARATION_HEADING_KL_BLOCK in line: 30 | return 2, None 31 | if (state in {0, 2}) and cst.SEPARATION_HEADING_KL_BLOCK in line: 32 | return 0, None 33 | if state == 2 and "| " in line: 34 | s = (line.split("| ", 1)[1]).split("\n", 1)[0] 35 | return 2, s 36 | return state, None 37 | 38 | 39 | def unroll( 40 | automata: Callable[[int, str], tuple[int, str | None]], 41 | initial_state: int, 42 | str_input: list[str], 43 | ) -> Generator[str | None, None, None]: 44 | """Builds a generator object from the transition function of a transducer.""" 45 | state: int = initial_state 46 | z: str | None 47 | for y in str_input: 48 | state, z = automata(state, y) 49 | yield z 50 | 51 | 52 | def parse(filename: str) -> list[str]: 53 | """ 54 | Parses a diagnose file and returns the knowledges it contains. 55 | 56 | Args: 57 | filename: the name of the .diagnose file. 58 | 59 | Returns: 60 | a list of knowledges. 
61 | """ 62 | with open(filename, encoding="utf-8") as f: 63 | list_notions = [] 64 | for notion in unroll(automata_line, 0, f.readlines()): 65 | if notion is not None and notion not in list_notions: 66 | list_notions.append(notion) 67 | return list(list_notions) 68 | -------------------------------------------------------------------------------- /knowledge_clustering/distance.py: -------------------------------------------------------------------------------- 1 | """Compute the distance between two knowledges.""" 2 | 3 | from __future__ import annotations # Support of `|` for type union in Python 3.9 4 | 5 | import copy 6 | import nltk # type: ignore 7 | import nltk.stem.snowball as nss # type: ignore 8 | from unidecode import unidecode 9 | 10 | from knowledge_clustering import cst 11 | from knowledge_clustering.misc import emph 12 | 13 | # --- 14 | # Edit distance 15 | # --- 16 | 17 | 18 | def levenshtein_distance(s: str, t: str) -> int: 19 | """ 20 | Computes the Levenshtein (insertions, deletions or substitutions are allowed) 21 | edit distance between two strings. 22 | """ 23 | # Implementation of the Wagner–Fischer algorithm 24 | # https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm 25 | m, n = len(s), len(t) 26 | dist = [[0 for _ in range(n + 1)] for _ in range(m + 1)] 27 | for i in range(1, m + 1): 28 | dist[i][0] = i 29 | for j in range(1, n + 1): 30 | dist[0][j] = j 31 | for j in range(1, n + 1): 32 | for i in range(1, m + 1): 33 | substitution_cost = 0 if s[i - 1] == t[j - 1] else 1 34 | dist[i][j] = min( 35 | dist[i - 1][j] + 1, 36 | dist[i][j - 1] + 1, 37 | dist[i - 1][j - 1] + substitution_cost, 38 | ) 39 | return dist[m][n] 40 | 41 | 42 | def minimise_levenshtein_distance(s: str, t_list: list[str]) -> str: 43 | """ 44 | Given a string s, and a non-empty list of strings, returns an element of t_list 45 | minimising the edit distance with s. 46 | """ 47 | t_min = t_list[0] 48 | dist_min = levenshtein_distance(s, t_min) 49 | for t in t_list[1:]: 50 | dist = levenshtein_distance(s, t) 51 | if dist < dist_min: 52 | t_min = t 53 | dist_min = dist 54 | return t_min 55 | 56 | 57 | # --- 58 | # Functions to extract content from strings 59 | # --- 60 | 61 | 62 | def extract_scope(notion: str) -> tuple[str, str]: 63 | """ 64 | Given a notion of the form "knowledge@scope" or "knowledge", 65 | returns a pair consisting of the knowledge and the (possibly empty) scope. 66 | """ 67 | if "@" in notion: 68 | s = notion.split("@", 1) 69 | return s[0], s[1] 70 | return notion, "" 71 | 72 | 73 | def normalise_notion(notion: str) -> str: 74 | """ 75 | Returns the substring of a notion obtained by removing math, commands, accents 76 | and non-brekable spaces. 77 | """ 78 | notion_norm = notion.lower() # to lowercase 79 | while "$" in notion_norm: 80 | sp = notion_norm.split("$", 2) 81 | if len(sp) <= 1: 82 | break 83 | notion_norm = sp[0] + sp[2] 84 | for remove_char in cst.IGNORE_CHAR_BACKSLASH: 85 | while remove_char in notion_norm: 86 | # If the notion contains remove_char, remove it. 87 | sp = notion_norm.split(remove_char, 1) 88 | notion_norm = sp[0] + sp[1] 89 | for space_char in cst.SPACE_CHAR: 90 | while space_char in notion_norm: 91 | # If the notion contains remove_char, replace it with a space. 
92 | sp = notion_norm.split(space_char, 1) 93 | notion_norm = sp[0] + " " + sp[1] 94 | while "\\" in notion_norm: 95 | # If the notion contains a backslash, remove every letter following the backslash 96 | # see https://tex.stackexchange.com/a/34381/206008 for naming conventions of TeX commands 97 | sp = notion_norm.split("\\", 1) 98 | pref, suff = sp[0], sp[1] 99 | i = 0 100 | while i < len(suff) and suff[i].isalpha(): 101 | i += 1 102 | notion_norm = pref + suff[i:] 103 | for remove_char in cst.IGNORE_CHAR_NO_BACKSLASH: 104 | while remove_char in notion_norm: 105 | # If the notion contains remove_char, remove it. 106 | sp = notion_norm.split(remove_char, 1) 107 | notion_norm = sp[0] + sp[1] 108 | return unidecode(notion_norm) # Ascii-fy (in particular, remove accents) the result 109 | 110 | 111 | def breakup_notion(notion: str, lang: str) -> tuple[list[str], str]: 112 | """ 113 | Takes a notion, and a language, and returns 114 | a set of words contained in the notion. 115 | 116 | If the language is `english`, remove unimportant words. 117 | Important words are: cardinals, preposition or conjunction, subordinating, 118 | adjectives, nouns, pre-determiners, adverbs, verbs (list defined in cst.IMPORTANT_POS). 119 | 120 | """ 121 | kl, scope = extract_scope(normalise_notion(notion)) 122 | try: 123 | if lang == "english": 124 | words_with_POStag = nltk.pos_tag( # pylint: disable=invalid-name 125 | nltk.word_tokenize(kl, language="english") 126 | ) 127 | important_words = { 128 | w for (w, pos) in words_with_POStag if pos in cst.IMPORTANT_POS 129 | } 130 | return (list(important_words), scope) 131 | return (list(set(nltk.word_tokenize(kl, language=lang))), scope) 132 | except LookupError as e: 133 | raise LookupError( 134 | f"Missing NLTK data. Run `" 135 | + emph("knowledge init") 136 | + "` before using the cluster command." 137 | ) from e 138 | 139 | 140 | # --- 141 | # Computing the distance between two notions 142 | # --- 143 | 144 | 145 | def similar_words(w1: str, w2: str, list_prefixes: list[str], stemmer) -> bool: 146 | """ 147 | Checks if two words w1 and w2 are similar up to taking their stem (removing a suffix) 148 | and removing a prefix in the list `list_prefixes`. 149 | """ 150 | if w1 == w2: 151 | return True 152 | for s1 in [w1, stemmer.stem(w1)]: 153 | for s2 in [w2, stemmer.stem(w2)]: 154 | for p in list_prefixes: 155 | for s in cst.IGNORE_SUFFIXES: 156 | if p + s1 + s == s2 or s1 == p + s2 + s: 157 | return True 158 | return False 159 | 160 | 161 | def __semi_distance_sets_of_words( 162 | set_words1: list[str], set_words2: list[str], list_prefixes: list[str], stemmer 163 | ) -> tuple[int, int]: 164 | """ 165 | Given two sets of words (considered up to permutation), computes the 166 | numbers of words of w1 that aren't close to a word of w2 and reciprocally. 167 | """ 168 | for w1 in set_words1: 169 | similar_to_w1 = [ 170 | w2 for w2 in set_words2 if similar_words(w1, w2, list_prefixes, stemmer) 171 | ] 172 | # If you find a pair of similar words, remove them. 
173 | if len(similar_to_w1) > 0: 174 | w2 = similar_to_w1[0] 175 | set_words1.remove(w1) 176 | set_words2.remove(w2) 177 | return __semi_distance_sets_of_words( 178 | set_words1, set_words2, list_prefixes, stemmer 179 | ) 180 | return (len(set_words1), len(set_words2)) 181 | 182 | 183 | def inclusion_sets_of_words( 184 | set_words1: list[str], set_words2: list[str], list_prefixes: list[str], stemmer 185 | ) -> bool: 186 | """ 187 | Given two sets of words (considered up to permutation), are 188 | all words of the first set similar to words of the second set? 189 | """ 190 | d1, _ = __semi_distance_sets_of_words( 191 | set_words1, set_words2, list_prefixes, stemmer 192 | ) 193 | return d1 == 0 194 | 195 | 196 | def distance_sets_of_words( 197 | set_words1: list[str], set_words2: list[str], list_prefixes: list[str], stemmer 198 | ) -> int: 199 | """ 200 | Given two sets of words (considered up to permutation), computes the distance between them. 201 | """ 202 | d1, d2 = __semi_distance_sets_of_words( 203 | set_words1, set_words2, list_prefixes, stemmer 204 | ) 205 | return d1 + d2 206 | 207 | 208 | def new_stemmer(lang: str): 209 | """Returns a stemmer.""" 210 | return nss.SnowballStemmer(lang) 211 | 212 | 213 | def distance( 214 | notion1: str, 215 | notion2: str, 216 | list_prefixes: list[str], 217 | scopes_meaning: dict[str, list[list[str]]], 218 | lang: str, 219 | ) -> int: 220 | """ 221 | Measures the distance between two notions, given a list of prefixes to ignore and 222 | a list of possible meaning for each scope. 223 | Args: 224 | notion1: first notion 225 | notion2: second notion 226 | list_prefixes: a list of prefixes that will be ignored 227 | scope_meaning: a dictionnary, assigning to every scope a list of 228 | its possible meanings, each possible meaning being a list of words 229 | lang: the identifier of some language (e.g. "english") 230 | 231 | Returns: 232 | The distance between notion1 and notion2. 233 | """ 234 | kl1_words, sc1 = breakup_notion(notion1, lang) 235 | kl2_words, sc2 = breakup_notion(notion2, lang) 236 | stemmer = new_stemmer(lang) 237 | if sc1 != "" and sc2 != "" and sc1 != sc2: 238 | return cst.INFINITY 239 | if len(kl1_words) == 0 and len(kl2_words) == 0: 240 | # Can happen if the notion is a command 241 | return 0 242 | if len(kl1_words) == 0 or len(kl2_words) == 0: 243 | # Can happen if the notion is a command 244 | return cst.INFINITY 245 | if sc1 == sc2: 246 | return distance_sets_of_words(kl1_words, kl2_words, list_prefixes, stemmer) 247 | if sc1 == "": 248 | kl1_words, sc1, kl2_words, sc2 = kl2_words, sc2, kl1_words, sc1 249 | # sc2 is empty and sc1 isn't 250 | # return the minimal distance obtained by replacing sc1 by one of its possible meanings 251 | dist = cst.INFINITY 252 | if sc1 in scopes_meaning: 253 | sc1_meaning = scopes_meaning[sc1] 254 | else: 255 | sc1_meaning = [[sc1]] 256 | for meaning in sc1_meaning: 257 | kl1_with_meaning = list(copy.copy(kl1_words)) 258 | kl1_with_meaning.extend([w for w in meaning if w not in kl1_with_meaning]) 259 | dist = min( 260 | dist, 261 | distance_sets_of_words(kl1_with_meaning, kl2_words, list_prefixes, stemmer), 262 | ) 263 | return dist 264 | -------------------------------------------------------------------------------- /knowledge_clustering/file_updater.py: -------------------------------------------------------------------------------- 1 | """ 2 | Allow to atomically update a file by writing to a temporary 3 | file and comparing hashes. 
4 | In case of conflicting uses, the user has to manually merge 5 | and a prompt is offered using click. 6 | """ 7 | 8 | from __future__ import annotations # Support of `|` for type union in Python 3.9 9 | 10 | from pathlib import Path 11 | 12 | import hashlib 13 | import tempfile 14 | import click 15 | 16 | 17 | def hash_file(filepath: str): 18 | """ 19 | Compute a hash of the content of the given filepath 20 | """ 21 | with open(filepath, "rb") as f: 22 | file_hash = hashlib.blake2b() 23 | chunk: bytes = f.read(8192) 24 | while chunk: 25 | file_hash.update(chunk) 26 | chunk = f.read(8192) 27 | return file_hash 28 | 29 | 30 | class AtomicUpdate: 31 | """ 32 | A small class using a temporary file to ensure that we have 33 | properly replaced the content. Prompts the user if we detect 34 | a change in the hash of the file given as input. 35 | """ 36 | 37 | def __init__(self, filename: str, original_hash=None): 38 | self.filename: str = filename 39 | self.hash = hash_file(filename) 40 | self.ctx = tempfile.NamedTemporaryFile(mode="w", dir=Path.cwd(), delete=False) 41 | self.tmp = None 42 | if ( 43 | original_hash is not None 44 | and original_hash.hexdigest() != self.hash.hexdigest() 45 | ): 46 | click.confirm( 47 | f"File {self.filename} has been modified during the run of the program, \ 48 | erase anyway?", 49 | default=None, 50 | abort=True, 51 | prompt_suffix=": ", 52 | show_default=True, 53 | err=False, 54 | ) 55 | 56 | def __enter__(self): 57 | self.tmp = self.ctx.__enter__() # type: ignore 58 | return self.tmp 59 | 60 | def __exit__(self, typ, value, traceback): 61 | new_hash = hash_file(self.filename) 62 | if self.tmp is not None: 63 | if new_hash.hexdigest() != self.hash.hexdigest(): 64 | print(f"{new_hash.hexdigest()} ≠ {self.hash.hexdigest()}") 65 | confirm = click.confirm( 66 | f"File {self.filename} has been modified\ 67 | during the run of \ 68 | the program, erase anyway?", 69 | default=None, 70 | abort=False, 71 | prompt_suffix=": ", 72 | show_default=True, 73 | err=False, 74 | ) 75 | if confirm is False: 76 | print(f"Temporary file accessible at {self.tmp.name}") 77 | return self.ctx.__exit__(typ, value, traceback) 78 | _ = Path(self.tmp.name).replace(self.filename) 79 | return self.ctx.__exit__(typ, value, traceback) 80 | -------------------------------------------------------------------------------- /knowledge_clustering/knowledges.py: -------------------------------------------------------------------------------- 1 | """Manipulating known knowledges.""" 2 | 3 | from __future__ import annotations # Support of `|` for type union in Python 3.9 4 | 5 | from typing import NamedTuple 6 | import toposort # Topological sort pylint: disable=import-error 7 | 8 | import knowledge_clustering.file_updater as fu 9 | from knowledge_clustering import cst 10 | from knowledge_clustering.misc import add_orange, add_bold 11 | 12 | 13 | class DocInfoTex(NamedTuple): 14 | """Lines of a TeX document.""" 15 | 16 | lines: list[str] 17 | 18 | 19 | class DocInfoKnowledge(NamedTuple): 20 | """Lines of TeX document corresponding to the definition of a knowledge.""" 21 | 22 | lines: list[str] 23 | command: str 24 | number: int 25 | 26 | 27 | def flat(list_of_list): 28 | """Flattens a list of list into a single list.""" 29 | return [x for y in list_of_list for x in y] 30 | 31 | 32 | class Knowledges: 33 | def __init__(self, filename): 34 | """ 35 | Reads a knowledge file from a file descriptor f. 36 | 37 | Args: 38 | filename: the name of a file containing knowledges. 
39 | 40 | Computes: 41 | self.original_hash: the hash of the document ; 42 | self.document: a list of records, either of the form: 43 | { 44 | "type"="tex", 45 | "lines"= list of strings (the lines) 46 | } 47 | or { 48 | "type"="knowledge", 49 | "lines"= list of strings (the lines) 50 | "command" = string representing the line introducing the knowledge, 51 | "number" = the number of the knowledge 52 | } 53 | self.known_knowledges: a list of list of strings. 54 | Each list of strings contains strings corresponding to the same knowledge. 55 | The position in the string corresponds to the "number" field in the above 56 | document description. 57 | """ 58 | self.bags: list[list[str]] = [] # Lists of lists, containing knowledges. 59 | self.filename: str = filename 60 | self.original_hash = fu.hash_file(filename) 61 | with open(filename, encoding="utf-8") as file: 62 | lines: list[str] = file.readlines() 63 | 64 | document: list[DocInfoTex | DocInfoKnowledge] = [] 65 | knowledges: list[list[str]] = [] 66 | 67 | reading_mode: str = "tex" 68 | current_block: list[str] = [] 69 | current_kl_cmd: str = "" 70 | current_kl_strs: list[str] = [] 71 | 72 | def push_block(): 73 | nonlocal reading_mode 74 | nonlocal document 75 | nonlocal current_block 76 | nonlocal current_kl_cmd 77 | nonlocal current_kl_strs 78 | nonlocal knowledges 79 | nonlocal current_kl_strs 80 | if reading_mode == "tex" and len(current_block) > 0: 81 | document.append(DocInfoTex(lines=current_block)) 82 | current_block = [] 83 | elif reading_mode == "knowledge": 84 | document.append( 85 | DocInfoKnowledge( 86 | lines=current_block, 87 | command=current_kl_cmd, 88 | number=len(knowledges), 89 | ) 90 | ) 91 | current_block = [] 92 | current_kl_cmd = "" 93 | knowledges.append(current_kl_strs) 94 | current_kl_strs = [] 95 | 96 | def line_is_discard(line): 97 | return line.strip() == cst.DISCARD_LINE.strip() 98 | 99 | def line_is_comment(line): 100 | return line.strip().startswith("%") 101 | 102 | def line_is_knowledge(line): 103 | return line.strip().startswith("\\knowledge{") 104 | 105 | def bar_knowledge_from_line(line): 106 | line = line.strip() 107 | if line.startswith("|"): 108 | return line[1:].strip() 109 | return None 110 | 111 | def line_is_comment_bar_knowledge_from_line(line): 112 | line = line.strip() 113 | if line.startswith("%"): 114 | return (line[1:].strip()).startswith("|") 115 | return False 116 | 117 | for line in lines: 118 | if line[-1] == "\n": 119 | line = line[:-1] 120 | if reading_mode == "discard" and not line_is_comment(line): 121 | reading_mode = "tex" 122 | if line_is_discard(line): 123 | push_block() 124 | reading_mode = "discard" 125 | elif line_is_knowledge(line): 126 | push_block() 127 | reading_mode = "knowledge" 128 | current_kl_cmd = line 129 | current_block = [line] 130 | current_kl_strs = [] 131 | elif reading_mode == "knowledge": 132 | kl = bar_knowledge_from_line(line) 133 | if kl is not None: 134 | current_block.append(line) 135 | current_kl_strs.append(kl) 136 | elif line_is_comment_bar_knowledge_from_line(line): 137 | pass 138 | else: 139 | push_block() 140 | reading_mode = "tex" 141 | current_block = [line] 142 | elif reading_mode == "tex": 143 | current_block.append(line) 144 | push_block() 145 | self.document = document 146 | self.bags = knowledges 147 | self.nb_known_bags: int = len(self.bags) 148 | self.length_known_bags: list[int] = [len(bag) for bag in self.bags] 149 | 150 | def get_all_bags(self) -> list[list[str]]: 151 | """Returns all bags as a list of lists of strings.""" 152 | return 
self.bags 153 | 154 | def get_old_bags(self) -> list[list[str]]: 155 | """Returns all bags that were present at the last checkpoint, 156 | as a list of lists of strings.""" 157 | return self.bags[: self.nb_known_bags] 158 | 159 | def get_new_bags(self) -> list[list[str]]: 160 | """Returns all bags that were not added since the last checkpoint, 161 | as a list of lists of strings.""" 162 | return self.bags[self.nb_known_bags :] 163 | 164 | def get_all_knowledges(self) -> list[str]: 165 | """Returns all knowledges, as a list of strings.""" 166 | return flat(self.bags) 167 | 168 | def get_known_knowledges_in_bag(self, b_id: int) -> list[str]: 169 | """Returns the list of knowledges contained in the `b_id`-th bag 170 | during the last checkpoint, as a list of strings.""" 171 | if b_id < self.nb_known_bags: 172 | return self.bags[b_id][: self.length_known_bags[b_id]] 173 | return [] 174 | 175 | def get_new_knowledges_in_bag(self, b_id: int) -> list[str]: 176 | """Returns the list of knowledges contained in the `id`-th bag 177 | that were added since the last checkpoint, as a list of strings.""" 178 | if b_id < self.nb_known_bags: 179 | return self.bags[b_id][self.length_known_bags[b_id] :] 180 | return self.bags[b_id] 181 | 182 | def add_new_bag(self, kl: str) -> None: 183 | """Adds a new bag that contains only the string `kl`.""" 184 | self.bags.append([kl]) 185 | 186 | def define_synonym_of(self, kl1: str, kl2: str) -> None: 187 | """ 188 | Defines a new knowledge (string) `kl1` as a new synonym of the already 189 | existing knowledge (string) `kl2`. 190 | """ 191 | for b_id, bag in enumerate(self.bags): 192 | if kl2 in bag: 193 | self.bags[b_id].append(kl1) 194 | return 195 | raise KeyError(f"Error: {kl2} is not a knowledge.") 196 | 197 | def was_changed(self) -> bool: 198 | """ 199 | Returns whether kl has new bags or new synonyms. 200 | """ 201 | if len(self.get_new_bags()) > 0: 202 | return True 203 | for b_id in range(len(self.get_old_bags())): 204 | if len(self.get_new_knowledges_in_bag(b_id)) > 0: 205 | return True 206 | return False 207 | 208 | def write_knowledges_in_file(self, nocomment: bool = False) -> None: 209 | """ 210 | Writes the new synonyms and new knowledges in the file containing the knowledges. 211 | """ 212 | with fu.AtomicUpdate(self.filename, original_hash=self.original_hash) as file: 213 | for b in self.document: 214 | if isinstance(b, DocInfoTex): 215 | for line in b.lines: 216 | file.write(line + "\n") 217 | elif isinstance(b, DocInfoKnowledge): 218 | for line in b.lines: 219 | file.write(line + "\n") 220 | if b.number < self.nb_known_bags: 221 | for kl in self.get_new_knowledges_in_bag(b.number): 222 | file.write((f" | {kl}\n" if nocomment else f"% | {kl}\n")) 223 | if len(self.get_new_bags()) > 0: 224 | file.write(cst.DISCARD_LINE + "\n") 225 | for bag in self.get_new_bags(): 226 | if len(bag) > 0: 227 | file.write("%\n") 228 | file.write("%\\knowledge{notion}\n") 229 | for kl in bag: 230 | file.write((f" | {kl}\n" if nocomment else f"% | {kl}\n")) 231 | 232 | 233 | class KnowledgesList: 234 | def __init__(self, kls_filenames: list[str]): 235 | """ 236 | Reads a list of knowledge files. 237 | 238 | Args: 239 | kls_list: the list of filenames containing knowledges. 
240 | """ 241 | self.nb_file: int = len(kls_filenames) 242 | self.kls_list: dict[str, Knowledges] = { 243 | fn: Knowledges(fn) for fn in kls_filenames 244 | } 245 | self.default_fn: str = kls_filenames[self.nb_file - 1] 246 | self.compute_dependency_graph() 247 | 248 | def get_all_kls_struct(self) -> list[Knowledges]: 249 | """Returns the list of all knowledge structures""" 250 | return list(self.kls_list.values()) 251 | 252 | def default_kls(self) -> Knowledges: 253 | """Returns the default kls.""" 254 | return self.kls_list[self.default_fn] 255 | 256 | def get_all_bags(self) -> list[list[str]]: 257 | """Returns all bags as a list of lists of strings.""" 258 | return flat([kls.get_all_bags() for kls in self.kls_list.values()]) 259 | 260 | def get_all_knowledges(self) -> list[str]: 261 | """Returns all knowledges, as a list of strings.""" 262 | return flat([kls.get_all_knowledges() for kls in self.kls_list.values()]) 263 | 264 | def get_sorted_knowledges(self) -> list[str]: 265 | """Returns all knowledges, sorted by topological sort.""" 266 | return self.all_knowledges_sorted 267 | 268 | def add_new_bag(self, kl: str) -> None: 269 | """Adds a new bag that contains only the string `kl`.""" 270 | self.default_kls().add_new_bag(kl) 271 | 272 | def define_synonym_of(self, kl1: str, kl2: str) -> None: 273 | """ 274 | Defines a new knowledge (string) `kl1` as a new synonym of the already 275 | existing knowledge (string) `kl2`. 276 | """ 277 | for kls in self.kls_list.values(): 278 | for b_id, bag in enumerate(kls.bags): 279 | if kl2 in bag: 280 | kls.bags[b_id].append(kl1) 281 | return 282 | raise KeyError(f"Error: {kl2} is not a knowledge.") 283 | 284 | def write_knowledges_in_file(self, nocomment: bool = False) -> None: 285 | """ 286 | Writes the new synonyms and new knowledges in the file containing the knowledges. 287 | """ 288 | for kls in self.kls_list.values(): 289 | kls.write_knowledges_in_file(nocomment) 290 | 291 | def get_new_bags(self) -> list[list[str]]: 292 | """Returns all bags that were added since the last checkpoint, 293 | as a list of lists of strings.""" 294 | return self.default_kls().get_new_bags() 295 | 296 | def get_new_knowledges_in_file(self, fn: str) -> list[str]: 297 | """Returns all new knowledges that were added in some file since the last 298 | checkpoint, as a list of strings.""" 299 | if fn not in self.kls_list: 300 | raise KeyError(f"No knowledge file named {fn}.") 301 | return flat( 302 | [ 303 | self.kls_list[fn].get_new_knowledges_in_bag(bag_id) 304 | for bag_id in range(len(self.kls_list[fn].get_all_bags())) 305 | ] 306 | ) 307 | 308 | def compute_dependency_graph(self) -> None: 309 | """ 310 | Computes the dependency graph of all knowledges, for the substring relation. 311 | Then, sort all knowledges using topological sorting. 312 | Result are stored in self.dependency and self.all_knowledges_sorted. 
313 | """ 314 | dependency: dict[str, set[str]] = {} 315 | dependency_reversed: dict[str, set[str]] = {} 316 | for s1 in self.get_all_knowledges(): 317 | dependency[s1] = { 318 | s2 for s2 in self.get_all_knowledges() if s2 in s1 and s1 != s2 319 | } 320 | dependency_reversed[s1] = { 321 | s2 for s2 in self.get_all_knowledges() if s1 in s2 and s1 != s2 322 | } 323 | self.dependency: dict[str, set[str]] = dependency 324 | self.all_knowledges_sorted: list[str] = list( 325 | toposort.toposort_flatten(dependency_reversed) 326 | ) 327 | 328 | 329 | def remove_redundant_files(list_filenames: list[str]) -> list[str]: 330 | """ 331 | Given a list of filenames, return the same list without duplicates, and output a warning 332 | if there is such a duplicate. 333 | """ 334 | output: list[str] = [] 335 | for fn in list_filenames: 336 | if fn in output: 337 | print( 338 | add_bold(add_orange("[Warning]")) 339 | + f" same knowledge file given twice ({fn}), second occurrence is ignored." 340 | ) 341 | else: 342 | output.append(fn) 343 | return output 344 | -------------------------------------------------------------------------------- /knowledge_clustering/misc.py: -------------------------------------------------------------------------------- 1 | """Misc functions, for emphasizing a string.""" 2 | 3 | from __future__ import annotations # Support of `|` for type union in Python 3.9 4 | 5 | 6 | BEGIN_EMPH: str = "\033[1m\033[95m" 7 | BEGIN_EMPH_ALT: str = "\033[1m\033[92m" 8 | BEGIN_BOLD: str = "\033[1m" 9 | BEGIN_RED: str = "\033[31m" 10 | BEGIN_ORANGE: str = "\033[33m" 11 | BEGIN_GREEN: str = "\033[32m" 12 | END_EMPH: str = "\033[0m" 13 | 14 | 15 | def emph(string: str) -> str: 16 | """Emphasizes a string.""" 17 | return BEGIN_EMPH + string + END_EMPH 18 | 19 | 20 | def emph_alt(string: str) -> str: 21 | """Alternative emphasis of a string.""" 22 | return BEGIN_EMPH_ALT + string + END_EMPH 23 | 24 | 25 | def add_red(string: str) -> str: 26 | """Puts a string in red.""" 27 | return BEGIN_RED + string + END_EMPH 28 | 29 | 30 | def add_orange(string: str) -> str: 31 | """Puts a string in orange.""" 32 | return BEGIN_ORANGE + string + END_EMPH 33 | 34 | 35 | def add_green(string: str) -> str: 36 | """Puts a string in green.""" 37 | return BEGIN_GREEN + string + END_EMPH 38 | 39 | 40 | def add_bold(string: str) -> str: 41 | """Puts a string in bold.""" 42 | return BEGIN_BOLD + string + END_EMPH 43 | -------------------------------------------------------------------------------- /knowledge_clustering/scope_meaning.py: -------------------------------------------------------------------------------- 1 | """Infer the scope from known knowledges.""" 2 | 3 | from __future__ import annotations # Support of `|` for type union in Python 3.9 4 | from typing import TypeVar 5 | 6 | import copy 7 | 8 | import knowledge_clustering.distance as dist 9 | 10 | T = TypeVar("T") # Generic type 11 | 12 | 13 | def union_list_of_lists(l1: list[T], l2: list[T]) -> list[T]: 14 | """Returns the union (without repetition) of two lists of lists.""" 15 | s = copy.copy(l1) 16 | for sublist in l2: 17 | if sublist not in s: 18 | s.append(sublist) 19 | return s 20 | 21 | 22 | def infer_scope(list_kl: list[str], scope: str, lang: str, stemmer) -> list[list[str]]: 23 | """ 24 | Takes a list of knowledges that all belong to the same bag and a scope. 25 | 26 | If the list contains a knowledge with this scope, we try to infer the meaning of the scope 27 | by looking at similar knowledges. 
28 | 29 | Example: 30 | Running the algorithm on ["word@some-scope", "countable ordinal word", 31 | "ordinal word", "scattered language"] for the scope `some-scope` will return 32 | the list [["countable", "ordinal"], ["ordinal"]]. 33 | """ 34 | result: list[list[str]] = [] 35 | list_kl_broke: list[tuple[list[str], str]] = [ 36 | dist.breakup_notion(kl, lang) for kl in list_kl 37 | ] 38 | for kl1_words, sc1 in list_kl_broke: 39 | if sc1 == scope: 40 | for kl2_words, sc2 in list_kl_broke: 41 | if sc2 == "": 42 | if dist.inclusion_sets_of_words( 43 | kl1_words, kl2_words, [""], stemmer 44 | ): 45 | # If every word of kl1 appears in kl2 and kl2 has an empty scope, 46 | # return the words in kl2 not appearing in kl1 47 | result.append([w for w in kl2_words if w not in kl1_words]) 48 | return result 49 | 50 | 51 | def infer_all_scopes( 52 | known_knowledges: list[list[str]], lang: str 53 | ) -> dict[str, list[list[str]]]: 54 | """ 55 | Given known knowledges and a langage, returns the infer meaning of scopes occuring 56 | in said these knowledges. 57 | """ 58 | list_scopes: set[str] = { 59 | sc for bag in known_knowledges for (_, sc) in map(dist.extract_scope, bag) 60 | } 61 | if "" in list_scopes: 62 | list_scopes.remove("") 63 | scopes_meaning: dict[str, list[list[str]]] = {sc: [] for sc in list_scopes} 64 | stemmer = dist.new_stemmer(lang) 65 | for scope in list_scopes: 66 | for bag in known_knowledges: 67 | scopes_meaning[scope] = union_list_of_lists( 68 | scopes_meaning[scope], infer_scope(bag, scope, lang, stemmer) 69 | ) 70 | if [scope] not in scopes_meaning[scope]: 71 | scopes_meaning[scope].append([scope]) 72 | return scopes_meaning 73 | 74 | 75 | def print_scopes( 76 | scopes_meaning: dict[str, list[list[str]]], print_meaning: bool = False 77 | ) -> None: 78 | """Prints the infered meaning of scopes.""" 79 | print("Defined scopes:") 80 | if not print_meaning: 81 | print("\t", list(scopes_meaning.keys())) 82 | else: 83 | for sc in scopes_meaning: 84 | print(f"\t@{sc}:{scopes_meaning[sc]}") 85 | -------------------------------------------------------------------------------- /knowledge_clustering/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/remimorvan/knowledge-clustering/4ab80f28b0a1796d682eaf365828580f05366ba6/knowledge_clustering/scripts/__init__.py -------------------------------------------------------------------------------- /knowledge_clustering/scripts/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | Launching knowledge commands (init, cluster, addquotes, anchor). 
3 | """ 4 | 5 | from __future__ import annotations # Support of `|` for type union in Python 3.9 6 | from pathlib import Path 7 | 8 | import os 9 | import sys 10 | import click 11 | from click_default_group import DefaultGroup # type: ignore 12 | import nltk # type: ignore 13 | 14 | from knowledge_clustering import ( 15 | add_anchor, 16 | add_quotes, 17 | clustering, 18 | cst, 19 | _version, 20 | autofinder, 21 | ) 22 | from knowledge_clustering.check_update import check_update 23 | from knowledge_clustering.misc import add_red, add_bold 24 | 25 | 26 | # https://stackoverflow.com/a/67324391/19340201 27 | class AliasedGroup(DefaultGroup): 28 | """Group where `AP` is a synonym for `anchor`.""" 29 | 30 | def get_command(self, ctx, cmd_name): 31 | if cmd_name in ["anchor", "AP"]: 32 | return DefaultGroup.get_command(self, ctx, "anchor") 33 | return DefaultGroup.get_command(self, ctx, cmd_name) 34 | 35 | 36 | @click.group(cls=AliasedGroup, default="cluster", default_if_no_args=True) 37 | @click.version_option(_version.VERSION) 38 | def cli(): 39 | """Automated notion clustering for the knowledge LaTeX package""" 40 | 41 | 42 | @cli.command() 43 | def init(): 44 | """Downloads the required NLTK packages.""" 45 | nltk.download("punkt") 46 | nltk.download("punkt_tab") 47 | nltk.download("averaged_perceptron_tagger") 48 | nltk.download("averaged_perceptron_tagger_eng") 49 | 50 | 51 | @cli.command() 52 | @click.option( 53 | "--knowledge", 54 | "-k", 55 | "kl_filename", 56 | multiple=True, 57 | type=click.Path( 58 | exists=True, file_okay=True, dir_okay=False, writable=True, readable=True 59 | ), 60 | help="File containing the knowledges that are already defined. \ 61 | Multiple files are allowed; new knowledges will be written in the last one. \ 62 | If the option is not specified, all .kl file in the current directory (and subdirectory, \ 63 | recursively) will be taken. If there are multiple files, exactly one of them must end \ 64 | with `default.kl`.", 65 | required=False, 66 | ) 67 | @click.option( 68 | "--diagnose", 69 | "-d", 70 | "dg_filename", 71 | type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True), 72 | help="Diagnose file produced by LaTeX. If the option is not specified, the unique \ 73 | .diagnose file in the current directory (and subdirectory, recursively) is taken instead.", 74 | required=False, 75 | ) 76 | @click.option( 77 | "--lang", 78 | "-l", 79 | default="en", 80 | type=click.Choice(["en", "fr"]), 81 | help="Language of your TeX document.", 82 | ) 83 | @click.option( 84 | "--scope/--no-scope", 85 | "-S/ ", 86 | default=False, 87 | help="Print the scopes defined in the knowledge file and print \ 88 | the possible meaning of those scope inferred by knowledge-clustering.", 89 | ) 90 | @click.option( 91 | "--print/--no-print", 92 | "-P/ ", 93 | "print_kl", 94 | default=False, 95 | help="Print all new knowledges.", 96 | ) 97 | @click.option( 98 | "--no-update/--update", 99 | "-N/ ", 100 | "noupdate", 101 | default=False, 102 | help="Don't look on PyPI if a newer version of knowledge-clustering is available.", 103 | ) 104 | @click.option( 105 | "--config-file", 106 | "-c", 107 | "config_filename", 108 | default=None, 109 | help=f"Specify the configuration file. 
By default the configuration file \ 110 | in the folder {cst.CONFIG_DIR} corresponding to your language is used.", 111 | ) 112 | def cluster( 113 | kl_filename: tuple[str], 114 | dg_filename: str, 115 | lang: str, 116 | scope: bool, 117 | print_kl: bool, 118 | noupdate: bool, 119 | config_filename: None | str, 120 | ): 121 | """ 122 | Defines, as a comment and in the knowledge files, all the knowledges occuring in the file. 123 | """ 124 | try: 125 | if not dg_filename: 126 | dg_filename = autofinder.get_unique_diagnose_file(Path(".")) 127 | kl_filename = list(kl_filename) 128 | if not kl_filename: 129 | kl_filename = autofinder.get_knowledge_files(Path(".")) 130 | clustering.app(kl_filename, dg_filename, scope, print_kl, lang, config_filename) 131 | if not noupdate: 132 | check_update() 133 | except (autofinder.NoFile, autofinder.TooManyFiles) as e: 134 | print(add_bold(add_red("\n[Error] ")) + e.args[0]) 135 | 136 | 137 | @cli.command() 138 | @click.option( 139 | "--tex", 140 | "-t", 141 | "tex_filename", 142 | type=click.Path( 143 | exists=True, file_okay=True, dir_okay=False, writable=True, readable=True 144 | ), 145 | help="Your TeX file.", 146 | required=True, 147 | ) 148 | @click.option( 149 | "--knowledge", 150 | "-k", 151 | "kl_filename", 152 | multiple=True, 153 | type=click.Path( 154 | exists=True, file_okay=True, dir_okay=False, writable=True, readable=True 155 | ), 156 | help="File containing the knowledges that are already defined. \ 157 | Multiple files are allowed; new knowledges will be written in the last one. \ 158 | If the option is not specified, all .kl file in the current directory (and subdirectory, \ 159 | recursively) will be taken. If there are multiple files, exactly one of them must end \ 160 | with `default.kl`.", 161 | required=False, 162 | ) 163 | @click.option( 164 | "--print", 165 | "-p", 166 | "print_line", 167 | type=int, 168 | default=1, 169 | help="When finding a match, number of lines (preceding the match) that are printed \ 170 | in the prompt to the user.", 171 | ) 172 | @click.option( 173 | "--no-update/--update", 174 | "-N/ ", 175 | "noupdate", 176 | default=False, 177 | ) 178 | def addquotes(tex_filename: str, kl_filename: str, print_line: int, noupdate: bool): 179 | """ 180 | Finds knowledges defined in the knowledge files that appear in tex file without quote 181 | symbols. Proposes to add quotes around them. 182 | """ 183 | try: 184 | kl_filename = list(kl_filename) 185 | if not kl_filename: 186 | kl_filename = autofinder.get_knowledge_files(Path(".")) 187 | add_quotes.app(tex_filename, kl_filename, print_line) 188 | if not noupdate: 189 | check_update() 190 | except (autofinder.NoFile, autofinder.TooManyFiles) as e: 191 | print(add_bold(add_red("\n[Error] ")) + e.args[0]) 192 | 193 | 194 | @cli.command() 195 | @click.option( 196 | "--tex", 197 | "-t", 198 | "tex_filename", 199 | type=click.Path( 200 | exists=True, file_okay=True, dir_okay=False, writable=True, readable=True 201 | ), 202 | help="Your TeX file.", 203 | required=True, 204 | ) 205 | @click.option( 206 | "--space", 207 | "-s", 208 | type=int, 209 | default=200, 210 | help="Number of characters tolerated between an anchor point and the introduction \ 211 | of a knowledge. (Default value: 200)", 212 | ) 213 | @click.option( 214 | "--no-update/--update", 215 | "-N/ ", 216 | "noupdate", 217 | default=False, 218 | ) 219 | def anchor(tex_filename: str, space: int, noupdate: bool): 220 | """ 221 | Prints warning when a knowledge is introduced but is not preceded by an anchor point. 
222 | """ 223 | add_anchor.app(tex_filename, space) 224 | if not noupdate: 225 | check_update() 226 | 227 | 228 | if __name__ == "__main__": 229 | cli() 230 | -------------------------------------------------------------------------------- /knowledge_clustering/tex_document.py: -------------------------------------------------------------------------------- 1 | """Handling a Tex document.""" 2 | 3 | from __future__ import annotations # Support of `|` for type union in Python 3.9 4 | from typing import TextIO 5 | 6 | from knowledge_clustering import misc 7 | 8 | 9 | class TexDocument: 10 | """Class for handling a tex document.""" 11 | 12 | def __init__(self, tex_code: str) -> None: 13 | self.tex_code: str = tex_code 14 | self.lines: list[str] = self.tex_code.split("\n") 15 | self.__update_col_line() 16 | self.__clean() 17 | self.length: int = len(self.tex_cleaned) 18 | 19 | def __update_col_line(self) -> None: 20 | """ 21 | Compute two arrays, saying for each index i of self.text, at what column and 22 | what line of the text this index is located. 23 | """ 24 | self.find_line: list[int] = [0] * len(self.tex_code) 25 | self.find_col: list[int] = [0] * len(self.tex_code) 26 | line: int = 1 27 | col: int = 1 28 | for position, letter in enumerate(self.tex_code): 29 | self.find_line[position] = line 30 | self.find_col[position] = col 31 | if letter == "\n": 32 | line += 1 33 | col = 1 34 | else: 35 | col += 1 36 | 37 | def __clean(self): 38 | """ 39 | Reads self.tex_code (the original tex file), given as a single string. 40 | Converts spaces, tabulations and new lines into a single space, except 41 | if there is two consecutive new lines. Removes commented lines. 42 | The cleaned file is stored in self.tex_cleaned. A pointer 43 | from tex_cleaned to tex_code, in the form of an array, is produced in self.pointer. 44 | """ 45 | 46 | # Essentially, the algorithm is a deterministic transducer with five states 47 | # 0: the last character is `normal` (not a space, a tab, nor a new line) ; initial state 48 | # 1: the last character is not normal, 49 | # and no new line was read since the last normal character 50 | # 2: the last character is not normal, 51 | # and exactly one new line was read since the last normal character 52 | # 3: the last character is not normal, 53 | # and at least two new lines were read since the last normal character 54 | # 4: the line is commented. 55 | def is_normal(letter: str) -> bool: 56 | return letter not in [" ", "\t", "\n", "%"] 57 | 58 | def transition( 59 | state: int, letter: str, counter: int 60 | ) -> tuple[int, str, int | None]: 61 | """ 62 | Input: curent state, input letter and the size of produced output so far 63 | Output: returns the new state, the output, and the pointer of the input letter. 
64 | """ 65 | if is_normal(letter): 66 | if state == 4: 67 | return (4, "", None) 68 | return (0, letter, counter) 69 | if letter == "%": 70 | return (4, "", None) 71 | if letter == "\n": 72 | if state == 4: 73 | return (0, "", None) 74 | if state == 0: 75 | return (2, " ", None) 76 | if state == 1: 77 | return (2, "", None) 78 | if state == 2: 79 | return (3, "\\par ", counter) 80 | return (3, "", None) 81 | if letter in [" ", "\t"]: 82 | if state == 0: 83 | return (1, " ", counter) 84 | return (state, "", None) 85 | raise KeyError("Transition not defined", state, letter) 86 | 87 | state: int = 0 88 | tex_cleaned: str = "" 89 | m: int = 0 90 | pointer: list[None | int] = [] 91 | for position, letter in enumerate(self.tex_code): 92 | state, output, input_pointer = transition(state, letter, m) 93 | tex_cleaned += output 94 | m += len(output) 95 | # Put position at index input_pointer 96 | if input_pointer is not None: 97 | pointer += [None] * (input_pointer - len(pointer)) + [position] 98 | self.tex_cleaned: str = tex_cleaned 99 | self.pointer: list[None | int] = pointer 100 | 101 | def print(self, start: int, end: int, n: int, out: TextIO): 102 | """ 103 | Prints the lines between positions (in the clean tex) `start` and `end` 104 | together with `n`-1 lines preceding `start`. 105 | Emphasize the part between `start` and `end`. 106 | """ 107 | start_p = self.pointer[start] 108 | end_p = self.pointer[end] 109 | if isinstance(start_p, int) and isinstance(end_p, int): 110 | l_start: int = self.find_line[start_p] 111 | c_start: int = self.find_col[start_p] 112 | l_end: int = self.find_line[end_p] 113 | c_end: int = self.find_col[end_p] 114 | for i in range(max(0, l_start - n), l_end): 115 | if i + 1 == l_start and i + 1 == l_end: 116 | print( 117 | f"l{i+1}: \t{self.lines[i][:c_start-1]}" 118 | + misc.emph(self.lines[i][c_start - 1 : c_end]) 119 | + self.lines[i][c_end:], 120 | file=out, 121 | ) 122 | elif i + 1 == l_start: 123 | print( 124 | f"l{i+1}: \t{self.lines[i][:c_start-1]}" 125 | + misc.emph(self.lines[i][c_start - 1 :]), 126 | file=out, 127 | ) 128 | elif i + 1 == l_end: 129 | print( 130 | f"l{i+1}: \t" 131 | + misc.emph(self.lines[i][:c_end]) 132 | + self.lines[i][c_end:], 133 | file=out, 134 | ) 135 | elif l_start < i + 1 and i + 1 < l_end: 136 | print(f"l{i+1}: \t" + misc.emph(self.lines[i]), file=out) 137 | else: 138 | print(f"l{i+1}: \t{self.lines[i]}", file=out) 139 | else: 140 | raise IndexError("Undefined pointer", self.pointer, (start, end)) 141 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel", 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = knowledge-clustering 3 | version = attr: knowledge_clustering._version.VERSION 4 | author = Rémi Morvan 5 | author_email = remi@morvan.xyz 6 | description = Automated notion clustering for the knowledge LaTeX package 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/remimorvan/knowledge-clustering 10 | project_urls = 11 | Bug Tracker = https://github.com/remimorvan/knowledge-clustering/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI 
Approved :: MIT License 15 | Operating System :: OS Independent 16 | keywords = 17 | knowledge :: latex :: clustering 18 | 19 | [options] 20 | packages = find: 21 | python_requires = >=3.9 22 | install_requires = 23 | click 24 | click_default_group 25 | nltk 26 | spacy 27 | toposort 28 | unidecode 29 | requests 30 | 31 | [options.package_data] 32 | * = data/* 33 | 34 | [options.entry_points] 35 | console_scripts = 36 | knowledge = knowledge_clustering.scripts.app:cli 37 | 38 | [options.extras_require] 39 | tests = 40 | pytest 41 | filecmp -------------------------------------------------------------------------------- /tests/.ordinal.diagnose.original: -------------------------------------------------------------------------------- 1 | ************************ 2 | * Undefined knowledges * 3 | ************************ 4 | 5 | \knowledge{ignore} 6 | | inseparability 7 | | semigroup 8 | | words 9 | | semigroups 10 | | countable ordinal word -------------------------------------------------------------------------------- /tests/.ordinal.kl.original: -------------------------------------------------------------------------------- 1 | \knowledge{notion} 2 | | word 3 | 4 | \knowledge{notion} 5 | | word@ord 6 | 7 | \knowledge{notion} 8 | | regular language over countable ordinals 9 | | regular languages@ord 10 | 11 | \knowledge{notion} 12 | | separation 13 | -------------------------------------------------------------------------------- /tests/.ordinal.kl.solution: -------------------------------------------------------------------------------- 1 | \knowledge{notion} 2 | | word 3 | % | words 4 | 5 | \knowledge{notion} 6 | | word@ord 7 | % | countable ordinal word 8 | 9 | \knowledge{notion} 10 | | regular language over countable ordinals 11 | | regular languages@ord 12 | 13 | \knowledge{notion} 14 | | separation 15 | % | inseparability 16 | %%%%% NEW KNOWLEDGES 17 | % 18 | %\knowledge{notion} 19 | % | semigroup 20 | % | semigroups 21 | -------------------------------------------------------------------------------- /tests/.ordinal.tex.original: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | \usepackage[utf8]{inputenc} 4 | \usepackage[T1]{fontenc} 5 | \pdfoutput = 1 6 | 7 | \usepackage[breaklinks,hidelinks]{hyperref} 8 | \usepackage{xcolor} 9 | 10 | \usepackage{knowledge} 11 | \knowledgeconfigure{notion} 12 | \knowledgeconfigure{quotation} 13 | \input{ordinal-kl.tex} 14 | 15 | \title{Blabla} 16 | \date{\today} 17 | \author{Charles-Édouard} 18 | 19 | 20 | \begin{document} 21 | 22 | \maketitle 23 | 24 | \AP ""word"" 25 | "words" 26 | ""word@@ord"" 27 | "countable ordinal word" 28 | 29 | blablablablablablablablablablablablablablablablablablablabla 30 | blablablablablablablablablablablablablablablablablablablabla 31 | blablablablablablablablablablablablablablablablablablablabla 32 | 33 | ""regular language over countable ordinals"" 34 | \kl[ord]{regular languages} 35 | \intro{separation} 36 | 37 | "inseparability" 38 | ""semigroup"" 39 | \kl{semigroups} 40 | 41 | 42 | \end{document} -------------------------------------------------------------------------------- /tests/test_addquotes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the modules of knowledge_clustering on which the addquotes command is based. 
3 | """ 4 | 5 | from pathlib import Path 6 | import shutil 7 | 8 | from knowledge_clustering.add_quotes import app as app_addquotes 9 | 10 | 11 | def test_app_addquotes() -> None: 12 | """Tests the addquotes command.""" 13 | shutil.copy("tests/.ordinal.tex.original", "tests/ordinal.tex") 14 | shutil.copy("tests/.ordinal.kl.original", "tests/ordinal.kl") 15 | with open("tests/yes.txt", "w", encoding="utf-8") as yes: 16 | yes.write("y\n" * 100) 17 | with open("tests/yes.txt", "r", encoding="utf-8") as inp: 18 | with open("tests/output_addquotes.txt", "w", encoding="utf-8") as out: 19 | app_addquotes("tests/ordinal.tex", ["tests/ordinal.kl"], 1, inp, out) 20 | with open("tests/output_addquotes.txt", "r", encoding="utf-8") as out: 21 | nb_line_output = sum(1 for _ in out) 22 | b: bool = nb_line_output == 7 23 | p = Path("tests/") 24 | for filename in ["yes.txt", "ordinal.tex", "ordinal.kl", "output_addquotes.txt"]: 25 | (p / filename).unlink() 26 | assert b 27 | -------------------------------------------------------------------------------- /tests/test_anchor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the modules of knowledge_clustering on which the anchor command is based. 3 | """ 4 | 5 | from pathlib import Path 6 | import shutil 7 | 8 | from knowledge_clustering.add_anchor import app as app_anchor 9 | 10 | 11 | def test_app_anchor() -> None: 12 | """Tests the anchor command.""" 13 | shutil.copy("tests/.ordinal.tex.original", "tests/ordinal.tex") 14 | with open("tests/output_anchor.txt", "w", encoding="utf-8") as out: 15 | app_anchor("tests/ordinal.tex", 200, out) 16 | nb_line_output = sum( 17 | 1 for line in open("tests/output_anchor.txt", encoding="utf-8") 18 | ) 19 | b1: bool = nb_line_output == 3 20 | with open("tests/output_anchor.txt", "w", encoding="utf-8") as out: 21 | app_anchor("tests/ordinal.tex", 5, out) 22 | with open("tests/output_anchor.txt", "r", encoding="utf-8") as out: 23 | nb_line_output = sum(1 for _ in out) 24 | b2: bool = nb_line_output == 4 25 | p = Path("tests/") 26 | for filename in ["ordinal.tex", "output_anchor.txt"]: 27 | (p / filename).unlink() 28 | assert b1 and b2 29 | -------------------------------------------------------------------------------- /tests/test_autofinder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the autofinder module. 
3 | """ 4 | 5 | from pathlib import Path 6 | 7 | from knowledge_clustering.autofinder import ( 8 | NoFile, 9 | TooManyFiles, 10 | get_unique_diagnose_file, 11 | get_knowledge_files, 12 | ) 13 | 14 | 15 | def test_autofinder() -> None: 16 | """Test function for the functions get_unique_diagnose_file, get_knowledge_files 17 | from the module autofinder.""" 18 | p = Path("tests/testaf/") 19 | p.mkdir() 20 | test_results = [False] * 6 21 | # 0th test with 1 diagnose file and 3 .kl with a unique default file (OK) 22 | (p / "subdir1").mkdir() 23 | (p / "subdir2").mkdir() 24 | (p / "subdir3").mkdir() 25 | (p / "subdir1/coolproject.diagnose").touch() 26 | (p / "subdir2/abbreviations.kl").touch() 27 | (p / "subdir2/main-default.kl").touch() 28 | (p / "subdir3/omega-automata.kl").touch() 29 | # Content of testaf directory: 30 | # - subdir1 31 | # |-- coolproject.diagnose 32 | # - subdir2 33 | # |-- abbreviations.kl 34 | # |-- main-default.kl 35 | # - subdir3 36 | # |-- omega-automata.kl 37 | try: 38 | dg_file = get_unique_diagnose_file(p) 39 | kl_files = get_knowledge_files(p) 40 | if ( 41 | str(dg_file) == "tests/testaf/subdir1/coolproject.diagnose" 42 | and len(kl_files) == 3 43 | and str(kl_files[2]) == "tests/testaf/subdir2/main-default.kl" 44 | ): 45 | test_results[0] = True 46 | except (NoFile, TooManyFiles): 47 | pass 48 | # 1st test with 1 diagnose file and 4 .kl with a two default files (not OK) 49 | (p / "subdir3/secondary-default.kl").touch() 50 | # Content of testaf directory: 51 | # - subdir1 52 | # |-- coolproject.diagnose 53 | # - subdir2 54 | # |-- abbreviations.kl 55 | # |-- main-default.kl 56 | # - subdir3 57 | # |-- omega-automata.kl 58 | # |-- secondary-default.kl 59 | try: 60 | _ = get_knowledge_files(p) 61 | except TooManyFiles: 62 | test_results[1] = True 63 | # 2nd test with 1 diagnose file and 2 .kl with no default files (not OK) 64 | (p / "subdir2/main-default.kl").unlink() 65 | (p / "subdir3/secondary-default.kl").unlink() 66 | # Content of testaf directory: 67 | # - subdir1 68 | # |-- coolproject.diagnose 69 | # - subdir2 70 | # |-- abbreviations.kl 71 | # - subdir3 72 | # |-- omega-automata.kl 73 | try: 74 | _ = get_knowledge_files(p) 75 | except NoFile: 76 | test_results[2] = True 77 | # 3rd test with 1 diagnose file and 1 .kl with no default files (OK) 78 | (p / "subdir2/abbreviations.kl").unlink() 79 | # Content of testaf directory: 80 | # - subdir1 81 | # |-- coolproject.diagnose 82 | # - subdir2 83 | # - subdir3 84 | # |-- omega-automata.kl 85 | try: 86 | _ = get_knowledge_files(p) 87 | test_results[3] = True 88 | except (NoFile, TooManyFiles): 89 | pass 90 | # 4th test with 2 diagnose file and 1 .kl with no default files (not OK) 91 | (p / "subdir2/another-file.diagnose").touch() 92 | # Content of testaf directory: 93 | # - subdir1 94 | # |-- coolproject.diagnose 95 | # - subdir2 96 | # |-- another-file.diagnose 97 | # - subdir3 98 | # |-- omega-automata.kl 99 | try: 100 | _ = get_unique_diagnose_file(p) 101 | except TooManyFiles: 102 | test_results[4] = True 103 | # 5th test with no diagnose file and 1 .kl with no default files (not OK) 104 | (p / "subdir1/coolproject.diagnose").unlink() 105 | (p / "subdir2/another-file.diagnose").unlink() 106 | # Content of testaf directory: 107 | # - subdir1 108 | # - subdir2 109 | # - subdir3 110 | # |-- omega-automata.kl 111 | try: 112 | _ = get_unique_diagnose_file(p) 113 | except NoFile: 114 | test_results[5] = True 115 | # Remove all files and directory created for the test 116 | (p / "subdir3/omega-automata.kl").unlink() 
118 |     for dirname in ["subdir3", "subdir2", "subdir1", ""]:
119 |         (p / dirname).rmdir()
120 |     assert all(test_results)
121 |
--------------------------------------------------------------------------------
/tests/test_clustering.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for the modules of knowledge_clustering on which the cluster command is based.
3 | """
4 |
5 | from typing import TypeVar
6 | from pathlib import Path
7 | import filecmp
8 | import shutil
9 |
10 | from knowledge_clustering.distance import distance, new_stemmer, normalise_notion
11 | from knowledge_clustering.scope_meaning import infer_scope, infer_all_scopes
12 | from knowledge_clustering.clustering import clustering
13 | from knowledge_clustering.knowledges import Knowledges
14 | from knowledge_clustering.diagnose import parse as parse_diagnose
15 | from knowledge_clustering.config import parse as parse_config
16 | from knowledge_clustering.clustering import app as app_clustering
17 |
18 | T = TypeVar("T")  # Generic type
19 |
20 |
21 | def test_normalise() -> None:
22 |     """Tests the normalise_notion function from the distance module."""
23 |     assert (
24 |         normalise_notion("two-way\\\\rational~relation") == "two-way rational relation"
25 |     )
26 |
27 |
28 | def test_distance() -> None:
29 |     """Tests functions from the distance module."""
30 |     assert distance("", "", [""], {}, "english") == 0
31 |     # Tests where only the empty word is allowed as a prefix. No prior scope meaning is known.
32 |     assert distance("ordinal semigroup", "ordinal semigroups", [""], {}, "english") == 0
33 |     assert distance("cheval", "chevaux", [""], {}, "french") == 0
34 |     assert distance("cheval", "chevaux", [""], {}, "english") > 0
35 |     # Tests with a scope
36 |     assert distance("ordinal semigroup", "semigroups@ordinal", [""], {}, "english") == 0
37 |     assert distance("semigroup", "semigroups@ordinal", [""], {}, "english") > 0
38 |     # Tests with prefixes
39 |     assert distance("foo", "turbofoo", ["", "turbo"], {}, "english") == 0
40 |     assert distance("foo", "turbofoo", [""], {}, "english") > 0
41 |     assert distance("foo", "megafoo", ["", "turbo"], {}, "english") > 0
42 |     assert distance("full", "non-full", ["", "non-"], {}, "english") == 0
43 |     # Tests with accents and math
44 |     assert distance("Büchi", 'B\\"uchi', [""], {}, "english") == 0
45 |     assert (
46 |         distance("Büchi", '\\textsf{$\\omega$-B\\"{u}chi}', ["", "-"], {}, "english")
47 |         == 0
48 |     )
49 |     # Tests with known scope meanings
50 |     assert (
51 |         distance("word@ord", "ordinal word", [""], {"ord": [["ordinal"]]}, "english")
52 |         == 0
53 |     )
54 |     assert distance("word@ord", "ordinal word", [""], {}, "english") > 0
55 |     # Tests with scope (should be case-insensitive)
56 |     assert distance("foo@BaR", "foo@bar", [""], {}, "english") == 0
57 |     # Tests with spaces
58 |     assert distance("foo~bar", "foo bar", [""], {}, "english") == 0
59 |     assert distance("foo\\\\bar", "foo bar", [""], {}, "english") == 0
60 |     assert distance("foo\\\\ bar", "foo bar", [""], {}, "english") == 0
61 |     assert (
62 |         distance("two-way\\\\rational@rel", "two-way rational@rel", [""], {}, "english")
63 |         == 0
64 |     )
65 |
66 |
67 | def compare(l1: list[list[T]], l2: list[list[T]]) -> bool:
68 |     """Checks whether two lists of lists contain the same elements."""
69 |
70 |     def compare_lists(t1: list[T], t2: list[T]) -> bool:
71 |         return set(t1) == set(t2)
72 |
73 |     for t1 in l1:
74 |         if not any(compare_lists(t1, t2) for t2 in l2):
75 |             return False
76 |     for t2 in l2:
77 |         if not any(compare_lists(t1, t2) for t1 in l1):
78 |             return False
79 |     return True
80 |
81 |
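82 | # Illustrative examples for `compare`: inner lists are matched up to set
83 | # equality, in both directions, so compare([[1, 2], [3]], [[2, 1], [3]]) is
84 | # True, while compare([[1, 2]], [[1], [2]]) is False.
85 |
86 |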
87 | def test_scope_meaning() -> None:
88 |     """Tests functions from the scope_meaning module."""
89 |     # Test infer_scope
90 |     assert compare(
91 |         infer_scope(
92 |             ["regular language over countable ordinals", "regular languages@ord"],
93 |             "ord",
94 |             "english",
95 |             new_stemmer("english"),
96 |         ),
97 |         [["ordinals", "countable"]],
98 |     )
99 |     # Test infer_all_scopes
100 |     assert compare(
101 |         infer_all_scopes(
102 |             [
103 |                 [
104 |                     "word@some-scope",
105 |                     "foo word",
106 |                 ],
107 |                 ["langage@some-scope", "bar langage"],
108 |             ],
109 |             "english",
110 |         )["some-scope"],
111 |         [["foo"], ["bar"], ["some-scope"]],
112 |     )
113 |
114 |
115 | def test_clustering() -> None:
116 |     """Tests functions from the clustering module."""
117 |     kls = Knowledges("tests/.ordinal.kl.original")
118 |     unknown_kl = parse_diagnose("tests/.ordinal.diagnose.original")
119 |     list_prefixes = parse_config("knowledge_clustering/data/english.ini")
120 |     scopes_meaning = infer_all_scopes(kls.get_all_bags(), "english")
121 |     clustering(kls, unknown_kl, 0, list_prefixes, scopes_meaning, "english")
122 |     solution = [
123 |         ["word", "words"],
124 |         ["word@ord", "countable ordinal word"],
125 |         ["regular language over countable ordinals", "regular languages@ord"],
126 |         ["separation", "inseparability"],
127 |         ["semigroup", "semigroups"],
128 |     ]
129 |     assert compare(kls.get_all_bags(), solution)
130 |
131 |
132 | def test_app_clustering() -> None:
133 |     """Tests the cluster command."""
134 |     for filename in ["ordinal.kl", "ordinal.diagnose"]:
135 |         shutil.copy(f"tests/.{filename}.original", f"tests/{filename}")
136 |     app_clustering(
137 |         ["tests/ordinal.kl"], "tests/ordinal.diagnose", False, False, "en", None
138 |     )
139 |     # The diagnose file should be left unchanged…
140 |     assert filecmp.cmp(
141 |         "tests/ordinal.diagnose", "tests/.ordinal.diagnose.original", shallow=False
142 |     )
143 |     # … and the knowledge file should have the expected content.
144 |     assert filecmp.cmp("tests/ordinal.kl", "tests/.ordinal.kl.solution", shallow=False)
145 |     p = Path("tests/")
146 |     for filename in ["ordinal.kl", "ordinal.diagnose"]:
147 |         (p / filename).unlink()
148 |
--------------------------------------------------------------------------------