├── tests ├── __init__.py ├── test_core.py ├── test_paths.py ├── test_classify_paragraphs.py ├── test_utils.py ├── test_html_encoding.py ├── test_dom_utils.py └── test_sax.py ├── MANIFEST.in ├── doc └── cs_classification_example.png ├── justext ├── stoplists │ ├── Cebuano.txt │ ├── Waray_Waray.txt │ ├── Haitian.txt │ ├── Ido.txt │ ├── Bishnupriya_Manipuri.txt │ ├── Piedmontese.txt │ ├── Volapuk.txt │ ├── Newar.txt │ ├── Lombard.txt │ ├── Igbo.txt │ ├── Aragonese.txt │ ├── Tagalog.txt │ ├── Walloon.txt │ ├── Low_Saxon.txt │ ├── Irish.txt │ ├── Afrikaans.txt │ ├── Urdu.txt │ ├── Catalan.txt │ ├── Western_Panjabi.txt │ ├── West_Frisian.txt │ ├── Vietnamese.txt │ ├── Simple_English.txt │ ├── Yoruba.txt │ ├── Breton.txt │ ├── Dutch.txt │ ├── Neapolitan.txt │ ├── Spanish.txt │ ├── Quechua.txt │ ├── Norwegian_Nynorsk.txt │ ├── Hindi.txt │ ├── Luxembourgish.txt │ ├── Swahili.txt │ ├── Gujarati.txt │ ├── French.txt │ ├── Galician.txt │ ├── Persian.txt │ ├── Norwegian_Bokmal.txt │ ├── Occitan.txt │ ├── Portuguese.txt │ ├── English.txt │ ├── Sicilian.txt │ ├── Albanian.txt │ ├── Aromanian.txt │ ├── Welsh.txt │ ├── Kurdish.txt │ ├── Danish.txt │ ├── Esperanto.txt │ └── Italian.txt ├── __init__.py ├── _compat.py ├── paragraph.py └── utils.py ├── .gitignore ├── setup.cfg ├── tasks.py ├── .github └── workflows │ ├── publish-to-pypi.yaml │ ├── codeql-analysis.yml │ ├── run-tests.yml │ └── ossar-analysis.yml ├── LICENSE.rst ├── web_demo ├── style.css └── script.js ├── setup.py ├── CHANGELOG.rst └── README.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include LICENSE.rst 3 | include CHANGELOG.rst 4 | recursive-include justext *.txt 5 | -------------------------------------------------------------------------------- 
/doc/cs_classification_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/miso-belica/jusText/HEAD/doc/cs_classification_example.png -------------------------------------------------------------------------------- /justext/stoplists/Cebuano.txt: -------------------------------------------------------------------------------- 1 | sa 2 | ka 3 | Ang 4 | ug 5 | usa 6 | may 7 | Kini 8 | rehiyon 9 | departamento 10 | Pransiya. 11 | -------------------------------------------------------------------------------- /justext/stoplists/Waray_Waray.txt: -------------------------------------------------------------------------------- 1 | han 2 | ha 3 | An 4 | amo 5 | usa 6 | ka 7 | in 8 | nasod 9 | ngan 10 | rehiyon 11 | nga 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python files 2 | __pycache__ 3 | .venv 4 | *.py[co] 5 | *.egg-info 6 | 7 | # tests 8 | .coverage 9 | .cache 10 | 11 | # project files 12 | .idea 13 | -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | import justext 2 | 3 | 4 | def test_words_should_be_split_by_br_tag(): 5 | paragraphs = justext.justext('abc
def becoming abcdef', justext.get_stoplist("English")) 6 | 7 | assert [p.text for p in paragraphs] == ["abc def becoming abcdef"] 8 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 3.0.2 3 | commit = true 4 | tag = false 5 | 6 | [bumpversion:file:setup.py] 7 | 8 | [bumpversion:file:justext/__init__.py] 9 | 10 | [tool:pytest] 11 | addopts = --quiet --tb=short --color=yes --cov=justext --cov-report=term-missing --no-cov-on-fail 12 | 13 | [bdist_wheel] 14 | universal = 1 15 | -------------------------------------------------------------------------------- /justext/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Copyright (c) 2011 Jan Pomikalek 5 | 6 | This software is licensed as described in the file LICENSE.rst. 7 | """ 8 | 9 | from __future__ import absolute_import 10 | 11 | from .utils import get_stoplists, get_stoplist 12 | from .core import justext 13 | 14 | 15 | __version__ = "3.0.2" 16 | -------------------------------------------------------------------------------- /justext/stoplists/Haitian.txt: -------------------------------------------------------------------------------- 1 | nan 2 | yon 3 | se 4 | ayisyen 5 | vil 6 | lang 7 | ki 8 | lane 9 | Etazini. 10 | Kiba 11 | pou 12 | ak 13 | peyi 14 | eta 15 | moun 16 | Li 17 | gen 18 | an 19 | Popilasyon 20 | pwovens 21 | menm 22 | kò 23 | Ayiti. 24 | rive 25 | li 26 | pale 27 | Kiba. 28 | l. 
29 | dezyèm 30 | Kreyòl 31 | radyo 32 | Relasyon 33 | Kiba, 34 | -------------------------------------------------------------------------------- /justext/stoplists/Ido.txt: -------------------------------------------------------------------------------- 1 | la 2 | esas 3 | di 4 | e 5 | de 6 | La 7 | en 8 | o 9 | mezala 10 | qua 11 | havas 12 | urbo 13 | sub 14 | qui 15 | revenuo 16 | evo 17 | til 18 | por 19 | familii 20 | populo 21 | habitas 22 | Segun 23 | evas 24 | yari 25 | tota 26 | plu 27 | km² 28 | Esas 29 | homuli. 30 | grandeso 31 | hemanari 32 | Po 33 | homini 34 | povreso-lineo. 35 | plus 36 | kun 37 | evoza. 38 | areo 39 | aquo. 40 | esis 41 | mi²) 42 | ye 43 | lojanti 44 | du 45 | altra 46 | kontado 47 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from invoke import task, run 4 | 5 | 6 | @task 7 | def clean(): 8 | run("rm -rf .coverage .cache dist build") 9 | 10 | 11 | @task(clean, default=True) 12 | def test(): 13 | run("py.test") 14 | 15 | 16 | @task(test) 17 | def install(): 18 | run("python setup.py develop") 19 | 20 | 21 | @task(test) 22 | def release(): 23 | run("python setup.py register sdist bdist_wheel") 24 | run("twine upload dist/*") 25 | 26 | 27 | @task(test) 28 | def bump(version="patch"): 29 | run("bumpversion %s" % version) 30 | -------------------------------------------------------------------------------- /justext/stoplists/Bishnupriya_Manipuri.txt: -------------------------------------------------------------------------------- 1 | বারো 2 | উপাত্ত. 
3 | হারহান 4 | সাক্ষরতার 5 | মা 6 | অতার 7 | মানু 8 | জনসংখ্যার 9 | ইউনিয়ন 10 | হান 11 | ইলাতাই 12 | বর্গ 13 | মারির 14 | জনসংখ্যা 15 | মানুলেহা 16 | (লোক 17 | গননা) 18 | গ। 19 | ভৌগলিক 20 | পৌরসভা 21 | মুনি 22 | ইলতাই 23 | জেলা/বেয়াপা 24 | ভারতর 25 | হারি 26 | এহার 27 | অনুসারে 28 | এগত 29 | এহানর 30 | রাজ্যর 31 | আহান। 32 | বাংলাদেশর 33 | এহান 34 | জিলার 35 | ব্রাজিলর 36 | শহর 37 | এরে 38 | আগ। 39 | আসে। 40 | মাপাহানর 41 | থাইতারা। 42 | ২০০০ 43 | দ্রাঘিমাংশ 44 | অক্ষাংশ 45 | আসি। 46 | ইউনিয়নর 47 | বা 48 | ইউনিট 49 | ঘরর 50 | কিলোমিটারে 51 | পানিহান 52 | মাইলে 53 | বসর 54 | পরিসি। 55 | গড় 56 | -------------------------------------------------------------------------------- /justext/stoplists/Piedmontese.txt: -------------------------------------------------------------------------------- 1 | a 2 | ëd 3 | l'é 4 | na 5 | an 6 | e 7 | con 8 | la 9 | dël 10 | ël 11 | A 12 | ant 13 | për 14 | dla 15 | comun-a 16 | La 17 | densità 18 | lenga 19 | che 20 | region 21 | Ël 22 | un 23 | da 24 | fa 25 | abitant, 26 | part 27 | o 28 | le 29 | al 30 | censiment 31 | l'ha 32 | km², 33 | lenghe 34 | së 35 | ab/km². 36 | Lenga 37 | Pais 38 | stend 39 | parlà 40 | parlà. 41 | surfassa 42 | dle 43 | l’é 44 | provincia 45 | dova 46 | fransèisa 47 | dipartiment 48 | scond 49 | ch'a 50 | popolassion 51 | ij 52 | dij 53 | aministrativa 54 | pì 55 | abitant. 56 | confin-a 57 | dzortut 58 | l'era 59 | comun 60 | Aministrassion. 61 | sirca 62 | son 63 | San 64 | sìndich 65 | specialment 66 | n'aira 67 | abitant 68 | as 69 | 'd 70 | fin 71 | sò 72 | -------------------------------------------------------------------------------- /justext/stoplists/Volapuk.txt: -------------------------------------------------------------------------------- 1 | in 2 | e 3 | mö 4 | binon 5 | a 6 | u 7 | äbinon 8 | bäldoti 9 | lifayelas 10 | plu 11 | bäldotü 12 | mens 13 | km². 
14 | zif 15 | patedik 16 | topon 17 | videtü 18 | lunetü 19 | ela 20 | lifayels 21 | N 22 | äbinons 23 | jü 24 | Sürfat 25 | pösods 26 | läs 27 | labon 28 | L. 29 | belödanis 30 | me 31 | yels 32 | Ädabinons 33 | äbinädons 34 | lomanefs 35 | Demü 36 | voms 37 | mans 38 | ädabinoms 39 | Lemesed 40 | bidädas 41 | topäd: 42 | utanas 43 | ziläk: 44 | Fransän. 45 | tat: 46 | no 47 | Nüns 48 | taledavik. 49 | km² 50 | Lamerikän. 51 | Lödanef. 52 | nen 53 | valodik 54 | Lödanadensit 55 | lödanefa 56 | komot: 57 | el 58 | fa 59 | De 60 | bal 61 | älödons 62 | yela: 63 | Timü 64 | Ma 65 | pöpinumam 66 | sürfati 67 | Census 68 | älabons 69 | Lamerikänik), 70 | (Pöpinumamabür 71 | "U.S. 72 | Bureau" 73 | äbinädon 74 | pösod 75 | Lödanef 76 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: '3.x' 17 | 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install setuptools build wheel twine 22 | 23 | # https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html 24 | - name: Build and publish 25 | env: 26 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 27 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 28 | run: | 29 | python -m build 30 | twine upload dist/* 31 | -------------------------------------------------------------------------------- /justext/stoplists/Newar.txt: -------------------------------------------------------------------------------- 1 | थ्व 2 | व 3 | दु। 4 | थाय्‌ 5 | ख। 6 | खः। 7 | खने 8 | नं 9 | भारतया 10 | कथं 11 | छ्येलिगु 12 | निसें 13 | भाषा 14 | थासय् 15 | राज्यया 16 | छगू 17 | छगु 18 
| तक्क 19 | थन 20 | धुंका 21 | थाय्‌या 22 | उत्तराखण्ड 23 | दूगु 24 | दु 25 | यक्व 26 | सन् 27 | कुमाँउ 28 | छ्येलेज्या 29 | पहाडी 30 | गां 31 | प्रभाव 32 | भाषाया 33 | लिपा 34 | नोभेम्बर 35 | या। 36 | मू 37 | जुइ। 38 | ला। 39 | भाय् 40 | थाय् 41 | संकिपा 42 | मण्डलया 43 | संस्कृत 44 | जुगु 45 | हिन्दू 46 | आदि 47 | कथलं 48 | जुल। 49 | नेपाःया 50 | रुपय् 51 | खँग्वयागु 52 | ब्रिटिसतेसं 53 | ॠतु 54 | भाषे 55 | गढवाली 56 | वर्ग 57 | धर्मया 58 | मध्य 59 | थःगु 60 | नापं 61 | जनपदया 62 | पलिस्था 63 | थासय्‌ 64 | वर्णन 65 | अंग्रेजी, 66 | भाषा. 67 | मण्डलवार 68 | जुन 69 | परिवारया 70 | ज्यानुवरी 71 | उत्तर 72 | लावन। 73 | तःधंगु 74 | संस्कृतय् 75 | मेमेगु 76 | भारोपेली 77 | थासे 78 | छत्तीसगढ 79 | भूगोल. 80 | धर्म 81 | थाय्‌यात 82 | the 83 | इतिहास. 84 | नाप 85 | हिन्दी, 86 | स्वापू 87 | जूगुलिं 88 | ग्रन्थय् 89 | गढवाल 90 | -------------------------------------------------------------------------------- /justext/_compat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division, print_function, unicode_literals 5 | 6 | from sys import version_info 7 | 8 | 9 | PY3 = version_info[0] == 3 10 | 11 | 12 | if PY3: 13 | bytes = bytes 14 | unicode = str 15 | else: 16 | bytes = str 17 | unicode = unicode 18 | string_types = (bytes, unicode,) 19 | 20 | 21 | if PY3: 22 | import urllib.request as urllib 23 | from urllib.error import URLError 24 | else: 25 | import urllib2 as urllib 26 | URLError = urllib.URLError 27 | 28 | 29 | try: 30 | from contextlib import ignored 31 | except ImportError: 32 | from contextlib import contextmanager 33 | 34 | @contextmanager 35 | def ignored(*exceptions): 36 | try: 37 | yield 38 | except tuple(exceptions): 39 | pass 40 | 41 | 42 | # note that cgi is depecrated and removed since 3.8 43 | try: 44 | from html import escape 45 | except ImportError: 46 | from cgi import escape 47 | 
-------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "Code scanning - action" 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | - cron: '0 17 * * 1' 8 | 9 | jobs: 10 | CodeQL-Build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v2 17 | with: 18 | # We must fetch at least the immediate parents so that if this is 19 | # a pull request then we can checkout the head. 20 | fetch-depth: 2 21 | 22 | # If this run was triggered by a pull request event, then checkout 23 | # the head of the pull request instead of the merge commit. 24 | - run: git checkout HEAD^2 25 | if: ${{ github.event_name == 'pull_request' }} 26 | 27 | # Initializes the CodeQL tools for scanning. 28 | - name: Initialize CodeQL 29 | uses: github/codeql-action/init@v2 30 | # Override language selection by uncommenting this and choosing your languages 31 | with: 32 | languages: python 33 | 34 | - name: Perform CodeQL Analysis 35 | uses: github/codeql-action/analyze@v2 36 | -------------------------------------------------------------------------------- /.github/workflows/run-tests.yml: -------------------------------------------------------------------------------- 1 | name: Run tests 2 | on: 3 | workflow_dispatch: 4 | pull_request: 5 | branches: 6 | - "main" 7 | 8 | jobs: 9 | tests: 10 | runs-on: ubuntu-latest 11 | timeout-minutes: 15 12 | strategy: 13 | matrix: 14 | # https://dev.to/misobelica/python-features-by-version-3318 15 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: "Install dependencies (compatible)" 25 | run: | 26 | python -m pip install 
--upgrade pip 27 | # pytest 4.6.X works best for older Python versions 28 | pip install -U "lxml[html_clean]" "pytest==4.6.9" codecov 'coverage==4.5.4' pytest-cov 29 | if: ${{ matrix.python-version == '2.7' }} 30 | 31 | - name: "Install dependencies (up to date)" 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install -U "lxml[html_clean]" pytest codecov coverage pytest-cov 35 | if: ${{ matrix.python-version != '2.7' }} 36 | 37 | - run: py.test tests 38 | env: 39 | CI: 1 40 | PYTHONDONTWRITEBYTECODE: 1 41 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011, Jan Pomikalek 2 | Copyright (c) 2013, Michal Belica 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, 7 | are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 22 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /justext/stoplists/Lombard.txt: -------------------------------------------------------------------------------- 1 | de 2 | la 3 | del 4 | e 5 | l'è 6 | a 7 | in 8 | El 9 | al 10 | 'na 11 | cümü 12 | el 13 | i 14 | che 15 | cumün 16 | da 17 | di 18 | una 19 | 'n 20 | La 21 | en 22 | km² 23 | se 24 | un 25 | g'ha 26 | 'l 27 | per 28 | densità 29 | regiù 30 | coi 31 | cunfìna 32 | ab./km². 33 | presapóch 34 | superfìce 35 | italià, 36 | Pruvìncia 37 | abitàncc, 38 | San 39 | sura 40 | ai 41 | sò 42 | süperfiss 43 | è 44 | Al 45 | Canton 46 | abitant. 47 | tröva 48 | part 49 | tacaa 50 | dal 51 | anca 52 | le 53 | meter 54 | (en 55 | livel 56 | altitüden 57 | svizzer 58 | an 59 | l'ha 60 | I 61 | gh'eva 62 | o 63 | In 64 | distret 65 | gh'ha 66 | mar 67 | ol 68 | teritori 69 | staa 70 | popolazion 71 | fà 72 | cità 73 | on 74 | mar. 
75 | Un 76 | l'era 77 | A 78 | km², 79 | con 80 | tróa 81 | l’è 82 | stat 83 | ma 84 | gh'è 85 | cun 86 | abitant 87 | ul 88 | prima 89 | dei 90 | par 91 | cunt 92 | col 93 | L'è 94 | piö 95 | hinn 96 | méter 97 | popolasiù 98 | distrèt 99 | püssee 100 | l' 101 | nom 102 | pressapoch 103 | comun 104 | teretóre 105 | Pruvincia 106 | lengua 107 | gh'ìa 108 | sü 109 | é 110 | süperfìce 111 | ü 112 | sa 113 | paes 114 | ona 115 | ghe 116 | minga 117 | mia 118 | sö 119 | Piemónt, 120 | piemontés: 121 | Baden-Württemberg, 122 | magioranza 123 | abitàncc. 124 | leèl 125 | dì 126 | ("Regierungsbezirk") 127 | rüràl 128 | -------------------------------------------------------------------------------- /.github/workflows/ossar-analysis.yml: -------------------------------------------------------------------------------- 1 | # This workflow integrates a collection of open source static analysis tools 2 | # with GitHub code scanning. For documentation, or to provide feedback, visit 3 | # https://github.com/github/ossar-action 4 | name: OSSAR 5 | 6 | on: 7 | push: 8 | pull_request: 9 | 10 | jobs: 11 | OSSAR-Scan: 12 | # OSSAR runs on windows-latest. 13 | # ubuntu-latest and macos-latest support coming soon 14 | runs-on: windows-latest 15 | 16 | steps: 17 | # Checkout your code repository to scan 18 | - name: Checkout repository 19 | uses: actions/checkout@v3 20 | with: 21 | # We must fetch at least the immediate parents so that if this is 22 | # a pull request then we can checkout the head. 23 | fetch-depth: 2 24 | 25 | # If this run was triggered by a pull request event, then checkout 26 | # the head of the pull request instead of the merge commit. 
27 | - run: git checkout HEAD^2 28 | if: ${{ github.event_name == 'pull_request' }} 29 | 30 | # Install dotnet, used by OSSAR 31 | - name: Install .NET 32 | uses: actions/setup-dotnet@v3 33 | with: 34 | dotnet-version: '3.1.201' 35 | 36 | # Run open source static analysis tools 37 | - name: Run OSSAR 38 | uses: github/ossar-action@v1 39 | id: ossar 40 | 41 | # Upload results to the Security tab 42 | - name: Upload OSSAR results 43 | uses: github/codeql-action/upload-sarif@v1 44 | with: 45 | sarif_file: ${{ steps.ossar.outputs.sarifFile }} 46 | -------------------------------------------------------------------------------- /justext/stoplists/Igbo.txt: -------------------------------------------------------------------------------- 1 | na 2 | nke 3 | bu 4 | ubochi 5 | afo 6 | the 7 | ndi 8 | mgbe 9 | of 10 | ya 11 | a 12 | Na 13 | nwere 14 | ihe 15 | ana-kpo 16 | onwa 17 | to 18 | and 19 | onye 20 | agugu 21 | di 22 | "Gregorian" 23 | kpo 24 | agwu. 25 | no 26 | Anyi 27 | calender, 28 | du 29 | ha 30 | ana 31 | in 32 | "leap 33 | ma 34 | is 35 | Önwa 36 | o 37 | mbu 38 | otu 39 | ala 40 | O 41 | year") 42 | obodo 43 | ("iri 44 | as 45 | May 46 | ka 47 | iri 48 | afo. 49 | madu 50 | The 51 | bụ 52 | that 53 | are 54 | ahu 55 | on 56 | n'ime 57 | April 58 | February 59 | Wikipedia 60 | abuo 61 | by 62 | for 63 | March 64 | ukwu 65 | July 66 | year". 67 | maka 68 | Í 69 | state 70 | be 71 | ("non-leap 72 | ebe 73 | with 74 | Ndi 75 | or 76 | ama 77 | ọ 78 | ne 79 | Amerika. 80 | bukwa 81 | Igbo 82 | mádu 83 | ya. 84 | ya, 85 | United 86 | si 87 | ȯ 88 | nor 89 | e 90 | afọr 91 | was 92 | onu 93 | January 94 | ike 95 | ruru 96 | States 97 | Naigeria. 98 | ishií 99 | anyi 100 | has 101 | mmadu 102 | June 103 | mere 104 | not 105 | tupu 106 | an 107 | wiki 108 | which 109 | nile, 110 | Ha 111 | uwa 112 | bi 113 | ébé 114 | bula 115 | ime 116 | nile. 
117 | íshí 118 | nde 119 | àlà 120 | n'etiti 121 | anyanwu 122 | dị 123 | shí 124 | such 125 | can 126 | Mgbe 127 | nà 128 | State 129 | ó 130 | kwa 131 | its 132 | Ihe 133 | it 134 | buru 135 | have 136 | aha 137 | A 138 | ato 139 | iche 140 | nile 141 | Naigeria 142 | abụo 143 | year"), 144 | nọr 145 | he 146 | màkà 147 | calendar, 148 | egwu 149 | Obu 150 | -------------------------------------------------------------------------------- /justext/stoplists/Aragonese.txt: -------------------------------------------------------------------------------- 1 | de 2 | en 3 | a 4 | y 5 | o 6 | ye 7 | que 8 | una 9 | d'a 10 | con 11 | d'o 12 | un 13 | A 14 | os 15 | suya 16 | población 17 | la 18 | se 19 | por 20 | lo 21 | e 22 | as 23 | como 24 | u 25 | d'os 26 | En 27 | O 28 | superficie 29 | provincia 30 | densidat 31 | hab/km². 32 | habitants 33 | ta 34 | municipio 35 | suyo 36 | no 37 | d'as 38 | km² 39 | situato 40 | son 41 | comarca 42 | estió 43 | los 44 | francés 45 | más 46 | parte 47 | tamién 48 | yera 49 | Os 50 | capital 51 | ha 52 | dende 53 | entre 54 | dica 55 | sobre 56 | La 57 | rechión 58 | Cheografía. 59 | km², 60 | sieglo 61 | comuna 62 | las 63 | metros 64 | situata 65 | ran 66 | Ye 67 | km 68 | termin 69 | dos 70 | d'altaria 71 | pero 72 | ciudat 73 | i 74 | departamento 75 | bi 76 | encara 77 | nombre 78 | (en 79 | més 80 | localidat 81 | destrito 82 | on 83 | fa 84 | dimpués 85 | lugar 86 | cantón 87 | del 88 | quan 89 | aragonés 90 | río 91 | troba 92 | per 93 | comunidat 94 | Sant 95 | naixito 96 | As 97 | distancia 98 | partito 99 | autonoma 100 | oficialment) 101 | suyos 102 | d'un 103 | muga 104 | occitana 105 | mar, 106 | d'una 107 | estar 108 | gran 109 | fue 110 | luenga 111 | suyas 112 | l'anyo 113 | mientres 114 | norte 115 | catalán 116 | ya 117 | Se 118 | municipal 119 | fan 120 | pa 121 | tien 122 | anyos 123 | sud 124 | Espanya. 
125 | puet 126 | Lo 127 | forma 128 | Castiella 129 | Imperio 130 | chudicial 131 | estato 132 | muerto 133 | San 134 | occitán, 135 | rei 136 | castellán) 137 | parti 138 | Guerra 139 | yeran 140 | I 141 | enta 142 | mesmo 143 | baixo 144 | contra 145 | Historia. 146 | tot 147 | primera 148 | Tamién 149 | Santa 150 | mas 151 | ziudat 152 | lugars 153 | Meyodía-Perineus. 154 | -------------------------------------------------------------------------------- /justext/stoplists/Tagalog.txt: -------------------------------------------------------------------------------- 1 | ng 2 | sa 3 | na 4 | ay 5 | ang 6 | mga 7 | isang 8 | at 9 | Ang 10 | o 11 | Si 12 | noong 13 | bansang 14 | ni 15 | ito 16 | may 17 | si 18 | bilang 19 | lalawigan 20 | siya 21 | artista 22 | mula 23 | kanyang 24 | hindi 25 | para 26 | Sa 27 | Pilipinas. 28 | Hapon. 29 | Italya. 30 | kung 31 | comune 32 | rin 33 | niya 34 | din 35 | bayan 36 | Timog 37 | taon 38 | lungsod 39 | of 40 | upang 41 | Ito 42 | Mga 43 | naging 44 | kanilang 45 | nito 46 | hanggang 47 | dahil 48 | isa 49 | unang 50 | Korea. 51 | the 52 | nang 53 | pa 54 | pang 55 | lamang 56 | palabas 57 | Noong 58 | saan 59 | taong 60 | San 61 | nasa 62 | ibang 63 | Lungsod 64 | telebisyon 65 | bahagi 66 | pamamagitan 67 | itong 68 | uri 69 | kay 70 | Estados 71 | dalawang 72 | mas 73 | siyang 74 | naman 75 | Ayon 76 | tao 77 | Pilipinas 78 | iba 79 | Siya 80 | de 81 | and 82 | sila 83 | kasama 84 | ilang 85 | populasyon 86 | loob 87 | panahon 88 | lahat 89 | karakter 90 | planetang 91 | matatagpuan 92 | kathang-isip 93 | ngunit 94 | pangunahin. 95 | nahahati 96 | (Ingles: 97 | kalendaryong 98 | ginagamit 99 | Comics. 100 | kilala 101 | Bayan 102 | tinatawag 103 | maraming 104 | barangay. 105 | Gregorian. 106 | maging 107 | walang 108 | niyang 109 | pangunahing 110 | katulad 111 | araw 112 | buong 113 | senso 114 | Barangay. 115 | kilalang 116 | wikang 117 | kasalukuyang 118 | May 119 | kaniyang 120 | kabahayan. 
121 | klaseng 122 | nila 123 | Isa 124 | tulad 125 | pangalan 126 | dating 127 | bansa 128 | Isang 129 | mang-aawit 130 | habang 131 | salitang 132 | katao 133 | maaaring 134 | Unidos. 135 | in 136 | nito. 137 | ito. 138 | sining 139 | no 140 | bago 141 | anak 142 | ilalim 143 | tatlong 144 | politiko 145 | itinatag 146 | bagong 147 | dito 148 | kalendaryo. 149 | -------------------------------------------------------------------------------- /justext/stoplists/Walloon.txt: -------------------------------------------------------------------------------- 1 | l' 2 | di 3 | a 4 | d' 5 | des 6 | les 7 | est 8 | on 9 | et 10 | e 11 | li 12 | do 13 | ene 14 | ki 15 | k' 16 | del 17 | c' 18 | Li 19 | dins 20 | avou 21 | po 22 | s' 23 | i 24 | å 25 | so 26 | si 27 | anêyes 28 | Les 29 | pus 30 | n' 31 | eyet 32 | eto 33 | pa 34 | nén 35 | gn 36 | come 37 | stî 38 | el 39 | la 40 | walon 41 | al 42 | da 43 | esteut 44 | u 45 | ont 46 | sol 47 | C' 48 | èn 49 | Il 50 | On 51 | I 52 | sont 53 | il 54 | fwait 55 | mins 56 | tot 57 | anêye 58 | sieke 59 | ås 60 | ptit 61 | L' 62 | fé 63 | Walonreye, 64 | E 65 | aveut 66 | walon. 67 | ey 68 | co 69 | deus 70 | pout 71 | no 72 | bråmint 73 | djoûs 74 | payis 75 | minme 76 | ses 77 | va 78 | mots 79 | mot 80 | de 81 | walon, 82 | esse 83 | bén 84 | ans 85 | djins 86 | çou 87 | foirt 88 | disk' 89 | province 90 | Mins 91 | ancyin 92 | ban 93 | calindrî 94 | après 95 | î 96 | sovint 97 | Gn 98 | dit 99 | ni 100 | ôtes 101 | djoû 102 | ou 103 | inte 104 | vey 105 | sacwants 106 | fén 107 | ele 108 | pol 109 | ci 110 | tins 111 | Ene 112 | decimbe 113 | rebané 114 | tos 115 | comene 116 | prumî 117 | Walonreye 118 | lingaedje 119 | kel 120 | fourit 121 | l’ 122 | Dins 123 | cwand 124 | on-z 125 | A 126 | nos 127 | ls 128 | en 129 | lome 130 | "Po 131 | (li 132 | live 133 | vîs 134 | onk 135 | scrît 136 | motî." 
137 | vî 138 | francès 139 | Ele 140 | todi 141 | moes 142 | (on 143 | metou 144 | Po 145 | sins 146 | motî 147 | Elle 148 | passé 149 | ça 150 | skepyî 151 | 19inme 152 | cawete 153 | ôte 154 | 20inme 155 | awousse, 156 | pådje 157 | may, 158 | leu 159 | Si 160 | djulete, 161 | måss, 162 | estént 163 | racsegnes 164 | divant 165 | octôbe, 166 | mete 167 | djanvî, 168 | setimbe, 169 | vént 170 | mwaisse 171 | avri, 172 | splitchant 173 | a-st 174 | etimolodjeye 175 | (fr: 176 | fevrî, 177 | po-z 178 | nôvimbe, 179 | -------------------------------------------------------------------------------- /justext/paragraph.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division, print_function, unicode_literals 5 | 6 | import re 7 | 8 | from .utils import normalize_whitespace 9 | 10 | 11 | HEADINGS_PATTERN = re.compile(r"\bh\d\b") 12 | 13 | 14 | class Paragraph(object): 15 | """Object representing one block of text in HTML.""" 16 | def __init__(self, path): 17 | self.dom_path = path.dom 18 | self.xpath = path.xpath 19 | self.text_nodes = [] 20 | self.chars_count_in_links = 0 21 | self.tags_count = 0 22 | self.class_type = "" # short | neargood | good | bad 23 | 24 | @property 25 | def is_heading(self): 26 | return bool(HEADINGS_PATTERN.search(self.dom_path)) 27 | 28 | @property 29 | def is_boilerplate(self): 30 | return self.class_type != "good" 31 | 32 | @property 33 | def text(self): 34 | text = "".join(self.text_nodes) 35 | return normalize_whitespace(text.strip()) 36 | 37 | def __len__(self): 38 | return len(self.text) 39 | 40 | @property 41 | def words_count(self): 42 | return len(self.text.split()) 43 | 44 | def contains_text(self): 45 | return bool(self.text_nodes) 46 | 47 | def append_text(self, text): 48 | text = normalize_whitespace(text) 49 | self.text_nodes.append(text) 50 | return text 51 | 52 | def stopwords_count(self, 
stopwords): 53 | return sum(word.lower() in stopwords for word in self.text.split()) 54 | 55 | def stopwords_density(self, stopwords): 56 | if self.words_count == 0: 57 | return 0 58 | 59 | return self.stopwords_count(stopwords) / self.words_count 60 | 61 | def links_density(self): 62 | text_length = len(self.text) 63 | if text_length == 0: 64 | return 0 65 | 66 | return self.chars_count_in_links / text_length 67 | -------------------------------------------------------------------------------- /justext/stoplists/Low_Saxon.txt: -------------------------------------------------------------------------------- 1 | de 2 | vun 3 | un 4 | dat 5 | in 6 | is 7 | en 8 | De 9 | den 10 | hett 11 | as 12 | to 13 | mit 14 | hebbt 15 | he 16 | ok 17 | sünd 18 | weer 19 | sik 20 | ut 21 | Inwahners 22 | op 23 | Dat 24 | an 25 | se 26 | Minschen 27 | bi 28 | ’n 29 | för 30 | noch 31 | sien 32 | in’n 33 | oder 34 | nich 35 | warrt 36 | worrn. 37 | Vun 38 | In 39 | annere 40 | een 41 | denn 42 | He 43 | Lüüd 44 | na 45 | so 46 | Kreis 47 | dor 48 | Afsluss 49 | Zensus 50 | Nakamen 51 | ünner 52 | wat 53 | As 54 | bit 55 | man 56 | up 57 | to’n 58 | dör 59 | Rebeet 60 | is. 61 | weer, 62 | County 63 | an’n 64 | aver 65 | gifft 66 | Johr 67 | Se 68 | km². 69 | buten 70 | Spraak 71 | (Stand 72 | över 73 | Sitt 74 | Verwalten 75 | van 76 | maakt. 77 | USA 78 | harrn 79 | Historie. 80 | hebbt. 81 | vun’n 82 | Demografie. 83 | üm 84 | in’t 85 | bi’n 86 | snackt 87 | mehr 88 | her 89 | grünnt 90 | maakt, 91 | Tiet 92 | leven 93 | ene 94 | hele 95 | boren 96 | stammt 97 | Kommun 98 | Gruppen 99 | Kopp 100 | harr 101 | ünnerscheedliche 102 | Naam 103 | je 104 | ansehn. 105 | Stadt 106 | höger 107 | an’e 108 | Engelsch 109 | sülven 110 | Inwahnertall 111 | angeven, 112 | wedder 113 | vun’t 114 | US-Bundsstaat 115 | Asien 116 | Dor 117 | Arbeit. 118 | Vöröllern 119 | kann 120 | Inkamen 121 | tohuus 122 | torekent. 123 | Universität, 124 | Highschool 125 | blots 126 | Indianers. 
127 | Witte, 128 | Vörfohren 129 | US-Dollar. 130 | Latienamerika 131 | Armoot. 132 | Bachelor-Afsluss 133 | Afstammung 134 | Inkamensgrenz 135 | Eilannen 136 | von 137 | vör 138 | wurrn. 139 | Swarte, 140 | ehr 141 | (County) 142 | al 143 | wurrn 144 | Johren 145 | is, 146 | hett. 147 | disse 148 | liggt 149 | all 150 | em 151 | weern 152 | Pazifik. 153 | worrn 154 | twee 155 | Deel 156 | En 157 | Bi 158 | nu 159 | nah 160 | wurr 161 | af 162 | wenn 163 | veel 164 | maakt 165 | siene 166 | aber 167 | und 168 | warrt. 169 | Land 170 | wo 171 | sünd. 172 | eerste 173 | -------------------------------------------------------------------------------- /justext/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division, print_function, unicode_literals 5 | 6 | import re 7 | import os 8 | import sys 9 | import pkgutil 10 | 11 | MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE) 12 | 13 | 14 | def normalize_whitespace(text): 15 | """ 16 | Translates multiple whitespace into single space character. 17 | If there is at least one new line character chunk is replaced 18 | by single LF (Unix new line) character. 19 | """ 20 | return MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text) 21 | 22 | 23 | def _replace_whitespace(match): 24 | """Normalize all spacing characters that aren't a newline to a space.""" 25 | text = match.group() 26 | return "\n" if "\n" in text or "\r" in text else " " 27 | 28 | 29 | def is_blank(string): 30 | """ 31 | Returns `True` if string contains only white-space characters 32 | or is empty. Otherwise `False` is returned. 
33 | """ 34 | return not string or string.isspace() 35 | 36 | 37 | def get_stoplists(): 38 | """Returns a collection of built-in stop-lists.""" 39 | path_to_stoplists = os.path.dirname(sys.modules["justext"].__file__) 40 | path_to_stoplists = os.path.join(path_to_stoplists, "stoplists") 41 | 42 | stoplist_names = [] 43 | for filename in os.listdir(path_to_stoplists): 44 | name, extension = os.path.splitext(filename) 45 | if extension == ".txt": 46 | stoplist_names.append(name) 47 | 48 | return frozenset(stoplist_names) 49 | 50 | 51 | def get_stoplist(language): 52 | """Returns a built-in stop-list for the language as a set of words.""" 53 | file_path = os.path.join("stoplists", "%s.txt" % language) 54 | try: 55 | stopwords = pkgutil.get_data("justext", file_path) 56 | except IOError: 57 | raise ValueError( 58 | "Stoplist for language '%s' is missing. " 59 | "Please use function 'get_stoplists' for complete list of stoplists " 60 | "and feel free to contribute by your own stoplist." % language 61 | ) 62 | 63 | return frozenset(w.decode("utf8").lower() for w in stopwords.splitlines()) 64 | -------------------------------------------------------------------------------- /web_demo/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: Trebuchet MS, Helvetica, Verdana, Arial, sans-serif; 3 | } 4 | 5 | h1 { 6 | margin-left: 10px; 7 | margin-bottom: 0.2em; 8 | } 9 | 10 | div.description { 11 | margin-left: 10px; 12 | margin-bottom: 0.7em; 13 | } 14 | 15 | p.heading { 16 | font-size: 120%; 17 | font-weight: bold; 18 | } 19 | 20 | p.good, p.heading, p.bad { 21 | font-family: Times New Roman, serif; 22 | width: 550px; 23 | text-align: justify; 24 | margin-top: 2px; 25 | margin-bottom: 2px; 26 | padding-left: 3px; 27 | padding-right: 3px; 28 | } 29 | 30 | p.good, p.heading { 31 | background-color: #cfc; 32 | } 33 | 34 | p.bad { 35 | /* color: #f00;*/ 36 | background-color: #fdd; 37 | font-size: 70%; 38 | } 39 
| 40 | div#output_wrapper { 41 | width: 800px; 42 | } 43 | 44 | div.paragraph_details { 45 | width: 240px; 46 | float: right; 47 | display: none; 48 | } 49 | 50 | div.paragraph_details table { 51 | font-size: 75%; 52 | border-collapse: collapse; 53 | } 54 | 55 | div.paragraph_details table td { 56 | padding-left: 2px; 57 | padding-right: 2px; 58 | border: 1px solid #fff; 59 | } 60 | 61 | div.paragraph_details.good table, 62 | div.paragraph_details.heading table { 63 | border: 1px solid #9f9; 64 | } 65 | 66 | div.paragraph_details.bad table { 67 | border: 1px solid #faa; 68 | } 69 | 70 | div.paragraph_details.good table tr, 71 | div.paragraph_details.heading table tr { 72 | background-color: #9f9; 73 | } 74 | 75 | div.paragraph_details.bad table tr { 76 | background-color: #faa; 77 | } 78 | 79 | span.stopword { 80 | text-decoration: underline; 81 | } 82 | 83 | fieldset { 84 | width: 528px; 85 | background-color: #eee; 86 | border: 1px solid black; 87 | } 88 | 89 | div#error { 90 | font-weight: bold; 91 | color: #a00; 92 | margin-bottom: .3em; 93 | } 94 | 95 | div#basic_options td.label { 96 | width: 80px; 97 | } 98 | 99 | div#basic_options input.wide { 100 | width: 438px; 101 | } 102 | 103 | div#advanced_options { 104 | display: none; 105 | } 106 | 107 | input#submit { 108 | float: right; 109 | } 110 | 111 | div#footer { 112 | width: 558px; 113 | } 114 | -------------------------------------------------------------------------------- /justext/stoplists/Irish.txt: -------------------------------------------------------------------------------- 1 | an 2 | a 3 | na 4 | agus 5 | ar 6 | i 7 | ag 8 | go 9 | le 10 | sé 11 | sa 12 | é 13 | is 14 | bhí 15 | de 16 | Is 17 | mar 18 | in 19 | leis 20 | ina 21 | bhliain 22 | Tá 23 | ach 24 | raibh 25 | ó 26 | nó 27 | as 28 | Bhí 29 | atá 30 | den 31 | seo 32 | san 33 | í 34 | chuid 35 | siad 36 | sin, 37 | sin 38 | do 39 | chun 40 | ná 41 | faoi 42 | iad 43 | tá 44 | ann 45 | gur 46 | ea 47 | bhfuil 48 | Ó 49 | eile 50 | ba 51 | 
An 52 | nuair 53 | ón 54 | don 55 | dá 56 | sna 57 | chéad 58 | nach 59 | idir 60 | ní 61 | féin 62 | tháinig 63 | dtí 64 | amach 65 | Sa 66 | mó 67 | sí 68 | chomh 69 | I 70 | lá 71 | duine 72 | aon 73 | Ba 74 | níos 75 | chur 76 | suite 77 | mór 78 | air 79 | chuir 80 | acu 81 | áit 82 | Ní 83 | á 84 | réir 85 | rud 86 | éis 87 | aige 88 | Mar 89 | daoine 90 | ndiaidh 91 | teanga 92 | trí 93 | féidir 94 | dhá 95 | cuid 96 | chuaigh 97 | Átha 98 | maith 99 | áfach, 100 | seo, 101 | lucht 102 | dul 103 | féin, 104 | gach 105 | isteach 106 | ainm 107 | Mac 108 | nua 109 | am 110 | Nuair 111 | tar 112 | fuair 113 | ab 114 | chéile 115 | amháin 116 | bliana 117 | Ar 118 | ceann 119 | dhiaidh 120 | é. 121 | bheith 122 | rinne 123 | ann. 124 | aghaidh 125 | gan 126 | lena 127 | dhéanamh 128 | Baile 129 | linn 130 | taobh 131 | cur 132 | thug 133 | faoin 134 | hÉireann 135 | blianta 136 | cuireadh 137 | timpeall 138 | tír 139 | sin. 140 | Gaeilge 141 | fáil 142 | dó 143 | deireadh 144 | ann, 145 | lán 146 | cé 147 | teacht 148 | thabhairt 149 | thosaigh 150 | seo. 151 | Tháinig 152 | gContae 153 | nós 154 | úsáid 155 | gcuid 156 | obair 157 | bás 158 | Aontaithe 159 | measc 160 | bhain 161 | roinnt 162 | féin. 163 | roimh 164 | chathair 165 | mhór 166 | rith 167 | orthu 168 | cosúil 169 | De 170 | bhaint 171 | rialtas 172 | rugadh 173 | leabhar 174 | níor 175 | é, 176 | beag 177 | of 178 | mbliain 179 | leith 180 | Na 181 | Fuair 182 | eile, 183 | gcónaí 184 | Rugadh 185 | eile. 186 | síos 187 | Éirinn 188 | Stáit 189 | feadh 190 | thar 191 | dara 192 | bliain 193 | scríobh 194 | mbeadh 195 | air. 196 | baint 197 | mhó 198 | háirithe 199 | Fómhair 200 | Éireannach 201 | Cé 202 | teangacha 203 | fear 204 | t-ainm 205 | fud 206 | Cliath 207 | d'éirigh 208 | freisin. 
209 | scéal 210 | lár 211 | bhíonn 212 | uair 213 | the 214 | haois 215 | Le 216 | John 217 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Copyright (c) 2011 Jan Pomikalek 5 | 6 | This software is licensed as described in the file LICENSE.rst. 7 | """ 8 | 9 | from __future__ import with_statement 10 | 11 | try: 12 | from setuptools import setup 13 | except ImportError: 14 | from distutils.core import setup 15 | 16 | 17 | with open("README.rst") as readme: 18 | with open("CHANGELOG.rst") as changelog: 19 | long_description = readme.read() + "\n\n" + changelog.read() 20 | 21 | 22 | setup( 23 | name="jusText", 24 | version="3.0.2", 25 | description="Heuristic based boilerplate removal tool", 26 | long_description=long_description, 27 | author="Jan Pomikálek", 28 | author_email="jan.pomikalek@gmail.com", 29 | maintainer="Michal Belica", 30 | maintainer_email="miso.belica@gmail.com", 31 | url="https://github.com/miso-belica/jusText", 32 | license="The BSD 2-Clause License", 33 | install_requires=[ 34 | 'lxml[html_clean] >= 4.4.2', 35 | 'backports.functools-lru-cache; python_version < "3.2"' 36 | ], 37 | tests_require=[ 38 | "pytest", 39 | "pytest-cov", 40 | "coverage", 41 | ], 42 | packages=["justext"], 43 | package_data={"justext": ["stoplists/*.txt"]}, 44 | classifiers=( 45 | "Development Status :: 5 - Production/Stable", 46 | "Intended Audience :: Developers", 47 | "Natural Language :: English", 48 | "License :: OSI Approved :: BSD License", 49 | "Operating System :: OS Independent", 50 | "Programming Language :: Python", 51 | "Programming Language :: Python :: 2", 52 | "Programming Language :: Python :: 2.7", 53 | "Programming Language :: Python :: 3", 54 | "Programming Language :: Python :: 3.5", 55 | "Programming Language :: Python :: 3.6", 56 | "Programming Language :: Python :: 3.7", 57 | 
"Programming Language :: Python :: 3.8", 58 | "Programming Language :: Python :: 3.9", 59 | "Programming Language :: Python :: 3.10", 60 | "Programming Language :: Python :: Implementation :: CPython", 61 | "Topic :: Internet :: WWW/HTTP", 62 | "Topic :: Software Development :: Pre-processors", 63 | "Topic :: Text Processing :: Filters", 64 | "Topic :: Text Processing :: Markup :: HTML", 65 | ), 66 | ) 67 | -------------------------------------------------------------------------------- /justext/stoplists/Afrikaans.txt: -------------------------------------------------------------------------------- 1 | die 2 | van 3 | en 4 | in 5 | 'n 6 | is 7 | het 8 | Die 9 | wat 10 | met 11 | as 12 | te 13 | op 14 | word 15 | sy 16 | om 17 | was 18 | deur 19 | vir 20 | se 21 | aan 22 | ook 23 | tot 24 | na 25 | In 26 | dat 27 | hy 28 | of 29 | dit 30 | nie 31 | hulle 32 | het. 33 | uit 34 | by 35 | maar 36 | kan 37 | soos 38 | Dit 39 | eerste 40 | is. 41 | oor 42 | ander 43 | Hy 44 | een 45 | ’n 46 | gebruik 47 | nie. 48 | meer 49 | tussen 50 | jaar 51 | word. 52 | onder 53 | hierdie 54 | daar 55 | twee 56 | het, 57 | teen 58 | groot 59 | baie 60 | is, 61 | stad 62 | begin 63 | hul 64 | waar 65 | toe 66 | egter 67 | sou 68 | bekend 69 | nuwe 70 | nog 71 | eeu 72 | haar 73 | grootste 74 | hom 75 | Sy 76 | deel 77 | gebied 78 | naam 79 | ‘n 80 | Daar 81 | nie, 82 | tydens 83 | Duitse 84 | later 85 | al 86 | Hierdie 87 | de 88 | veral 89 | land 90 | Op 91 | net 92 | sowat 93 | sal 94 | bestaan 95 | terwyl 96 | kon 97 | mense 98 | Franse 99 | voor 100 | weer 101 | dié 102 | laat 103 | gevolg 104 | aantal 105 | drie 106 | bevolking 107 | slegs 108 | word, 109 | steeds 110 | Hulle 111 | saam 112 | keer 113 | Suid-Afrika 114 | maak 115 | verskeie 116 | so 117 | vorm 118 | meeste 119 | Verenigde 120 | vanaf 121 | moet 122 | klein 123 | alle 124 | Britse 125 | tweede 126 | koning 127 | dorp 128 | kom 129 | reeds 130 | was. 
131 | dikwels 132 | gemaak 133 | geen 134 | belangrike 135 | ten 136 | tyd 137 | hoofstad 138 | taal 139 | werk 140 | asook 141 | Pous 142 | dan 143 | plaaslike 144 | huidige 145 | dus 146 | politieke 147 | Amerikaanse 148 | einde 149 | dood 150 | ná 151 | miljoen 152 | eie 153 | klub 154 | volgens 155 | geword 156 | Met 157 | gehad 158 | As 159 | Suid-Afrikaanse 160 | ongeveer 161 | beskou 162 | regering 163 | sedert 164 | verskillende 165 | dieselfde 166 | vier 167 | moes 168 | gewoonlik 169 | naby 170 | gemeente 171 | tydperk 172 | nou 173 | Frankryk 174 | staan 175 | groep 176 | gestig 177 | sluit 178 | Europese 179 | verwys 180 | wêreld 181 | moontlik 182 | eers 183 | ekonomiese 184 | gebou 185 | Na 186 | omdat 187 | lê 188 | lank 189 | af 190 | gebiede 191 | lande 192 | selfs 193 | Mei 194 | Tweede 195 | nasionale 196 | tans 197 | elke 198 | provinsie 199 | Januarie 200 | was, 201 | Oktober 202 | jare 203 | laaste 204 | persent 205 | seun 206 | inwoners 207 | belangrikste 208 | mees 209 | lewe 210 | -------------------------------------------------------------------------------- /justext/stoplists/Urdu.txt: -------------------------------------------------------------------------------- 1 | کے 2 | میں 3 | کی 4 | اور 5 | سے 6 | کا 7 | اس 8 | کو 9 | ہے 10 | ہے۔ 11 | نے 12 | پر 13 | ایک 14 | کہ 15 | یہ 16 | بھی 17 | ان 18 | کر 19 | ہیں۔ 20 | جو 21 | کیا 22 | وہ 23 | ہیں 24 | و 25 | جس 26 | بعد 27 | ہو 28 | تو 29 | نہیں 30 | یا 31 | جاتا 32 | لیے 33 | تک 34 | ہی 35 | اپنے 36 | کرنے 37 | تھا 38 | آپ 39 | نہ 40 | ساتھ 41 | تھا۔ 42 | گیا 43 | نام 44 | کسی 45 | طور 46 | ہوتا 47 | اپنی 48 | شہر 49 | لیکن 50 | دیا 51 | اللہ 52 | بن 53 | تھے۔ 54 | تھے 55 | سب 56 | زیادہ 57 | کہا 58 | بہت 59 | پاکستان 60 | کرتے 61 | جب 62 | ہونے 63 | وقت 64 | ہوئے 65 | اسے 66 | ہے، 67 | حاصل 68 | ہوا 69 | طرح 70 | کوئی 71 | والے 72 | اسی 73 | دو 74 | وجہ 75 | حضرت 76 | استعمال 77 | دور 78 | جن 79 | شامل 80 | انہوں 81 | حکومت 82 | یہاں 83 | ہوتی 84 | تھی 85 | سال 86 | پیدا 87 | گیا۔ 88 | 
محمد 89 | جنگ 90 | کچھ 91 | تمام 92 | لئے 93 | تھی۔ 94 | جاتی 95 | پہلے 96 | کیا۔ 97 | پھر 98 | دنیا 99 | اردو 100 | ہر 101 | واقع 102 | طرف 103 | زبان 104 | گئی 105 | موجود 106 | صرف 107 | کام 108 | مگر 109 | اگر 110 | شروع 111 | ہوئی 112 | دی 113 | یعنی 114 | اہم 115 | بات 116 | بے 117 | مشہور 118 | قائم 119 | اب 120 | عام 121 | والی 122 | علی 123 | خان 124 | علاوہ 125 | کئی 126 | مطابق 127 | ہوتے 128 | مختلف 129 | رہے 130 | جبکہ 131 | اسلام 132 | گئے 133 | کرتا 134 | زندگی 135 | علیہ 136 | حصہ 137 | خود 138 | لفظ 139 | جہاں 140 | انہیں 141 | جا 142 | آبادی 143 | دوسرے 144 | پیش 145 | کم 146 | ملک 147 | لوگوں 148 | ہم 149 | بڑی 150 | جانے 151 | بلکہ 152 | دونوں 153 | کرنا 154 | رہا 155 | ضلع 156 | نظر 157 | کی۔ 158 | کہتے 159 | لے 160 | ہوا۔ 161 | آباد 162 | لوگ 163 | مسلمانوں 164 | سلطنت 165 | ایسے 166 | جائے 167 | دوران 168 | بڑے 169 | صدر 170 | علم 171 | مقام 172 | تین 173 | عمل 174 | بڑا 175 | خلاف 176 | کتاب 177 | جاتے 178 | ہیں، 179 | تعمیر 180 | بنا 181 | دیگر 182 | دے 183 | ترین 184 | تاریخ 185 | انگریزی 186 | صوبہ 187 | دوسری 188 | تعلیم 189 | ہوئے۔ 190 | علاقے 191 | اختیار 192 | تعداد 193 | علاقہ 194 | شاہ 195 | نظام 196 | دیا۔ 197 | عمر 198 | ہزار 199 | بیان 200 | امریکہ 201 | لیا 202 | اسلامی 203 | تعلق 204 | پہلی 205 | ہے. 206 | عظیم 207 | افراد 208 | درمیان 209 | کیونکہ 210 | فوج 211 | سکتا 212 | جانب 213 | غیر 214 | آج 215 | والا 216 | جگہ 217 | بارے 218 | سیاسی 219 | مسجد 220 | انسان 221 | ایسی 222 | امام 223 | ایسا 224 | اپنا 225 | ابن 226 | گئے۔ 227 | زمین 228 | کردار 229 | الدین 230 | مسلم 231 | ممالک 232 | احمد 233 | کرتی 234 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | .. :changelog: 2 | 3 | Changelog for jusText 4 | ===================== 5 | 6 | 3.0.2 (2025-02-25) 7 | ------------------ 8 | - *BUG FIX:* Handle urllib imports in Python 2 and 3 correctly `#51 `_. 
9 | 10 | 3.0.1 (2024-05-09) 11 | ------------------ 12 | - *BUG FIX:* Fix issue with new version of lxml `#48 `_. 13 | 14 | 3.0.0 (2021-10-21) 15 | ------------------ 16 | - *INCOMPATIBLE CHANGE:* Dropped support for Python 3.4 and below. 17 | - *BUG FIX:* Don't join words separated only by ``<br>`` tag. 18 | - *BUG FIX:* List available stop-lists alphabetically. 19 | 20 | 2.2.0 (2016-03-06) 21 | ------------------ 22 | - *INCOMPATIBLE CHANGE:* Stop words are case insensitive. 23 | - *INCOMPATIBLE CHANGE:* Dropped support for Python 3.2. 24 | - *BUG FIX:* Preserve new lines from original text in paragraphs. 25 | 26 | 2.1.1 (2014-05-27) 27 | ------------------ 28 | - *BUG FIX:* Function ``decode_html`` now respects parameter ``errors`` when falling back to ``default_encoding`` `#9 `_. 29 | 30 | 2.1.0 (2014-01-25) 31 | ------------------ 32 | - *FEATURE:* Added XPath selector to the paragraphs. XPath selector is also available in detailed output as ``xpath`` attribute of ``<p>`` tag `#5 `_. 33 | 34 | 2.0.0 (2013-08-26) 35 | ------------------ 36 | - *FEATURE:* Added pluggable DOM preprocessor. 37 | - *FEATURE:* Added support for Python 3.2+. 38 | - *INCOMPATIBLE CHANGE:* Paragraphs are instances of 39 | ``justext.paragraph.Paragraph``. 40 | - *INCOMPATIBLE CHANGE:* Script 'justext' removed in favour of 41 | command ``python -m justext``. 42 | - *FEATURE:* It's possible to enter a URI as input document in CLI. 43 | - *FEATURE:* It is possible to pass unicode string directly. 44 | 45 | 1.2.0 (2011-08-08) 46 | ------------------ 47 | - *FEATURE:* Character counts used instead of word counts where possible in 48 | order to make the algorithm work well in the language independent 49 | mode (without a stoplist) for languages where counting words is 50 | not easy (Japanese, Chinese, Thai, etc). 51 | - *BUG FIX:* More robust parsing of meta tags containing the information about 52 | used charset. 53 | - *BUG FIX:* Corrected decoding of HTML entities ``&#128;`` to ``&#159;`` 54 | 55 | 1.1.0 (2011-03-09) 56 | ------------------ 57 | - First public release. 
58 | -------------------------------------------------------------------------------- /justext/stoplists/Catalan.txt: -------------------------------------------------------------------------------- 1 | de 2 | la 3 | i 4 | a 5 | el 6 | que 7 | en 8 | del 9 | va 10 | un 11 | per 12 | les 13 | una 14 | amb 15 | al 16 | els 17 | es 18 | és 19 | dels 20 | El 21 | més 22 | com 23 | La 24 | o 25 | ser 26 | seu 27 | era 28 | no 29 | van 30 | seva 31 | fou 32 | hi 33 | entre 34 | població 35 | Els 36 | En 37 | anys 38 | als 39 | però 40 | vivien 41 | ha 42 | fins 43 | persones 44 | A 45 | també 46 | tenia 47 | pel 48 | mitjana 49 | són 50 | cada 51 | família 52 | renda 53 | on 54 | part 55 | Per 56 | seus 57 | cap 58 | havia 59 | després 60 | d'un 61 | dones 62 | Va 63 | sobre 64 | des 65 | Es 66 | aquest 67 | molt 68 | tot 69 | qual 70 | ja 71 | gran 72 | ciutat 73 | Les 74 | d'una 75 | nombre 76 | durant 77 | segle 78 | nom 79 | dos 80 | menys 81 | fer 82 | altres 83 | Segons 84 | anys, 85 | primer 86 | quan 87 | cens 88 | l'any 89 | troba 90 | És 91 | li 92 | següent 93 | eren 94 | mitjà 95 | estat 96 | aquesta 97 | seves 98 | pot 99 | quals 100 | mentre 101 | pels 102 | habitatges 103 | habitatge 104 | té 105 | Sant 106 | Estats 107 | forma 108 | està 109 | contra 110 | Al 111 | properes. 112 | mateix 113 | municipi 114 | només 115 | encara 116 | primera 117 | tres 118 | poden 119 | dues 120 | rei 121 | si 122 | habitants. 123 | lloc 124 | habitants 125 | se 126 | fa 127 | sense 128 | l'estat 129 | sota 130 | anys. 
131 | temps 132 | fill 133 | Units 134 | Després 135 | tenir 136 | I 137 | També 138 | terme 139 | uns 140 | així 141 | poder 142 | estaven 143 | homes 144 | tenien 145 | espècie 146 | Un 147 | tots 148 | regió 149 | han 150 | peix 151 | fet 152 | grup 153 | manera 154 | nou 155 | Aquest 156 | famílies 157 | mort 158 | mostra 159 | foren 160 | riu 161 | l'ordre 162 | alguns 163 | unitats 164 | nord 165 | dins 166 | densitat 167 | tant 168 | habitants, 169 | diversos 170 | Aquesta 171 | nens 172 | arribar 173 | homes. 174 | abans 175 | Durant 176 | què 177 | situat 178 | més. 179 | govern 180 | km². 181 | tipus 182 | sistema 183 | partir 184 | II 185 | guerra 186 | Demografia. 187 | poblacions 188 | familiars. 189 | sud 190 | L'any 191 | Dels 192 | morir 193 | (en 194 | parelles 195 | llindar 196 | corresponia 197 | davall 198 | edats 199 | vivint 200 | L'edat 201 | pobresa. 202 | manera: 203 | repartia 204 | habitatges, 205 | Fou 206 | molts 207 | No 208 | diverses 209 | càpita 210 | soles 211 | diferents 212 | soles. 213 | solteres, 214 | casades, 215 | d'aquest 216 | famílies. 
217 | grans 218 | Guerra 219 | tenen 220 | província 221 | haver 222 | vida 223 | general 224 | estava 225 | segons 226 | qui 227 | poble 228 | -------------------------------------------------------------------------------- /web_demo/script.js: -------------------------------------------------------------------------------- 1 | function show_advanced() { 2 | advanced_options = document.getElementById("advanced_options"); 3 | advanced_options.style.display = "block"; 4 | show_advanced_link = document.getElementById("show_advanced_link"); 5 | show_advanced_link.style.display = "none"; 6 | hide_advanced_link = document.getElementById("hide_advanced_link"); 7 | hide_advanced_link.style.display = ""; 8 | } 9 | 10 | function hide_advanced() { 11 | advanced_options = document.getElementById("advanced_options"); 12 | advanced_options.style.display = "none"; 13 | show_advanced_link = document.getElementById("show_advanced_link"); 14 | show_advanced_link.style.display = ""; 15 | hide_advanced_link = document.getElementById("hide_advanced_link"); 16 | hide_advanced_link.style.display = "none"; 17 | } 18 | 19 | function hide_boilerplate() { 20 | paragraphs = document.getElementsByTagName("p"); 21 | for (var i=0; i 0 91 | 92 | def test_get_missing_stoplist(self): 93 | with pytest.raises(ValueError): 94 | get_stoplist("Klingon") 95 | -------------------------------------------------------------------------------- /justext/stoplists/Norwegian_Nynorsk.txt: -------------------------------------------------------------------------------- 1 | i 2 | og 3 | av 4 | er 5 | som 6 | til 7 | på 8 | ein 9 | med 10 | for 11 | det 12 | den 13 | vart 14 | var 15 | frå 16 | har 17 | dei 18 | å 19 | eit 20 | han 21 | ei 22 | I 23 | at 24 | om 25 | Han 26 | Det 27 | ved 28 | eller 29 | seg 30 | men 31 | mellom 32 | ligg 33 | etter 34 | kan 35 | ikkje 36 | hadde 37 | ut 38 | over 39 | vert 40 | Den 41 | andre 42 | der 43 | Dei 44 | òg 45 | under 46 | blei 47 | to 48 | mot 49 | opp 50 | då 51 | 
fekk 52 | første 53 | fleire 54 | lag 55 | mange 56 | kalla 57 | blir 58 | kom 59 | inn 60 | denne 61 | km 62 | del 63 | ho 64 | dette 65 | også 66 | byen 67 | går 68 | Ein 69 | vore 70 | år 71 | gjennom 72 | så 73 | sin 74 | kommune 75 | berre 76 | anna 77 | På 78 | store 79 | enn 80 | Ho 81 | Dette 82 | fram 83 | ha 84 | sitt 85 | største 86 | meir 87 | Etter 88 | rundt 89 | før 90 | ofte 91 | gjekk 92 | alle 93 | tre 94 | namnet 95 | tok 96 | saman 97 | desse 98 | medan 99 | sine 100 | noko 101 | mest 102 | same 103 | sidan 104 | hans 105 | si 106 | mykje 107 | sett 108 | dag 109 | meter 110 | norsk 111 | seinare 112 | både 113 | nytta 114 | kjem 115 | kjend 116 | stor 117 | øya 118 | kunne 119 | finst 120 | få 121 | tidlegare 122 | heile 123 | nord 124 | Noreg 125 | skal 126 | slik 127 | skulle 128 | vil 129 | elva 130 | Frå 131 | ulike 132 | norske 133 | de 134 | siste 135 | tillegg 136 | vere 137 | of 138 | bygd 139 | brukt 140 | nokre 141 | sør 142 | nye 143 | rekna 144 | The 145 | gav 146 | vorte 147 | mindre 148 | nær 149 | Denne 150 | renn 151 | ned 152 | sjølv 153 | gjorde 154 | når 155 | Byen 156 | grunn 157 | området 158 | the 159 | særleg 160 | vest 161 | fleste 162 | langs 163 | dømes 164 | tid 165 | viktig 166 | delen 167 | finn 168 | større 169 | her 170 | først 171 | like 172 | høgaste 173 | fire 174 | lang 175 | Då 176 | tida 177 | laga 178 | aust 179 | by 180 | Eit 181 | fødd 182 | namn 183 | kvart 184 | fyrste 185 | januar 186 | døydde 187 | no 188 | område 189 | heilt 190 | vera 191 | tyder 192 | svært 193 | kommune. 194 | starta 195 | medlem 196 | står 197 | kjende 198 | Under 199 | Historie. 200 | gamle 201 | delar 202 | utan 203 | stort 204 | landet 205 | spelte 206 | albumet 207 | tilbake 208 | framleis 209 | gong 210 | Namnet 211 | Desse 212 | Ved 213 | Fylkesveg 214 | vanleg 215 | oppdaga 216 | staden 217 | ny 218 | Russland 219 | utvikla 220 | danna 221 | igjen 222 | songen 223 | Elva 224 | bli 225 | skreiv 226 | innbyggjarar. 
227 | slutten 228 | blitt 229 | New 230 | året 231 | Ei 232 | kvar 233 | Som 234 | Noreg. 235 | ville 236 | gjeve 237 | førte 238 | likevel 239 | får 240 | tyske 241 | fjellet 242 | ta 243 | Her 244 | bruk 245 | tidleg 246 | må 247 | truleg 248 | fordi 249 | skrive 250 | kring 251 | låg 252 | måtte 253 | byrja 254 | innan 255 | derimot 256 | deira 257 | millionar 258 | land 259 | Andre 260 | alt 261 | and 262 | oktober 263 | For 264 | oppretta 265 | lite 266 | gjerne 267 | gjort 268 | består 269 | Men 270 | heldt 271 | Mange 272 | flytta 273 | langt 274 | (russisk 275 | franske 276 | år. 277 | viser 278 | juli 279 | viktigaste 280 | grensa 281 | britiske 282 | sjå 283 | vidare 284 | vann 285 | mai 286 | la 287 | åra 288 | små 289 | juni 290 | gjer 291 | mars 292 | einaste 293 | att 294 | meste 295 | delt 296 | vatn 297 | spelt 298 | moderne 299 | elv 300 | fylke. 301 | kyrkja 302 | september 303 | august 304 | nesten 305 | godt 306 | perioden 307 | inneheld 308 | blant 309 | fått 310 | november 311 | fem 312 | Øya 313 | hovudsakleg 314 | St. 315 | halde 316 | Sidan 317 | samt 318 | fjorden 319 | regionen 320 | USA 321 | folk 322 | Sjølv 323 | desember 324 | russiske 325 | gå 326 | stod 327 | begge 328 | fjell 329 | havet 330 | Bakgrunnsstoff. 331 | rekkje 332 | Fjorden 333 | hatt 334 | april 335 | områda 336 | rett 337 | Med 338 | særs 339 | gjev 340 | kyrkje 341 | amerikanske 342 | kva 343 | samband 344 | Oslo 345 | Av 346 | lenger 347 | februar 348 | Til 349 | minst 350 | høgd 351 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. _jusText: http://code.google.com/p/justext/ 2 | .. _Python: http://www.python.org/ 3 | .. _lxml: http://lxml.de/ 4 | 5 | jusText 6 | ======= 7 | .. 
image:: https://github.com/miso-belica/jusText/actions/workflows/run-tests.yml/badge.svg 8 | :target: https://github.com/miso-belica/jusText/actions/workflows/run-tests.yml 9 | 10 | Program jusText is a tool for removing boilerplate content, such as navigation 11 | links, headers, and footers from HTML pages. It is 12 | `designed `_ to preserve 13 | mainly text containing full sentences and it is therefore well suited for 14 | creating linguistic resources such as Web corpora. You can 15 | `try it online `_. 16 | 17 | This is a fork of original (currently unmaintained) code of jusText_ hosted 18 | on Google Code. 19 | 20 | 21 | Adaptations of the algorithm to other languages: 22 | 23 | - `C++ `_ 24 | - `Go `_ 25 | - `Java `_ 26 | 27 | 28 | Some libraries using jusText: 29 | 30 | - `chirp `_ 31 | - `lazynlp `_ 32 | - `off-topic-memento-toolkit `_ 33 | - `pears `_ 34 | - `readability calculator `_ 35 | - `sky `_ 36 | 37 | 38 | Some currently (Jan 2020) maintained alternatives: 39 | 40 | - `dragnet `_ 41 | - `html2text `_ 42 | - `inscriptis `_ 43 | - `newspaper `_ 44 | - `python-readability `_ 45 | - `trafilatura `_ 46 | 47 | 48 | Installation 49 | ------------ 50 | Make sure you have Python_ 2.7+/3.5+ and `pip `_ 51 | (`Windows `_, 52 | `Linux `_) installed. 53 | Run simply: 54 | 55 | .. code-block:: bash 56 | 57 | $ [sudo] pip install justext 58 | 59 | 60 | Dependencies 61 | ------------ 62 | :: 63 | 64 | lxml (version depends on your Python version) 65 | 66 | 67 | Usage 68 | ----- 69 | .. code-block:: bash 70 | 71 | $ python -m justext -s Czech -o text.txt http://www.zdrojak.cz/clanky/automaticke-zabezpeceni/ 72 | $ python -m justext -s English -o plain_text.txt english_page.html 73 | $ python -m justext --help # for more info 74 | 75 | 76 | Python API 77 | ---------- 78 | .. 
code-block:: python 79 | 80 | import requests 81 | import justext 82 | 83 | response = requests.get("http://planet.python.org/") 84 | paragraphs = justext.justext(response.content, justext.get_stoplist("English")) 85 | for paragraph in paragraphs: 86 | if not paragraph.is_boilerplate: 87 | print(paragraph.text) 88 | 89 | 90 | Testing 91 | ------- 92 | Run tests via 93 | 94 | .. code-block:: bash 95 | 96 | $ py.test-2.7 && py.test-3.5 && py.test-3.6 && py.test-3.7 && py.test-3.8 && py.test-3.9 97 | 98 | 99 | Acknowledgements 100 | ---------------- 101 | .. _`Natural Language Processing Centre`: http://nlp.fi.muni.cz/en/nlpc 102 | .. _`Masaryk University in Brno`: http://nlp.fi.muni.cz/en 103 | .. _PRESEMT: http://presemt.eu/ 104 | .. _`Lexical Computing Ltd.`: http://lexicalcomputing.com/ 105 | .. _`PhD research`: http://is.muni.cz/th/45523/fi_d/phdthesis.pdf 106 | 107 | This software has been developed at the `Natural Language Processing Centre`_ of 108 | `Masaryk University in Brno`_ with financial support from PRESEMT_ and 109 | `Lexical Computing Ltd.`_ It also relates to `PhD research`_ of Jan Pomikálek. 110 | -------------------------------------------------------------------------------- /justext/stoplists/Hindi.txt: -------------------------------------------------------------------------------- 1 | के 2 | में 3 | की 4 | और 5 | का 6 | से 7 | को 8 | है। 9 | एक 10 | है 11 | पर 12 | ने 13 | भी 14 | यह 15 | लिए 16 | है. 17 | किया 18 | इस 19 | कि 20 | है, 21 | हैं। 22 | ही 23 | जाता 24 | जो 25 | रूप 26 | कर 27 | करने 28 | नहीं 29 | हो 30 | हैं 31 | तथा 32 | द्वारा 33 | या 34 | साथ 35 | गया 36 | होता 37 | अपने 38 | तक 39 | बाद 40 | हैं. 
41 | the 42 | कुछ 43 | तो 44 | एवं 45 | वह 46 | हैं, 47 | करते 48 | वे 49 | स्थित 50 | जा 51 | समय 52 | इसके 53 | था। 54 | होती 55 | दिया 56 | अधिक 57 | सकता 58 | भारत 59 | नाम 60 | था 61 | होने 62 | अन्य 63 | उनके 64 | में, 65 | हुए 66 | कई 67 | करता 68 | किसी 69 | वाले 70 | कारण 71 | of 72 | उन्होंने 73 | प्रकार 74 | हुआ 75 | बहुत 76 | होते 77 | अपनी 78 | जब 79 | सबसे 80 | ये 81 | इन 82 | जाती 83 | क्षेत्र 84 | दो 85 | पहले 86 | कहा 87 | जैसे 88 | लेकिन 89 | कम 90 | था. 91 | राज्य 92 | and 93 | प्रयोग 94 | न 95 | लिये 96 | वर्ष 97 | प्राप्त 98 | उसके 99 | इसका 100 | गाँव 101 | भारतीय 102 | उन्हें 103 | शामिल 104 | जाने 105 | उस 106 | कोई 107 | भाषा 108 | थे। 109 | सभी 110 | तरह 111 | इसे 112 | मे 113 | उनकी 114 | जाते 115 | उत्तर 116 | गाँव, 117 | प्रमुख 118 | बीच 119 | करना 120 | उसे 121 | जिसमें 122 | आदि 123 | फिल्म 124 | यहां 125 | रहा 126 | सकते 127 | काम 128 | प्रदेश 129 | in 130 | गए 131 | स्थान 132 | दोनों 133 | गया. 134 | था, 135 | उपयोग 136 | इसकी 137 | व 138 | हुई 139 | अनुसार 140 | थे 141 | लोगों 142 | ओर 143 | गई 144 | हिन्दी 145 | रहे 146 | निर्माण 147 | विशेष 148 | केवल 149 | कार्य 150 | वाली 151 | लगभग 152 | भाग 153 | to 154 | फिर 155 | किया. 156 | करती 157 | बार 158 | इसमें 159 | बिहार 160 | अनेक 161 | मंदिर 162 | कहते 163 | a 164 | विकास 165 | मुख्य 166 | जीवन 167 | थी। 168 | इसी 169 | शहर 170 | जिसे 171 | दिन 172 | किया। 173 | तीन 174 | उनका 175 | शब्द 176 | पूर्व 177 | विभिन्न 178 | अब 179 | देश 180 | लोग 181 | क्योंकि 182 | तब 183 | थी 184 | लिया 185 | शुरू 186 | गया। 187 | दौरान 188 | अपना 189 | पास 190 | दी 191 | माना 192 | प्रति 193 | विश्व 194 | बना 195 | यहाँ 196 | जाना 197 | सकती 198 | ऐसे 199 | थे. 
200 | यदि 201 | उसकी 202 | सरकार 203 | महत्वपूर्ण 204 | अमेरिका 205 | ऐसा 206 | द 207 | दूसरे 208 | व्यक्ति 209 | प्रसिद्ध 210 | बन 211 | उन 212 | अथवा 213 | तौर 214 | बात 215 | जी 216 | काल 217 | अलग 218 | जिसके 219 | प्रदान 220 | किए 221 | बनी 222 | स्टेशन 223 | काफी 224 | आधार 225 | रही 226 | एक्स्प्रेस 227 | प्रभाव 228 | रेलवे 229 | बड़े 230 | वाला 231 | नामक 232 | संख्या 233 | साहित्य 234 | थे, 235 | उसने 236 | जिसका 237 | अर्थ 238 | प्राचीन 239 | साल 240 | कभी 241 | जहां 242 | राजा 243 | बारे 244 | दक्षिण 245 | देने 246 | जारी 247 | जन्म 248 | धर्म 249 | संयुक्त 250 | थी. 251 | गया, 252 | मूल 253 | उसका 254 | राष्ट्रीय 255 | इतिहास 256 | युद्ध 257 | आज 258 | प्रथम 259 | प्रत्येक 260 | जिससे 261 | संगीत 262 | बजे 263 | आम 264 | उपलब्ध 265 | देता 266 | अध्ययन 267 | पहली 268 | ले 269 | बड़ी 270 | हालांकि 271 | is 272 | उत्पन्न 273 | लेकर 274 | पता 275 | बनाने 276 | दिल्ली 277 | नदी 278 | शिक्षा 279 | जबकि 280 | The 281 | उच्च 282 | किये 283 | इस्तेमाल 284 | शरीर 285 | करके 286 | चार 287 | यात्रा 288 | जल 289 | स्थिति 290 | मध्य 291 | सन् 292 | रहता 293 | आप 294 | श्री 295 | देशों 296 | सामान्य 297 | बड़ा 298 | इससे 299 | समान 300 | किया, 301 | स्थापना 302 | स्तर 303 | दुनिया 304 | हुआ। 305 | स्थापित 306 | होकर 307 | सेवा 308 | भूमिका 309 | उदाहरण 310 | देते 311 | ट्रेन 312 | पानी 313 | इसलिए 314 | आ 315 | प्रणाली 316 | आगे 317 | माध्यम 318 | वर्तमान 319 | बाहर 320 | कंपनी 321 | (स्टेशन 322 | विकसित 323 | प्रकाशित 324 | आधुनिक 325 | अवधि 326 | हर 327 | जिस 328 | विज्ञान 329 | ऐसी 330 | थी, 331 | आवश्यक 332 | प्रदर्शन 333 | अतिरिक्त 334 | दूर 335 | समूह 336 | जिले 337 | बनाया 338 | भगवान 339 | संबंध 340 | नए 341 | ध्यान 342 | रखा 343 | अंत 344 | ऊपर 345 | पूरी 346 | रहते 347 | नगर 348 | उसी 349 | मानव 350 | अधिकांश 351 | आधारित 352 | शताब्दी 353 | मात्रा 354 | ई. 355 | प्रक्रिया 356 | गयी 357 | ऑफ़ 358 | की. 359 | हम 360 | शक्ति 361 | पहला 362 | लिए, 363 | पुरुष 364 | दशक 365 | जिनमें 366 | इनके 367 | मैं 368 | विचार 369 | विश्वविद्यालय 370 | कुल 371 | दिया. 
372 | ऑफ 373 | क्षेत्रों 374 | अभी 375 | घर 376 | रचना 377 | रंग 378 | जिला 379 | पुरस्कार 380 | छोटे 381 | हिंदी 382 | -------------------------------------------------------------------------------- /justext/stoplists/Luxembourgish.txt: -------------------------------------------------------------------------------- 1 | an 2 | der 3 | de 4 | vun 5 | ass 6 | a 7 | am 8 | den 9 | vum 10 | eng 11 | déi 12 | huet 13 | zu 14 | op 15 | vu 16 | De 17 | e 18 | mat 19 | fir 20 | war 21 | dem 22 | sech 23 | gouf 24 | och 25 | en 26 | Den 27 | Joerhonnert 28 | ze 29 | Joer 30 | et 31 | aus 32 | enger 33 | als 34 | engem 35 | bis 36 | hien 37 | Canton 38 | si 39 | franséisch 40 | bei 41 | oder 42 | wéi 43 | gëtt 44 | deen 45 | net 46 | mam 47 | méi 48 | un 49 | um 50 | dat 51 | sinn 52 | duerch 53 | Departement 54 | datt 55 | ginn. 56 | Gemeng 57 | no 58 | gebuer 59 | Am 60 | se 61 | Arrondissement 62 | nach 63 | Regioun 64 | hie 65 | seng 66 | v. 67 | Déi 68 | awer 69 | Gemengen 70 | administrativ 71 | Andeelung 72 | Si 73 | zum 74 | huet. 75 | iwwer 76 | Lëtzebuerg 77 | ginn 78 | Kanton. 79 | hunn 80 | Et 81 | een 82 | Säit 83 | dee 84 | An 85 | wou 86 | Numm 87 | Chr. 88 | Hien 89 | do 90 | ee 91 | Stad 92 | gestuerwen 93 | hir 94 | deem 95 | hat 96 | befaasst 97 | sengem 98 | "Dës 99 | ass. 100 | läit 101 | __NOTOC__ 102 | huet, 103 | des 104 | Hie 105 | senger 106 | där 107 | all 108 | ginn, 109 | ass, 110 | la 111 | Uertschaft 112 | wat 113 | Zäit 114 | tëscht 115 | Mee 116 | säi 117 | Deel 118 | goufen 119 | ëm 120 | Um 121 | du 122 | gouf. 123 | ënner 124 | E 125 | ganz 126 | well 127 | hu 128 | virun 129 | Vun 130 | waren 131 | vill 132 | koum 133 | éischt 134 | Januar 135 | war, 136 | No 137 | gi 138 | Juni 139 | sinn. 
140 | Juli 141 | zënter 142 | beim 143 | esou 144 | ënnert 145 | haut 146 | Dezember 147 | Eng 148 | Oktober 149 | September 150 | ëmmer 151 | August 152 | ronn 153 | lëtzebuergesche 154 | Lëtzebuerger 155 | nëmmen 156 | gehéiert 157 | géint 158 | Mäerz 159 | mee 160 | puer 161 | Loutrengen. 162 | grouss 163 | Februar 164 | krut 165 | kann 166 | Spaweck. 167 | Abrëll 168 | hunn. 169 | gëtt. 170 | Dës 171 | dräi 172 | sou 173 | Plaz 174 | zur 175 | Meter 176 | Op 177 | November 178 | gouf, 179 | éischte 180 | Fir 181 | nei 182 | dës 183 | nees 184 | säin 185 | geet 186 | iwwert 187 | u 188 | Provënz 189 | aner 190 | Dat 191 | war. 192 | zwou 193 | Lëtzebuerg, 194 | Seng 195 | him 196 | dann 197 | hire 198 | deenen 199 | Enn 200 | konnt 201 | Lëtzebuerg. 202 | Jean 203 | Kinnek 204 | kënnt 205 | sinn, 206 | Kierch 207 | Zu 208 | Film 209 | President 210 | zwee 211 | Frankräich 212 | of 213 | La 214 | laang 215 | nom 216 | Zënter 217 | Lëscht 218 | duerno 219 | steet 220 | allem 221 | bekannt 222 | NGC 223 | En 224 | tëschent 225 | A 226 | Ufank 227 | Joren 228 | Bei 229 | hunn, 230 | Member 231 | keng 232 | selwer 233 | hirem 234 | Mat 235 | Paräis 236 | wann 237 | m 238 | Stär 239 | Wéi 240 | km 241 | hat, 242 | dacks 243 | senge 244 | hirer 245 | Leit 246 | deene 247 | besteet 248 | erëm 249 | weider 250 | dunn 251 | Chr." 252 | gemaach 253 | d' 254 | hiren 255 | Duerch 256 | franséische 257 | dëser 258 | Als 259 | Dag 260 | zesumme 261 | Geschicht. 262 | d'éischt 263 | Louis 264 | Héicht 265 | goufe 266 | Krich 267 | Weltkrich 268 | kleng 269 | gebaut 270 | Land 271 | ouni 272 | just 273 | Awunner 274 | Doud 275 | Well 276 | Säi 277 | ongeféier 278 | grousse 279 | anere 280 | besonnesch 281 | Zweete 282 | wéinst 283 | Papp 284 | Kéier 285 | gemaach. 286 | Paul 287 | gegrënnt 288 | genannt 289 | leeft 290 | Stärebild 291 | nëmme 292 | dësem 293 | gutt 294 | Charles 295 | véier 296 | hat. 
297 | haaptsächlech 298 | Famill 299 | D' 300 | staark 301 | Tour 302 | Joseph 303 | Och 304 | gëtt, 305 | à 306 | von 307 | meeschtens 308 | genannt. 309 | Pierre 310 | di 311 | Jong 312 | Se 313 | Le 314 | Gare 315 | Bis 316 | schonns 317 | sollt 318 | dofir 319 | goufen. 320 | soll 321 | lëtzebuergesch 322 | éischter 323 | zwéi 324 | éischten 325 | groussen 326 | direkt 327 | kënnen 328 | Dall 329 | kuerz 330 | Nom 331 | fréier 332 | Dëst 333 | also 334 | spéider 335 | z.B. 336 | Nodeems 337 | verschidde 338 | und 339 | Däitschland 340 | och. 341 | Kuckt 342 | däitsche 343 | bal 344 | dëse 345 | Liichtjoer 346 | Fläch 347 | drop 348 | schonn 349 | wäit 350 | Robert 351 | ware 352 | Staat 353 | laanscht 354 | Form 355 | Do 356 | genannt, 357 | da 358 | zwar 359 | Henri 360 | al 361 | hei 362 | lëtzebuergeschen 363 | Geschicht 364 | ka 365 | Beispill 366 | Dem 367 | nodeems 368 | gréisste 369 | I. 370 | gréissten 371 | däitsch 372 | neie 373 | spillt 374 | Universitéit 375 | während 376 | Geographie. 377 | John 378 | gegrënnt. 379 | héich 380 | eréischt 381 | belscher 382 | sengen 383 | New 384 | waren, 385 | elo 386 | Europa 387 | Laf 388 | -------------------------------------------------------------------------------- /justext/stoplists/Swahili.txt: -------------------------------------------------------------------------------- 1 | ya 2 | na 3 | wa 4 | katika 5 | kwa 6 | ni 7 | la 8 | za 9 | mwaka 10 | kama 11 | cha 12 | mji 13 | kuwa 14 | kutoka 15 | au 16 | Kwa 17 | yake 18 | wakazi 19 | kwenye 20 | hii 21 | vya 22 | pia 23 | jina 24 | nchini 25 | nchi 26 | kata 27 | wake 28 | kati 29 | alikuwa 30 | Mkoa 31 | lakini 32 | pamoja 33 | zaidi 34 | hadi 35 | wakati 36 | juu 37 | watu 38 | kwanza 39 | sensa 40 | miaka 41 | ina 42 | mujibu 43 | wapatao 44 | iliyofanyika 45 | Wilaya 46 | mara 47 | moja 48 | ambayo 49 | Katika 50 | mnamo 51 | sehemu 52 | mkuu 53 | kwamba 54 | baada 55 | Tanzania. 
56 | Mji 57 | ilikuwa 58 | hasa 59 | eneo 60 | nyingi 61 | sana 62 | kubwa 63 | kazi 64 | sababu 65 | inahusu 66 | huko 67 | Makala 68 | humo. 69 | waishio 70 | chini 71 | Marekani 72 | Afrika 73 | lake 74 | ambao 75 | kila 76 | jimbo 77 | hiyo 78 | lugha 79 | ndani 80 | Kristo). 81 | Mwaka 82 | ili 83 | vile 84 | huu 85 | upande 86 | mwa 87 | yao 88 | mjini 89 | wengi 90 | njia 91 | aina 92 | wilaya 93 | hivyo 94 | nafasi 95 | hali 96 | muda 97 | (Baada 98 | kutokana 99 | mbalimbali 100 | kawaida 101 | hili 102 | hata 103 | kuna 104 | tu 105 | tarehe 106 | una 107 | karibu 108 | karne 109 | Kenya 110 | Baada 111 | dhidi 112 | ajili 113 | albamu 114 | ambapo 115 | yenye 116 | Eneo 117 | kuhusu 118 | maeneo 119 | muhimu 120 | maji 121 | wimbo 122 | BK 123 | siku 124 | maisha 125 | tangu 126 | mtu 127 | takriban 128 | New 129 | filamu 130 | serikali 131 | wanaoishi 132 | huu. 133 | idadi 134 | huwa 135 | mkubwa 136 | tena 137 | m 138 | pili 139 | Mungu 140 | tofauti 141 | kutumia 142 | zake 143 | huo 144 | wao 145 | zote 146 | mmoja 147 | nyingine 148 | Papa 149 | usawa 150 | bahari. 151 | mbili 152 | upo 153 | wote 154 | Kuna 155 | bila 156 | Wakati 157 | hizi 158 | kadhaa 159 | walikuwa 160 | kundi 161 | ambaye 162 | kipindi 163 | kufanya 164 | milioni 165 | kabla 166 | akiwa 167 | sasa 168 | muziki 169 | wenye 170 | yote 171 | wengine 172 | mto 173 | mkoa 174 | Uingereza 175 | kuanzia 176 | km². 177 | vita 178 | kuliko 179 | mwisho 180 | of 181 | tatu 182 | Idadi 183 | ambazo 184 | Hata 185 | Kanisa 186 | kuu 187 | matumizi 188 | si 189 | maarufu 190 | kanda 191 | baadaye 192 | kiwango 193 | Ni 194 | mpya 195 | Marekani. 196 | familia 197 | baadhi 198 | kupitia 199 | kiasi 200 | yake. 201 | Uturuki. 
202 | ikiwa 203 | pekee 204 | kabisa 205 | kupata 206 | Chuo 207 | Ulaya 208 | kusini 209 | mfumo 210 | bora 211 | shule 212 | ndege 213 | watoto 214 | habari 215 | biashara 216 | nje 217 | asili 218 | Jina 219 | the 220 | mpaka 221 | Mashariki 222 | nyimbo 223 | Kusini 224 | dunia 225 | Kati 226 | msingi 227 | uwezo 228 | maana 229 | timu 230 | umri 231 | kaskazini 232 | tar. 233 | Bahari 234 | mwezi 235 | yeye 236 | yaani 237 | inaweza 238 | Lakini 239 | mengine 240 | hivi 241 | kampuni 242 | utawala 243 | sheria 244 | kifo 245 | nguvu 246 | bado 247 | hivyo, 248 | chake 249 | Pia 250 | Mnamo 251 | mwenye 252 | mengi 253 | Tangu 254 | Tuzo 255 | ulikuwa 256 | Hii 257 | rangi 258 | Kenya. 259 | jinsi 260 | Tanzania 261 | historia 262 | kutoa 263 | dawa 264 | Ujerumani 265 | haya 266 | The 267 | ugonjwa 268 | duniani 269 | mwingine 270 | mfano 271 | huku 272 | awali 273 | hapo 274 | ingawa 275 | wanyama 276 | jamii 277 | neno 278 | mambo 279 | (amezaliwa 280 | Kikuu 281 | iliopo 282 | Ufaransa. 283 | chama 284 | zao 285 | lilikuwa 286 | hilo 287 | sawa 288 | rasmi 289 | KK. 290 | mfalme 291 | kwenda 292 | ndogo 293 | damu 294 | ambalo 295 | mahali 296 | Jimbo 297 | jumla 298 | Historia. 299 | makuu 300 | uchaguzi 301 | ndiyo 302 | mshindi 303 | Kama 304 | Yesu 305 | elimu 306 | Umoja 307 | miji 308 | fulani 309 | zamani 310 | Kaskazini 311 | Yeye 312 | pwani 313 | Nobel 314 | Mto 315 | shirika 316 | chati 317 | kisiwa 318 | namna 319 | rais 320 | magharibi 321 | mabadiliko 322 | makao 323 | huduma 324 | uhuru 325 | makubwa 326 | and 327 | chuo 328 | "The 329 | kumi 330 | ambako 331 | mechi 332 | kidogo 333 | kitabu 334 | mbali 335 | chakula 336 | hiki 337 | Rais 338 | mita 339 | klabu 340 | Wimbo 341 | haki 342 | binadamu 343 | Jamhuri 344 | mtoto 345 | iko 346 | yake, 347 | Hasa 348 | halafu 349 | hatari 350 | vitabu 351 | asilimia 352 | mazingira 353 | Vita 354 | jeshi 355 | mashariki 356 | ndio 357 | shughuli 358 | kimataifa 359 | KK 360 | wake. 
361 | taifa 362 | pa 363 | bali 364 | Mkuu 365 | wana 366 | kusababisha 367 | picha 368 | imani 369 | utafiti 370 | dini 371 | mdogo 372 | matatizo 373 | Ujerumani. 374 | Amerika 375 | nyumba 376 | kufikia 377 | Ziwa 378 | Kiingereza 379 | ana 380 | Uingereza. 381 | ile 382 | bahari 383 | hayo 384 | nne 385 | maendeleo 386 | -------------------------------------------------------------------------------- /justext/stoplists/Gujarati.txt: -------------------------------------------------------------------------------- 1 | છે. 2 | અને 3 | આ 4 | જ 5 | એક 6 | આવેલા 7 | તેમ 8 | ગામમાં 9 | ભાગમાં 10 | છે 11 | કે 12 | આવે 13 | કરવામાં 14 | પણ 15 | માટે 16 | ભારત 17 | મુખ્ય 18 | ગામ 19 | તે 20 | દેશના 21 | આવેલું 22 | છે, 23 | પશ્ચિમ 24 | મુખ્યત્વે 25 | રાજ્યના 26 | ગુજરાત 27 | પ્રાથમિક 28 | પશુપાલન 29 | જેવી 30 | ગામના 31 | ખેતી, 32 | (તા. 33 | શાળા, 34 | પર 35 | જે 36 | પ્રાપ્ય 37 | ખેતમજુરી 38 | સવલતો 39 | વ્યવસાય 40 | તાલુકામાં 41 | ડેરી 42 | લોકોનો 43 | પંચાયતઘર, 44 | આંગણવાડી 45 | સાથે 46 | ખેતી 47 | એવા 48 | કુલ 49 | દૂધની 50 | થયેલી 51 | કરે 52 | પાકની 53 | શાકભાજીના 54 | મહત્વનું 55 | હોય 56 | પૈકીના 57 | કરી 58 | તાલુકાઓ 59 | જિલ્લામાં 60 | દ્વારા 61 | લોકો 62 | તરીકે 63 | હતી. 64 | અન્ય 65 | બાજરી, 66 | થાય 67 | રીતે 68 | the 69 | હતો. 70 | એ 71 | તેના 72 | કપાસ, 73 | હતા. 74 | હતું. 
75 | તેને 76 | વધુ 77 | તેની 78 | ઉપયોગ 79 | તેઓ 80 | અથવા 81 | ઉત્તર 82 | સૌથી 83 | જિલ્લાના 84 | of 85 | મધ્ય 86 | આવી 87 | કારણે 88 | ઘઉં, 89 | સુધી 90 | મકાઇ, 91 | તો 92 | જ્યારે 93 | જેમાં 94 | તેમના 95 | કરીને 96 | તથા 97 | શકે 98 | કરવા 99 | પરંતુ 100 | દિવસ 101 | માં 102 | દક્ષિણ 103 | તેમણે 104 | and 105 | ના 106 | હતું 107 | હતી 108 | ન 109 | આદિવાસી 110 | સમાવેશ 111 | પછી 112 | પૂર્વ 113 | કરતા 114 | બે 115 | ત્યારે 116 | વર્ષ 117 | તેમની 118 | ખાસ 119 | પ્રથમ 120 | તુવર 121 | સામાન્ય 122 | જીરુ, 123 | રહે 124 | મોટા 125 | ઓફ 126 | બાદ 127 | દિવેલી 128 | હતો 129 | થઇ 130 | નામ 131 | કર્યો 132 | આવ્યું 133 | વસ્તી 134 | હતા 135 | ઉપરાંત 136 | જેવા 137 | કેટલાક 138 | ખાતે 139 | in 140 | તેમને 141 | ડાંગર, 142 | ધરાવે 143 | શકાય 144 | દરમિયાન 145 | જો 146 | ધરાવતા 147 | તાલુકાનું 148 | ને 149 | આવ્યો 150 | પોતાના 151 | to 152 | મુજબ 153 | જાય 154 | થઈ 155 | તેમાં 156 | દુધની 157 | અલગ 158 | તેમજ 159 | થયો 160 | વિવિધ 161 | નથી. 162 | ની 163 | જેમ 164 | જોવા 165 | તેનો 166 | વધારે 167 | કર્યું 168 | આવ્યા 169 | મળે 170 | હતી, 171 | તેણે 172 | પોતાની 173 | ભાગ 174 | વડોદરા 175 | ટકા 176 | માત્ર 177 | સામે 178 | ઘણા 179 | જેવાં 180 | સમય 181 | a 182 | થી 183 | વસે 184 | ઘણી 185 | કરતાં 186 | રોજ 187 | કોઇ 188 | શરૂ 189 | કામ 190 | નથી 191 | બની 192 | ઉપર 193 | ભારતીય 194 | જેને 195 | આપવામાં 196 | આંગણવાડી, 197 | તમામ 198 | વચ્ચે 199 | તેનું 200 | બીજા 201 | ડુંગરાળ 202 | કોઈ 203 | અહીં 204 | જિલ્લો 205 | ૧૧ 206 | પ્રદેશમાં 207 | વગેરે 208 | ૧૩ 209 | નો 210 | ખૂબ 211 | લગભગ 212 | ત્રણ 213 | હતા, 214 | વ્યવસાયમાં 215 | ધ 216 | સાબરકાંઠા 217 | શાકભાજી 218 | લેવામાં 219 | આવેલી 220 | (અગિયાર) 221 | હતો, 222 | (તેર) 223 | કારણ 224 | ત્યાં 225 | વખત 226 | એટલે 227 | મોટી 228 | વસવાટ 229 | દરેક 230 | લોકોની 231 | વરિયાળી, 232 | ફિલ્મ 233 | સમયે 234 | બહાર 235 | કરવાની 236 | હતું, 237 | ઉપલબ્ધ 238 | ૮ 239 | અનેક 240 | એવી 241 | ભાષા 242 | પાસે 243 | તરફ 244 | પંચમહાલ 245 | છતાં 246 | કે, 247 | જિલ્લાનું 248 | તેવી 249 | જાહેર 250 | ૫ 251 | આપે 252 | મથક 253 | કહેવાય 
254 | થયું 255 | શહેર 256 | is 257 | જ્યાં 258 | આમ 259 | પહેલા 260 | રહી 261 | મળી 262 | નવા 263 | એવું 264 | મેં 265 | થયા 266 | પોતાનું 267 | સ્થાન 268 | વર્ષના 269 | (આઠ) 270 | (પાંચ) 271 | વિસ્તારમાં 272 | હતાં. 273 | જેના 274 | આપી 275 | રજૂ 276 | ૧૨ 277 | ચાર 278 | તેવા 279 | દિવેલા, 280 | રજકો 281 | કર્યા 282 | પરથી 283 | ગુજરાતી 284 | દૂર 285 | રાજ્ય 286 | રચના 287 | બટાટા, 288 | પાંચ 289 | વિસ્તાર 290 | આવતા 291 | ગયા 292 | નાના 293 | એમ 294 | કેટલીક 295 | પડે 296 | દર 297 | કાર્યો 298 | રાષ્ટ્રીય 299 | પ્રકારના 300 | યુદ્ધ 301 | અંગે 302 | મૂળ 303 | પ્રદેશ 304 | વિકાસ 305 | સમગ્ર 306 | હોવાથી 307 | આધારિત 308 | તમાકુ 309 | કરી. 310 | જેથી 311 | થયેલ 312 | શ્રી 313 | શક્કરીયાં 314 | એન્ડ 315 | બંને 316 | શબ્દ 317 | રહ્યા 318 | પ્રકારની 319 | અસર 320 | તૈયાર 321 | પાણી 322 | મધ્યપૂર્વ 323 | યુનાઇટેડ 324 | સુરત 325 | વધારો 326 | તમાકુ, 327 | (બાર) 328 | દરમ્યાન 329 | થતો 330 | બી, 331 | બનાવવામાં 332 | as 333 | પાન, 334 | સમાન 335 | The 336 | પ્રાપ્ત 337 | હોવા 338 | નહીં 339 | ખેતમજુરી, 340 | હવે 341 | સંવત 342 | ઉત્પાદન 343 | માટેની 344 | પ્રમાણે 345 | અહીંના 346 | જન્મ 347 | સમયમાં 348 | પ્રમાણમાં 349 | જોકે 350 | વિશ્વ 351 | હેઠળ 352 | કહે 353 | દિવસે 354 | આધુનિક 355 | આદિવાસીઓની 356 | અત્યંત 357 | આગળ 358 | અર્થ 359 | શરૂઆત 360 | આવેલ 361 | મૃત્યુ 362 | જંગલ 363 | તેથી 364 | હિંદુ 365 | નગર 366 | દાહોદ 367 | કર્યું. 368 | અમેરિકન 369 | સ્થાનિક 370 | પહેલાં 371 | ચોક્કસ 372 | ત્યાર 373 | ૧૦ 374 | નાની 375 | કર્યો. 
376 | દિવેલી, 377 | કી 378 | સ્થાપના 379 | આંતરરાષ્ટ્રીય 380 | પ્રાચીન 381 | વખતે 382 | નવી 383 | રાજ્યમાં 384 | જણાવ્યું 385 | મંદિર 386 | was 387 | માર્ગ 388 | અમુક 389 | ગામનાં 390 | ચલાવે 391 | ભગવાન 392 | ઓછી 393 | પાક 394 | પંચાયતઘર 395 | મહેસાણા 396 | બીજી 397 | વિશાળ 398 | એવો 399 | સુદ 400 | ઘણાં 401 | સેવા 402 | લોકપ્રિય 403 | મહિનાનો 404 | હોવાનું 405 | પૂરી 406 | શેરડી, 407 | ખેત-ઉત્પાદનો 408 | that 409 | વદ 410 | ઉદાહરણ 411 | અમદાવાદ 412 | નદી 413 | અનુસાર 414 | માહિતી 415 | ભરૂચ 416 | -------------------------------------------------------------------------------- /justext/stoplists/French.txt: -------------------------------------------------------------------------------- 1 | de 2 | la 3 | et 4 | le 5 | à 6 | en 7 | des 8 | les 9 | du 10 | est 11 | un 12 | dans 13 | une 14 | par 15 | au 16 | pour 17 | qui 18 | il 19 | que 20 | a 21 | Le 22 | sur 23 | Il 24 | son 25 | avec 26 | se 27 | La 28 | plus 29 | sont 30 | ou 31 | Les 32 | En 33 | sa 34 | aux 35 | ce 36 | été 37 | comme 38 | ses 39 | pas 40 | ne 41 | deux 42 | fut 43 | mais 44 | d'un 45 | cette 46 | fait 47 | d'une 48 | entre 49 | ont 50 | était 51 | elle 52 | Elle 53 | leur 54 | aussi 55 | dont 56 | lui 57 | même 58 | où 59 | sous 60 | alors 61 | ainsi 62 | également 63 | être 64 | y 65 | nom 66 | qu'il 67 | premier 68 | très 69 | après 70 | peut 71 | partie 72 | ville 73 | ces 74 | première 75 | on 76 | années 77 | puis 78 | depuis 79 | tout 80 | lors 81 | contre 82 | plusieurs 83 | Ce 84 | Cette 85 | Dans 86 | commune 87 | groupe 88 | sans 89 | trois 90 | avait 91 | vers 92 | avant 93 | À 94 | fin 95 | bien 96 | C'est 97 | On 98 | région 99 | fois 100 | né 101 | France 102 | De 103 | leurs 104 | tous 105 | faire 106 | encore 107 | Après 108 | Un 109 | Son 110 | avoir 111 | située 112 | peu 113 | Une 114 | donc 115 | autres 116 | notamment 117 | Au 118 | ans 119 | famille 120 | mort 121 | ils 122 | soit 123 | grande 124 | pendant 125 | partir 126 | cours 127 | quelques 128 | place 129 | 
moins 130 | monde 131 | grand 132 | temps 133 | devient 134 | lieu 135 | n'est 136 | nombre 137 | si 138 | français 139 | début 140 | nouveau 141 | pays 142 | nombreux 143 | forme 144 | titre 145 | suite 146 | souvent 147 | Pour 148 | non 149 | trouve 150 | politique 151 | général 152 | Ils 153 | vie 154 | étaient 155 | guerre 156 | village 157 | série 158 | département 159 | quatre 160 | of 161 | près 162 | Mais 163 | sera 164 | saison 165 | nouvelle 166 | jusqu'à 167 | film 168 | situé 169 | Ces 170 | chez 171 | c'est 172 | furent 173 | doit 174 | roi 175 | selon 176 | toute 177 | celle 178 | celui 179 | ensuite 180 | permet 181 | dernier 182 | toujours 183 | Biographie. 184 | juin 185 | Paris 186 | d'autres 187 | étant 188 | autre 189 | fils 190 | population 191 | chaque 192 | tant 193 | s'est 194 | afin 195 | reste 196 | mai 197 | part 198 | Jean 199 | toutes 200 | car 201 | A 202 | système 203 | Sa 204 | peuvent 205 | durant 206 | d’un 207 | ligne 208 | ayant 209 | septembre 210 | certains 211 | mars 212 | d’une 213 | club 214 | janvier 215 | compte 216 | cas 217 | nombreuses 218 | juillet 219 | dès 220 | point 221 | l'un 222 | gouvernement 223 | centre 224 | (en 225 | niveau 226 | française 227 | Liens 228 | deuxième 229 | rôle 230 | habitants 231 | décembre 232 | président 233 | province 234 | cet 235 | premiers 236 | octobre 237 | nommé 238 | externes. 239 | nord 240 | face 241 | qu'elle 242 | novembre 243 | membre 244 | mois 245 | Histoire. 
246 | sud 247 | avril 248 | Guerre 249 | Par 250 | connu 251 | membres 252 | beaucoup 253 | Louis 254 | période 255 | jeu 256 | pouvoir 257 | prend 258 | Des 259 | jeune 260 | grâce 261 | carrière 262 | l'équipe 263 | août 264 | société 265 | laquelle 266 | droit 267 | Charles 268 | février 269 | chef 270 | n'a 271 | nationale 272 | musique 273 | père 274 | the 275 | seul 276 | "La 277 | va 278 | "Le 279 | ans, 280 | surtout 281 | parfois 282 | cinq 283 | déjà 284 | Pierre 285 | Ses 286 | jour 287 | France, 288 | sein 289 | joueur 290 | long 291 | dit 292 | service 293 | poste 294 | succès 295 | ancien 296 | New 297 | terme 298 | type 299 | développement 300 | construction 301 | fit 302 | corps 303 | personnes 304 | tour 305 | album 306 | eu 307 | château 308 | travail 309 | plupart 310 | base 311 | mise 312 | version 313 | présente 314 | petit 315 | projet 316 | troisième 317 | française, 318 | grands 319 | création 320 | homme 321 | devant 322 | raison 323 | ceux 324 | jamais 325 | environ 326 | tête 327 | utilisé 328 | seulement 329 | nous 330 | site 331 | l'on 332 | qu’il 333 | parmi 334 | milieu 335 | autour 336 | sorti 337 | produit 338 | national 339 | rue 340 | études 341 | millions 342 | John 343 | font 344 | but 345 | territoire 346 | Depuis 347 | production 348 | prix 349 | quand 350 | créé 351 | porte 352 | aurait 353 | aujourd'hui 354 | II 355 | France. 356 | américain 357 | ministre 358 | dernière 359 | donne 360 | possède 361 | langue 362 | espèce 363 | d'être 364 | travaux 365 | existe 366 | mis 367 | ans. 
368 | cause 369 | rencontre 370 | rapport 371 | certaines 372 | petite 373 | joue 374 | fonction 375 | jours 376 | écrit 377 | s'agit 378 | (né 379 | football 380 | États-Unis 381 | hommes 382 | seule 383 | et, 384 | The 385 | match 386 | direction 387 | lequel 388 | siège 389 | l'année 390 | côté 391 | parti 392 | communes 393 | appelé 394 | maison 395 | mouvement 396 | premières 397 | Grand 398 | fille 399 | Selon 400 | République 401 | exemple 402 | and 403 | l'histoire 404 | Lors 405 | grandes 406 | généralement 407 | réalisé 408 | loi 409 | cependant 410 | Coupe 411 | lorsque 412 | passe 413 | avaient 414 | seconde 415 | -------------------------------------------------------------------------------- /justext/stoplists/Galician.txt: -------------------------------------------------------------------------------- 1 | de 2 | e 3 | a 4 | o 5 | que 6 | do 7 | en 8 | da 9 | un 10 | é 11 | no 12 | unha 13 | na 14 | os 15 | A 16 | dos 17 | O 18 | por 19 | como 20 | para 21 | ao 22 | se 23 | con 24 | as 25 | foi 26 | súa 27 | máis 28 | seu 29 | á 30 | ou 31 | das 32 | non 33 | ano 34 | En 35 | polo 36 | entre 37 | son 38 | co 39 | tamén 40 | Os 41 | pola 42 | seus 43 | coa 44 | parte 45 | nos 46 | sobre 47 | ser 48 | onde 49 | nas 50 | As 51 | anos 52 | dun 53 | ata 54 | pero 55 | ó 56 | era 57 | concello 58 | cidade 59 | No 60 | súas 61 | cando 62 | aos 63 | está 64 | dunha 65 | século 66 | durante 67 | primeiro 68 | lugar 69 | habitantes 70 | xa 71 | aínda 72 | ten 73 | nome 74 | dous 75 | San 76 | segundo 77 | foron 78 | moi 79 | primeira 80 | nun 81 | parroquia 82 | mesmo 83 | ás 84 | eran 85 | desde 86 | só 87 | polos 88 | Na 89 | forma 90 | poboación 91 | nunha 92 | tres 93 | este 94 | maior 95 | É 96 | todo 97 | despois 98 | dúas 99 | esta 100 | Foi 101 | homes 102 | contra 103 | así 104 | pode 105 | gran 106 | sen 107 | tiña 108 | provincia 109 | cos 110 | outros 111 | grupo 112 | illa 113 | vez 114 | dende 115 | coma 116 | sendo 117 | nado 118 | cunha 119 | 
baixo 120 | cara 121 | cada 122 | relación 123 | cal 124 | lle 125 | novo 126 | Estados 127 | Tamén 128 | rei 129 | hai 130 | mais 131 | río 132 | cales 133 | tempo 134 | cun 135 | norte 136 | comarca 137 | equipo 138 | número 139 | goberno 140 | xunto 141 | tivo 142 | Segundo 143 | Guerra 144 | banda 145 | están 146 | capital 147 | sistema 148 | outras 149 | supón 150 | sur 151 | Este 152 | estado 153 | Esta 154 | antes 155 | lingua 156 | grandes 157 | Durante 158 | rexión 159 | grande 160 | logo 161 | poden 162 | até 163 | partir 164 | quen 165 | vida 166 | estaba 167 | coñecido 168 | datos 169 | contaba 170 | todos 171 | Traxectoria. 172 | tanto 173 | catro 174 | centro 175 | moitos 176 | teñen 177 | varios 178 | obra 179 | Un 180 | día 181 | importante 182 | poder 183 | historia 184 | mentres 185 | habitantes. 186 | tempada 187 | tras 188 | Santa 189 | Universidade 190 | polas 191 | ben 192 | Por 193 | Con 194 | debido 195 | principal 196 | zona 197 | orixe 198 | Historia. 199 | serie 200 | guerra 201 | coas 202 | comezou 203 | maioría 204 | país 205 | menos 206 | familia 207 | deste 208 | longo 209 | primeiros 210 | época 211 | pouco 212 | e, 213 | toda 214 | algúns 215 | Unha 216 | uns 217 | segunda 218 | través 219 | el 220 | ter 221 | mundo 222 | Ao 223 | dentro 224 | Para 225 | medio 226 | partido 227 | España 228 | final 229 | nova 230 | decembro 231 | tipo 232 | período 233 | outra 234 | maio 235 | veces 236 | varias 237 | ademais 238 | xaneiro 239 | morte 240 | la 241 | xullo 242 | fixo 243 | case 244 | desta 245 | Nova 246 | xeito 247 | entón 248 | De 249 | (en 250 | María 251 | chamado 252 | xuño 253 | setembro 254 | Tiña 255 | marzo 256 | conta 257 | Entre 258 | fillo 259 | abril 260 | Galicia 261 | actual 262 | ós 263 | I 264 | mellor 265 | todo, 266 | outubro 267 | of 268 | política 269 | mesma 270 | agosto 271 | II 272 | que, 273 | millóns 274 | diferentes 275 | álbum 276 | caso 277 | punto 278 | fai 279 | territorio 280 | outro 281 | del 282 | 
superficie 283 | diminución 284 | preto 285 | Está 286 | obras 287 | Como 288 | moitas 289 | área 290 | uso 291 | finais 292 | persoas 293 | principais 294 | febreiro 295 | novembro 296 | situada 297 | presidente 298 | Europa 299 | Historia 300 | situado 301 | illas 302 | Aínda 303 | traballo 304 | Non 305 | coñecida 306 | Partido 307 | facer 308 | título 309 | moito 310 | fronte 311 | importantes 312 | Imperio 313 | Santiago 314 | base 315 | Reino 316 | países 317 | anos, 318 | pasou 319 | sempre 320 | todas 321 | chegou 322 | feito 323 | República 324 | nivel 325 | cinco 326 | días 327 | leste 328 | Ligazóns 329 | estes 330 | Unidos 331 | José 332 | hoxe 333 | externas. 334 | membros 335 | Despois 336 | calquera 337 | cidades 338 | oeste 339 | música 340 | Carlos 341 | fin 342 | anos. 343 | Ten 344 | The 345 | momento 346 | década 347 | Gran 348 | chamada 349 | total 350 | Cando 351 | km 352 | algunhas 353 | senón 354 | termo 355 | movemento 356 | Unión 357 | Segunda 358 | mulleres. 359 | propio 360 | the 361 | Tras 362 | tarde 363 | tan 364 | auga 365 | construción 366 | último 367 | finado 368 | Instituto 369 | pesar 370 | grupos 371 | carreira 372 | proceso 373 | resto 374 | sería 375 | xeral 376 | media 377 | desenvolvemento 378 | Estado 379 | galego 380 | exército 381 | Francia 382 | metros 383 | produción 384 | "The 385 | principalmente 386 | Nos 387 | club 388 | especialmente 389 | Manuel 390 | Pedro 391 | campo 392 | pois 393 | éxito 394 | respecto 395 | mediante 396 | estilo 397 | coruñés 398 | atópase 399 | conxunto 400 | antigo 401 | Terra 402 | deles 403 | liña 404 | Premio 405 | estas 406 | casa 407 | y 408 | disco 409 | "A 410 | político 411 | Real 412 | Nacional 413 | ante 414 | partidos 415 | posto 416 | membro 417 | libro 418 | igrexa 419 | eleccións 420 | nin 421 | reino 422 | habitantes, 423 | estivo 424 | único 425 | pai 426 | Desde 427 | puntos 428 | zonas 429 | primeiras 430 | fora 431 | converteuse 432 | Madrid 433 | seguinte 434 | oficial 
435 | mulleres 436 | posición 437 | diversos 438 | tal 439 | ese 440 | poboación, 441 | aparece 442 | Igrexa 443 | papel 444 | versión 445 | costa 446 | porque 447 | cultura 448 | actividade 449 | antiga 450 | -------------------------------------------------------------------------------- /justext/stoplists/Persian.txt: -------------------------------------------------------------------------------- 1 | و 2 | در 3 | به 4 | از 5 | که 6 | این 7 | را 8 | با 9 | است. 10 | آن 11 | است 12 | سال 13 | یک 14 | برای 15 | او 16 | بر 17 | سیارک 18 | خود 19 | شد. 20 | تا 21 | یا 22 | شده 23 | نیز 24 | نام 25 | ایران 26 | قرار 27 | بود 28 | شهر 29 | یکی 30 | پس 31 | دو 32 | می 33 | کشف 34 | وی 35 | بود. 36 | هم 37 | می‌شود. 38 | استان 39 | استفاده 40 | بخش 41 | شد 42 | دیگر 43 | عنوان 44 | هر 45 | شهرستان 46 | کرد. 47 | دارد. 48 | توسط 49 | برابر 50 | وجود 51 | کار 52 | آنها 53 | هزار 54 | اما 55 | کرد 56 | کشور 57 | می‌شود 58 | مورد 59 | نفر 60 | صورت 61 | روی 62 | شده‌است 63 | دارد 64 | زمان 65 | همچنین 66 | دست 67 | زبان 68 | شده‌است. 69 | دارای 70 | دانشگاه 71 | جمعیت 72 | بوده 73 | واقع 74 | بعد 75 | می‌باشد. 76 | بین 77 | دهستان 78 | مرکز 79 | روز 80 | بسیار 81 | کرده 82 | مردم 83 | بیشتر 84 | کتاب 85 | روستا 86 | پیوند 87 | نظر 88 | پیش 89 | شرکت 90 | سه 91 | داشته 92 | قدر 93 | آب 94 | ولی 95 | انجام 96 | گروه 97 | مطلق 98 | فیلم 99 | میان 100 | تاریخ 101 | بزرگ 102 | بیرون. 103 | آمریکا 104 | زندگی 105 | مانند 106 | تهران 107 | جنگ 108 | اولین 109 | می‌کند. 110 | میلادی 111 | چند 112 | منطقه 113 | برخی 114 | بسیاری 115 | راه 116 | مرکزی 117 | آغاز 118 | داده 119 | دوره 120 | حال 121 | اصلی 122 | اول 123 | حدود 124 | می‌توان 125 | بار 126 | همین 127 | بازی 128 | تشکیل 129 | توابع 130 | تنها 131 | همراه 132 | محمد 133 | جنوب 134 | می‌کند 135 | گرفته 136 | جهان 137 | ملی 138 | دوران 139 | کردن 140 | سرشماری 141 | ایجاد 142 | تولید 143 | شدن 144 | دولت 145 | سازمان 146 | باید 147 | شمال 148 | است، 149 | جمعیت. 150 | بوده‌است. 
151 | اسلامی 152 | اثر 153 | جمله 154 | شامل 155 | هستند. 156 | شرقی 157 | زیر 158 | منابع. 159 | ساخته 160 | ایران. 161 | دوم 162 | شاه 163 | اگر 164 | روستایی 165 | شکل 166 | باشد. 167 | موسیقی 168 | اساس 169 | هستند 170 | آثار 171 | نوع 172 | زیادی 173 | نشان 174 | دلیل 175 | طول 176 | ماه 177 | بیش 178 | ایرانی 179 | مختلف 180 | شود. 181 | سر 182 | همان 183 | ای 184 | چون 185 | توجه 186 | غربی 187 | بن 188 | نقش 189 | نخستین 190 | داشت 191 | تمام 192 | جمهوری 193 | پایان 194 | طور 195 | همه 196 | فارسی 197 | گفته 198 | می‌کنند. 199 | کند. 200 | زمین 201 | بود، 202 | سطح 203 | علی 204 | تیم 205 | می‌باشد 206 | جهانی 207 | قبل 208 | هجری 209 | جهت 210 | آن‌ها 211 | اینکه 212 | حزب 213 | سیاسی 214 | قابل 215 | باعث 216 | سپس 217 | باشد 218 | چهار 219 | تحت 220 | تغییر 221 | دیگری 222 | یعنی 223 | تعداد 224 | وارد 225 | منتشر 226 | داشت. 227 | آذربایجان 228 | ادامه 229 | نسبت 230 | بی 231 | بودند 232 | اشاره 233 | ۱۳۸۵ 234 | سوی 235 | ساخت 236 | غیر 237 | داد 238 | خواهد 239 | معروف 240 | بیست 241 | قرن 242 | انقلاب 243 | مجموعه 244 | داد. 245 | فعالیت 246 | حکومت 247 | می‌تواند 248 | جدید 249 | خط 250 | اطلاعات 251 | جای 252 | متر 253 | سمت 254 | تبدیل 255 | محل 256 | شروع 257 | معمولاً 258 | زمانی 259 | نه 260 | آمار 261 | انتخاب 262 | کنار 263 | برنامه 264 | ممکن 265 | بدون 266 | رشته 267 | تاریخی 268 | افزایش 269 | سیستم 270 | آنجا 271 | می‌شوند. 272 | گرفت. 273 | طی 274 | مجلس 275 | افراد 276 | دارند. 277 | باز 278 | مرگ 279 | براساس 280 | اعلام 281 | دهه 282 | حتی 283 | علوم 284 | طریق 285 | مدت 286 | مربوط 287 | پیدا 288 | روستای 289 | بنا 290 | مناطق 291 | هیچ 292 | ما 293 | چه 294 | دنیا 295 | شناخته 296 | دریافت 297 | کشورهای 298 | کوچک 299 | آنان 300 | غرب 301 | جریان 302 | دارند 303 | ایالات 304 | کمک 305 | حضور 306 | روش 307 | نوشته 308 | هنگام 309 | عمل 310 | چاپ 311 | کوه 312 | سید 313 | شدند. 
314 | چنین 315 | سپتامبر 316 | درجه 317 | بالا 318 | ۲ 319 | کیلومتر 320 | فرانسه 321 | موجود 322 | کند 323 | رنگ 324 | رئیس 325 | قانون 326 | قدرت 327 | سی 328 | جنوبی 329 | روستاهای 330 | فوتبال 331 | مهم 332 | دور 333 | نزدیک 334 | حرکت 335 | داستان 336 | متحده 337 | فرهنگ 338 | کردند. 339 | من 340 | نمود. 341 | معنی 342 | نیروهای 343 | کم 344 | شهرهای 345 | نیروی 346 | سازی 347 | ۳ 348 | قسمت 349 | طرف 350 | آلمان 351 | خارج 352 | علت 353 | بودن 354 | آخرین 355 | مسجد 356 | مواد 357 | می‌کنند 358 | سایر 359 | تقویم 360 | خان 361 | زنان 362 | پسر 363 | دیده 364 | شود 365 | خانواده 366 | گرفت 367 | خانه 368 | علمی 369 | آزاد 370 | شمار 371 | بودند. 372 | آلبوم 373 | پنج 374 | سوم 375 | عضو 376 | طبیعی 377 | تر 378 | میلیون 379 | کاهش 380 | انسان 381 | اسلام 382 | فقط 383 | نتیجه 384 | طراحی 385 | آموزش 386 | شبکه 387 | ایالت 388 | بهترین 389 | (به 390 | گذشته 391 | اکنون 392 | گردید. 393 | عمومی 394 | نظامی 395 | بین‌المللی 396 | اجتماعی 397 | انگلیسی 398 | دومین 399 | شورای 400 | رسمی 401 | نیاز 402 | حالت 403 | ۵ 404 | ارائه 405 | می‌دهد. 406 | کننده 407 | پدر 408 | شمالی 409 | سال‌های 410 | ۴ 411 | خانوار) 412 | خاطر 413 | کامل 414 | قلعه 415 | سرعت 416 | بعضی 417 | کنند. 418 | بازار 419 | پی 420 | ریاست 421 | ایران، 422 | صد 423 | اعضای 424 | اروپا 425 | ویژه 426 | حاضر 427 | می‌دهد 428 | (در 429 | یافت 430 | شیخ 431 | حمله 432 | جایزه 433 | نمایش 434 | کردند 435 | شرق 436 | ترتیب 437 | زن 438 | بیماری 439 | قمری 440 | شد، 441 | یافت. 442 | فرهنگی 443 | کیلومتری 444 | جزیره 445 | درصد 446 | داخل 447 | ابتدا 448 | علاوه 449 | آمریکایی 450 | دکتر 451 | اکتبر 452 | انواع 453 | بررسی 454 | کنترل 455 | آمده 456 | وزارت 457 | خورشیدی 458 | عربی 459 | حقوق 460 | عراق 461 | بخشی 462 | ۱۰ 463 | زمینه 464 | البته 465 | امروزه 466 | انتخابات 467 | توسعه 468 | منبع. 
469 | مشهور 470 | بدن 471 | رو 472 | مازندران 473 | تقسیم 474 | جان 475 | منابع 476 | کل 477 | فرزند 478 | سبک 479 | ارتباط 480 | حسین 481 | اقتصادی 482 | درباره 483 | میدان 484 | هنوز 485 | هنر 486 | -------------------------------------------------------------------------------- /justext/stoplists/Norwegian_Bokmal.txt: -------------------------------------------------------------------------------- 1 | i 2 | og 3 | av 4 | som 5 | er 6 | en 7 | til 8 | på 9 | ble 10 | for 11 | med 12 | var 13 | den 14 | det 15 | å 16 | fra 17 | de 18 | har 19 | et 20 | at 21 | han 22 | I 23 | også 24 | ved 25 | Han 26 | om 27 | Det 28 | seg 29 | men 30 | ikke 31 | hadde 32 | Den 33 | etter 34 | eller 35 | under 36 | kan 37 | andre 38 | første 39 | sin 40 | over 41 | to 42 | mot 43 | da 44 | De 45 | mellom 46 | fikk 47 | ut 48 | (født 49 | flere 50 | ligger 51 | år 52 | hans 53 | der 54 | kom 55 | opp 56 | blir 57 | dette 58 | hun 59 | denne 60 | blant 61 | så 62 | hvor 63 | inn 64 | være 65 | man 66 | ha 67 | mange 68 | vært 69 | blitt 70 | kjent 71 | Etter 72 | Dette 73 | En 74 | noen 75 | sammen 76 | annet 77 | tre 78 | før 79 | alle 80 | noe 81 | enn 82 | sitt 83 | del 84 | tidligere 85 | sine 86 | store 87 | mer 88 | kunne 89 | samme 90 | gikk 91 | siden 92 | gjennom 93 | senere 94 | På 95 | norsk 96 | of 97 | død 98 | skulle 99 | mest 100 | vant 101 | mens 102 | bare 103 | rundt 104 | både 105 | Hun 106 | skal 107 | vil 108 | disse 109 | tok 110 | nye 111 | få 112 | bli 113 | finnes 114 | ofte 115 | norske 116 | stor 117 | Norge 118 | kommune 119 | hele 120 | selv 121 | største 122 | slik 123 | går 124 | siste 125 | dag 126 | januar 127 | gang 128 | tilbake 129 | navnet 130 | byen 131 | deltok 132 | nå 133 | spilte 134 | fire 135 | rekke 136 | brukt 137 | grunn 138 | tillegg 139 | Hans 140 | begynte 141 | mai 142 | kalt 143 | Fra 144 | tid 145 | mars 146 | Da 147 | laget 148 | august 149 | samt 150 | først 151 | ham 152 | Denne 153 | gjorde 154 | juni 155 | når 156 | mye 157 
| the 158 | navn 159 | april 160 | mindre 161 | desember 162 | uten 163 | ville 164 | oktober 165 | kommer 166 | september 167 | deres 168 | igjen 169 | juli 170 | februar 171 | dem 172 | hos 173 | km 174 | brukes 175 | plass 176 | amerikanske 177 | meter 178 | større 179 | november 180 | The 181 | startet 182 | svært 183 | tatt 184 | utgitt 185 | helt 186 | ned 187 | fleste 188 | døde 189 | tyske 190 | Under 191 | ny 192 | imidlertid 193 | måtte 194 | Ved 195 | derfor 196 | innen 197 | Men 198 | satt 199 | består 200 | Et 201 | kun 202 | må 203 | For 204 | frem 205 | gamle 206 | ulike 207 | New 208 | bygget 209 | skrevet 210 | Oslo 211 | særlig 212 | tiden 213 | forskjellige 214 | sett 215 | gitt 216 | egen 217 | deler 218 | County 219 | beste 220 | løpet 221 | hvert 222 | Som 223 | eksempel 224 | amerikansk 225 | slutten 226 | kjente 227 | spilt 228 | annen 229 | arbeidet 230 | delstaten 231 | bruk 232 | år. 233 | fem 234 | nord 235 | født 236 | førte 237 | fordi 238 | ca. 239 | dermed 240 | like 241 | hver 242 | ingen 243 | Disse 244 | perioden 245 | Norge. 246 | John 247 | fram 248 | eneste 249 | tredje 250 | stort 251 | ta 252 | USA 253 | medlem 254 | spiller 255 | ga 256 | skrev 257 | langs 258 | får 259 | hennes 260 | bak 261 | området 262 | viktig 263 | britiske 264 | ett 265 | Norges 266 | begge 267 | "The 268 | ganger 269 | flyttet 270 | and 271 | fått 272 | gir 273 | deretter 274 | spesielt 275 | videre 276 | årene 277 | sønn 278 | står 279 | utviklet 280 | kalles 281 | fortsatt 282 | lag 283 | la 284 | gjerne 285 | holdt 286 | gjort 287 | millioner 288 | Sommer-OL 289 | vi 290 | land 291 | leder 292 | sør 293 | langt 294 | innbyggere 295 | St. 
296 | lang 297 | delen 298 | vanlig 299 | viser 300 | omkring 301 | godt 302 | hatt 303 | by 304 | utenfor 305 | lever 306 | gjør 307 | lå 308 | året 309 | nesten 310 | ledet 311 | tidlig 312 | følge 313 | landet 314 | franske 315 | liten 316 | Med 317 | allerede 318 | seks 319 | gi 320 | betyr 321 | opprinnelig 322 | rett 323 | alt 324 | basert 325 | opprettet 326 | OL 327 | nær 328 | lite 329 | Byen 330 | vanligvis 331 | moderne 332 | små 333 | provinsen 334 | olympiske 335 | form 336 | funnet 337 | Her 338 | regnes 339 | kort 340 | grunnlagt 341 | Noen 342 | Til 343 | aldri 344 | kommune. 345 | nytt 346 | gjøre 347 | begynnelsen 348 | kirke 349 | stedet 350 | svenske 351 | innenfor 352 | gå 353 | forhold 354 | kong 355 | viktigste 356 | tysk 357 | forbindelse 358 | vest 359 | eget 360 | Eksterne 361 | nok 362 | lenker. 363 | Selv 364 | se 365 | Norsk 366 | Mange 367 | valgt 368 | én 369 | samtidig 370 | etablert 371 | åpnet 372 | meget 373 | Historie. 374 | personer 375 | tar 376 | internasjonale 377 | USA. 378 | meste 379 | sted 380 | bandet 381 | mål 382 | politiske 383 | her 384 | lagt 385 | engelske 386 | grad 387 | finner 388 | enkelte 389 | filmen 390 | fant 391 | fortsatte 392 | øst 393 | mulig 394 | utnevnt 395 | krigen 396 | egne 397 | omtrent 398 | mennesker 399 | Siden 400 | Andre 401 | antall 402 | Når 403 | svensk 404 | inneholder 405 | von 406 | engelsk 407 | slutt 408 | gruppe 409 | gruppen 410 | Man 411 | litt 412 | Fylkesvei 413 | bruker 414 | dannet 415 | lange 416 | støtte 417 | ca 418 | komme 419 | sendt 420 | ei 421 | ganske 422 | slått 423 | bygd 424 | hjelp 425 | foran 426 | Sverige 427 | tross 428 | eldste 429 | hva 430 | brukte 431 | barn 432 | tilhører 433 | ser 434 | øya 435 | England 436 | VM 437 | bruke 438 | bl.a. 439 | Tyskland 440 | si 441 | olympisk 442 | delt 443 | albumet 444 | Senere 445 | god 446 | sterkt 447 | høyeste 448 | II 449 | gav 450 | hovedsakelig 451 | Oslo. 
452 | verdenskrig 453 | knyttet 454 | slaget 455 | inntil 456 | minst 457 | Navnet 458 | via 459 | menn 460 | hvordan 461 | Frankrike 462 | verdens 463 | danske 464 | fast 465 | finne 466 | direkte 467 | politisk 468 | Universitetet 469 | endte 470 | dagens 471 | landets 472 | lokale 473 | klubben 474 | rolle 475 | -------------------------------------------------------------------------------- /tests/test_html_encoding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division, print_function, unicode_literals 5 | 6 | import unittest 7 | import pytest 8 | 9 | from justext.core import JustextError, decode_html 10 | 11 | 12 | class TestHtmlEncoding(unittest.TestCase): 13 | 14 | def assert_strings_equal(self, s1, s2): 15 | assert type(s1) == type(s2) 16 | assert s1 == s2 17 | 18 | def test_unicode(self): 19 | html = "ľščťžýáíéäňúô Ł€" 20 | decoded_html = decode_html(html) 21 | 22 | self.assert_strings_equal(html, decoded_html) 23 | 24 | def test_utf8_bytes(self): 25 | html = "ľščťžýáíéäňúô Ł€" 26 | decoded_html = decode_html(html.encode("utf8")) 27 | 28 | self.assert_strings_equal(html, decoded_html) 29 | 30 | def test_meta_detection_1(self): 31 | html = ' ľščťžäňôě' 32 | decoded_html = decode_html(html.encode("iso-8859-2")) 33 | 34 | self.assert_strings_equal(html, decoded_html) 35 | 36 | def test_meta_detection_2(self): 37 | html = ' ľščťžäňôě' 38 | decoded_html = decode_html(html.encode("iso-8859-2")) 39 | 40 | self.assert_strings_equal(html, decoded_html) 41 | 42 | def test_meta_detection_3(self): 43 | html = ' ľščťžäňôě' 44 | decoded_html = decode_html(html.encode("iso-8859-2")) 45 | 46 | self.assert_strings_equal(html, decoded_html) 47 | 48 | def test_meta_detection_4(self): 49 | html = ' ľščťžäňôě' 50 | decoded_html = decode_html(html.encode("iso-8859-2")) 51 | 52 | self.assert_strings_equal(html, decoded_html) 53 | 54 | def 
test_meta_detection_5(self): 55 | html = ' ľščťžäňôě' 56 | decoded_html = decode_html(html.encode("iso-8859-2")) 57 | 58 | self.assert_strings_equal(html, decoded_html) 59 | 60 | def test_meta_detection_6(self): 61 | html = ' ľščťžäňôě' 62 | decoded_html = decode_html(html.encode("iso-8859-2")) 63 | 64 | self.assert_strings_equal(html, decoded_html) 65 | 66 | def test_meta_detection_7(self): 67 | html = ' ľščťžäňôě' 68 | decoded_html = decode_html(html.encode("iso-8859-2")) 69 | 70 | self.assert_strings_equal(html, decoded_html) 71 | 72 | def test_meta_detection_8(self): 73 | html = ' ľščťžäňôě' 74 | decoded_html = decode_html(html.encode("iso-8859-2")) 75 | 76 | self.assert_strings_equal(html, decoded_html) 77 | 78 | def test_meta_detection_9(self): 79 | html = ' ľščťžäňôě' 80 | decoded_html = decode_html(html.encode("iso-8859-2")) 81 | 82 | self.assert_strings_equal(html, decoded_html) 83 | 84 | def test_meta_detection_charset_outside_1(self): 85 | html = ' charset="iso-fake-29" ľščťžäňôě' 86 | decoded_html = decode_html(html.encode("iso-8859-2")) 87 | 88 | self.assert_strings_equal(html, decoded_html) 89 | 90 | def test_meta_detection_charset_outside_2(self): 91 | html = ' charset="iso-fake-29" ľščťžäňôě' 92 | decoded_html = decode_html(html.encode("iso-8859-2")) 93 | 94 | self.assert_strings_equal(html, decoded_html) 95 | 96 | def test_meta_detection_charset_outside_3(self): 97 | html = ' charset="iso-fake-29" ľščťžäňôě' 98 | decoded_html = decode_html(html.encode("iso-8859-2")) 99 | 100 | self.assert_strings_equal(html, decoded_html) 101 | 102 | def test_unknown_encoding_in_strict_mode(self): 103 | html = 'ľščťžäňôě' 104 | with pytest.raises(JustextError): 105 | decode_html(html.encode("iso-8859-2"), errors='strict') 106 | 107 | def test_unknown_encoding_with_default_error_handler(self): 108 | html = 'ľščťžäňôě' 109 | decoded = decode_html(html.encode("iso-8859-2"), default_encoding="iso-8859-2") 110 | self.assertEqual(decoded, html) 111 | 112 | def 
test_default_encoding(self): 113 | html = 'ľščťžäňôě' 114 | decoded_html = decode_html(html.encode("iso-8859-2"), default_encoding="iso-8859-2") 115 | 116 | self.assert_strings_equal(html, decoded_html) 117 | 118 | def test_given_encoding(self): 119 | html = 'ľščťžäňôě' 120 | decoded_html = decode_html(html.encode("iso-8859-2"), encoding="iso-8859-2") 121 | 122 | self.assert_strings_equal(html, decoded_html) 123 | 124 | def test_given_wrong_encoding(self): 125 | html = 'ľščťžäňôě' 126 | decoded_html = decode_html(html.encode("iso-8859-2"), encoding="ASCII") 127 | 128 | self.assert_strings_equal("\ufffd" * len(html), decoded_html) 129 | 130 | def test_fake_encoding_in_meta(self): 131 | html = ' ľščťžäňôě' 132 | 133 | with pytest.raises(JustextError): 134 | decode_html(html.encode("iso-8859-2"), errors='strict') 135 | -------------------------------------------------------------------------------- /tests/test_dom_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division, print_function, unicode_literals 5 | 6 | import lxml.etree 7 | import pytest 8 | 9 | from lxml import html 10 | from justext.core import preprocessor, html_to_dom 11 | 12 | 13 | def test_remove_comments(): 14 | dom = html.fromstring( 15 | '' 16 | '

Header

' 17 | ' text' 18 | '

footer' 19 | '' 20 | ) 21 | 22 | expected = '

Header

text

footer

' 23 | returned = html.tostring(dom).decode("utf8") 24 | assert expected == returned 25 | 26 | dom = preprocessor(dom) 27 | 28 | expected = '

Header

text

footer

' 29 | returned = html.tostring(dom).decode("utf8") 30 | assert expected == returned 31 | 32 | 33 | def test_remove_head_tag(): 34 | html_string = ( 35 | 'Title' 36 | '

Header

' 37 | '

text

' 38 | '

footer like a boss

' 39 | '' 40 | ) 41 | 42 | dom = html.fromstring(html_string) 43 | returned = html.tostring(dom).decode("utf8") 44 | assert html_string == returned 45 | 46 | dom = preprocessor(dom) 47 | returned = html.tostring(dom).decode("utf8") 48 | expected = ( 49 | '' 50 | '

Header

' 51 | '

text

' 52 | '

footer like a boss

' 53 | '' 54 | ) 55 | assert expected == returned 56 | 57 | 58 | def test_preprocess_simple_unicode_string(): 59 | html_string = ( 60 | 'Title' 61 | '

Header

' 62 | '

pretextpostemphpopost

' 63 | '

footer like a boss

' 64 | '' 65 | ) 66 | 67 | dom = preprocessor(html_to_dom(html_string)) 68 | returned = html.tostring(dom).decode("utf8") 69 | expected = ( 70 | '' 71 | '

Header

' 72 | '

pretextpostemphpopost

' 73 | '

footer like a boss

' 74 | '' 75 | ) 76 | assert expected == returned 77 | 78 | 79 | def test_preprocess_simple_bytes_string(): 80 | html_string = ( 81 | b'Title' 82 | b'

Header

' 83 | b'

pretextpostemphpopost

' 84 | b'

footer like a boss

' 85 | b' \n' 86 | b'' 87 | ) 88 | 89 | dom = preprocessor(html_to_dom(html_string)) 90 | returned = html.tostring(dom).decode("utf8") 91 | expected = ( 92 | '' 93 | '

Header

' 94 | '

pretextpostemphpopost

' 95 | '

footer like a boss

' 96 | ' \n' 97 | '' 98 | ) 99 | assert expected == returned 100 | 101 | 102 | def test_preprocess_simple_unicode_xhtml_string_with_declaration(): 103 | html_string = ( 104 | '' 105 | '' 106 | '' 107 | '' 108 | 'Hello World' 109 | '' 110 | '' 111 | '' 112 | '' 113 | '' 114 | '' 115 | ) 116 | 117 | dom = preprocessor(html_to_dom(html_string)) 118 | returned = html.tostring(dom).decode("utf8") 119 | expected = ( 120 | '' 121 | '' 122 | '' 123 | '' 124 | ) 125 | assert expected == returned 126 | 127 | 128 | def test_preprocess_simple_bytes_xhtml_string_with_declaration(): 129 | html_string = ( 130 | b'' 131 | b'' 132 | b'' 133 | b'' 134 | b'Hello World' 135 | b'' 136 | b'' 137 | b'' 138 | b'' 139 | b'' 140 | b'' 141 | ) 142 | 143 | dom = preprocessor(html_to_dom(html_string)) 144 | returned = html.tostring(dom).decode("utf8") 145 | expected = ( 146 | '' 147 | '' 148 | '' 149 | '' 150 | ) 151 | assert expected == returned 152 | 153 | 154 | def test_lxml_do_not_hold_context_from_previous_parsing(): 155 | """ 156 | https://github.com/miso-belica/jusText/issues/17 157 | """ 158 | html_to_dom("") 159 | 160 | with pytest.raises(lxml.etree.ParserError) as e: 161 | html_to_dom("") 162 | 163 | assert "justext" not in str(e.value) 164 | -------------------------------------------------------------------------------- /justext/stoplists/Occitan.txt: -------------------------------------------------------------------------------- 1 | de 2 | la 3 | e 4 | lo 5 | en 6 | a 7 | que 8 | es 9 | per 10 | un 11 | del 12 | las 13 | una 14 | dins 15 | son 16 | los 17 | se 18 | Lo 19 | La 20 | o 21 | dels 22 | amb 23 | al 24 | mai 25 | foguèt 26 | coma 27 | deu 28 | ua 29 | pas 30 | __NOTOC__ 31 | region 32 | comuna 33 | qu'ei 34 | departament 35 | tanben 36 | En 37 | entre 38 | l'an 39 | au 40 | sus 41 | calendièr 42 | dens 43 | situada 44 | pagina 45 | concernís 46 | "Aquesta 47 | gregorian." 
48 | d'un 49 | dau 50 | Los 51 | qui 52 | lei 53 | Es 54 | sa 55 | pus 56 | sègle 57 | i 58 | cap 59 | èra 60 | fòrça 61 | nom 62 | dab 63 | vila 64 | A 65 | mas 66 | (en 67 | Las 68 | deus 69 | ho 70 | d'una 71 | francés) 72 | d'Occitània, 73 | pòt 74 | dei 75 | Sant 76 | mei 77 | Se 78 | Sa 79 | fins 80 | era 81 | pel 82 | primièr 83 | francés 84 | çò 85 | Mar 86 | soa 87 | Mai 88 | tot 89 | dos 90 | lenga 91 | an 92 | Literatura 93 | Que 94 | d'Aquitània. 95 | ont 96 | eth 97 | Sent 98 | Musica 99 | Sciéncia 100 | ne 101 | lor 102 | Junh 103 | Espòrt 104 | Set 105 | Nov 106 | Gen 107 | Feb 108 | Oct 109 | Dec 110 | Ago 111 | Abr 112 | Julh 113 | president 114 | fin 115 | als 116 | com 117 | non 118 | populacion 119 | partida 120 | part 121 | Dins 122 | Gasconha, 123 | país 124 | des 125 | ans 126 | administrada 127 | dempuèi 128 | Un 129 | li 130 | foguèron 131 | pels 132 | tres 133 | capitala 134 | francesa 135 | fa 136 | sens 137 | siá 138 | Son 139 | leis 140 | Per 141 | peu 142 | pendent 143 | qu'es 144 | et 145 | sos 146 | èsser 147 | rei 148 | plan 149 | De 150 | sud 151 | Estats 152 | doas 153 | nòrd 154 | entà 155 | dera 156 | après 157 | temps 158 | Foguèt 159 | quand 160 | França 161 | familha 162 | forma 163 | grop 164 | aquela 165 | deth 166 | ciutat 167 | segon 168 | província 169 | estat 170 | abans 171 | granda 172 | sas 173 | còp 174 | partir 175 | contra 176 | aquel 177 | le 178 | Prèmi 179 | on 180 | encara 181 | durant 182 | poder 183 | que's 184 | Miègjorn-Pirenèus. 185 | I 186 | mes 187 | periòde 188 | màger 189 | à 190 | lengas 191 | òme 192 | aquesta 193 | istorica 194 | Joan 195 | francés: 196 | govèrn 197 | superfícia 198 | les 199 | còps 200 | sovent 201 | pauc 202 | Una 203 | Qu'ei 204 | (n. 205 | Istòria. 206 | Republica 207 | sistèma 208 | conegut 209 | Nauta 210 | nombre 211 | l'estat 212 | annadas 213 | far 214 | grand 215 | tròba 216 | mòrt 217 | pòdon 218 | qu'èra 219 | politic 220 | Guèrra 221 | de. 
222 | causa 223 | abitants 224 | ei 225 | qualques 226 | deis 227 | sieu 228 | situat 229 | qu'a 230 | cada 231 | mar 232 | aprèp 233 | Al 234 | Lei 235 | centre 236 | el 237 | vèrs 238 | mens 239 | Nobel 240 | venguèt 241 | tota 242 | sul 243 | èran 244 | puèi 245 | jos 246 | II 247 | primièra 248 | e, 249 | territòri 250 | vida 251 | totala 252 | E 253 | junh 254 | francesa, 255 | Units 256 | sonque 257 | ambé 258 | aviá 259 | Tolosa 260 | totes 261 | ministre 262 | Geografia. 263 | biais 264 | meteis 265 | musica 266 | (o 267 | Dordonha 268 | enter 269 | d'ua 270 | faguèt 271 | the 272 | plaça 273 | l'entorn 274 | quatre 275 | occitana 276 | Perigòrd, 277 | començament 278 | març 279 | km 280 | guèrra 281 | lors 282 | aqueste 283 | Aquela 284 | alara 285 | cultura 286 | occitan 287 | totas 288 | sons 289 | solament 290 | doncas 291 | Gironda 292 | òbra 293 | païses 294 | important 295 | Buenos 296 | fach 297 | Aquesta 298 | julhet 299 | cas 300 | d'autres 301 | Après 302 | ara 303 | Pas 304 | luòc 305 | politica 306 | donc 307 | ja 308 | ven 309 | dire 310 | comunas 311 | País 312 | estimada 313 | (var. 314 | riu 315 | aquò 316 | Recebèt 317 | bèth 318 | punt 319 | damb 320 | of 321 | gaireben 322 | dinastia 323 | (lo 324 | autres 325 | va 326 | canton 327 | par 328 | drech 329 | filh 330 | y 331 | tèrme 332 | km². 333 | òbras 334 | principala 335 | Calais 336 | segonda 337 | regions 338 | Nòrd-Pas 339 | aver 340 | Calais. 
341 | milions 342 | mot 343 | Província 344 | Sud 345 | generalament 346 | movement 347 | decembre 348 | basa 349 | cinc 350 | comte 351 | mond 352 | ben 353 | primièrs 354 | Nòrd 355 | Vinhana 356 | m 357 | d'Occitània 358 | exemple 359 | (nom 360 | Santa 361 | Pirenèus 362 | Mas 363 | Loís 364 | Pasmens, 365 | basco 366 | París 367 | genièr 368 | ath 369 | vilatge 370 | "La 371 | Aquel 372 | glèisa 373 | escrivan 374 | long 375 | membres 376 | tant 377 | d'agost 378 | perque 379 | estats 380 | mentre 381 | sei 382 | l'istòria 383 | l'Empèri 384 | importanta 385 | grèc 386 | dus 387 | est 388 | latin 389 | l'èst 390 | Carcin 391 | setembre 392 | espècias 393 | mai, 394 | tornar 395 | Au 396 | Lista 397 | França. 398 | l'oèst 399 | coneguda 400 | subretot 401 | Coma 402 | hèra 403 | desvolopament 404 | d'Òlt 405 | nomenat 406 | prèp 407 | per'mor 408 | Aqueste 409 | novembre 410 | jorn 411 | ans, 412 | d'aiga 413 | comencèt 414 | Era 415 | (e 416 | exemple, 417 | d'autras 418 | permet 419 | Pendent 420 | di 421 | considerat 422 | Le 423 | Eth 424 | produccion 425 | San 426 | majoritat 427 | uèi 428 | Segon 429 | mès 430 | s'i 431 | grands 432 | and 433 | ans. 434 | d'aquesta 435 | França, 436 | famós 437 | Lanas 438 | títol 439 | fan 440 | domeni 441 | da 442 | Amb 443 | segond 444 | nivèl 445 | Economia 446 | Quand 447 | Durant 448 | atau 449 | vilas 450 | (Occitània), 451 | personas 452 | Nòva 453 | lista 454 | preséncia 455 | puish 456 | dinc 457 | d'origina 458 | costat 459 | massa 460 | (de 461 | tipe 462 | aguèt 463 | cors 464 | Dempuèi 465 | paire 466 | soas 467 | parçan 468 | d'aquela 469 | elegit 470 | aus 471 | linha 472 | seriá 473 | er 474 | totjorn 475 | tèrras 476 | membre 477 | Economia. 
478 | carrièra 479 | peus 480 | Tèrra 481 | comarca 482 | in 483 | l'illa 484 | purmèr 485 | Glèisa 486 | III 487 | quartièr 488 | sol 489 | nascut 490 | José 491 | du 492 | monde 493 | autre 494 | mejana 495 | illas 496 | gran 497 | pòble 498 | tèrra 499 | ancian 500 | -------------------------------------------------------------------------------- /justext/stoplists/Portuguese.txt: -------------------------------------------------------------------------------- 1 | de 2 | e 3 | a 4 | o 5 | do 6 | da 7 | em 8 | que 9 | uma 10 | um 11 | é 12 | com 13 | no 14 | para 15 | na 16 | por 17 | os 18 | foi 19 | como 20 | dos 21 | O 22 | A 23 | se 24 | as 25 | ao 26 | sua 27 | mais 28 | das 29 | seu 30 | à 31 | não 32 | Em 33 | ou 34 | pela 35 | pelo 36 | ser 37 | também 38 | são 39 | entre 40 | era 41 | tem 42 | mas 43 | seus 44 | nos 45 | cidade 46 | até 47 | Os 48 | onde 49 | No 50 | área 51 | ele 52 | São 53 | foram 54 | anos 55 | sobre 56 | nas 57 | quando 58 | população 59 | parte 60 | região 61 | sendo 62 | suas 63 | primeiro 64 | aos 65 | grande 66 | estado 67 | mesmo 68 | nome 69 | Foi 70 | É 71 | muito 72 | segundo 73 | família 74 | dois 75 | ainda 76 | já 77 | está 78 | durante 79 | primeira 80 | As 81 | maior 82 | pode 83 | Rio 84 | ano 85 | outros 86 | apenas 87 | km² 88 | Na 89 | ter 90 | forma 91 | após 92 | pelos 93 | qual 94 | depois 95 | dia 96 | século 97 | três 98 | município 99 | duas 100 | km², 101 | banda 102 | num 103 | De 104 | todos 105 | sem 106 | densidade 107 | contra 108 | às 109 | ela 110 | álbum 111 | desde 112 | sido 113 | então 114 | vez 115 | Ele 116 | tendo 117 | acordo 118 | comuna 119 | grupo 120 | localizada 121 | partir 122 | quais 123 | tinha 124 | cerca 125 | este 126 | alguns 127 | espécie 128 | teve 129 | cobertos 130 | outras 131 | habitantes. 132 | cada 133 | Estados 134 | hab/km². 
135 | e, 136 | período 137 | através 138 | conhecido 139 | bem 140 | Com 141 | of 142 | Este 143 | tempo 144 | sistema 145 | Brasil 146 | assim 147 | além 148 | vários 149 | Segundo 150 | eram 151 | esta 152 | série 153 | final 154 | filme 155 | música 156 | Um 157 | José 158 | vida 159 | habitantes, 160 | Estende-se 161 | antes 162 | história 163 | estava 164 | pertencente 165 | podem 166 | fez 167 | departamento 168 | possui 169 | sob 170 | km 171 | João 172 | província 173 | novo 174 | americano 175 | principal 176 | Sua 177 | início 178 | numa 179 | só 180 | muitos 181 | estão 182 | devido 183 | Santa 184 | pessoas 185 | censo 186 | número 187 | distrito 188 | lançado 189 | administrativa 190 | Por 191 | dias 192 | há 193 | governo 194 | eles 195 | todo 196 | passou 197 | quatro 198 | terra 199 | Após 200 | The 201 | várias 202 | vezes 203 | grandes 204 | francesa 205 | algumas 206 | Universidade 207 | Guerra 208 | que, 209 | começou 210 | pois 211 | Uma 212 | the 213 | chamado 214 | enquanto 215 | havia 216 | gênero 217 | seguinte 218 | Para 219 | lugar 220 | Grande 221 | Nova 222 | todas 223 | trabalho 224 | Condado 225 | censos 226 | outro 227 | sempre 228 | nova 229 | média 230 | representa 231 | Estado 232 | década 233 | pelas 234 | Demografia. 235 | D. 236 | fim 237 | Possui 238 | fazer 239 | Paulo 240 | rio 241 | habitantes 242 | qualquer 243 | muitas 244 | Esta 245 | época 246 | nível 247 | anos, 248 | Como 249 | orbital 250 | Maria 251 | jogo 252 | Geografia. 253 | segunda 254 | mundo 255 | Brasil, 256 | toda 257 | meio 258 | Seu 259 | filho 260 | título 261 | programa 262 | lado 263 | Durante 264 | redor 265 | melhor 266 | maioria 267 | História. 268 | pouco 269 | Quando 270 | localizado 271 | conhecida 272 | asteróide 273 | hoje 274 | menos 275 | capital 276 | principais 277 | brasileiro 278 | sede 279 | poder 280 | mil 281 | país 282 | uso 283 | origem 284 | versão 285 | quase 286 | Ao 287 | produção 288 | tipo 289 | mesma 290 | faz 291 | água. 
292 | canção 293 | estimada 294 | presidente 295 | seria 296 | Nacional 297 | volta 298 | carreira 299 | desta 300 | quem 301 | Igreja 302 | centro 303 | longo 304 | importante 305 | localidades 306 | local 307 | junto 308 | United 309 | Também 310 | Brasil. 311 | sul 312 | raio 313 | construção 314 | ficou 315 | livro 316 | Depois 317 | têm 318 | Unidos 319 | morte 320 | janeiro 321 | Carlos 322 | sucesso 323 | aproximadamente 324 | Além 325 | março 326 | norte-americano 327 | clube 328 | esse 329 | maio 330 | dezembro 331 | primeiros 332 | deste 333 | diversos 334 | populacional 335 | cinco 336 | "The 337 | obra 338 | States 339 | diagrama 340 | chamada 341 | desenvolvimento 342 | processo 343 | rei 344 | diversas 345 | lhe 346 | dentro 347 | Campeonato 348 | conta 349 | Bureau 350 | vizinhança. 351 | Ela 352 | Census 353 | principalmente 354 | casa 355 | Localidades 356 | norte 357 | outubro 358 | Jogos 359 | entanto, 360 | isso 361 | julho 362 | base 363 | setembro 364 | relação 365 | empresa 366 | equipe 367 | futebol 368 | tanto 369 | ilha 370 | velocidade 371 | essa 372 | agosto 373 | outra 374 | bairro 375 | abril 376 | junho 377 | ponto 378 | Janeiro 379 | caso 380 | obras 381 | próprio 382 | América 383 | Copa 384 | novembro 385 | and 386 | água 387 | embora 388 | recebeu 389 | maiores 390 | TV 391 | fora 392 | estilo 393 | Pedro 394 | tornou-se 395 | I 396 | milhões 397 | (em 398 | seja 399 | considerado 400 | países 401 | membros 402 | papel 403 | Apesar 404 | linha 405 | último 406 | pai 407 | Francisco 408 | guerra 409 | deve 410 | anos. 
411 | John 412 | República 413 | acima 414 | Entre 415 | língua 416 | tarde 417 | fevereiro 418 | logo 419 | termo 420 | jogos 421 | Paulo, 422 | nacional 423 | conjunto 424 | personagem 425 | chegou 426 | incluindo 427 | "O 428 | corpo 429 | somente 430 | II 431 | Reino 432 | diferentes 433 | usado 434 | política 435 | metros 436 | político 437 | posição 438 | movimento 439 | Não 440 | criado 441 | Mas 442 | criação 443 | oficial 444 | único 445 | porque 446 | quanto 447 | projeto 448 | atual 449 | feito 450 | total 451 | Portugal 452 | género 453 | "A 454 | tornou 455 | áreas 456 | formação 457 | sexo 458 | comunidade 459 | Porto 460 | NGC 461 | edição 462 | cidades 463 | geralmente 464 | Sul 465 | nunca 466 | deu 467 | exemplo, 468 | aumento 469 | seis 470 | temporada 471 | fato 472 | m 473 | descoberto 474 | tal 475 | condado 476 | bastante 477 | fica 478 | membro 479 | Janeiro, 480 | atualmente 481 | única 482 | ano, 483 | importantes 484 | grupos 485 | antiga 486 | fronteira 487 | disso, 488 | tais 489 | antigo 490 | escola 491 | -------------------------------------------------------------------------------- /justext/stoplists/English.txt: -------------------------------------------------------------------------------- 1 | the 2 | of 3 | and 4 | in 5 | to 6 | a 7 | was 8 | is 9 | The 10 | for 11 | as 12 | on 13 | with 14 | by 15 | that 16 | from 17 | at 18 | his 19 | an 20 | he 21 | In 22 | are 23 | were 24 | which 25 | be 26 | has 27 | He 28 | it 29 | or 30 | also 31 | had 32 | first 33 | It 34 | their 35 | not 36 | but 37 | have 38 | who 39 | its 40 | one 41 | this 42 | been 43 | her 44 | two 45 | they 46 | other 47 | into 48 | after 49 | all 50 | when 51 | more 52 | This 53 | only 54 | would 55 | A 56 | she 57 | New 58 | most 59 | can 60 | over 61 | during 62 | where 63 | new 64 | used 65 | such 66 | up 67 | between 68 | many 69 | made 70 | some 71 | than 72 | out 73 | United 74 | known 75 | about 76 | time 77 | then 78 | became 79 | under 80 | "The 81 | 
being 82 | part 83 | there 84 | him 85 | years 86 | three 87 | through 88 | On 89 | including 90 | later 91 | will 92 | American 93 | both 94 | After 95 | until 96 | before 97 | She 98 | well 99 | no 100 | against 101 | while 102 | called 103 | second 104 | As 105 | several 106 | University 107 | number 108 | name 109 | these 110 | played 111 | early 112 | may 113 | They 114 | World 115 | His 116 | located 117 | National 118 | same 119 | them 120 | released 121 | There 122 | de 123 | area 124 | use 125 | work 126 | any 127 | school 128 | since 129 | team 130 | age 131 | so 132 | John 133 | won 134 | people 135 | began 136 | each 137 | year 138 | population 139 | now 140 | family 141 | film 142 | found 143 | city 144 | British 145 | four 146 | album 147 | could 148 | very 149 | However, 150 | South 151 | named 152 | At 153 | around 154 | took 155 | former 156 | because 157 | series 158 | For 159 | States 160 | did 161 | within 162 | state 163 | end 164 | based 165 | May 166 | I 167 | local 168 | held 169 | September 170 | still 171 | often 172 | those 173 | member 174 | small 175 | town 176 | along 177 | back 178 | School 179 | large 180 | January 181 | June 182 | group 183 | served 184 | March 185 | high 186 | own 187 | During 188 | North 189 | July 190 | October 191 | if 192 | like 193 | following 194 | built 195 | August 196 | April 197 | music 198 | born 199 | village 200 | game 201 | due 202 | last 203 | place 204 | home 205 | State 206 | left 207 | major 208 | set 209 | include 210 | U.S. 
211 | much 212 | December 213 | November 214 | received 215 | When 216 | York 217 | main 218 | War 219 | public 220 | band 221 | (born 222 | season 223 | published 224 | even 225 | different 226 | original 227 | members 228 | station 229 | single 230 | government 231 | another 232 | near 233 | what 234 | died 235 | moved 236 | become 237 | just 238 | February 239 | company 240 | included 241 | song 242 | came 243 | led 244 | late 245 | form 246 | national 247 | make 248 | went 249 | These 250 | off 251 | show 252 | French 253 | five 254 | system 255 | few 256 | various 257 | given 258 | best 259 | English 260 | City 261 | long 262 | third 263 | among 264 | every 265 | West 266 | German 267 | using 268 | do 269 | said 270 | started 271 | currently 272 | having 273 | down 274 | next 275 | order 276 | One 277 | final 278 | take 279 | species 280 | established 281 | created 282 | life 283 | play 284 | line 285 | building 286 | History. 287 | political 288 | without 289 | support 290 | written 291 | district 292 | per 293 | produced 294 | High 295 | popular 296 | League 297 | service 298 | football 299 | considered 300 | St. 301 | way 302 | returned 303 | International 304 | book 305 | again 306 | although 307 | important 308 | living 309 | role 310 | River 311 | students 312 | married 313 | son 314 | top 315 | worked 316 | San 317 | continued 318 | however, 319 | founded 320 | joined 321 | appeared 322 | total 323 | power 324 | By 325 | record 326 | College 327 | side 328 | William 329 | title 330 | death 331 | County, 332 | years. 
333 | career 334 | never 335 | From 336 | north 337 | club 338 | County 339 | military 340 | version 341 | European 342 | According 343 | old 344 | While 345 | six 346 | day 347 | average 348 | television 349 | similar 350 | world 351 | general 352 | million 353 | With 354 | Some 355 | water 356 | formed 357 | international 358 | usually 359 | current 360 | though 361 | south 362 | General 363 | time, 364 | community 365 | East 366 | House 367 | land 368 | Although 369 | George 370 | making 371 | player 372 | playing 373 | President 374 | development 375 | James 376 | developed 377 | common 378 | should 379 | great 380 | century 381 | does 382 | further 383 | run 384 | working 385 | largest 386 | recorded 387 | All 388 | lost 389 | must 390 | elected 391 | history 392 | seen 393 | live 394 | opened 395 | short 396 | taken 397 | once 398 | professional 399 | production 400 | point 401 | head 402 | children 403 | throughout 404 | games 405 | period 406 | himself 407 | originally 408 | term 409 | control 410 | available 411 | less 412 | King 413 | Its 414 | modern 415 | position 416 | eventually 417 | across 418 | Royal 419 | works 420 | site 421 | young 422 | wrote 423 | income 424 | house 425 | David 426 | Other 427 | Since 428 | how 429 | able 430 | full 431 | war 432 | Australian 433 | Air 434 | London 435 | Army 436 | includes 437 | law 438 | designed 439 | sold 440 | featured 441 | appointed 442 | business 443 | An 444 | get 445 | either 446 | Japanese 447 | program 448 | US 449 | Canadian 450 | father 451 | lead 452 | approximately 453 | performed 454 | leading 455 | radio 456 | upon 457 | remained 458 | famous 459 | Indian 460 | we 461 | Robert 462 | First 463 | help 464 | west 465 | gave 466 | announced 467 | men 468 | result 469 | times 470 | field 471 | you 472 | right 473 | east 474 | almost 475 | country 476 | story 477 | Church 478 | followed 479 | good 480 | days 481 | signed 482 | features 483 | together 484 | described 485 | research 486 | sent 487 | 
open 488 | special 489 | close 490 | see 491 | To 492 | character 493 | social 494 | miles 495 | rather 496 | life. 497 | Council 498 | Western 499 | (the 500 | party 501 | official 502 | years, 503 | church 504 | -------------------------------------------------------------------------------- /justext/stoplists/Sicilian.txt: -------------------------------------------------------------------------------- 1 | di 2 | e 3 | lu 4 | a 5 | la 6 | è 7 | li 8 | ca 9 | dâ 10 | dû 11 | na 12 | nu 13 | si 14 | pi 15 | Lu 16 | cu 17 | La 18 | dî 19 | ntô 20 | chi 21 | o 22 | nta 23 | comu 24 | sò 25 | fu 26 | cchiù 27 | 'n 28 | ô 29 | nun 30 | ntâ 31 | un 32 | sunnu 33 | n 34 | Li 35 | â 36 | cumuni 37 | supra 38 | pruvincia 39 | ma 40 | parti 41 | era 42 | puru 43 | ci 44 | macari 45 | veni 46 | unni 47 | tra 48 | Ntô 49 | tutti 50 | A 51 | assai 52 | granni 53 | havi 54 | u 55 | lingua 56 | dui 57 | San 58 | prima 59 | statu 60 | nomu 61 | cità 62 | i 63 | in 64 | dô 65 | da 66 | ntê 67 | nti 68 | circa 69 | abbitanti. 
70 | quannu 71 | Sicilia 72 | cui 73 | pupulazzioni 74 | dê 75 | doppu 76 | abbitanti 77 | stu 78 | èssiri 79 | sulu 80 | pò 81 | paisi 82 | unu 83 | sta 84 | vinni 85 | àutri 86 | de 87 | primu 88 | ppi 89 | du 90 | ê 91 | pâ 92 | tanti 93 | eni 94 | Si 95 | mpurtanti 96 | Havi 97 | sô 98 | sèculu 99 | nzinu 100 | s'attrova 101 | (o 102 | chiddu 103 | fari 104 | foru 105 | câ 106 | chidda 107 | anni 108 | Nta 109 | E' 110 | fattu 111 | ntra 112 | citati 113 | so 114 | fici 115 | mentri 116 | una 117 | Fu 118 | sutta 119 | quali 120 | tuttu 121 | ‘n 122 | sicilianu 123 | tri 124 | diversi 125 | nni 126 | cû 127 | gruppu 128 | capitali 129 | ogni 130 | fini 131 | Pi 132 | iddu 133 | avìa 134 | stissu 135 | ancora 136 | sempri 137 | tempu 138 | forma 139 | chiddi 140 | usatu 141 | hannu 142 | famigghia 143 | the 144 | Di 145 | picca 146 | cci 147 | primi 148 | senza 149 | dialettu 150 | tutta 151 | È 152 | versu 153 | fa 154 | sti 155 | vota 156 | centru 157 | duranti 158 | quarchi 159 | speci 160 | Sicilia. 161 | Santa 162 | vita 163 | spissu 164 | storia 165 | palora 166 | Stati 167 | appoi 168 | riggiuni 169 | I 170 | accussì 171 | ri 172 | of 173 | ponnu 174 | quasi 175 | Santu 176 | secunnu 177 | tirritoriu 178 | principali 179 | Na 180 | latinu 181 | contra 182 | guerra 183 | misi 184 | ccu 185 | Doppu 186 | sistema 187 | voti 188 | ti 189 | II 190 | siddu 191 | particulari 192 | tèrmini 193 | munnu 194 | puntu 195 | siciliana 196 | mari 197 | sud 198 | l'anni 199 | re 200 | Sicilia, 201 | pirsuni 202 | chiù 203 | videmma 204 | non 205 | ni 206 | diri 207 | canusciutu 208 | picchì 209 | aviri 210 | ciumi 211 | appi 212 | Palermu 213 | già 214 | zona 215 | Nu 216 | Ntâ 217 | Stu 218 | chiamatu 219 | sunu 220 | su 221 | pî 222 | morti 223 | modu 224 | arcuni 225 | Storia. 
226 | e, 227 | tipu 228 | Quannu 229 | spagnolu 230 | giugnu 231 | finu 232 | Regnu 233 | manu 234 | cuvernu 235 | usata 236 | L' 237 | vicinu 238 | patri 239 | 'nta 240 | Catania 241 | tantu 242 | liveddu 243 | pû 244 | Ma 245 | mùsica 246 | nord 247 | chistu 248 | cchiu 249 | pirsuna 250 | vari 251 | basi 252 | and 253 | francisi 254 | secunna 255 | siciliani 256 | Cu 257 | jinnaru 258 | travagghiu 259 | a.C. 260 | corpu 261 | Giuvanni 262 | 'N 263 | situatu 264 | dici 265 | Cresia 266 | èranu 267 | pirìudu 268 | nascìu 269 | mmeci 270 | nnâ 271 | certi 272 | essiri 273 | Sta 274 | fannu 275 | chî 276 | voli 277 | lingui 278 | annu 279 | sù 280 | capu 281 | grecu 282 | cosi 283 | giugnettu 284 | U 285 | chiamata 286 | avissi 287 | megghiu 288 | stissa 289 | quattru 290 | maiu 291 | prisenti 292 | cosa 293 | cumunità 294 | successu 295 | te 296 | quantu 297 | poi 298 | figghiu 299 | culuri 300 | putiri 301 | canzuna 302 | talianu 303 | l'àutri 304 | marzu 305 | stati 306 | metri 307 | Tra 308 | ca, 309 | sittèmmiru 310 | liggi 311 | d'un 312 | prisidenti 313 | campu 314 | fatta 315 | pirchì 316 | Uniti 317 | munnu. 318 | cèlibbri 319 | forti 320 | fatti 321 | terra 322 | aprili 323 | sia 324 | novu 325 | austu 326 | frivaru 327 | del 328 | zoè 329 | Mari 330 | regnu 331 | soi 332 | l'ìsula 333 | menzu 334 | chilòmitri 335 | Roma 336 | l'Italia 337 | autònuma 338 | autri 339 | gèniri 340 | dicèmmiru 341 | tradizzioni 342 | to 343 | Maria 344 | Comu 345 | ufficiali 346 | iddi 347 | casi 348 | addivintau 349 | mpiraturi 350 | veru 351 | casu 352 | postu 353 | funzioni 354 | dei 355 | c'è 356 | no 357 | porta 358 | canusciuta 359 | menu 360 | pri 361 | manera 362 | pupulari 363 | ha 364 | nnô 365 | pruduzzioni 366 | ecc. 367 | nna 368 | mai 369 | jornu 370 | miliuna 371 | ginirali 372 | cultura 373 | Ci 374 | chiama 375 | finòminu 376 | nùmmuru 377 | parrata 378 | tali 379 | apparteni 380 | siciliana. 
381 | hà 382 | costa 383 | Francia 384 | genti 385 | libbru 386 | nuvèmmiru 387 | attraversu 388 | fora 389 | va 390 | Papa 391 | parra 392 | Castigghia 393 | 'a 394 | frati 395 | forsi 396 | sviluppu 397 | novi 398 | standard 399 | N 400 | l' 401 | nova 402 | cô 403 | (ca 404 | via 405 | vennu 406 | sèculu. 407 | nzemi 408 | nò 409 | idda 410 | canzuni 411 | matri 412 | signìfica 413 | sicilianu. 414 | pòpulu 415 | ssiri 416 | Catania. 417 | propiu 418 | palori 419 | usari 420 | pigghia 421 | grazzi 422 | pari 423 | cantanti 424 | Italia 425 | Carlu 426 | se 427 | uttùviru 428 | "La 429 | dintra 430 | sìmmulu 431 | fussi 432 | locu 433 | militari 434 | festa 435 | musicali 436 | supirfici 437 | Palermu, 438 | nazziunali 439 | (dû 440 | Palermu. 441 | bannera 442 | scola 443 | casa 444 | jè 445 | scrittu 446 | parola 447 | studiu 448 | oggi 449 | suprattuttu 450 | cristiani 451 | Sicilianu 452 | chissu 453 | ddu 454 | ngrisi 455 | pô 456 | causa 457 | (zoè 458 | ora 459 | pulìtica 460 | l'èbbica 461 | Catania, 462 | III 463 | partitu 464 | prisenza 465 | scritturi 466 | nzèmmula 467 | putissi 468 | León. 469 | gruppi 470 | pulìticu 471 | do 472 | oi 473 | zoni 474 | chiamati 475 | l'usu 476 | rapprisenta 477 | Ruggeru 478 | Duranti 479 | poti 480 | nomi 481 | Giuseppi 482 | Nun 483 | l'annu 484 | certu 485 | S. 486 | cincu 487 | Soria 488 | Dialettu. 489 | muvimentu 490 | mari. 491 | Re 492 | avìanu 493 | struttura 494 | sucitati 495 | nicu 496 | cumponi 497 | scrìviri 498 | Riggiu 499 | chisti 500 | pusizzioni 501 | murìu 502 | Usu 503 | matiriali 504 | bonu 505 | stili 506 | Pruvincia 507 | tiurìa 508 | Partitu 509 | E 510 | km 511 | duna 512 | rispettu 513 | avi 514 | Missina. 
515 | -------------------------------------------------------------------------------- /justext/stoplists/Albanian.txt: -------------------------------------------------------------------------------- 1 | të 2 | e 3 | në 4 | dhe 5 | i 6 | me 7 | një 8 | nga 9 | për 10 | që 11 | është 12 | më 13 | te 14 | si 15 | u 16 | së 17 | edhe 18 | se 19 | ka 20 | Në 21 | ne 22 | nuk 23 | ishte 24 | janë 25 | tij 26 | shumë 27 | duke 28 | do 29 | vitin 30 | por 31 | ai 32 | këtë 33 | prej 34 | ku 35 | mund 36 | deri 37 | pas 38 | kanë 39 | disa 40 | kishte 41 | ose 42 | tyre 43 | parë 44 | dy 45 | vitit 46 | mbi 47 | cili 48 | qe 49 | gjatë 50 | kur 51 | ta 52 | vetëm 53 | saj 54 | atë 55 | ishin 56 | pjesë 57 | nje 58 | apo 59 | Ai 60 | qenë 61 | rreth 62 | ajo 63 | cila 64 | para 65 | cilat 66 | pa 67 | gjithë 68 | tjera 69 | Kjo 70 | Për 71 | kjo 72 | Me 73 | po 74 | per 75 | gjitha 76 | kësaj 77 | këto 78 | sipas 79 | Më 80 | madhe 81 | njohur 82 | ata 83 | ndërsa 84 | ato 85 | I 86 | sa 87 | etj. 88 | Pas 89 | duhet 90 | jo 91 | kundër 92 | kohë 93 | ky 94 | Një 95 | nën 96 | the 97 | herë 98 | madh 99 | këtij 100 | ndryshme 101 | tek 102 | çdo 103 | bërë 104 | kishin 105 | shqiptare 106 | Nga 107 | tij, 108 | Ky 109 | tjetër 110 | jetë 111 | eshte 112 | fundit 113 | mes 114 | cilët 115 | mirë 116 | pak 117 | tij. 118 | Ne 119 | mënyrë 120 | gjithashtu 121 | pasur 122 | ashtu 123 | sepse 124 | viteve 125 | of 126 | viti 127 | pasi 128 | Gjatë 129 | marrë 130 | Është 131 | ndër 132 | Të 133 | ndaj 134 | vend 135 | shkak 136 | lindi 137 | vonë 138 | E 139 | fund 140 | Si 141 | emrin 142 | Ka 143 | vitet 144 | pastaj 145 | mos 146 | kurse 147 | bë 148 | Ajo 149 | lartë 150 | mori 151 | Sipas 152 | cilën 153 | iu 154 | Por 155 | pse 156 | midis 157 | filloi 158 | as 159 | kështu 160 | sot 161 | këtyre 162 | tyre. 163 | dytë 164 | pjesën 165 | vjet 166 | siç 167 | kryesisht 168 | shume 169 | shqiptar 170 | vetë 171 | mëdha 172 | a 173 | bëhet 174 | tre 175 | shek. 
176 | fillim 177 | ia 178 | na 179 | t’i 180 | cilin 181 | fshat 182 | Kosovës 183 | quajtur 184 | nëpër 185 | thotë 186 | gjuhën 187 | brenda 188 | tjerë 189 | Shqipërisë 190 | qytetit 191 | re 192 | Edhe 193 | vet 194 | kryesore 195 | vendin 196 | Kur 197 | luftës 198 | gjendet 199 | dhënë 200 | shekullit 201 | sidomos 202 | mjaft 203 | pranë 204 | t'i 205 | popullsisë 206 | kohës 207 | ju 208 | vendosur 209 | lindur 210 | ti 211 | politike 212 | bashku 213 | ndonjë 214 | rëndësishme 215 | Bashkuara 216 | krijuar 217 | and 218 | aq 219 | in 220 | Këto 221 | tyre, 222 | ri 223 | kohën 224 | saj. 225 | ana 226 | tё 227 | ketë 228 | asaj 229 | vogël 230 | lidhje 231 | ndodhet 232 | bazë 233 | shpesh 234 | dha 235 | tani 236 | Po 237 | to 238 | de 239 | Shtetet 240 | ditë 241 | Duke 242 | gjë 243 | numër 244 | jetën 245 | qytetin 246 | jashtë 247 | përdorur 248 | shtetin 249 | arriti 250 | shtetit 251 | katër 252 | bëri 253 | shqiptare. 254 | tepër 255 | kemi 256 | fillon 257 | km 258 | Ata 259 | formë 260 | merr 261 | vit 262 | shqiptarë 263 | quhet 264 | vendbanim 265 | ende 266 | qendër 267 | lidhur 268 | shkruar 269 | jane 270 | saj, 271 | kohe 272 | Kosovë. 273 | zakonisht 274 | jetës 275 | Shqipëri 276 | drejt 277 | vjetër 278 | filluar 279 | Shën 280 | Që 281 | larta 282 | nëse 283 | aty 284 | përfshirë 285 | atyre 286 | komunën 287 | shqipe 288 | vdiq 289 | nëpërmjet 290 | qyteti 291 | forcat 292 | bën 293 | fjalë 294 | përdoret 295 | atëherë 296 | fshatin 297 | qëllim 298 | zhvillimin 299 | arritur 300 | qytet 301 | shtrihet 302 | tërë 303 | atij 304 | tri 305 | përmes 306 | Kështu 307 | reja 308 | asnjë 309 | bënë 310 | anë 311 | vendit 312 | U 313 | pjesa 314 | mars 315 | shqiptare, 316 | tillë 317 | The 318 | cilit 319 | vende 320 | duket 321 | ushtarake 322 | vjen 323 | Disa 324 | bëjnë 325 | Amerikës. 
326 | ndërtuar 327 | Ishte 328 | vërtetë 329 | shqiptarëve 330 | vite 331 | federal 332 | tilla 333 | prill 334 | familje 335 | shkurt 336 | veçantë 337 | gjuhës 338 | numri 339 | kryesor 340 | rast 341 | vendet 342 | pati 343 | maj 344 | janar 345 | jep 346 | pare 347 | kreu 348 | shumta 349 | Luftës 350 | shtator 351 | tregon 352 | botuar 353 | shteteve 354 | merret 355 | tjera. 356 | njihet 357 | shekullin 358 | afër 359 | fitoi 360 | njerëzit 361 | gjendje 362 | Kosovë 363 | këtu 364 | Nuk 365 | pra 366 | fshatit 367 | studimet 368 | qershor 369 | del 370 | dhjetor 371 | krye 372 | Ali 373 | kane 374 | kete 375 | ndërmjet 376 | cilës 377 | rëndësishëm 378 | tetor 379 | kombëtare 380 | zhvilluar 381 | anëtar 382 | fushën 383 | historinë 384 | lloj 385 | plotë 386 | pari 387 | anën 388 | grup 389 | gjuhë 390 | veri 391 | drejtim 392 | nëntor 393 | shqiptar. 394 | (në 395 | lumit 396 | shkollën 397 | Dhe 398 | fituar 399 | popullit 400 | luftë 401 | Kosovës. 402 | shqiptarët 403 | Ndërsa 404 | jug 405 | bashkë 406 | ekonomike 407 | shtetet 408 | mesme 409 | emër 410 | Shqipërisë. 411 | arsye 412 | milion 413 | II 414 | përgjithshme 415 | Pasi 416 | mundur 417 | gjetur 418 | gusht 419 | forcave 420 | Universitetin 421 | gjerë 422 | mbetur 423 | luftën 424 | moshën 425 | Lidhjes 426 | kulturore 427 | mbajtur 428 | korrik 429 | emri 430 | Gjithashtu 431 | punuar 432 | jenë 433 | fshati 434 | Perandorisë 435 | qoftë 436 | gjenden 437 | rol 438 | këta 439 | Countet 440 | zhvillimit 441 | t’u 442 | kam 443 | Republikës 444 | punë 445 | vepra 446 | banorë 447 | përsëri 448 | lënë 449 | jetojnë 450 | titullin 451 | larg 452 | m 453 | familjes 454 | vete 455 | botën 456 | County 457 | radhitur 458 | vendos 459 | rolin 460 | vendi 461 | përpara 462 | shtëpi 463 | parë, 464 | shteti 465 | ndryshme. 466 | luajtur 467 | filluan 468 | tjera, 469 | tu 470 | Shqipëria 471 | dorë 472 | cilave 473 | pjese 474 | vogla 475 | Shqipëri. 
476 | historike 477 | nji 478 | rrugën 479 | paraqet 480 | njohura 481 | sipërfaqe 482 | formuar 483 | shquar 484 | njëri 485 | përveç 486 | vazhdon 487 | përbëhet 488 | kryer 489 | la 490 | duhej 491 | madje 492 | sistemit 493 | vazhdoi 494 | popullore 495 | kaluar 496 | politik 497 | pesë 498 | kulturës 499 | sukses 500 | ma 501 | Shqiptare 502 | Prej 503 | fetare 504 | vdekjes 505 | ardhur 506 | film 507 | Komuna 508 | morën 509 | tretë 510 | raste 511 | njëjtën 512 | Kosovës, 513 | vendosi 514 | shpejt 515 | shkencore 516 | arrin 517 | periudhës 518 | tjetër, 519 | punës 520 | t'u 521 | der 522 | tonë 523 | këngë 524 | punën 525 | gjithnjë 526 | drejtë 527 | doli 528 | gati 529 | kalon 530 | -------------------------------------------------------------------------------- /justext/stoplists/Aromanian.txt: -------------------------------------------------------------------------------- 1 | tu 2 | easte 3 | di 4 | un 5 | cãsãbã 6 | (Hoarã, 7 | shi 8 | cãsãbãlu 9 | de 10 | Ispania. 11 | Township, 12 | unã 13 | a 14 | comunã 15 | County), 16 | Kirghistan. 17 | nad 18 | cu 19 | la 20 | tsi 21 | ali 22 | icã 23 | pi 24 | ma 25 | vascã: 26 | (Pennsylvania), 27 | (Texas), 28 | Bad 29 | ti 30 | (North 31 | nu 32 | San 33 | Abruzzo. 34 | (Ohio), 35 | (Michigan), 36 | Ghermãnia. 37 | (Minnesota), 38 | (Iowa), 39 | City, 40 | (Missouri), 41 | Puglia. 42 | u 43 | del 44 | (Cãsãbã, 45 | ca 46 | cã 47 | Navarra. 48 | Portogallia. 49 | in 50 | (Kansas), 51 | Euskadi. 52 | La 53 | (New 54 | (Massachusetts), 55 | Japonia. 56 | (Oregon), 57 | Carolina), 58 | New 59 | va 60 | Tokyo, 61 | Dakota), 62 | Žďár 63 | s’ 64 | ditu 65 | Sázavou), 66 | Rioja. 67 | (Tennessee), 68 | capital 69 | multu 70 | limba 71 | (Kentucky), 72 | Ústí 73 | (Vermont), 74 | Soria 75 | avea 76 | Lake 77 | Basilicata. 78 | ira 79 | nai 80 | stat 81 | West 82 | eara 83 | El 84 | Hampshire), 85 | (Indiana), 86 | Lake, 87 | St. 
88 | ta 89 | Orlicí), 90 | Santa 91 | (Utah), 92 | iu 93 | Tu 94 | lui 95 | Springs, 96 | (Soria). 97 | easti 98 | al 99 | North 100 | cum 101 | anlu 102 | Charter 103 | (Georgia), 104 | (Virginia), 105 | Valle 106 | dupu 107 | Park, 108 | dit 109 | do 110 | (South 111 | City 112 | cama 113 | Rychnov 114 | Horní 115 | cai 116 | pisti 117 | South 118 | da 119 | (Illinois), 120 | Cãsãbadzlj 121 | ea 122 | Kněžnou), 123 | am 124 | Village, 125 | Mount 126 | prefectura. 127 | Beach, 128 | Republica 129 | cari 130 | Vila 131 | Dolní 132 | ljei 133 | atsea 134 | d'Aosta. 135 | mashi 136 | tuti 137 | judetul 138 | Valley, 139 | Dahan-e 140 | Europa. 141 | Heights, 142 | der 143 | Cantabria. 144 | Třebíč), 145 | Creek, 146 | Hills, 147 | reghia 148 | Brod), 149 | East 150 | (Wiscosin), 151 | pãnã 152 | Havlíčkův 153 | iasti 154 | si 155 | sh 156 | Blansko), 157 | Washington 158 | Svitavy), 159 | Jindřichův 160 | mari 161 | Romãnia. 162 | ama 163 | i 164 | Lhota 165 | (Nebraska), 166 | Hill, 167 | Znojmo), 168 | Hradec), 169 | Jičín), 170 | nica 171 | Fort 172 | Chiyoda, 173 | dzãsi 174 | Itabashi, 175 | G. 176 | Chrudim), 177 | České 178 | A. 179 | (Washington), 180 | sãntu 181 | S. 182 | anda 183 | an 184 | Shi 185 | comuna 186 | C. 187 | tuts 188 | M. 189 | P. 190 | în 191 | B. 192 | Pardubice), 193 | N. 194 | Gallia. 195 | o 196 | Brno), 197 | L. 198 | Sar-e 199 | Limba 200 | of 201 | Pelhřimov), 202 | unů 203 | ghini 204 | D. 205 | Mladá 206 | capital-a 207 | ashi 208 | li 209 | Union 210 | Tábor), 211 | Osona. 212 | Unia 213 | oarã 214 | (Oklahoma), 215 | ãlj 216 | T. 217 | şi 218 | ari 219 | Calabria. 220 | R. 221 | Boleslav), 222 | (Colorado), 223 | F. 224 | Kazahstan. 225 | suntu 226 | Ves 227 | cãndu 228 | and 229 | Khvajeh 230 | Budějovice), 231 | ditů 232 | Jihlava), 233 | Příbram), 234 | Nová 235 | pod 236 | Deh 237 | unu 238 | Kariz-e 239 | Přerov), 240 | Catalunya 241 | Port 242 | Turcmenistan. 
243 | g 244 | comarca 245 | lipseashti 246 | Bala, 247 | Green 248 | W. 249 | Franklin 250 | H. 251 | the 252 | fu 253 | multi 254 | Italia. 255 | Mexico), 256 | Benešov), 257 | (Maryland), 258 | Pleasant 259 | lu 260 | los 261 | Újezd 262 | Di 263 | Grove, 264 | Falls, 265 | europeanã 266 | E. 267 | Baix 268 | vidzu 269 | K. 270 | Ma 271 | Nový 272 | Hradec 273 | că 274 | Deh-e 275 | Khan, 276 | nã 277 | fãrã 278 | cãtã 279 | are 280 | Polania, 281 | apã 282 | O. 283 | Trutnov), 284 | Kolín), 285 | 2ª 286 | Náchod), 287 | Česká 288 | River, 289 | Monroe 290 | White 291 | Sant 292 | din 293 | (Mississippi), 294 | s-u 295 | (Ohio 296 | Jackson 297 | V. 298 | Zlín), 299 | regiunea 300 | Bruntál), 301 | Liberty 302 | (West 303 | tini 304 | Grand 305 | Rakovník), 306 | Nova 307 | el 308 | Catalunya. 309 | Jefferson 310 | Plzeň-South), 311 | putea 312 | cara 313 | São 314 | ună 315 | mata 316 | Plzeň-North), 317 | Aradamata 318 | s 319 | Semily), 320 | vrea 321 | Hora), 322 | le 323 | Kutná 324 | Creek 325 | Pa'in, 326 | Domažlice), 327 | Imeni 328 | (Idaho), 329 | Králové), 330 | Strakonice), 331 | Pohlania, 332 | Saint 333 | cathi 334 | Jablonec 335 | Uherské 336 | Frýdek-Místek), 337 | Marion 338 | anamisa 339 | ahurhi 340 | York), 341 | Perry 342 | mare 343 | hiljlu 344 | Hodonín), 345 | Liberec), 346 | Opava), 347 | lã 348 | mãhãlã), 349 | (California), 350 | Cãsãbãlu 351 | Island, 352 | Madison 353 | Hradiště), 354 | vãrã 355 | este 356 | armãneascã 357 | Murcia. 358 | I. 359 | Spring 360 | Nisou), 361 | sh’ 362 | câ 363 | Břeclav), 364 | mi 365 | 1ª 366 | Klatovy), 367 | Prague-West), 368 | sum 369 | Libia. 370 | tora 371 | shtea 372 | birbiljlu 373 | Darreh, 374 | Olomouc), 375 | nitsi 376 | Kyrgyzstan. 377 | parte 378 | Beroun), 379 | y 380 | Aradamatã 381 | Litoměřice), 382 | River 383 | Hill 384 | tutã 385 | Virginia), 386 | J. 387 | Valley 388 | Prague-East), 389 | Prostějov), 390 | au 391 | Empordà. 392 | Park 393 | Nymburk), 394 | "Nymphaea 395 | Iaponia. 
396 | Ama 397 | tru 398 | Kyrgyzstan 399 | Cedar 400 | Český 401 | Numa 402 | Bay, 403 | regiune 404 | Nu 405 | Center, 406 | Point, 407 | Vyškov), 408 | Los 409 | Vsetín), 410 | Meguro, 411 | Lípa), 412 | De 413 | s-bagã 414 | Río 415 | noi 416 | iara 417 | mãri 418 | Ispania 419 | im 420 | Springfield 421 | Louny), 422 | marea 423 | Salem 424 | Africa 425 | Rock, 426 | anglicheascã 427 | U. 428 | mea 429 | shibã 430 | ocljii 431 | Kladno), 432 | Karlovy 433 | se 434 | bana 435 | Lincoln 436 | Richland 437 | unâ 438 | eali 439 | Las 440 | (Nevada), 441 | apridusã 442 | faptu 443 | e 444 | que 445 | yinitoru 446 | Mare. 447 | Písek), 448 | Grove 449 | makedonã. 450 | Villar 451 | s-hibã 452 | Tachov), 453 | lãngori 454 | lui, 455 | Fitica-pescu 456 | anyrãpsita 457 | Vary), 458 | Šumperk), 459 | chirolu 460 | Velké 461 | Sar 462 | Prachatice), 463 | Mãyiripsearea: 464 | lãngoriloru 465 | (Italia) 466 | Zara 467 | tutů 468 | lumea 469 | tsã 470 | Madrid 471 | Pine 472 | Varad 473 | Staré 474 | en 475 | featsi 476 | Oak 477 | alu 478 | (Florida), 479 | numa 480 | Děčín), 481 | Blue 482 | Vernon 483 | Le 484 | Mělník), 485 | Sierra 486 | (Mãhãlã 487 | Nové 488 | ãshi 489 | Comunidad 490 | Harrison 491 | Britania 492 | Siah 493 | Villanueva 494 | Red 495 | Krumlov), 496 | America 497 | s-ti 498 | Clinton 499 | ml 500 | -------------------------------------------------------------------------------- /justext/stoplists/Welsh.txt: -------------------------------------------------------------------------------- 1 | yn 2 | y 3 | a 4 | o 5 | i 6 | ar 7 | ei 8 | yr 9 | ac 10 | gan 11 | yw 12 | o'r 13 | oedd 14 | i'r 15 | Mae 16 | fel 17 | Yn 18 | CC 19 | am 20 | wedi 21 | sy'n 22 | yng 23 | mewn 24 | un 25 | ym 26 | mae 27 | ganrif 28 | ond 29 | eu 30 | a'r 31 | fod 32 | neu 33 | Roedd 34 | ôl 35 | gyda 36 | rhan 37 | Mae'r 38 | rhwng 39 | roedd 40 | Mae'n 41 | Y 42 | enw 43 | â 44 | cael 45 | yw'r 46 | tua 47 | nifer 48 | hefyd 49 | hyd 50 | cynnwys 51 | bod 52 | sydd 53 | 
Afon 54 | at 55 | hyn 56 | de 57 | er 58 | Cymru 59 | lle 60 | dros 61 | mae'n 62 | a'i 63 | pan 64 | mae'r 65 | ef 66 | rhai 67 | Ar 68 | ardal 69 | erbyn 70 | gyfer 71 | gael 72 | fwyaf 73 | ystod 74 | dan 75 | ger 76 | Ceir 77 | i'w 78 | cyntaf 79 | ddinas 80 | Sir 81 | iddo 82 | Cafodd 83 | wrth 84 | cyn 85 | Yr 86 | eraill 87 | daeth 88 | afon 89 | sef 90 | mwyaf 91 | iaith 92 | ap 93 | Fe'i 94 | gyda'r 95 | of 96 | trwy 97 | hen 98 | the 99 | gogledd 100 | arall 101 | na 102 | John 103 | ceir 104 | cyfnod 105 | brenin 106 | sawl 107 | bu 108 | ag 109 | ran 110 | o'i 111 | Ynys 112 | yma 113 | hanes 114 | dref 115 | llawer 116 | Saif 117 | ffordd 118 | ffilm 119 | â'r 120 | pentref 121 | Ysgol 122 | Bu 123 | iawn 124 | newydd 125 | Daeth 126 | Gymraeg 127 | ydy 128 | flwyddyn 129 | Cymraeg 130 | farw 131 | milltir 132 | boblogaeth 133 | ail 134 | bobl 135 | byw 136 | Unol 137 | fab 138 | mai 139 | megis 140 | ynys 141 | enwog 142 | mwyn 143 | nad 144 | mawr 145 | Er 146 | prif 147 | ysgol 148 | gyntaf 149 | yno 150 | Cymru. 151 | nid 152 | gorllewin 153 | Un 154 | wlad 155 | hynny 156 | Lloegr 157 | ddau 158 | ogystal 159 | safle 160 | mis 161 | gwaith 162 | pob 163 | Ei 164 | ers 165 | yna 166 | oes 167 | oherwydd 168 | Pentref 169 | ganrif. 170 | hefyd. 
171 | hun 172 | cafodd 173 | "The 174 | dwyrain 175 | eglwys 176 | Llyn 177 | gorwedd 178 | gynnwys 179 | hanner 180 | and 181 | Swydd 182 | dim 183 | adnabyddus 184 | ddwy 185 | fe 186 | dod 187 | byd 188 | hi 189 | Saesneg 190 | gair 191 | Cymreig 192 | allan 193 | Owain 194 | Thomas 195 | William 196 | Sant 197 | Eglwys 198 | (ganwyd 199 | Ffrainc 200 | waith 201 | arbennig 202 | (Saesneg: 203 | dau 204 | lan 205 | fwy 206 | Ym 207 | fawr 208 | fewn 209 | unig 210 | Cymru, 211 | Gwlad 212 | llifo 213 | mwy 214 | uchaf 215 | brif 216 | heb 217 | ennill 218 | aelod 219 | ychydig 220 | Erbyn 221 | hwn 222 | Fel 223 | bob 224 | oddi 225 | felly 226 | Deyrnas 227 | dinas 228 | Môr 229 | unrhyw 230 | Dinas 231 | lleolir 232 | arfordir 233 | dilyn 234 | Nghymru 235 | The 236 | Gogledd 237 | gyfnod 238 | Ewrop 239 | bychan 240 | Canol 241 | De 242 | Jones 243 | ni 244 | Nid 245 | perthyn 246 | dwy 247 | dalaith 248 | Byd 249 | diwedd 250 | Pan 251 | ffurf 252 | ochr 253 | Hen 254 | chwarae 255 | km 256 | tua'r 257 | Daleithiau 258 | Ond 259 | I 260 | bardd 261 | gydag 262 | Wedi 263 | 19eg 264 | Eisteddfod 265 | wneud 266 | Rhyfel 267 | fe'i 268 | Iwerddon 269 | drwy 270 | Genedlaethol 271 | Sefydlwyd 272 | Ymerodraeth 273 | dal 274 | olaf 275 | swyddogol 276 | ben 277 | Oesoedd 278 | Mynydd 279 | Edward 280 | ganrif, 281 | Llywelyn 282 | weithiau 283 | pobl 284 | ffurfio 285 | aml 286 | ganddo 287 | Mai 288 | ffin 289 | defnyddio 290 | O 291 | Ionawr 292 | CC. 
293 | Gruffudd 294 | grŵp 295 | ddiweddarach 296 | 1af 297 | tro 298 | ngogledd 299 | heddiw 300 | iddi 301 | Dafydd 302 | hi'n 303 | Castell 304 | aeth 305 | mynd 306 | gyfres 307 | gwneud 308 | Mawrth 309 | Ebrill 310 | Alban 311 | ddiwedd 312 | Prydain 313 | amser 314 | hon 315 | Hydref 316 | bu'n 317 | tu 318 | enwedig 319 | ddefnyddio 320 | fyddin 321 | gymuned 322 | Medi 323 | ddaeth 324 | llywodraeth 325 | tir 326 | bwysig 327 | Mehefin 328 | Awst 329 | dŵr 330 | debyg 331 | Gwynedd 332 | tan 333 | Mewn 334 | enillodd 335 | Ganed 336 | Robert 337 | rhoi 338 | Unedig 339 | brifddinas 340 | Newydd 341 | ieithoedd 342 | enghraifft 343 | Rhagfyr 344 | blynyddoedd 345 | lefel 346 | Gorffennaf 347 | rhannau 348 | ymladd 349 | sefydlwyd 350 | bennaf 351 | elwir 352 | wedi'i 353 | Oes 354 | Cenedlaethol 355 | sefydlu 356 | Gymru 357 | Syr 358 | gweld 359 | eraill. 360 | ymlaen 361 | llai 362 | 6ed 363 | sylweddol 364 | arfer 365 | 18fed 366 | Llundain 367 | tra 368 | teulu 369 | ferch 370 | dyddio 371 | gwmpas 372 | hanesyddol 373 | Bu'n 374 | ne 375 | castell 376 | rhaid 377 | iddynt 378 | miliwn 379 | symud 380 | Enillodd 381 | a'u 382 | term 383 | Lleolir 384 | bron 385 | bentref 386 | chwedl 387 | ymuno 388 | Prifysgol 389 | gyda'i 390 | (neu 391 | llyfr 392 | gwledydd 393 | ardaloedd 394 | Chwefror 395 | Gellir 396 | awdur 397 | system 398 | to 399 | ymerawdwr 400 | talaith 401 | 5ed 402 | Americanaidd 403 | Undeb 404 | gweithio 405 | hytrach 406 | Fawr 407 | oed 408 | ddod 409 | hefyd, 410 | Gweriniaeth 411 | ohonynt 412 | in 413 | diwrnod 414 | 2il 415 | newid 416 | Gall 417 | traddodiad 418 | ddim 419 | prifddinas 420 | ganolfan 421 | Tachwedd 422 | achos 423 | Dechreuodd 424 | rhanbarth 425 | Affrica 426 | poblogaeth 427 | arwain 428 | Fe 429 | dechreuodd 430 | Lloegr. 
431 | Rufeinig 432 | hyn, 433 | agos 434 | eni 435 | fu 436 | gwlad 437 | Ail 438 | hir 439 | chafodd 440 | Ni 441 | sir 442 | cynnar 443 | wahanol 444 | amlwg 445 | 12fed 446 | rhyfel 447 | heddiw. 448 | trwy'r 449 | allanol. 450 | gerdd 451 | môr 452 | nes 453 | adeg 454 | dangos 455 | (ganed 456 | Dyffryn 457 | Rhys 458 | ganddi 459 | Prif 460 | gall 461 | greu 462 | 16eg 463 | ymestyn 464 | Seisnig 465 | Hanes. 466 | mab 467 | golygu 468 | dir 469 | Almaen 470 | lleol 471 | David 472 | dydd 473 | wraig 474 | ab 475 | deyrnas 476 | 13eg 477 | cyrraedd 478 | Harri 479 | rai 480 | mor 481 | nag 482 | gellir 483 | yno. 484 | ynysoedd 485 | awr 486 | James 487 | Ynysoedd 488 | gafodd 489 | ffilmiau 490 | Williams 491 | nofel 492 | "Y 493 | e.e. 494 | honno 495 | draws 496 | holl 497 | byd. 498 | ynghyd 499 | frawd 500 | (yn 501 | hun. 502 | tri 503 | 20fed 504 | Richard 505 | yma, 506 | nghanol 507 | dechrau 508 | wedyn 509 | adeilad 510 | mwyafrif 511 | mlynedd 512 | llyn 513 | lawer 514 | uchel 515 | o’r 516 | dad 517 | 14eg 518 | math 519 | lawr 520 | eto 521 | 15fed 522 | Dywedir 523 | bynnag, 524 | hynny, 525 | arwynebedd 526 | tair 527 | arian 528 | boblogaidd 529 | wreiddiol 530 | rhaglen 531 | gallu 532 | helaeth 533 | rhedeg 534 | San 535 | gerllaw 536 | Nghymru. 537 | addysg 538 | byddin 539 | swydd 540 | siarad 541 | Gorllewin 542 | aelodau 543 | ysgrifennu 544 | 3edd 545 | Ryfel 546 | poblogaidd 547 | Lladin 548 | maent 549 | George 550 | efallai 551 | -------------------------------------------------------------------------------- /justext/stoplists/Kurdish.txt: -------------------------------------------------------------------------------- 1 | û 2 | bi 3 | ji 4 | li 5 | de 6 | di 7 | ku 8 | xwe 9 | jî 10 | ye. 11 | ser 12 | ya 13 | ve 14 | Di 15 | wê 16 | wî 17 | tê 18 | e. 
19 | sala 20 | re 21 | wan 22 | gundekî 23 | bo 24 | Li 25 | ew 26 | a 27 | vê 28 | Ji 29 | ber 30 | navê 31 | yên 32 | ye 33 | hatiye 34 | da 35 | yê 36 | gelek 37 | navçeya 38 | an 39 | aliyê 40 | bû. 41 | lê 42 | (bi 43 | nav 44 | Ew 45 | yek 46 | Bi 47 | e 48 | her 49 | ne 50 | ev 51 | wek 52 | Kurdistanê 53 | bû 54 | girêdayî 55 | grêdayî 56 | mezin 57 | dest 58 | Ev 59 | din 60 | ko 61 | bajarê 62 | dike. 63 | ê 64 | Lê 65 | zimanê 66 | mirov 67 | de, 68 | dewleta 69 | wekî 70 | kurdî 71 | tirkî: 72 | dema 73 | kirin. 74 | pir 75 | hat 76 | herî 77 | heta 78 | ên 79 | pê 80 | bin 81 | vî 82 | me 83 | kurd 84 | ra 85 | van 86 | dibe 87 | kir. 88 | hene. 89 | Kurd 90 | u 91 | navçe 92 | hin 93 | ne. 94 | du 95 | pêk 96 | Amedê 97 | hate 98 | min 99 | dike 100 | gund 101 | Kurdî 102 | in. 103 | hatine 104 | tên 105 | yekê 106 | hev 107 | Piştî 108 | nava 109 | tirk 110 | navbera 111 | bê 112 | weke 113 | sedsala 114 | gundê 115 | demê 116 | salên 117 | wir 118 | hemû 119 | niha 120 | Mêrdînê 121 | jê 122 | tenê 123 | dibe. 124 | yan 125 | e, 126 | valakirin. 127 | herêmê 128 | piştî 129 | jiyana 130 | Tirk 131 | jî, 132 | kirin 133 | kes 134 | berê 135 | kar 136 | em 137 | bûye. 138 | heye. 139 | ye, 140 | çend 141 | kir 142 | Şêx 143 | gor 144 | te 145 | tu 146 | êdî 147 | Her 148 | caran 149 | kiriye. 150 | Kurdan 151 | salan 152 | nû 153 | ava 154 | Wî 155 | dê 156 | çi 157 | herêma 158 | destê 159 | dikin. 160 | serê 161 | dijî 162 | bûye 163 | dikeve 164 | dide 165 | tê. 166 | xwedî 167 | dîrokî 168 | bû, 169 | Sala 170 | Navê 171 | gorî 172 | in 173 | navdar 174 | Xwedê 175 | gelê 176 | karê 177 | we 178 | bûn. 
179 | km 180 | kurdan 181 | mîna 182 | mirovan 183 | cih 184 | dawî 185 | gundekê 186 | hinek 187 | Dema 188 | sê 189 | hatin 190 | zêde 191 | be 192 | gundên 193 | gel 194 | başûr 195 | ez 196 | sal 197 | gotin 198 | dikin 199 | bilind 200 | berî 201 | wiha 202 | dibêjin 203 | baş 204 | kesên 205 | heya 206 | taybetî 207 | ke 208 | Tirkiyê 209 | Kurdistana 210 | hemî 211 | vir 212 | bikaranîn. 213 | dîsa 214 | yekemîn 215 | dikare 216 | fermî 217 | bavê 218 | dûr 219 | ango 220 | îro 221 | zimanên 222 | Çiyayê 223 | gotin. 224 | cihê 225 | roja 226 | bakur 227 | cîhanê 228 | bûn 229 | rojavayê 230 | Îranê 231 | Komara 232 | Navîn 233 | şerê 234 | başûrê 235 | bike. 236 | tirkî 237 | dirêj 238 | anko 239 | nêzîkî 240 | welatê 241 | têne 242 | le 243 | dîroka 244 | pirr 245 | mazin 246 | hene 247 | giştî 248 | avê 249 | heman 250 | Deryaya 251 | Gelek 252 | dibin 253 | belav 254 | Kurdistan 255 | dijîn. 256 | malbata 257 | hem 258 | tevî 259 | jibo 260 | dewletê 261 | bakurê 262 | şer 263 | xebatên 264 | rojava 265 | xwe, 266 | cem 267 | Mihemed 268 | xo 269 | derbas 270 | xwe. 271 | Şirnexê 272 | bajêr 273 | dibin. 274 | Xelata 275 | çar 276 | deme 277 | re, 278 | wateya 279 | roj 280 | nêzî 281 | erdê 282 | welatên 283 | bike 284 | Hz. 285 | Gorî 286 | kurê 287 | girîng 288 | ji, 289 | dikarin 290 | heye 291 | Kurdên 292 | Mûşê 293 | berhemên 294 | Erzîromê 295 | cara 296 | kiriye 297 | hatîye 298 | kêm 299 | der 300 | yekem 301 | de. 302 | çû 303 | diçe 304 | hejmara 305 | ziman 306 | siyasî 307 | Zimanê 308 | hêla 309 | biçûk 310 | Hin 311 | dî 312 | hatibû 313 | dîtin. 314 | warê 315 | kevn 316 | pirtûka 317 | piranî 318 | Agiriyê 319 | zêdetir 320 | bikin 321 | divê 322 | berdewam 323 | Binêre: 324 | were 325 | Û 326 | hêzên 327 | dihat 328 | navenda 329 | Salvegerên 330 | deng 331 | alîyê 332 | mirovên 333 | gelekî 334 | him 335 | rastî 336 | herêmên 337 | pêş 338 | hê 339 | serokê 340 | Wê 341 | bajar 342 | xelkê 343 | piştre 344 | Roj 345 | weşandin. 
346 | Cejn 347 | bajarên 348 | dike, 349 | kurdên 350 | Zanîngeha 351 | salveger. 352 | qasî 353 | ma 354 | riya 355 | the 356 | şewitandin. 357 | rê 358 | dora 359 | demên 360 | da, 361 | Ewropayê 362 | çêkirin. 363 | weha 364 | Stenbolê 365 | bibe 366 | ola 367 | Şerê 368 | serî 369 | alî 370 | demeke 371 | zanîn 372 | gundî 373 | Dîsa 374 | Partiya 375 | Gundê 376 | xwendina 377 | belê 378 | bêje 379 | mala 380 | Rihayê 381 | b.z. 382 | erebî 383 | çanda 384 | m 385 | Wekî 386 | rojê 387 | Heta 388 | kirin, 389 | hîn 390 | bikin. 391 | destpêka 392 | bona 393 | derê 394 | hwd 395 | Culemêrgê 396 | peyva 397 | paşê 398 | endamê 399 | Sêrtê 400 | in, 401 | tev 402 | dilê 403 | nivîskar 404 | hatina 405 | navên 406 | Tê 407 | cuda 408 | Elî 409 | rast 410 | dikir. 411 | ta 412 | beşa 413 | pêşîn 414 | dil 415 | rêya 416 | salê 417 | didin 418 | kirine. 419 | dibistana 420 | rewşa 421 | girtin. 422 | pêşî 423 | girtin 424 | xebata 425 | bandora 426 | pîroz 427 | awayekî 428 | va 429 | dawiya 430 | helbestên 431 | Yek 432 | xweş 433 | çiyayê 434 | wî, 435 | hezar 436 | dû 437 | reş 438 | Licê 439 | Fransayê 440 | salî 441 | km² 442 | welat 443 | Wanê 444 | diyar 445 | kirine 446 | Niha 447 | hizre 448 | zanîngeha 449 | dibêje 450 | navîn 451 | wisa 452 | mirina 453 | bide 454 | Piştre 455 | Ewropa 456 | of 457 | Ez 458 | tevlî 459 | zarokên 460 | herdu 461 | hwd. 462 | rûyê 463 | dide. 464 | peyda 465 | pirranî 466 | demekê 467 | ey 468 | parêzgeha 469 | kî 470 | Nobel 471 | awayî 472 | Efrîna 473 | behsa 474 | taybet 475 | dijî. 476 | pirtûkên 477 | ti 478 | kesan 479 | Dewleta 480 | rojhilatê 481 | kû 482 | nîşan 483 | zî 484 | avakirin. 485 | paytexta 486 | mafê 487 | daye 488 | bilî 489 | civakî 490 | bikar 491 | nas 492 | çemê 493 | gotina 494 | hûn 495 | digire. 496 | hevalên 497 | bike, 498 | naskirin. 499 | jin 500 | koma 501 | dinê. 502 | bûne. 503 | Mele 504 | amade 505 | jiyanê 506 | Em 507 | tiştên 508 | Bakurê 509 | ziman. 
510 | dengê 511 | dinê 512 | Dêrika 513 | wi 514 | Dr. 515 | dihêt 516 | rojan) 517 | derdikeve 518 | kurmancî 519 | kuştin. 520 | (Qalib 521 | da. 522 | hember 523 | dikir 524 | hêdî 525 | destên 526 | Bo 527 | Mezin 528 | kir, 529 | xurt 530 | dîrokê 531 | axa 532 | ku, 533 | dayik 534 | navçeyên 535 | dewlemend 536 | bingeha 537 | nayê 538 | digel 539 | şunda 540 | xwediyê 541 | Koma 542 | kovara 543 | çawa 544 | Mirov 545 | alîkariya 546 | ve, 547 | bûne 548 | rojhilat 549 | kirina 550 | gora 551 | ya. 552 | yî 553 | hema 554 | ba 555 | dibe, 556 | zarokan 557 | azad 558 | Paşê 559 | Almanyayê 560 | Çemê 561 | Mala 562 | Rojhilata 563 | cureyên 564 | Tirkiyeyê 565 | fîzîkê 566 | dem 567 | yew 568 | Hejmara 569 | car 570 | derdora 571 | dikeve. 572 | Du 573 | meha 574 | meriv 575 | pey 576 | Tu 577 | -------------------------------------------------------------------------------- /justext/stoplists/Danish.txt: -------------------------------------------------------------------------------- 1 | i 2 | og 3 | af 4 | en 5 | er 6 | til 7 | at 8 | som 9 | den 10 | på 11 | for 12 | med 13 | var 14 | blev 15 | der 16 | det 17 | de 18 | et 19 | fra 20 | han 21 | har 22 | I 23 | ved 24 | ikke 25 | om 26 | men 27 | sig 28 | også 29 | Han 30 | havde 31 | hvor 32 | Den 33 | Det 34 | eller 35 | kan 36 | efter 37 | sin 38 | under 39 | man 40 | over 41 | første 42 | hans 43 | da 44 | to 45 | De 46 | mod 47 | så 48 | år 49 | fik 50 | deres 51 | være 52 | del 53 | mellem 54 | ligger 55 | andre 56 | kunne 57 | mange 58 | dog 59 | hun 60 | kun 61 | været 62 | flere 63 | En 64 | meget 65 | denne 66 | ud 67 | dansk 68 | alle 69 | blevet 70 | senere 71 | kom 72 | bliver 73 | mere 74 | end 75 | skulle 76 | sit 77 | have 78 | Efter 79 | Der 80 | store 81 | op 82 | sammen 83 | samt 84 | omkring 85 | samme 86 | tidligere 87 | danske 88 | sine 89 | ham 90 | siden 91 | skal 92 | (født 93 | bl.a. 
94 | dette 95 | vil 96 | kendt 97 | mens 98 | blandt 99 | tre 100 | of 101 | mest 102 | stor 103 | På 104 | andet 105 | f.Kr. 106 | få 107 | tilbage 108 | forskellige 109 | dag 110 | både 111 | Hans 112 | selv 113 | gennem 114 | anden 115 | hvilket 116 | findes 117 | ind 118 | nogle 119 | hele 120 | nye 121 | ofte 122 | nu 123 | disse 124 | gik 125 | uden 126 | ville 127 | ca. 128 | række 129 | tyske 130 | Da 131 | blive 132 | gang 133 | hos 134 | hvis 135 | Hun 136 | Danmark 137 | Dette 138 | sidste 139 | indtil 140 | største 141 | først 142 | derfor 143 | januar 144 | før 145 | mindre 146 | dem 147 | århundrede 148 | maj 149 | The 150 | tid 151 | begyndte 152 | Ved 153 | byen 154 | inden 155 | vandt 156 | juni 157 | større 158 | marts 159 | når 160 | medlem 161 | september 162 | oktober 163 | april 164 | the 165 | grund 166 | spillede 167 | tog 168 | november 169 | frem 170 | navn 171 | august 172 | december 173 | juli 174 | Et 175 | fire 176 | Fra 177 | februar 178 | amerikanske 179 | Men 180 | stadig 181 | navnet 182 | helt 183 | Som 184 | by 185 | Christian 186 | døde 187 | noget 188 | lille 189 | gamle 190 | Denne 191 | måtte 192 | bruges 193 | spiller 194 | New 195 | født 196 | forbindelse 197 | gjorde 198 | alt 199 | således 200 | består 201 | udgivet 202 | kommer 203 | hendes 204 | ny 205 | især 206 | igen 207 | amerikansk 208 | desuden 209 | stedet 210 | lå 211 | hver 212 | fordi 213 | gav 214 | allerede 215 | km 216 | spillet 217 | f.eks. 218 | søn 219 | bedste 220 | fleste 221 | "The 222 | går 223 | skrevet 224 | lige 225 | Under 226 | Byen 227 | København 228 | gange 229 | For 230 | stort 231 | endnu 232 | død 233 | dermed 234 | Her 235 | får 236 | ned 237 | forhold 238 | år. 
239 | kort 240 | brugt 241 | form 242 | kaldes 243 | næsten 244 | her 245 | hvad 246 | ingen 247 | må 248 | bygget 249 | lang 250 | von 251 | nogen 252 | Med 253 | eneste 254 | Københavns 255 | Frederik 256 | begge 257 | står 258 | beliggende 259 | Danmarks 260 | John 261 | set 262 | sted 263 | slutningen 264 | franske 265 | derefter 266 | Peter 267 | skrev 268 | stod 269 | dens 270 | ses 271 | and 272 | fandt 273 | Man 274 | nord 275 | dele 276 | Sogn 277 | godt 278 | idet 279 | USA 280 | nuværende 281 | gift 282 | giver 283 | syd 284 | film 285 | aldrig 286 | længere 287 | lidt 288 | førte 289 | arbejde 290 | Danmark. 291 | egen 292 | langt 293 | imidlertid 294 | løbet 295 | blot 296 | samtidig 297 | Disse 298 | kaldet 299 | fem 300 | kendte 301 | britiske 302 | haft 303 | meter 304 | brug 305 | én 306 | side 307 | politiske 308 | ligesom 309 | nåede 310 | fået 311 | begyndelsen 312 | små 313 | hold 314 | taget 315 | tysk 316 | engelske 317 | betyder 318 | området 319 | gøre 320 | Til 321 | om, 322 | gør 323 | deltog 324 | følge 325 | nordlige 326 | videre 327 | gruppe 328 | Kommune 329 | angreb 330 | eksempel 331 | dels 332 | Historie. 
333 | Dansk 334 | indeholder 335 | komme 336 | grundlagt 337 | opført 338 | valgt 339 | sydlige 340 | indbyggere 341 | direkte 342 | vi 343 | holdt 344 | se 345 | moderne 346 | år, 347 | hurtigt 348 | datter 349 | Siden 350 | enkelte 351 | kilometer 352 | tage 353 | til, 354 | tiden 355 | måde 356 | Desuden 357 | finde 358 | hjælp 359 | plads 360 | kampe 361 | stammer 362 | tæt 363 | vest 364 | perioden 365 | periode 366 | Senere 367 | hvert 368 | følgende 369 | via 370 | rolle 371 | udgav 372 | altid 373 | startede 374 | give 375 | antal 376 | bag 377 | fortsatte 378 | mennesker 379 | flyttede 380 | spille 381 | året 382 | gå 383 | det, 384 | Tyskland 385 | medlemmer 386 | nok 387 | trods 388 | millioner 389 | efterfølgende 390 | såkaldte 391 | oprindelige 392 | på, 393 | tidspunkt 394 | indenfor 395 | øst 396 | langs 397 | Hvis 398 | anvendes 399 | lavet 400 | nær 401 | kamp 402 | hende 403 | dage 404 | formand 405 | grad 406 | mål 407 | viser 408 | ret 409 | VM 410 | vestlige 411 | starten 412 | tale 413 | særlig 414 | par 415 | tredje 416 | landet 417 | finder 418 | baseret 419 | leder 420 | igennem 421 | dets 422 | mand 423 | fundet 424 | lykkedes 425 | seks 426 | lande 427 | rundt 428 | Carl 429 | Nogle 430 | d. 431 | ene 432 | arbejdede 433 | fx 434 | ældste 435 | område 436 | album 437 | unge 438 | bestod 439 | landets 440 | vendte 441 | krigen 442 | Frankrig 443 | Mange 444 | svenske 445 | Thomas 446 | hører 447 | bandet 448 | Kirke 449 | valgte 450 | typisk 451 | Når 452 | næste 453 | engelsk 454 | Michael 455 | sat 456 | filmen 457 | oprindeligt 458 | vundet 459 | fast 460 | betydning 461 | udkom 462 | samlet 463 | høj 464 | muligt 465 | yderligere 466 | overtog 467 | gjort 468 | bedre 469 | byens 470 | lokale 471 | præsident 472 | primært 473 | viste 474 | imod 475 | verden 476 | hvoraf 477 | af, 478 | enten 479 | normalt 480 | altså 481 | lave 482 | in 483 | mindst 484 | København. 
485 | politisk 486 | vigtigste 487 | Europa 488 | tilfælde 489 | karriere 490 | givet 491 | stærkt 492 | nyt 493 | liv 494 | ældre 495 | Af 496 | enkelt 497 | kommune 498 | bruge 499 | adskillige 500 | dannet 501 | sendt 502 | England 503 | kaldt 504 | egne 505 | hvorefter 506 | ser 507 | eget 508 | Danmark, 509 | verdens 510 | arbejder 511 | uddannet 512 | sådan 513 | ham. 514 | herunder 515 | fransk 516 | klubben 517 | meste 518 | støtte 519 | holde 520 | Danske 521 | Henrik 522 | nævnes 523 | bedst 524 | er, 525 | tager 526 | resten 527 | Sverige 528 | folk 529 | bruger 530 | hær 531 | cirka 532 | oprettet 533 | vist 534 | øvrige 535 | Kirke. 536 | opkaldt 537 | Blandt 538 | verdenskrig 539 | historie 540 | sidst 541 | steder 542 | Erik 543 | brugte 544 | ligeledes 545 | (i 546 | hvordan 547 | mulighed 548 | Navnet 549 | m 550 | midten 551 | USA. 552 | modtog 553 | far 554 | områder 555 | børn 556 | forlod 557 | Kommune. 558 | internationale 559 | for, 560 | -------------------------------------------------------------------------------- /justext/stoplists/Esperanto.txt: -------------------------------------------------------------------------------- 1 | la 2 | de 3 | en 4 | kaj 5 | estas 6 | La 7 | al 8 | estis 9 | En 10 | por 11 | el 12 | kun 13 | li 14 | kiu 15 | kiel 16 | pri 17 | komunumo 18 | aŭ 19 | ne 20 | ankaŭ 21 | Ĝi 22 | ke 23 | pli 24 | ĝi 25 | plej 26 | per 27 | tiu 28 | oni 29 | dum 30 | sed 31 | jaro 32 | Li 33 | laŭ 34 | inter 35 | kiuj 36 | ĉe 37 | ĝis 38 | havas 39 | sur 40 | urbo 41 | havis 42 | je 43 | unu 44 | pro 45 | troviĝas 46 | distrikto 47 | ili 48 | unua 49 | sia 50 | nur 51 | da 52 | vilaĝo 53 | povas 54 | post 55 | du 56 | situas 57 | ol 58 | poste 59 | ĉi 60 | parto 61 | aliaj 62 | jam 63 | kie 64 | ekde 65 | tiuj 66 | regiono 67 | tre 68 | kiam 69 | tio 70 | nomo 71 | loĝantojn. 72 | lando 73 | lia 74 | NGC 75 | jaroj 76 | sian 77 | granda 78 | Post 79 | Dum 80 | apartenas 81 | multaj 82 | ŝi 83 | ofte 84 | Germanio. 
85 | apartenis 86 | federacia 87 | jarcento 88 | ĉar 89 | Historio. 90 | esti 91 | Ĉi 92 | ĝin 93 | kontraŭ 94 | tie 95 | Tiu 96 | do 97 | Ili 98 | Oni 99 | junio 100 | antaŭ 101 | ĉirkaŭ 102 | mortis 103 | eĉ 104 | ĉiuj 105 | kelkaj 106 | tiel 107 | ĉefe 108 | kies 109 | iĝis 110 | Ĝis 111 | okazis 112 | sub 113 | ties 114 | decembro 115 | Ekde 116 | Vidu 117 | kiun 118 | fariĝis 119 | tri 120 | rivero 121 | El 122 | mencio 123 | versio 124 | ĉefurbo 125 | franca 126 | lingvo 127 | ricevis 128 | objekto 129 | Ĝia 130 | siaj 131 | tra 132 | familio 133 | ĝia 134 | kio 135 | originala 136 | Italio. 137 | provinco 138 | tiam 139 | 30-an 140 | lin 141 | Laŭ 142 | sin 143 | dua 144 | diversaj 145 | hungara 146 | tamen 147 | konstelacio 148 | malgranda 149 | vivis 150 | departemento 151 | nun 152 | konata 153 | iom 154 | ĉiu 155 | tuta 156 | germana 157 | loĝantojn 158 | ankoraŭ 159 | tempo 160 | NGC-katalogo. 161 | fine 162 | jenon:. 163 | se 164 | Esperanto 165 | indikoj 166 | nova 167 | super 168 | Ŝi 169 | Inter 170 | naskiĝis 171 | grava 172 | homoj 173 | jaroj. 174 | signifas 175 | aperis 176 | grandaj 177 | (aliaj 178 | ĉefa 179 | galaksio 180 | Situo. 181 | norda 182 | Eksteraj 183 | alia 184 | preskaŭ 185 | 1-a 186 | nacia 187 | Sed 188 | multe 189 | 2-a 190 | mem 191 | PGC 192 | formo 193 | reĝo 194 | ekzistas 195 | precipe 196 | De 197 | Tio 198 | Sankta 199 | tiun 200 | Pro 201 | teritorio 202 | Lia 203 | (en 204 | specio 205 | ĉ. 206 | ekzemple 207 | municipo 208 | (naskiĝis 209 | km 210 | Hungario, 211 | loĝis 212 | Por 213 | januaro 214 | nomata 215 | Geografio. 216 | sen 217 | nomon 218 | komencis 219 | ligiloj. 
220 | pluraj 221 | kutime 222 | siajn 223 | uzas 224 | personoj 225 | sama 226 | fare 227 | MCG 228 | loko 229 | alta 230 | liaj 231 | grupo 232 | suda 233 | homoj, 234 | "La 235 | konsistas 236 | kvankam 237 | proksimume 238 | okcidenta 239 | lasta 240 | uzata 241 | centro 242 | marto 243 | subŝtato 244 | majo 245 | septembro 246 | Kiel 247 | julio 248 | tute 249 | ilin 250 | Ankaŭ 251 | usona 252 | Kiam 253 | Kantono 254 | devas 255 | ambaŭ 256 | malpli 257 | Hungario 258 | aprilo 259 | fondita 260 | apud 261 | faris 262 | laboris 263 | grandan 264 | vorto 265 | jarcento. 266 | Rumanio 267 | kune 268 | verkis 269 | unue 270 | oktobro 271 | fino 272 | loĝantaro 273 | kvar 274 | Universitato 275 | denove 276 | (la 277 | fondis 278 | aŭgusto 279 | loĝantoj 280 | areo 281 | iu 282 | 31-an 283 | genro 284 | multajn 285 | ĉiam 286 | unuaj 287 | membro 288 | vivas 289 | mondmilito 290 | morto 291 | novembro 292 | Estas 293 | eble 294 | nombro 295 | filo 296 | fama 297 | povis 298 | plu 299 | kiujn 300 | enhavas 301 | studis 302 | proksime 303 | okazas 304 | devenas 305 | foje 306 | restis 307 | ZWG 308 | longa 309 | 19-a 310 | partoj 311 | loka 312 | tio, 313 | nome 314 | ĉu 315 | alteco 316 | orienta 317 | Poste 318 | Se 319 | ŝtato 320 | prezidanto 321 | gravaj 322 | viroj 323 | Tamen 324 | birdo 325 | insulo 326 | komunumoj 327 | birdoj 328 | devis 329 | preĝejo 330 | angla 331 | Hungara 332 | urboj 333 | San 334 | loĝas 335 | unuan 336 | armeo 337 | montras 338 | registaro 339 | same 340 | devenis 341 | Nacia 342 | sude 343 | februaro 344 | landoj 345 | filmo 346 | m 347 | hungare 348 | milito 349 | bone 350 | partoprenis 351 | historio 352 | meza 353 | bordo 354 | krom 355 | Distrikto 356 | Norda 357 | aperas 358 | mil 359 | vivo 360 | Tiuj 361 | ilia 362 | nomita 363 | konstruita 364 | specioj 365 | popolnombrado 366 | jarcento, 367 | Usono 368 | patro 369 | nomiĝas 370 | Eŭropo 371 | uzis 372 | Granda 373 | Administre 374 | of 375 | Kvankam 376 | ĝenerale 377 | 
norde 378 | Al 379 | centra 380 | 20-a 381 | komence 382 | lingvoj 383 | okupis 384 | verko 385 | Eŭropa 386 | oficiala 387 | verkoj 388 | jaroj, 389 | politika 390 | po 391 | ŝia 392 | INEGI) 393 | subregiono 394 | sistemo 395 | partio 396 | metroj 397 | Ĉe 398 | UGC 399 | natura 400 | lingvo. 401 | historia 402 | Pri 403 | malnova 404 | (aŭ 405 | libro 406 | Germanio 407 | atingis 408 | Je 409 | internacia 410 | Internacia 411 | hungaroj 412 | periodo 413 | Ĉar 414 | kreis 415 | lian 416 | kelkajn 417 | ktp. 418 | iuj 419 | akiris 420 | donis 421 | fervojo 422 | jarojn 423 | Per 424 | kvin 425 | Francio 426 | epoko 427 | tuj 428 | Esperanto. 429 | novan 430 | tial 431 | aparte 432 | hispana 433 | trovis 434 | ni 435 | lago 436 | aliajn 437 | nuna 438 | iam 439 | Traktato 440 | estus 441 | tiom 442 | a.K. 443 | komenco 444 | hodiaŭ 445 | parton 446 | geografia 447 | virinoj. 448 | Biografio. 449 | sufiĉe 450 | nuntempe 451 | 3-a 452 | 18-a 453 | sukcesis 454 | neniam 455 | oriente 456 | Tamen, 457 | meksika 458 | Respubliko 459 | surfacon 460 | venis 461 | ene 462 | temas 463 | antikva 464 | latina 465 | km². 466 | parte 467 | skriba 468 | mondo 469 | verŝajne 470 | John 471 | edzino 472 | samtempe 473 | m. 474 | eblas 475 | Sur 476 | plejparte 477 | ĉiujn 478 | Tie 479 | kantono 480 | sciis 481 | Multaj 482 | valo 483 | tiama 484 | nigra 485 | antaŭe 486 | Slovakio. 487 | tia 488 | manĝas 489 | del 490 | Pli 491 | duono 492 | milionoj 493 | konsiderata 494 | nomas 495 | mi 496 | punkto 497 | reĝlando, 498 | Krome 499 | (de 500 | marnivelo. 501 | novaj 502 | 16-a 503 | bona 504 | malgrandaj 505 | okcidente 506 | ĉefaj 507 | ajn 508 | km-ojn. 509 | Tiel 510 | fakte 511 | lingvon. 512 | nordo 513 | monto 514 | membroj 515 | rapide 516 | y 517 | Krom 518 | komuna 519 | regionoj 520 | Unu 521 | blanka 522 | instituto 523 | thumb 524 | iliaj 525 | mondo. 526 | mortinta 527 | Rejnlando-Palatinato. 
528 | (lasta 529 | kreita 530 | 15-a 531 | restas 532 | nek 533 | 4-a 534 | flanko 535 | universitato 536 | vilaĝoj 537 | batalo 538 | Kelkaj 539 | Suda 540 | greka 541 | titolo 542 | povus 543 | revenis 544 | moderna 545 | estos 546 | Svislando. 547 | Fine 548 | krome 549 | rusa 550 | sekve 551 | Trianon 552 | itala 553 | entute 554 | tutan 555 | urbo. 556 | propra 557 | ĝenerala 558 | lingvo, 559 | proksima 560 | formas 561 | tro 562 | tria 563 | gajnis 564 | ĉeĥa 565 | -------------------------------------------------------------------------------- /justext/stoplists/Italian.txt: -------------------------------------------------------------------------------- 1 | di 2 | e 3 | il 4 | la 5 | in 6 | del 7 | a 8 | che 9 | è 10 | un 11 | della 12 | per 13 | una 14 | con 15 | nel 16 | da 17 | si 18 | i 19 | al 20 | le 21 | dei 22 | Il 23 | La 24 | alla 25 | non 26 | come 27 | più 28 | delle 29 | nella 30 | anche 31 | dal 32 | sono 33 | ed 34 | ad 35 | ha 36 | fu 37 | gli 38 | sua 39 | Nel 40 | suo 41 | era 42 | ma 43 | cui 44 | o 45 | dalla 46 | due 47 | lo 48 | tra 49 | parte 50 | degli 51 | stato 52 | prima 53 | anni 54 | essere 55 | I 56 | su 57 | In 58 | ai 59 | dopo 60 | loro 61 | questo 62 | viene 63 | uno 64 | venne 65 | sul 66 | alle 67 | Le 68 | dove 69 | nei 70 | primo 71 | solo 72 | città 73 | nelle 74 | sulla 75 | nome 76 | È 77 | poi 78 | mentre 79 | molto 80 | quale 81 | comune 82 | fino 83 | suoi 84 | sia 85 | se 86 | dai 87 | stata 88 | dello 89 | altri 90 | quando 91 | hanno 92 | questa 93 | A 94 | erano 95 | durante 96 | San 97 | stesso 98 | può 99 | tre 100 | secondo 101 | aveva 102 | contro 103 | quella 104 | de 105 | serie 106 | quello 107 | furono 108 | fine 109 | alcuni 110 | quali 111 | seguito 112 | grande 113 | film 114 | Dopo 115 | Si 116 | tutti 117 | ancora 118 | così 119 | volta 120 | Per 121 | gruppo 122 | sempre 123 | sue 124 | dalle 125 | negli 126 | quindi 127 | ogni 128 | circa 129 | abitanti 130 | sotto 131 | verso 132 | tempo 133 | 
periodo 134 | fra 135 | senza 136 | seconda 137 | guerra 138 | altre 139 | proprio 140 | già 141 | quanto 142 | ne 143 | nuovo 144 | fatto 145 | vita 146 | modo 147 | oltre 148 | agli 149 | famiglia 150 | tutto 151 | aver 152 | trova 153 | regione 154 | causa 155 | numero 156 | questi 157 | situato 158 | però 159 | presso 160 | of 161 | Gli 162 | secolo 163 | stagione 164 | molti 165 | primi 166 | alcune 167 | forma 168 | via 169 | francese 170 | sistema 171 | stati 172 | fa 173 | vi 174 | insieme 175 | Nella 176 | storia 177 | titolo 178 | perché 179 | chiesa 180 | spesso 181 | grazie 182 | invece 183 | posto 184 | possono 185 | base 186 | rispetto 187 | quattro 188 | e, 189 | squadra 190 | centro 191 | che, 192 | tutte 193 | punto 194 | termine 195 | territorio 196 | lui 197 | vengono 198 | nuova 199 | particolare 200 | lungo 201 | caso 202 | soprattutto 203 | morte 204 | Un 205 | corso 206 | mondo 207 | anno 208 | quasi 209 | Stati 210 | casa 211 | dagli 212 | inoltre 213 | Con 214 | tale 215 | infatti 216 | sulle 217 | allo 218 | Questo 219 | provincia 220 | versione 221 | ruolo 222 | nello 223 | maggiore 224 | poco 225 | Ha 226 | (in 227 | Fu 228 | attraverso 229 | Dal 230 | divenne 231 | successo 232 | album 233 | finale 234 | diversi 235 | mai 236 | ebbe 237 | sede 238 | Al 239 | tipo 240 | Durante 241 | popolazione 242 | figlio 243 | produzione 244 | pubblicato 245 | Italia 246 | Giovanni 247 | oggi 248 | Lo 249 | ben 250 | the 251 | re 252 | presenta 253 | grandi 254 | queste 255 | grado 256 | stessa 257 | settembre 258 | ci 259 | maggio 260 | Una 261 | volte 262 | esempio 263 | giugno 264 | specie 265 | Storia. 266 | Biografia. 
267 | vennero 268 | altro 269 | società 270 | Maria 271 | Tra 272 | diverse 273 | livello 274 | col 275 | campo 276 | zona 277 | quelle 278 | gennaio 279 | partire 280 | meno 281 | dipartimento 282 | sui 283 | maggior 284 | luglio 285 | presenza 286 | propria 287 | campionato 288 | metà 289 | sarebbe 290 | momento 291 | Questa 292 | far 293 | principale 294 | linea 295 | fosse 296 | opere 297 | Santa 298 | carriera 299 | posizione 300 | quel 301 | padre 302 | all'interno 303 | marzo 304 | dicembre 305 | avrebbe 306 | fece 307 | generale 308 | "La 309 | possibile 310 | tutta 311 | lavoro 312 | II 313 | progetto 314 | presente 315 | governo 316 | attività 317 | avere 318 | genere 319 | vari 320 | quelli 321 | novembre 322 | lingua 323 | importante 324 | giorno 325 | Da 326 | luogo 327 | dato 328 | molte 329 | nazionale 330 | allora 331 | deve 332 | "Il 333 | opera 334 | Roma 335 | presenti 336 | Carlo 337 | state 338 | costruzione 339 | diretto 340 | "The 341 | Chiesa 342 | Anche 343 | ciò 344 | italiano 345 | febbraio 346 | corpo 347 | New 348 | stile 349 | comunque 350 | band 351 | numerosi 352 | tratta 353 | situata 354 | The 355 | prime 356 | giovane 357 | sviluppo 358 | Non 359 | ottobre 360 | anni, 361 | servizio 362 | varie 363 | avevano 364 | Di 365 | Secondo 366 | Alla 367 | fiume 368 | pari 369 | importanti 370 | singolo 371 | fare 372 | qualche 373 | subito 374 | aprile 375 | chiamato 376 | canzone 377 | musica 378 | studio 379 | paese 380 | Quando 381 | egli 382 | struttura 383 | agosto 384 | cinque 385 | persone 386 | terzo 387 | tanto 388 | Serie 389 | Come 390 | porta 391 | battaglia 392 | gran 393 | nord 394 | Ma 395 | piano 396 | sei 397 | successivamente 398 | studi 399 | giorni 400 | comunità 401 | origine 402 | principali 403 | politica 404 | realtà 405 | Uniti 406 | sarà 407 | prodotto 408 | John 409 | and 410 | Francesco 411 | tramite 412 | strada 413 | inglese 414 | ultimi 415 | probabilmente 416 | avuto 417 | capo 418 | Repubblica 419 | 
mondiale 420 | piccolo 421 | tuttavia 422 | lavori 423 | vista 424 | Coppa 425 | vittoria 426 | presidente 427 | potere 428 | sud 429 | Nazionale 430 | stazione 431 | pochi 432 | forte 433 | Regno 434 | fratello 435 | ricerca 436 | mesi 437 | gioco 438 | cioè 439 | numerose 440 | personaggio 441 | secolo, 442 | anni. 443 | controllo 444 | forze 445 | formazione 446 | va 447 | libro 448 | fuori 449 | iniziò 450 | calcio 451 | poiché 452 | riesce 453 | scuola 454 | gruppi 455 | utilizzato 456 | noto 457 | Trama. 458 | dallo 459 | Gran 460 | data 461 | li 462 | membri 463 | De 464 | metri 465 | (che 466 | breve 467 | programma 468 | sta 469 | milioni 470 | fase 471 | km 472 | vicino 473 | figlia 474 | distretto 475 | classe 476 | centrale 477 | nonostante 478 | uomini 479 | italiana 480 | ritorno 481 | punti 482 | Francia 483 | ora 484 | Paolo 485 | assieme 486 | luce 487 | inizia 488 | madre 489 | fronte 490 | disco 491 | lei 492 | superficie 493 | funzione 494 | pubblico 495 | moglie 496 | nuovi 497 | nota 498 | piccola 499 | modello 500 | almeno 501 | processo 502 | scritto 503 | proprietà 504 | ormai 505 | Giuseppe 506 | valore 507 | brano 508 | (o 509 | chiamata 510 | considerato 511 | Negli 512 | piuttosto 513 | velocità 514 | legge 515 | nuove 516 | Sono 517 | vinto 518 | trovano 519 | attualmente 520 | possibilità 521 | elementi 522 | detto 523 | problemi 524 | completamente 525 | Stato 526 | mezzo 527 | nominato 528 | intorno 529 | membro 530 | Luigi 531 | qui 532 | ovvero 533 | simile 534 | forza 535 | motore 536 | totale 537 | Pietro 538 | militare 539 | diritto 540 | scopo 541 | movimento 542 | secolo. 
543 | fascia 544 | riuscì 545 | caratterizzata 546 | personaggi 547 | usato 548 | ufficiale 549 | prese 550 | successivo 551 | Nonostante 552 | Nei 553 | truppe 554 | Antonio 555 | capacità 556 | (il 557 | romanzo 558 | regno 559 | edizione 560 | diocesi 561 | Prima 562 | III 563 | terra 564 | tempi 565 | spazio 566 | donna 567 | comuni 568 | località 569 | musicale 570 | -------------------------------------------------------------------------------- /tests/test_sax.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division, print_function, unicode_literals 5 | 6 | from lxml import html 7 | from justext.core import ParagraphMaker 8 | 9 | 10 | class TestSax(): 11 | 12 | def assert_paragraphs_equal(self, paragraph, **kwargs): 13 | for name, value in kwargs.items(): 14 | returned_value = getattr(paragraph, name) 15 | msg = "%s: %r != %r" % (name, value, returned_value) 16 | assert value == returned_value, msg 17 | 18 | def test_no_paragraphs(self): 19 | html_string = '' 20 | dom = html.fromstring(html_string) 21 | 22 | returned = html.tostring(dom).decode("utf8") 23 | assert html_string == returned 24 | 25 | paragraphs = ParagraphMaker.make_paragraphs(dom) 26 | assert len(paragraphs) == 0 27 | 28 | def test_basic(self): 29 | html_string = ( 30 | '' 31 | '

Header

' 32 | '

text and some other words that I have in my head now

' 33 | '

footer

' 34 | '' 35 | ) 36 | dom = html.fromstring(html_string) 37 | 38 | returned = html.tostring(dom).decode("utf8") 39 | assert html_string == returned 40 | 41 | paragraphs = ParagraphMaker.make_paragraphs(dom) 42 | assert len(paragraphs) == 3 43 | 44 | self.assert_paragraphs_equal(paragraphs[0], text="Header", words_count=1, tags_count=0) 45 | 46 | text = "text and some other words that I have in my head now" 47 | self.assert_paragraphs_equal(paragraphs[1], text=text, words_count=12, tags_count=2) 48 | 49 | self.assert_paragraphs_equal(paragraphs[2], text="footer", words_count=1, tags_count=0) 50 | 51 | def test_whitespace_handling(self): 52 | html_string = ( 53 | '' 54 | '

preinpost \t pre in post

' 55 | '
pre in post
' 56 | '
prein post
' 57 | '
pre inpost
' 58 | '' 59 | ) 60 | dom = html.fromstring(html_string) 61 | 62 | returned = html.tostring(dom).decode("utf8") 63 | assert html_string == returned 64 | 65 | paragraphs = ParagraphMaker.make_paragraphs(dom) 66 | assert len(paragraphs) == 4 67 | 68 | self.assert_paragraphs_equal( 69 | paragraphs[0], 70 | text="preinpost pre in post", 71 | words_count=4, 72 | tags_count=2 73 | ) 74 | self.assert_paragraphs_equal( 75 | paragraphs[1], 76 | text="pre in post", 77 | words_count=3, 78 | tags_count=1 79 | ) 80 | self.assert_paragraphs_equal( 81 | paragraphs[2], 82 | text="prein post", 83 | words_count=2, 84 | tags_count=1 85 | ) 86 | self.assert_paragraphs_equal( 87 | paragraphs[3], 88 | text="pre inpost", 89 | words_count=2, 90 | tags_count=1 91 | ) 92 | 93 | def test_multiple_line_break(self): 94 | html_string = ( 95 | '' 96 | ' normal text

another text ' 97 | '' 98 | ) 99 | dom = html.fromstring(html_string) 100 | 101 | returned = html.tostring(dom).decode("utf8") 102 | assert html_string == returned 103 | 104 | paragraphs = ParagraphMaker.make_paragraphs(dom) 105 | assert len(paragraphs) == 2 106 | 107 | self.assert_paragraphs_equal( 108 | paragraphs[0], 109 | text="normal text", 110 | words_count=2, 111 | tags_count=0 112 | ) 113 | self.assert_paragraphs_equal( 114 | paragraphs[1], 115 | text="another text", 116 | words_count=2, 117 | tags_count=0 118 | ) 119 | 120 | def test_inline_text_in_body(self): 121 | """Inline text should be treated as separate paragraph.""" 122 | html_string = ( 123 | '' 124 | 'I am top-inline\n\n\n\n and I am happy \n' 125 | '

normal text

' 126 | '\nvar i = -INFINITY;\n' 127 | '
after text with variable N
' 128 | ' I am inline\n\n\n\n and I am happy \n' 129 | '' 130 | ) 131 | dom = html.fromstring(html_string) 132 | 133 | paragraphs = ParagraphMaker.make_paragraphs(dom) 134 | assert len(paragraphs) == 5 135 | 136 | self.assert_paragraphs_equal( 137 | paragraphs[0], 138 | words_count=7, 139 | tags_count=2, 140 | text="I am top-inline\nand I am happy" 141 | ) 142 | self.assert_paragraphs_equal( 143 | paragraphs[1], 144 | words_count=2, 145 | tags_count=0, 146 | text="normal text" 147 | ) 148 | self.assert_paragraphs_equal( 149 | paragraphs[2], 150 | words_count=4, 151 | tags_count=1, 152 | text="var i = -INFINITY;" 153 | ) 154 | self.assert_paragraphs_equal( 155 | paragraphs[3], 156 | words_count=5, 157 | tags_count=1, 158 | text="after text with variable N" 159 | ) 160 | self.assert_paragraphs_equal( 161 | paragraphs[4], 162 | words_count=7, 163 | tags_count=0, 164 | text="I am inline\nand I am happy" 165 | ) 166 | 167 | def test_links(self): 168 | """Inline text should be treated as separate paragraph.""" 169 | html_string = ( 170 | '' 171 | 'I am top-inline\n\n\n\n and I am happy \n' 172 | '

normal text

' 173 | '\nvar i = -INFINITY;\n' 174 | '
after text with variable N
' 175 | ' I am inline\n\n\n\n and I am happy \n' 176 | '' 177 | ) 178 | dom = html.fromstring(html_string) 179 | 180 | paragraphs = ParagraphMaker.make_paragraphs(dom) 181 | assert len(paragraphs) == 5 182 | 183 | self.assert_paragraphs_equal( 184 | paragraphs[0], 185 | words_count=7, 186 | tags_count=2, 187 | text="I am top-inline\nand I am happy", 188 | chars_count_in_links=31 189 | ) 190 | self.assert_paragraphs_equal( 191 | paragraphs[1], 192 | words_count=2, 193 | tags_count=0, 194 | text="normal text" 195 | ) 196 | self.assert_paragraphs_equal( 197 | paragraphs[2], 198 | words_count=4, 199 | tags_count=1, 200 | text="var i = -INFINITY;" 201 | ) 202 | self.assert_paragraphs_equal( 203 | paragraphs[3], 204 | words_count=5, 205 | tags_count=2, 206 | text="after text with variable N", 207 | chars_count_in_links=4 208 | ) 209 | self.assert_paragraphs_equal( 210 | paragraphs[4], 211 | words_count=7, 212 | tags_count=0, 213 | text="I am inline\nand I am happy" 214 | ) 215 | --------------------------------------------------------------------------------