├── .env.example ├── .gitignore ├── README.md ├── data.json ├── data └── nltk_data │ └── tokenizers │ └── punkt │ ├── PY3 │ ├── README │ ├── czech.pickle │ ├── danish.pickle │ ├── dutch.pickle │ ├── english.pickle │ ├── estonian.pickle │ ├── finnish.pickle │ ├── french.pickle │ ├── german.pickle │ ├── greek.pickle │ ├── italian.pickle │ ├── norwegian.pickle │ ├── polish.pickle │ ├── portuguese.pickle │ ├── russian.pickle │ ├── slovene.pickle │ ├── spanish.pickle │ ├── swedish.pickle │ └── turkish.pickle │ ├── README │ ├── czech.pickle │ ├── danish.pickle │ ├── dutch.pickle │ ├── english.pickle │ ├── estonian.pickle │ ├── finnish.pickle │ ├── french.pickle │ ├── german.pickle │ ├── greek.pickle │ ├── italian.pickle │ ├── norwegian.pickle │ ├── polish.pickle │ ├── portuguese.pickle │ ├── russian.pickle │ ├── slovene.pickle │ ├── spanish.pickle │ ├── swedish.pickle │ └── turkish.pickle ├── media └── search.png ├── requirements.txt ├── schema.sql ├── scrapy.cfg ├── script └── meili.py └── sherlock ├── __init__.py ├── items.py ├── loaders.py ├── middlewares.py ├── pipelines.py ├── settings.py ├── spiders ├── __init__.py ├── members.py ├── pages.py ├── titles.py └── votes.py └── utils ├── __init__.py ├── config.py ├── database.py ├── regex.py └── wikidot.py /.env.example: -------------------------------------------------------------------------------- 1 | PG_DBNAME= 2 | PG_USER= 3 | PG_PASSWORD= 4 | PG_HOST= 5 | PG_PORT= 6 | 7 | MEILISEARCH_HOST= 8 | MEILISEARCH_PORT= 9 | MEILISEARCH_KEY= -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .vscode 3 | output.json 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ 143 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |
3 | # 🎩 Sherlock
4 |
5 | A web crawler to retrieve Wikidot information.
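
Usage sketch (not from the original README; the commands are inferred from `scrapy.cfg`, `data.json` and `script/meili.py`): run a spider from the repository root with `scrapy crawl pages -a site=scp-wiki`, where `site` is any key of `data.json` and the other spiders are `members`, `titles` and `votes`; the stored rows can then be pushed to Meilisearch with `python script/meili.py upload --item member --locale fr en` (the `action`, `--item` and `--locale` choices come from the argparse definitions in `script/meili.py`).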
142 | -------------------------------------------------------------------------------- /data.json: -------------------------------------------------------------------------------- 1 | { 2 | "scp-wiki": { 3 | "id": 66711, 4 | "index": [ 5 | "scp-series", 6 | "scp-series-2", 7 | "scp-series-3", 8 | "scp-series-4", 9 | "scp-series-5", 10 | "scp-series-6", 11 | "joke-scps", 12 | "scp-ex", 13 | "archived-scps" 14 | ], 15 | "language": "english" 16 | }, 17 | "scpko": { 18 | "id": 486864, 19 | "index": [], 20 | "language": null 21 | }, 22 | "scp-wiki-cn": { 23 | "id": 530812, 24 | "index": [ 25 | "scp-series", 26 | "scp-series-2", 27 | "scp-series-3", 28 | "scp-series-4", 29 | "scp-series-5", 30 | "scp-series-6", 31 | "scp-international", 32 | "scp-series-cn", 33 | "scp-series-cn-2", 34 | "joke-scps", 35 | "joke-scps-cn", 36 | "scp-ex", 37 | "scp-ex-cn", 38 | "archived-scps" 39 | ], 40 | "language": null 41 | }, 42 | "fondationscp": { 43 | "id": 464696, 44 | "index": [ 45 | "scp-series", 46 | "scp-series-2", 47 | "scp-series-3", 48 | "scp-series-4", 49 | "scp-series-5", 50 | "scp-series-6", 51 | "scp-series-hub", 52 | "liste-francaise", 53 | "scps-humoristiques-francais", 54 | "joke-scps", 55 | "scp-ex" 56 | ], 57 | "language": "french" 58 | }, 59 | "scp-pl": { 60 | "id": 647733, 61 | "index": [], 62 | "language": "polish" 63 | }, 64 | "lafundacionscp": { 65 | "id": 560484, 66 | "index": [ 67 | "scp-series", 68 | "scp-series-2", 69 | "scp-series-3", 70 | "scp-series-4", 71 | "scp-series-5", 72 | "scp-series-6", 73 | "scp-series-hub", 74 | "serie-scp-es", 75 | "serie-scp-es-2", 76 | "scps-humoristicos", 77 | "scp-ex" 78 | ], 79 | "language": "spanish" 80 | }, 81 | "scp-th": { 82 | "id": 547203, 83 | "index": [], 84 | "language": null 85 | }, 86 | "scp-jp": { 87 | "id": 578002, 88 | "index": [], 89 | "language": null 90 | }, 91 | "scp-wiki-de": { 92 | "id": 1269857, 93 | "index": [], 94 | "language": "german" 95 | }, 96 | "fondazionescp": { 97 | "id": 530167, 98 | "index": [], 99 | "language": "italian" 100 | }, 101 | "scp-ukrainian": { 102 | "id": 1398197, 103 | "index": [], 104 | "language": null 105 | }, 106 | "scp-pt-br": { 107 | "id": 783633, 108 | "index": [], 109 | "language": "portuguese" 110 | }, 111 | "scp-int": { 112 | "id": 1427610, 113 | "index": [], 114 | "language": "english" 115 | }, 116 | "scp-ru": { 117 | "id": 169125, 118 | "index": [], 119 | "language": "russian" 120 | }, 121 | "scp-zh-tr": { 122 | "id": 3947998, 123 | "index": [ 124 | "scp-series", 125 | "scp-series-2", 126 | "scp-series-3", 127 | "scp-series-4", 128 | "scp-series-5", 129 | "scp-series-6", 130 | "scp-int-translation-hub", 131 | "scp-series-zh", 132 | "joke-scps", 133 | "scp-ex", 134 | "archived-scps" 135 | ], 136 | "language": null 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/README: -------------------------------------------------------------------------------- 1 | Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected) 2 | 3 | Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have 4 | been contributed by various people using NLTK for sentence boundary detection. 
5 | 6 | For information about how to use these models, please confer the tokenization HOWTO: 7 | http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html 8 | and chapter 3.8 of the NLTK book: 9 | http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation 10 | 11 | There are pretrained tokenizers for the following languages: 12 | 13 | File Language Source Contents Size of training corpus(in tokens) Model contributed by 14 | ======================================================================================================================================================================= 15 | czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss 16 | Literarni Noviny 17 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 18 | danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss 19 | (Berlingske Avisdata, Copenhagen) Weekend Avisen 20 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 21 | dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss 22 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 23 | english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss 24 | (American) 25 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 26 | estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss 27 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 28 | finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss 29 | Text Bank (Suomen Kielen newspapers 30 | Tekstipankki) 31 | Finnish Center for IT Science 32 | (CSC) 33 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 34 | french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss 35 | (European) 36 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 37 | german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss 38 | (Switzerland) CD-ROM 39 | (Uses "ss" 40 | instead of "ß") 41 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 42 | greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss 43 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 44 | italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss 45 | 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 46 | norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss 47 | (Bokmål and Information Technologies, 48 | Nynorsk) Bergen 49 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 50 | polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner 51 | (http://www.nkjp.pl/) 52 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 53 | portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss 54 | (Brazilian) (Linguateca) 55 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 56 | slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss 57 | Slovene Academy for Arts 58 | and Sciences 59 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 60 | spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss 61 | (European) 62 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 63 | swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss 64 | (and some other texts) 65 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 66 | turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss 67 | (Türkçe Derlem Projesi) 68 | University of Ankara 69 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 70 | 71 | The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to 72 | Unicode using the codecs module. 73 | 74 | Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection. 75 | Computational Linguistics 32: 485-525. 
76 | 77 | ---- Training Code ---- 78 | 79 | # import punkt 80 | import nltk.tokenize.punkt 81 | 82 | # Make a new Tokenizer 83 | tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer() 84 | 85 | # Read in training corpus (one example: Slovene) 86 | import codecs 87 | text = codecs.open("slovene.plain","Ur","iso-8859-2").read() 88 | 89 | # Train tokenizer 90 | tokenizer.train(text) 91 | 92 | # Dump pickled tokenizer 93 | import pickle 94 | out = open("slovene.pickle","wb") 95 | pickle.dump(tokenizer, out) 96 | out.close() 97 | 98 | --------- 99 | -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/czech.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/czech.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/danish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/danish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/dutch.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/dutch.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/english.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/english.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/estonian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/estonian.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/finnish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/finnish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/french.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/french.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/german.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/german.pickle 
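The training recipe in the Punkt README above is Python 2 era: the "Ur" mode of codecs.open() no longer exists in Python 3. A minimal Python 3 equivalent of the same steps ("slovene.plain" remains the stand-in corpus path from the recipe):

```python
import pickle
import nltk.tokenize.punkt

# build and train a sentence tokenizer on a raw-text corpus
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
with open("slovene.plain", encoding="iso-8859-2") as corpus:
    tokenizer.train(corpus.read())

# persist it in the same pickled form as the files in this directory
with open("slovene.pickle", "wb") as out:
    pickle.dump(tokenizer, out)
```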
-------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/greek.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/greek.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/italian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/italian.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/norwegian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/norwegian.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/polish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/polish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/portuguese.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/portuguese.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/russian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/russian.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/slovene.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/slovene.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/spanish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/spanish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/swedish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/swedish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/turkish.pickle: 
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/turkish.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/README:
--------------------------------------------------------------------------------

(Verbatim copy of /data/nltk_data/tokenizers/punkt/PY3/README above: the same pretrained Punkt model list, corpus table and training recipe is vendored in both directories.)

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/czech.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/czech.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/danish.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/danish.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/dutch.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/dutch.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/estonian.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/estonian.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/finnish.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/finnish.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/french.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/french.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/german.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/german.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/italian.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/italian.pickle
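The pickles vendored under data/nltk_data can be loaded without a global NLTK download by pointing NLTK's search path at the repository copy (a sketch; it assumes the process runs from the repository root):

```python
import nltk.data

# make NLTK resolve "tokenizers/punkt/..." against the vendored directory
nltk.data.path.insert(0, "data/nltk_data")

tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
print(tokenizer.tokenize("First sentence. Second sentence."))
```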
-------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/norwegian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/norwegian.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/polish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/polish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/portuguese.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/portuguese.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/russian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/russian.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/slovene.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/slovene.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/spanish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/spanish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/swedish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/swedish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/turkish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/turkish.pickle -------------------------------------------------------------------------------- /media/search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/media/search.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | astroid==2.4.2 2 | attrs==20.1.0 3 | Automat==20.2.0 4 | autopep8==1.5.4 5 | cffi==1.14.2 6 | click==7.1.2 7 | 
constantly==15.1.0
8 | cryptography==3.3.2
9 | cssselect==1.1.0
10 | hyperlink==20.0.1
11 | idna==2.10
12 | incremental==17.5.0
13 | isort==5.5.1
14 | itemadapter==0.1.0
15 | itemloaders==1.0.2
16 | jmespath==0.10.0
17 | joblib==0.16.0
18 | lazy-object-proxy==1.4.3
19 | lxml==4.6.3
20 | mccabe==0.6.1
21 | nltk==3.5
22 | parsel==1.6.0
23 | Protego==0.1.16
24 | psycopg2==2.8.5
25 | pyasn1==0.4.8
26 | pyasn1-modules==0.2.8
27 | pycodestyle==2.6.0
28 | pycparser==2.20
29 | PyDispatcher==2.0.5
30 | PyHamcrest==2.0.2
31 | pylint==2.6.0
32 | pyOpenSSL==19.1.0
33 | python-dotenv==0.14.0
34 | queries==2.1.0
35 | queuelib==1.5.0
36 | regex==2020.7.14
37 | Scrapy==2.3.0
38 | service-identity==18.1.0
39 | six==1.15.0
40 | toml==0.10.1
41 | tqdm==4.48.2
42 | Twisted==20.3.0
43 | w3lib==1.22.0
44 | wrapt==1.12.1
45 | zope.interface==5.1.0
46 |
--------------------------------------------------------------------------------
/schema.sql:
--------------------------------------------------------------------------------
1 | --- Author : Corentin POUPRY (HelloEdit)
2 |
3 |
4 | CREATE TABLE public.branch (
5 |     id INTEGER GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
6 |     name VARCHAR(25) NOT NULL,
7 |     locale VARCHAR(5) UNIQUE NOT NULL,
8 |     url VARCHAR(50) UNIQUE NOT NULL,
9 |     created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL
10 | );
11 |
12 | INSERT INTO public.branch (id, name, locale, url, created_at) VALUES
13 |     (66711, 'SCP Foundation', 'en', 'http://www.scp-wiki.net/', '2008-07-24 11:00:22+00:00'),
14 |     (486864, 'SCP 재단', 'ko', 'http://ko.scp-wiki.net/', '2012-08-10 10:01:31+00:00'),
15 |     (530812, 'SCP基金会', 'cn', 'http://scp-wiki-cn.wikidot.com/', '2013-01-30 17:36:27+00:00'),
16 |     (464696, 'Fondation SCP', 'fr', 'http://fondationscp.wikidot.com/', '2012-03-21 14:35:40+00:00'),
17 |     (647733, 'Fundacja SCP', 'pl', 'http://scp-wiki.net.pl/', '2014-04-25 12:14:57+00:00'),
18 |     (560484, 'La Fundación SCP', 'es', 'http://lafundacionscp.wikidot.com/', '2013-05-05 14:43:20+00:00'),
19 |     (547203, 'สถาบัน SCP', 'th', 'http://scp-th.wikidot.com/', '2013-04-05 09:04:34+00:00'),
20 |     (578002, 'SCP財団', 'jp', 'http://scp-jp.wikidot.com/', '2013-07-08 11:09:46+00:00'),
21 |     (1269857, 'SCP auf Deutsch', 'de', 'http://scp-wiki-de.wikidot.com/', '2016-04-05 21:04:44+00:00'),
22 |     (530167, 'Fondazione SCP', 'it', 'http://fondazionescp.wikidot.com/', '2013-01-26 15:51:12+00:00'),
23 |     (1398197, 'Фонд SCP', 'ua', 'http://scp-ukrainian.wikidot.com/', '2016-11-10 08:10:32+00:00'),
24 |     (783633, 'Fundação SCP', 'pt-br', 'http://scp-pt-br.wikidot.com/', '2015-08-24 13:40:14+00:00'),
25 |     (1427610, 'SCP International', 'int', 'http://scp-int.wikidot.com/', '2017-01-30 07:08:17+00:00'),
26 |     (169125, 'Фонд SCP', 'ru', 'http://scp-ru.wikidot.com/', '2010-06-27 17:11:41+00:00');
27 |
28 | CREATE TABLE public.pass (
29 |     id INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
30 |     branch_id INTEGER NOT NULL REFERENCES public.branch(id) ON DELETE CASCADE ON UPDATE CASCADE,
31 |     subject VARCHAR(25) NOT NULL,
32 |     started_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
33 |     ended_at TIMESTAMP WITH TIME ZONE,
34 |     pending BOOLEAN NOT NULL DEFAULT TRUE,
35 |     successful BOOLEAN NOT NULL DEFAULT FALSE
36 | );
37 |
38 | -- "user" is quoted because USER is a reserved keyword in PostgreSQL
39 | CREATE TABLE public."user" (
40 |     id INTEGER GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
41 |     username VARCHAR(99) UNIQUE NOT NULL,
42 |     slug VARCHAR(99) UNIQUE NOT NULL
43 | );
44 |
45 | CREATE TABLE public.membership (
46 |     id INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
47 |     branch_id INTEGER NOT NULL
REFERENCES public.branch(id) ON DELETE RESTRICT ON UPDATE CASCADE,
48 |     user_id INTEGER NOT NULL REFERENCES public."user"(id) ON DELETE RESTRICT ON UPDATE CASCADE,
49 |     member_since TIMESTAMP WITH TIME ZONE NOT NULL,
50 |     pass_id INTEGER NOT NULL REFERENCES public.pass(id) ON DELETE RESTRICT ON UPDATE CASCADE,
51 |     UNIQUE(user_id, branch_id)
52 | );
53 |
54 | CREATE TABLE public.page (
55 |     id INTEGER GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
56 |     title VARCHAR(256) NOT NULL,
57 |     subtitle VARCHAR(256),
58 |     preview VARCHAR(500),
59 |     branch_id INTEGER NOT NULL REFERENCES public.branch(id) ON DELETE RESTRICT ON UPDATE CASCADE,
60 |     slug VARCHAR(256) NOT NULL,
61 |     tags VARCHAR(50)[] NOT NULL DEFAULT '{}',
62 |     created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL,
63 |     updated_at TIMESTAMP WITH TIME ZONE DEFAULT NULL,
64 |     created_by INTEGER REFERENCES public."user"(id) ON DELETE SET NULL ON UPDATE CASCADE,
65 |     pass_id INTEGER NOT NULL REFERENCES public.pass(id) ON DELETE RESTRICT ON UPDATE CASCADE,
66 |     UNIQUE(branch_id, slug)
67 | );
68 |
69 | CREATE TABLE public.vote (
70 |     id INTEGER GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
71 |     user_id INTEGER NOT NULL REFERENCES public."user"(id) ON DELETE CASCADE ON UPDATE CASCADE,
72 |     page_id INTEGER NOT NULL REFERENCES public.page(id) ON DELETE CASCADE ON UPDATE CASCADE,
73 |     vote SMALLINT NOT NULL,
74 |     created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL,
75 |     pass_id INTEGER NOT NULL REFERENCES public.pass(id) ON DELETE RESTRICT ON UPDATE CASCADE,
76 |     UNIQUE(user_id, page_id)
77 | );
78 |
79 |
80 | CREATE OR REPLACE FUNCTION add_member(branch_id INTEGER, user_id INTEGER, slug TEXT, username TEXT, member_since TIMESTAMP WITH TIME ZONE, pass_id INTEGER) RETURNS void AS $$
81 |     INSERT INTO public."user" VALUES (user_id, username, slug)
82 |     ON CONFLICT (id) DO UPDATE SET
83 |         username = EXCLUDED.username,
84 |         slug = EXCLUDED.slug;
85 |
86 |     INSERT INTO public.membership VALUES (DEFAULT, branch_id, user_id, member_since, pass_id)
87 |     ON CONFLICT (user_id, branch_id) DO UPDATE SET
88 |         pass_id = EXCLUDED.pass_id;
89 | $$ LANGUAGE sql;
90 |
91 | CREATE OR REPLACE FUNCTION add_page(page_id INTEGER, branch_id INTEGER, title TEXT, preview TEXT, slug TEXT, tags TEXT[], created_by INTEGER, created_at TIMESTAMP WITH TIME ZONE, updated_at TIMESTAMP WITH TIME ZONE, pass_id INTEGER) RETURNS void AS $$
92 |     INSERT INTO public.page VALUES (page_id, title, NULL, preview, branch_id, slug, tags, created_at, updated_at, created_by, pass_id)
93 |     ON CONFLICT (id) DO UPDATE SET
94 |         pass_id = EXCLUDED.pass_id,
95 |         preview = EXCLUDED.preview,
96 |         title = EXCLUDED.title,
97 |         slug = EXCLUDED.slug;
98 | $$ LANGUAGE sql;
99 |
100 | CREATE OR REPLACE FUNCTION add_title(subtitle TEXT, slug TEXT, branch_id INTEGER) RETURNS void AS $$
101 |     UPDATE public.page SET subtitle = $1 WHERE
102 |         slug = $2 AND
103 |         branch_id = $3;
104 | $$ LANGUAGE sql;
105 |
106 | -- Clears created_by when it points at a user that has not been crawled yet.
107 | CREATE FUNCTION public.check_public_page_user_id() RETURNS trigger AS $$
108 | BEGIN
109 |     NEW.created_by = (SELECT id FROM public."user" WHERE id = NEW.created_by);
110 |     RETURN NEW;
111 | END;
112 | $$ LANGUAGE plpgsql;
113 |
114 | CREATE TRIGGER public_page_user_id_insert
115 |     BEFORE INSERT OR UPDATE ON public.page
116 |     FOR EACH ROW
117 |     WHEN (NEW.created_by IS NOT NULL)
118 |     EXECUTE FUNCTION public.check_public_page_user_id();
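A read-side illustration of this schema (a hypothetical query, not part of schema.sql; it uses the same `queries` library as script/meili.py, and the DSN is an assumption):

```python
import queries

# rating of a page = sum of its recorded votes (vote is presumably +1/-1)
RATING_SQL = """
SELECT page.slug, SUM(vote.vote) AS rating
FROM public.page
JOIN public.vote ON vote.page_id = page.id
GROUP BY page.slug
ORDER BY rating DESC
LIMIT 10;
"""

with queries.Session("postgresql://localhost/sherlock") as session:  # assumed DSN
    for row in session.query(RATING_SQL):
        print(row["slug"], row["rating"])
```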
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = sherlock.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sherlock
12 |
--------------------------------------------------------------------------------
/script/meili.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import re
3 | from os import environ
4 | from sys import exit
5 | from time import sleep
6 |
7 | import meilisearch
8 | import queries
9 | from dotenv import load_dotenv
10 |
11 | load_dotenv()
12 |
13 | SELECT_PAGE = """
14 | SELECT ARRAY_AGG(ROW_TO_JSON(t)) as data FROM (
15 |     SELECT usr.username as author, page.id, title, subtitle, preview, tags, (branch.url || page.slug) AS url
16 |     FROM public.page
17 |     INNER JOIN public.branch ON page.branch_id = branch.id
18 |     LEFT JOIN public."user" as usr ON page.created_by = usr.id
19 | ) t;
20 | """
21 | URI = queries.uri(
22 |     host=environ['PG_HOST'],
23 |     port=environ['PG_PORT'],
24 |     dbname=environ['PG_DBNAME'],
25 |     user=environ['PG_USER'],
26 |     password=environ['PG_PASSWORD']
27 | )
28 | ITEM_CHOICES = ['member']
29 | LOCALE_CHOICES = ['fr', 'en', 'int']
30 | TITLE_REGEX = re.compile(r"SCP-(\d+)$")
31 |
32 | parser = argparse.ArgumentParser(
33 |     prog="Meilisearch util",
34 |     description="CLI to interact with the Meilisearch database",
35 |     epilog="Because everyone loves Meilisearch"
36 | )
37 |
38 | parser.add_argument("action", choices=['upload'])
39 | parser.add_argument("--item", choices=ITEM_CHOICES, nargs="+", required=True)
40 | parser.add_argument("--locale", choices=LOCALE_CHOICES,
41 |                     nargs="+", required=True)
42 |
43 | print("""
44 | ▄▄   ▄▄ ▄▄▄▄▄▄▄ ▄▄▄ ▄▄▄     ▄▄▄ ▄▄▄▄▄▄▄ ▄▄▄▄▄▄▄ ▄▄▄▄▄▄   ▄▄▄▄▄▄ ▄▄▄▄▄▄▄ ▄▄   ▄▄
45 | █  █▄█  █       █   █   █   █   █       █       █   ▄  █ █      █       █  █ █  █
46 | █       █    ▄▄▄█   █   █   █   █  ▄▄▄▄▄█    ▄▄▄█  █ █ █ █  ▄   █   ▄   █  █▄█  █
47 | █       █   █▄▄▄█   █   █   █   █ █▄▄▄▄▄█   █▄▄▄█   █▄▄█▄█ █▄█  █  █▄█  █       █
48 | █       █    ▄▄▄█   █   █▄▄▄█   █▄▄▄▄▄  █    ▄▄▄█    ▄▄  █      █   ▄   █   ▄   █
49 | █ ██▄██ █   █▄▄▄█   █       █   █▄▄▄▄▄█ █   █▄▄▄█   █  █ █  ▄   █  █ █  █  █ █  █
50 | █▄█   █▄█▄▄▄▄▄▄▄█▄▄▄█▄▄▄▄▄▄▄█▄▄▄█▄▄▄▄▄▄▄█▄▄▄▄▄▄▄█▄▄▄█  █▄█▄█ █▄▄█▄▄█ █▄▄█▄▄█ █▄▄█
51 |
52 |
53 | \tA tool for Sherlock
54 | """)
55 |
56 | args = parser.parse_args()
57 |
58 | uri = "http://{host}:{port}".format(
59 |     host=environ['MEILISEARCH_HOST'],
60 |     port=environ['MEILISEARCH_PORT']
61 | )
62 | client = meilisearch.Client(uri, environ['MEILISEARCH_KEY'])
63 |
64 | try:
65 |     info = client.get_version()
66 |     print(f'Meilisearch v{info["pkgVersion"]}', end="\n\n")
67 | except Exception:
68 |     print('[ERROR] > could not connect to the specified Meilisearch instance.')
69 |     exit(1)
70 |
71 |
72 | def configure_index(index: meilisearch.client.Index):
73 |     if index.uid.endswith("page"):
74 |         index.update_settings({
75 |             "rankingRules": [
76 |                 "typo",
77 |                 "words",
78 |                 "proximity",
79 |                 "exactness",
80 |                 "attribute",
81 |                 "wordsPosition"
82 |             ],
83 |             "searchableAttributes": [
84 |                 "title",
85 |                 "title:code",
86 |                 "subtitle",
87 |                 "preview",
88 |                 "author"
89 |             ],
90 |             "displayedAttributes": [
91 |                 "id",
92 |                 "title",
93 |                 "subtitle",
94 |                 "preview",
95 |                 "url",
96 |                 "author",
97 |                 "tags"
98 |             ]
99 |         })
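# (Illustrative note, not in the original script: once an index exists and is
# configured, a search with this 0.x meilisearch client would look like
#     client.get_index('fr_page').search('SCP-173')['hits']
# where the 'fr_page' uid is hypothetical and mirrors the f'{locale}_{item}'
# naming used below.)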
100 |
101 | def title_extraction(item: dict):
102 |     title = item.get('title')
103 |     match = TITLE_REGEX.match(title)
104 |
105 |     if match:
106 |         item['title:code'] = match.group(1)
107 |
108 |     return item
109 |
110 |
111 | if args.action == "upload":
112 |     session = queries.Session(URI)
113 |
114 |     for item in args.item:
115 |         print(f'> Initialization of the upload for `{item}`', end="\n\n")
116 |
117 |         for locale in args.locale:
118 |             print(f'[{item}] > Getting index for `{locale}`...', end="\n\n")
119 |
120 |             name = f'{locale}_{item}'
121 |             index = client.get_index(name)
122 |
123 |             try:
124 |                 index.info()
125 |             except Exception:
126 |                 print(f'[{name}] > `{name}` does not seem to exist. Please wait...')
127 |
128 |                 client.create_index(name, {'primaryKey': 'id'})
129 |                 sleep(5)
130 |
131 |                 configure_index(index)
132 |
133 |                 print(f'[{name}] > `{name}` created. Resuming procedure...')
134 |
135 |             print(f'[{name}] > Fetching raw data from database...')
136 |
137 |             result = session.query(SELECT_PAGE).as_dict()['data']
138 |
139 |             print(f'[{name}] > Inserting data into Meilisearch...')
140 |
141 |             result = list(map(title_extraction, result))
142 |             update = index.add_documents(result)['updateId']
143 |
144 |             print(f'[{name}] > Update in progress ({name}/{update})', end="\n\n")
145 |
146 |     session.close()
147 |
--------------------------------------------------------------------------------
/sherlock/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/sherlock/__init__.py
--------------------------------------------------------------------------------
/sherlock/items.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 |
3 | from typing import List
4 |
5 |
6 | @dataclass
7 | class MemberItem:
8 |     branch_id: str = field(default=None)
9 |     user_id: str = field(default=None)
10 |     slug: str = field(default=None)
11 |     username: str = field(default=None)
12 |     member_since: str = field(default=None)
13 |
14 |
15 | @dataclass
16 | class PageItem:
17 |     page_id: str = field(default=None)
18 |     branch_id: str = field(default=None)
19 |     title: str = field(default=None)
20 |     preview: str = field(default=None)
21 |     slug: str = field(default=None)
22 |     tags: List[str] = field(default_factory=list)
23 |     created_by: str = field(default=None)
24 |     created_at: str = field(default=None)
25 |     updated_at: str = field(default=None)
26 |
27 |
28 | @dataclass
29 | class TitleItem:
30 |     subtitle: str = field(default=None)
31 |     slug: str = field(default=None)
32 |     branch_id: str = field(default=None)
33 |
34 |
35 | @dataclass
36 | class VoteItem:
37 |     user_id: str = field(default=None)
38 |     page_id: str = field(default=None)
39 |     vote: int = field(default=None)
40 |
--------------------------------------------------------------------------------
/sherlock/loaders.py:
--------------------------------------------------------------------------------
1 | from itemloaders.processors import TakeFirst, MapCompose, Join, Identity
2 | from scrapy.loader import ItemLoader
3 | from sherlock.utils import regex, wikidot
4 |
5 |
6 | class MemberLoader(ItemLoader):
7 |     default_output_processor = TakeFirst()
8 |
9 |     member_since_in = MapCompose(wikidot.time_to_iso)
10 |
11 |
12 | class PageLoader(ItemLoader):
13 |     default_output_processor = TakeFirst()
14 |
15 |     title_in = MapCompose(str.strip)
16 |     preview_in = MapCompose(str.strip)
17 |     tags_out = Identity()
18 |     created_at_in = MapCompose(wikidot.time_to_iso)
19 |     updated_at_in = MapCompose(wikidot.time_to_iso)
20 |
21 |
22 | class TitleLoader(ItemLoader):
23 |     default_output_processor = TakeFirst()
24 |
25 |     subtitle_in = MapCompose(str.strip)
26 |     slug_in = MapCompose(lambda slug: slug[1:])
27 |
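A standalone sketch of the loader semantics used above: MapCompose runs every input value through the given functions, and TakeFirst keeps the first non-null value when the item is loaded (the demo class and values are illustrative, not part of the project):

```python
from itemloaders import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst


class DemoLoader(ItemLoader):
    default_output_processor = TakeFirst()
    title_in = MapCompose(str.strip)  # input processor: strip each value


loader = DemoLoader(item={})
loader.add_value('title', ['  SCP-173  ', 'ignored'])
assert loader.load_item()['title'] == 'SCP-173'
```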
--------------------------------------------------------------------------------
/sherlock/middlewares.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | from scrapy import exceptions
5 |
6 |
7 | forbidden = re.compile(
8 |     r'\/(_|main|_template|nav:|admin:|system:|css:|forum:?|search:|test:|fragment:)')
9 |
10 |
11 | class SherlockDownloaderMiddleware(object):
12 |     def process_request(self, request, spider):
13 |         """Prohibit incompatible URLs"""
14 |         url = request.url
15 |
16 |         if forbidden.search(url):
17 |             raise exceptions.IgnoreRequest('{} is blacklisted'.format(url))
18 |
19 |     def process_response(self, request, response, spider):
20 |         """Unwrap Wikidot's AJAX response"""
21 |
22 |         # Wikidot famous
23 |         # ⊂_ヽ
24 |         #   \\ HTML in
25 |         #    \( ͡° ͜ʖ ͡°)
26 |         #     > ヽ
27 |         #    /   へ\
28 |         #    /  / \\JSON
29 |         #    レ ノ   ヽ_つ
30 |         #   / /
31 |         #   / /|
32 |         #  ( (ヽ
33 |         #  | |、\
34 |         #  | 丿 \ ⌒)
35 |         #  | |  ) /
36 |         # ノ )  Lノ
37 |         # (_/
38 |
39 |         url = response.url
40 |
41 |         # if the response is coming from the Wikidot API
42 |         if url.endswith('ajax-module-connector.php'):
43 |             try:
44 |                 data = json.loads(response.text)
45 |             except json.decoder.JSONDecodeError:
46 |                 # a body that is not JSON cannot be unwrapped; log and drop it
47 |                 spider.logger.warning('non-JSON response from %s', url)
48 |                 raise exceptions.IgnoreRequest('invalid JSON from {}'.format(url))
49 |
50 |             if data['status'] != 'ok':
51 |                 # TODO: add metadata logging
52 |                 raise exceptions.IgnoreRequest(
53 |                     'status != ok for {}'.format(url))
54 |
55 |             response = response.replace(body=data['body'])
56 |
57 |         return response
58 |
--------------------------------------------------------------------------------
/sherlock/pipelines.py:
--------------------------------------------------------------------------------
1 | from scrapy import exceptions
2 |
3 | from sherlock import items
4 | from sherlock.utils import database
5 | from dataclasses import asdict
6 |
7 | from psycopg2 import extras
8 |
9 |
10 | class SherlockCheckPipeline:
11 |     def process_item(self, item, spider):
12 |         if isinstance(item, items.PageItem):
13 |             if "admin" in item.tags:
14 |                 raise exceptions.DropItem("`admin` tag found")
15 |
16 |         return item
17 |
18 |
19 | INSERT_PASS = """
20 | INSERT INTO public.pass (branch_id, subject) VALUES (%(branch_id)s, %(subject)s)
21 | RETURNING id;
22 | """
23 |
24 |
25 | UPDATE_PASS = """
26 | UPDATE public.pass SET
27 |     ended_at = NOW(),
28 |     pending = %(pending)s,
29 |     successful = %(successful)s
30 | WHERE id = %(id)s;
31 | """
32 |
33 |
34 | class SherlockStoragePipeline:
35 |     id = None
36 |     """Id of the crawl pass currently being recorded"""
37 |
38 |     session = None
39 |
40 |     def __init__(self):
41 |         self.session = database.get_session()
42 |
43 |     def open_spider(self, spider):
44 |         cursor = self.session.cursor
45 |
46 |         cursor.execute(INSERT_PASS, {
47 |             'branch_id': spider.info['branch_id'],
48 |             'subject': spider.name
49 |         })
50 |
51 |         # attach the current pass id to this pipeline
52 |         self.id = cursor.fetchone()['id']
53 |
54 |     def close_spider(self, spider):
55 |         self.session.cursor.execute(UPDATE_PASS, {
56 |             'id': self.id,
57 |             'pending': False,
58 |             'successful': True
59 |         })
60 |
61 |         self.session.close()
62 |
63 |     def process_item(self, item, spider):
64 |         if isinstance(item, items.MemberItem):
65 |             # maybe batch users if possible
66 |             self.session.callproc(
67 |                 "add_member", {**asdict(item), 'pass_id': self.id})
68 |
69 |         if isinstance(item,
items.PageItem): 70 | self.session.callproc( 71 | "add_page", {**asdict(item), 'pass_id': self.id}) 72 | 73 | if isinstance(item, items.TitleItem): 74 | self.session.callproc("add_title", asdict(item)) 75 | 76 | return item 77 | -------------------------------------------------------------------------------- /sherlock/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for sherlock project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'sherlock' 11 | 12 | SPIDER_MODULES = ['sherlock.spiders'] 13 | NEWSPIDER_MODULE = 'sherlock.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = 'sherlock (+http://www.yourdomain.com)' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = True 21 | 22 | FEED_EXPORT_ENCODING = 'utf-8' 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | # DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | # } 46 | 47 | # Enable or disable spider middlewares 48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 49 | # SPIDER_MIDDLEWARES = { 50 | # 'sherlock.middlewares.SherlockSpiderMiddleware': 543, 51 | # } 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'sherlock.middlewares.SherlockDownloaderMiddleware': 543, 57 | } 58 | 59 | # Enable or disable extensions 60 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 61 | # EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | # } 64 | 65 | # Configure item pipelines 66 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'sherlock.pipelines.SherlockCheckPipeline': 300, 69 | # 'sherlock.pipelines.SherlockStoragePipeline': 400, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
--------------------------------------------------------------------------------
/sherlock/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/sherlock/spiders/members.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 | from sherlock import items, loaders
5 | from sherlock.utils import Config, regex, wikidot
6 |
7 |
8 | class MembersSpider(scrapy.Spider):
9 |     name = 'members'
10 |     allowed_domains = ['wikidot.com']
11 |
12 |     def __init__(self, site=None, *args, **kwargs):
13 |         super(MembersSpider, self).__init__(*args, **kwargs)
14 |
15 |         self.info = Config.get_config(site)
16 |
17 |         self.api = wikidot.path(site, 'ajax-module-connector.php')
18 |
19 |     def start_requests(self):
20 |         data, cookie = wikidot.request(
21 |             'membership/MembersListModule', per_page=1000000)
22 |         yield scrapy.FormRequest(self.api,
23 |                                  cookies=cookie,
24 |                                  formdata=data,
25 |                                  callback=self.analyze_members_list)
26 |
27 |     def analyze_members_list(self, response):
28 |         # the pager gives the total number of result pages; a single-page
29 |         # member list has no pager at all, hence the fallback to 1
30 |         total = response.css('.pager .target:nth-last-child(2) a::text').get()
31 |         total = int(total) if total else 1
32 |
33 |         for page in range(0, total):
34 |             data, cookie = wikidot.request(
35 |                 'membership/MembersListModule',
36 |                 page=page + 1,
37 |                 per_page=1000000
38 |             )
39 |
40 |             yield scrapy.FormRequest(self.api, cookies=cookie, formdata=data)
41 |
42 |     def parse(self, response):
43 |         for row in response.xpath('//div/table/tr'):
44 |
45 |             user = row.xpath('./td[1]/span/a[1]')
46 |             item = loaders.MemberLoader(items.MemberItem(), selector=user)
47 |
48 |             item.add_value('branch_id', self.info['branch_id'])
49 |             item.add_xpath('user_id', '@onclick', re=regex['user_id'])
50 |             item.add_xpath('slug', '@href', re=regex['user_slug'])
51 |             item.add_xpath('username', './img/@alt')
52 |
53 |             since = row.xpath('./td[2]/span/@class').get()
54 |             item.add_value('member_since', since, re=regex['timestamp'])
55 |
56 |             yield item.load_item()
--------------------------------------------------------------------------------
/sherlock/spiders/pages.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 |
4 | import scrapy
5 | from twisted.internet.defer import inlineCallbacks
6 |
7 | from sherlock import items, loaders
8 | from sherlock.utils import Config, regex, wikidot
9 |
10 |
11 | class PagesSpider(scrapy.spiders.SitemapSpider):
12 |     name = 'pages'
13 |     sitemap_follow = [r'sitemap_page']
14 |
15 |     def __init__(self, site=None, *args, **kwargs):
16 |         super(PagesSpider, self).__init__(*args, **kwargs)
17 |
18 |         self.info = Config.get_config(site)
19 |
20 |         self.api = wikidot.path(site, 'ajax-module-connector.php')
21 |         self.sitemap_urls =
--------------------------------------------------------------------------------
/sherlock/spiders/pages.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from twisted.internet.defer import inlineCallbacks
4 | 
5 | from sherlock import items, loaders
6 | from sherlock.utils import Config, regex, wikidot
7 | 
8 | 
9 | class PagesSpider(scrapy.spiders.SitemapSpider):
10 |     name = 'pages'
11 |     sitemap_follow = [r'sitemap_page']
12 | 
13 |     def __init__(self, site=None, *args, **kwargs):
14 |         super(PagesSpider, self).__init__(*args, **kwargs)
15 | 
16 |         self.info = Config.get_config(site)
17 | 
18 |         self.api = wikidot.path(site, 'ajax-module-connector.php')
19 |         self.sitemap_urls = [wikidot.path(site, 'sitemap.xml')]
20 | 
21 |     def request(self, *args, **kwargs):
22 |         # hand a request straight to the engine and get back a Deferred,
23 |         # so parse() can wait for the response inline
24 |         request = scrapy.FormRequest(*args, **kwargs)
25 |         return self.crawler.engine.download(request, self)
26 | 
27 |     @inlineCallbacks
28 |     def parse(self, response):
29 |         item = loaders.PageLoader(items.PageItem(), response)
30 | 
31 |         item.add_value('branch_id', self.info['branch_id'])
32 |         item.add_css('title', 'div#page-title::text')
33 |         item.add_css('tags', 'div.page-tags a::text')
34 | 
35 |         item.add_value('preview', wikidot.get_preview(
36 |             response, language=self.info['language']))
37 | 
38 |         script = response.xpath(
39 |             '/html/head/script[contains(., "URL")]/text()').get()
40 | 
41 |         item.add_value('page_id', script, re=regex['page_id'])
42 |         item.add_value('branch_id', script, re=regex['branch_id'])
43 |         item.add_value('slug', script, re=regex['page_slug'])
44 | 
45 |         item = item.load_item()
46 | 
47 |         # revision metadata is loaded on demand via an XHR request,
48 |         # which we simulate here with a second, deferred download
49 |         data, cookie = wikidot.request(
50 |             'history/PageRevisionListModule',
51 |             page_id=item.page_id,
52 |             perpage=99999
53 |         )
54 | 
55 |         response = yield self.request(self.api,
56 |                                       cookies=cookie,
57 |                                       formdata=data)
58 | 
59 |         item = loaders.PageLoader(item, response)
60 |         item.add_xpath('created_by', '//table/tr[last()]/td/span/a[1]/@onclick',
61 |                        re=regex['user_id'])
62 |         item.add_xpath('created_at', '//table/tr[last()]/td[6]/span/@class',
63 |                        re=regex['timestamp'])
64 |         item.add_xpath('updated_at', '//table/tr[2]/td[6]/span/@class',
65 |                        re=regex['timestamp'])
66 | 
67 |         return item.load_item()
68 | 
--------------------------------------------------------------------------------
/sherlock/spiders/titles.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | 
3 | from sherlock import items, loaders
4 | from sherlock.utils import Config, wikidot, regex
5 | 
6 | 
7 | class TitlesSpider(scrapy.Spider):
8 |     name = 'titles'
9 |     allowed_domains = ['wikidot.com']
10 | 
11 |     def __init__(self, site=None, *args, **kwargs):
12 |         super(TitlesSpider, self).__init__(*args, **kwargs)
13 | 
14 |         self.info = Config.get_config(site)
15 | 
16 |         index = Config.get(site, 'index')
17 |         self.start_urls = [wikidot.path(site, slug) for slug in index]
18 | 
19 |     def parse(self, response):
20 |         for title in response.css('.content-panel ul a:not(.newpage)'):
21 |             item = loaders.TitleLoader(items.TitleItem(), selector=title)
22 | 
23 |             item.add_value('branch_id', self.info['branch_id'])
24 |             item.add_xpath('subtitle', 'string(./ancestor::li)',
25 |                            re=regex['scp_subtitle'])
26 |             item.add_css('slug', '::attr(href)')
27 | 
28 |             yield item.load_item()
29 | 
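Each spider resolves its `site` argument through `Config` (defined below in `sherlock/utils/config.py`), which reads `data.json`. Judging from the keys the code accesses — `id`, `language`, and the `index` list of hub slugs that `TitlesSpider` crawls — the file plausibly has this shape (keys inferred from the code; the site name and values are invented for illustration):

{
    "scp-wiki": {
        "id": 1,
        "language": "en",
        "index": ["scp-series", "scp-series-2"]
    }
}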
%s", (self.info['branch_id'],)): 23 | id = row['id'] 24 | data, cookie = wikidot.request("pagerate/WhoRatedPageModule", 25 | pageId=id) 26 | yield scrapy.FormRequest(self.api, cookies=cookie, formdata=data, meta={'page_id': id}) 27 | 28 | def parse(self, response): 29 | for block in response.css('span.printuser:not(.deleted)'): 30 | vote = block.css('* + span::text').get().strip() 31 | user = block.css( 32 | 'a:first-child::attr(onclick)').re_first(regex['user_id']) 33 | 34 | yield VoteItem(user_id=user, page_id=response.meta['page_id'], vote=vote) 35 | -------------------------------------------------------------------------------- /sherlock/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import Config 2 | from .regex import regex 3 | -------------------------------------------------------------------------------- /sherlock/utils/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class Config: 5 | """Branches configuration module""" 6 | 7 | is_loaded = False 8 | content: dict = None 9 | path = './data.json' 10 | wiki = [] 11 | 12 | @classmethod 13 | def load(self): 14 | """load config file""" 15 | 16 | with open(self.path) as file: 17 | self.content = json.load(file) 18 | self.wiki = self.content.keys() 19 | 20 | self.is_loaded = True 21 | 22 | @classmethod 23 | def get(self, section: str, name: str): 24 | """get specific section & attribute of the configuration""" 25 | 26 | Config.check(section) 27 | 28 | # pylint: disable=unsubscriptable-object 29 | return self.content[section][name] 30 | 31 | @classmethod 32 | def check(self, wiki: str): 33 | """"check if the current wiki is supported by the configuration""" 34 | 35 | if not self.is_loaded: 36 | self.load() 37 | 38 | if wiki is None: 39 | raise AssertionError("You must provide a `site` to crawl") 40 | 41 | if wiki in self.wiki: 42 | return 43 | 44 | raise NotImplementedError( 45 | f'"{wiki}" is not in the config file ({self.path})') 46 | 47 | @classmethod 48 | def get_config(self, section: str): 49 | Config.check(section) 50 | 51 | # pylint: disable=unsubscriptable-object 52 | section = self.content[section] 53 | 54 | return {"branch_id": section['id'], "language": section['language']} 55 | -------------------------------------------------------------------------------- /sherlock/utils/database.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | 3 | import queries 4 | from dotenv import load_dotenv 5 | 6 | 7 | load_dotenv() 8 | 9 | URI = queries.uri( 10 | host=environ['PG_HOST'], 11 | port=environ['PG_PORT'], 12 | dbname=environ['PG_DBNAME'], 13 | user=environ['PG_USER'], 14 | password=environ['PG_PASSWORD'] 15 | ) 16 | 17 | 18 | def get_session(): 19 | return queries.Session(URI) 20 | -------------------------------------------------------------------------------- /sherlock/utils/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | regex = { 4 | 'page_id': re.compile(r".pageId = (?P