├── .env.example ├── .gitignore ├── README.md ├── data.json ├── data └── nltk_data │ └── tokenizers │ └── punkt │ ├── PY3 │ ├── README │ ├── czech.pickle │ ├── danish.pickle │ ├── dutch.pickle │ ├── english.pickle │ ├── estonian.pickle │ ├── finnish.pickle │ ├── french.pickle │ ├── german.pickle │ ├── greek.pickle │ ├── italian.pickle │ ├── norwegian.pickle │ ├── polish.pickle │ ├── portuguese.pickle │ ├── russian.pickle │ ├── slovene.pickle │ ├── spanish.pickle │ ├── swedish.pickle │ └── turkish.pickle │ ├── README │ ├── czech.pickle │ ├── danish.pickle │ ├── dutch.pickle │ ├── english.pickle │ ├── estonian.pickle │ ├── finnish.pickle │ ├── french.pickle │ ├── german.pickle │ ├── greek.pickle │ ├── italian.pickle │ ├── norwegian.pickle │ ├── polish.pickle │ ├── portuguese.pickle │ ├── russian.pickle │ ├── slovene.pickle │ ├── spanish.pickle │ ├── swedish.pickle │ └── turkish.pickle ├── media └── search.png ├── requirements.txt ├── schema.sql ├── scrapy.cfg ├── script └── meili.py └── sherlock ├── __init__.py ├── items.py ├── loaders.py ├── middlewares.py ├── pipelines.py ├── settings.py ├── spiders ├── __init__.py ├── members.py ├── pages.py ├── titles.py └── votes.py └── utils ├── __init__.py ├── config.py ├── database.py ├── regex.py └── wikidot.py /.env.example: -------------------------------------------------------------------------------- 1 | PG_DBNAME= 2 | PG_USER= 3 | PG_PASSWORD= 4 | PG_HOST= 5 | PG_PORT= 6 | 7 | MEILISEARCH_HOST= 8 | MEILISEARCH_PORT= 9 | MEILISEARCH_KEY= -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .vscode 3 | output.json 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ 143 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |
3 | # 🎩 Sherlock
4 |
5 | A web crawler to retrieve Wikidot information.
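
Usage sketch (not from the original README; the commands are inferred from `scrapy.cfg`, `data.json` and `script/meili.py`): run a spider from the repository root with `scrapy crawl pages -a site=scp-wiki`, where `site` is any key of `data.json` and the other spiders are `members`, `titles` and `votes`; the stored rows can then be pushed to Meilisearch with `python script/meili.py upload --item member --locale fr en` (the `action`, `--item` and `--locale` choices come from the argparse definitions in `script/meili.py`).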
142 | -------------------------------------------------------------------------------- /data.json: -------------------------------------------------------------------------------- 1 | { 2 | "scp-wiki": { 3 | "id": 66711, 4 | "index": [ 5 | "scp-series", 6 | "scp-series-2", 7 | "scp-series-3", 8 | "scp-series-4", 9 | "scp-series-5", 10 | "scp-series-6", 11 | "joke-scps", 12 | "scp-ex", 13 | "archived-scps" 14 | ], 15 | "language": "english" 16 | }, 17 | "scpko": { 18 | "id": 486864, 19 | "index": [], 20 | "language": null 21 | }, 22 | "scp-wiki-cn": { 23 | "id": 530812, 24 | "index": [ 25 | "scp-series", 26 | "scp-series-2", 27 | "scp-series-3", 28 | "scp-series-4", 29 | "scp-series-5", 30 | "scp-series-6", 31 | "scp-international", 32 | "scp-series-cn", 33 | "scp-series-cn-2", 34 | "joke-scps", 35 | "joke-scps-cn", 36 | "scp-ex", 37 | "scp-ex-cn", 38 | "archived-scps" 39 | ], 40 | "language": null 41 | }, 42 | "fondationscp": { 43 | "id": 464696, 44 | "index": [ 45 | "scp-series", 46 | "scp-series-2", 47 | "scp-series-3", 48 | "scp-series-4", 49 | "scp-series-5", 50 | "scp-series-6", 51 | "scp-series-hub", 52 | "liste-francaise", 53 | "scps-humoristiques-francais", 54 | "joke-scps", 55 | "scp-ex" 56 | ], 57 | "language": "french" 58 | }, 59 | "scp-pl": { 60 | "id": 647733, 61 | "index": [], 62 | "language": "polish" 63 | }, 64 | "lafundacionscp": { 65 | "id": 560484, 66 | "index": [ 67 | "scp-series", 68 | "scp-series-2", 69 | "scp-series-3", 70 | "scp-series-4", 71 | "scp-series-5", 72 | "scp-series-6", 73 | "scp-series-hub", 74 | "serie-scp-es", 75 | "serie-scp-es-2", 76 | "scps-humoristicos", 77 | "scp-ex" 78 | ], 79 | "language": "spanish" 80 | }, 81 | "scp-th": { 82 | "id": 547203, 83 | "index": [], 84 | "language": null 85 | }, 86 | "scp-jp": { 87 | "id": 578002, 88 | "index": [], 89 | "language": null 90 | }, 91 | "scp-wiki-de": { 92 | "id": 1269857, 93 | "index": [], 94 | "language": "german" 95 | }, 96 | "fondazionescp": { 97 | "id": 530167, 98 | "index": [], 99 | "language": "italian" 100 | }, 101 | "scp-ukrainian": { 102 | "id": 1398197, 103 | "index": [], 104 | "language": null 105 | }, 106 | "scp-pt-br": { 107 | "id": 783633, 108 | "index": [], 109 | "language": "portuguese" 110 | }, 111 | "scp-int": { 112 | "id": 1427610, 113 | "index": [], 114 | "language": "english" 115 | }, 116 | "scp-ru": { 117 | "id": 169125, 118 | "index": [], 119 | "language": "russian" 120 | }, 121 | "scp-zh-tr": { 122 | "id": 3947998, 123 | "index": [ 124 | "scp-series", 125 | "scp-series-2", 126 | "scp-series-3", 127 | "scp-series-4", 128 | "scp-series-5", 129 | "scp-series-6", 130 | "scp-int-translation-hub", 131 | "scp-series-zh", 132 | "joke-scps", 133 | "scp-ex", 134 | "archived-scps" 135 | ], 136 | "language": null 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/README: -------------------------------------------------------------------------------- 1 | Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected) 2 | 3 | Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have 4 | been contributed by various people using NLTK for sentence boundary detection. 
5 | 6 | For information about how to use these models, please confer the tokenization HOWTO: 7 | http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html 8 | and chapter 3.8 of the NLTK book: 9 | http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation 10 | 11 | There are pretrained tokenizers for the following languages: 12 | 13 | File Language Source Contents Size of training corpus(in tokens) Model contributed by 14 | ======================================================================================================================================================================= 15 | czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss 16 | Literarni Noviny 17 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 18 | danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss 19 | (Berlingske Avisdata, Copenhagen) Weekend Avisen 20 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 21 | dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss 22 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 23 | english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss 24 | (American) 25 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 26 | estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss 27 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 28 | finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss 29 | Text Bank (Suomen Kielen newspapers 30 | Tekstipankki) 31 | Finnish Center for IT Science 32 | (CSC) 33 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 34 | french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss 35 | (European) 36 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 37 | german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss 38 | (Switzerland) CD-ROM 39 | (Uses "ss" 40 | instead of "ß") 41 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 42 | greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss 43 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 44 | italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss 45 | 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 46 | norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss 47 | (Bokmål and Information Technologies, 48 | Nynorsk) Bergen 49 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 50 | polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner 51 | (http://www.nkjp.pl/) 52 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 53 | portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss 54 | (Brazilian) (Linguateca) 55 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 56 | slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss 57 | Slovene Academy for Arts 58 | and Sciences 59 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 60 | spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss 61 | (European) 62 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 63 | swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss 64 | (and some other texts) 65 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 66 | turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss 67 | (Türkçe Derlem Projesi) 68 | University of Ankara 69 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 70 | 71 | The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to 72 | Unicode using the codecs module. 73 | 74 | Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection. 75 | Computational Linguistics 32: 485-525. 
76 | 77 | ---- Training Code ---- 78 | 79 | # import punkt 80 | import nltk.tokenize.punkt 81 | 82 | # Make a new Tokenizer 83 | tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer() 84 | 85 | # Read in training corpus (one example: Slovene) 86 | import codecs 87 | text = codecs.open("slovene.plain","Ur","iso-8859-2").read() 88 | 89 | # Train tokenizer 90 | tokenizer.train(text) 91 | 92 | # Dump pickled tokenizer 93 | import pickle 94 | out = open("slovene.pickle","wb") 95 | pickle.dump(tokenizer, out) 96 | out.close() 97 | 98 | --------- 99 | -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/czech.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/czech.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/danish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/danish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/dutch.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/dutch.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/english.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/english.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/estonian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/estonian.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/finnish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/finnish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/french.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/french.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/german.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/german.pickle 
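The training recipe in the Punkt README above is Python 2 era: the "Ur" mode of codecs.open() no longer exists in Python 3. A minimal Python 3 equivalent of the same steps ("slovene.plain" remains the stand-in corpus path from the recipe):

```python
import pickle
import nltk.tokenize.punkt

# build and train a sentence tokenizer on a raw-text corpus
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
with open("slovene.plain", encoding="iso-8859-2") as corpus:
    tokenizer.train(corpus.read())

# persist it in the same pickled form as the files in this directory
with open("slovene.pickle", "wb") as out:
    pickle.dump(tokenizer, out)
```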
-------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/greek.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/greek.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/italian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/italian.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/norwegian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/norwegian.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/polish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/polish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/portuguese.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/portuguese.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/russian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/russian.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/slovene.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/slovene.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/spanish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/spanish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/swedish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/swedish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/PY3/turkish.pickle: 
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/PY3/turkish.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/README:
--------------------------------------------------------------------------------

(Verbatim copy of /data/nltk_data/tokenizers/punkt/PY3/README above: the same pretrained Punkt model list, corpus table and training recipe is vendored in both directories.)

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/czech.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/czech.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/danish.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/danish.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/dutch.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/dutch.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/estonian.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/estonian.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/finnish.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/finnish.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/french.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/french.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/german.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/german.pickle

--------------------------------------------------------------------------------
/data/nltk_data/tokenizers/punkt/italian.pickle:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/italian.pickle
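The pickles vendored under data/nltk_data can be loaded without a global NLTK download by pointing NLTK's search path at the repository copy (a sketch; it assumes the process runs from the repository root):

```python
import nltk.data

# make NLTK resolve "tokenizers/punkt/..." against the vendored directory
nltk.data.path.insert(0, "data/nltk_data")

tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
print(tokenizer.tokenize("First sentence. Second sentence."))
```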
-------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/norwegian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/norwegian.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/polish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/polish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/portuguese.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/portuguese.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/russian.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/russian.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/slovene.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/slovene.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/spanish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/spanish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/swedish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/swedish.pickle -------------------------------------------------------------------------------- /data/nltk_data/tokenizers/punkt/turkish.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/data/nltk_data/tokenizers/punkt/turkish.pickle -------------------------------------------------------------------------------- /media/search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/media/search.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | astroid==2.4.2 2 | attrs==20.1.0 3 | Automat==20.2.0 4 | autopep8==1.5.4 5 | cffi==1.14.2 6 | click==7.1.2 7 | 
constantly==15.1.0
8 | cryptography==3.3.2
9 | cssselect==1.1.0
10 | hyperlink==20.0.1
11 | idna==2.10
12 | incremental==17.5.0
13 | isort==5.5.1
14 | itemadapter==0.1.0
15 | itemloaders==1.0.2
16 | jmespath==0.10.0
17 | joblib==0.16.0
18 | lazy-object-proxy==1.4.3
19 | lxml==4.6.3
20 | mccabe==0.6.1
21 | nltk==3.5
22 | parsel==1.6.0
23 | Protego==0.1.16
24 | psycopg2==2.8.5
25 | pyasn1==0.4.8
26 | pyasn1-modules==0.2.8
27 | pycodestyle==2.6.0
28 | pycparser==2.20
29 | PyDispatcher==2.0.5
30 | PyHamcrest==2.0.2
31 | pylint==2.6.0
32 | pyOpenSSL==19.1.0
33 | python-dotenv==0.14.0
34 | queries==2.1.0
35 | queuelib==1.5.0
36 | regex==2020.7.14
37 | Scrapy==2.3.0
38 | service-identity==18.1.0
39 | six==1.15.0
40 | toml==0.10.1
41 | tqdm==4.48.2
42 | Twisted==20.3.0
43 | w3lib==1.22.0
44 | wrapt==1.12.1
45 | zope.interface==5.1.0
46 |
--------------------------------------------------------------------------------
/schema.sql:
--------------------------------------------------------------------------------
1 | --- Author : Corentin POUPRY (HelloEdit)
2 |
3 |
4 | CREATE TABLE public.branch (
5 |     id INTEGER GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
6 |     name VARCHAR(25) NOT NULL,
7 |     locale VARCHAR(5) UNIQUE NOT NULL,
8 |     url VARCHAR(50) UNIQUE NOT NULL,
9 |     created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL
10 | );
11 |
12 | INSERT INTO public.branch (id, name, locale, url, created_at) VALUES
13 |     (66711, 'SCP Foundation', 'en', 'http://www.scp-wiki.net/', '2008-07-24 11:00:22+00:00'),
14 |     (486864, 'SCP 재단', 'ko', 'http://ko.scp-wiki.net/', '2012-08-10 10:01:31+00:00'),
15 |     (530812, 'SCP基金会', 'cn', 'http://scp-wiki-cn.wikidot.com/', '2013-01-30 17:36:27+00:00'),
16 |     (464696, 'Fondation SCP', 'fr', 'http://fondationscp.wikidot.com/', '2012-03-21 14:35:40+00:00'),
17 |     (647733, 'Fundacja SCP', 'pl', 'http://scp-wiki.net.pl/', '2014-04-25 12:14:57+00:00'),
18 |     (560484, 'La Fundación SCP', 'es', 'http://lafundacionscp.wikidot.com/', '2013-05-05 14:43:20+00:00'),
19 |     (547203, 'สถาบัน SCP', 'th', 'http://scp-th.wikidot.com/', '2013-04-05 09:04:34+00:00'),
20 |     (578002, 'SCP財団', 'jp', 'http://scp-jp.wikidot.com/', '2013-07-08 11:09:46+00:00'),
21 |     (1269857, 'SCP auf Deutsch', 'de', 'http://scp-wiki-de.wikidot.com/', '2016-04-05 21:04:44+00:00'),
22 |     (530167, 'Fondazione SCP', 'it', 'http://fondazionescp.wikidot.com/', '2013-01-26 15:51:12+00:00'),
23 |     (1398197, 'Фонд SCP', 'ua', 'http://scp-ukrainian.wikidot.com/', '2016-11-10 08:10:32+00:00'),
24 |     (783633, 'Fundação SCP', 'pt-br', 'http://scp-pt-br.wikidot.com/', '2015-08-24 13:40:14+00:00'),
25 |     (1427610, 'SCP International', 'int', 'http://scp-int.wikidot.com/', '2017-01-30 07:08:17+00:00'),
26 |     (169125, 'Фонд SCP', 'ru', 'http://scp-ru.wikidot.com/', '2010-06-27 17:11:41+00:00');
27 |
28 | CREATE TABLE public.pass (
29 |     id INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
30 |     branch_id INTEGER NOT NULL REFERENCES public.branch(id) ON DELETE CASCADE ON UPDATE CASCADE,
31 |     subject VARCHAR(25) NOT NULL,
32 |     started_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
33 |     ended_at TIMESTAMP WITH TIME ZONE,
34 |     pending BOOLEAN NOT NULL DEFAULT TRUE,
35 |     successful BOOLEAN NOT NULL DEFAULT FALSE
36 | );
37 |
38 | -- "user" is quoted because USER is a reserved keyword in PostgreSQL
39 | CREATE TABLE public."user" (
40 |     id INTEGER GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
41 |     username VARCHAR(99) UNIQUE NOT NULL,
42 |     slug VARCHAR(99) UNIQUE NOT NULL
43 | );
44 |
45 | CREATE TABLE public.membership (
46 |     id INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
47 |     branch_id INTEGER NOT NULL
REFERENCES public.branch(id) ON DELETE RESTRICT ON UPDATE CASCADE,
48 |     user_id INTEGER NOT NULL REFERENCES public."user"(id) ON DELETE RESTRICT ON UPDATE CASCADE,
49 |     member_since TIMESTAMP WITH TIME ZONE NOT NULL,
50 |     pass_id INTEGER NOT NULL REFERENCES public.pass(id) ON DELETE RESTRICT ON UPDATE CASCADE,
51 |     UNIQUE(user_id, branch_id)
52 | );
53 |
54 | CREATE TABLE public.page (
55 |     id INTEGER GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
56 |     title VARCHAR(256) NOT NULL,
57 |     subtitle VARCHAR(256),
58 |     preview VARCHAR(500),
59 |     branch_id INTEGER NOT NULL REFERENCES public.branch(id) ON DELETE RESTRICT ON UPDATE CASCADE,
60 |     slug VARCHAR(256) NOT NULL,
61 |     tags VARCHAR(50)[] NOT NULL DEFAULT '{}',
62 |     created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL,
63 |     updated_at TIMESTAMP WITH TIME ZONE DEFAULT NULL,
64 |     created_by INTEGER REFERENCES public."user"(id) ON DELETE SET NULL ON UPDATE CASCADE,
65 |     pass_id INTEGER NOT NULL REFERENCES public.pass(id) ON DELETE RESTRICT ON UPDATE CASCADE,
66 |     UNIQUE(branch_id, slug)
67 | );
68 |
69 | CREATE TABLE public.vote (
70 |     id INTEGER GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
71 |     user_id INTEGER NOT NULL REFERENCES public."user"(id) ON DELETE CASCADE ON UPDATE CASCADE,
72 |     page_id INTEGER NOT NULL REFERENCES public.page(id) ON DELETE CASCADE ON UPDATE CASCADE,
73 |     vote SMALLINT NOT NULL,
74 |     created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL,
75 |     pass_id INTEGER NOT NULL REFERENCES public.pass(id) ON DELETE RESTRICT ON UPDATE CASCADE,
76 |     UNIQUE(user_id, page_id)
77 | );
78 |
79 |
80 | CREATE OR REPLACE FUNCTION add_member(branch_id INTEGER, user_id INTEGER, slug TEXT, username TEXT, member_since TIMESTAMP WITH TIME ZONE, pass_id INTEGER) RETURNS void AS $$
81 |     INSERT INTO public."user" VALUES (user_id, username, slug)
82 |     ON CONFLICT (id) DO UPDATE SET
83 |         username = EXCLUDED.username,
84 |         slug = EXCLUDED.slug;
85 |
86 |     INSERT INTO public.membership VALUES (DEFAULT, branch_id, user_id, member_since, pass_id)
87 |     ON CONFLICT (user_id, branch_id) DO UPDATE SET
88 |         pass_id = EXCLUDED.pass_id;
89 | $$ LANGUAGE sql;
90 |
91 | CREATE OR REPLACE FUNCTION add_page(page_id INTEGER, branch_id INTEGER, title TEXT, preview TEXT, slug TEXT, tags TEXT[], created_by INTEGER, created_at TIMESTAMP WITH TIME ZONE, updated_at TIMESTAMP WITH TIME ZONE, pass_id INTEGER) RETURNS void AS $$
92 |     INSERT INTO public.page VALUES (page_id, title, NULL, preview, branch_id, slug, tags, created_at, updated_at, created_by, pass_id)
93 |     ON CONFLICT (id) DO UPDATE SET
94 |         pass_id = EXCLUDED.pass_id,
95 |         preview = EXCLUDED.preview,
96 |         title = EXCLUDED.title,
97 |         slug = EXCLUDED.slug;
98 | $$ LANGUAGE sql;
99 |
100 | CREATE OR REPLACE FUNCTION add_title(subtitle TEXT, slug TEXT, branch_id INTEGER) RETURNS void AS $$
101 |     UPDATE public.page SET subtitle = $1 WHERE
102 |         slug = $2 AND
103 |         branch_id = $3;
104 | $$ LANGUAGE sql;
105 |
106 | -- Clears created_by when it points at a user that has not been crawled yet.
107 | CREATE FUNCTION public.check_public_page_user_id() RETURNS trigger AS $$
108 | BEGIN
109 |     NEW.created_by = (SELECT id FROM public."user" WHERE id = NEW.created_by);
110 |     RETURN NEW;
111 | END;
112 | $$ LANGUAGE plpgsql;
113 |
114 | CREATE TRIGGER public_page_user_id_insert
115 |     BEFORE INSERT OR UPDATE ON public.page
116 |     FOR EACH ROW
117 |     WHEN (NEW.created_by IS NOT NULL)
118 |     EXECUTE FUNCTION public.check_public_page_user_id();
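A read-side illustration of this schema (a hypothetical query, not part of schema.sql; it uses the same `queries` library as script/meili.py, and the DSN is an assumption):

```python
import queries

# rating of a page = sum of its recorded votes (vote is presumably +1/-1)
RATING_SQL = """
SELECT page.slug, SUM(vote.vote) AS rating
FROM public.page
JOIN public.vote ON vote.page_id = page.id
GROUP BY page.slug
ORDER BY rating DESC
LIMIT 10;
"""

with queries.Session("postgresql://localhost/sherlock") as session:  # assumed DSN
    for row in session.query(RATING_SQL):
        print(row["slug"], row["rating"])
```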
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = sherlock.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sherlock
12 |
--------------------------------------------------------------------------------
/script/meili.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import re
3 | from os import environ
4 | from sys import exit
5 | from time import sleep
6 |
7 | import meilisearch
8 | import queries
9 | from dotenv import load_dotenv
10 |
11 | load_dotenv()
12 |
13 | SELECT_PAGE = """
14 | SELECT ARRAY_AGG(ROW_TO_JSON(t)) as data FROM (
15 |     SELECT usr.username as author, page.id, title, subtitle, preview, tags, (branch.url || page.slug) AS url
16 |     FROM public.page
17 |     INNER JOIN public.branch ON page.branch_id = branch.id
18 |     LEFT JOIN public."user" as usr ON page.created_by = usr.id
19 | ) t;
20 | """
21 | URI = queries.uri(
22 |     host=environ['PG_HOST'],
23 |     port=environ['PG_PORT'],
24 |     dbname=environ['PG_DBNAME'],
25 |     user=environ['PG_USER'],
26 |     password=environ['PG_PASSWORD']
27 | )
28 | ITEM_CHOICES = ['member']
29 | LOCALE_CHOICES = ['fr', 'en', 'int']
30 | TITLE_REGEX = re.compile(r"SCP-(\d+)$")
31 |
32 | parser = argparse.ArgumentParser(
33 |     prog="Meilisearch util",
34 |     description="CLI to interact with the Meilisearch database",
35 |     epilog="Because everyone loves Meilisearch"
36 | )
37 |
38 | parser.add_argument("action", choices=['upload'])
39 | parser.add_argument("--item", choices=ITEM_CHOICES, nargs="+", required=True)
40 | parser.add_argument("--locale", choices=LOCALE_CHOICES,
41 |                     nargs="+", required=True)
42 |
43 | print("""
44 | ▄▄   ▄▄ ▄▄▄▄▄▄▄ ▄▄▄ ▄▄▄     ▄▄▄ ▄▄▄▄▄▄▄ ▄▄▄▄▄▄▄ ▄▄▄▄▄▄   ▄▄▄▄▄▄ ▄▄▄▄▄▄▄ ▄▄   ▄▄
45 | █  █▄█  █       █   █   █   █   █       █       █   ▄  █ █      █       █  █ █  █
46 | █       █    ▄▄▄█   █   █   █   █  ▄▄▄▄▄█    ▄▄▄█  █ █ █ █  ▄   █   ▄   █  █▄█  █
47 | █       █   █▄▄▄█   █   █   █   █ █▄▄▄▄▄█   █▄▄▄█   █▄▄█▄█ █▄█  █  █▄█  █       █
48 | █       █    ▄▄▄█   █   █▄▄▄█   █▄▄▄▄▄  █    ▄▄▄█    ▄▄  █      █   ▄   █   ▄   █
49 | █ ██▄██ █   █▄▄▄█   █       █   █▄▄▄▄▄█ █   █▄▄▄█   █  █ █  ▄   █  █ █  █  █ █  █
50 | █▄█   █▄█▄▄▄▄▄▄▄█▄▄▄█▄▄▄▄▄▄▄█▄▄▄█▄▄▄▄▄▄▄█▄▄▄▄▄▄▄█▄▄▄█  █▄█▄█ █▄▄█▄▄█ █▄▄█▄▄█ █▄▄█
51 |
52 |
53 | \tA tool for Sherlock
54 | """)
55 |
56 | args = parser.parse_args()
57 |
58 | uri = "http://{host}:{port}".format(
59 |     host=environ['MEILISEARCH_HOST'],
60 |     port=environ['MEILISEARCH_PORT']
61 | )
62 | client = meilisearch.Client(uri, environ['MEILISEARCH_KEY'])
63 |
64 | try:
65 |     info = client.get_version()
66 |     print(f'Meilisearch v{info["pkgVersion"]}', end="\n\n")
67 | except Exception:
68 |     print('[ERROR] > could not connect to the specified Meilisearch instance.')
69 |     exit(1)
70 |
71 |
72 | def configure_index(index: meilisearch.client.Index):
73 |     if index.uid.endswith("page"):
74 |         index.update_settings({
75 |             "rankingRules": [
76 |                 "typo",
77 |                 "words",
78 |                 "proximity",
79 |                 "exactness",
80 |                 "attribute",
81 |                 "wordsPosition"
82 |             ],
83 |             "searchableAttributes": [
84 |                 "title",
85 |                 "title:code",
86 |                 "subtitle",
87 |                 "preview",
88 |                 "author"
89 |             ],
90 |             "displayedAttributes": [
91 |                 "id",
92 |                 "title",
93 |                 "subtitle",
94 |                 "preview",
95 |                 "url",
96 |                 "author",
97 |                 "tags"
98 |             ]
99 |         })
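# (Illustrative note, not in the original script: once an index exists and is
# configured, a search with this 0.x meilisearch client would look like
#     client.get_index('fr_page').search('SCP-173')['hits']
# where the 'fr_page' uid is hypothetical and mirrors the f'{locale}_{item}'
# naming used below.)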
100 |
101 | def title_extraction(item: dict):
102 |     title = item.get('title')
103 |     match = TITLE_REGEX.match(title)
104 |
105 |     if match:
106 |         item['title:code'] = match.group(1)
107 |
108 |     return item
109 |
110 |
111 | if args.action == "upload":
112 |     session = queries.Session(URI)
113 |
114 |     for item in args.item:
115 |         print(f'> Initialization of the upload for `{item}`', end="\n\n")
116 |
117 |         for locale in args.locale:
118 |             print(f'[{item}] > Getting index for `{locale}`...', end="\n\n")
119 |
120 |             name = f'{locale}_{item}'
121 |             index = client.get_index(name)
122 |
123 |             try:
124 |                 index.info()
125 |             except Exception:
126 |                 print(f'[{name}] > `{name}` does not seem to exist. Please wait...')
127 |
128 |                 client.create_index(name, {'primaryKey': 'id'})
129 |                 sleep(5)
130 |
131 |                 configure_index(index)
132 |
133 |                 print(f'[{name}] > `{name}` created. Resuming procedure...')
134 |
135 |             print(f'[{name}] > Fetching raw data from database...')
136 |
137 |             result = session.query(SELECT_PAGE).as_dict()['data']
138 |
139 |             print(f'[{name}] > Inserting data into Meilisearch...')
140 |
141 |             result = list(map(title_extraction, result))
142 |             update = index.add_documents(result)['updateId']
143 |
144 |             print(f'[{name}] > Update in progress ({name}/{update})', end="\n\n")
145 |
146 |     session.close()
147 |
--------------------------------------------------------------------------------
/sherlock/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/foundation-int-tech-team/sherlock/e1d44d115cb6263c229b16ccfc66ebe846563e51/sherlock/__init__.py
--------------------------------------------------------------------------------
/sherlock/items.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 |
3 | from typing import List
4 |
5 |
6 | @dataclass
7 | class MemberItem:
8 |     branch_id: str = field(default=None)
9 |     user_id: str = field(default=None)
10 |     slug: str = field(default=None)
11 |     username: str = field(default=None)
12 |     member_since: str = field(default=None)
13 |
14 |
15 | @dataclass
16 | class PageItem:
17 |     page_id: str = field(default=None)
18 |     branch_id: str = field(default=None)
19 |     title: str = field(default=None)
20 |     preview: str = field(default=None)
21 |     slug: str = field(default=None)
22 |     tags: List[str] = field(default_factory=list)
23 |     created_by: str = field(default=None)
24 |     created_at: str = field(default=None)
25 |     updated_at: str = field(default=None)
26 |
27 |
28 | @dataclass
29 | class TitleItem:
30 |     subtitle: str = field(default=None)
31 |     slug: str = field(default=None)
32 |     branch_id: str = field(default=None)
33 |
34 |
35 | @dataclass
36 | class VoteItem:
37 |     user_id: str = field(default=None)
38 |     page_id: str = field(default=None)
39 |     vote: int = field(default=None)
40 |
--------------------------------------------------------------------------------
/sherlock/loaders.py:
--------------------------------------------------------------------------------
1 | from itemloaders.processors import TakeFirst, MapCompose, Join, Identity
2 | from scrapy.loader import ItemLoader
3 | from sherlock.utils import regex, wikidot
4 |
5 |
6 | class MemberLoader(ItemLoader):
7 |     default_output_processor = TakeFirst()
8 |
9 |     member_since_in = MapCompose(wikidot.time_to_iso)
10 |
11 |
12 | class PageLoader(ItemLoader):
13 |     default_output_processor = TakeFirst()
14 |
15 |     title_in = MapCompose(str.strip)
16 |     preview_in = MapCompose(str.strip)
17 |     tags_out = Identity()
18 |     created_at_in = MapCompose(wikidot.time_to_iso)
19 |     updated_at_in = MapCompose(wikidot.time_to_iso)
20 |
21 |
22 | class TitleLoader(ItemLoader):
23 |     default_output_processor = TakeFirst()
24 |
25 |     subtitle_in = MapCompose(str.strip)
26 |     slug_in = MapCompose(lambda slug: slug[1:])
27 |
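A standalone sketch of the loader semantics used above: MapCompose runs every input value through the given functions, and TakeFirst keeps the first non-null value when the item is loaded (the demo class and values are illustrative, not part of the project):

```python
from itemloaders import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst


class DemoLoader(ItemLoader):
    default_output_processor = TakeFirst()
    title_in = MapCompose(str.strip)  # input processor: strip each value


loader = DemoLoader(item={})
loader.add_value('title', ['  SCP-173  ', 'ignored'])
assert loader.load_item()['title'] == 'SCP-173'
```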
--------------------------------------------------------------------------------
/sherlock/middlewares.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | from scrapy import exceptions
5 |
6 |
7 | forbidden = re.compile(
8 |     r'\/(_|main|_template|nav:|admin:|system:|css:|forum:?|search:|test:|fragment:)')
9 |
10 |
11 | class SherlockDownloaderMiddleware(object):
12 |     def process_request(self, request, spider):
13 |         """Prohibit incompatible URLs"""
14 |         url = request.url
15 |
16 |         if forbidden.search(url):
17 |             raise exceptions.IgnoreRequest('{} is blacklisted'.format(url))
18 |
19 |     def process_response(self, request, response, spider):
20 |         """Unwrap Wikidot's AJAX response"""
21 |
22 |         # Wikidot famous
23 |         # ⊂_ヽ
24 |         #   \\ HTML in
25 |         #    \( ͡° ͜ʖ ͡°)
26 |         #     > ヽ
27 |         #    /   へ\
28 |         #    /  / \\JSON
29 |         #    レ ノ   ヽ_つ
30 |         #   / /
31 |         #   / /|
32 |         #  ( (ヽ
33 |         #  | |、\
34 |         #  | 丿 \ ⌒)
35 |         #  | |  ) /
36 |         # ノ )  Lノ
37 |         # (_/
38 |
39 |         url = response.url
40 |
41 |         # if the response is coming from the Wikidot API
42 |         if url.endswith('ajax-module-connector.php'):
43 |             try:
44 |                 data = json.loads(response.text)
45 |             except json.decoder.JSONDecodeError:
46 |                 # a body that is not JSON cannot be unwrapped; log and drop it
47 |                 spider.logger.warning('non-JSON response from %s', url)
48 |                 raise exceptions.IgnoreRequest('invalid JSON from {}'.format(url))
49 |
50 |             if data['status'] != 'ok':
51 |                 # TODO: add metadata logging
52 |                 raise exceptions.IgnoreRequest(
53 |                     'status != ok for {}'.format(url))
54 |
55 |             response = response.replace(body=data['body'])
56 |
57 |         return response
58 |
--------------------------------------------------------------------------------
/sherlock/pipelines.py:
--------------------------------------------------------------------------------
1 | from scrapy import exceptions
2 |
3 | from sherlock import items
4 | from sherlock.utils import database
5 | from dataclasses import asdict
6 |
7 | from psycopg2 import extras
8 |
9 |
10 | class SherlockCheckPipeline:
11 |     def process_item(self, item, spider):
12 |         if isinstance(item, items.PageItem):
13 |             if "admin" in item.tags:
14 |                 raise exceptions.DropItem("`admin` tag found")
15 |
16 |         return item
17 |
18 |
19 | INSERT_PASS = """
20 | INSERT INTO public.pass (branch_id, subject) VALUES (%(branch_id)s, %(subject)s)
21 | RETURNING id;
22 | """
23 |
24 |
25 | UPDATE_PASS = """
26 | UPDATE public.pass SET
27 |     ended_at = NOW(),
28 |     pending = %(pending)s,
29 |     successful = %(successful)s
30 | WHERE id = %(id)s;
31 | """
32 |
33 |
34 | class SherlockStoragePipeline:
35 |     id = None
36 |     """Id of the crawl pass currently being recorded"""
37 |
38 |     session = None
39 |
40 |     def __init__(self):
41 |         self.session = database.get_session()
42 |
43 |     def open_spider(self, spider):
44 |         cursor = self.session.cursor
45 |
46 |         cursor.execute(INSERT_PASS, {
47 |             'branch_id': spider.info['branch_id'],
48 |             'subject': spider.name
49 |         })
50 |
51 |         # attach the current pass id to this pipeline
52 |         self.id = cursor.fetchone()['id']
53 |
54 |     def close_spider(self, spider):
55 |         self.session.cursor.execute(UPDATE_PASS, {
56 |             'id': self.id,
57 |             'pending': False,
58 |             'successful': True
59 |         })
60 |
61 |         self.session.close()
62 |
63 |     def process_item(self, item, spider):
64 |         if isinstance(item, items.MemberItem):
65 |             # maybe batch users if possible
66 |             self.session.callproc(
67 |                 "add_member", {**asdict(item), 'pass_id': self.id})
68 |
69 |         if isinstance(item,
items.PageItem): 70 | self.session.callproc( 71 | "add_page", {**asdict(item), 'pass_id': self.id}) 72 | 73 | if isinstance(item, items.TitleItem): 74 | self.session.callproc("add_title", asdict(item)) 75 | 76 | return item 77 | -------------------------------------------------------------------------------- /sherlock/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for sherlock project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'sherlock' 11 | 12 | SPIDER_MODULES = ['sherlock.spiders'] 13 | NEWSPIDER_MODULE = 'sherlock.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = 'sherlock (+http://www.yourdomain.com)' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = True 21 | 22 | FEED_EXPORT_ENCODING = 'utf-8' 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | # DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | # } 46 | 47 | # Enable or disable spider middlewares 48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 49 | # SPIDER_MIDDLEWARES = { 50 | # 'sherlock.middlewares.SherlockSpiderMiddleware': 543, 51 | # } 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'sherlock.middlewares.SherlockDownloaderMiddleware': 543, 57 | } 58 | 59 | # Enable or disable extensions 60 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 61 | # EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | # } 64 | 65 | # Configure item pipelines 66 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'sherlock.pipelines.SherlockCheckPipeline': 300, 69 | # 'sherlock.pipelines.SherlockStoragePipeline': 400, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
--------------------------------------------------------------------------------
/sherlock/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/sherlock/spiders/members.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 | from sherlock import items, loaders
5 | from sherlock.utils import Config, regex, wikidot
6 |
7 |
8 | class MembersSpider(scrapy.Spider):
9 |     name = 'members'
10 |     allowed_domains = ['wikidot.com']
11 |
12 |     def __init__(self, site=None, *args, **kwargs):
13 |         super(MembersSpider, self).__init__(*args, **kwargs)
14 |
15 |         self.info = Config.get_config(site)
16 |
17 |         self.api = wikidot.path(site, 'ajax-module-connector.php')
18 |
19 |     def start_requests(self):
20 |         data, cookie = wikidot.request(
21 |             'membership/MembersListModule', per_page=1000000)
22 |         yield scrapy.FormRequest(self.api,
23 |                                  cookies=cookie,
24 |                                  formdata=data,
25 |                                  callback=self.analyze_members_list)
26 |
27 |     def analyze_members_list(self, response):
28 |         # the pager gives the total number of result pages; a single-page
29 |         # member list has no pager at all, hence the fallback to 1
30 |         total = response.css('.pager .target:nth-last-child(2) a::text').get()
31 |         total = int(total) if total else 1
32 |
33 |         for page in range(0, total):
34 |             data, cookie = wikidot.request(
35 |                 'membership/MembersListModule',
36 |                 page=page + 1,
37 |                 per_page=1000000
38 |             )
39 |
40 |             yield scrapy.FormRequest(self.api, cookies=cookie, formdata=data)
41 |
42 |     def parse(self, response):
43 |         for row in response.xpath('//div/table/tr'):
44 |
45 |             user = row.xpath('./td[1]/span/a[1]')
46 |             item = loaders.MemberLoader(items.MemberItem(), selector=user)
47 |
48 |             item.add_value('branch_id', self.info['branch_id'])
49 |             item.add_xpath('user_id', '@onclick', re=regex['user_id'])
50 |             item.add_xpath('slug', '@href', re=regex['user_slug'])
51 |             item.add_xpath('username', './img/@alt')
52 |
53 |             since = row.xpath('./td[2]/span/@class').get()
54 |             item.add_value('member_since', since, re=regex['timestamp'])
55 |
56 |             yield item.load_item()
--------------------------------------------------------------------------------
/sherlock/spiders/pages.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 |
4 | import scrapy
5 | from twisted.internet.defer import inlineCallbacks
6 |
7 | from sherlock import items, loaders
8 | from sherlock.utils import Config, regex, wikidot
9 |
10 |
11 | class PagesSpider(scrapy.spiders.SitemapSpider):
12 |     name = 'pages'
13 |     sitemap_follow = [r'sitemap_page']
14 |
15 |     def __init__(self, site=None, *args, **kwargs):
16 |         super(PagesSpider, self).__init__(*args, **kwargs)
17 |
18 |         self.info = Config.get_config(site)
19 |
20 |         self.api = wikidot.path(site, 'ajax-module-connector.php')
21 |         self.sitemap_urls =
--------------------------------------------------------------------------------
/sherlock/spiders/pages.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from twisted.internet.defer import inlineCallbacks
4 | 
5 | from sherlock import items, loaders
6 | from sherlock.utils import Config, regex, wikidot
7 | 
8 | 
9 | class PagesSpider(scrapy.spiders.SitemapSpider):
10 |     name = 'pages'
11 |     sitemap_follow = [r'sitemap_page']
12 | 
13 |     def __init__(self, site=None, *args, **kwargs):
14 |         super(PagesSpider, self).__init__(*args, **kwargs)
15 | 
16 |         self.info = Config.get_config(site)
17 | 
18 |         self.api = wikidot.path(site, 'ajax-module-connector.php')
19 |         self.sitemap_urls = [wikidot.path(site, 'sitemap.xml')]
20 | 
21 |     def request(self, *args, **kwargs):
22 |         # hand a request straight to the engine and get back a Deferred,
23 |         # so parse() can wait for the response inline
24 |         request = scrapy.FormRequest(*args, **kwargs)
25 |         return self.crawler.engine.download(request, self)
26 | 
27 |     @inlineCallbacks
28 |     def parse(self, response):
29 |         item = loaders.PageLoader(items.PageItem(), response)
30 | 
31 |         item.add_value('branch_id', self.info['branch_id'])
32 |         item.add_css('title', 'div#page-title::text')
33 |         item.add_css('tags', 'div.page-tags a::text')
34 | 
35 |         item.add_value('preview', wikidot.get_preview(
36 |             response, language=self.info['language']))
37 | 
38 |         script = response.xpath(
39 |             '/html/head/script[contains(., "URL")]/text()').get()
40 | 
41 |         item.add_value('page_id', script, re=regex['page_id'])
42 |         item.add_value('branch_id', script, re=regex['branch_id'])
43 |         item.add_value('slug', script, re=regex['page_slug'])
44 | 
45 |         item = item.load_item()
46 | 
47 |         # revision metadata is loaded on demand via an XHR request,
48 |         # which we simulate here with a second, deferred download
49 |         data, cookie = wikidot.request(
50 |             'history/PageRevisionListModule',
51 |             page_id=item.page_id,
52 |             perpage=99999
53 |         )
54 | 
55 |         response = yield self.request(self.api,
56 |                                       cookies=cookie,
57 |                                       formdata=data)
58 | 
59 |         item = loaders.PageLoader(item, response)
60 |         item.add_xpath('created_by', '//table/tr[last()]/td/span/a[1]/@onclick',
61 |                        re=regex['user_id'])
62 |         item.add_xpath('created_at', '//table/tr[last()]/td[6]/span/@class',
63 |                        re=regex['timestamp'])
64 |         item.add_xpath('updated_at', '//table/tr[2]/td[6]/span/@class',
65 |                        re=regex['timestamp'])
66 | 
67 |         return item.load_item()
68 | 
--------------------------------------------------------------------------------
/sherlock/spiders/titles.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | 
3 | from sherlock import items, loaders
4 | from sherlock.utils import Config, wikidot, regex
5 | 
6 | 
7 | class TitlesSpider(scrapy.Spider):
8 |     name = 'titles'
9 |     allowed_domains = ['wikidot.com']
10 | 
11 |     def __init__(self, site=None, *args, **kwargs):
12 |         super(TitlesSpider, self).__init__(*args, **kwargs)
13 | 
14 |         self.info = Config.get_config(site)
15 | 
16 |         index = Config.get(site, 'index')
17 |         self.start_urls = [wikidot.path(site, slug) for slug in index]
18 | 
19 |     def parse(self, response):
20 |         for title in response.css('.content-panel ul a:not(.newpage)'):
21 |             item = loaders.TitleLoader(items.TitleItem(), selector=title)
22 | 
23 |             item.add_value('branch_id', self.info['branch_id'])
24 |             item.add_xpath('subtitle', 'string(./ancestor::li)',
25 |                            re=regex['scp_subtitle'])
26 |             item.add_css('slug', '::attr(href)')
27 | 
28 |             yield item.load_item()
29 | 
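Each spider resolves its `site` argument through `Config` (defined below in `sherlock/utils/config.py`), which reads `data.json`. Judging from the keys the code accesses — `id`, `language`, and the `index` list of hub slugs that `TitlesSpider` crawls — the file plausibly has this shape (keys inferred from the code; the site name and values are invented for illustration):

{
    "scp-wiki": {
        "id": 1,
        "language": "en",
        "index": ["scp-series", "scp-series-2"]
    }
}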
%s", (self.info['branch_id'],)): 23 | id = row['id'] 24 | data, cookie = wikidot.request("pagerate/WhoRatedPageModule", 25 | pageId=id) 26 | yield scrapy.FormRequest(self.api, cookies=cookie, formdata=data, meta={'page_id': id}) 27 | 28 | def parse(self, response): 29 | for block in response.css('span.printuser:not(.deleted)'): 30 | vote = block.css('* + span::text').get().strip() 31 | user = block.css( 32 | 'a:first-child::attr(onclick)').re_first(regex['user_id']) 33 | 34 | yield VoteItem(user_id=user, page_id=response.meta['page_id'], vote=vote) 35 | -------------------------------------------------------------------------------- /sherlock/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import Config 2 | from .regex import regex 3 | -------------------------------------------------------------------------------- /sherlock/utils/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class Config: 5 | """Branches configuration module""" 6 | 7 | is_loaded = False 8 | content: dict = None 9 | path = './data.json' 10 | wiki = [] 11 | 12 | @classmethod 13 | def load(self): 14 | """load config file""" 15 | 16 | with open(self.path) as file: 17 | self.content = json.load(file) 18 | self.wiki = self.content.keys() 19 | 20 | self.is_loaded = True 21 | 22 | @classmethod 23 | def get(self, section: str, name: str): 24 | """get specific section & attribute of the configuration""" 25 | 26 | Config.check(section) 27 | 28 | # pylint: disable=unsubscriptable-object 29 | return self.content[section][name] 30 | 31 | @classmethod 32 | def check(self, wiki: str): 33 | """"check if the current wiki is supported by the configuration""" 34 | 35 | if not self.is_loaded: 36 | self.load() 37 | 38 | if wiki is None: 39 | raise AssertionError("You must provide a `site` to crawl") 40 | 41 | if wiki in self.wiki: 42 | return 43 | 44 | raise NotImplementedError( 45 | f'"{wiki}" is not in the config file ({self.path})') 46 | 47 | @classmethod 48 | def get_config(self, section: str): 49 | Config.check(section) 50 | 51 | # pylint: disable=unsubscriptable-object 52 | section = self.content[section] 53 | 54 | return {"branch_id": section['id'], "language": section['language']} 55 | -------------------------------------------------------------------------------- /sherlock/utils/database.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | 3 | import queries 4 | from dotenv import load_dotenv 5 | 6 | 7 | load_dotenv() 8 | 9 | URI = queries.uri( 10 | host=environ['PG_HOST'], 11 | port=environ['PG_PORT'], 12 | dbname=environ['PG_DBNAME'], 13 | user=environ['PG_USER'], 14 | password=environ['PG_PASSWORD'] 15 | ) 16 | 17 | 18 | def get_session(): 19 | return queries.Session(URI) 20 | -------------------------------------------------------------------------------- /sherlock/utils/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | regex = { 4 | 'page_id': re.compile(r".pageId = (?P