├── .github ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── __init__.py ├── assets └── .gitignore ├── configs ├── filter_terms.txt └── skip_terms.yaml ├── output └── .gitignore ├── project.yml ├── requirements.txt ├── scripts ├── __init__.py ├── create_kb.py ├── extract_demo_dump.py ├── parse.py ├── utils.py └── wiki │ ├── __init__.py │ ├── compat.py │ ├── ddl.sql │ ├── download.sh │ ├── namespaces.py │ ├── schemas.py │ ├── wikidata.py │ └── wikipedia.py ├── setup.cfg ├── setup.py └── test_wikid.py /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 9 | 10 | ### Types of change 11 | 13 | 14 | ## Checklist 15 | 17 | - [ ] I confirm that I have the right to submit this contribution under the project's MIT license. 18 | - [ ] I ran the tests, and all new and existing tests passed. 19 | - [ ] My changes don't require a change to the readme, or if they do, I've added all required information. -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - "*.md" 7 | pull_request: 8 | types: [opened, synchronize, reopened, edited] 9 | paths-ignore: 10 | - "*.md" 11 | 12 | jobs: 13 | tests: 14 | name: Test 15 | if: github.repository_owner == 'explosion' 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | os: [ubuntu-latest, windows-latest, macos-latest] 20 | python_version: ["3.8"] 21 | runs-on: ${{ matrix.os }} 22 | 23 | steps: 24 | - name: Check out repo 25 | uses: actions/checkout@v3 26 | 27 | - name: Configure Python version 28 | uses: actions/setup-python@v4 29 | with: 30 | python-version: ${{ matrix.python_version }} 31 | architecture: x64 32 | 33 | - name: Build sdist 34 | run: | 35 | python -m pip install pytest wheel 36 | python -m pip install -r requirements.txt 37 | 38 | - name: Run tests 39 | shell: bash 40 | run: | 41 | python -m pytest 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | project.lock 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 22.3.0 4 | hooks: 5 | - id: black 6 | language_version: python3.7 7 | additional_dependencies: ['click==8.0.4'] 8 | - repo: https://gitlab.com/pycqa/flake8 9 | rev: 5.0.4 10 | hooks: 11 | - id: flake8 12 | args: 13 | - "--config=setup.cfg" 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Explosion 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 
11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 🪐 spaCy Project: wikid 4 | 5 | [![tests](https://github.com/explosion/wikid/actions/workflows/tests.yml/badge.svg)](https://github.com/explosion/wikid/actions/workflows/tests.yml) 6 | [![spaCy](https://img.shields.io/static/v1?label=made%20with%20%E2%9D%A4%20and&message=spaCy&color=09a3d5&style=flat-square)](https://spacy.io) 7 |
_No REST for the `wikid`_ :jack_o_lantern: - generate a SQLite database 8 | and a spaCy `KnowledgeBase` from Wikipedia & Wikidata dumps. `wikid` was 9 | designed with the use case of named entity linking (NEL) with spaCy in mind. 10 |
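A typical end-to-end run, fetching the filtered demo dumps and building the English knowledge base, might look like this (assuming spaCy is installed and you are in the project directory):

```bash
# Fetch the filtered demo dumps defined in project.yml.
spacy project assets
# Parse the dumps into SQLite, download the vectors model and build the KB.
spacy project run all
```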
Note this repository is still in an experimental stage, so the public API 11 | might change at any time. 12 | 13 | ## 📋 project.yml 14 | 15 | The [`project.yml`](project.yml) defines the data assets required by the 16 | project, as well as the available commands and workflows. For details, see the 17 | [spaCy projects documentation](https://spacy.io/usage/projects). 18 | 19 | ### ⏯ Commands 20 | 21 | The following commands are defined by the project. They can be executed using 22 | [`spacy project run [name]`](https://spacy.io/api/cli#project-run). Commands are 23 | only re-run if their inputs have changed. 24 | 25 | | Command | Description | 26 | | ---------------- | ------------------------------------------------------------------------------------------------------------- | 27 | | `parse` | Parse Wiki dumps. This can take a long time if you're not using the filtered dumps! | 28 | | `download_model` | Download spaCy language model. | 29 | | `create_kb` | Creates KB utilizing SQLite database with Wiki content. | 30 | | `delete_db` | Deletes SQLite database generated in step parse_wiki_dumps with data parsed from Wikidata and Wikipedia dump. | 31 | | `clean` | Delete all generated artifacts except for SQLite database. | 32 | 33 | ### ⏭ Workflows 34 | 35 | The following workflows are defined by the project. They can be executed using 36 | [`spacy project run [name]`](https://spacy.io/api/cli#project-run) and will run 37 | the specified commands in order. Commands are only re-run if their inputs have 38 | changed. 39 | 40 | | Workflow | Steps | 41 | | -------- | -------------------------------------------------- | 42 | | `all` | `parse` → `download_model` → `create_kb` | 43 | 44 | ### 🗂 Assets 45 | 46 | The following assets are defined by the project. They can be fetched by running 47 | [`spacy project assets`](https://spacy.io/api/cli#project-assets) in the project 48 | directory. 49 | 50 | | File | Source | Description | 51 | | ----------------------------------------------- | ------ | --------------------------------------------------------------- | 52 | | `assets/wikidata_entity_dump.json.bz2` | URL | Wikidata entity dump. Download can take a long time! | 53 | | `assets/wikipedia_dump.xml.bz2` | URL | Wikipedia dump. Download can take a long time! | 54 | | `assets/wikidata_entity_dump_filtered.json.bz2` | URL | Filtered Wikidata entity dump for demo purposes (English only). | 55 | | `assets/wikipedia_dump_filtered.xml.bz2` | URL | Filtered Wikipedia dump for demo purposes (English only). | 56 | 57 | 58 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .scripts import * 2 | -------------------------------------------------------------------------------- /assets/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /configs/filter_terms.txt: -------------------------------------------------------------------------------- 1 | New York 2 | Boston -------------------------------------------------------------------------------- /configs/skip_terms.yaml: -------------------------------------------------------------------------------- 1 | # List of lower-cased terms indicating an article should be skipped in its entirety, by language. 
2 | # These terms appear in Wikipedia articles that we aren't interested in including in our database, such as redirection 3 | # or disambiguation pages. Unfortunately there doesn't seem to be a language-agnostic mark-up code for these articles 4 | # (or is there?), so we gather them per language. 5 | en: 6 | - "#redirection" 7 | - "#redirect" 8 | - "{{disambiguation}}" 9 | es: 10 | - "#redirect" 11 | - "#redirección" 12 | - "{{desambiguación}}" 13 | -------------------------------------------------------------------------------- /output/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /project.yml: -------------------------------------------------------------------------------- 1 | title: 'wikid' 2 | description: | 3 | [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/32/main.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=32) 4 | [![spaCy](https://img.shields.io/static/v1?label=made%20with%20%E2%9D%A4%20and&message=spaCy&color=09a3d5&style=flat-square)](https://spacy.io) 5 |
6 | _No REST for the `wikid`_ :jack_o_lantern: - generate a SQLite database and a spaCy `KnowledgeBase` from Wikipedia & 7 | Wikidata dumps. `wikid` was designed with the use case of named entity linking (NEL) with spaCy in mind. 8 |
9 | Note this repository is still in an experimental stage, so the public API might change at any time. 10 | 11 | vars: 12 | version: "0.0.2" 13 | language: "en" 14 | vectors_model: "en_core_web_lg" 15 | filter: "True" 16 | n_process: 1 # set to -1 to set to multiprocessing.cpu_count() automatically 17 | 18 | directories: ["assets", "configs", "scripts", "output"] 19 | 20 | assets: 21 | - dest: 'assets/wikidata_entity_dump.json.bz2' 22 | url: 'https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2' 23 | description: Wikidata entity dump. Download can take a long time! 24 | extra: True 25 | - dest: 'assets/${vars.language}-wikipedia_dump.xml.bz2' 26 | url: 'https://dumps.wikimedia.org/${vars.language}wiki/latest/${vars.language}wiki-latest-pages-articles-multistream.xml.bz2' 27 | description: Wikipedia dump. Download can take a long time! 28 | extra: True 29 | - dest: 'assets/wikidata_entity_dump_filtered.json.bz2' 30 | url: 'https://github.com/explosion/projects/releases/download/nel-benchmark-filtered-wiki-data/wikidata_entity_dump_filtered.json.bz2' 31 | description: Filtered Wikidata entity dump for demo purposes (English only). 32 | checksum: 'ba2d979105abf174208608b942242fcb' 33 | - dest: 'assets/wikipedia_dump_filtered.xml.bz2' 34 | url: 'https://github.com/explosion/projects/releases/download/nel-benchmark-filtered-wiki-data/wikipedia_dump_filtered.xml.bz2' 35 | description: Filtered Wikipedia dump for demo purposes (English only). 36 | checksum: 'cb624eaa5887fe1ff47a9206c9bdcfd8' 37 | 38 | workflows: 39 | all: 40 | - parse 41 | - download_model 42 | - create_kb 43 | 44 | commands: 45 | - name: parse 46 | help: "Parse Wiki dumps. This can take a long time if you're not using the filtered dumps!" 47 | script: 48 | - "env PYTHONPATH=scripts python ./scripts/parse.py ${vars.language} ${vars.filter}" 49 | outputs: 50 | - "output/${vars.language}/wiki.sqlite3" 51 | 52 | - name: download_model 53 | help: "Download spaCy language model." 54 | script: 55 | - "spacy download ${vars.vectors_model}" 56 | 57 | - name: create_kb 58 | help: "Creates KB utilizing SQLite database with Wiki content." 59 | script: 60 | - "env PYTHONPATH=scripts python ./scripts/create_kb.py ${vars.vectors_model} ${vars.language} ${vars.n_process}" 61 | deps: 62 | - "output/${vars.language}/wiki.sqlite3" 63 | outputs: 64 | - "output/${vars.language}/kb" 65 | - "output/${vars.language}/nlp" 66 | - "output/${vars.language}/descriptions.csv" 67 | 68 | - name: delete_db 69 | help: "Deletes SQLite database generated in step parse with data parsed from Wikidata and Wikipedia dump." 70 | script: 71 | - "rm -f output/${vars.language}/wiki.sqlite3" 72 | deps: 73 | - "output/${vars.language}/wiki.sqlite3" 74 | 75 | - name: delete_kb 76 | help: "Delete all KnowledgeBase-related artifacts, but not the SQLite database." 
77 | script: 78 | - "rm -rf output/${vars.language}/kb" 79 | - "rm -rf output/${vars.language}/nlp" 80 | - "rm -f output/${vars.language}/descriptions.csv" 81 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | spacy<4.0 2 | pyyaml 3 | tqdm 4 | prettytable -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | from .wiki import ( 2 | schemas, 3 | load_entities, 4 | establish_db_connection, 5 | extract_demo_dump, 6 | load_alias_entity_prior_probabilities, 7 | parse, 8 | namespaces, 9 | ) 10 | from .utils import read_filter_terms 11 | 12 | __all__ = [ 13 | "schemas", 14 | "load_entities", 15 | "establish_db_connection", 16 | "extract_demo_dump", 17 | "load_alias_entity_prior_probabilities", 18 | "parse", 19 | "namespaces", 20 | "read_filter_terms", 21 | ] 22 | -------------------------------------------------------------------------------- /scripts/create_kb.py: -------------------------------------------------------------------------------- 1 | """Functionality for creating the knowledge base from downloaded assets and by querying Wikipedia's API.""" 2 | import csv 3 | import logging 4 | import os 5 | from pathlib import Path 6 | from typing import List 7 | 8 | import numpy 9 | import spacy 10 | import tqdm 11 | import typer 12 | 13 | try: 14 | from spacy.kb import InMemoryLookupKB as DefaultKB 15 | except ImportError: 16 | from spacy.kb import KnowledgeBase as DefaultKB 17 | import wiki 18 | 19 | 20 | def main(vectors_model: str, language: str, n_process: int): 21 | """Create the Knowledge Base in spaCy and write it to file. 22 | language (str): Language. 23 | vectors_model (str): Name of model with word vectors to use. 24 | """ 25 | 26 | logger = logging.getLogger(__name__) 27 | nlp = spacy.load(vectors_model, exclude=["tagger", "lemmatizer", "attribute_ruler"]) 28 | 29 | logger.info("Constructing knowledge base.") 30 | kb = DefaultKB(vocab=nlp.vocab, entity_vector_length=nlp.vocab.vectors_length) 31 | entity_list: List[str] = [] 32 | count_list: List[int] = [] 33 | vector_list: List[numpy.ndarray] = [] # type: ignore 34 | entities = wiki.load_entities(language=language) 35 | ent_descriptions = { 36 | qid: entities[qid].description 37 | if entities[qid].description 38 | else ( 39 | entities[qid].article_text[:200] 40 | if entities[qid].article_text 41 | else entities[qid].name 42 | ) 43 | for qid in entities.keys() 44 | } 45 | 46 | # Infer vectors for entities' descriptions. 47 | desc_vectors = [ 48 | doc.vector 49 | for doc in tqdm.tqdm( 50 | nlp.pipe( 51 | texts=[ent_descriptions[qid] for qid in entities.keys()], n_process=n_process 52 | ), 53 | total=len(entities), 54 | desc="Inferring entity embeddings", 55 | ) 56 | ] 57 | for qid, desc_vector in zip(entities.keys(), desc_vectors): 58 | entity_list.append(qid) 59 | count_list.append(entities[qid].count) 60 | vector_list.append( 61 | desc_vector if isinstance(desc_vector, numpy.ndarray) else desc_vector.get() 62 | ) 63 | kb.set_entities( 64 | entity_list=entity_list, vector_list=vector_list, freq_list=count_list 65 | ) 66 | 67 | # Add aliases with normalized priors to KB. This won't be necessary with a custom KB. 
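    # load_alias_entity_prior_probabilities() returns, per alias, its candidate QIDs with priors
    # normalized over the alias' total occurrence count, e.g. (QIDs and values purely illustrative):
    #   {"Boston": [("Q100", 0.95), ("Q12345", 0.05)], ...}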
68 | alias_entity_prior_probs = wiki.load_alias_entity_prior_probabilities( 69 | language=language 70 | ) 71 | for alias, entity_prior_probs in alias_entity_prior_probs.items(): 72 | kb.add_alias( 73 | alias=alias, 74 | entities=[epp[0] for epp in entity_prior_probs], 75 | probabilities=[epp[1] for epp in entity_prior_probs], 76 | ) 77 | # Add pseudo aliases for easier lookup with new candidate generators. 78 | for entity_id in entity_list: 79 | kb.add_alias( 80 | alias="_" + entity_id + "_", entities=[entity_id], probabilities=[1] 81 | ) 82 | 83 | # Serialize knowledge base & pipeline. 84 | output_dir = Path(os.path.abspath(__file__)).parent.parent / "output" 85 | kb.to_disk(output_dir / language / "kb") 86 | nlp_dir = output_dir / language / "nlp" 87 | os.makedirs(nlp_dir, exist_ok=True) 88 | nlp.to_disk(nlp_dir) 89 | # # Write descriptions to file. 90 | with open( 91 | output_dir / language / "descriptions.csv", "w", encoding="utf-8" 92 | ) as csvfile: 93 | csv_writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL) 94 | for qid, ent_desc in ent_descriptions.items(): 95 | csv_writer.writerow([qid, ent_desc]) 96 | logger.info("Successfully constructed knowledge base.") 97 | 98 | 99 | if __name__ == "__main__": 100 | typer.run(main) 101 | -------------------------------------------------------------------------------- /scripts/extract_demo_dump.py: -------------------------------------------------------------------------------- 1 | """Extract demo set from Wiki dumps.""" 2 | from utils import read_filter_terms 3 | from wiki import wiki_dump_api 4 | 5 | if __name__ == "__main__": 6 | wiki_dump_api.extract_demo_dump(read_filter_terms(), "en") 7 | -------------------------------------------------------------------------------- /scripts/parse.py: -------------------------------------------------------------------------------- 1 | """ Parsing of Wiki dump and persisting of parsing results to DB. """ 2 | from typing import Optional 3 | import typer 4 | from wiki import parse 5 | 6 | 7 | def main( 8 | language: str, 9 | # Argument instead of option so it can be overwritten by other spaCy projects (otherwise escaping makes it 10 | # impossible to pass on '--OPTION', since it's interpreted as dedicated option ("--vars.OPTION --OPTION") instead 11 | # of as "--vars.OPTION '--OPTION'", as it should be. 12 | use_filtered_dumps: bool, 13 | entity_limit: Optional[int] = typer.Option(None, "--entity_limit"), 14 | article_limit: Optional[int] = typer.Option(None, "--article_limit"), 15 | alias_limit: Optional[int] = typer.Option(None, "--alias_limit"), 16 | ): 17 | """Parses Wikidata and Wikipedia dumps. Persists parsing results to DB. If one of the _limit variables is reached, 18 | parsing is stopped. 19 | language (str): Language (e.g. 'en', 'es', ...) to assume for Wiki dump. 20 | use_filtered_dumps (bool): Whether to use filtered Wiki dumps instead of the full ones. 21 | entity_limit (Optional[int]): Max. number of entities to parse. Unlimited if None. 22 | article_limit (Optional[int]): Max. number of articles to parse. Unlimited if None. 23 | alias_limit (Optional[int]): Max. number of entity aliases to parse. Unlimited if None. 
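    Example invocation as used in project.yml (limit value added for illustration):
    `env PYTHONPATH=scripts python ./scripts/parse.py en True --entity_limit 1000`.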
24 | """ 25 | 26 | parse( 27 | language=language, 28 | use_filtered_dumps=use_filtered_dumps, 29 | entity_config={"limit": entity_limit}, 30 | article_text_config={"limit": article_limit}, 31 | alias_prior_prob_config={"limit": alias_limit}, 32 | ) 33 | 34 | 35 | if __name__ == "__main__": 36 | typer.run(main) 37 | -------------------------------------------------------------------------------- /scripts/utils.py: -------------------------------------------------------------------------------- 1 | """ Various utils. """ 2 | 3 | import logging 4 | from pathlib import Path 5 | from typing import Set 6 | 7 | logging.basicConfig( 8 | level=logging.INFO, 9 | format="%(asctime)s %(levelname)s %(message)s", 10 | datefmt="%H:%M:%S", 11 | ) 12 | 13 | 14 | def get_logger(handle: str) -> logging.Logger: 15 | """Get logger for handle. 16 | handle (str): Logger handle. 17 | RETURNS (logging.Logger): Logger. 18 | """ 19 | 20 | return logging.getLogger(handle) 21 | 22 | 23 | def read_filter_terms() -> Set[str]: 24 | """Read terms used to filter Wiki dumps/corpora. 25 | RETURNS (Set[str]): Set of filter terms. 26 | """ 27 | with open( 28 | Path(__file__).parent.parent / "configs" / "filter_terms.txt", "r" 29 | ) as file: 30 | return {ft.replace("\n", "") for ft in file.readlines()} 31 | -------------------------------------------------------------------------------- /scripts/wiki/__init__.py: -------------------------------------------------------------------------------- 1 | """ Wiki dataset for unified access to information from Wikipedia and Wikidata dumps. """ 2 | import os.path 3 | import pickle 4 | from pathlib import Path 5 | from typing import Dict, Any, Tuple, List, Set, Optional 6 | 7 | from .compat import sqlite3 8 | from . import schemas 9 | from . import wikidata 10 | from . import wikipedia 11 | 12 | 13 | def _get_paths(language: str) -> Dict[str, Path]: 14 | """Get paths. 15 | language (str): Language. 16 | RETURNS (Dict[str, Path]): Paths. 17 | """ 18 | 19 | _root_dir = Path(os.path.abspath(__file__)).parent.parent.parent 20 | _assets_dir = _root_dir / "assets" 21 | return { 22 | "db": _root_dir / "output" / language / "wiki.sqlite3", 23 | "wikidata_dump": _assets_dir / "wikidata_entity_dump.json.bz2", 24 | "wikipedia_dump": _assets_dir / f"{language}-wikipedia_dump.xml.bz2", 25 | "filtered_wikidata_dump": _assets_dir 26 | / "wikidata_entity_dump_filtered.json.bz2", 27 | "filtered_wikipedia_dump": _assets_dir / "wikipedia_dump_filtered.xml.bz2", 28 | } 29 | 30 | 31 | def establish_db_connection(language: str) -> sqlite3.Connection: 32 | """Estabished database connection. 33 | language (str): Language. 34 | RETURNS (sqlite3.Connection): Database connection. 35 | """ 36 | db_path = _get_paths(language)["db"] 37 | os.makedirs(db_path.parent, exist_ok=True) 38 | db_conn = sqlite3.connect(_get_paths(language)["db"]) 39 | db_conn.row_factory = sqlite3.Row 40 | return db_conn 41 | 42 | 43 | def extract_demo_dump(filter_terms: Set[str], language: str) -> None: 44 | """Extracts small demo dump by parsing the Wiki dumps and keeping only those entities (and their articles) 45 | containing any of the specified filter_terms. The retained entities and articles are written into intermediate 46 | files. 47 | filter_terms (Set[str]): Terms having to appear in entity descriptions in order to be wrr 48 | language (str): Language. 
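    Example (mirrors scripts/extract_demo_dump.py; filter terms illustrative): `extract_demo_dump({"New York", "Boston"}, "en")`.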
49 | """ 50 | 51 | _paths = _get_paths(language) 52 | entity_ids, entity_labels = wikidata.extract_demo_dump( 53 | _paths["wikidata_dump"], _paths["filtered_wikidata_dump"], filter_terms 54 | ) 55 | with open(_paths["filtered_entity_entity_info"], "wb") as file: 56 | pickle.dump((entity_ids, entity_labels), file) 57 | 58 | with open(_paths["filtered_entity_entity_info"], "rb") as file: 59 | _, entity_labels = pickle.load(file) 60 | wikipedia.extract_demo_dump( 61 | _paths["wikipedia_dump"], _paths["filtered_wikipedia_dump"], entity_labels 62 | ) 63 | 64 | 65 | def parse( 66 | language: str, 67 | db_conn: Optional[sqlite3.Connection] = None, 68 | entity_config: Optional[Dict[str, Any]] = None, 69 | article_text_config: Optional[Dict[str, Any]] = None, 70 | alias_prior_prob_config: Optional[Dict[str, Any]] = None, 71 | use_filtered_dumps: bool = False, 72 | ) -> None: 73 | """Parses Wikipedia and Wikidata dumps. Writes parsing results to a database. Note that this takes hours. 74 | language (str): Language (e.g. 'en', 'es', ...) to assume for Wiki dump. 75 | db_conn (Optional[sqlite3.Connection]): Database connection. 76 | entity_config (Dict[str, Any]): Arguments to be passed on to wikidata.read_entities(). 77 | article_text_config (Dict[str, Any]): Arguments to be passed on to wikipedia.read_text(). 78 | alias_prior_prob_config (Dict[str, Any]): Arguments to be passed on to wikipedia.read_prior_probs(). 79 | use_filtered_dumps (bool): Whether to use small, filtered Wiki dumps. 80 | """ 81 | 82 | _paths = _get_paths(language) 83 | msg = "Database exists already. Execute `weasel run delete_db` to remove it." 84 | assert not os.path.exists(_paths["db"]), msg 85 | 86 | db_conn = db_conn if db_conn else establish_db_connection(language) 87 | with open(Path(os.path.abspath(__file__)).parent / "ddl.sql", "r") as ddl_sql: 88 | db_conn.cursor().executescript(ddl_sql.read()) 89 | 90 | wikidata.read_entities( 91 | _paths["wikidata_dump"] 92 | if not use_filtered_dumps 93 | else _paths["filtered_wikidata_dump"], 94 | db_conn, 95 | **(entity_config if entity_config else {}), 96 | lang=language, 97 | ) 98 | 99 | wikipedia.read_prior_probs( 100 | _paths["wikipedia_dump"] 101 | if not use_filtered_dumps 102 | else _paths["filtered_wikipedia_dump"], 103 | db_conn, 104 | **(alias_prior_prob_config if alias_prior_prob_config else {}), 105 | ) 106 | 107 | wikipedia.read_texts( 108 | _paths["wikipedia_dump"] 109 | if not use_filtered_dumps 110 | else _paths["filtered_wikipedia_dump"], 111 | db_conn, 112 | **(article_text_config if article_text_config else {}), 113 | ) 114 | 115 | 116 | def load_entities( 117 | language: str, 118 | qids: Tuple[str, ...] = tuple(), 119 | db_conn: Optional[sqlite3.Connection] = None, 120 | ) -> Dict[str, schemas.Entity]: 121 | """Loads information for entity or entities by querying information from DB. 122 | Note that this doesn't return all available information, only the part used in the current benchmark solution. 123 | language (str): Language. 124 | qids (Tuple[str]): QIDS to look up. If empty, all entities are loaded. 125 | db_conn (Optional[sqlite3.Connection]): Database connection. 126 | RETURNS (Dict[str, Entity]): Information on requested entities. 
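    Example (QID illustrative): `load_entities("en", qids=("Q100",))`.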
127 | """ 128 | db_conn = db_conn if db_conn else establish_db_connection(language) 129 | 130 | return { 131 | rec["id"]: schemas.Entity( 132 | qid=rec["id"], 133 | name=rec["entity_title"], 134 | aliases={ 135 | alias 136 | for alias in { 137 | rec["entity_title"], 138 | rec["article_title"], 139 | rec["label"], 140 | *(rec["aliases"] if rec["aliases"] else "").split(","), 141 | } 142 | if alias 143 | }, 144 | article_title=rec["article_title"], 145 | article_text=rec["content"], 146 | description=rec["description"], 147 | count=rec["count"] if rec["count"] else 0, 148 | ) 149 | for rec in db_conn.cursor().execute( 150 | f""" 151 | SELECT 152 | e.id, 153 | et.name as entity_title, 154 | et.description, 155 | et.label, 156 | at.title as article_title, 157 | at.content, 158 | GROUP_CONCAT(afe.alias) as aliases, 159 | SUM(afe.count) as count 160 | FROM 161 | entities e 162 | LEFT JOIN entities_texts et on 163 | et.ROWID = e.ROWID 164 | LEFT JOIN articles a on 165 | a.entity_id = e.id 166 | LEFT JOIN articles_texts at on 167 | at.ROWID = a.ROWID 168 | LEFT JOIN aliases_for_entities afe on 169 | afe.entity_id = e.id 170 | WHERE 171 | {'FALSE' if len(qids) else 'TRUE'} OR e.id IN (%s) 172 | GROUP BY 173 | e.id, 174 | et.name, 175 | et.description, 176 | et.label, 177 | at.title, 178 | at.content 179 | """ 180 | % ",".join("?" * len(qids)), 181 | tuple(set(qids)), 182 | ) 183 | } 184 | 185 | 186 | def load_alias_entity_prior_probabilities( 187 | language: str, db_conn: Optional[sqlite3.Connection] = None 188 | ) -> Dict[str, List[Tuple[str, float]]]: 189 | """Loads alias-entity counts from database and transforms them into prior probabilities per alias. 190 | language (str): Language. 191 | RETURN (Dict[str, Tuple[Tuple[str, ...], Tuple[float, ...]]]): Mapping of alias to tuples of entities and the 192 | corresponding prior probabilities. 193 | """ 194 | 195 | db_conn = db_conn if db_conn else establish_db_connection(language) 196 | 197 | alias_entity_prior_probs = { 198 | rec["alias"]: [ 199 | (entity_id, int(count)) 200 | for entity_id, count in zip( 201 | rec["entity_ids"].split(","), rec["counts"].split(",") 202 | ) 203 | ] 204 | for rec in db_conn.cursor().execute( 205 | """ 206 | SELECT 207 | alias, 208 | GROUP_CONCAT(entity_id) as entity_ids, 209 | GROUP_CONCAT(count) as counts 210 | FROM 211 | aliases_for_entities 212 | GROUP BY 213 | alias 214 | """ 215 | ) 216 | } 217 | 218 | for alias, entity_counts in alias_entity_prior_probs.items(): 219 | total_count = sum([ec[1] for ec in entity_counts]) 220 | alias_entity_prior_probs[alias] = [ 221 | (ec[0], ec[1] / max(total_count, 1)) for ec in entity_counts 222 | ] 223 | 224 | return alias_entity_prior_probs 225 | -------------------------------------------------------------------------------- /scripts/wiki/compat.py: -------------------------------------------------------------------------------- 1 | # Import pysqlite3, if available. This allows for more flexibility in downstream applications/deployments, as the SQLITE 2 | # version coupled to the Python version might not support all the features needed for `wikid` to work (e. g. FTS5 3 | # virtual tables). Fall back to bundled sqlite3 otherwise. 4 | try: 5 | import pysqlite3 as sqlite3 6 | except ModuleNotFoundError: 7 | import sqlite3 8 | 9 | __all__ = ["sqlite3"] 10 | -------------------------------------------------------------------------------- /scripts/wiki/ddl.sql: -------------------------------------------------------------------------------- 1 | -- DDL for parsed Wiki data. 
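-- Schema overview: entities / entities_texts (FTS5) hold Wikidata entity data, articles / articles_texts (FTS5)
-- hold Wikipedia article data, properties_in_entities stores entity-to-entity relations, and aliases_for_entities
-- stores alias-entity occurrence counts used to derive prior probabilities.
-- Illustrative lookup joining an entity with its texts via ROWID (see the notes below on why ROWID is used):
--   SELECT e.id, et.name, et.description
--   FROM entities e LEFT JOIN entities_texts et ON et.ROWID = e.ROWID
--   WHERE e.id = 'Q100';  -- any QID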
2 | 3 | -- Note that the four tables entities, entities_texts, articles, and article_texts could be combined into one table. 4 | -- Two reasons why this isn't done: 5 | -- 1. For efficient full-text search we're using FTS5 virtual tables, which don't support index lookup as efficient as 6 | -- an index lookup in a normal table. Hence we split the data we want to use for full-text search from the 7 | -- identifiying keys (entity/article IDs). 8 | -- 2. All article data could just as well be part of the entities and/or entities_texts. This is not done due to the 9 | -- sequential nature of our Wiki parsing: first the Wikidata dump (entities) are read and stored in the DB, then the 10 | -- Wikipedia dump (articles). We could update the entities table, but this is less efficient than inserting new 11 | -- records. If profiling shows this not to be a bottleneck, we may reconsider merging these two tables. 12 | 13 | CREATE TABLE entities ( 14 | -- Equivalent to Wikidata QID. 15 | id TEXT PRIMARY KEY NOT NULL, 16 | -- Claims found for this entity. 17 | -- This could be normalized. Not worth it at the moment though, since claims aren't used. 18 | claims TEXT 19 | ); 20 | 21 | -- The FTS5 virtual table implementation doesn't allow for indices, so we rely on ROWID to match entities. 22 | -- This isn't great, but with a controlled data ingestion setup this allows for stable matching. 23 | -- Same for foreign keys. 24 | CREATE VIRTUAL TABLE entities_texts USING fts5( 25 | -- Equivalent to Wikidata QID. UNINDEXED signifies that this field is not indexed for full text search. 26 | entity_id UNINDEXED, 27 | -- Entity name. 28 | name, 29 | -- Entity description. 30 | description, 31 | -- Entity label. 32 | label 33 | ); 34 | 35 | CREATE TABLE articles ( 36 | -- Equivalent to Wikdata QID. 37 | entity_id TEXT PRIMARY KEY NOT NULL, 38 | -- Wikipedia article ID (different from entity QID). 39 | id TEXT NOT NULL, 40 | FOREIGN KEY(entity_id) REFERENCES entities(id) 41 | ); 42 | CREATE UNIQUE INDEX idx_articles_id 43 | ON articles (id); 44 | 45 | -- Same here: no indices possible, relying on ROWID to match with articles. 46 | CREATE VIRTUAL TABLE articles_texts USING fts5( 47 | -- Equivalent to Wikidata QID. UNINDEXED signifies that this field is not indexed for full text search. 48 | entity_id UNINDEXED, 49 | -- Article title. 50 | title, 51 | -- Article text. 52 | content 53 | ); 54 | 55 | CREATE TABLE properties_in_entities ( 56 | -- ID of property describing relationships between entities. 57 | property_id TEXT NOT NULL, 58 | -- ID of source entity. 59 | from_entity_id TEXT NOT NULL, 60 | -- ID of destination entity. 61 | to_entity_id TEXT NOT NULL, 62 | PRIMARY KEY (property_id, from_entity_id, to_entity_id), 63 | FOREIGN KEY(from_entity_id) REFERENCES entities(id), 64 | FOREIGN KEY(to_entity_id) REFERENCES entities(id) 65 | ); 66 | CREATE INDEX idx_properties_in_entities 67 | ON properties_in_entities (property_id); 68 | 69 | CREATE TABLE aliases_for_entities ( 70 | -- Alias for entity label. 71 | alias TEXT NOT NULL, 72 | -- Equivalent to Wikidata QID. 73 | entity_id TEXT NOT NULL, 74 | -- Count of alias occurence in Wiki articles. 
75 | count INTEGER, 76 | PRIMARY KEY (alias, entity_id), 77 | FOREIGN KEY(entity_id) REFERENCES entities(id) 78 | ); 79 | CREATE INDEX idx_aliases_for_entities_alias 80 | ON aliases_for_entities (alias); 81 | CREATE INDEX idx_aliases_for_entities_entity_id 82 | ON aliases_for_entities (entity_id); -------------------------------------------------------------------------------- /scripts/wiki/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Utility for robustly downloading large files (i.e. retrying on dropped connections). 3 | # Source: https://superuser.com/a/689340 4 | 5 | while [ 1 ]; do 6 | wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 0 --continue $1 7 | if [ $? = 0 ]; then break; fi; # check return value, break if successful (0) 8 | sleep 1s; 9 | done 10 | -------------------------------------------------------------------------------- /scripts/wiki/namespaces.py: -------------------------------------------------------------------------------- 1 | """ Information on Wiki namespaces. 2 | Source: https://github.com/explosion/projects/blob/master/nel-wikipedia/wiki_namespaces.py. 3 | """ 4 | 5 | # List of meta pages in Wikidata, should be kept out of the Knowledge base 6 | WD_META_ITEMS = [ 7 | "Q163875", 8 | "Q191780", 9 | "Q224414", 10 | "Q4167836", 11 | "Q4167410", 12 | "Q4663903", 13 | "Q11266439", 14 | "Q13406463", 15 | "Q15407973", 16 | "Q18616576", 17 | "Q19887878", 18 | "Q22808320", 19 | "Q23894233", 20 | "Q33120876", 21 | "Q42104522", 22 | "Q47460393", 23 | "Q64875536", 24 | "Q66480449", 25 | ] 26 | 27 | 28 | # TODO: add more cases from non-English WP's 29 | 30 | # List of prefixes that refer to Wikipedia "file" pages 31 | WP_FILE_NAMESPACE = ["Bestand", "File"] 32 | 33 | # List of prefixes that refer to Wikipedia "category" pages 34 | WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"] 35 | 36 | # List of prefixes that refer to Wikipedia "meta" pages 37 | # these will/should be matched ignoring case 38 | WP_META_NAMESPACE = ( 39 | WP_FILE_NAMESPACE 40 | + WP_CATEGORY_NAMESPACE 41 | + [ 42 | "b", 43 | "betawikiversity", 44 | "Book", 45 | "c", 46 | "Commons", 47 | "d", 48 | "dbdump", 49 | "download", 50 | "Draft", 51 | "Education", 52 | "Foundation", 53 | "Gadget", 54 | "Gadget definition", 55 | "Gebruiker", 56 | "gerrit", 57 | "Help", 58 | "Image", 59 | "Incubator", 60 | "m", 61 | "mail", 62 | "mailarchive", 63 | "media", 64 | "MediaWiki", 65 | "MediaWiki talk", 66 | "Mediawikiwiki", 67 | "MediaZilla", 68 | "Meta", 69 | "Metawikipedia", 70 | "Module", 71 | "mw", 72 | "n", 73 | "nost", 74 | "oldwikisource", 75 | "otrs", 76 | "OTRSwiki", 77 | "Overleg gebruiker", 78 | "outreach", 79 | "outreachwiki", 80 | "Portal", 81 | "phab", 82 | "Phabricator", 83 | "Project", 84 | "q", 85 | "quality", 86 | "rev", 87 | "s", 88 | "spcom", 89 | "Special", 90 | "species", 91 | "Strategy", 92 | "sulutil", 93 | "svn", 94 | "Talk", 95 | "Template", 96 | "Template talk", 97 | "Testwiki", 98 | "ticket", 99 | "TimedText", 100 | "Toollabs", 101 | "tools", 102 | "tswiki", 103 | "User", 104 | "User talk", 105 | "v", 106 | "voy", 107 | "w", 108 | "Wikibooks", 109 | "Wikidata", 110 | "wikiHow", 111 | "Wikinvest", 112 | "wikilivres", 113 | "Wikimedia", 114 | "Wikinews", 115 | "Wikipedia", 116 | "Wikipedia talk", 117 | "Wikiquote", 118 | "Wikisource", 119 | "Wikispecies", 120 | "Wikitech", 121 | "Wikiversity", 122 | "Wikivoyage", 123 | "wikt", 124 | "wiktionary", 125 | "wmf", 126 | "wmania", 127 | "WP", 128 
| ] 129 | ) 130 | -------------------------------------------------------------------------------- /scripts/wiki/schemas.py: -------------------------------------------------------------------------------- 1 | """ Schemas for types used in this project. """ 2 | 3 | from typing import Set, Optional 4 | 5 | from pydantic.fields import Field 6 | from pydantic.main import BaseModel 7 | from pydantic.types import StrictInt 8 | 9 | 10 | class Entity(BaseModel): 11 | """Schema for single entity.""" 12 | 13 | qid: str = Field(..., title="Wiki QID.") 14 | name: str = Field(..., title="Entity name.") 15 | aliases: Set[str] = Field(..., title="All found aliases.") 16 | count: StrictInt = Field(0, title="Count in Wiki corpus.") 17 | description: Optional[str] = Field(None, title="Full description.") 18 | article_title: Optional[str] = Field(None, title="Article title.") 19 | article_text: Optional[str] = Field(None, title="Article text.") 20 | 21 | 22 | class Annotation(BaseModel): 23 | """Schema for single annotation.""" 24 | 25 | entity_name: str = Field(..., title="Entity name.") 26 | entity_id: Optional[str] = Field(None, title="Entity ID.") 27 | start_pos: StrictInt = Field(..., title="Start character position.") 28 | end_pos: StrictInt = Field(..., title="End character position.") 29 | -------------------------------------------------------------------------------- /scripts/wiki/wikidata.py: -------------------------------------------------------------------------------- 1 | """ Functionalities for processing Wikidata dump. 2 | Modified from https://github.com/explosion/projects/blob/master/nel-wikipedia/wikidata_processor.py. 3 | """ 4 | 5 | import bz2 6 | import io 7 | import json 8 | 9 | from pathlib import Path 10 | from typing import Union, Optional, Dict, Tuple, Any, List, Set, Iterator 11 | 12 | import tqdm 13 | 14 | from .compat import sqlite3 15 | from .namespaces import WD_META_ITEMS 16 | 17 | 18 | def chunked_readlines( 19 | f: bz2.BZ2File, chunk_size: int = 1024 * 1024 * 32 20 | ) -> Iterator[bytes]: 21 | """Reads lines from compressed BZ2 file in chunks. Source: https://stackoverflow.com/a/65765814. 22 | chunk_size (int): Chunk size in bytes. 23 | RETURNS (Iterator[bytes]): Read bytes. 24 | """ 25 | s = io.BytesIO() 26 | while True: 27 | buf = f.read(chunk_size) 28 | if not buf: 29 | return s.getvalue() 30 | s.write(buf) 31 | s.seek(0) 32 | l = s.readlines() 33 | yield from l[:-1] 34 | s = io.BytesIO() 35 | # very important: the last line read in the 1 MB chunk might be 36 | # incomplete, so we keep it to be processed in the next iteration 37 | # check if this is ok if f.read() stopped in the middle of a \r\n? 38 | s.write(l[-1]) 39 | 40 | 41 | def read_entities( 42 | wikidata_file: Union[str, Path], 43 | db_conn: sqlite3.Connection, 44 | batch_size: int = 5000, 45 | limit: Optional[int] = None, 46 | lang: str = "en", 47 | parse_descr: bool = True, 48 | parse_properties: bool = True, 49 | parse_sitelinks: bool = True, 50 | parse_labels: bool = True, 51 | parse_aliases: bool = True, 52 | parse_claims: bool = True, 53 | ) -> None: 54 | """Reads entity information from wikidata dump. 55 | wikidata_file (Union[str, Path]): Path of wikidata dump file. 56 | db_conn (sqlite3.Connection): DB connection. 57 | batch_size (int): Batch size for DB commits. 58 | limit (Optional[int]): Max. number of entities to parse. 59 | to_print (bool): Whether to print information during the parsing process. 60 | lang (str): Language with which to filter entity information. 
61 | parse_descr (bool): Whether to parse entity descriptions. 62 | parse_properties (bool): Whether to parse entity properties. 63 | parse_sitelinks (bool): Whether to parse entity sitelinks. 64 | parse_labels (bool): Whether to parse entity labels. 65 | parse_aliases (bool): Whether to parse entity aliases. 66 | parse_claims (bool): Whether to parse entity claims. 67 | """ 68 | 69 | # Read the JSON wiki data and parse out the entities. Takes about 7-10h to parse 55M lines. 70 | # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ 71 | 72 | site_filter = "{}wiki".format(lang) 73 | 74 | # filter: currently defined as OR: one hit suffices to be removed from further processing 75 | exclude_list = WD_META_ITEMS 76 | 77 | # punctuation 78 | exclude_list.extend(["Q1383557", "Q10617810"]) 79 | 80 | # letters etc 81 | exclude_list.extend( 82 | ["Q188725", "Q19776628", "Q3841820", "Q17907810", "Q9788", "Q9398093"] 83 | ) 84 | 85 | neg_prop_filter = { 86 | "P31": exclude_list, # instance of 87 | "P279": exclude_list, # subclass 88 | } 89 | 90 | entity_ids_in_db: Set[str] = { 91 | rec["id"] for rec in db_conn.cursor().execute("SELECT id FROM entities") 92 | } 93 | title_to_id: Dict[str, str] = {} 94 | id_to_attrs: Dict[str, Dict[str, Any]] = {} 95 | 96 | with bz2.open(wikidata_file, mode="rb") as file: 97 | pbar_params = {"total": limit} if limit else {} 98 | 99 | with tqdm.tqdm( 100 | desc="Parsing entity data", leave=True, miniters=1000, **pbar_params 101 | ) as pbar: 102 | for cnt, line in enumerate(file): 103 | if limit and cnt >= limit: 104 | break 105 | 106 | clean_line = line.strip() 107 | if clean_line.endswith(b","): 108 | clean_line = clean_line[:-1] 109 | 110 | if len(clean_line) > 1: 111 | obj = json.loads(clean_line) 112 | if obj.get("id") in entity_ids_in_db: 113 | pbar.update(1) 114 | continue 115 | entry_type = obj["type"] 116 | 117 | if entry_type == "item": 118 | keep = True 119 | 120 | claims = obj["claims"] 121 | filtered_claims: List[Dict[str, str]] = [] 122 | if parse_claims: 123 | for prop, value_set in neg_prop_filter.items(): 124 | claim_property = claims.get(prop, None) 125 | if claim_property: 126 | filtered_claims.append(claim_property) 127 | for cp in claim_property: 128 | cp_id = ( 129 | cp["mainsnak"] 130 | .get("datavalue", {}) 131 | .get("value", {}) 132 | .get("id") 133 | ) 134 | cp_rank = cp["rank"] 135 | if ( 136 | cp_rank != "deprecated" 137 | and cp_id in value_set 138 | ): 139 | keep = False 140 | 141 | if keep: 142 | unique_id = obj["id"] 143 | if unique_id not in id_to_attrs: 144 | id_to_attrs[unique_id] = {} 145 | if parse_claims: 146 | id_to_attrs[unique_id]["claims"] = filtered_claims 147 | 148 | # parsing all properties that refer to other entities 149 | if parse_properties: 150 | id_to_attrs[unique_id]["properties"] = [] 151 | for prop, claim_property in claims.items(): 152 | cp_dicts = [ 153 | cp["mainsnak"]["datavalue"].get("value") 154 | for cp in claim_property 155 | if cp["mainsnak"].get("datavalue") 156 | ] 157 | cp_values = [ 158 | cp_dict.get("id") 159 | for cp_dict in cp_dicts 160 | if isinstance(cp_dict, dict) 161 | if cp_dict.get("id") is not None 162 | ] 163 | if cp_values: 164 | id_to_attrs[unique_id]["properties"].append( 165 | (prop, cp_values) 166 | ) 167 | 168 | found_link = False 169 | if parse_sitelinks: 170 | site_value = obj["sitelinks"].get(site_filter, None) 171 | if site_value: 172 | site = site_value["title"] 173 | title_to_id[site] = unique_id 174 | found_link = True 175 | 
id_to_attrs[unique_id]["sitelinks"] = site_value 176 | 177 | if parse_labels: 178 | labels = obj["labels"] 179 | if labels: 180 | lang_label = labels.get(lang, None) 181 | if lang_label: 182 | id_to_attrs[unique_id]["labels"] = lang_label 183 | 184 | if found_link and parse_descr: 185 | descriptions = obj["descriptions"] 186 | if descriptions: 187 | lang_descr = descriptions.get(lang, None) 188 | if lang_descr: 189 | id_to_attrs[unique_id][ 190 | "description" 191 | ] = lang_descr["value"] 192 | 193 | if parse_aliases: 194 | id_to_attrs[unique_id]["aliases"] = [] 195 | aliases = obj["aliases"] 196 | if aliases: 197 | lang_aliases = aliases.get(lang, None) 198 | if lang_aliases: 199 | for item in lang_aliases: 200 | id_to_attrs[unique_id]["aliases"].append( 201 | item["value"] 202 | ) 203 | 204 | pbar.update(1) 205 | 206 | # Save batch. 207 | if pbar.n % batch_size == 0: 208 | _write_to_db(db_conn, title_to_id, id_to_attrs) 209 | title_to_id = {} 210 | id_to_attrs = {} 211 | 212 | if pbar.n % batch_size != 0: 213 | _write_to_db(db_conn, title_to_id, id_to_attrs) 214 | 215 | 216 | def _write_to_db( 217 | db_conn: sqlite3.Connection, 218 | title_to_id: Dict[str, str], 219 | id_to_attrs: Dict[str, Dict[str, Any]], 220 | ) -> None: 221 | """Persists entity information to database. 222 | db_conn (Connection): Database connection. 223 | title_to_id (Dict[str, str]): Titles to QIDs. 224 | id_to_attrs (Dict[str, Dict[str, Any]]): For QID a dictionary with property name to property value(s). 225 | """ 226 | 227 | entities: List[Tuple[Optional[str], ...]] = [] 228 | entities_texts: List[Tuple[Optional[str], ...]] = [] 229 | props_in_ents: Set[Tuple[str, str, str]] = set() 230 | aliases_for_entities: List[Tuple[str, str, int]] = [] 231 | 232 | for title, qid in title_to_id.items(): 233 | entities.append((qid, json.dumps(id_to_attrs[qid]["claims"]))) 234 | entities_texts.append( 235 | ( 236 | qid, 237 | title, 238 | id_to_attrs[qid].get("description", None), 239 | id_to_attrs[qid].get("labels", {}).get("value", None), 240 | ) 241 | ) 242 | for alias in id_to_attrs[qid]["aliases"]: 243 | aliases_for_entities.append((alias, qid, 1)) 244 | 245 | for prop in id_to_attrs[qid]["properties"]: 246 | for second_qid in prop[1]: 247 | props_in_ents.add((prop[0], qid, second_qid)) 248 | 249 | cur = db_conn.cursor() 250 | cur.executemany( 251 | "INSERT INTO entities (id, claims) VALUES (?, ?)", 252 | entities, 253 | ) 254 | cur.executemany( 255 | "INSERT INTO entities_texts (entity_id, name, description, label) VALUES (?, ?, ?, ?)", 256 | entities_texts, 257 | ) 258 | cur.executemany( 259 | "INSERT INTO properties_in_entities (property_id, from_entity_id, to_entity_id) VALUES (?, ?, ?)", 260 | props_in_ents, 261 | ) 262 | cur.executemany( 263 | """ 264 | INSERT INTO aliases_for_entities (alias, entity_id, count) VALUES (?, ?, ?) 265 | ON CONFLICT (alias, entity_id) DO UPDATE SET 266 | count=count + excluded.count 267 | """, 268 | aliases_for_entities, 269 | ) 270 | db_conn.commit() 271 | 272 | 273 | def extract_demo_dump( 274 | in_dump_path: Path, out_dump_path: Path, filter_terms: Set[str] 275 | ) -> Tuple[Set[str], Set[str]]: 276 | """Writes information on those entities having at least one of the filter_terms in their description to a new dump 277 | at location filtered_dump_path. 278 | in_dump_path (Path): Path to complete Wikidata dump. 279 | out_dump_path (Path): Path to filtered Wikidata dump. 280 | filter_terms (Set[str]): Terms having to appear in entity descriptions in order to be included in output dump. 
281 | RETURNS (Tuple[Set[str], Set[str]]): For retained entities: (1) set of QIDs, (2) set of labels (should match article 282 | titles). 283 | """ 284 | 285 | entity_ids: Set[str] = set() 286 | entity_labels: Set[str] = set() 287 | filter_terms = {ft.lower() for ft in filter_terms} 288 | 289 | with bz2.open(in_dump_path, mode="rb") as in_file: 290 | with bz2.open(out_dump_path, mode="wb") as out_file: 291 | write_count = 0 292 | with tqdm.tqdm( 293 | desc="Filtering entity data", leave=True, miniters=100 294 | ) as pbar: 295 | for cnt, line in enumerate(in_file): 296 | keep = cnt == 0 297 | 298 | if not keep: 299 | clean_line = line.strip() 300 | if clean_line.endswith(b","): 301 | clean_line = clean_line[:-1] 302 | if len(clean_line) > 1: 303 | keep = any( 304 | [ 305 | ft in clean_line.decode("utf-8").lower() 306 | for ft in filter_terms 307 | ] 308 | ) 309 | if keep: 310 | obj = json.loads(clean_line) 311 | label = obj["labels"].get("en", {}).get("value", "") 312 | entity_ids.add(obj["id"]) 313 | entity_labels.add(label) 314 | 315 | if keep: 316 | out_file.write(line) 317 | write_count += 1 318 | 319 | pbar.update(1) 320 | 321 | return entity_ids, entity_labels 322 | -------------------------------------------------------------------------------- /scripts/wiki/wikipedia.py: -------------------------------------------------------------------------------- 1 | """ Functionalities for processing Wikipedia dump. 2 | Modified from https://github.com/explosion/projects/blob/master/nel-wikipedia/wikipedia_processor.py. 3 | """ 4 | 5 | import re 6 | import bz2 7 | 8 | from pathlib import Path 9 | from typing import Union, Optional, Tuple, List, Dict, Set, Any 10 | 11 | import tqdm 12 | import yaml 13 | 14 | from .compat import sqlite3 15 | from .namespaces import ( 16 | WP_META_NAMESPACE, 17 | WP_FILE_NAMESPACE, 18 | WP_CATEGORY_NAMESPACE, 19 | ) 20 | 21 | """ 22 | Process a Wikipedia dump to calculate entity_title frequencies and prior probabilities in combination with certain mentions. 23 | Write these results to file for downstream KB and training data generation. 24 | 25 | Process Wikipedia interlinks to generate a training dataset for the EL algorithm. 26 | """ 27 | 28 | map_alias_to_link = dict() 29 | 30 | title_regex = re.compile(r"(?<=).*(?=)") 31 | id_regex = re.compile(r"(?<=)\d*(?=)") 32 | text_tag_regex = re.compile(r"(?<=)") 33 | text_regex = re.compile(r"(?<=).*(?= Dict[str, str]: 63 | """ 64 | Read entity title to ID map from database. 65 | db_conn (sqlite3.Connection): DB connection. 66 | RETURNS (Dict[str, str]): Map from entity title to ID. 67 | """ 68 | 69 | return { 70 | row["name"]: row["id"] 71 | for row in db_conn.execute( 72 | """ 73 | SELECT 74 | et.name, e.id 75 | FROM 76 | entities e 77 | INNER JOIN entities_texts et ON 78 | et.ROWID = e.ROWID 79 | """ 80 | ) 81 | } 82 | 83 | 84 | def read_prior_probs( 85 | wikipedia_input_path: Union[str, Path], 86 | db_conn: sqlite3.Connection, 87 | batch_size: int = 5000, 88 | limit: Optional[int] = None, 89 | ) -> None: 90 | """ 91 | Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities. 92 | The full file takes about 2-3h to parse 1100M lines. Writes prior information to DB. 93 | It works relatively fast because it runs line by line, irrelevant of which article the intrawiki is from. 94 | wikipedia_input_path (Union[str, Path]): Path to Wikipedia dump. 95 | batch_size (int): DB batch size. 96 | db_conn (sqlite3.Connection): Database connection. 
97 | n_article_limit (Optional[int]): Number of articles/entities to process. 98 | """ 99 | 100 | read_id = False 101 | current_article_id = None 102 | entity_title_to_id = _read_entity_title_id_map(db_conn) 103 | 104 | def write_to_db(_aliases_for_entities) -> None: 105 | """Writes record triples to DB. 106 | __aliases_for_entities (): alias-entity-frequency triples. 107 | """ 108 | db_conn.cursor().executemany( 109 | """ 110 | INSERT INTO aliases_for_entities (alias, entity_id, count) VALUES (?, ?, ?) 111 | ON CONFLICT (alias, entity_id) DO UPDATE SET 112 | count=count + excluded.count 113 | """, 114 | _aliases_for_entities, 115 | ) 116 | db_conn.commit() 117 | 118 | with bz2.open(wikipedia_input_path, mode="rb") as file: 119 | pbar_params = {"total": limit} if limit else {} 120 | with tqdm.tqdm( 121 | desc="Parsing alias-entity prior probabilities", **pbar_params 122 | ) as pbar: 123 | line = file.readline() 124 | while line and (not limit or pbar.n < limit): 125 | clean_line = line.strip().decode("utf-8") 126 | 127 | # we attempt at reading the article's ID (but not the revision or contributor ID) 128 | if "" in clean_line or "" in clean_line: 129 | read_id = False 130 | if "" in clean_line: 131 | read_id = True 132 | 133 | if read_id: 134 | ids = id_regex.search(clean_line) 135 | if ids: 136 | current_article_id = ids[0] 137 | 138 | # only processing prior probabilities from true training (non-dev) articles 139 | if not is_dev(current_article_id): 140 | aliases, entities, normalizations = _get_wp_links(clean_line) 141 | for alias, entity_title, norm in zip( 142 | aliases, entities, normalizations 143 | ): 144 | _store_alias( 145 | alias, 146 | entity_title, 147 | normalize_alias=norm, 148 | normalize_entity=True, 149 | ) 150 | 151 | line = file.readline() 152 | pbar.update(1) 153 | 154 | # write all aliases and their entities and count occurrences to file 155 | with tqdm.tqdm( 156 | desc="Persisting alias-entity prior probabilities", total=len(map_alias_to_link) 157 | ) as pbar: 158 | aliases_for_entities: List[Tuple[str, str, int]] = [] 159 | for alias, alias_dict in map_alias_to_link.items(): 160 | for entity_title, count in alias_dict.items(): 161 | if entity_title in entity_title_to_id: 162 | aliases_for_entities.append( 163 | (alias, entity_title_to_id[entity_title], count) 164 | ) 165 | if pbar.n % batch_size == 0: 166 | write_to_db(aliases_for_entities) 167 | aliases_for_entities = [] 168 | 169 | pbar.update(1) 170 | 171 | if pbar.n % batch_size != 0: 172 | write_to_db(aliases_for_entities) 173 | 174 | 175 | def _store_alias( 176 | alias: str, 177 | entity_title: str, 178 | normalize_alias: bool = False, 179 | normalize_entity: bool = True, 180 | ) -> None: 181 | """Stores (normalized) alias for (normalized) entity_title ID in mapping dictionaries. 182 | alias (str): Alias text. 183 | entity_title (str): Entity title. 184 | normalize_alias (bool): Whether to normalize the alias text, i.e. remove anchors. 185 | normalize_entity (bool): Whether to normalize the entity title. 
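    Example (illustrative): `_store_alias("NYC", "New York City#History")` records the alias "NYC" for the
    normalized entity title "New York City".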
186 | """ 187 | alias = alias.strip() 188 | entity_title = entity_title.strip() 189 | 190 | # remove everything after # as this is not part of the title but refers to a specific paragraph 191 | if normalize_entity: 192 | # wikipedia titles are always capitalized 193 | entity_title = _capitalize_first(entity_title.split("#")[0]) 194 | if normalize_alias: 195 | alias = alias.split("#")[0] 196 | 197 | if alias and entity_title: 198 | alias_dict = map_alias_to_link.get(alias, dict()) 199 | entity_count = alias_dict.get(entity_title, 0) 200 | alias_dict[entity_title] = entity_count + 1 201 | map_alias_to_link[alias] = alias_dict 202 | 203 | 204 | def _get_wp_links(text: str) -> Tuple[List[str], List[str], List[bool]]: 205 | """Retrieve interwiki links from text. 206 | text (str): Text to parse. 207 | RETURNS (Tuple[List[str], List[str], List[bool]]): List of aliases, entity titles, and whether normalization they 208 | were normalized. 209 | """ 210 | aliases: List[str] = [] 211 | entities: List[str] = [] 212 | normalizations: List[bool] = [] 213 | 214 | matches = link_regex.findall(text) 215 | for match in matches: 216 | match = match[2:][:-2].replace("_", " ").strip() 217 | 218 | if ns_regex.match(match): 219 | pass # ignore the entity_title if it points to a "meta" page 220 | 221 | # this is a simple [[link]], with the alias the same as the mention 222 | elif "|" not in match: 223 | aliases.append(match) 224 | entities.append(match) 225 | normalizations.append(True) 226 | 227 | # in wiki format, the link is written as [[entity_title|alias]] 228 | else: 229 | splits = match.split("|") 230 | entity = splits[0].strip() 231 | alias = splits[1].strip() 232 | # specific wiki format [[alias (specification)|]] 233 | if len(alias) == 0 and "(" in entity: 234 | alias = entity.split("(")[0] 235 | aliases.append(alias) 236 | entities.append(entity) 237 | normalizations.append(False) 238 | else: 239 | aliases.append(alias) 240 | entities.append(entity) 241 | normalizations.append(False) 242 | 243 | return aliases, entities, normalizations 244 | 245 | 246 | def _capitalize_first(text: str) -> Optional[str]: 247 | """Capitalize first character. 248 | text (str): String in which to capitalize first character. 249 | RETURN (Optional[str]): Text with first character capitalized. 250 | """ 251 | if not text: 252 | return None 253 | result = text[0].capitalize() 254 | if len(result) > 0: 255 | result += text[1:] 256 | return result 257 | 258 | 259 | def read_texts( 260 | wikipedia_input_path: Union[str, Path], 261 | db_conn: sqlite3.Connection, 262 | batch_size: int = 10000, 263 | limit: Optional[int] = None, 264 | n_char_limit: int = 1000, 265 | lang: str = "en", 266 | ) -> None: 267 | """ 268 | Read the XML Wikipedia data to parse out clean article texts. Texts are stored in file. 269 | wikipedia_input_path (Union[str, Path]): Path to Wikipedia dump. 270 | db_conn (sqlite3.Connection): DB connection. 271 | limit (Optional[int]): Max. number of articles to process. If None, all are processed. 272 | n_char_limit (Optional[int]): Max. number of characters to process per article. 273 | lang (str): Language with which to filter entity information. 274 | """ 275 | read_ids: Set[str] = set() 276 | entity_title_to_id = _read_entity_title_id_map(db_conn) 277 | article_records: List[Tuple[str, str]] = [] 278 | article_texts_records: List[Tuple[str, str, str]] = [] 279 | # Fetch IDs of entities whose articles are already in the DB. 
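    # These IDs are checked further below so that articles already ingested by a previous (partial) run are
    # not inserted again.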
280 | article_ids_in_db: Set[str] = { 281 | rec["id"] for rec in db_conn.cursor().execute("SELECT id FROM articles") 282 | } 283 | 284 | def write_to_db( 285 | _article_records: List[Tuple[str, str]], 286 | _article_text_records: List[Tuple[str, str, str]], 287 | ) -> None: 288 | """Writes records to list. 289 | _article_records (List[Tuple[str, str]]): `articles`entries with entity ID, ID. 290 | _article_texts_records (List[Tuple[str, str, str]]): `articles_texts` entries with entity ID, title, content. 291 | """ 292 | db_conn.cursor().executemany( 293 | "INSERT INTO articles (entity_id, id) VALUES (?, ?)", 294 | _article_records, 295 | ) 296 | db_conn.cursor().executemany( 297 | "INSERT INTO articles_texts (entity_id, title, content) VALUES (?, ?, ?)", 298 | _article_text_records, 299 | ) 300 | db_conn.commit() 301 | 302 | with bz2.open(wikipedia_input_path, mode="rb") as file: 303 | pbar_params = {"total": limit} if limit else {} 304 | with tqdm.tqdm( 305 | desc="Parsing article texts", miniters=1000, **pbar_params 306 | ) as pbar: 307 | n_articles = 0 308 | n_viable_articles = 0 309 | article_text = "" 310 | article_title: Optional[str] = None 311 | article_id: Optional[str] = None 312 | reading_text = False 313 | reading_revision = False 314 | # Terms in article indicating it should be skipped (for redirects and disambiguation pages). 315 | # Note: checks for redirection/disambiguation articles are not language-agnostic. Porting this to the 316 | # generalized extraction needs to consider that. 317 | with open( 318 | Path(__file__).parent.parent.parent / "configs" / "skip_terms.yaml", "r" 319 | ) as stream: 320 | skip_terms = set(yaml.safe_load(stream)[lang]) 321 | skip_article = False 322 | 323 | for line in file: 324 | if limit and pbar.n >= limit: 325 | break 326 | 327 | clean_line = line.strip().decode("utf-8") 328 | 329 | # Check if article is to be skipped. 330 | cl_lower = clean_line.lower() 331 | for skip_term in skip_terms: 332 | if skip_term in cl_lower: 333 | skip_article = True 334 | break 335 | 336 | # Skip to next line if article is to be skipped. 
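                # Lines of a skipped article are consumed until its page ends; the page-end branch below
                # resets skip_article for the next article.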
337 |                 if skip_article and clean_line != "</page>":
338 |                     continue
339 | 
340 |                 if clean_line == "<revision>":
341 |                     reading_revision = True
342 |                 elif clean_line == "</revision>":
343 |                     reading_revision = False
344 | 
345 |                 # Start reading new page
346 |                 if clean_line == "<page>":
347 |                     n_articles += 1
348 |                     article_text = ""
349 |                     article_title = None
350 |                     article_id = None
351 | 
352 |                 # finished reading this page
353 |                 elif clean_line == "</page>":
354 |                     if article_id and article_id not in article_ids_in_db:
355 |                         clean_text, entities = _process_wp_text(
356 |                             article_title, article_text, entity_title_to_id
357 |                         )
358 |                         if clean_text is not None:
359 |                             n_viable_articles += 1
360 |                             if article_title in entity_title_to_id:
361 |                                 text_to_append = clean_text[:n_char_limit]
362 |                                 for (to_replace, replacement) in (
363 |                                     ("(;", " "),
364 |                                     ("(,", " "),
365 |                                     (" ; ", " "),
366 |                                     (" , ", ""),
367 |                                     ("()", ""),
368 |                                 ):
369 |                                     text_to_append = text_to_append.replace(
370 |                                         to_replace, replacement
371 |                                     )
372 | 
373 |                                 article_records.append(
374 |                                     (entity_title_to_id[article_title], article_id)
375 |                                 )
376 |                                 article_texts_records.append(
377 |                                     (
378 |                                         entity_title_to_id[article_title],
379 |                                         article_title,
380 |                                         " ".join(text_to_append.split(" ")[:-1]),
381 |                                     )
382 |                                 )
383 |                                 pbar.update(1)
384 | 
385 |                     if pbar.n % batch_size == 0:
386 |                         write_to_db(article_records, article_texts_records)
387 |                         article_records = []
388 |                         article_texts_records = []
389 | 
390 |                     article_text = ""
391 |                     article_title = None
392 |                     article_id = None
393 |                     reading_text = False
394 |                     reading_revision = False
395 |                     skip_article = False
396 | 
397 |                 # start reading text within a page
398 |                 if "<text" in clean_line:
399 |                     reading_text = True
400 | 
401 |                 if reading_text:
402 |                     article_text += " " + clean_line
403 | 
404 |                 # stop reading text within a page
405 |                 if "</text" in clean_line:
406 |                     reading_text = False
407 | 
408 |                 # read the ID of this article (outside the revision portion of the document)
409 |                 if not reading_revision:
410 |                     ids = re.search(r"(?<=<id>)\d+(?=</id>)", clean_line)
411 |                     if ids:
412 |                         article_id = ids[0]
413 |                         if article_id in read_ids:
414 |                             print(
415 |                                 f"Found duplicate article ID {article_id} - ignoring repeated occurrence."
416 |                             )
417 |                         read_ids.add(article_id)
418 | 
419 |                 # read the title of this article (outside the revision portion of the document)
420 |                 if not reading_revision:
421 |                     titles = title_regex.search(clean_line)
422 |                     if titles:
423 |                         article_title = titles[0].strip()
424 | 
425 |     # Write any remaining records that didn't fill a complete batch and report
426 |     # how many articles were parsed.
427 |     write_to_db(article_records, article_texts_records)
428 |     print(
429 |         f"Parsed {n_articles} articles in total, {n_viable_articles} of which were viable."
430 |     )
431 | 
432 | 
433 | def extract_demo_dump(
434 |     in_dump_path: Path,
435 |     out_dump_path: Path,
436 |     entity_titles: Set[str],
437 | ) -> None:
438 |     """Writes only those articles whose titles appear in entity_titles to a new, filtered dump
439 |     at location out_dump_path.
440 |     in_dump_path (Path): Path to complete Wikipedia dump.
441 |     out_dump_path (Path): Path to filtered Wikipedia dump.
442 |     entity_titles (Set[str]): Entity/article titles to include.
443 |     """
444 | 
445 |     with bz2.open(in_dump_path, mode="rb") as in_file:
446 |         with bz2.open(out_dump_path, mode="wb") as out_file:
447 |             with tqdm.tqdm(
448 |                 desc="Filtering article texts", miniters=1, total=len(entity_titles)
449 |             ) as pbar:
450 |                 reading_revision = False
451 |                 line_cache: List[bytes] = []
452 | 
453 |                 for line in in_file:
454 |                     clean_line = line.strip().decode("utf-8")
455 | 
456 |                     if clean_line == "<revision>":
457 |                         reading_revision = True
458 |                     elif clean_line == "</revision>":
459 |                         reading_revision = False
460 | 
461 |                     # Start reading new page
462 |                     if clean_line == "<page>":
463 |                         line_cache = [line]
464 |                         article_title = None
465 | 
466 |                     else:
467 |                         line_cache.append(line)
468 | 
469 |                         # finished reading this page
470 |                         if clean_line == "</page>":
471 |                             if article_title and article_title in entity_titles:
472 |                                 out_file.writelines(line_cache)
473 |                                 line_cache = []
474 |                                 pbar.update(1)
475 | 
476 |                             article_title = None
477 |                             reading_revision = False
478 | 
479 |                     # read the title of this article (outside the revision portion of the document)
480 |                     if not reading_revision:
481 |                         titles = title_regex.search(clean_line)
482 |                         if titles:
483 |                             article_title = titles[0].strip()
484 | 
485 | 
486 | def _process_wp_text(
487 |     article_title: str, article_text: str, entity_title_to_id: Dict[str, str]
488 | ) -> Tuple[Optional[str], Optional[List[Tuple[str, Any, int, int]]]]:
489 |     """Process article text.
490 |     article_title (str): Article title.
491 |     article_text (str): Article text.
492 |     entity_title_to_id (Dict[str, str]): Map for entity/article titles to their IDs.
493 |     RETURNS (Tuple[Optional[str], Optional[List[Tuple[str, Any, int, int]]]]): Cleaned text and list of entities in
494 |     article text.
495 |     """
496 |     # ignore meta Wikipedia pages
497 |     if ns_regex.match(article_title):
498 |         return None, None
499 | 
500 |     # remove the text tags
501 |     text_search = text_tag_regex.sub("", article_text)
502 |     text_search = text_regex.search(text_search)
503 |     if text_search is None:
504 |         return None, None
505 |     text = text_search.group(0)
506 | 
507 |     # stop processing if this is a redirect page
508 |     if text.startswith("#REDIRECT"):
509 |         return None, None
510 | 
511 |     # get the raw text without markup etc, keeping only interwiki links
512 |     return _remove_links(_get_clean_wp_text(text), entity_title_to_id)
513 | 
514 | 
515 | def _get_clean_wp_text(article_text: str) -> str:
516 |     """Cleans article text.
517 |     article_text (str): Text to clean.
518 |     RETURNS (str): Cleaned text.
519 |     """
520 |     clean_text = article_text.strip()
521 | 
522 |     # remove bolding & italic markup
523 |     clean_text = clean_text.replace("'''", "")
524 |     clean_text = clean_text.replace("''", "")
525 | 
526 |     # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
527 |     try_again = True
528 |     previous_length = len(clean_text)
529 |     while try_again:
530 |         clean_text = info_regex.sub(
531 |             "", clean_text
532 |         )  # non-greedy match excluding a nested {
533 |         if len(clean_text) < previous_length:
534 |             try_again = True
535 |         else:
536 |             try_again = False
537 |         previous_length = len(clean_text)
538 | 
539 |     # remove HTML comments
540 |     clean_text = html_regex.sub("", clean_text)
541 | 
542 |     # remove Category and File statements
543 |     clean_text = category_regex.sub("", clean_text)
544 |     clean_text = file_regex.sub("", clean_text)
545 | 
546 |     # remove multiple =
547 |     while "==" in clean_text:
548 |         clean_text = clean_text.replace("==", "=")
549 | 
550 |     clean_text = clean_text.replace(". =", ".")
551 |     clean_text = clean_text.replace(" = ", ". ")
552 |     clean_text = clean_text.replace("= ", ".")
553 |     clean_text = clean_text.replace(" =", "")
554 | 
555 |     # remove refs (non-greedy match)
556 |     clean_text = ref_regex.sub("", clean_text)
557 |     clean_text = ref_2_regex.sub("", clean_text)
558 | 
559 |     # remove additional wikiformatting
560 |     clean_text = re.sub(r"&lt;blockquote&gt;", "", clean_text)
561 |     clean_text = re.sub(r"&lt;/blockquote&gt;", "", clean_text)
562 | 
563 |     # change escaped special characters back to normal ones
564 |     clean_text = clean_text.replace(r"&lt;", "<")
565 |     clean_text = clean_text.replace(r"&gt;", ">")
566 |     clean_text = clean_text.replace(r"&quot;", '"')
567 |     clean_text = clean_text.replace(r"&amp;nbsp;", " ")
568 |     clean_text = clean_text.replace(r"&amp;", "&")
569 | 
570 |     # collapse multiple spaces into one
571 |     while "  " in clean_text:
572 |         clean_text = clean_text.replace("  ", " ")
573 | 
574 |     return clean_text.strip()
575 | 
576 | 
577 | def _remove_links(
578 |     clean_text: str, entity_title_to_id: Dict[str, str]
579 | ) -> Tuple[Optional[str], Optional[List[Tuple[str, Any, int, int]]]]:
580 |     """Remove links from clean text.
581 |     clean_text (str): Cleaned article text.
582 |     entity_title_to_id (Dict[str, str]): Map for entity/article titles to their IDs.
583 |     RETURNS (Tuple[Optional[str], Optional[List[Tuple[str, Any, int, int]]]]): Cleaned text without links, information
584 |     on entities in text.
585 | """ 586 | # read the text char by char to get the right offsets for the interwiki links 587 | entities = [] 588 | final_text = "" 589 | open_read = 0 590 | reading_text = True 591 | reading_entity = False 592 | reading_mention = False 593 | reading_special_case = False 594 | entity_buffer = "" 595 | mention_buffer = "" 596 | for index, letter in enumerate(clean_text): 597 | if letter == "[": 598 | open_read += 1 599 | elif letter == "]": 600 | open_read -= 1 601 | elif letter == "|": 602 | if reading_text: 603 | final_text += letter 604 | # switch from reading entity_title to mention in the [[entity_title|mention]] pattern 605 | elif reading_entity: 606 | reading_text = False 607 | reading_entity = False 608 | reading_mention = True 609 | else: 610 | reading_special_case = True 611 | else: 612 | if reading_entity: 613 | entity_buffer += letter 614 | elif reading_mention: 615 | mention_buffer += letter 616 | elif reading_text: 617 | final_text += letter 618 | else: 619 | raise ValueError("Not sure at point", clean_text[index - 2 : index + 2]) 620 | 621 | if open_read > 2: 622 | reading_special_case = True 623 | 624 | if open_read == 2 and reading_text: 625 | reading_text = False 626 | reading_entity = True 627 | reading_mention = False 628 | 629 | # we just finished reading an entity_title 630 | if open_read == 0 and not reading_text: 631 | if "#" in entity_buffer or entity_buffer.startswith(":"): 632 | reading_special_case = True 633 | # Ignore cases with nested structures like File: handles etc 634 | if not reading_special_case: 635 | if not mention_buffer: 636 | mention_buffer = entity_buffer 637 | start = len(final_text) 638 | end = start + len(mention_buffer) 639 | qid = entity_title_to_id.get(entity_buffer, None) 640 | if qid: 641 | entities.append((mention_buffer, qid, start, end)) 642 | final_text += mention_buffer 643 | 644 | entity_buffer = "" 645 | mention_buffer = "" 646 | 647 | reading_text = True 648 | reading_entity = False 649 | reading_mention = False 650 | reading_special_case = False 651 | 652 | return final_text, entities 653 | 654 | 655 | def is_dev(article_id: str) -> bool: 656 | """Checks whether article is dev article. 657 | article_id (str): Article ID. 658 | RETURNS (bool): Whether article is dev article. 659 | """ 660 | if not article_id: 661 | return False 662 | return article_id.endswith("3") 663 | 664 | 665 | def is_valid_article(doc_text: str) -> bool: 666 | """Checks whether article is valid. 667 | doc_text (str): Article text to check. 668 | RETURNS (bool): Whether article text is valid. 669 | """ 670 | # custom length cut-off 671 | return 10 < len(doc_text) < 30000 672 | 673 | 674 | def is_valid_sentence(sent_text: str) -> bool: 675 | """Checks whether sentence is valid. 676 | sent_text (str): Sentence to check. 677 | RETURNS (bool): Whether sentence is valid. 
678 | """ 679 | if not 10 < len(sent_text) < 3000: 680 | # custom length cut-off 681 | return False 682 | 683 | if sent_text.strip().startswith("*") or sent_text.strip().startswith("#"): 684 | # remove 'enumeration' sentences (occurs often on Wikipedia) 685 | return False 686 | 687 | return True 688 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description = Generate a SQLite database from Wikipedia & Wikidata dumps 3 | url = https://spacy.io 4 | author = Explosion 5 | author_email = contact@explosion.ai 6 | license = MIT 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | project_urls = 10 | Source = https://github.com/explosion/wikid 11 | 12 | [options] 13 | zip_safe = false 14 | include_package_data = true 15 | python_requires = >=3.7 16 | setup_requires = 17 | pytest 18 | pre-commit 19 | install_requires = 20 | spacy 21 | pyyaml 22 | tqdm 23 | prettytable 24 | 25 | [flake8] 26 | ignore = E203, E266, E501, E731, W503, E741, F541, W605 27 | max-line-length = 80 28 | select = B,C,E,F,W,T4,B9 29 | exclude = 30 | .env, 31 | .git, 32 | __pycache__, 33 | _tokenizer_exceptions_list.py, 34 | 35 | [mypy] 36 | ignore_missing_imports = True 37 | no_implicit_optional = True 38 | allow_redefinition = True 39 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name="wikid", 5 | version="0.0.1", 6 | description="Wiki dump extraction for spaCy", 7 | url="https://www.github.com/explosion/wikid/", 8 | packages=["scripts", "scripts.wiki"], 9 | ) 10 | -------------------------------------------------------------------------------- /test_wikid.py: -------------------------------------------------------------------------------- 1 | """ Testing all project steps. """ 2 | import pytest 3 | from pathlib import Path 4 | import sys 5 | from spacy.cli.project.run import project_run 6 | from spacy.cli.project.assets import project_assets 7 | 8 | 9 | @pytest.mark.skipif( 10 | sys.platform == "win32", 11 | reason="Skipping on Windows (for now) due to platform-specific scripts.", 12 | ) 13 | def test_wikid(): 14 | root = Path(__file__).parent 15 | project_assets(root) 16 | project_run(root, "parse", capture=True) 17 | project_run(root, "download_model", capture=True) 18 | project_run(root, "create_kb", capture=True) 19 | --------------------------------------------------------------------------------