├── .gitignore
├── LICENSE
├── README.md
├── requirements.txt
├── scripts
│   ├── download-wikipedia
│   └── sift-notebook
├── setup.py
├── sift.ipynb
└── sift
    ├── __init__.py
    ├── build.py
    ├── corpora
    │   ├── __init__.py
    │   ├── commoncrawl.py
    │   ├── redirects.py
    │   ├── wikicorpus.py
    │   ├── wikidata.py
    │   └── wikipedia.py
    ├── dataset.py
    ├── format.py
    ├── logging.py
    ├── models
    │   ├── __init__.py
    │   ├── embeddings.py
    │   ├── links.py
    │   └── text.py
    └── util.py
/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | output/
3 |
4 | ##########
5 | # PYTHON #
6 | ##########
7 | # Initialized from github
8 | # https://github.com/github/gitignore/blob/master/Python.gitignore
9 |
10 | # Byte-compiled / optimized / DLL files
11 | __pycache__/
12 | *.py[cod]
13 |
14 | # C extensions
15 | *.so
16 |
17 | # Distribution / packaging
18 | bin/
19 | build/
20 | develop-eggs/
21 | dist/
22 | eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | ve
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | .tox/
39 | .coverage
40 | .cache
41 | nosetests.xml
42 | coverage.xml
43 |
44 | # Translations
45 | *.mo
46 |
47 | # Mr Developer
48 | .mr.developer.cfg
49 | .project
50 | .pydevproject
51 |
52 | # Rope
53 | .ropeproject
54 |
55 | # Django stuff:
56 | *.log
57 | *.pot
58 |
59 | # Sphinx documentation
60 | docs/_build/
61 |
62 | # Package
63 | MANIFEST
64 |
65 | #########
66 | # EMACS #
67 | #########
68 | # Initialized from github
69 | # https://raw2.github.com/github/gitignore/master/Global/Emacs.gitignore
70 |
71 | # -*- mode: gitignore; -*-
72 | *~
73 | \#*\#
74 | /.emacs.desktop
75 | /.emacs.desktop.lock
76 | *.elc
77 | auto-save-list
78 | tramp
79 | .\#*
80 |
81 | # Org-mode
82 | .org-id-locations
83 | *_archive
84 |
85 | # flymake-mode
86 | *_flymake.*
87 |
88 | # eshell files
89 | /eshell/history
90 | /eshell/lastdir
91 |
92 | # elpa packages
93 | /elpa/
94 |
95 | #######
96 | # VIM #
97 | #######
98 | # Initialized from github
99 | # https://raw2.github.com/github/gitignore/master/Global/vim.gitignore
100 |
101 | [._]*.s[a-w][a-z]
102 | [._]s[a-w][a-z]
103 | *.un~
104 | Session.vim
105 | .netrwhist
106 | *~
107 |
108 | #######
109 | # OSX #
110 | #######
111 | # Initialized from github
112 | # https://raw2.github.com/github/gitignore/master/Global/OSX.gitignore
113 |
114 | .DS_Store
115 | .AppleDouble
116 | .LSOverride
117 |
118 | # Icon must end with two \r.
119 | Icon
120 |
121 | # Thumbnails
122 | ._*
123 |
124 | # Files that might appear on external disk
125 | .Spotlight-V100
126 | .Trashes
127 |
128 | # NFS
129 | .nfs*
130 |
131 | # IDE
132 | *.sublime-*
133 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2014. Andrew Chisholm, Ben Hachey, The University of Sydney.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | sift - Knowledge extraction from web data
2 | ================================================
3 |
4 | __sift__ is a toolkit for extracting models of entities and text from a corpus of linked documents.
5 |
6 |
7 | ## What can it do?
8 |
9 | __sift__ is written in python, runs on Spark and is completely modular.
10 |
11 | Out of the box, you can:
12 |
13 | - Convert Wikipedia articles into JSON objects without all the MediaWiki cruft
14 | - Extract entity relations from Wikidata and align them with Wikipedia mentions
15 | - Extract plain-text content from HTML and WARC encoded web page crawls
16 | - Model entity popularity, alternative names and relatedness using inlinks
17 | - Preprocess text documents for machine learning pipelines
18 | - Push output into datastores like MongoDB and Redis
19 |
20 | ## Quick Start
21 |
22 | ### Install
23 | ```bash
24 | pip install git+http://git@github.com/wikilinks/sift.git
25 | ```
26 |
27 | ## Getting Started
28 |
29 | To use sift, you'll need some data.
30 |
31 | If you'd like to use Wikipedia data, sift includes a helper script for downloading the latest dumps.
32 |
33 | Download the latest partitioned Wikipedia dump into the 'latest' directory.
34 | ```bash
35 | download-wikipedia latest
36 | ```
37 |
38 | Once you've got some data, take a look at the sample notebook: [sift.ipynb](sift.ipynb).
39 |
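If you prefer to drive the pipeline from a script rather than the notebook, the sketch below mirrors the first few cells of [sift.ipynb](sift.ipynb). The dump path is just an example; point it at wherever `download-wikipedia` placed the partitions.

```python
import findspark
findspark.init()  # locate Spark via SPARK_HOME (or a local download)
import pyspark

from sift.corpora import wikipedia

sc = pyspark.SparkContext()

# parse raw dump partitions into page records, then into clean article documents
corpus = wikipedia.WikipediaCorpus()(sc, 'latest/')
redirects = wikipedia.WikipediaRedirects()(corpus)
docs = wikipedia.WikipediaArticles()(corpus, redirects)

print(docs.take(1))
```
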
40 | ## Spark
41 |
42 | __sift__ uses Spark to process corpora in parallel.
43 |
44 | If you'd like to make use of an existing Spark cluster, ensure the `SPARK_HOME` environment variable is set.
45 |
46 | If not, that's fine. `sift` will prompt you to download and run Spark locally, utilising multiple cores on your system.
47 |
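For reference, the bundled `sift-notebook` script falls back to a `local[*]` master when no cluster is configured. A minimal sketch of doing the same by hand (the Spark path is illustrative):

```python
import findspark

# Point findspark at an existing install if SPARK_HOME isn't set;
# with no argument it falls back to the SPARK_HOME environment variable.
findspark.init('/path/to/spark')  # example path

import pyspark
sc = pyspark.SparkContext(master='local[*]')  # use all local cores
```
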
48 | ## Datasets
49 |
50 | [Web KB](https://github.com/andychisholm/web-kb) datasets built from Common Crawl data are available in a public S3 bucket: [s3.amazonaws.com/webkb](https://s3.amazonaws.com/webkb/)
51 |
52 | - `docs-2017` is built from news articles under the [CC-NEWS](http://commoncrawl.org/2016/10/news-dataset-available/) collection from January to June 2017 ([sample](https://s3.amazonaws.com/webkb/docs-2017/part-00000))
53 | - `web-201707` is built from a full web crawl for [July 2017](http://commoncrawl.org/2017/07/july-2017-crawl-archive-now-available/) filtered to English language pages ([sample](https://s3.amazonaws.com/webkb/web-201707/part-00000.gz))
54 |
55 | The web collection contains plain-text content, entity mentions and endpoint annotations extracted from 1.5 billion documents with over 4 billion web links.
56 | Data is encoded in a simple one-JSON-blob-per-line structure.
57 |
58 | For example, the first document in the collection is an article from 2012 describing an [upcoming tour by Nicki Minaj](http://1019ampradio.cbslocal.com/2012/11/06/nicki-minaj-promises-man-bits-on-her-upcoming-tour/):
59 |
60 | ```json
61 | {
62 | "_id": "http://1019ampradio.cbslocal.com/2012/11/06/nicki-minaj-promises-man-bits-on-her-upcoming-tour/",
63 | "text": "Nicki Minaj has had quite the year. Currently in the U.K. on her Reloaded Tour she sat down with London DJ Tim Westwood and her U.K. Barbz for a Q & A session. While Nicki took questions from both Westwood and her fans one answer in particular caused the room to pay attention...",
64 | "links":[{
65 | "start": 0,
66 | "endpoint": 0.6358972797,
67 | "stop": 11,
68 | "target": "http://1019ampradio.cbslocal.com/tag/nicki-minaj"
69 | }, {
70 | "start": 145,
71 | "endpoint": 0.2769776554,
72 | "stop": 160,
73 | "target": "http://www.youtube.com/watch?v=vnyuhDBcQo0"
74 | }],
75 | "mentions":[{
76 | "start": 0,
77 | "stop": 11,
78 | "label": "PERSON"
79 | }, {
80 | "start": 53,
81 | "stop": 57,
82 | "label": "GPE"
83 | },
84 | // truncated
85 | }
86 | ```
87 |
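Each part file is newline-delimited JSON (gzip-compressed for the full web crawl), so it can be streamed with standard tooling. A minimal sketch, assuming a local copy of the sample part file linked above:

```python
import gzip
import ujson as json  # the standard json module works too

# Stream documents from a locally downloaded part file (path is an example).
with gzip.open('part-00000.gz') as f:
    for line in f:
        doc = json.loads(line)
        print(doc['_id'], len(doc.get('links', [])), len(doc.get('mentions', [])))
```
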
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | ujson
2 | numpy
3 | scipy
4 | pattern
5 | gensim
6 | msgpack-python
7 | findspark
8 | jupyter
9 | spacy
10 | lxml
11 | beautifulsoup4
12 | warc
13 | pycld2
14 | dragnet
--------------------------------------------------------------------------------
/scripts/download-wikipedia:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | WKDATE=latest
5 | PARA_ARG=""
6 |
7 | if [ $# -gt 0 ]
8 | then
9 | WKDATE=$1
10 | if [ $# == 2 ]
11 | then
12 | PARA_ARG="-P $2"
13 | fi
14 | fi
15 |
16 | if [ "$WKDATE" == "latest" ]; then
17 | export LINK_PFX=/enwiki/latest/
18 | fi
19 |
20 | export WKDIR=$WKDATE
21 | export WKDATE
22 | rm -rf $WKDIR
23 | mkdir -p $WKDIR
24 |
25 | curl "https://dumps.wikimedia.org/enwiki/$WKDATE/" |\
26 | grep "enwiki-$WKDATE-pages-articles[0-9]*.xml-p[0-9]*p[0-9]*.bz2\""|\
27 | awk -v pfx=$LINK_PFX -F'"' '{print "https://dumps.wikimedia.org" pfx $2}' |\
28 | xargs -n1 $PARA_ARG -L 1 bash -c 'wget $0 -P $WKDIR'
29 |
--------------------------------------------------------------------------------
/scripts/sift-notebook:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | SPARK_URL="http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz"
5 |
6 | if [ -z "$SPARK_HOME" ]; then
7 | echo "SPARK_HOME is unset, using local Spark deployment..."
8 | if [ ! -d "spark" ]; then
9 | read -p "Would you like to download spark and run in standalone mode? " -n 1 -r
10 | if [[ ! $REPLY =~ ^[Yy]$ ]]
11 | then
12 | exit 1
13 | fi
14 | echo
15 | echo "Downloading spark for local standalone deployment..."
16 | mkdir spark
17 | curl $SPARK_URL | tar zx -C spark --strip-components=1
18 |
19 | echo "Updating spark logger config..."
20 | pushd spark/conf > /dev/null
21 | sed -e 's/log4j.rootCategory=INFO/log4j.rootCategory=WARN/' log4j.properties.template > log4j.properties
22 | popd > /dev/null
23 | fi
24 | export SPARK_HOME=$(pwd)/spark
25 | if [ -z "$SPARK_MASTER" ]; then
26 | SPARK_MASTER=local[*]
27 | fi
28 | fi
29 |
30 | if [ ! -z "$VIRTUAL_ENV" ]; then
31 | export PYSPARK_PYTHON=$VIRTUAL_ENV/bin/python
32 | else
33 | export PYSPARK_PYTHON=$(pwd)/ve/bin/python
34 | fi
35 |
36 | SPARK_MASTER_SW=
37 | if [ ! -z "$SPARK_MASTER" ]; then
38 | SPARK_MASTER_SW="--master $SPARK_MASTER"
39 | fi
40 |
41 | PYTHONPATH=$PYTHONPATH:$(pwd) jupyter notebook "$@"
42 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | __version__ = '0.3.0'
4 | __pkg_name__ = 'textsift'
5 |
6 | setup(
7 | name = __pkg_name__,
8 | version = __version__,
9 | description = 'Text modelling framework',
10 | author='Andrew Chisholm',
11 | packages = find_packages(),
12 | license = 'MIT',
13 | url = 'https://github.com/wikilinks/sift',
14 | scripts = [
15 | 'scripts/sift-notebook',
16 | 'scripts/download-wikipedia'
17 | ],
18 | classifiers=[
19 | 'Development Status :: 4 - Beta',
20 | 'Environment :: Console',
21 | 'Intended Audience :: Science/Research',
22 | 'License :: OSI Approved :: MIT License',
23 | 'Programming Language :: Python :: 2.7',
24 | 'Topic :: Text Processing :: Linguistic'
25 | ],
26 | install_requires = [
27 | "ujson",
28 | "numpy",
29 | "pattern",
30 | "gensim",
31 | "msgpack-python",
32 | "beautifulsoup4",
33 | "spacy",
34 | "warc",
35 | "pycld2",
36 | "scipy",
37 | "scikit-learn"
38 | ],
39 | test_suite = __pkg_name__ + '.test'
40 | )
41 |
--------------------------------------------------------------------------------
/sift.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import findspark\n",
12 | "findspark.init()\n",
13 | "import pyspark\n",
14 | "sc = pyspark.SparkContext()"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 36,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "from sift.corpora import wikipedia\n",
26 | "from sift.models import text, links"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 5,
32 | "metadata": {
33 | "collapsed": true
34 | },
35 | "outputs": [],
36 | "source": [
37 | "base_path = '/data/wikipedia/20151002/'"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 6,
43 | "metadata": {
44 | "collapsed": true
45 | },
46 | "outputs": [],
47 | "source": [
48 | "corpus = wikipedia.WikipediaCorpus()(sc, base_path + 'dump')"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 9,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [],
58 | "source": [
59 | "redirects = wikipedia.WikipediaRedirects()(corpus)\n",
60 | "docs = wikipedia.WikipediaArticles()(corpus, redirects)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 27,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "[{'_id': 'en.wikipedia.org/wiki/Causa_y_Efecto',\n",
74 | " 'links': [{'start': 986,\n",
75 | " 'stop': 999,\n",
76 | " 'target': u'en.wikipedia.org/wiki/New_York_City'},\n",
77 | " {'start': 93, 'stop': 105, 'target': u'en.wikipedia.org/wiki/Studio_album'},\n",
78 | " {'start': 971, 'stop': 982, 'target': u'en.wikipedia.org/wiki/Gotham_Hall'},\n",
79 | " {'start': 2178,\n",
80 | " 'stop': 2192,\n",
81 | " 'target': u'en.wikipedia.org/wiki/Miami,_Florida'},\n",
82 | " {'start': 1791,\n",
83 | " 'stop': 1808,\n",
84 | " 'target': u'en.wikipedia.org/wiki/Latin_Pop_Airplay'},\n",
85 | " {'start': 2702,\n",
86 | " 'stop': 2719,\n",
87 | " 'target': u'en.wikipedia.org/wiki/Latin_Pop_Airplay'},\n",
88 | " {'start': 465,\n",
89 | " 'stop': 484,\n",
90 | " 'target': u'en.wikipedia.org/wiki/Ni_Una_Sola_Palabra'},\n",
91 | " {'start': 2122, 'stop': 2129, 'target': u'en.wikipedia.org/wiki/Austria'},\n",
92 | " {'start': 2740,\n",
93 | " 'stop': 2760,\n",
94 | " 'target': u'en.wikipedia.org/wiki/Latin_Rhythm_Airplay'},\n",
95 | " {'start': 106,\n",
96 | " 'stop': 119,\n",
97 | " 'target': u'en.wikipedia.org/wiki/Gran_City_Pop'},\n",
98 | " {'start': 2388, 'stop': 2397, 'target': u'en.wikipedia.org/wiki/Reggaeton'},\n",
99 | " {'start': 2069,\n",
100 | " 'stop': 2080,\n",
101 | " 'target': u'en.wikipedia.org/wiki/Music_video'},\n",
102 | " {'start': 2530, 'stop': 2534, 'target': u'en.wikipedia.org/wiki/U.S.'},\n",
103 | " {'start': 2573, 'stop': 2577, 'target': u'en.wikipedia.org/wiki/U.S.'},\n",
104 | " {'start': 2612, 'stop': 2616, 'target': u'en.wikipedia.org/wiki/U.S.'},\n",
105 | " {'start': 2649, 'stop': 2653, 'target': u'en.wikipedia.org/wiki/U.S.'},\n",
106 | " {'start': 2685, 'stop': 2689, 'target': u'en.wikipedia.org/wiki/U.S.'},\n",
107 | " {'start': 2723, 'stop': 2727, 'target': u'en.wikipedia.org/wiki/U.S.'},\n",
108 | " {'start': 1509,\n",
109 | " 'stop': 1524,\n",
110 | " 'target': u'en.wikipedia.org/wiki/Hot_Latin_Songs'},\n",
111 | " {'start': 2666,\n",
112 | " 'stop': 2681,\n",
113 | " 'target': u'en.wikipedia.org/wiki/Hot_Latin_Songs'},\n",
114 | " {'start': 1391,\n",
115 | " 'stop': 1400,\n",
116 | " 'target': u'en.wikipedia.org/wiki/Causa_y_Efecto'},\n",
117 | " {'start': 41, 'stop': 54, 'target': u'en.wikipedia.org/wiki/Paulina_Rubio'},\n",
118 | " {'start': 821,\n",
119 | " 'stop': 854,\n",
120 | " 'target': u'en.wikipedia.org/wiki/2009_Latin_Billboard_Music_Awards'},\n",
121 | " {'start': 2402,\n",
122 | " 'stop': 2415,\n",
123 | " 'target': u'en.wikipedia.org/wiki/Angel_&_Khriz'},\n",
124 | " {'start': 2775,\n",
125 | " 'stop': 2827,\n",
126 | " 'target': u'en.wikipedia.org/wiki/List_of_number-one_Billboard_Hot_Latin_Songs_of_2009'},\n",
127 | " {'start': 2547,\n",
128 | " 'stop': 2569,\n",
129 | " 'target': u'en.wikipedia.org/wiki/Bubbling_Under_Hot_100'},\n",
130 | " {'start': 2536,\n",
131 | " 'stop': 2545,\n",
132 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n",
133 | " {'start': 2579,\n",
134 | " 'stop': 2588,\n",
135 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n",
136 | " {'start': 2618,\n",
137 | " 'stop': 2627,\n",
138 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n",
139 | " {'start': 2655,\n",
140 | " 'stop': 2664,\n",
141 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n",
142 | " {'start': 2691,\n",
143 | " 'stop': 2700,\n",
144 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n",
145 | " {'start': 2729,\n",
146 | " 'stop': 2738,\n",
147 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n",
148 | " {'start': 2304, 'stop': 2307, 'target': u'en.wikipedia.org/wiki/MTV'}],\n",
149 | " 'text': u'\"\" is a song performed by Mexican singer Paulina Rubio. The song was recorded for her ninth studio album Gran City Pop, and was released as the lead single on March 30, 2009. Causa y Efecto became a hit reaching number 1 in the U.S. Billboard Hot Latin Songs and Hot Latin Airplays. Causa y Efecto was produced by Cachorro L\\xf3pez and written by Mario Domm and M\\xf3nica V\\xe9lez. \"Causa y Efecto\" is Rubio\\'s first number one single in the Billboard Hot Latin Songs since Ni Una Sola Palabra in 2006. \"Causa y Efecto\" was awarded \"Song of the year pop/ballad\" by ASCAP.\\n\\nAn English version of the song titled \"Cause and Effect\" will be released on Paulina\\'s next bilingual album.\\n\\nRelease and promotion\\n\"Causa y Efecto\", debuted in radio stations on March 30. The song was performed for the first time on April 23, 2009 at the Latin Billboard Music Awards 2009, Rubio was one of the most expected artists of the night. Rubio also performed the song in a private concert at the Gotham Hall in New York City on May 11 promoting Gran City Pop. The concert was presented by Univision Radio.\\n\\nRubio performed the song at the Wal-Mart Shareholders\\' Meeting, singing a \"spanglish\" version of the song. She performed parts of the Spanish version and others of an unreleased English version. An English version of the song was released on a remix of the song, the George Figares Radio Mix.\\n\\nTrack listing\\n*CD Single\\n# \"Causa y Efecto\" Album version - 3:27\\n\\nChart performance\\nThe song debuted at number 40 on the \"Billboard\" Hot Latin Songs, the next week the song jumped at #26, obtaining the highest \\'jump\\' of the week on the chart. The song peaked #1 for five consecutive weeks and it\\'s her fourth #1 and her twelfth top ten hit on Hot Latin songs. It debuted at #22 and peaked at #1 on the \"Billboard\" Latin Pop Airplay.\\n\\nIn Spain, the single debuted at #43, and has peaked at #7 based on downloads alone. On May 9, 2009, \"Causa y Efecto\" entered at the Spanish Airplay Chart at #7 as the highest debut of that week, and peaked at #1 for three consecutive weeks.\\n\\nMusic video\\nThe music video for \"Causa y Efecto\" was directed by the Austrian director Rudi Dolezal. The video was filmed in Miami, Florida during the month of March at M3 Studios. The video premiered worldwide on May 7 and in the U.S on the channel MTV Tres.\\n\\nRemix\\nAn official remix of the song was released on June 12. It features reggaeton duo Angel & Khriz.\\n\\nCharts\\n\\nChart (2009)\\nPeakposition\\n\\nMexico (Monitor Latino)\\n1\\n\\nSpanish Airplay Chart\\n1\\n\\nSpanish Singles Chart\\n7\\n\\nU.S. \"Billboard\" Bubbling Under Hot 100\\n4\\n\\nU.S. \"Billboard\" Heatseeker Songs \\n23\\n\\nU.S. \"Billboard\" Tropical Songs \\n29\\n\\nU.S. \"Billboard\" Hot Latin Songs\\n1\\n\\nU.S. \"Billboard\" Latin Pop Airplay\\n1\\n\\nU.S. \"Billboard\" Latin Rhythm Airplay\\n6\\n\\n\\nSee also\\n*List of number-one Billboard Hot Latin Songs of 2009\\n\\nSales and certifications\\n\\n\\n Country\\n Certification\\n Sales\\n\\n Spain\\n Platinum\\n 40,000\\n\\n\\nReferences'}]"
150 | ]
151 | },
152 | "execution_count": 27,
153 | "metadata": {},
154 | "output_type": "execute_result"
155 | }
156 | ],
157 | "source": [
158 | "docs.take(1)"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 25,
164 | "metadata": {
165 | "collapsed": false
166 | },
167 | "outputs": [],
168 | "source": [
169 | "vocab = text.TermVocab(max_rank=100000,min_rank=0,lowercase=True,min_df=5)(docs.sample(False, 0.25))"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 26,
175 | "metadata": {
176 | "collapsed": false
177 | },
178 | "outputs": [
179 | {
180 | "data": {
181 | "text/plain": [
182 | "[{'_id': u'the', 'count': 1172125, 'rank': 0},\n",
183 | " {'_id': u'in', 'count': 1135557, 'rank': 1},\n",
184 | " {'_id': u'a', 'count': 1127366, 'rank': 2},\n",
185 | " {'_id': u'of', 'count': 1101586, 'rank': 3},\n",
186 | " {'_id': u'and', 'count': 1008637, 'rank': 4},\n",
187 | " {'_id': u'is', 'count': 997703, 'rank': 5},\n",
188 | " {'_id': u'references', 'count': 958549, 'rank': 6},\n",
189 | " {'_id': u'to', 'count': 889253, 'rank': 7},\n",
190 | " {'_id': u'was', 'count': 804122, 'rank': 8},\n",
191 | " {'_id': u'for', 'count': 725355, 'rank': 9}]"
192 | ]
193 | },
194 | "execution_count": 26,
195 | "metadata": {},
196 | "output_type": "execute_result"
197 | }
198 | ],
199 | "source": [
200 | "vocab.take(10)"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 34,
206 | "metadata": {
207 | "collapsed": false
208 | },
209 | "outputs": [
210 | {
211 | "data": {
212 | "text/plain": [
213 | "[{'_id': u'en.wikipedia.org/wiki/New_York_City',\n",
214 | " 'source': 'en.wikipedia.org/wiki/Causa_y_Efecto',\n",
215 | " 'span': (73, 86),\n",
216 | " 'text': u'rubio also performed the song in a private concert at the gotham hall in new york city on may 11 promoting gran city pop.'}]"
217 | ]
218 | },
219 | "execution_count": 34,
220 | "metadata": {},
221 | "output_type": "execute_result"
222 | }
223 | ],
224 | "source": [
225 | "text.EntityMentions(sentence_window=1,lowercase=True)(docs).take(1)"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 31,
231 | "metadata": {
232 | "collapsed": true
233 | },
234 | "outputs": [],
235 | "source": [
236 | "mentions = text.IndexMappedMentions(sentence_window=1,lowercase=True)(sc, docs, vocab)"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 32,
242 | "metadata": {
243 | "collapsed": false
244 | },
245 | "outputs": [
246 | {
247 | "data": {
248 | "text/plain": [
249 | "[{'_id': u'en.wikipedia.org/wiki/New_York_City',\n",
250 | " 'source': 'en.wikipedia.org/wiki/Causa_y_Efecto',\n",
251 | " 'span': (14, 17),\n",
252 | " 'text': [25961,\n",
253 | " 18,\n",
254 | " 686,\n",
255 | " 0,\n",
256 | " 447,\n",
257 | " 1,\n",
258 | " 2,\n",
259 | " 574,\n",
260 | " 2057,\n",
261 | " 13,\n",
262 | " 0,\n",
263 | " 21394,\n",
264 | " 518,\n",
265 | " 1,\n",
266 | " 35,\n",
267 | " 227,\n",
268 | " 98,\n",
269 | " 11,\n",
270 | " 46,\n",
271 | " 205,\n",
272 | " 3585,\n",
273 | " 9860,\n",
274 | " 98,\n",
275 | " 1770]}]"
276 | ]
277 | },
278 | "execution_count": 32,
279 | "metadata": {},
280 | "output_type": "execute_result"
281 | }
282 | ],
283 | "source": [
284 | "mentions.take(1)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 42,
290 | "metadata": {
291 | "collapsed": false
292 | },
293 | "outputs": [
294 | {
295 | "data": {
296 | "text/plain": [
297 | "[{'_id': u'', 'count': 4},\n",
298 | " {'_id': u'www.rsssf.com/tabless/slow97.html', 'count': 2},\n",
299 | " {'_id': u'en.wikipedia.org/wiki/Yuba,_Michigan', 'count': 3},\n",
300 | " {'_id': u'en.wikipedia.org/wiki/Walnut_River_(Kansas)', 'count': 12},\n",
301 | " {'_id': u'www.ctheritage.org/encyclopedia/ct1865_1929/admin_baldwin.htm',\n",
302 | " 'count': 2},\n",
303 | " {'_id': u'en.wikipedia.org/wiki/Falling_factorial', 'count': 28},\n",
304 | " {'_id': u'en.wikipedia.org/wiki/WOW_Worship:_Blue', 'count': 5},\n",
305 | " {'_id': u'en.wikipedia.org/wiki/Ekhane_Pinjar', 'count': 2},\n",
306 | " {'_id': u'en.wikipedia.org/wiki/Conditional_execution', 'count': 2},\n",
307 | " {'_id': u'en.wikipedia.org/wiki/Paralititan', 'count': 27}]"
308 | ]
309 | },
310 | "execution_count": 42,
311 | "metadata": {},
312 | "output_type": "execute_result"
313 | }
314 | ],
315 | "source": [
316 | "links.EntityCounts()(docs).take(10)"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 49,
322 | "metadata": {
323 | "collapsed": false
324 | },
325 | "outputs": [
326 | {
327 | "data": {
328 | "text/plain": [
329 | "[{'_id': u'XMT',\n",
330 | " 'counts': {u'en.wikipedia.org/wiki/Cray_XMT': 1,\n",
331 | " u'en.wikipedia.org/wiki/Extensible_MPEG-4_Textual_Format': 1,\n",
332 | " u'en.wikipedia.org/wiki/XMT': 1},\n",
333 | " 'total': 3},\n",
334 | " {'_id': u'New York',\n",
335 | " 'counts': {u'en.wikipedia.org/wiki/New_York_(disambiguation)': 1,\n",
336 | " u'en.wikipedia.org/wiki/New_York_Disability_Benefits_Law': 1,\n",
337 | " u'en.wikipedia.org/wiki/New_York_State_Armory_(Poughkeepsie)': 1,\n",
338 | " u'en.wikipedia.org/wiki/Outline_of_New_York': 1,\n",
339 | " u'en.wikipedia.org/wiki/Vehicle_registration_plates_of_New_York': 1},\n",
340 | " 'total': 5},\n",
341 | " {'_id': u'Albert Lewis',\n",
342 | " 'counts': {u'en.wikipedia.org/wiki/Albert_Gerald_Lewis': 1,\n",
343 | " u'en.wikipedia.org/wiki/Albert_Lewis': 1,\n",
344 | " u'en.wikipedia.org/wiki/Albert_Lewis_(American_football)': 27,\n",
345 | " u'en.wikipedia.org/wiki/Albert_Lewis_(Sheffield_United)': 1,\n",
346 | " u'en.wikipedia.org/wiki/Albert_Lewis_(footballer)': 3,\n",
347 | " u'en.wikipedia.org/wiki/Albert_Lewis_(priest)': 2,\n",
348 | " u'en.wikipedia.org/wiki/Albert_Lewis_(producer)': 5,\n",
349 | " u'en.wikipedia.org/wiki/Talbot_Lewis': 1},\n",
350 | " 'total': 41},\n",
351 | " {'_id': u'WFA website',\n",
352 | " 'counts': {u'wfafootball.com': 1,\n",
353 | " u'www.wfafootball.com': 1,\n",
354 | " u'www.wfafootball.com/': 2},\n",
355 | " 'total': 4},\n",
356 | " {'_id': u'Marlboro British F3 Championship round 3',\n",
357 | " 'counts': {u'en.wikipedia.org/wiki/1981_Marlboro_British_F3_Championship,_Rd.3': 1,\n",
358 | " u'en.wikipedia.org/wiki/1982_Marlboro_British_F3_Championship,_Rd.3': 1,\n",
359 | " u'en.wikipedia.org/wiki/1983_Marlboro_British_F3_Championship,_Rd.3': 1,\n",
360 | " u'en.wikipedia.org/wiki/1984_Marlboro_British_F3_Championship,_Rd.3': 1,\n",
361 | " u'en.wikipedia.org/wiki/1985_Marlboro_British_F3_Championship,_Rd.3': 1},\n",
362 | " 'total': 5}]"
363 | ]
364 | },
365 | "execution_count": 49,
366 | "metadata": {},
367 | "output_type": "execute_result"
368 | }
369 | ],
370 | "source": [
371 | "links\\\n",
372 | " .EntityNameCounts()(docs)\\\n",
373 | " .filter(lambda r: len(r['counts']) >= 3)\\\n",
374 | " .take(5)"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": null,
380 | "metadata": {
381 | "collapsed": true
382 | },
383 | "outputs": [],
384 | "source": []
385 | }
386 | ],
387 | "metadata": {
388 | "kernelspec": {
389 | "display_name": "Python 2",
390 | "language": "python",
391 | "name": "python2"
392 | },
393 | "language_info": {
394 | "codemirror_mode": {
395 | "name": "ipython",
396 | "version": 2
397 | },
398 | "file_extension": ".py",
399 | "mimetype": "text/x-python",
400 | "name": "python",
401 | "nbconvert_exporter": "python",
402 | "pygments_lexer": "ipython2",
403 | "version": "2.7.6"
404 | }
405 | },
406 | "nbformat": 4,
407 | "nbformat_minor": 0
408 | }
409 |
--------------------------------------------------------------------------------
/sift/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.2.0'
--------------------------------------------------------------------------------
/sift/build.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import shutil
4 | import textwrap
5 | import argparse
6 | import ujson as json
7 |
8 | from pyspark import SparkContext, SparkConf
9 | from sift.format import ModelFormat
10 |
11 | import logging
12 | log = logging.getLogger()
13 |
14 | class DatasetBuilder(object):
15 | """ Wrapper for modules which extract models of entities or text from a corpus of linked documents """
16 | def __init__(self, **kwargs):
17 | self.output_path = kwargs.pop('output_path')
18 | self.sample = kwargs.pop('sample')
19 |
20 | fmtcls = kwargs.pop('fmtcls')
21 | fmt_args = {p:kwargs[p] for p in fmtcls.__init__.__code__.co_varnames if p in kwargs}
22 | self.formatter = fmtcls(**fmt_args)
23 |
24 | modelcls = kwargs.pop('modelcls')
25 | self.model_name = re.sub('([A-Z])', r' \1', modelcls.__name__).strip()
26 |
27 | log.info("Building %s...", self.model_name)
28 | self.model = modelcls(**kwargs)
29 |
30 | def __call__(self):
31 | c = SparkConf().setAppName('Build %s' % self.model_name)
32 |
33 | log.info('Using spark master: %s', c.get('spark.master'))
34 | sc = SparkContext(conf=c)
35 |
36 | kwargs = self.model.prepare(sc)
37 | m = self.model.build(**kwargs)
38 | m = self.model.format_items(m)
39 | m = self.formatter(m)
40 |
41 | if self.output_path:
42 | log.info("Saving to: %s", self.output_path)
43 | if os.path.isdir(self.output_path):
44 | log.warn('Writing over output path: %s', self.output_path)
45 | shutil.rmtree(self.output_path)
46 | m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')
47 | elif self.sample > 0:
48 | print '\n'.join(str(i) for i in m.take(self.sample))
49 |
50 | log.info('Done.')
51 |
52 | @classmethod
53 | def add_arguments(cls, p):
54 | p.add_argument('--save', dest='output_path', required=False, default=None, metavar='OUTPUT_PATH')
55 | p.add_argument('--sample', dest='sample', required=False, default=1, type=int, metavar='NUM_SAMPLES')
56 | p.set_defaults(cls=cls)
57 |
58 | sp = p.add_subparsers()
59 | for modelcls in cls.providers():
60 | name = modelcls.__name__
61 | help_str = modelcls.__doc__.split('\n')[0]
62 | desc = textwrap.dedent(modelcls.__doc__.rstrip())
63 | csp = sp.add_parser(name,
64 | help=help_str,
65 | description=desc,
66 | formatter_class=argparse.RawDescriptionHelpFormatter)
67 | modelcls.add_arguments(csp)
68 | cls.add_formatter_arguments(csp)
69 |
70 | return p
71 |
72 | @classmethod
73 | def add_formatter_arguments(cls, p):
74 | sp = p.add_subparsers()
75 | for fmtcls in ModelFormat.iter_options():
76 | name = fmtcls.__name__.lower()
77 | if name.endswith('format'):
78 | name = name[:-len('format')]
79 | help_str = fmtcls.__doc__.split('\n')[0]
80 | desc = textwrap.dedent(fmtcls.__doc__.rstrip())
81 | csp = sp.add_parser(name,
82 | help=help_str,
83 | description=desc,
84 | formatter_class=argparse.RawDescriptionHelpFormatter)
85 | fmtcls.add_arguments(csp)
86 | return p
87 |
--------------------------------------------------------------------------------
/sift/corpora/__init__.py:
--------------------------------------------------------------------------------
1 | class Corpus(object):
2 | pass
3 |
--------------------------------------------------------------------------------
/sift/corpora/commoncrawl.py:
--------------------------------------------------------------------------------
1 | import re
2 | from cStringIO import StringIO
3 | from warc import WARCFile
4 | from dragnet import content_extractor, BlockifyError
5 | from lxml import etree
6 | from bs4 import BeautifulSoup
7 | from sift.dataset import ModelBuilder, Model, Documents
8 | from sift import logging
9 | import pycld2 as cld
10 | from pycld2 import error as cld_error
11 |
12 | LINKS_RE = re.compile(r'<a href="(.+?)">(.+?)</a>')
13 |
14 | class WARCCorpus(ModelBuilder, Model):
15 | def __init__(self, language=None):
16 | self.language = language
17 |
18 | @staticmethod
19 | def parse_warc_content(buf):
20 | try:
21 | wf = WARCFile(fileobj=StringIO(buf))
22 | record = wf.read_record()
23 | payload = record.payload.read()
24 | top = payload[:15]
25 |
26 | if top.startswith('HTTP/') and top.endswith('200 OK'):
27 | content_start = payload.find('\r\n\r\n')
28 | if content_start != -1:
29 | yield record.url, payload[content_start+4:]
30 | except IOError:
31 | pass
32 |
33 | @staticmethod
34 | def try_get_lang(content):
35 | try:
36 | reliable, _, details = cld.detect(content)
37 | if reliable:
38 | return details[0][1]
39 | except cld_error:
40 | pass
41 | return None
42 |
43 | def build(self, sc, path):
44 | PAGE_DELIMITER = "WARC/1.0\r\n"
45 | warcs = sc\
46 | .newAPIHadoopFile(
47 | path,
48 | "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
49 | "org.apache.hadoop.io.LongWritable",
50 | "org.apache.hadoop.io.Text",
51 | conf = { "textinputformat.record.delimiter": PAGE_DELIMITER })\
52 | .filter(lambda (_, part): part)\
53 | .map(lambda (_, part): PAGE_DELIMITER+part.encode('utf-8'))\
54 | .flatMap(self.parse_warc_content)
55 |
56 | if self.language != None:
57 | warcs = warcs.filter(lambda (url, content): self.try_get_lang(content) == self.language)
58 | return warcs
59 |
60 | @staticmethod
61 | def format_item((url, content)):
62 | return {
63 | '_id': url,
64 | 'content': content,
65 | }
66 |
67 | class CommonCrawlArticles(ModelBuilder, Documents):
68 | THRESHOLD_CONTENT_SZ = 250000
69 |
70 | @staticmethod
71 | def clean_content((url, content)):
72 | try:
73 | blocks = content_extractor.analyze(content, blocks=True)
74 | content = ''.join(etree.tostring(b.features['block_start_element']) for b in blocks)
75 | if len(content) < CommonCrawlArticles.THRESHOLD_CONTENT_SZ:
76 | yield url, content
77 | except (BlockifyError, etree.SerialisationError):
78 | pass
79 |
80 | @staticmethod
81 | def parse_article(content):
82 | soup = BeautifulSoup(content, 'lxml')
83 |
84 | for tag in soup.find_all():
85 | if tag.name == 'a' and tag.attrs.get('href') and tag.text.strip():
86 | tag.attrs = {'href': tag.attrs['href']}
87 | else:
88 | tag.unwrap()
89 |
90 | return soup.encode_contents().decode('utf-8').strip()
91 |
92 | @staticmethod
93 | def extract_links(content):
94 | links = []
95 | offset = 0
96 | for match in LINKS_RE.finditer(content):
97 | target = match.group(1)
98 | anchor = match.group(2)
99 | start = match.start() - offset
100 | offset += len(match.group())-len(anchor)
101 | links.append((target, slice(start, start+len(anchor))))
102 |
103 | return LINKS_RE.sub(r'\2', content), links
104 |
105 | def build(self, corpus):
106 | return corpus\
107 | .map(lambda item: (item['_id'], item['content']))\
108 | .flatMap(self.clean_content)\
109 | .mapValues(self.parse_article)\
110 | .mapValues(self.extract_links)
111 |
--------------------------------------------------------------------------------
/sift/corpora/redirects.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import ujson as json
3 |
4 | from sift.dataset import Model, DocumentModel
5 | from sift.util import trim_link_protocol, iter_sent_spans, ngrams
6 |
7 | from sift import logging
8 | log = logging.getLogger()
9 |
10 | class MapRedirects(Model):
11 | """ Map redirects """
12 | def __init__(self, *args, **kwargs):
13 | self.from_path = kwargs.pop('from_path')
14 | self.to_path = kwargs.pop('to_path')
15 |
16 | def prepare(self, sc):
17 | return {
18 | "from_rds": self.load(sc, self.from_path).cache(),
19 | "to_rds": self.load(sc, self.to_path).cache()
20 | }
21 |
22 | @staticmethod
23 | def map_redirects(source, target):
24 | return source\
25 | .map(lambda (s, t): (t, s))\
26 | .leftOuterJoin(target)\
27 | .map(lambda (t, (s, r)): (s, r or t))\
28 | .distinct()
29 |
30 | def build(self, from_rds, to_rds):
31 | # map source of destination kb
32 | # e.g. (a > b) and (a > c) becomes (b > c)
33 | mapped_to = to_rds\
34 | .leftOuterJoin(from_rds)\
35 | .map(lambda (s, (t, f)): (f or s, t))\
36 |
37 | # map target of origin kb
38 | # e.g. (a > b) and (b > c) becomes (a > c)
39 | mapped_from = from_rds\
40 | .map(lambda (s, t): (t, s))\
41 | .leftOuterJoin(mapped_to)\
42 | .map(lambda (t, (s, r)): (s, r))\
43 | .filter(lambda (s, t): t)
44 |
45 | rds = (mapped_from + mapped_to).distinct()
46 | rds.cache()
47 |
48 | log.info('Resolving transitive mappings over %i redirects...', rds.count())
49 | rds = self.map_redirects(rds, rds)
50 |
51 | log.info('Resolved %i redirects...', rds.count())
52 | return rds
53 |
54 | @staticmethod
55 | def load(sc, path, fmt=json):
56 | log.info('Using redirects: %s', path)
57 | return sc\
58 | .textFile(path)\
59 | .map(fmt.loads)\
60 | .map(lambda r: (r['_id'], r['target']))
61 |
62 | def format_items(self, model):
63 | return model\
64 | .map(lambda (source, target): {
65 | '_id': source,
66 | 'target': target
67 | })
68 |
69 | @classmethod
70 | def add_arguments(cls, p):
71 | super(MapRedirects, cls).add_arguments(p)
72 | p.add_argument('from_path', metavar='FROM_REDIRECTS_PATH')
73 | p.add_argument('to_path', metavar='TO_REDIRECTS_PATH')
74 | return p
75 |
76 | class RedirectDocuments(DocumentModel):
77 | """ Map links in a corpus via a set of redirects """
78 | def __init__(self, **kwargs):
79 | self.redirect_path = kwargs.pop('redirects_path')
80 | super(RedirectDocuments, self).__init__(**kwargs)
81 |
82 | def prepare(self, sc):
83 | params = super(RedirectDocuments, self).prepare(sc)
84 | params['redirects'] = self.load(sc, self.redirect_path).cache()
85 | return params
86 |
87 | def build(self, corpus, redirects):
88 | articles = corpus.map(lambda d: (d['_id'], d))
89 |
90 | def map_doc_links(doc, rds):
91 | for l in doc['links']:
92 | l['target'] = rds[l['target']]
93 | return doc
94 |
95 | return corpus\
96 | .map(lambda d: (d['_id'], set(l['target'] for l in d['links'])))\
97 | .flatMap(lambda (pid, links): [(t, pid) for t in links])\
98 | .leftOuterJoin(redirects)\
99 | .map(lambda (t, (pid, r)): (pid, (t, r if r else t)))\
100 | .groupByKey()\
101 | .mapValues(dict)\
102 | .join(articles)\
103 | .map(lambda (pid, (rds, doc)): map_doc_links(doc, rds))
104 |
105 | def format_items(self, model):
106 | return model
107 |
108 | @classmethod
109 | def add_arguments(cls, p):
110 | super(RedirectDocuments, cls).add_arguments(p)
111 | p.add_argument('redirects_path', metavar='REDIRECTS_PATH')
112 | return p
113 |
--------------------------------------------------------------------------------
/sift/corpora/wikicorpus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Based on wikicorpus.py from Gensim:
5 | # https://github.com/piskvorky/gensim/blob/develop/gensim/corpora/wikicorpus.py
6 | # Credits:
7 | # Radim Rehurek
8 | # Lars Buitinck
9 |
10 | import re
11 | import xml.etree.cElementTree as ET
12 |
13 | from htmlentitydefs import name2codepoint
14 |
15 | wikilink_prefix = 'en.wikipedia.org/wiki/'
16 |
17 | RE_P0 = re.compile('<!--.*?-->', re.DOTALL | re.UNICODE) # comments
18 | RE_P1 = re.compile('<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE) # footnotes
19 | RE_P2 = re.compile("(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", re.UNICODE) # links to languages
20 | RE_P3 = re.compile("{{([^}{]*)}}", re.DOTALL | re.UNICODE) # template
21 | RE_P4 = re.compile("{{([^}]*)}}", re.DOTALL | re.UNICODE) # template
22 | RE_P5 = re.compile('\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description
23 | RE_P6 = re.compile("\[\[:?([^][]*)\|([^][]*)\]\]", re.DOTALL | re.UNICODE) # simplify links, keep description
24 | RE_P6_ex = re.compile("\[\[:?([^][]*)\]\]", re.DOTALL | re.UNICODE) # links without description
25 | RE_P7 = re.compile('\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images
26 | RE_P8 = re.compile('\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files
27 | RE_P9 = re.compile('<nowiki([> ].*?)(</nowiki>|/>)', re.DOTALL | re.UNICODE) # outside links
28 | RE_P10 = re.compile('<math([> ].*?)(</math>|/>)', re.DOTALL | re.UNICODE) # math content
29 | RE_P11 = re.compile('<(.*?)>', re.DOTALL | re.UNICODE) # all other tags
30 | RE_P12 = re.compile('\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting
31 | RE_P13 = re.compile('\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting
32 | RE_P14 = re.compile('\[\[Category:[^][]*\]\]', re.UNICODE) # categories
33 | RE_P15 = re.compile('\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
34 |
35 | RE_BI = re.compile(r"'''''([^']*?)'''''")
36 | RE_B = re.compile(r"'''(.*?)'''")
37 | RE_IQ = re.compile(r"''\"(.*?)\"''")
38 | RE_I = re.compile(r"''([^']*)''")
39 | RE_QQ = re.compile(r'""(.*?)""')
40 | RE_SECT = re.compile(r'(==+)\s*(.*?)\s*\1')
41 | RE_EMPTY_PARENS = re.compile(r' \(\s*\)')
42 |
43 | RE_HTML_ENT = re.compile("&#?(\w+);")
44 |
45 | def remove_markup((uri, text)):
46 | text = re.sub(RE_P2, "", text)
47 |
48 | # TODO: may be desirable to extract captions for files and images and insert them back into the document
49 | text = remove_template(text)
50 | text = extract_tag_content(text, [
51 | re.compile('\[\[[fF]ile:(.*?)(\|[^\]\[]+?)*\|'),
52 | re.compile('\[\[[iI]mage:(.*?)(\|[^\]\[]+?)*\|')
53 | ])
54 |
55 | # the wiki markup is recursive (markup inside markup etc) we deal with that by removing
56 | # markup in a loop, starting with inner-most expressions and working outwards as long as something changes.
57 | iters = 0
58 | while True:
59 | old, iters = text, iters + 1
60 | text = re.sub(RE_P0, "", text) # remove comments
61 | text = re.sub(RE_P1, '', text) # remove footnotes
62 | text = re.sub(RE_P9, "", text) # remove outside links
63 | text = re.sub(RE_P10, "", text) # remove math content
64 | if iters == 1:
65 | text = re.sub(RE_P11, "", text) # remove all remaining tags
66 |
67 | # todo: extract sections
68 | text = re.sub(RE_SECT, '\\2', text)
69 |
70 | # inject link from the first bolded phrase as a mention of the article entity
71 | # this heuristic holds for the vast majority of pages and is a wiki standard
72 | text = re.sub(RE_B, '<a href="%s">\\1</a>' % uri, text, 1)
73 |
74 | text = re.sub(RE_P14, '', text) # remove categories
75 |
76 | # inject links
77 | text = re.sub(RE_P5, '\\3', text) # remove urls, keep description
78 | text = re.sub(RE_P6, '<a href="%s\\1">\\2</a>' % wikilink_prefix, text) # simplify links, keep description only
79 | text = re.sub(RE_P6_ex, '<a href="%s\\1">\\1</a>' % wikilink_prefix, text)
80 | # remove table markup
81 | text = text.replace('||', '\n|') # each table cell on a separate line
82 | text = re.sub(RE_P12, '\n', text) # remove formatting lines
83 | text = re.sub(RE_P13, '\n\\3', text) # leave only cell content
84 | # remove empty mark-up
85 | text = text.replace('[]', '')
86 |
87 | # formatting
88 | text = re.sub(RE_BI, r"\1", text)
89 | text = re.sub(RE_B, r"\1", text)
90 | text = re.sub(RE_IQ, r'"\1"', text)
91 | text = re.sub(RE_I, r'"\1"', text)
92 | text = re.sub(RE_QQ, r"\1", text)
93 |
94 | if old == text or iters > 2: # stop if nothing changed between two iterations or after a fixed number of iterations
95 | break
96 |
97 | text = re.sub(RE_EMPTY_PARENS, '', text) # remove empty parenthesis (usually left by stripped templates)
98 | text = text.replace('[', '').replace(']', '') # promote all remaining markup to plain text
99 | text = html_unescape(text.strip())
100 | return (uri, text)
101 |
102 | def remove_template(s):
103 | # Find the start and end position of each template by finding the opening '{{' and closing '}}'
104 | n_open, n_close = 0, 0
105 | starts, ends = [], []
106 | in_template = False
107 | prev_c = None
108 | for i, c in enumerate(iter(s)):
109 | if not in_template:
110 | if c == '{' and c == prev_c:
111 | starts.append(i - 1)
112 | in_template = True
113 | n_open = 1
114 | if in_template:
115 | if c == '{':
116 | n_open += 1
117 | elif c == '}':
118 | n_close += 1
119 | if n_open == n_close:
120 | ends.append(i)
121 | in_template = False
122 | n_open, n_close = 0, 0
123 | prev_c = c
124 |
125 | # Remove all the templates
126 | s = ''.join([s[end + 1:start] for start, end in
127 | zip(starts + [None], [-1] + ends)])
128 |
129 | return s
130 |
131 | def extract_tag_content(s, tags, include_content=True):
132 | s = s.replace(u'\u2502','|')
133 | for t in tags:
134 | parts = []
135 | last_match_end = None
136 | for match in t.finditer(s):
137 | parts.append(slice(last_match_end,match.start()))
138 |
139 | i = match.end()
140 | while True:
141 | next_open = s.find('[[', i)
142 | next_close = s.find(']]', i)
143 | if next_open == -1 or next_open > next_close:
144 | last_match_end = next_close
145 | break
146 | elif next_close == -1:
147 | # unbalanced tags in wikimarkup, bail!
148 | last_match_end = i
149 | break
150 | i = next_close+2
151 | if include_content and match.end() != last_match_end:
152 | content = s[match.end():last_match_end].strip('] ')
153 | if content:
154 | parts.append(slice(match.end(),last_match_end))
155 | if not content.endswith('.'):
156 | parts.append('.')
157 | last_match_end += 2
158 | parts.append(slice(last_match_end,None))
159 | s = ''.join(s[p] if type(p) is slice else p for p in parts)
160 |
161 | return s
162 |
163 | def html_unescape(text):
164 | def replace(m):
165 | span, code = m.group(0), m.group(1)
166 | try:
167 | if span[1] == "#":
168 | return unichr(int(code[1:], 16)) if span[2] == "x" else unichr(int(code))
169 | else:
170 | return unichr(name2codepoint[code])
171 | except:
172 | return span
173 | return re.sub(RE_HTML_ENT, replace, text)
174 |
175 | def extract_page(content):
176 | e = ET.fromstring(content.encode('utf-8'))
177 |
178 | title = e.find('title').text
179 | ns = e.find('ns').text
180 | pageid = int(e.find('id').text)
181 | redirect_elem = e.find('redirect')
182 | redirect = None if redirect_elem == None else redirect_elem.attrib['title']
183 | content = None if redirect != None else e.find('revision/text').text
184 | uri = wikilink_prefix+title.replace(' ', '_')
185 |
186 | return uri, ns, pageid, redirect, content
187 |
188 | def normalise_wikilink(s):
189 | s = s.replace(' ', '_').strip('_').strip()
190 | if s and s[0].islower():
191 | s = s[0].upper() + s[1:]
192 | return s
193 |
194 | def normalise_link(s):
195 | if s.startswith(wikilink_prefix):
196 | s = wikilink_prefix + normalise_wikilink(s[len(wikilink_prefix):])
197 | return s
198 |
199 | def extract_links(content):
200 | links_re = re.compile(r'<a href="(.+?)">(.+?)</a>')
201 |
202 | links = []
203 | offset = 0
204 | for match in list(links_re.finditer(content)):
205 | target = match.group(1)
206 | anchor = match.group(2)
207 | start = match.start() - offset
208 | offset += len(match.group())-len(anchor)
209 | links.append((normalise_link(target), slice(start, start+len(anchor))))
210 |
211 | return links_re.sub(r'\2', content), links
212 |
--------------------------------------------------------------------------------
/sift/corpora/wikidata.py:
--------------------------------------------------------------------------------
1 | import ujson as json
2 |
3 | from sift.corpora import wikicorpus
4 | from sift.dataset import ModelBuilder, Model, Relations
5 |
6 | from sift import logging
7 | log = logging.getLogger()
8 |
9 | ENTITY_PREFIX = 'Q'
10 | PREDICATE_PREFIX = 'P'
11 |
12 | class WikidataCorpus(ModelBuilder, Model):
13 | @staticmethod
14 | def iter_item_for_line(line):
15 | line = line.strip()
16 | if line != '[' and line != ']':
17 | yield json.loads(line.rstrip(',\n'))
18 |
19 | def build(self, sc, path):
20 | return sc\
21 | .textFile(path)\
22 | .flatMap(self.iter_item_for_line)\
23 | .map(lambda i: (i['id'], i))
24 |
25 | @staticmethod
26 | def format_item((wid, item)):
27 | return {
28 | '_id': wid,
29 | 'data': item
30 | }
31 |
32 | class WikidataRelations(ModelBuilder, Relations):
33 | """ Prepare a corpus of relations from wikidata """
34 | @staticmethod
35 | def iter_relations_for_item(item):
36 | for pid, statements in item.get('claims', {}).iteritems():
37 | for statement in statements:
38 | if statement['mainsnak'].get('snaktype') == 'value':
39 | datatype = statement['mainsnak'].get('datatype')
40 | if datatype == 'wikibase-item':
41 | yield pid, int(statement['mainsnak']['datavalue']['value']['numeric-id'])
42 | elif datatype == 'time':
43 | yield pid, statement['mainsnak']['datavalue']['value']['time']
44 | elif datatype == 'string' or datatype == 'url':
45 | yield pid, statement['mainsnak']['datavalue']['value']
46 |
47 | def build(self, corpus):
48 | entities = corpus\
49 | .filter(lambda item: item['_id'].startswith(ENTITY_PREFIX))
50 |
51 | entity_labels = entities\
52 | .map(lambda item: (item['_id'], item['data'].get('labels', {}).get('en', {}).get('value', None)))\
53 | .filter(lambda (pid, label): label)\
54 | .map(lambda (pid, label): (int(pid[1:]), label))
55 |
56 | wiki_entities = entities\
57 | .map(lambda item: (item['data'].get('sitelinks', {}).get('enwiki', {}).get('title', None), item['data']))\
58 | .filter(lambda (e, _): e)\
59 | .cache()
60 |
61 | predicate_labels = corpus\
62 | .filter(lambda item: item['_id'].startswith(PREDICATE_PREFIX))\
63 | .map(lambda item: (item['_id'], item['data'].get('labels', {}).get('en', {}).get('value', None)))\
64 | .filter(lambda (pid, label): label)\
65 | .cache()
66 |
67 | relations = wiki_entities\
68 | .flatMap(lambda (eid, item): ((pid, (value, eid)) for pid, value in self.iter_relations_for_item(item)))\
69 | .join(predicate_labels)\
70 | .map(lambda (pid, ((value, eid), label)): (value, (label, eid)))
71 |
72 | return relations\
73 | .leftOuterJoin(entity_labels)\
74 | .map(lambda (value, ((label, eid), value_label)): (eid, (label, value_label or value)))\
75 | .groupByKey()\
76 | .mapValues(dict)
77 |
--------------------------------------------------------------------------------
/sift/corpora/wikipedia.py:
--------------------------------------------------------------------------------
1 | import ujson as json
2 |
3 | from sift.corpora import wikicorpus
4 | from sift.dataset import ModelBuilder, Model, Redirects, Documents
5 |
6 | from sift import logging
7 | log = logging.getLogger()
8 |
9 | class WikipediaCorpus(ModelBuilder, Model):
10 | def build(self, sc, path):
11 | PAGE_DELIMITER = "\n  </page>\n"
12 | PAGE_START = '<page>\n'
13 | PAGE_END = '</page>'
14 | return sc\
15 | .newAPIHadoopFile(
16 | path,
17 | "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
18 | "org.apache.hadoop.io.LongWritable",
19 | "org.apache.hadoop.io.Text",
20 | conf = { "textinputformat.record.delimiter": PAGE_DELIMITER })\
21 | .map(lambda (_, part): (part.find(PAGE_START), part))\
22 | .filter(lambda (offset, _): offset >= 0)\
23 | .map(lambda (offset, content): content[offset:]+PAGE_END)\
24 | .map(wikicorpus.extract_page)
25 |
26 | @staticmethod
27 | def format_item((title, ns, pid, redirect, content)):
28 | return {
29 | '_id': title,
30 | 'pid': pid,
31 | 'namespace': ns,
32 | 'redirect': redirect,
33 | 'content': content
34 | }
35 |
36 | class WikipediaRedirects(ModelBuilder, Redirects):
37 | """ Extract a set of redirects from wikipedia """
38 | def __init__(self, resolve_transitive=False):
39 | self.resolve_transitive = resolve_transitive
40 |
41 | def build(self, pages, verbose=False):
42 | pfx = wikicorpus.wikilink_prefix
43 | redirects = pages\
44 | .filter(lambda page: page['redirect'] != None)\
45 | .map(lambda page: (page['_id'], page['redirect']))\
46 | .mapValues(wikicorpus.normalise_wikilink)\
47 | .map(lambda (s, t): (s, pfx+t))
48 |
49 | if self.resolve_transitive:
50 | redirects = redirects.cache()
51 |
52 | num_targets = redirects\
53 | .map(lambda (k,v): v)\
54 | .distinct()\
55 | .count()
56 |
57 | redirects = redirects\
58 | .map(lambda (s, t): (t, s)).leftOuterJoin(redirects)\
59 | .map(lambda (target, (source, redirect)): (source, redirect or target))
60 |
61 | if verbose:
62 | redirects = redirects.cache()
63 | final_num_targets = redirects.map(lambda (k,v): v).distinct().count()
64 | log.info('Resolved %i transitive redirects...', num_targets - final_num_targets)
65 |
66 | return redirects.distinct()
67 |
68 | class WikipediaArticles(ModelBuilder, Documents):
69 | """ Prepare a corpus of documents from wikipedia """
70 | def build(self, corpus, redirects=None):
71 | articles = corpus\
72 | .filter(lambda page: page['namespace'] == '0' and page['redirect'] == None and page['content'])\
73 | .map(lambda page: (page['_id'], page['content']))\
74 | .map(wikicorpus.remove_markup)\
75 | .mapValues(wikicorpus.extract_links)
76 |
77 | if redirects:
78 | redirects = redirects.map(lambda r: (r['_id'], r['target']))
79 | articles.cache()
80 |
81 | # redirect set is typically too large to be broadcasted for a map-side join
82 | articles = articles\
83 | .flatMap(lambda (pid, (text, links)): ((t, (pid, span)) for t, span in links))\
84 | .leftOuterJoin(redirects)\
85 | .map(lambda (t, ((pid, span), r)): (pid, (r if r else t, span)))\
86 | .groupByKey()\
87 | .mapValues(list)\
88 | .join(articles)\
89 | .map(lambda (pid, (links, (text, _))): (pid, (text, links)))
90 |
91 | return articles
92 |
--------------------------------------------------------------------------------
/sift/dataset.py:
--------------------------------------------------------------------------------
1 | import ujson as json
2 |
3 | class ModelBuilder(object):
4 | def __init__(self, *args, **kwargs): pass
5 |
6 | def __call__(self, *args, **kwargs):
7 | return self.build(*args, **kwargs).map(self.format_item)
8 |
9 | def build(self, *args, **kwargs):
10 | raise NotImplementedError
11 |
12 | class Model(object):
13 | @staticmethod
14 | def format_item(item):
15 | raise NotImplementedError
16 |
17 | @staticmethod
18 | def load(sc, path, fmt=json):
19 | return sc.textFile(path).map(json.loads)
20 |
21 | @staticmethod
22 | def save(m, path, fmt=json):
23 | m.map(json.dumps).saveAsTextFile(path, 'org.apache.hadoop.io.compress.GzipCodec')
24 |
25 | class Redirects(Model):
26 | @staticmethod
27 | def format_item((source, target)):
28 | return {'_id': source, 'target': target}
29 |
30 | class Vocab(Model):
31 | @staticmethod
32 | def format_item((term, (count, rank))):
33 | return {
34 | '_id': term,
35 | 'count': count,
36 | 'rank': rank
37 | }
38 |
39 | class Mentions(Model):
40 | @staticmethod
41 | def format_item((target, source, text, span)):
42 | return {
43 | '_id': target,
44 | 'source': source,
45 | 'text': text,
46 | 'span': span
47 | }
48 |
49 | class IndexedMentions(Model):
50 | @staticmethod
51 | def format_item((target, source, text, span)):
52 | return {
53 | '_id': target,
54 | 'source': source,
55 | 'sequence': text,
56 | 'span': span
57 | }
58 |
59 | class Documents(Model):
60 | @staticmethod
61 | def format_item((uri, (text, links))):
62 | return {
63 | '_id': uri,
64 | 'text': text,
65 | 'links': [{
66 | 'target': target,
67 | 'start': span.start,
68 | 'stop': span.stop
69 | } for target, span in links]
70 | }
71 |
72 | class Relations(Model):
73 | @staticmethod
74 | def format_item((uri, relations)):
75 | return {
76 | '_id': uri,
77 | 'relations': relations
78 | }
--------------------------------------------------------------------------------
/sift/format.py:
--------------------------------------------------------------------------------
1 | import cPickle as pickle
2 | import ujson as json
3 | import msgpack
4 | import base64
5 |
6 | class ModelFormat(object):
7 | def __init__(self):
8 | pass
9 | def __call__(self, model):
10 | raise NotImplementedError
11 |
12 | @classmethod
13 | def iter_options(cls):
14 | yield JsonFormat
15 | yield RedisFormat
16 | yield TsvFormat
17 |
18 | class TsvFormat(ModelFormat):
19 | """ Format model output as tab separated values """
20 | @staticmethod
21 | def items_to_tsv(items):
22 | key_order = None
23 | for item in items:
24 | if key_order == None:
25 | key_order = []
26 | if '_id' in item:
27 | key_order.append('_id')
28 | key_order += sorted(k for k in item.iterkeys() if k != '_id')
29 |
30 | # todo: proper field serialization and escapes
31 | yield u'\t'.join(unicode(item[k]) for k in key_order).encode('utf-8')
32 |
33 | def __call__(self, model):
34 | return model.mapPartitions(self.items_to_tsv)
35 |
36 | @classmethod
37 | def add_arguments(cls, p):
38 | p.set_defaults(fmtcls=cls)
39 | return p
40 |
41 | class JsonFormat(ModelFormat):
42 | """ Format model output as json """
43 | def __call__(self, model):
44 | return model.map(json.dumps)
45 |
46 | @classmethod
47 | def add_arguments(cls, p):
48 | p.set_defaults(fmtcls=cls)
49 | return p
50 |
51 | class RedisFormat(ModelFormat):
52 | """ Format model output as redis protocol SET commands """
53 | def __init__(self, prefix, serializer, field):
54 | if serializer == 'raw' and not field:
55 | raise Exception("Target field required for raw serializer")
56 |
57 | self.prefix = prefix
58 | self.field = field
59 | self.serializer = {
60 | 'json': json.dumps,
61 | 'msgpack': lambda o: base64.b64encode(msgpack.dumps(o)),
62 | 'pickle': lambda o: base64.b64encode(pickle.dumps(o, -1)),
63 | 'raw': lambda o: o
64 | }[serializer]
65 |
66 | def to_value(self, item):
67 | if self.field:
68 | item = unicode(item[self.field])
69 | else:
70 | item.pop('_id', None)
71 | return self.serializer(item)
72 |
73 | def __call__(self, model):
74 | cmd = '\r\n'.join(["*3", "$3", "SET", "${}", "{}", "${}", "{}"])+'\r'
75 | return model\
76 | .map(lambda i: ((self.prefix+i['_id'].replace('"','\\"')).encode('utf-8'), self.to_value(i)))\
77 | .map(lambda (t, c): cmd.format(len(t), t, len(c), c))
78 |
79 | @classmethod
80 | def add_arguments(cls, p):
81 | p.add_argument('--prefix', required=False, default='', metavar='PREFIX')
82 | p.add_argument('--serializer', choices=['json', 'pickle', 'msgpack', 'raw'], required=False, default='json', metavar='SERIALIZER')
83 | p.add_argument('--field', required=False, metavar='FIELD_TO_SERIALIZE')
84 | p.set_defaults(fmtcls=cls)
85 | return p
86 |
--------------------------------------------------------------------------------
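RedisFormat in sift/format.py renders each item as a raw RESP SET command, so formatted output can be bulk-loaded with redis-cli's pipe mode. A rough sketch of that flow, with an assumed key prefix and output path:

    # illustrative sketch only -- prefix, paths and Spark setup are assumptions
    from pyspark import SparkContext
    from sift.format import RedisFormat

    sc = SparkContext(appName='sift-format-example')
    items = sc.parallelize([{'_id': u'en.wikipedia.org/wiki/Sydney', 'count': 42}])

    fmt = RedisFormat(prefix='counts:', serializer='json', field=None)
    fmt(items).saveAsTextFile('output/counts-redis')      # one "SET counts:<id> <json>" per record

    # shell: stream the part files into a running redis instance
    #   cat output/counts-redis/part-* | redis-cli --pipe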
/sift/logging.py:
--------------------------------------------------------------------------------
1 | """ Logging Configuration """
2 | from __future__ import absolute_import
3 | import logging
4 |
5 | def setup():
6 | fmt = '%(asctime)s|%(levelname)s|%(module)s|%(message)s'
7 | logging.basicConfig(format=fmt)
8 | log = logging.getLogger('nel')
9 | log.setLevel(logging.DEBUG)
10 |
11 | def getLogger():
12 | return logging.getLogger('nel')
13 |
14 | setup()
--------------------------------------------------------------------------------
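sift/logging.py configures the shared logger once at import time, so other modules (as in models/links.py and models/embeddings.py below) only need:

    from sift import logging
    log = logging.getLogger()
    log.info('processed %i documents', 1234)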
/sift/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andychisholm/sift/c25cc42ab4ad7f44838036c63d0be4b03767c16c/sift/models/__init__.py
--------------------------------------------------------------------------------
/sift/models/embeddings.py:
--------------------------------------------------------------------------------
1 | from operator import add
2 | from itertools import chain
3 |
4 | from sift.models.text import EntityMentions
5 | from sift.util import ngrams
6 | from sift.dataset import ModelBuilder, Model
7 |
8 | from sift import logging
9 | log = logging.getLogger()
10 |
11 | class EntitySkipGramEmbeddings(ModelBuilder, Model):
12 | """ Learn distributed representations for words and entities in a corpus via skip-gram embedding """
13 | def __init__(
14 | self,
15 | dimensions=100,
16 | min_word_count=500,
17 | min_entity_count=10,
18 | entity_prefix='en.wikipedia.org/wiki/',
19 | exclude_words=False,
20 | exclude_entities=False,
21 | workers=4,
22 | coalesce=None,
23 | *args, **kwargs):
24 |
25 | self.dimensions = dimensions
26 | self.min_word_count = min_word_count
27 | self.min_entity_count = min_entity_count
28 | self.filter_target = entity_prefix
29 | self.exclude_words = exclude_words
30 | self.exclude_entities = exclude_entities
31 | self.workers = workers
32 | self.coalesce = coalesce
33 |
34 | def get_trim_rule(self):
35 | from gensim.utils import RULE_KEEP, RULE_DISCARD
36 | def trim_rule(word, count, min_count):
37 | if not word.startswith(self.filter_target):
38 | return RULE_KEEP if count >= self.min_word_count else RULE_DISCARD
39 | else:
40 | return RULE_KEEP if count >= self.min_entity_count else RULE_DISCARD
41 |
42 | return trim_rule
43 |
44 | def build(self, mentions):
45 | from gensim.models.word2vec import Word2Vec
46 | sentences = mentions\
47 |             .filter(lambda (target, source, text, span): target.startswith(self.filter_target))
48 |
49 | sentences = sentences\
50 | .map(lambda (target, source, text, (s,e)): list(chain(ngrams(text[:s],1), [target], ngrams(text[e:],1))))
51 |
52 | if self.coalesce:
53 | sentences = sentences.coalesce(self.coalesce)
54 |
55 | sentences = sentences.cache()
56 |
57 | model = Word2Vec(sample=1e-5, size=self.dimensions, workers=self.workers)
58 |
59 | log.info('Preparing corpus...')
60 | model.corpus_count = sentences.count()
61 |
62 | log.info('Computing vocab statistics...')
63 | term_counts = sentences\
64 | .flatMap(lambda tokens: ((t, 1) for t in tokens))\
65 | .reduceByKey(add)\
66 | .filter(lambda (t, count): \
67 | (t.startswith(self.filter_target) and count >= self.min_entity_count) or \
68 | (count >= self.min_word_count))
69 |
70 | model.raw_vocab = dict(term_counts.collect())
71 | model.scale_vocab(trim_rule=self.get_trim_rule())
72 | model.finalize_vocab()
73 |
74 | log.info('Training local word2vec model...')
75 | model.train(sentences.toLocalIterator())
76 |
77 | log.info('Normalising embeddings...')
78 | model.init_sims(replace=True)
79 |
80 | total_entities = sum(1 if t.startswith(self.filter_target) else 0 for t in model.vocab.iterkeys())
81 | total_words = len(model.vocab) - total_entities
82 |
83 | vocab_sz = 0
84 | if not self.exclude_entities:
85 | log.info('Including %i entity embeddings in exported vocab...', total_entities)
86 | vocab_sz += total_entities
87 | if not self.exclude_words:
88 | log.info('Including %i word embeddings in exported vocab...', total_words)
89 | vocab_sz += total_words
90 |
91 | log.info('Parallelizing %i learned embeddings...', vocab_sz)
92 | return mentions\
93 | .context\
94 | .parallelize(
95 | (t, model.syn0[vi.index].tolist())
96 | for t, vi in model.vocab.iteritems()
97 | if (not self.exclude_entities and t.startswith(self.filter_target)) or
98 | (not self.exclude_words and not t.startswith(self.filter_target)))
99 |
100 | @staticmethod
101 | def format_item((entity, embedding)):
102 | return {
103 | '_id': entity,
104 | 'embedding': embedding
105 | }
--------------------------------------------------------------------------------
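EntitySkipGramEmbeddings consumes raw (target, source, text, span) mention tuples, substitutes each link target into its sentence context, trains a local gensim word2vec model, then re-parallelises the learned vectors. A minimal end-to-end sketch, assuming 'output/docs' holds a pre-built Documents model:

    # illustrative sketch only -- paths and Spark setup are assumptions
    from pyspark import SparkContext
    from sift.dataset import Model
    from sift.models.text import EntityMentions
    from sift.models.embeddings import EntitySkipGramEmbeddings

    sc = SparkContext(appName='sift-embeddings-example')
    docs = Model.load(sc, 'output/docs')

    mentions = EntityMentions().build(docs)               # unformatted mention tuples
    embeddings = EntitySkipGramEmbeddings(dimensions=100, workers=4)(mentions)
    Model.save(embeddings, 'output/embeddings')           # {'_id': term, 'embedding': [...]}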
/sift/models/links.py:
--------------------------------------------------------------------------------
1 | import ujson as json
2 |
3 | from operator import add
4 | from collections import Counter
5 | from itertools import chain
6 |
7 | from sift.dataset import ModelBuilder, Documents, Model
8 | from sift.util import trim_link_subsection, trim_link_protocol, ngrams
9 |
10 | from sift import logging
11 | log = logging.getLogger()
12 |
13 | class EntityCounts(ModelBuilder, Model):
14 | """ Inlink counts """
15 | def __init__(self, min_count=1, filter_target=None):
16 | self.min_count = min_count
17 | self.filter_target = filter_target
18 |
19 | def build(self, docs):
20 | links = docs\
21 | .flatMap(lambda d: d['links'])\
22 | .map(lambda l: l['target'])\
23 | .map(trim_link_subsection)\
24 | .map(trim_link_protocol)
25 |
26 | if self.filter_target:
27 | links = links.filter(lambda l: l.startswith(self.filter_target))
28 |
29 | return links\
30 | .map(lambda l: (l, 1))\
31 | .reduceByKey(add)\
32 | .filter(lambda (t, c): c > self.min_count)
33 |
34 | @staticmethod
35 | def format_item((target, count)):
36 | return {
37 | '_id': target,
38 | 'count': count
39 | }
40 |
41 | class EntityNameCounts(ModelBuilder, Model):
42 | """ Entity counts by name """
43 | def __init__(self, lowercase=False, filter_target=None):
44 | self.lowercase = lowercase
45 | self.filter_target = filter_target
46 |
47 | def iter_anchor_target_pairs(self, doc):
48 | for link in doc['links']:
49 | target = link['target']
50 | target = trim_link_subsection(target)
51 | target = trim_link_protocol(target)
52 |
53 | anchor = doc['text'][link['start']:link['stop']].strip()
54 |
55 | if self.lowercase:
56 | anchor = anchor.lower()
57 |
58 | if anchor and target:
59 | yield anchor, target
60 |
61 | def build(self, docs):
62 | m = docs.flatMap(lambda d: self.iter_anchor_target_pairs(d))
63 |
64 | if self.filter_target:
65 | m = m.filter(lambda (a, t): t.startswith(self.filter_target))
66 |
67 | return m\
68 | .groupByKey()\
69 | .mapValues(Counter)
70 |
71 | @staticmethod
72 | def format_item((anchor, counts)):
73 | return {
74 | '_id': anchor,
75 | 'counts': dict(counts),
76 | 'total': sum(counts.itervalues())
77 | }
78 |
79 | class NamePartCounts(ModelBuilder, Model):
80 | """
81 | Occurrence counts for ngrams at different positions within link anchors.
82 | 'B' - beginning of span
83 | 'E' - end of span
84 | 'I' - inside span
85 | 'O' - outside span
86 | """
87 | def __init__(self, max_ngram=2, lowercase=False, filter_target=None):
88 | self.lowercase = lowercase
89 | self.filter_target = filter_target
90 | self.max_ngram = max_ngram
91 |
92 | def iter_anchors(self, doc):
93 | for link in doc['links']:
94 | anchor = doc['text'][link['start']:link['stop']].strip()
95 | if self.lowercase:
96 | anchor = anchor.lower()
97 | if anchor:
98 | yield anchor
99 |
100 | @staticmethod
101 | def iter_span_count_types(anchor, n):
102 | parts = list(ngrams(anchor, n, n))
103 | if parts:
104 | yield parts[0], 'B'
105 | yield parts[-1], 'E'
106 | for i in xrange(1, len(parts)-1):
107 | yield parts[i], 'I'
108 |
109 | def build(self, docs):
110 | part_counts = docs\
111 | .flatMap(self.iter_anchors)\
112 | .flatMap(lambda a: chain.from_iterable(self.iter_span_count_types(a, i) for i in xrange(1, self.max_ngram+1)))\
113 | .map(lambda p: (p, 1))\
114 | .reduceByKey(add)\
115 | .map(lambda ((term, spantype), count): (term, (spantype, count)))
116 |
117 | part_counts += docs\
118 | .flatMap(lambda d: ngrams(d['text'], self.max_ngram))\
119 | .map(lambda t: (t, 1))\
120 | .reduceByKey(add)\
121 | .filter(lambda (t, c): c > 1)\
122 | .map(lambda (t, c): (t, ('O', c)))
123 |
124 | return part_counts\
125 | .groupByKey()\
126 | .mapValues(dict)\
127 | .filter(lambda (t, cs): 'O' in cs and len(cs) > 1)
128 |
129 | @staticmethod
130 | def format_item((term, part_counts)):
131 | return {
132 | '_id': term,
133 | 'counts': dict(part_counts)
134 | }
135 |
136 | class EntityInlinks(ModelBuilder, Model):
137 | """ Inlink sets for each entity """
138 | def build(self, docs):
139 | return docs\
140 | .flatMap(lambda d: ((d['_id'], l) for l in set(l['target'] for l in d['links'])))\
141 | .mapValues(trim_link_subsection)\
142 | .mapValues(trim_link_protocol)\
143 | .map(lambda (k, v): (v, k))\
144 | .groupByKey()\
145 | .mapValues(list)
146 |
147 | @staticmethod
148 | def format_item((target, inlinks)):
149 | return {
150 | '_id': target,
151 | 'inlinks': inlinks
152 | }
153 |
154 | class EntityVocab(EntityCounts):
155 |     """ Generate unique indexes for entities in a corpus. """
156 |     def __init__(self, min_rank=0, max_rank=10000, **kwargs):
157 |         super(EntityVocab, self).__init__(**kwargs)
158 |         self.min_rank, self.max_rank = min_rank, max_rank
159 |
160 | def build(self, docs):
161 | log.info('Building entity vocab: df rank range=(%i, %i)', self.min_rank, self.max_rank)
162 | m = super(EntityVocab, self)\
163 | .build(docs)\
164 | .map(lambda (target, count): (count, target))\
165 | .sortByKey(False)\
166 | .zipWithIndex()\
167 | .map(lambda ((df, t), idx): (t, (df, idx)))
168 |
169 |         if self.min_rank is not None:
170 |             m = m.filter(lambda (t, (df, idx)): idx >= self.min_rank)
171 |         if self.max_rank is not None:
172 |             m = m.filter(lambda (t, (df, idx)): idx < self.max_rank)
173 | return m
174 |
175 | @staticmethod
176 | def format_item((term, (f, idx))):
177 | return {
178 | '_id': term,
179 | 'count': f,
180 | 'rank': idx
181 | }
182 |
183 | @staticmethod
184 | def load(sc, path, fmt=json):
185 | log.info('Loading entity-index mapping: %s ...', path)
186 | return sc\
187 | .textFile(path)\
188 | .map(fmt.loads)\
189 | .map(lambda r: (r['_id'], (r['count'], r['rank'])))
190 |
191 | class EntityComentions(ModelBuilder, Model):
192 | """ Entity comentions """
193 | @staticmethod
194 | def iter_unique_links(doc):
195 | links = set()
196 | for l in doc['links']:
197 | link = trim_link_subsection(l['target'])
198 | link = trim_link_protocol(link)
199 | if link not in links:
200 | yield link
201 | links.add(link)
202 |
203 | def build(self, docs):
204 | return docs\
205 | .map(lambda d: (d['_id'], list(self.iter_unique_links(d))))\
206 | .filter(lambda (uri, es): es)
207 |
208 | @staticmethod
209 | def format_item((uri, es)):
210 | return {
211 | '_id': uri,
212 | 'entities': es
213 | }
214 |
215 | class MappedEntityComentions(EntityComentions):
216 | """ Entity comentions with entities mapped to a numeric index """
217 | def build(self, docs, entity_vocab):
218 |         ev = docs.context.broadcast(dict(entity_vocab.collect()))
219 | return super(MappedEntityComentions, self)\
220 | .build(docs)\
221 | .map(lambda (uri, es): (uri, [ev.value[e] for e in es if e in ev.value]))\
222 | .filter(lambda (uri, es): es)
--------------------------------------------------------------------------------
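The builders in sift/models/links.py all consume the Documents model ({'_id': uri, 'text': ..., 'links': [...]}) and reduce link statistics from it. A small sketch of EntityNameCounts over a single made-up in-memory document:

    # illustrative sketch only -- a toy document stands in for a parsed corpus
    from pyspark import SparkContext
    from sift.models.links import EntityNameCounts

    sc = SparkContext(appName='sift-links-example')
    doc = {
        '_id': 'en.wikipedia.org/wiki/Sydney',
        'text': u'Sydney is the capital of New South Wales.',
        'links': [{'target': 'en.wikipedia.org/wiki/New_South_Wales', 'start': 25, 'stop': 40}]
    }

    model = EntityNameCounts(lowercase=True, filter_target='en.wikipedia.org/wiki/')
    results = model(sc.parallelize([doc])).collect()
    # e.g. [{'_id': u'new south wales', 'counts': {'en.wikipedia.org/wiki/New_South_Wales': 1}, 'total': 1}]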
/sift/models/text.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy
3 | import ujson as json
4 | from bisect import bisect_left, bisect_right
5 | from operator import add
6 | from collections import Counter
7 |
8 | from sift.models.links import EntityVocab
9 | from sift.dataset import ModelBuilder, Documents, Model, Mentions, IndexedMentions, Vocab
10 | from sift.util import ngrams, iter_sent_spans, trim_link_subsection, trim_link_protocol
11 |
12 | from sift import logging
13 | log = logging.getLogger()
14 |
15 | class TermFrequencies(ModelBuilder, Model):
16 | """ Get term frequencies over a corpus """
17 | def __init__(self, lowercase, max_ngram):
18 | self.lowercase = lowercase
19 | self.max_ngram = max_ngram
20 |
21 | def build(self, docs):
22 | m = docs.map(lambda d: d['text'])
23 | if self.lowercase:
24 | m = m.map(unicode.lower)
25 |
26 | return m\
27 | .flatMap(lambda text: ngrams(text, self.max_ngram))\
28 | .map(lambda t: (t, 1))\
29 | .reduceByKey(add)\
30 | .filter(lambda (k,v): v > 1)
31 |
32 | @staticmethod
33 |     def format_item((term, count)):
34 | return {
35 | '_id': term,
36 | 'count': count,
37 | }
38 |
39 | class EntityMentions(ModelBuilder, Mentions):
40 | """ Get aggregated sentence context around links in a corpus """
41 | def __init__(self, sentence_window = 1, lowercase=False, normalize_url=True, strict_sentences=True):
42 | self.sentence_window = sentence_window
43 | self.lowercase = lowercase
44 | self.strict_sentences = strict_sentences
45 | self.normalize_url = normalize_url
46 |
47 | @staticmethod
48 | def iter_mentions(doc, window = 1, norm_url=True, strict=True):
49 | sent_spans = list(iter_sent_spans(doc['text']))
50 | sent_offsets = [s.start for s in sent_spans]
51 |
52 | for link in doc['links']:
53 | # align the link span over sentence spans in the document
54 | # mention span may cross sentence bounds if sentence tokenisation is dodgy
55 | # if so, the entire span between bounding sentences will be used as context
56 | sent_start_idx = bisect_right(sent_offsets, link['start']) - 1
57 | sent_end_idx = bisect_left(sent_offsets, link['stop']) - 1
58 |
59 | lhs_offset = window / 2
60 | rhs_offset = (window - lhs_offset) - 1
61 | sent_start_idx = max(0, sent_start_idx - lhs_offset)
62 | sent_end_idx = min(len(sent_spans)-1, sent_end_idx + rhs_offset)
63 | sent_offset = sent_spans[sent_start_idx].start
64 |
65 | span = (link['start'] - sent_offset, link['stop'] - sent_offset)
66 | target = link['target']
67 | if norm_url:
68 | target = trim_link_subsection(link['target'])
69 | target = trim_link_protocol(target)
70 | mention = doc['text'][sent_spans[sent_start_idx].start:sent_spans[sent_end_idx].stop]
71 |
72 | # filter out instances where the mention span is the entire sentence
73 | if span == (0, len(mention)):
74 | continue
75 |
76 | if strict:
77 | # filter out list item sentences
78 | sm = mention.strip()
79 | if not sm or sm.startswith('*') or sm[-1] not in '.!?"\'':
80 | continue
81 |
82 | yield target, doc['_id'], mention, span
83 |
84 | def build(self, docs):
85 | m = docs.flatMap(lambda d: self.iter_mentions(d, self.sentence_window, self.normalize_url, self.strict_sentences))
86 | if self.lowercase:
87 | m = m.map(lambda (t, src, m, s): (t, src, m.lower(), s))
88 | return m
89 |
90 | class IndexMappedMentions(EntityMentions, IndexedMentions):
91 | """ Entity mention corpus with terms mapped to numeric indexes """
92 | def build(self, sc, docs, vocab):
93 | tv = sc.broadcast(dict(vocab.map(lambda r: (r['_id'], r['rank'])).collect()))
94 | return super(IndexMappedMentions, self)\
95 | .build(docs)\
96 | .map(lambda m: self.transform(m, tv))
97 |
98 | @staticmethod
99 | def transform((target, source, text, span), vocab):
100 | vocab = vocab.value
101 |
102 | start, stop = span
103 | pre = list(ngrams(text[:start], 1))
104 | ins = list(ngrams(text[start:stop], 1))
105 | post = list(ngrams(text[stop:], 1))
106 | indexes = [vocab.get(t, len(vocab)-1) for t in (pre+ins+post)]
107 |
108 | return target, source, indexes, (len(pre), len(pre)+len(ins))
109 |
110 | class TermDocumentFrequencies(ModelBuilder):
111 | """ Get document frequencies for terms in a corpus """
112 | def __init__(self, lowercase=False, max_ngram=1, min_df=2):
113 | self.lowercase = lowercase
114 | self.max_ngram = max_ngram
115 | self.min_df = min_df
116 |
117 | def build(self, docs):
118 | m = docs.map(lambda d: d['text'])
119 | if self.lowercase:
120 | m = m.map(lambda text: text.lower())
121 |
122 | return m\
123 | .flatMap(lambda text: set(ngrams(text, self.max_ngram)))\
124 | .map(lambda t: (t, 1))\
125 | .reduceByKey(add)\
126 | .filter(lambda (k,v): v > self.min_df)
127 |
128 | class TermVocab(TermDocumentFrequencies, Vocab):
129 |     """ Generate unique indexes for terms based on their document frequency ranking. """
130 | def __init__(self, max_rank, min_rank=100, *args, **kwargs):
131 | self.max_rank = max_rank
132 | self.min_rank = min_rank
133 | super(TermVocab, self).__init__(*args, **kwargs)
134 |
135 | def build(self, docs):
136 | m = super(TermVocab, self)\
137 | .build(docs)\
138 | .map(lambda (t, df): (df, t))\
139 | .sortByKey(False)\
140 | .zipWithIndex()\
141 | .map(lambda ((df, t), idx): (t, (df, idx)))
142 |
143 |         if self.min_rank is not None:
144 |             m = m.filter(lambda (t, (df, idx)): idx >= self.min_rank)
145 |         if self.max_rank is not None:
146 |             m = m.filter(lambda (t, (df, idx)): idx < self.max_rank)
147 | return m
148 |
149 | @staticmethod
150 | def format_item((term, (f, idx))):
151 | return {
152 | '_id': term,
153 | 'count': f,
154 | 'rank': idx
155 | }
156 |
157 | class TermIdfs(TermDocumentFrequencies, Model):
158 |     """ Compute inverse document frequency weights for terms in a corpus """
159 | def build(self, corpus):
160 | log.info('Counting documents in corpus...')
161 | N = float(corpus.count())
162 | dfs = super(TermIdfs, self).build(corpus)
163 |
164 | log.info('Building idf model: N=%i', N)
165 | return dfs\
166 |             .map(lambda (term, df): (term, float(df)))\
167 | .mapValues(lambda df: math.log(N/df))
168 |
169 | @staticmethod
170 | def format_item((term, idf)):
171 | return {
172 | '_id': term,
173 | 'idf': idf,
174 | }
175 |
176 | class EntityMentionTermFrequency(ModelBuilder, Model):
177 | """ Compute tf-idf weighted token counts over sentence contexts around links in a corpus """
178 | def __init__(self, max_ngram=1, normalize = True):
179 | self.max_ngram = max_ngram
180 | self.normalize = normalize
181 |
182 | def build(self, mentions, idfs):
183 | m = mentions\
184 |             .map(lambda (target, source, text, span): (target, text))\
185 | .mapValues(lambda v: ngrams(v, self.max_ngram))\
186 | .flatMap(lambda (target, tokens): (((target, t), 1) for t in tokens))\
187 | .reduceByKey(add)\
188 | .map(lambda ((target, token), count): (token, (target, count)))\
189 | .leftOuterJoin(idfs)\
190 |             .filter(lambda (token, ((target, count), idf)): idf is not None)\
191 | .map(lambda (token, ((target, count), idf)): (target, (token, math.sqrt(count)*idf)))\
192 | .groupByKey()
193 |
194 | return m.mapValues(self.normalize_counts if self.normalize else list)
195 |
196 | @staticmethod
197 | def normalize_counts(counts):
198 | norm = numpy.linalg.norm([v for _, v in counts])
199 | return [(k, v/norm) for k, v in counts]
200 |
201 | @staticmethod
202 | def format_item((link, counts)):
203 | return {
204 | '_id': link,
205 | 'counts': dict(counts),
206 | }
207 |
--------------------------------------------------------------------------------
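The text models chain together: Documents feed EntityMentions, document frequencies feed TermIdfs, and EntityMentionTermFrequency joins the two into tf-idf weighted context vectors per entity. A rough sketch of that pipeline, again assuming 'output/docs' holds a pre-built Documents model:

    # illustrative sketch only -- paths and Spark setup are assumptions
    from pyspark import SparkContext
    from sift.dataset import Model
    from sift.models.text import EntityMentions, TermIdfs, EntityMentionTermFrequency

    sc = SparkContext(appName='sift-text-example')
    docs = Model.load(sc, 'output/docs')

    mentions = EntityMentions(lowercase=True).build(docs)  # (target, source, text, span) tuples
    idfs = TermIdfs(lowercase=True, max_ngram=1).build(docs)

    vectors = EntityMentionTermFrequency(max_ngram=1)(mentions, idfs)
    Model.save(vectors, 'output/entity-tfidf')              # {'_id': entity, 'counts': {term: weight}}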
/sift/util.py:
--------------------------------------------------------------------------------
1 | import re
2 | from pattern import en
3 |
4 | # todo: use spacy tokenization
5 | def ngrams(text, max_n=1, min_n=1, strip_punctuation=True):
6 | pattern_args = {} if strip_punctuation else {'punctuation':''}
7 | for i in xrange(min_n-1,max_n):
8 | for n in en.ngrams(text, n=i+1, **pattern_args):
9 | yield ' '.join(n)
10 |
11 |
12 | # sentences can't end with a single lowercase letter
13 | SENT_NO_END_LC = "(?