├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── scripts ├── download-wikipedia └── sift-notebook ├── setup.py ├── sift.ipynb └── sift ├── __init__.py ├── build.py ├── corpora ├── __init__.py ├── commoncrawl.py ├── redirects.py ├── wikicorpus.py ├── wikidata.py └── wikipedia.py ├── dataset.py ├── format.py ├── logging.py ├── models ├── __init__.py ├── embeddings.py ├── links.py └── text.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | output/ 3 | 4 | ########## 5 | # PYTHON # 6 | ########## 7 | # Initialized from github 8 | # https://github.com/github/gitignore/blob/master/Python.gitignore 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | bin/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | ve 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | 47 | # Mr Developer 48 | .mr.developer.cfg 49 | .project 50 | .pydevproject 51 | 52 | # Rope 53 | .ropeproject 54 | 55 | # Django stuff: 56 | *.log 57 | *.pot 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # Package 63 | MANIFEST 64 | 65 | ######### 66 | # EMACS # 67 | ######### 68 | # Initliazed from github 69 | # https://raw2.github.com/github/gitignore/master/Global/Emacs.gitignore 70 | 71 | # -*- mode: gitignore; -*- 72 | *~ 73 | \#*\# 74 | /.emacs.desktop 75 | /.emacs.desktop.lock 76 | *.elc 77 | auto-save-list 78 | tramp 79 | .\#* 80 | 81 | # Org-mode 82 | .org-id-locations 83 | *_archive 84 | 85 | # flymake-mode 86 | *_flymake.* 87 | 88 | # eshell files 89 | /eshell/history 90 | /eshell/lastdir 91 | 92 | # elpa packages 93 | /elpa/ 94 | 95 | ####### 96 | # VIM # 97 | ####### 98 | # Initialized from github 99 | # https://raw2.github.com/github/gitignore/master/Global/vim.gitignore 100 | 101 | [._]*.s[a-w][a-z] 102 | [._]s[a-w][a-z] 103 | *.un~ 104 | Session.vim 105 | .netrwhist 106 | *~ 107 | 108 | ####### 109 | # OSX # 110 | ####### 111 | # Initialized from github 112 | # https://raw2.github.com/github/gitignore/master/Global/OSX.gitignore 113 | 114 | .DS_Store 115 | .AppleDouble 116 | .LSOverride 117 | 118 | # Icon must ends with two \r. 119 | Icon 120 | 121 | # Thumbnails 122 | ._* 123 | 124 | # Files that might appear on external disk 125 | .Spotlight-V100 126 | .Trashes 127 | 128 | # NFS 129 | .nfs* 130 | 131 | # IDE 132 | *.sublime-* 133 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014. Andrew Chisholm, Ben Hachey, The University of Sydney. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | sift - Knowledge extraction from web data 2 | ================================================ 3 | 4 | __sift__ is a toolkit for extracting models of entities and text from a corpus of linked documents. 5 | 6 | 7 | ## What can it do? 8 | 9 | __sift__ is written in Python, runs on Spark and is completely modular. 10 | 11 | Out of the box, you can: 12 | 13 | - Convert Wikipedia articles into JSON objects without all the MediaWiki cruft 14 | - Extract entity relations from Wikidata and align them with Wikipedia mentions 15 | - Extract plain-text content from HTML and WARC-encoded web page crawls 16 | - Model entity popularity, alternative names and relatedness using inlinks 17 | - Preprocess text documents for machine learning pipelines 18 | - Push output into datastores like MongoDB and Redis 19 | 20 | ## Quick Start 21 | 22 | ### Install 23 | ```bash 24 | pip install git+http://git@github.com/wikilinks/sift.git 25 | ``` 26 | 27 | ## Getting Started 28 | 29 | To use sift, you'll need some data. 30 | 31 | If you'd like to use Wikipedia data, sift includes a helper script for downloading the latest dumps. 32 | 33 | Download the latest partitioned Wikipedia dump into the `latest` directory: 34 | ```bash 35 | download-wikipedia latest 36 | ``` 37 | 38 | Once you've got some data, take a look at the sample notebook: [sift.ipynb](sift.ipynb). 39 | 40 | ## Spark 41 | 42 | __sift__ uses Spark to process corpora in parallel. 43 | 44 | If you'd like to make use of an existing Spark cluster, ensure the `SPARK_HOME` environment variable is set. 45 | 46 | If not, that's fine. `sift` will prompt you to download and run Spark locally, utilising multiple cores on your system.
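The bundled notebook walks through a full build, but the core pattern is small: wrap a `SparkContext`, parse the raw dump into a corpus, resolve redirects, then feed the cleaned documents into a model. The sketch below mirrors the calls in [sift.ipynb](sift.ipynb); the dump path is illustrative, so point it at wherever `download-wikipedia` saved the files.

```python
# Minimal sift pipeline, adapted from sift.ipynb.
# Assumes the partitioned dump was fetched with `download-wikipedia latest`;
# the 'latest/' path below is illustrative.
import findspark
findspark.init()
import pyspark

from sift.corpora import wikipedia
from sift.models import links

sc = pyspark.SparkContext()

corpus = wikipedia.WikipediaCorpus()(sc, 'latest/')      # parsed page records
redirects = wikipedia.WikipediaRedirects()(corpus)       # source -> target redirect pairs
docs = wikipedia.WikipediaArticles()(corpus, redirects)  # plain text with resolved links

print(docs.take(1))                                      # documents as JSON-ready dicts
print(links.EntityCounts()(docs).take(10))               # inlink counts per entity
```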
47 | 48 | ## Datasets 49 | 50 | [Web KB](https://github.com/andychisholm/web-kb) datasets built from Common Crawl data are available in a public S3 bucket: [s3.amazonaws.com/webkb](https://s3.amazonaws.com/webkb/) 51 | 52 | - `docs-2017` is built from news articles under the [CC-NEWS](http://commoncrawl.org/2016/10/news-dataset-available/) collection from January to June 2017 ([sample](https://s3.amazonaws.com/webkb/docs-2017/part-00000)) 53 | - `web-201707` is built from a full web crawl for [July 2017](http://commoncrawl.org/2017/07/july-2017-crawl-archive-now-available/) filtered to English-language pages ([sample](https://s3.amazonaws.com/webkb/web-201707/part-00000.gz)) 54 | 55 | The web collection contains plain-text content, entity mentions and endpoint annotations extracted from 1.5 billion documents with over 4 billion web links. 56 | Data is encoded in a simple one-JSON-blob-per-line structure. 57 | 58 | For example, the first document in the collection is an article from 2012 describing an [upcoming tour by Nicki Minaj](http://1019ampradio.cbslocal.com/2012/11/06/nicki-minaj-promises-man-bits-on-her-upcoming-tour/): 59 | 60 | ```json 61 | { 62 | "_id": "http://1019ampradio.cbslocal.com/2012/11/06/nicki-minaj-promises-man-bits-on-her-upcoming-tour/", 63 | "text": "Nicki Minaj has had quite the year. Currently in the U.K. on her Reloaded Tour she sat down with London DJ Tim Westwood and her U.K. Barbz for a Q & A session. While Nicki took questions from both Westwood and her fans one answer in particular caused the room to pay attention...", 64 | "links":[{ 65 | "start": 0, 66 | "endpoint": 0.6358972797, 67 | "stop": 11, 68 | "target": "http://1019ampradio.cbslocal.com/tag/nicki-minaj" 69 | }, { 70 | "start": 145, 71 | "endpoint": 0.2769776554, 72 | "stop": 160, 73 | "target": "http://www.youtube.com/watch?v=vnyuhDBcQo0" 74 | }], 75 | "mentions":[{ 76 | "start": 0, 77 | "stop": 11, 78 | "label": "PERSON" 79 | }, { 80 | "start": 53, 81 | "stop": 57, 82 | "label": "GPE" 83 | }, 84 | // truncated 85 | } 86 | ``` 87 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ujson 2 | numpy 3 | scipy 4 | pattern 5 | gensim 6 | msgpack-python 7 | findspark 8 | jupyter 9 | spacy 10 | lxml 11 | beautifulsoup4 12 | warc 13 | pycld2 14 | dragnet -------------------------------------------------------------------------------- /scripts/download-wikipedia: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | WKDATE=latest 5 | PARA_ARG="" 6 | 7 | if [ $# -gt 0 ] 8 | then 9 | WKDATE=$1 10 | if [ $# == 2 ] 11 | then 12 | PARA_ARG="-P $2" 13 | fi 14 | fi 15 | 16 | if [ "$WKDATE" == "latest" ]; then 17 | export LINK_PFX=/enwiki/latest/ 18 | fi 19 | 20 | export WKDIR=$WKDATE 21 | export WKDATE 22 | rm -rf $WKDIR 23 | mkdir -p $WKDIR 24 | 25 | curl "https://dumps.wikimedia.org/enwiki/$WKDATE/" |\ 26 | grep "enwiki-$WKDATE-pages-articles[0-9]*.xml-p[0-9]*p[0-9]*.bz2\""|\ 27 | awk -v pfx=$LINK_PFX -F'"' '{print "https://dumps.wikimedia.org" pfx $2}' |\ 28 | xargs -n1 $PARA_ARG -L 1 bash -c 'wget $0 -P $WKDIR' 29 | -------------------------------------------------------------------------------- /scripts/sift-notebook: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | SPARK_URL="http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz" 5 | 6 
| if [ -z "$SPARK_HOME" ]; then 7 | echo "SPARK_HOME is unset, using local Spark deployment..." 8 | if [ ! -d "spark" ]; then 9 | read -p "Would you like to download spark and run in standalone mode? " -n 1 -r 10 | if [[ ! $REPLY =~ ^[Yy]$ ]] 11 | then 12 | exit 1 13 | fi 14 | echo 15 | echo "Downloading spark for local standalone deployment..." 16 | mkdir spark 17 | curl $SPARK_URL | tar zx -C spark --strip-components=1 18 | 19 | echo "Updating spark logger config..." 20 | pushd spark/conf > /dev/null 21 | sed -e 's/log4j.rootCategory=INFO/log4j.rootCategory=WARN/' log4j.properties.template > log4j.properties 22 | popd > /dev/null 23 | fi 24 | export SPARK_HOME=$(pwd)/spark 25 | if [ -z "$SPARK_MASTER" ]; then 26 | SPARK_MASTER=local[*] 27 | fi 28 | fi 29 | 30 | if [ ! -z "$VIRTUAL_ENV" ]; then 31 | export PYSPARK_PYTHON=$VIRTUAL_ENV/bin/python 32 | else 33 | export PYSPARK_PYTHON=$(pwd)/ve/bin/python 34 | fi 35 | 36 | SPARK_MASTER_SW= 37 | if [ ! -z "$SPARK_MASTER" ]; then 38 | SPARK_MASTER_SW="--master $SPARK_MASTER" 39 | fi 40 | 41 | PYTHONPATH=$PYTHONPATH:$(pwd) jupyter notebook "$@" 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | __version__ = '0.3.0' 4 | __pkg_name__ = 'textsift' 5 | 6 | setup( 7 | name = __pkg_name__, 8 | version = __version__, 9 | description = 'Text modelling framework', 10 | author='Andrew Chisholm', 11 | packages = find_packages(), 12 | license = 'MIT', 13 | url = 'https://github.com/wikilinks/sift', 14 | scripts = [ 15 | 'scripts/sift-notebook', 16 | 'scripts/download-wikipedia' 17 | ], 18 | classifiers=[ 19 | 'Development Status :: 4 - Beta', 20 | 'Environment :: Console', 21 | 'Intended Audience :: Science/Research', 22 | 'License :: OSI Approved :: MIT License', 23 | 'Programming Language :: Python :: 2.7', 24 | 'Topic :: Text Processing :: Linguistic' 25 | ], 26 | install_requires = [ 27 | "ujson", 28 | "numpy", 29 | "pattern", 30 | "gensim", 31 | "msgpack-python", 32 | "beautifulsoup4", 33 | "spacy", 34 | "warc", 35 | "pycld2", 36 | "scipy", 37 | "scikit-learn" 38 | ], 39 | test_suite = __pkg_name__ + '.test' 40 | ) 41 | -------------------------------------------------------------------------------- /sift.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import findspark\n", 12 | "findspark.init()\n", 13 | "import pyspark\n", 14 | "sc = pyspark.SparkContext()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 36, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from sift.corpora import wikipedia\n", 26 | "from sift.models import text, links" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 5, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "base_path = '/data/wikipedia/20151002/'" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 6, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "corpus = wikipedia.WikipediaCorpus()(sc, base_path + 'dump')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 9, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | 
"outputs": [], 58 | "source": [ 59 | "redirects = wikipedia.WikipediaRedirects()(corpus)\n", 60 | "docs = wikipedia.WikipediaArticles()(corpus, redirects)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 27, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "[{'_id': 'en.wikipedia.org/wiki/Causa_y_Efecto',\n", 74 | " 'links': [{'start': 986,\n", 75 | " 'stop': 999,\n", 76 | " 'target': u'en.wikipedia.org/wiki/New_York_City'},\n", 77 | " {'start': 93, 'stop': 105, 'target': u'en.wikipedia.org/wiki/Studio_album'},\n", 78 | " {'start': 971, 'stop': 982, 'target': u'en.wikipedia.org/wiki/Gotham_Hall'},\n", 79 | " {'start': 2178,\n", 80 | " 'stop': 2192,\n", 81 | " 'target': u'en.wikipedia.org/wiki/Miami,_Florida'},\n", 82 | " {'start': 1791,\n", 83 | " 'stop': 1808,\n", 84 | " 'target': u'en.wikipedia.org/wiki/Latin_Pop_Airplay'},\n", 85 | " {'start': 2702,\n", 86 | " 'stop': 2719,\n", 87 | " 'target': u'en.wikipedia.org/wiki/Latin_Pop_Airplay'},\n", 88 | " {'start': 465,\n", 89 | " 'stop': 484,\n", 90 | " 'target': u'en.wikipedia.org/wiki/Ni_Una_Sola_Palabra'},\n", 91 | " {'start': 2122, 'stop': 2129, 'target': u'en.wikipedia.org/wiki/Austria'},\n", 92 | " {'start': 2740,\n", 93 | " 'stop': 2760,\n", 94 | " 'target': u'en.wikipedia.org/wiki/Latin_Rhythm_Airplay'},\n", 95 | " {'start': 106,\n", 96 | " 'stop': 119,\n", 97 | " 'target': u'en.wikipedia.org/wiki/Gran_City_Pop'},\n", 98 | " {'start': 2388, 'stop': 2397, 'target': u'en.wikipedia.org/wiki/Reggaeton'},\n", 99 | " {'start': 2069,\n", 100 | " 'stop': 2080,\n", 101 | " 'target': u'en.wikipedia.org/wiki/Music_video'},\n", 102 | " {'start': 2530, 'stop': 2534, 'target': u'en.wikipedia.org/wiki/U.S.'},\n", 103 | " {'start': 2573, 'stop': 2577, 'target': u'en.wikipedia.org/wiki/U.S.'},\n", 104 | " {'start': 2612, 'stop': 2616, 'target': u'en.wikipedia.org/wiki/U.S.'},\n", 105 | " {'start': 2649, 'stop': 2653, 'target': u'en.wikipedia.org/wiki/U.S.'},\n", 106 | " {'start': 2685, 'stop': 2689, 'target': u'en.wikipedia.org/wiki/U.S.'},\n", 107 | " {'start': 2723, 'stop': 2727, 'target': u'en.wikipedia.org/wiki/U.S.'},\n", 108 | " {'start': 1509,\n", 109 | " 'stop': 1524,\n", 110 | " 'target': u'en.wikipedia.org/wiki/Hot_Latin_Songs'},\n", 111 | " {'start': 2666,\n", 112 | " 'stop': 2681,\n", 113 | " 'target': u'en.wikipedia.org/wiki/Hot_Latin_Songs'},\n", 114 | " {'start': 1391,\n", 115 | " 'stop': 1400,\n", 116 | " 'target': u'en.wikipedia.org/wiki/Causa_y_Efecto'},\n", 117 | " {'start': 41, 'stop': 54, 'target': u'en.wikipedia.org/wiki/Paulina_Rubio'},\n", 118 | " {'start': 821,\n", 119 | " 'stop': 854,\n", 120 | " 'target': u'en.wikipedia.org/wiki/2009_Latin_Billboard_Music_Awards'},\n", 121 | " {'start': 2402,\n", 122 | " 'stop': 2415,\n", 123 | " 'target': u'en.wikipedia.org/wiki/Angel_&_Khriz'},\n", 124 | " {'start': 2775,\n", 125 | " 'stop': 2827,\n", 126 | " 'target': u'en.wikipedia.org/wiki/List_of_number-one_Billboard_Hot_Latin_Songs_of_2009'},\n", 127 | " {'start': 2547,\n", 128 | " 'stop': 2569,\n", 129 | " 'target': u'en.wikipedia.org/wiki/Bubbling_Under_Hot_100'},\n", 130 | " {'start': 2536,\n", 131 | " 'stop': 2545,\n", 132 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n", 133 | " {'start': 2579,\n", 134 | " 'stop': 2588,\n", 135 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n", 136 | " {'start': 2618,\n", 137 | " 'stop': 2627,\n", 138 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n", 139 | " 
{'start': 2655,\n", 140 | " 'stop': 2664,\n", 141 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n", 142 | " {'start': 2691,\n", 143 | " 'stop': 2700,\n", 144 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n", 145 | " {'start': 2729,\n", 146 | " 'stop': 2738,\n", 147 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n", 148 | " {'start': 2304, 'stop': 2307, 'target': u'en.wikipedia.org/wiki/MTV'}],\n", 149 | " 'text': u'\"\" is a song performed by Mexican singer Paulina Rubio. The song was recorded for her ninth studio album Gran City Pop, and was released as the lead single on March 30, 2009. Causa y Efecto became a hit reaching number 1 in the U.S. Billboard Hot Latin Songs and Hot Latin Airplays. Causa y Efecto was produced by Cachorro L\\xf3pez and written by Mario Domm and M\\xf3nica V\\xe9lez. \"Causa y Efecto\" is Rubio\\'s first number one single in the Billboard Hot Latin Songs since Ni Una Sola Palabra in 2006. \"Causa y Efecto\" was awarded \"Song of the year pop/ballad\" by ASCAP.\\n\\nAn English version of the song titled \"Cause and Effect\" will be released on Paulina\\'s next bilingual album.\\n\\nRelease and promotion\\n\"Causa y Efecto\", debuted in radio stations on March 30. The song was performed for the first time on April 23, 2009 at the Latin Billboard Music Awards 2009, Rubio was one of the most expected artists of the night. Rubio also performed the song in a private concert at the Gotham Hall in New York City on May 11 promoting Gran City Pop. The concert was presented by Univision Radio.\\n\\nRubio performed the song at the Wal-Mart Shareholders\\' Meeting, singing a \"spanglish\" version of the song. She performed parts of the Spanish version and others of an unreleased English version. An English version of the song was released on a remix of the song, the George Figares Radio Mix.\\n\\nTrack listing\\n*CD Single\\n# \"Causa y Efecto\" Album version - 3:27\\n\\nChart performance\\nThe song debuted at number 40 on the \"Billboard\" Hot Latin Songs, the next week the song jumped at #26, obtaining the highest \\'jump\\' of the week on the chart. The song peaked #1 for five consecutive weeks and it\\'s her fourth #1 and her twelfth top ten hit on Hot Latin songs. It debuted at #22 and peaked at #1 on the \"Billboard\" Latin Pop Airplay.\\n\\nIn Spain, the single debuted at #43, and has peaked at #7 based on downloads alone. On May 9, 2009, \"Causa y Efecto\" entered at the Spanish Airplay Chart at #7 as the highest debut of that week, and peaked at #1 for three consecutive weeks.\\n\\nMusic video\\nThe music video for \"Causa y Efecto\" was directed by the Austrian director Rudi Dolezal. The video was filmed in Miami, Florida during the month of March at M3 Studios. The video premiered worldwide on May 7 and in the U.S on the channel MTV Tres.\\n\\nRemix\\nAn official remix of the song was released on June 12. It features reggaeton duo Angel & Khriz.\\n\\nCharts\\n\\nChart (2009)\\nPeakposition\\n\\nMexico (Monitor Latino)\\n1\\n\\nSpanish Airplay Chart\\n1\\n\\nSpanish Singles Chart\\n7\\n\\nU.S. \"Billboard\" Bubbling Under Hot 100\\n4\\n\\nU.S. \"Billboard\" Heatseeker Songs \\n23\\n\\nU.S. \"Billboard\" Tropical Songs \\n29\\n\\nU.S. \"Billboard\" Hot Latin Songs\\n1\\n\\nU.S. \"Billboard\" Latin Pop Airplay\\n1\\n\\nU.S. 
\"Billboard\" Latin Rhythm Airplay\\n6\\n\\n\\nSee also\\n*List of number-one Billboard Hot Latin Songs of 2009\\n\\nSales and certifications\\n\\n\\n Country\\n Certification\\n Sales\\n\\n Spain\\n Platinum\\n 40,000\\n\\n\\nReferences'}]" 150 | ] 151 | }, 152 | "execution_count": 27, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "docs.take(1)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 25, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "vocab = text.TermVocab(max_rank=100000,min_rank=0,lowercase=True,min_df=5)(docs.sample(False, 0.25))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 26, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "[{'_id': u'the', 'count': 1172125, 'rank': 0},\n", 183 | " {'_id': u'in', 'count': 1135557, 'rank': 1},\n", 184 | " {'_id': u'a', 'count': 1127366, 'rank': 2},\n", 185 | " {'_id': u'of', 'count': 1101586, 'rank': 3},\n", 186 | " {'_id': u'and', 'count': 1008637, 'rank': 4},\n", 187 | " {'_id': u'is', 'count': 997703, 'rank': 5},\n", 188 | " {'_id': u'references', 'count': 958549, 'rank': 6},\n", 189 | " {'_id': u'to', 'count': 889253, 'rank': 7},\n", 190 | " {'_id': u'was', 'count': 804122, 'rank': 8},\n", 191 | " {'_id': u'for', 'count': 725355, 'rank': 9}]" 192 | ] 193 | }, 194 | "execution_count": 26, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "vocab.take(10)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 34, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/plain": [ 213 | "[{'_id': u'en.wikipedia.org/wiki/New_York_City',\n", 214 | " 'source': 'en.wikipedia.org/wiki/Causa_y_Efecto',\n", 215 | " 'span': (73, 86),\n", 216 | " 'text': u'rubio also performed the song in a private concert at the gotham hall in new york city on may 11 promoting gran city pop.'}]" 217 | ] 218 | }, 219 | "execution_count": 34, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "text.EntityMentions(sentence_window=1,lowercase=True)(docs).take(1)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 31, 231 | "metadata": { 232 | "collapsed": true 233 | }, 234 | "outputs": [], 235 | "source": [ 236 | "mentions = text.IndexMappedMentions(sentence_window=1,lowercase=True)(sc, docs, vocab)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 32, 242 | "metadata": { 243 | "collapsed": false 244 | }, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/plain": [ 249 | "[{'_id': u'en.wikipedia.org/wiki/New_York_City',\n", 250 | " 'source': 'en.wikipedia.org/wiki/Causa_y_Efecto',\n", 251 | " 'span': (14, 17),\n", 252 | " 'text': [25961,\n", 253 | " 18,\n", 254 | " 686,\n", 255 | " 0,\n", 256 | " 447,\n", 257 | " 1,\n", 258 | " 2,\n", 259 | " 574,\n", 260 | " 2057,\n", 261 | " 13,\n", 262 | " 0,\n", 263 | " 21394,\n", 264 | " 518,\n", 265 | " 1,\n", 266 | " 35,\n", 267 | " 227,\n", 268 | " 98,\n", 269 | " 11,\n", 270 | " 46,\n", 271 | " 205,\n", 272 | " 3585,\n", 273 | " 9860,\n", 274 | " 98,\n", 275 | " 1770]}]" 276 | ] 277 | }, 278 | "execution_count": 32, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "mentions.take(1)" 285 | ] 286 | }, 287 | { 
288 | "cell_type": "code", 289 | "execution_count": 42, 290 | "metadata": { 291 | "collapsed": false 292 | }, 293 | "outputs": [ 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "[{'_id': u'', 'count': 4},\n", 298 | " {'_id': u'www.rsssf.com/tabless/slow97.html', 'count': 2},\n", 299 | " {'_id': u'en.wikipedia.org/wiki/Yuba,_Michigan', 'count': 3},\n", 300 | " {'_id': u'en.wikipedia.org/wiki/Walnut_River_(Kansas)', 'count': 12},\n", 301 | " {'_id': u'www.ctheritage.org/encyclopedia/ct1865_1929/admin_baldwin.htm',\n", 302 | " 'count': 2},\n", 303 | " {'_id': u'en.wikipedia.org/wiki/Falling_factorial', 'count': 28},\n", 304 | " {'_id': u'en.wikipedia.org/wiki/WOW_Worship:_Blue', 'count': 5},\n", 305 | " {'_id': u'en.wikipedia.org/wiki/Ekhane_Pinjar', 'count': 2},\n", 306 | " {'_id': u'en.wikipedia.org/wiki/Conditional_execution', 'count': 2},\n", 307 | " {'_id': u'en.wikipedia.org/wiki/Paralititan', 'count': 27}]" 308 | ] 309 | }, 310 | "execution_count": 42, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "links.EntityCounts()(docs).take(10)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 49, 322 | "metadata": { 323 | "collapsed": false 324 | }, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/plain": [ 329 | "[{'_id': u'XMT',\n", 330 | " 'counts': {u'en.wikipedia.org/wiki/Cray_XMT': 1,\n", 331 | " u'en.wikipedia.org/wiki/Extensible_MPEG-4_Textual_Format': 1,\n", 332 | " u'en.wikipedia.org/wiki/XMT': 1},\n", 333 | " 'total': 3},\n", 334 | " {'_id': u'New York',\n", 335 | " 'counts': {u'en.wikipedia.org/wiki/New_York_(disambiguation)': 1,\n", 336 | " u'en.wikipedia.org/wiki/New_York_Disability_Benefits_Law': 1,\n", 337 | " u'en.wikipedia.org/wiki/New_York_State_Armory_(Poughkeepsie)': 1,\n", 338 | " u'en.wikipedia.org/wiki/Outline_of_New_York': 1,\n", 339 | " u'en.wikipedia.org/wiki/Vehicle_registration_plates_of_New_York': 1},\n", 340 | " 'total': 5},\n", 341 | " {'_id': u'Albert Lewis',\n", 342 | " 'counts': {u'en.wikipedia.org/wiki/Albert_Gerald_Lewis': 1,\n", 343 | " u'en.wikipedia.org/wiki/Albert_Lewis': 1,\n", 344 | " u'en.wikipedia.org/wiki/Albert_Lewis_(American_football)': 27,\n", 345 | " u'en.wikipedia.org/wiki/Albert_Lewis_(Sheffield_United)': 1,\n", 346 | " u'en.wikipedia.org/wiki/Albert_Lewis_(footballer)': 3,\n", 347 | " u'en.wikipedia.org/wiki/Albert_Lewis_(priest)': 2,\n", 348 | " u'en.wikipedia.org/wiki/Albert_Lewis_(producer)': 5,\n", 349 | " u'en.wikipedia.org/wiki/Talbot_Lewis': 1},\n", 350 | " 'total': 41},\n", 351 | " {'_id': u'WFA website',\n", 352 | " 'counts': {u'wfafootball.com': 1,\n", 353 | " u'www.wfafootball.com': 1,\n", 354 | " u'www.wfafootball.com/': 2},\n", 355 | " 'total': 4},\n", 356 | " {'_id': u'Marlboro British F3 Championship round 3',\n", 357 | " 'counts': {u'en.wikipedia.org/wiki/1981_Marlboro_British_F3_Championship,_Rd.3': 1,\n", 358 | " u'en.wikipedia.org/wiki/1982_Marlboro_British_F3_Championship,_Rd.3': 1,\n", 359 | " u'en.wikipedia.org/wiki/1983_Marlboro_British_F3_Championship,_Rd.3': 1,\n", 360 | " u'en.wikipedia.org/wiki/1984_Marlboro_British_F3_Championship,_Rd.3': 1,\n", 361 | " u'en.wikipedia.org/wiki/1985_Marlboro_British_F3_Championship,_Rd.3': 1},\n", 362 | " 'total': 5}]" 363 | ] 364 | }, 365 | "execution_count": 49, 366 | "metadata": {}, 367 | "output_type": "execute_result" 368 | } 369 | ], 370 | "source": [ 371 | "links\\\n", 372 | " .EntityNameCounts()(docs)\\\n", 373 | " .filter(lambda r: len(r['counts']) >= 3)\\\n", 374 | " .take(5)" 375 
| ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "collapsed": true 382 | }, 383 | "outputs": [], 384 | "source": [] 385 | } 386 | ], 387 | "metadata": { 388 | "kernelspec": { 389 | "display_name": "Python 2", 390 | "language": "python", 391 | "name": "python2" 392 | }, 393 | "language_info": { 394 | "codemirror_mode": { 395 | "name": "ipython", 396 | "version": 2 397 | }, 398 | "file_extension": ".py", 399 | "mimetype": "text/x-python", 400 | "name": "python", 401 | "nbconvert_exporter": "python", 402 | "pygments_lexer": "ipython2", 403 | "version": "2.7.6" 404 | } 405 | }, 406 | "nbformat": 4, 407 | "nbformat_minor": 0 408 | } 409 | -------------------------------------------------------------------------------- /sift/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.2.0' -------------------------------------------------------------------------------- /sift/build.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import shutil 4 | import textwrap 5 | import argparse 6 | import ujson as json 7 | 8 | from pyspark import SparkContext, SparkConf 9 | from sift.format import ModelFormat 10 | 11 | import logging 12 | log = logging.getLogger() 13 | 14 | class DatasetBuilder(object): 15 | """ Wrapper for modules which extract models of entities or text from a corpus of linked documents """ 16 | def __init__(self, **kwargs): 17 | self.output_path = kwargs.pop('output_path') 18 | self.sample = kwargs.pop('sample') 19 | 20 | fmtcls = kwargs.pop('fmtcls') 21 | fmt_args = {p:kwargs[p] for p in fmtcls.__init__.__code__.co_varnames if p in kwargs} 22 | self.formatter = fmtcls(**fmt_args) 23 | 24 | modelcls = kwargs.pop('modelcls') 25 | self.model_name = re.sub('([A-Z])', r' \1', modelcls.__name__).strip() 26 | 27 | log.info("Building %s...", self.model_name) 28 | self.model = modelcls(**kwargs) 29 | 30 | def __call__(self): 31 | c = SparkConf().setAppName('Build %s' % self.model_name) 32 | 33 | log.info('Using spark master: %s', c.get('spark.master')) 34 | sc = SparkContext(conf=c) 35 | 36 | kwargs = self.model.prepare(sc) 37 | m = self.model.build(**kwargs) 38 | m = self.model.format_items(m) 39 | m = self.formatter(m) 40 | 41 | if self.output_path: 42 | log.info("Saving to: %s", self.output_path) 43 | if os.path.isdir(self.output_path): 44 | log.warn('Writing over output path: %s', self.output_path) 45 | shutil.rmtree(self.output_path) 46 | m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec') 47 | elif self.sample > 0: 48 | print '\n'.join(str(i) for i in m.take(self.sample)) 49 | 50 | log.info('Done.') 51 | 52 | @classmethod 53 | def add_arguments(cls, p): 54 | p.add_argument('--save', dest='output_path', required=False, default=None, metavar='OUTPUT_PATH') 55 | p.add_argument('--sample', dest='sample', required=False, default=1, type=int, metavar='NUM_SAMPLES') 56 | p.set_defaults(cls=cls) 57 | 58 | sp = p.add_subparsers() 59 | for modelcls in cls.providers(): 60 | name = modelcls.__name__ 61 | help_str = modelcls.__doc__.split('\n')[0] 62 | desc = textwrap.dedent(modelcls.__doc__.rstrip()) 63 | csp = sp.add_parser(name, 64 | help=help_str, 65 | description=desc, 66 | formatter_class=argparse.RawDescriptionHelpFormatter) 67 | modelcls.add_arguments(csp) 68 | cls.add_formatter_arguments(csp) 69 | 70 | return p 71 | 72 | @classmethod 73 | def add_formatter_arguments(cls, p): 74 | sp = 
p.add_subparsers() 75 | for fmtcls in ModelFormat.iter_options(): 76 | name = fmtcls.__name__.lower() 77 | if name.endswith('format'): 78 | name = name[:-len('format')] 79 | help_str = fmtcls.__doc__.split('\n')[0] 80 | desc = textwrap.dedent(fmtcls.__doc__.rstrip()) 81 | csp = sp.add_parser(name, 82 | help=help_str, 83 | description=desc, 84 | formatter_class=argparse.RawDescriptionHelpFormatter) 85 | fmtcls.add_arguments(csp) 86 | return p 87 | -------------------------------------------------------------------------------- /sift/corpora/__init__.py: -------------------------------------------------------------------------------- 1 | class Corpus(object): 2 | pass 3 | -------------------------------------------------------------------------------- /sift/corpora/commoncrawl.py: -------------------------------------------------------------------------------- 1 | import re 2 | from cStringIO import StringIO 3 | from warc import WARCFile 4 | from dragnet import content_extractor, BlockifyError 5 | from lxml import etree 6 | from bs4 import BeautifulSoup 7 | from sift.dataset import ModelBuilder, Model, Documents 8 | from sift import logging 9 | import pycld2 as cld 10 | from pycld2 import error as cld_error 11 | 12 | LINKS_RE = re.compile(r'(.+?)') 13 | 14 | class WARCCorpus(ModelBuilder, Model): 15 | def __init__(self, language=None): 16 | self.language = language 17 | 18 | @staticmethod 19 | def parse_warc_content(buf): 20 | try: 21 | wf = WARCFile(fileobj=StringIO(buf)) 22 | record = wf.read_record() 23 | payload = record.payload.read() 24 | top = payload[:15] 25 | 26 | if top.startswith('HTTP/') and top.endswith('200 OK'): 27 | content_start = payload.find('\r\n\r\n') 28 | if content_start != -1: 29 | yield record.url, payload[content_start+4:] 30 | except IOError: 31 | pass 32 | 33 | @staticmethod 34 | def try_get_lang(content): 35 | try: 36 | reliable, _, details = cld.detect(content) 37 | if reliable: 38 | return details[0][1] 39 | except cld_error: 40 | pass 41 | return None 42 | 43 | def build(self, sc, path): 44 | PAGE_DELIMITER = "WARC/1.0\r\n" 45 | warcs = sc\ 46 | .newAPIHadoopFile( 47 | path, 48 | "org.apache.hadoop.mapreduce.lib.input.TextInputFormat", 49 | "org.apache.hadoop.io.LongWritable", 50 | "org.apache.hadoop.io.Text", 51 | conf = { "textinputformat.record.delimiter": PAGE_DELIMITER })\ 52 | .filter(lambda (_, part): part)\ 53 | .map(lambda (_, part): PAGE_DELIMITER+part.encode('utf-8'))\ 54 | .flatMap(self.parse_warc_content) 55 | 56 | if self.language != None: 57 | warcs = warcs.filter(lambda (url, content): self.try_get_lang(content) == self.language) 58 | return warcs 59 | 60 | @staticmethod 61 | def format_item((url, content)): 62 | return { 63 | '_id': url, 64 | 'content': content, 65 | } 66 | 67 | class CommonCrawlArticles(ModelBuilder, Documents): 68 | THRESHOLD_CONTENT_SZ = 250000 69 | 70 | @staticmethod 71 | def clean_content((url, content)): 72 | try: 73 | blocks = content_extractor.analyze(content, blocks=True) 74 | content = ''.join(etree.tostring(b.features['block_start_element']) for b in blocks) 75 | if len(content) < CommonCrawlArticles.THRESHOLD_CONTENT_SZ: 76 | yield url, content 77 | except (BlockifyError, etree.SerialisationError): 78 | pass 79 | 80 | @staticmethod 81 | def parse_article(content): 82 | soup = BeautifulSoup(content, 'lxml') 83 | 84 | for tag in soup.find_all(): 85 | if tag.name == 'a' and tag.attrs.get('href') and tag.text.strip(): 86 | tag.attrs = {'href': tag.attrs['href']} 87 | else: 88 | tag.unwrap() 89 | 90 | return 
soup.encode_contents().decode('utf-8').strip() 91 | 92 | @staticmethod 93 | def extract_links(content): 94 | links = [] 95 | offset = 0 96 | for match in LINKS_RE.finditer(content): 97 | target = match.group(1) 98 | anchor = match.group(2) 99 | start = match.start() - offset 100 | offset += len(match.group())-len(anchor) 101 | links.append((target, slice(start, start+len(anchor)))) 102 | 103 | return LINKS_RE.sub(r'\2', content), links 104 | 105 | def build(self, corpus): 106 | return corpus\ 107 | .map(lambda item: (item['_id'], item['content']))\ 108 | .flatMap(self.clean_content)\ 109 | .mapValues(self.parse_article)\ 110 | .mapValues(self.extract_links) 111 | -------------------------------------------------------------------------------- /sift/corpora/redirects.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import ujson as json 3 | 4 | from sift.dataset import Model, DocumentModel 5 | from sift.util import trim_link_protocol, iter_sent_spans, ngrams 6 | 7 | from sift import logging 8 | log = logging.getLogger() 9 | 10 | class MapRedirects(Model): 11 | """ Map redirects """ 12 | def __init__(self, *args, **kwargs): 13 | self.from_path = kwargs.pop('from_path') 14 | self.to_path = kwargs.pop('to_path') 15 | 16 | def prepare(self, sc): 17 | return { 18 | "from_rds": self.load(sc, self.from_path).cache(), 19 | "to_rds": self.load(sc, self.to_path).cache() 20 | } 21 | 22 | @staticmethod 23 | def map_redirects(source, target): 24 | return source\ 25 | .map(lambda (s, t): (t, s))\ 26 | .leftOuterJoin(target)\ 27 | .map(lambda (t, (s, r)): (s, r or t))\ 28 | .distinct() 29 | 30 | def build(self, from_rds, to_rds): 31 | # map source of destination kb 32 | # e.g. (a > b) and (a > c) becomes (b > c) 33 | mapped_to = to_rds\ 34 | .leftOuterJoin(from_rds)\ 35 | .map(lambda (s, (t, f)): (f or s, t))\ 36 | 37 | # map target of origin kb 38 | # e.g. 
(a > b) and (b > c) becomes (a > c) 39 | mapped_from = from_rds\ 40 | .map(lambda (s, t): (t, s))\ 41 | .leftOuterJoin(mapped_to)\ 42 | .map(lambda (t, (s, r)): (s, r))\ 43 | .filter(lambda (s, t): t) 44 | 45 | rds = (mapped_from + mapped_to).distinct() 46 | rds.cache() 47 | 48 | log.info('Resolving transitive mappings over %i redirects...', rds.count()) 49 | rds = self.map_redirects(rds, rds) 50 | 51 | log.info('Resolved %i redirects...', rds.count()) 52 | return rds 53 | 54 | @staticmethod 55 | def load(sc, path, fmt=json): 56 | log.info('Using redirects: %s', path) 57 | return sc\ 58 | .textFile(path)\ 59 | .map(fmt.loads)\ 60 | .map(lambda r: (r['_id'], r['target'])) 61 | 62 | def format_items(self, model): 63 | return model\ 64 | .map(lambda (source, target): { 65 | '_id': source, 66 | 'target': target 67 | }) 68 | 69 | @classmethod 70 | def add_arguments(cls, p): 71 | super(MapRedirects, cls).add_arguments(p) 72 | p.add_argument('from_path', metavar='FROM_REDIRECTS_PATH') 73 | p.add_argument('to_path', metavar='TO_REDIRECTS_PATH') 74 | return p 75 | 76 | class RedirectDocuments(DocumentModel): 77 | """ Map links in a corpus via a set of redirects """ 78 | def __init__(self, **kwargs): 79 | self.redirect_path = kwargs.pop('redirects_path') 80 | super(RedirectDocuments, self).__init__(**kwargs) 81 | 82 | def prepare(self, sc): 83 | params = super(RedirectDocuments, self).prepare(sc) 84 | params['redirects'] = self.load(sc, self.redirect_path).cache() 85 | return params 86 | 87 | def build(self, corpus, redirects): 88 | articles = corpus.map(lambda d: (d['_id'], d)) 89 | 90 | def map_doc_links(doc, rds): 91 | for l in doc['links']: 92 | l['target'] = rds[l['target']] 93 | return doc 94 | 95 | return corpus\ 96 | .map(lambda d: (d['_id'], set(l['target'] for l in d['links'])))\ 97 | .flatMap(lambda (pid, links): [(t, pid) for t in links])\ 98 | .leftOuterJoin(redirects)\ 99 | .map(lambda (t, (pid, r)): (pid, (t, r if r else t)))\ 100 | .groupByKey()\ 101 | .mapValues(dict)\ 102 | .join(articles)\ 103 | .map(lambda (pid, (rds, doc)): map_doc_links(doc, rds)) 104 | 105 | def format_items(self, model): 106 | return model 107 | 108 | @classmethod 109 | def add_arguments(cls, p): 110 | super(RedirectDocuments, cls).add_arguments(p) 111 | p.add_argument('redirects_path', metavar='REDIRECTS_PATH') 112 | return p 113 | -------------------------------------------------------------------------------- /sift/corpora/wikicorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Based on wikicorpus.py from Gensim: 5 | # https://github.com/piskvorky/gensim/blob/develop/gensim/corpora/wikicorpus.py 6 | # Credits: 7 | # Radim Rehurek 8 | # Lars Buitinck 9 | 10 | import re 11 | import xml.etree.cElementTree as ET 12 | 13 | from htmlentitydefs import name2codepoint 14 | 15 | wikilink_prefix = 'en.wikipedia.org/wiki/' 16 | 17 | RE_P0 = re.compile('', re.DOTALL | re.UNICODE) # comments 18 | RE_P1 = re.compile(' ].*?)(|/>)', re.DOTALL | re.UNICODE) # footnotes 19 | RE_P2 = re.compile("(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", re.UNICODE) # links to languages 20 | RE_P3 = re.compile("{{([^}{]*)}}", re.DOTALL | re.UNICODE) # template 21 | RE_P4 = re.compile("{{([^}]*)}}", re.DOTALL | re.UNICODE) # template 22 | RE_P5 = re.compile('\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description 23 | RE_P6 = re.compile("\[\[:?([^][]*)\|([^][]*)\]\]", re.DOTALL | re.UNICODE) # simplify links, keep 
description 24 | RE_P6_ex = re.compile("\[\[:?([^][]*)\]\]", re.DOTALL | re.UNICODE) # links without description 25 | RE_P7 = re.compile('\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images 26 | RE_P8 = re.compile('\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files 27 | RE_P9 = re.compile(' ].*?)(|/>)', re.DOTALL | re.UNICODE) # outside links 28 | RE_P10 = re.compile(' ].*?)(|/>)', re.DOTALL | re.UNICODE) # math content 29 | RE_P11 = re.compile('<(.*?)>', re.DOTALL | re.UNICODE) # all other tags 30 | RE_P12 = re.compile('\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting 31 | RE_P13 = re.compile('\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting 32 | RE_P14 = re.compile('\[\[Category:[^][]*\]\]', re.UNICODE) # categories 33 | RE_P15 = re.compile('\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) 34 | 35 | RE_BI = re.compile(r"'''''([^']*?)'''''") 36 | RE_B = re.compile(r"'''(.*?)'''") 37 | RE_IQ = re.compile(r"''\"(.*?)\"''") 38 | RE_I = re.compile(r"''([^']*)''") 39 | RE_QQ = re.compile(r'""(.*?)""') 40 | RE_SECT = re.compile(r'(==+)\s*(.*?)\s*\1') 41 | RE_EMPTY_PARENS = re.compile(r' \(\s*\)') 42 | 43 | RE_HTML_ENT = re.compile("&#?(\w+);") 44 | 45 | def remove_markup((uri, text)): 46 | text = re.sub(RE_P2, "", text) 47 | 48 | # TODO: may be desirable to extract captions for files and images and insert them back into the document 49 | text = remove_template(text) 50 | text = extract_tag_content(text, [ 51 | re.compile('\[\[[fF]ile:(.*?)(\|[^\]\[]+?)*\|'), 52 | re.compile('\[\[[iI]mage:(.*?)(\|[^\]\[]+?)*\|') 53 | ]) 54 | 55 | # the wiki markup is recursive (markup inside markup etc) we deal with that by removing 56 | # markup in a loop, starting with inner-most expressions and working outwards as long as something changes. 
57 | iters = 0 58 | while True: 59 | old, iters = text, iters + 1 60 | text = re.sub(RE_P0, "", text) # remove comments 61 | text = re.sub(RE_P1, '', text) # remove footnotes 62 | text = re.sub(RE_P9, "", text) # remove outside links 63 | text = re.sub(RE_P10, "", text) # remove math content 64 | if iters == 1: 65 | text = re.sub(RE_P11, "", text) # remove all remaining tags 66 | 67 | # todo: extract sections 68 | text = re.sub(RE_SECT, '\\2', text) 69 | 70 | # inject link from the first bolded phrase as a mention of the article entity 71 | # this heuristic holds for the vast majority of pages and is a wiki standard 72 | text = re.sub(RE_B, '\\1' % uri, text, 1) 73 | 74 | text = re.sub(RE_P14, '', text) # remove categories 75 | 76 | # inject links 77 | text = re.sub(RE_P5, '\\3', text) # remove urls, keep description 78 | text = re.sub(RE_P6, '\\2' % wikilink_prefix, text) # simplify links, keep description only 79 | text = re.sub(RE_P6_ex, '\\1' % wikilink_prefix, text) 80 | # remove table markup 81 | text = text.replace('||', '\n|') # each table cell on a separate line 82 | text = re.sub(RE_P12, '\n', text) # remove formatting lines 83 | text = re.sub(RE_P13, '\n\\3', text) # leave only cell content 84 | # remove empty mark-up 85 | text = text.replace('[]', '') 86 | 87 | # formatting 88 | text = re.sub(RE_BI, r"\1", text) 89 | text = re.sub(RE_B, r"\1", text) 90 | text = re.sub(RE_IQ, r'"\1"', text) 91 | text = re.sub(RE_I, r'"\1"', text) 92 | text = re.sub(RE_QQ, r"\1", text) 93 | 94 | if old == text or iters > 2: # stop if nothing changed between two iterations or after a fixed number of iterations 95 | break 96 | 97 | text = re.sub(RE_EMPTY_PARENS, '', text) # remove empty parenthesis (usually left by stripped templates) 98 | text = text.replace('[', '').replace(']', '') # promote all remaining markup to plain text 99 | text = html_unescape(text.strip()) 100 | return (uri, text) 101 | 102 | def remove_template(s): 103 | # Find the start and end position of each template by finding the opening '{{' and closing '}}' 104 | n_open, n_close = 0, 0 105 | starts, ends = [], [] 106 | in_template = False 107 | prev_c = None 108 | for i, c in enumerate(iter(s)): 109 | if not in_template: 110 | if c == '{' and c == prev_c: 111 | starts.append(i - 1) 112 | in_template = True 113 | n_open = 1 114 | if in_template: 115 | if c == '{': 116 | n_open += 1 117 | elif c == '}': 118 | n_close += 1 119 | if n_open == n_close: 120 | ends.append(i) 121 | in_template = False 122 | n_open, n_close = 0, 0 123 | prev_c = c 124 | 125 | # Remove all the templates 126 | s = ''.join([s[end + 1:start] for start, end in 127 | zip(starts + [None], [-1] + ends)]) 128 | 129 | return s 130 | 131 | def extract_tag_content(s, tags, include_content=True): 132 | s = s.replace(u'\u2502','|') 133 | for t in tags: 134 | parts = [] 135 | last_match_end = None 136 | for match in t.finditer(s): 137 | parts.append(slice(last_match_end,match.start())) 138 | 139 | i = match.end() 140 | while True: 141 | next_open = s.find('[[', i) 142 | next_close = s.find(']]', i) 143 | if next_open == -1 or next_open > next_close: 144 | last_match_end = next_close 145 | break 146 | elif next_close == -1: 147 | # unbalanced tags in wikimarkup, bail! 
148 | last_match_end = i 149 | break 150 | i = next_close+2 151 | if include_content and match.end() != last_match_end: 152 | content = s[match.end():last_match_end].strip('] ') 153 | if content: 154 | parts.append(slice(match.end(),last_match_end)) 155 | if not content.endswith('.'): 156 | parts.append('.') 157 | last_match_end += 2 158 | parts.append(slice(last_match_end,None)) 159 | s = ''.join(s[p] if type(p) is slice else p for p in parts) 160 | 161 | return s 162 | 163 | def html_unescape(text): 164 | def replace(m): 165 | span, code = m.group(0), m.group(1) 166 | try: 167 | if span[1] == "#": 168 | return unichr(int(code[1:], 16)) if span[2] == "x" else unichr(int(code)) 169 | else: 170 | return unichr(name2codepoint[code]) 171 | except: 172 | return span 173 | return re.sub(RE_HTML_ENT, replace, text) 174 | 175 | def extract_page(content): 176 | e = ET.fromstring(content.encode('utf-8')) 177 | 178 | title = e.find('title').text 179 | ns = e.find('ns').text 180 | pageid = int(e.find('id').text) 181 | redirect_elem = e.find('redirect') 182 | redirect = None if redirect_elem == None else redirect_elem.attrib['title'] 183 | content = None if redirect != None else e.find('revision/text').text 184 | uri = wikilink_prefix+title.replace(' ', '_') 185 | 186 | return uri, ns, pageid, redirect, content 187 | 188 | def normalise_wikilink(s): 189 | s = s.replace(' ', '_').strip('_').strip() 190 | if s and s[0].islower(): 191 | s = s[0].upper() + s[1:] 192 | return s 193 | 194 | def normalise_link(s): 195 | if s.startswith(wikilink_prefix): 196 | s = wikilink_prefix + normalise_wikilink(s[len(wikilink_prefix):]) 197 | return s 198 | 199 | def extract_links(content): 200 | links_re = re.compile(r'(.+?)') 201 | 202 | links = [] 203 | offset = 0 204 | for match in list(links_re.finditer(content)): 205 | target = match.group(1) 206 | anchor = match.group(2) 207 | start = match.start() - offset 208 | offset += len(match.group())-len(anchor) 209 | links.append((normalise_link(target), slice(start, start+len(anchor)))) 210 | 211 | return links_re.sub(r'\2', content), links 212 | -------------------------------------------------------------------------------- /sift/corpora/wikidata.py: -------------------------------------------------------------------------------- 1 | import ujson as json 2 | 3 | from sift.corpora import wikicorpus 4 | from sift.dataset import ModelBuilder, Model, Relations 5 | 6 | from sift import logging 7 | log = logging.getLogger() 8 | 9 | ENTITY_PREFIX = 'Q' 10 | PREDICATE_PREFIX = 'P' 11 | 12 | class WikidataCorpus(ModelBuilder, Model): 13 | @staticmethod 14 | def iter_item_for_line(line): 15 | line = line.strip() 16 | if line != '[' and line != ']': 17 | yield json.loads(line.rstrip(',\n')) 18 | 19 | def build(self, sc, path): 20 | return sc\ 21 | .textFile(path)\ 22 | .flatMap(self.iter_item_for_line)\ 23 | .map(lambda i: (i['id'], i)) 24 | 25 | @staticmethod 26 | def format_item((wid, item)): 27 | return { 28 | '_id': wid, 29 | 'data': item 30 | } 31 | 32 | class WikidataRelations(ModelBuilder, Relations): 33 | """ Prepare a corpus of relations from wikidata """ 34 | @staticmethod 35 | def iter_relations_for_item(item): 36 | for pid, statements in item.get('claims', {}).iteritems(): 37 | for statement in statements: 38 | if statement['mainsnak'].get('snaktype') == 'value': 39 | datatype = statement['mainsnak'].get('datatype') 40 | if datatype == 'wikibase-item': 41 | yield pid, int(statement['mainsnak']['datavalue']['value']['numeric-id']) 42 | elif datatype == 'time': 43 | 
yield pid, statement['mainsnak']['datavalue']['value']['time'] 44 | elif datatype == 'string' or datatype == 'url': 45 | yield pid, statement['mainsnak']['datavalue']['value'] 46 | 47 | def build(self, corpus): 48 | entities = corpus\ 49 | .filter(lambda item: item['_id'].startswith(ENTITY_PREFIX)) 50 | 51 | entity_labels = entities\ 52 | .map(lambda item: (item['_id'], item['data'].get('labels', {}).get('en', {}).get('value', None)))\ 53 | .filter(lambda (pid, label): label)\ 54 | .map(lambda (pid, label): (int(pid[1:]), label)) 55 | 56 | wiki_entities = entities\ 57 | .map(lambda item: (item['data'].get('sitelinks', {}).get('enwiki', {}).get('title', None), item['data']))\ 58 | .filter(lambda (e, _): e)\ 59 | .cache() 60 | 61 | predicate_labels = corpus\ 62 | .filter(lambda item: item['_id'].startswith(PREDICATE_PREFIX))\ 63 | .map(lambda item: (item['_id'], item['data'].get('labels', {}).get('en', {}).get('value', None)))\ 64 | .filter(lambda (pid, label): label)\ 65 | .cache() 66 | 67 | relations = wiki_entities\ 68 | .flatMap(lambda (eid, item): ((pid, (value, eid)) for pid, value in self.iter_relations_for_item(item)))\ 69 | .join(predicate_labels)\ 70 | .map(lambda (pid, ((value, eid), label)): (value, (label, eid))) 71 | 72 | return relations\ 73 | .leftOuterJoin(entity_labels)\ 74 | .map(lambda (value, ((label, eid), value_label)): (eid, (label, value_label or value)))\ 75 | .groupByKey()\ 76 | .mapValues(dict) 77 | -------------------------------------------------------------------------------- /sift/corpora/wikipedia.py: -------------------------------------------------------------------------------- 1 | import ujson as json 2 | 3 | from sift.corpora import wikicorpus 4 | from sift.dataset import ModelBuilder, Model, Redirects, Documents 5 | 6 | from sift import logging 7 | log = logging.getLogger() 8 | 9 | class WikipediaCorpus(ModelBuilder, Model): 10 | def build(self, sc, path): 11 | PAGE_DELIMITER = "\n \n" 12 | PAGE_START = '\n' 13 | PAGE_END = '' 14 | return sc\ 15 | .newAPIHadoopFile( 16 | path, 17 | "org.apache.hadoop.mapreduce.lib.input.TextInputFormat", 18 | "org.apache.hadoop.io.LongWritable", 19 | "org.apache.hadoop.io.Text", 20 | conf = { "textinputformat.record.delimiter": PAGE_DELIMITER })\ 21 | .map(lambda (_, part): (part.find(PAGE_START), part))\ 22 | .filter(lambda (offset, _): offset >= 0)\ 23 | .map(lambda (offset, content): content[offset:]+PAGE_END)\ 24 | .map(wikicorpus.extract_page) 25 | 26 | @staticmethod 27 | def format_item((title, ns, pid, redirect, content)): 28 | return { 29 | '_id': title, 30 | 'pid': pid, 31 | 'namespace': ns, 32 | 'redirect': redirect, 33 | 'content': content 34 | } 35 | 36 | class WikipediaRedirects(ModelBuilder, Redirects): 37 | """ Extract a set of redirects from wikipedia """ 38 | def __init__(self, resolve_transitive=False): 39 | self.resolve_transitive = resolve_transitive 40 | 41 | def build(self, pages, verbose=False): 42 | pfx = wikicorpus.wikilink_prefix 43 | redirects = pages\ 44 | .filter(lambda page: page['redirect'] != None)\ 45 | .map(lambda page: (page['_id'], page['redirect']))\ 46 | .mapValues(wikicorpus.normalise_wikilink)\ 47 | .map(lambda (s, t): (s, pfx+t)) 48 | 49 | if self.resolve_transitive: 50 | redirects = redirects.cache() 51 | 52 | num_targets = redirects\ 53 | .map(lambda (k,v): v)\ 54 | .distinct()\ 55 | .count() 56 | 57 | redirects = redirects\ 58 | .map(lambda (s, t): (t, s)).leftOuterJoin(redirects)\ 59 | .map(lambda (target, (source, redirect)): (source, redirect or target)) 60 | 61 | if 
verbose: 62 | redirects = redirects.cache() 63 | final_num_targets = redirects.map(lambda (k,v): v).distinct().count() 64 | log.info('Resolved %i transitive redirects...', num_targets - final_num_targets) 65 | 66 | return redirects.distinct() 67 | 68 | class WikipediaArticles(ModelBuilder, Documents): 69 | """ Prepare a corpus of documents from wikipedia """ 70 | def build(self, corpus, redirects=None): 71 | articles = corpus\ 72 | .filter(lambda page: page['namespace'] == '0' and page['redirect'] == None and page['content'])\ 73 | .map(lambda page: (page['_id'], page['content']))\ 74 | .map(wikicorpus.remove_markup)\ 75 | .mapValues(wikicorpus.extract_links) 76 | 77 | if redirects: 78 | redirects = redirects.map(lambda r: (r['_id'], r['target'])) 79 | articles.cache() 80 | 81 | # redirect set is typically too large to be broadcasted for a map-side join 82 | articles = articles\ 83 | .flatMap(lambda (pid, (text, links)): ((t, (pid, span)) for t, span in links))\ 84 | .leftOuterJoin(redirects)\ 85 | .map(lambda (t, ((pid, span), r)): (pid, (r if r else t, span)))\ 86 | .groupByKey()\ 87 | .mapValues(list)\ 88 | .join(articles)\ 89 | .map(lambda (pid, (links, (text, _))): (pid, (text, links))) 90 | 91 | return articles 92 | -------------------------------------------------------------------------------- /sift/dataset.py: -------------------------------------------------------------------------------- 1 | import ujson as json 2 | 3 | class ModelBuilder(object): 4 | def __init__(self, *args, **kwargs): pass 5 | 6 | def __call__(self, *args, **kwargs): 7 | return self.build(*args, **kwargs).map(self.format_item) 8 | 9 | def build(self, *args, **kwargs): 10 | raise NotImplementedError 11 | 12 | class Model(object): 13 | @staticmethod 14 | def format_item(item): 15 | raise NotImplementedError 16 | 17 | @staticmethod 18 | def load(sc, path, fmt=json): 19 | return sc.textFile(path).map(json.loads) 20 | 21 | @staticmethod 22 | def save(m, path, fmt=json): 23 | m.map(json.dumps).saveAsTextFile(path, 'org.apache.hadoop.io.compress.GzipCodec') 24 | 25 | class Redirects(Model): 26 | @staticmethod 27 | def format_item((source, target)): 28 | return {'_id': source, 'target': target} 29 | 30 | class Vocab(Model): 31 | @staticmethod 32 | def format_item((term, (count, rank))): 33 | return { 34 | '_id': term, 35 | 'count': count, 36 | 'rank': rank 37 | } 38 | 39 | class Mentions(Model): 40 | @staticmethod 41 | def format_item((target, source, text, span)): 42 | return { 43 | '_id': target, 44 | 'source': source, 45 | 'text': text, 46 | 'span': span 47 | } 48 | 49 | class IndexedMentions(Model): 50 | @staticmethod 51 | def format_item((target, source, text, span)): 52 | return { 53 | '_id': target, 54 | 'source': source, 55 | 'sequence': text, 56 | 'span': span 57 | } 58 | 59 | class Documents(Model): 60 | @staticmethod 61 | def format_item((uri, (text, links))): 62 | return { 63 | '_id': uri, 64 | 'text': text, 65 | 'links': [{ 66 | 'target': target, 67 | 'start': span.start, 68 | 'stop': span.stop 69 | } for target, span in links] 70 | } 71 | 72 | class Relations(Model): 73 | @staticmethod 74 | def format_item((uri, relations)): 75 | return { 76 | '_id': uri, 77 | 'relations': relations 78 | } -------------------------------------------------------------------------------- /sift/format.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import ujson as json 3 | import msgpack 4 | import base64 5 | 6 | class ModelFormat(object): 7 | def 
__init__(self): 8 | pass 9 | def __call__(self, model): 10 | raise NotImplemented 11 | 12 | @classmethod 13 | def iter_options(cls): 14 | yield JsonFormat 15 | yield RedisFormat 16 | yield TsvFormat 17 | 18 | class TsvFormat(ModelFormat): 19 | """ Format model output as tab separated values """ 20 | @staticmethod 21 | def items_to_tsv(items): 22 | key_order = None 23 | for item in items: 24 | if key_order == None: 25 | key_order = [] 26 | if '_id' in item: 27 | key_order.append('_id') 28 | key_order += sorted(k for k in item.iterkeys() if k != '_id') 29 | 30 | # todo: proper field serialization and escapes 31 | yield u'\t'.join(unicode(item[k]) for k in key_order).encode('utf-8') 32 | 33 | def __call__(self, model): 34 | return model.mapPartitions(self.items_to_tsv) 35 | 36 | @classmethod 37 | def add_arguments(cls, p): 38 | p.set_defaults(fmtcls=cls) 39 | return p 40 | 41 | class JsonFormat(ModelFormat): 42 | """ Format model output as json """ 43 | def __call__(self, model): 44 | return model.map(json.dumps) 45 | 46 | @classmethod 47 | def add_arguments(cls, p): 48 | p.set_defaults(fmtcls=cls) 49 | return p 50 | 51 | class RedisFormat(ModelFormat): 52 | """ Format model output as redis protocol SET commands """ 53 | def __init__(self, prefix, serializer, field): 54 | if serializer == 'raw' and not field: 55 | raise Exception("Target field required for raw serializer") 56 | 57 | self.prefix = prefix 58 | self.field = field 59 | self.serializer = { 60 | 'json': json.dumps, 61 | 'msgpack': lambda o: base64.b64encode(msgpack.dumps(o)), 62 | 'pickle': lambda o: base64.b64encode(pickle.dumps(o, -1)), 63 | 'raw': lambda o: o 64 | }[serializer] 65 | 66 | def to_value(self, item): 67 | if self.field: 68 | item = unicode(item[self.field]) 69 | else: 70 | item.pop('_id', None) 71 | return self.serializer(item) 72 | 73 | def __call__(self, model): 74 | cmd = '\r\n'.join(["*3", "$3", "SET", "${}", "{}", "${}", "{}"])+'\r' 75 | return model\ 76 | .map(lambda i: ((self.prefix+i['_id'].replace('"','\\"')).encode('utf-8'), self.to_value(i)))\ 77 | .map(lambda (t, c): cmd.format(len(t), t, len(c), c)) 78 | 79 | @classmethod 80 | def add_arguments(cls, p): 81 | p.add_argument('--prefix', required=False, default='', metavar='PREFIX') 82 | p.add_argument('--serializer', choices=['json', 'pickle', 'msgpack', 'raw'], required=False, default='json', metavar='SERIALIZER') 83 | p.add_argument('--field', required=False, metavar='FIELD_TO_SERIALIZE') 84 | p.set_defaults(fmtcls=cls) 85 | return p 86 | -------------------------------------------------------------------------------- /sift/logging.py: -------------------------------------------------------------------------------- 1 | """ Logging Configuration """ 2 | from __future__ import absolute_import 3 | import logging 4 | 5 | def setup(): 6 | fmt = '%(asctime)s|%(levelname)s|%(module)s|%(message)s' 7 | logging.basicConfig(format=fmt) 8 | log = logging.getLogger('nel') 9 | log.setLevel(logging.DEBUG) 10 | 11 | def getLogger(): 12 | return logging.getLogger('nel') 13 | 14 | setup() -------------------------------------------------------------------------------- /sift/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andychisholm/sift/c25cc42ab4ad7f44838036c63d0be4b03767c16c/sift/models/__init__.py -------------------------------------------------------------------------------- /sift/models/embeddings.py: 
-------------------------------------------------------------------------------- 1 | from operator import add 2 | from itertools import chain 3 | 4 | from sift.models.text import EntityMentions 5 | from sift.util import ngrams 6 | from sift.dataset import ModelBuilder, Model 7 | 8 | from sift import logging 9 | log = logging.getLogger() 10 | 11 | class EntitySkipGramEmbeddings(ModelBuilder, Model): 12 | """ Learn distributed representations for words and entities in a corpus via skip-gram embedding """ 13 | def __init__( 14 | self, 15 | dimensions=100, 16 | min_word_count=500, 17 | min_entity_count=10, 18 | entity_prefix='en.wikipedia.org/wiki/', 19 | exclude_words=False, 20 | exclude_entities=False, 21 | workers=4, 22 | coalesce=None, 23 | *args, **kwargs): 24 | 25 | self.dimensions = dimensions 26 | self.min_word_count = min_word_count 27 | self.min_entity_count = min_entity_count 28 | self.filter_target = entity_prefix 29 | self.exclude_words = exclude_words 30 | self.exclude_entities = exclude_entities 31 | self.workers = workers 32 | self.coalesce = coalesce 33 | 34 | def get_trim_rule(self): 35 | from gensim.utils import RULE_KEEP, RULE_DISCARD 36 | def trim_rule(word, count, min_count): 37 | if not word.startswith(self.filter_target): 38 | return RULE_KEEP if count >= self.min_word_count else RULE_DISCARD 39 | else: 40 | return RULE_KEEP if count >= self.min_entity_count else RULE_DISCARD 41 | return RULE_KEEP 42 | return trim_rule 43 | 44 | def build(self, mentions): 45 | from gensim.models.word2vec import Word2Vec 46 | sentences = mentions\ 47 | .filter(lambda (target, source, text, span): target.startswith(self.filter_target))\ 48 | 49 | sentences = sentences\ 50 | .map(lambda (target, source, text, (s,e)): list(chain(ngrams(text[:s],1), [target], ngrams(text[e:],1)))) 51 | 52 | if self.coalesce: 53 | sentences = sentences.coalesce(self.coalesce) 54 | 55 | sentences = sentences.cache() 56 | 57 | model = Word2Vec(sample=1e-5, size=self.dimensions, workers=self.workers) 58 | 59 | log.info('Preparing corpus...') 60 | model.corpus_count = sentences.count() 61 | 62 | log.info('Computing vocab statistics...') 63 | term_counts = sentences\ 64 | .flatMap(lambda tokens: ((t, 1) for t in tokens))\ 65 | .reduceByKey(add)\ 66 | .filter(lambda (t, count): \ 67 | (t.startswith(self.filter_target) and count >= self.min_entity_count) or \ 68 | (count >= self.min_word_count)) 69 | 70 | model.raw_vocab = dict(term_counts.collect()) 71 | model.scale_vocab(trim_rule=self.get_trim_rule()) 72 | model.finalize_vocab() 73 | 74 | log.info('Training local word2vec model...') 75 | model.train(sentences.toLocalIterator()) 76 | 77 | log.info('Normalising embeddings...') 78 | model.init_sims(replace=True) 79 | 80 | total_entities = sum(1 if t.startswith(self.filter_target) else 0 for t in model.vocab.iterkeys()) 81 | total_words = len(model.vocab) - total_entities 82 | 83 | vocab_sz = 0 84 | if not self.exclude_entities: 85 | log.info('Including %i entity embeddings in exported vocab...', total_entities) 86 | vocab_sz += total_entities 87 | if not self.exclude_words: 88 | log.info('Including %i word embeddings in exported vocab...', total_words) 89 | vocab_sz += total_words 90 | 91 | log.info('Parallelizing %i learned embeddings...', vocab_sz) 92 | return mentions\ 93 | .context\ 94 | .parallelize( 95 | (t, model.syn0[vi.index].tolist()) 96 | for t, vi in model.vocab.iteritems() 97 | if (not self.exclude_entities and t.startswith(self.filter_target)) or 98 | (not self.exclude_words and not 
t.startswith(self.filter_target))) 99 | 100 | @staticmethod 101 | def format_item((entity, embedding)): 102 | return { 103 | '_id': entity, 104 | 'embedding': embedding 105 | } -------------------------------------------------------------------------------- /sift/models/links.py: -------------------------------------------------------------------------------- 1 | import ujson as json 2 | 3 | from operator import add 4 | from collections import Counter 5 | from itertools import chain 6 | 7 | from sift.dataset import ModelBuilder, Documents, Model 8 | from sift.util import trim_link_subsection, trim_link_protocol, ngrams 9 | 10 | from sift import logging 11 | log = logging.getLogger() 12 | 13 | class EntityCounts(ModelBuilder, Model): 14 | """ Inlink counts """ 15 | def __init__(self, min_count=1, filter_target=None): 16 | self.min_count = min_count 17 | self.filter_target = filter_target 18 | 19 | def build(self, docs): 20 | links = docs\ 21 | .flatMap(lambda d: d['links'])\ 22 | .map(lambda l: l['target'])\ 23 | .map(trim_link_subsection)\ 24 | .map(trim_link_protocol) 25 | 26 | if self.filter_target: 27 | links = links.filter(lambda l: l.startswith(self.filter_target)) 28 | 29 | return links\ 30 | .map(lambda l: (l, 1))\ 31 | .reduceByKey(add)\ 32 | .filter(lambda (t, c): c > self.min_count) 33 | 34 | @staticmethod 35 | def format_item((target, count)): 36 | return { 37 | '_id': target, 38 | 'count': count 39 | } 40 | 41 | class EntityNameCounts(ModelBuilder, Model): 42 | """ Entity counts by name """ 43 | def __init__(self, lowercase=False, filter_target=None): 44 | self.lowercase = lowercase 45 | self.filter_target = filter_target 46 | 47 | def iter_anchor_target_pairs(self, doc): 48 | for link in doc['links']: 49 | target = link['target'] 50 | target = trim_link_subsection(target) 51 | target = trim_link_protocol(target) 52 | 53 | anchor = doc['text'][link['start']:link['stop']].strip() 54 | 55 | if self.lowercase: 56 | anchor = anchor.lower() 57 | 58 | if anchor and target: 59 | yield anchor, target 60 | 61 | def build(self, docs): 62 | m = docs.flatMap(lambda d: self.iter_anchor_target_pairs(d)) 63 | 64 | if self.filter_target: 65 | m = m.filter(lambda (a, t): t.startswith(self.filter_target)) 66 | 67 | return m\ 68 | .groupByKey()\ 69 | .mapValues(Counter) 70 | 71 | @staticmethod 72 | def format_item((anchor, counts)): 73 | return { 74 | '_id': anchor, 75 | 'counts': dict(counts), 76 | 'total': sum(counts.itervalues()) 77 | } 78 | 79 | class NamePartCounts(ModelBuilder, Model): 80 | """ 81 | Occurrence counts for ngrams at different positions within link anchors. 
82 | 'B' - beginning of span 83 | 'E' - end of span 84 | 'I' - inside span 85 | 'O' - outside span 86 | """ 87 | def __init__(self, max_ngram=2, lowercase=False, filter_target=None): 88 | self.lowercase = lowercase 89 | self.filter_target = filter_target 90 | self.max_ngram = max_ngram 91 | 92 | def iter_anchors(self, doc): 93 | for link in doc['links']: 94 | anchor = doc['text'][link['start']:link['stop']].strip() 95 | if self.lowercase: 96 | anchor = anchor.lower() 97 | if anchor: 98 | yield anchor 99 | 100 | @staticmethod 101 | def iter_span_count_types(anchor, n): 102 | parts = list(ngrams(anchor, n, n)) 103 | if parts: 104 | yield parts[0], 'B' 105 | yield parts[-1], 'E' 106 | for i in xrange(1, len(parts)-1): 107 | yield parts[i], 'I' 108 | 109 | def build(self, docs): 110 | part_counts = docs\ 111 | .flatMap(self.iter_anchors)\ 112 | .flatMap(lambda a: chain.from_iterable(self.iter_span_count_types(a, i) for i in xrange(1, self.max_ngram+1)))\ 113 | .map(lambda p: (p, 1))\ 114 | .reduceByKey(add)\ 115 | .map(lambda ((term, spantype), count): (term, (spantype, count))) 116 | 117 | part_counts += docs\ 118 | .flatMap(lambda d: ngrams(d['text'], self.max_ngram))\ 119 | .map(lambda t: (t, 1))\ 120 | .reduceByKey(add)\ 121 | .filter(lambda (t, c): c > 1)\ 122 | .map(lambda (t, c): (t, ('O', c))) 123 | 124 | return part_counts\ 125 | .groupByKey()\ 126 | .mapValues(dict)\ 127 | .filter(lambda (t, cs): 'O' in cs and len(cs) > 1) 128 | 129 | @staticmethod 130 | def format_item((term, part_counts)): 131 | return { 132 | '_id': term, 133 | 'counts': dict(part_counts) 134 | } 135 | 136 | class EntityInlinks(ModelBuilder, Model): 137 | """ Inlink sets for each entity """ 138 | def build(self, docs): 139 | return docs\ 140 | .flatMap(lambda d: ((d['_id'], l) for l in set(l['target'] for l in d['links'])))\ 141 | .mapValues(trim_link_subsection)\ 142 | .mapValues(trim_link_protocol)\ 143 | .map(lambda (k, v): (v, k))\ 144 | .groupByKey()\ 145 | .mapValues(list) 146 | 147 | @staticmethod 148 | def format_item((target, inlinks)): 149 | return { 150 | '_id': target, 151 | 'inlinks': inlinks 152 | } 153 | 154 | class EntityVocab(ModelBuilder, Model): 155 | """ Generate unique indexes for entities in a corpus. 
""" 156 | def __init__(self, min_rank=0, max_rank=10000): 157 | self.min_rank = min_rank 158 | self.max_rank = max_rank 159 | 160 | def build(self, docs): 161 | log.info('Building entity vocab: df rank range=(%i, %i)', self.min_rank, self.max_rank) 162 | m = super(EntityVocab, self)\ 163 | .build(docs)\ 164 | .map(lambda (target, count): (count, target))\ 165 | .sortByKey(False)\ 166 | .zipWithIndex()\ 167 | .map(lambda ((df, t), idx): (t, (df, idx))) 168 | 169 | if self.min_rank != None: 170 | m = m.filter(lambda (t, (df, idx)): idx >= self.min_rank) 171 | if self.max_rank != None: 172 | m = m.filter(lambda (t, (df, idx)): idx < self.max_rank) 173 | return m 174 | 175 | @staticmethod 176 | def format_item((term, (f, idx))): 177 | return { 178 | '_id': term, 179 | 'count': f, 180 | 'rank': idx 181 | } 182 | 183 | @staticmethod 184 | def load(sc, path, fmt=json): 185 | log.info('Loading entity-index mapping: %s ...', path) 186 | return sc\ 187 | .textFile(path)\ 188 | .map(fmt.loads)\ 189 | .map(lambda r: (r['_id'], (r['count'], r['rank']))) 190 | 191 | class EntityComentions(ModelBuilder, Model): 192 | """ Entity comentions """ 193 | @staticmethod 194 | def iter_unique_links(doc): 195 | links = set() 196 | for l in doc['links']: 197 | link = trim_link_subsection(l['target']) 198 | link = trim_link_protocol(link) 199 | if link not in links: 200 | yield link 201 | links.add(link) 202 | 203 | def build(self, docs): 204 | return docs\ 205 | .map(lambda d: (d['_id'], list(self.iter_unique_links(d))))\ 206 | .filter(lambda (uri, es): es) 207 | 208 | @staticmethod 209 | def format_item((uri, es)): 210 | return { 211 | '_id': uri, 212 | 'entities': es 213 | } 214 | 215 | class MappedEntityComentions(EntityComentions): 216 | """ Entity comentions with entities mapped to a numeric index """ 217 | def build(self, docs, entity_vocab): 218 | ev = sc.broadcast(dict(ev.collect())) 219 | return super(MappedEntityComentions, self)\ 220 | .build(docs)\ 221 | .map(lambda (uri, es): (uri, [ev.value[e] for e in es if e in ev.value]))\ 222 | .filter(lambda (uri, es): es) -------------------------------------------------------------------------------- /sift/models/text.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy 3 | import ujson as json 4 | from bisect import bisect_left, bisect_right 5 | from operator import add 6 | from collections import Counter 7 | 8 | from sift.models.links import EntityVocab 9 | from sift.dataset import ModelBuilder, Documents, Model, Mentions, IndexedMentions, Vocab 10 | from sift.util import ngrams, iter_sent_spans, trim_link_subsection, trim_link_protocol 11 | 12 | from sift import logging 13 | log = logging.getLogger() 14 | 15 | class TermFrequencies(ModelBuilder, Model): 16 | """ Get term frequencies over a corpus """ 17 | def __init__(self, lowercase, max_ngram): 18 | self.lowercase = lowercase 19 | self.max_ngram = max_ngram 20 | 21 | def build(self, docs): 22 | m = docs.map(lambda d: d['text']) 23 | if self.lowercase: 24 | m = m.map(unicode.lower) 25 | 26 | return m\ 27 | .flatMap(lambda text: ngrams(text, self.max_ngram))\ 28 | .map(lambda t: (t, 1))\ 29 | .reduceByKey(add)\ 30 | .filter(lambda (k,v): v > 1) 31 | 32 | @staticmethod 33 | def format_item(self, (term, count)): 34 | return { 35 | '_id': term, 36 | 'count': count, 37 | } 38 | 39 | class EntityMentions(ModelBuilder, Mentions): 40 | """ Get aggregated sentence context around links in a corpus """ 41 | def __init__(self, sentence_window = 1, 
lowercase=False, normalize_url=True, strict_sentences=True): 42 | self.sentence_window = sentence_window 43 | self.lowercase = lowercase 44 | self.strict_sentences = strict_sentences 45 | self.normalize_url = normalize_url 46 | 47 | @staticmethod 48 | def iter_mentions(doc, window = 1, norm_url=True, strict=True): 49 | sent_spans = list(iter_sent_spans(doc['text'])) 50 | sent_offsets = [s.start for s in sent_spans] 51 | 52 | for link in doc['links']: 53 | # align the link span over sentence spans in the document 54 | # mention span may cross sentence bounds if sentence tokenisation is dodgy 55 | # if so, the entire span between bounding sentences will be used as context 56 | sent_start_idx = bisect_right(sent_offsets, link['start']) - 1 57 | sent_end_idx = bisect_left(sent_offsets, link['stop']) - 1 58 | 59 | lhs_offset = window / 2 60 | rhs_offset = (window - lhs_offset) - 1 61 | sent_start_idx = max(0, sent_start_idx - lhs_offset) 62 | sent_end_idx = min(len(sent_spans)-1, sent_end_idx + rhs_offset) 63 | sent_offset = sent_spans[sent_start_idx].start 64 | 65 | span = (link['start'] - sent_offset, link['stop'] - sent_offset) 66 | target = link['target'] 67 | if norm_url: 68 | target = trim_link_subsection(link['target']) 69 | target = trim_link_protocol(target) 70 | mention = doc['text'][sent_spans[sent_start_idx].start:sent_spans[sent_end_idx].stop] 71 | 72 | # filter out instances where the mention span is the entire sentence 73 | if span == (0, len(mention)): 74 | continue 75 | 76 | if strict: 77 | # filter out list item sentences 78 | sm = mention.strip() 79 | if not sm or sm.startswith('*') or sm[-1] not in '.!?"\'': 80 | continue 81 | 82 | yield target, doc['_id'], mention, span 83 | 84 | def build(self, docs): 85 | m = docs.flatMap(lambda d: self.iter_mentions(d, self.sentence_window, self.normalize_url, self.strict_sentences)) 86 | if self.lowercase: 87 | m = m.map(lambda (t, src, m, s): (t, src, m.lower(), s)) 88 | return m 89 | 90 | class IndexMappedMentions(EntityMentions, IndexedMentions): 91 | """ Entity mention corpus with terms mapped to numeric indexes """ 92 | def build(self, sc, docs, vocab): 93 | tv = sc.broadcast(dict(vocab.map(lambda r: (r['_id'], r['rank'])).collect())) 94 | return super(IndexMappedMentions, self)\ 95 | .build(docs)\ 96 | .map(lambda m: self.transform(m, tv)) 97 | 98 | @staticmethod 99 | def transform((target, source, text, span), vocab): 100 | vocab = vocab.value 101 | 102 | start, stop = span 103 | pre = list(ngrams(text[:start], 1)) 104 | ins = list(ngrams(text[start:stop], 1)) 105 | post = list(ngrams(text[stop:], 1)) 106 | indexes = [vocab.get(t, len(vocab)-1) for t in (pre+ins+post)] 107 | 108 | return target, source, indexes, (len(pre), len(pre)+len(ins)) 109 | 110 | class TermDocumentFrequencies(ModelBuilder): 111 | """ Get document frequencies for terms in a corpus """ 112 | def __init__(self, lowercase=False, max_ngram=1, min_df=2): 113 | self.lowercase = lowercase 114 | self.max_ngram = max_ngram 115 | self.min_df = min_df 116 | 117 | def build(self, docs): 118 | m = docs.map(lambda d: d['text']) 119 | if self.lowercase: 120 | m = m.map(lambda text: text.lower()) 121 | 122 | return m\ 123 | .flatMap(lambda text: set(ngrams(text, self.max_ngram)))\ 124 | .map(lambda t: (t, 1))\ 125 | .reduceByKey(add)\ 126 | .filter(lambda (k,v): v > self.min_df) 127 | 128 | class TermVocab(TermDocumentFrequencies, Vocab): 129 | """ Generate unique indexes for termed based on their document frequency ranking. 
""" 130 | def __init__(self, max_rank, min_rank=100, *args, **kwargs): 131 | self.max_rank = max_rank 132 | self.min_rank = min_rank 133 | super(TermVocab, self).__init__(*args, **kwargs) 134 | 135 | def build(self, docs): 136 | m = super(TermVocab, self)\ 137 | .build(docs)\ 138 | .map(lambda (t, df): (df, t))\ 139 | .sortByKey(False)\ 140 | .zipWithIndex()\ 141 | .map(lambda ((df, t), idx): (t, (df, idx))) 142 | 143 | if self.min_rank != None: 144 | m = m.filter(lambda (t, (df, idx)): idx >= self.min_rank) 145 | if self.max_rank != None: 146 | m = m.filter(lambda (t, (df, idx)): idx < self.max_rank) 147 | return m 148 | 149 | @staticmethod 150 | def format_item((term, (f, idx))): 151 | return { 152 | '_id': term, 153 | 'count': f, 154 | 'rank': idx 155 | } 156 | 157 | class TermIdfs(TermDocumentFrequencies, Model): 158 | """ Compute tf-idf weighted token counts over sentence contexts around links in a corpus """ 159 | def build(self, corpus): 160 | log.info('Counting documents in corpus...') 161 | N = float(corpus.count()) 162 | dfs = super(TermIdfs, self).build(corpus) 163 | 164 | log.info('Building idf model: N=%i', N) 165 | return dfs\ 166 | .map(lambda (term, (df, rank)): (term, df))\ 167 | .mapValues(lambda df: math.log(N/df)) 168 | 169 | @staticmethod 170 | def format_item((term, idf)): 171 | return { 172 | '_id': term, 173 | 'idf': idf, 174 | } 175 | 176 | class EntityMentionTermFrequency(ModelBuilder, Model): 177 | """ Compute tf-idf weighted token counts over sentence contexts around links in a corpus """ 178 | def __init__(self, max_ngram=1, normalize = True): 179 | self.max_ngram = max_ngram 180 | self.normalize = normalize 181 | 182 | def build(self, mentions, idfs): 183 | m = mentions\ 184 | .map(lambda (target, (span, text)): (target, text))\ 185 | .mapValues(lambda v: ngrams(v, self.max_ngram))\ 186 | .flatMap(lambda (target, tokens): (((target, t), 1) for t in tokens))\ 187 | .reduceByKey(add)\ 188 | .map(lambda ((target, token), count): (token, (target, count)))\ 189 | .leftOuterJoin(idfs)\ 190 | .filter(lambda (token, ((target, count), idf)): idf != None)\ 191 | .map(lambda (token, ((target, count), idf)): (target, (token, math.sqrt(count)*idf)))\ 192 | .groupByKey() 193 | 194 | return m.mapValues(self.normalize_counts if self.normalize else list) 195 | 196 | @staticmethod 197 | def normalize_counts(counts): 198 | norm = numpy.linalg.norm([v for _, v in counts]) 199 | return [(k, v/norm) for k, v in counts] 200 | 201 | @staticmethod 202 | def format_item((link, counts)): 203 | return { 204 | '_id': link, 205 | 'counts': dict(counts), 206 | } 207 | -------------------------------------------------------------------------------- /sift/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pattern import en 3 | 4 | # todo: use spacy tokenization 5 | def ngrams(text, max_n=1, min_n=1, strip_punctuation=True): 6 | pattern_args = {} if strip_punctuation else {'punctuation':''} 7 | for i in xrange(min_n-1,max_n): 8 | for n in en.ngrams(text, n=i+1, **pattern_args): 9 | yield ' '.join(n) 10 | 11 | 12 | # sentences can't end with a single lowercase letter 13 | SENT_NO_END_LC = "(?