├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── scripts ├── download-wikipedia └── sift-notebook ├── setup.py ├── sift.ipynb └── sift ├── __init__.py ├── build.py ├── corpora ├── __init__.py ├── commoncrawl.py ├── redirects.py ├── wikicorpus.py ├── wikidata.py └── wikipedia.py ├── dataset.py ├── format.py ├── logging.py ├── models ├── __init__.py ├── embeddings.py ├── links.py └── text.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | output/ 3 | 4 | ########## 5 | # PYTHON # 6 | ########## 7 | # Initialized from github 8 | # https://github.com/github/gitignore/blob/master/Python.gitignore 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | bin/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | ve 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | 47 | # Mr Developer 48 | .mr.developer.cfg 49 | .project 50 | .pydevproject 51 | 52 | # Rope 53 | .ropeproject 54 | 55 | # Django stuff: 56 | *.log 57 | *.pot 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # Package 63 | MANIFEST 64 | 65 | ######### 66 | # EMACS # 67 | ######### 68 | # Initliazed from github 69 | # https://raw2.github.com/github/gitignore/master/Global/Emacs.gitignore 70 | 71 | # -*- mode: gitignore; -*- 72 | *~ 73 | \#*\# 74 | /.emacs.desktop 75 | /.emacs.desktop.lock 76 | *.elc 77 | auto-save-list 78 | tramp 79 | .\#* 80 | 81 | # Org-mode 82 | .org-id-locations 83 | *_archive 84 | 85 | # flymake-mode 86 | *_flymake.* 87 | 88 | # eshell files 89 | /eshell/history 90 | /eshell/lastdir 91 | 92 | # elpa packages 93 | /elpa/ 94 | 95 | ####### 96 | # VIM # 97 | ####### 98 | # Initialized from github 99 | # https://raw2.github.com/github/gitignore/master/Global/vim.gitignore 100 | 101 | [._]*.s[a-w][a-z] 102 | [._]s[a-w][a-z] 103 | *.un~ 104 | Session.vim 105 | .netrwhist 106 | *~ 107 | 108 | ####### 109 | # OSX # 110 | ####### 111 | # Initialized from github 112 | # https://raw2.github.com/github/gitignore/master/Global/OSX.gitignore 113 | 114 | .DS_Store 115 | .AppleDouble 116 | .LSOverride 117 | 118 | # Icon must ends with two \r. 119 | Icon 120 | 121 | # Thumbnails 122 | ._* 123 | 124 | # Files that might appear on external disk 125 | .Spotlight-V100 126 | .Trashes 127 | 128 | # NFS 129 | .nfs* 130 | 131 | # IDE 132 | *.sublime-* 133 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014. Andrew Chisholm, Ben Hachey, The University of Sydney. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | sift - Knowledge extraction from web data 2 | ================================================ 3 | 4 | __sift__ is a toolkit for extracting models of entities and text from a corpus of linked documents. 5 | 6 | 7 | ## What can it do? 8 | 9 | __sift__ is written in Python, runs on Spark and is completely modular. 10 | 11 | Out of the box, you can: 12 | 13 | - Convert Wikipedia articles into JSON objects without all the MediaWiki cruft 14 | - Extract entity relations from Wikidata and align them with Wikipedia mentions 15 | - Extract plain-text content from HTML and WARC-encoded web page crawls 16 | - Model entity popularity, alternative names and relatedness using inlinks 17 | - Preprocess text documents for machine learning pipelines 18 | - Push output into datastores like MongoDB and Redis 19 | 20 | ## Quick Start 21 | 22 | ### Install 23 | ```bash 24 | pip install git+http://git@github.com/wikilinks/sift.git 25 | ``` 26 | 27 | ## Getting Started 28 | 29 | To use sift, you'll need some data. 30 | 31 | If you'd like to use Wikipedia data, sift includes a helper script for downloading the latest dumps. 32 | 33 | Download the latest partitioned Wikipedia dump into the `latest` directory: 34 | ```bash 35 | download-wikipedia latest 36 | ``` 37 | 38 | Once you've got some data, take a look at the sample notebook: [sift.ipynb](sift.ipynb). 39 | 40 | ## Spark 41 | 42 | __sift__ uses Spark to process corpora in parallel. 43 | 44 | If you'd like to make use of an existing Spark cluster, ensure the `SPARK_HOME` environment variable is set. 45 | 46 | If not, that's fine. `sift` will prompt you to download and run Spark locally, utilising multiple cores on your system.
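The bundled notebook walks through a full build, but the core pattern is small: wrap a `SparkContext`, parse the raw dump into a corpus, resolve redirects, then feed the cleaned documents into a model. The sketch below mirrors the calls in [sift.ipynb](sift.ipynb); the dump path is illustrative, so point it at wherever `download-wikipedia` saved the files.

```python
# Minimal sift pipeline, adapted from sift.ipynb.
# Assumes the partitioned dump was fetched with `download-wikipedia latest`;
# the 'latest/' path below is illustrative.
import findspark
findspark.init()
import pyspark

from sift.corpora import wikipedia
from sift.models import links

sc = pyspark.SparkContext()

corpus = wikipedia.WikipediaCorpus()(sc, 'latest/')      # parsed page records
redirects = wikipedia.WikipediaRedirects()(corpus)       # source -> target redirect pairs
docs = wikipedia.WikipediaArticles()(corpus, redirects)  # plain text with resolved links

print(docs.take(1))                                      # documents as JSON-ready dicts
print(links.EntityCounts()(docs).take(10))               # inlink counts per entity
```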
47 | 48 | ## Datasets 49 | 50 | [Web KB](https://github.com/andychisholm/web-kb) datasets built from Common Crawl data are available in a public S3 bucket: [s3.amazonaws.com/webkb](https://s3.amazonaws.com/webkb/) 51 | 52 | - `docs-2017` is built from news articles under the [CC-NEWS](http://commoncrawl.org/2016/10/news-dataset-available/) collection from January to June 2017 ([sample](https://s3.amazonaws.com/webkb/docs-2017/part-00000)) 53 | - `web-201707` is built from a full web crawl for [July 2017](http://commoncrawl.org/2017/07/july-2017-crawl-archive-now-available/) filtered to English-language pages ([sample](https://s3.amazonaws.com/webkb/web-201707/part-00000.gz)) 54 | 55 | The web collection contains plain-text content, entity mentions and endpoint annotations extracted from 1.5 billion documents with over 4 billion web links. 56 | Data is encoded in a simple one-JSON-blob-per-line structure. 57 | 58 | For example, the first document in the collection is an article from 2012 describing an [upcoming tour by Nicki Minaj](http://1019ampradio.cbslocal.com/2012/11/06/nicki-minaj-promises-man-bits-on-her-upcoming-tour/): 59 | 60 | ```json 61 | { 62 | "_id": "http://1019ampradio.cbslocal.com/2012/11/06/nicki-minaj-promises-man-bits-on-her-upcoming-tour/", 63 | "text": "Nicki Minaj has had quite the year. Currently in the U.K. on her Reloaded Tour she sat down with London DJ Tim Westwood and her U.K. Barbz for a Q & A session. While Nicki took questions from both Westwood and her fans one answer in particular caused the room to pay attention...", 64 | "links":[{ 65 | "start": 0, 66 | "endpoint": 0.6358972797, 67 | "stop": 11, 68 | "target": "http://1019ampradio.cbslocal.com/tag/nicki-minaj" 69 | }, { 70 | "start": 145, 71 | "endpoint": 0.2769776554, 72 | "stop": 160, 73 | "target": "http://www.youtube.com/watch?v=vnyuhDBcQo0" 74 | }], 75 | "mentions":[{ 76 | "start": 0, 77 | "stop": 11, 78 | "label": "PERSON" 79 | }, { 80 | "start": 53, 81 | "stop": 57, 82 | "label": "GPE" 83 | }, 84 | // truncated 85 | } 86 | ``` 87 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ujson 2 | numpy 3 | scipy 4 | pattern 5 | gensim 6 | msgpack-python 7 | findspark 8 | jupyter 9 | spacy 10 | lxml 11 | beautifulsoup4 12 | warc 13 | pycld2 14 | dragnet -------------------------------------------------------------------------------- /scripts/download-wikipedia: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | WKDATE=latest 5 | PARA_ARG="" 6 | 7 | if [ $# -gt 0 ] 8 | then 9 | WKDATE=$1 10 | if [ $# == 2 ] 11 | then 12 | PARA_ARG="-P $2" 13 | fi 14 | fi 15 | 16 | if [ "$WKDATE" == "latest" ]; then 17 | export LINK_PFX=/enwiki/latest/ 18 | fi 19 | 20 | export WKDIR=$WKDATE 21 | export WKDATE 22 | rm -rf $WKDIR 23 | mkdir -p $WKDIR 24 | 25 | curl "https://dumps.wikimedia.org/enwiki/$WKDATE/" |\ 26 | grep "enwiki-$WKDATE-pages-articles[0-9]*.xml-p[0-9]*p[0-9]*.bz2\""|\ 27 | awk -v pfx=$LINK_PFX -F'"' '{print "https://dumps.wikimedia.org" pfx $2}' |\ 28 | xargs -n1 $PARA_ARG -L 1 bash -c 'wget $0 -P $WKDIR' 29 | -------------------------------------------------------------------------------- /scripts/sift-notebook: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | SPARK_URL="http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz" 5 | 6 
| if [ -z "$SPARK_HOME" ]; then 7 | echo "SPARK_HOME is unset, using local Spark deployment..." 8 | if [ ! -d "spark" ]; then 9 | read -p "Would you like to download spark and run in standalone mode? " -n 1 -r 10 | if [[ ! $REPLY =~ ^[Yy]$ ]] 11 | then 12 | exit 1 13 | fi 14 | echo 15 | echo "Downloading spark for local standalone deployment..." 16 | mkdir spark 17 | curl $SPARK_URL | tar zx -C spark --strip-components=1 18 | 19 | echo "Updating spark logger config..." 20 | pushd spark/conf > /dev/null 21 | sed -e 's/log4j.rootCategory=INFO/log4j.rootCategory=WARN/' log4j.properties.template > log4j.properties 22 | popd > /dev/null 23 | fi 24 | export SPARK_HOME=$(pwd)/spark 25 | if [ -z "$SPARK_MASTER" ]; then 26 | SPARK_MASTER=local[*] 27 | fi 28 | fi 29 | 30 | if [ ! -z "$VIRTUAL_ENV" ]; then 31 | export PYSPARK_PYTHON=$VIRTUAL_ENV/bin/python 32 | else 33 | export PYSPARK_PYTHON=$(pwd)/ve/bin/python 34 | fi 35 | 36 | SPARK_MASTER_SW= 37 | if [ ! -z "$SPARK_MASTER" ]; then 38 | SPARK_MASTER_SW="--master $SPARK_MASTER" 39 | fi 40 | 41 | PYTHONPATH=$PYTHONPATH:$(pwd) jupyter notebook "$@" 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | __version__ = '0.3.0' 4 | __pkg_name__ = 'textsift' 5 | 6 | setup( 7 | name = __pkg_name__, 8 | version = __version__, 9 | description = 'Text modelling framework', 10 | author='Andrew Chisholm', 11 | packages = find_packages(), 12 | license = 'MIT', 13 | url = 'https://github.com/wikilinks/sift', 14 | scripts = [ 15 | 'scripts/sift-notebook', 16 | 'scripts/download-wikipedia' 17 | ], 18 | classifiers=[ 19 | 'Development Status :: 4 - Beta', 20 | 'Environment :: Console', 21 | 'Intended Audience :: Science/Research', 22 | 'License :: OSI Approved :: MIT License', 23 | 'Programming Language :: Python :: 2.7', 24 | 'Topic :: Text Processing :: Linguistic' 25 | ], 26 | install_requires = [ 27 | "ujson", 28 | "numpy", 29 | "pattern", 30 | "gensim", 31 | "msgpack-python", 32 | "beautifulsoup4", 33 | "spacy", 34 | "warc", 35 | "pycld2", 36 | "scipy", 37 | "scikit-learn" 38 | ], 39 | test_suite = __pkg_name__ + '.test' 40 | ) 41 | -------------------------------------------------------------------------------- /sift.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import findspark\n", 12 | "findspark.init()\n", 13 | "import pyspark\n", 14 | "sc = pyspark.SparkContext()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 36, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from sift.corpora import wikipedia\n", 26 | "from sift.models import text, links" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 5, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "base_path = '/data/wikipedia/20151002/'" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 6, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "corpus = wikipedia.WikipediaCorpus()(sc, base_path + 'dump')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 9, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | 
"outputs": [], 58 | "source": [ 59 | "redirects = wikipedia.WikipediaRedirects()(corpus)\n", 60 | "docs = wikipedia.WikipediaArticles()(corpus, redirects)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 27, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "[{'_id': 'en.wikipedia.org/wiki/Causa_y_Efecto',\n", 74 | " 'links': [{'start': 986,\n", 75 | " 'stop': 999,\n", 76 | " 'target': u'en.wikipedia.org/wiki/New_York_City'},\n", 77 | " {'start': 93, 'stop': 105, 'target': u'en.wikipedia.org/wiki/Studio_album'},\n", 78 | " {'start': 971, 'stop': 982, 'target': u'en.wikipedia.org/wiki/Gotham_Hall'},\n", 79 | " {'start': 2178,\n", 80 | " 'stop': 2192,\n", 81 | " 'target': u'en.wikipedia.org/wiki/Miami,_Florida'},\n", 82 | " {'start': 1791,\n", 83 | " 'stop': 1808,\n", 84 | " 'target': u'en.wikipedia.org/wiki/Latin_Pop_Airplay'},\n", 85 | " {'start': 2702,\n", 86 | " 'stop': 2719,\n", 87 | " 'target': u'en.wikipedia.org/wiki/Latin_Pop_Airplay'},\n", 88 | " {'start': 465,\n", 89 | " 'stop': 484,\n", 90 | " 'target': u'en.wikipedia.org/wiki/Ni_Una_Sola_Palabra'},\n", 91 | " {'start': 2122, 'stop': 2129, 'target': u'en.wikipedia.org/wiki/Austria'},\n", 92 | " {'start': 2740,\n", 93 | " 'stop': 2760,\n", 94 | " 'target': u'en.wikipedia.org/wiki/Latin_Rhythm_Airplay'},\n", 95 | " {'start': 106,\n", 96 | " 'stop': 119,\n", 97 | " 'target': u'en.wikipedia.org/wiki/Gran_City_Pop'},\n", 98 | " {'start': 2388, 'stop': 2397, 'target': u'en.wikipedia.org/wiki/Reggaeton'},\n", 99 | " {'start': 2069,\n", 100 | " 'stop': 2080,\n", 101 | " 'target': u'en.wikipedia.org/wiki/Music_video'},\n", 102 | " {'start': 2530, 'stop': 2534, 'target': u'en.wikipedia.org/wiki/U.S.'},\n", 103 | " {'start': 2573, 'stop': 2577, 'target': u'en.wikipedia.org/wiki/U.S.'},\n", 104 | " {'start': 2612, 'stop': 2616, 'target': u'en.wikipedia.org/wiki/U.S.'},\n", 105 | " {'start': 2649, 'stop': 2653, 'target': u'en.wikipedia.org/wiki/U.S.'},\n", 106 | " {'start': 2685, 'stop': 2689, 'target': u'en.wikipedia.org/wiki/U.S.'},\n", 107 | " {'start': 2723, 'stop': 2727, 'target': u'en.wikipedia.org/wiki/U.S.'},\n", 108 | " {'start': 1509,\n", 109 | " 'stop': 1524,\n", 110 | " 'target': u'en.wikipedia.org/wiki/Hot_Latin_Songs'},\n", 111 | " {'start': 2666,\n", 112 | " 'stop': 2681,\n", 113 | " 'target': u'en.wikipedia.org/wiki/Hot_Latin_Songs'},\n", 114 | " {'start': 1391,\n", 115 | " 'stop': 1400,\n", 116 | " 'target': u'en.wikipedia.org/wiki/Causa_y_Efecto'},\n", 117 | " {'start': 41, 'stop': 54, 'target': u'en.wikipedia.org/wiki/Paulina_Rubio'},\n", 118 | " {'start': 821,\n", 119 | " 'stop': 854,\n", 120 | " 'target': u'en.wikipedia.org/wiki/2009_Latin_Billboard_Music_Awards'},\n", 121 | " {'start': 2402,\n", 122 | " 'stop': 2415,\n", 123 | " 'target': u'en.wikipedia.org/wiki/Angel_&_Khriz'},\n", 124 | " {'start': 2775,\n", 125 | " 'stop': 2827,\n", 126 | " 'target': u'en.wikipedia.org/wiki/List_of_number-one_Billboard_Hot_Latin_Songs_of_2009'},\n", 127 | " {'start': 2547,\n", 128 | " 'stop': 2569,\n", 129 | " 'target': u'en.wikipedia.org/wiki/Bubbling_Under_Hot_100'},\n", 130 | " {'start': 2536,\n", 131 | " 'stop': 2545,\n", 132 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n", 133 | " {'start': 2579,\n", 134 | " 'stop': 2588,\n", 135 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n", 136 | " {'start': 2618,\n", 137 | " 'stop': 2627,\n", 138 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n", 139 | " 
{'start': 2655,\n", 140 | " 'stop': 2664,\n", 141 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n", 142 | " {'start': 2691,\n", 143 | " 'stop': 2700,\n", 144 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n", 145 | " {'start': 2729,\n", 146 | " 'stop': 2738,\n", 147 | " 'target': u'en.wikipedia.org/wiki/Billboard_(magazine)'},\n", 148 | " {'start': 2304, 'stop': 2307, 'target': u'en.wikipedia.org/wiki/MTV'}],\n", 149 | " 'text': u'\"\" is a song performed by Mexican singer Paulina Rubio. The song was recorded for her ninth studio album Gran City Pop, and was released as the lead single on March 30, 2009. Causa y Efecto became a hit reaching number 1 in the U.S. Billboard Hot Latin Songs and Hot Latin Airplays. Causa y Efecto was produced by Cachorro L\\xf3pez and written by Mario Domm and M\\xf3nica V\\xe9lez. \"Causa y Efecto\" is Rubio\\'s first number one single in the Billboard Hot Latin Songs since Ni Una Sola Palabra in 2006. \"Causa y Efecto\" was awarded \"Song of the year pop/ballad\" by ASCAP.\\n\\nAn English version of the song titled \"Cause and Effect\" will be released on Paulina\\'s next bilingual album.\\n\\nRelease and promotion\\n\"Causa y Efecto\", debuted in radio stations on March 30. The song was performed for the first time on April 23, 2009 at the Latin Billboard Music Awards 2009, Rubio was one of the most expected artists of the night. Rubio also performed the song in a private concert at the Gotham Hall in New York City on May 11 promoting Gran City Pop. The concert was presented by Univision Radio.\\n\\nRubio performed the song at the Wal-Mart Shareholders\\' Meeting, singing a \"spanglish\" version of the song. She performed parts of the Spanish version and others of an unreleased English version. An English version of the song was released on a remix of the song, the George Figares Radio Mix.\\n\\nTrack listing\\n*CD Single\\n# \"Causa y Efecto\" Album version - 3:27\\n\\nChart performance\\nThe song debuted at number 40 on the \"Billboard\" Hot Latin Songs, the next week the song jumped at #26, obtaining the highest \\'jump\\' of the week on the chart. The song peaked #1 for five consecutive weeks and it\\'s her fourth #1 and her twelfth top ten hit on Hot Latin songs. It debuted at #22 and peaked at #1 on the \"Billboard\" Latin Pop Airplay.\\n\\nIn Spain, the single debuted at #43, and has peaked at #7 based on downloads alone. On May 9, 2009, \"Causa y Efecto\" entered at the Spanish Airplay Chart at #7 as the highest debut of that week, and peaked at #1 for three consecutive weeks.\\n\\nMusic video\\nThe music video for \"Causa y Efecto\" was directed by the Austrian director Rudi Dolezal. The video was filmed in Miami, Florida during the month of March at M3 Studios. The video premiered worldwide on May 7 and in the U.S on the channel MTV Tres.\\n\\nRemix\\nAn official remix of the song was released on June 12. It features reggaeton duo Angel & Khriz.\\n\\nCharts\\n\\nChart (2009)\\nPeakposition\\n\\nMexico (Monitor Latino)\\n1\\n\\nSpanish Airplay Chart\\n1\\n\\nSpanish Singles Chart\\n7\\n\\nU.S. \"Billboard\" Bubbling Under Hot 100\\n4\\n\\nU.S. \"Billboard\" Heatseeker Songs \\n23\\n\\nU.S. \"Billboard\" Tropical Songs \\n29\\n\\nU.S. \"Billboard\" Hot Latin Songs\\n1\\n\\nU.S. \"Billboard\" Latin Pop Airplay\\n1\\n\\nU.S. 
\"Billboard\" Latin Rhythm Airplay\\n6\\n\\n\\nSee also\\n*List of number-one Billboard Hot Latin Songs of 2009\\n\\nSales and certifications\\n\\n\\n Country\\n Certification\\n Sales\\n\\n Spain\\n Platinum\\n 40,000\\n\\n\\nReferences'}]" 150 | ] 151 | }, 152 | "execution_count": 27, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "docs.take(1)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 25, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "vocab = text.TermVocab(max_rank=100000,min_rank=0,lowercase=True,min_df=5)(docs.sample(False, 0.25))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 26, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "[{'_id': u'the', 'count': 1172125, 'rank': 0},\n", 183 | " {'_id': u'in', 'count': 1135557, 'rank': 1},\n", 184 | " {'_id': u'a', 'count': 1127366, 'rank': 2},\n", 185 | " {'_id': u'of', 'count': 1101586, 'rank': 3},\n", 186 | " {'_id': u'and', 'count': 1008637, 'rank': 4},\n", 187 | " {'_id': u'is', 'count': 997703, 'rank': 5},\n", 188 | " {'_id': u'references', 'count': 958549, 'rank': 6},\n", 189 | " {'_id': u'to', 'count': 889253, 'rank': 7},\n", 190 | " {'_id': u'was', 'count': 804122, 'rank': 8},\n", 191 | " {'_id': u'for', 'count': 725355, 'rank': 9}]" 192 | ] 193 | }, 194 | "execution_count": 26, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "vocab.take(10)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 34, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/plain": [ 213 | "[{'_id': u'en.wikipedia.org/wiki/New_York_City',\n", 214 | " 'source': 'en.wikipedia.org/wiki/Causa_y_Efecto',\n", 215 | " 'span': (73, 86),\n", 216 | " 'text': u'rubio also performed the song in a private concert at the gotham hall in new york city on may 11 promoting gran city pop.'}]" 217 | ] 218 | }, 219 | "execution_count": 34, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "text.EntityMentions(sentence_window=1,lowercase=True)(docs).take(1)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 31, 231 | "metadata": { 232 | "collapsed": true 233 | }, 234 | "outputs": [], 235 | "source": [ 236 | "mentions = text.IndexMappedMentions(sentence_window=1,lowercase=True)(sc, docs, vocab)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 32, 242 | "metadata": { 243 | "collapsed": false 244 | }, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/plain": [ 249 | "[{'_id': u'en.wikipedia.org/wiki/New_York_City',\n", 250 | " 'source': 'en.wikipedia.org/wiki/Causa_y_Efecto',\n", 251 | " 'span': (14, 17),\n", 252 | " 'text': [25961,\n", 253 | " 18,\n", 254 | " 686,\n", 255 | " 0,\n", 256 | " 447,\n", 257 | " 1,\n", 258 | " 2,\n", 259 | " 574,\n", 260 | " 2057,\n", 261 | " 13,\n", 262 | " 0,\n", 263 | " 21394,\n", 264 | " 518,\n", 265 | " 1,\n", 266 | " 35,\n", 267 | " 227,\n", 268 | " 98,\n", 269 | " 11,\n", 270 | " 46,\n", 271 | " 205,\n", 272 | " 3585,\n", 273 | " 9860,\n", 274 | " 98,\n", 275 | " 1770]}]" 276 | ] 277 | }, 278 | "execution_count": 32, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "mentions.take(1)" 285 | ] 286 | }, 287 | { 
288 | "cell_type": "code", 289 | "execution_count": 42, 290 | "metadata": { 291 | "collapsed": false 292 | }, 293 | "outputs": [ 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "[{'_id': u'', 'count': 4},\n", 298 | " {'_id': u'www.rsssf.com/tabless/slow97.html', 'count': 2},\n", 299 | " {'_id': u'en.wikipedia.org/wiki/Yuba,_Michigan', 'count': 3},\n", 300 | " {'_id': u'en.wikipedia.org/wiki/Walnut_River_(Kansas)', 'count': 12},\n", 301 | " {'_id': u'www.ctheritage.org/encyclopedia/ct1865_1929/admin_baldwin.htm',\n", 302 | " 'count': 2},\n", 303 | " {'_id': u'en.wikipedia.org/wiki/Falling_factorial', 'count': 28},\n", 304 | " {'_id': u'en.wikipedia.org/wiki/WOW_Worship:_Blue', 'count': 5},\n", 305 | " {'_id': u'en.wikipedia.org/wiki/Ekhane_Pinjar', 'count': 2},\n", 306 | " {'_id': u'en.wikipedia.org/wiki/Conditional_execution', 'count': 2},\n", 307 | " {'_id': u'en.wikipedia.org/wiki/Paralititan', 'count': 27}]" 308 | ] 309 | }, 310 | "execution_count": 42, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "links.EntityCounts()(docs).take(10)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 49, 322 | "metadata": { 323 | "collapsed": false 324 | }, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/plain": [ 329 | "[{'_id': u'XMT',\n", 330 | " 'counts': {u'en.wikipedia.org/wiki/Cray_XMT': 1,\n", 331 | " u'en.wikipedia.org/wiki/Extensible_MPEG-4_Textual_Format': 1,\n", 332 | " u'en.wikipedia.org/wiki/XMT': 1},\n", 333 | " 'total': 3},\n", 334 | " {'_id': u'New York',\n", 335 | " 'counts': {u'en.wikipedia.org/wiki/New_York_(disambiguation)': 1,\n", 336 | " u'en.wikipedia.org/wiki/New_York_Disability_Benefits_Law': 1,\n", 337 | " u'en.wikipedia.org/wiki/New_York_State_Armory_(Poughkeepsie)': 1,\n", 338 | " u'en.wikipedia.org/wiki/Outline_of_New_York': 1,\n", 339 | " u'en.wikipedia.org/wiki/Vehicle_registration_plates_of_New_York': 1},\n", 340 | " 'total': 5},\n", 341 | " {'_id': u'Albert Lewis',\n", 342 | " 'counts': {u'en.wikipedia.org/wiki/Albert_Gerald_Lewis': 1,\n", 343 | " u'en.wikipedia.org/wiki/Albert_Lewis': 1,\n", 344 | " u'en.wikipedia.org/wiki/Albert_Lewis_(American_football)': 27,\n", 345 | " u'en.wikipedia.org/wiki/Albert_Lewis_(Sheffield_United)': 1,\n", 346 | " u'en.wikipedia.org/wiki/Albert_Lewis_(footballer)': 3,\n", 347 | " u'en.wikipedia.org/wiki/Albert_Lewis_(priest)': 2,\n", 348 | " u'en.wikipedia.org/wiki/Albert_Lewis_(producer)': 5,\n", 349 | " u'en.wikipedia.org/wiki/Talbot_Lewis': 1},\n", 350 | " 'total': 41},\n", 351 | " {'_id': u'WFA website',\n", 352 | " 'counts': {u'wfafootball.com': 1,\n", 353 | " u'www.wfafootball.com': 1,\n", 354 | " u'www.wfafootball.com/': 2},\n", 355 | " 'total': 4},\n", 356 | " {'_id': u'Marlboro British F3 Championship round 3',\n", 357 | " 'counts': {u'en.wikipedia.org/wiki/1981_Marlboro_British_F3_Championship,_Rd.3': 1,\n", 358 | " u'en.wikipedia.org/wiki/1982_Marlboro_British_F3_Championship,_Rd.3': 1,\n", 359 | " u'en.wikipedia.org/wiki/1983_Marlboro_British_F3_Championship,_Rd.3': 1,\n", 360 | " u'en.wikipedia.org/wiki/1984_Marlboro_British_F3_Championship,_Rd.3': 1,\n", 361 | " u'en.wikipedia.org/wiki/1985_Marlboro_British_F3_Championship,_Rd.3': 1},\n", 362 | " 'total': 5}]" 363 | ] 364 | }, 365 | "execution_count": 49, 366 | "metadata": {}, 367 | "output_type": "execute_result" 368 | } 369 | ], 370 | "source": [ 371 | "links\\\n", 372 | " .EntityNameCounts()(docs)\\\n", 373 | " .filter(lambda r: len(r['counts']) >= 3)\\\n", 374 | " .take(5)" 375 
| ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "collapsed": true 382 | }, 383 | "outputs": [], 384 | "source": [] 385 | } 386 | ], 387 | "metadata": { 388 | "kernelspec": { 389 | "display_name": "Python 2", 390 | "language": "python", 391 | "name": "python2" 392 | }, 393 | "language_info": { 394 | "codemirror_mode": { 395 | "name": "ipython", 396 | "version": 2 397 | }, 398 | "file_extension": ".py", 399 | "mimetype": "text/x-python", 400 | "name": "python", 401 | "nbconvert_exporter": "python", 402 | "pygments_lexer": "ipython2", 403 | "version": "2.7.6" 404 | } 405 | }, 406 | "nbformat": 4, 407 | "nbformat_minor": 0 408 | } 409 | -------------------------------------------------------------------------------- /sift/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.2.0' -------------------------------------------------------------------------------- /sift/build.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import shutil 4 | import textwrap 5 | import argparse 6 | import ujson as json 7 | 8 | from pyspark import SparkContext, SparkConf 9 | from sift.format import ModelFormat 10 | 11 | import logging 12 | log = logging.getLogger() 13 | 14 | class DatasetBuilder(object): 15 | """ Wrapper for modules which extract models of entities or text from a corpus of linked documents """ 16 | def __init__(self, **kwargs): 17 | self.output_path = kwargs.pop('output_path') 18 | self.sample = kwargs.pop('sample') 19 | 20 | fmtcls = kwargs.pop('fmtcls') 21 | fmt_args = {p:kwargs[p] for p in fmtcls.__init__.__code__.co_varnames if p in kwargs} 22 | self.formatter = fmtcls(**fmt_args) 23 | 24 | modelcls = kwargs.pop('modelcls') 25 | self.model_name = re.sub('([A-Z])', r' \1', modelcls.__name__).strip() 26 | 27 | log.info("Building %s...", self.model_name) 28 | self.model = modelcls(**kwargs) 29 | 30 | def __call__(self): 31 | c = SparkConf().setAppName('Build %s' % self.model_name) 32 | 33 | log.info('Using spark master: %s', c.get('spark.master')) 34 | sc = SparkContext(conf=c) 35 | 36 | kwargs = self.model.prepare(sc) 37 | m = self.model.build(**kwargs) 38 | m = self.model.format_items(m) 39 | m = self.formatter(m) 40 | 41 | if self.output_path: 42 | log.info("Saving to: %s", self.output_path) 43 | if os.path.isdir(self.output_path): 44 | log.warn('Writing over output path: %s', self.output_path) 45 | shutil.rmtree(self.output_path) 46 | m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec') 47 | elif self.sample > 0: 48 | print '\n'.join(str(i) for i in m.take(self.sample)) 49 | 50 | log.info('Done.') 51 | 52 | @classmethod 53 | def add_arguments(cls, p): 54 | p.add_argument('--save', dest='output_path', required=False, default=None, metavar='OUTPUT_PATH') 55 | p.add_argument('--sample', dest='sample', required=False, default=1, type=int, metavar='NUM_SAMPLES') 56 | p.set_defaults(cls=cls) 57 | 58 | sp = p.add_subparsers() 59 | for modelcls in cls.providers(): 60 | name = modelcls.__name__ 61 | help_str = modelcls.__doc__.split('\n')[0] 62 | desc = textwrap.dedent(modelcls.__doc__.rstrip()) 63 | csp = sp.add_parser(name, 64 | help=help_str, 65 | description=desc, 66 | formatter_class=argparse.RawDescriptionHelpFormatter) 67 | modelcls.add_arguments(csp) 68 | cls.add_formatter_arguments(csp) 69 | 70 | return p 71 | 72 | @classmethod 73 | def add_formatter_arguments(cls, p): 74 | sp = 
p.add_subparsers() 75 | for fmtcls in ModelFormat.iter_options(): 76 | name = fmtcls.__name__.lower() 77 | if name.endswith('format'): 78 | name = name[:-len('format')] 79 | help_str = fmtcls.__doc__.split('\n')[0] 80 | desc = textwrap.dedent(fmtcls.__doc__.rstrip()) 81 | csp = sp.add_parser(name, 82 | help=help_str, 83 | description=desc, 84 | formatter_class=argparse.RawDescriptionHelpFormatter) 85 | fmtcls.add_arguments(csp) 86 | return p 87 | -------------------------------------------------------------------------------- /sift/corpora/__init__.py: -------------------------------------------------------------------------------- 1 | class Corpus(object): 2 | pass 3 | -------------------------------------------------------------------------------- /sift/corpora/commoncrawl.py: -------------------------------------------------------------------------------- 1 | import re 2 | from cStringIO import StringIO 3 | from warc import WARCFile 4 | from dragnet import content_extractor, BlockifyError 5 | from lxml import etree 6 | from bs4 import BeautifulSoup 7 | from sift.dataset import ModelBuilder, Model, Documents 8 | from sift import logging 9 | import pycld2 as cld 10 | from pycld2 import error as cld_error 11 | 12 | LINKS_RE = re.compile(r'(.+?)') 13 | 14 | class WARCCorpus(ModelBuilder, Model): 15 | def __init__(self, language=None): 16 | self.language = language 17 | 18 | @staticmethod 19 | def parse_warc_content(buf): 20 | try: 21 | wf = WARCFile(fileobj=StringIO(buf)) 22 | record = wf.read_record() 23 | payload = record.payload.read() 24 | top = payload[:15] 25 | 26 | if top.startswith('HTTP/') and top.endswith('200 OK'): 27 | content_start = payload.find('\r\n\r\n') 28 | if content_start != -1: 29 | yield record.url, payload[content_start+4:] 30 | except IOError: 31 | pass 32 | 33 | @staticmethod 34 | def try_get_lang(content): 35 | try: 36 | reliable, _, details = cld.detect(content) 37 | if reliable: 38 | return details[0][1] 39 | except cld_error: 40 | pass 41 | return None 42 | 43 | def build(self, sc, path): 44 | PAGE_DELIMITER = "WARC/1.0\r\n" 45 | warcs = sc\ 46 | .newAPIHadoopFile( 47 | path, 48 | "org.apache.hadoop.mapreduce.lib.input.TextInputFormat", 49 | "org.apache.hadoop.io.LongWritable", 50 | "org.apache.hadoop.io.Text", 51 | conf = { "textinputformat.record.delimiter": PAGE_DELIMITER })\ 52 | .filter(lambda (_, part): part)\ 53 | .map(lambda (_, part): PAGE_DELIMITER+part.encode('utf-8'))\ 54 | .flatMap(self.parse_warc_content) 55 | 56 | if self.language != None: 57 | warcs = warcs.filter(lambda (url, content): self.try_get_lang(content) == self.language) 58 | return warcs 59 | 60 | @staticmethod 61 | def format_item((url, content)): 62 | return { 63 | '_id': url, 64 | 'content': content, 65 | } 66 | 67 | class CommonCrawlArticles(ModelBuilder, Documents): 68 | THRESHOLD_CONTENT_SZ = 250000 69 | 70 | @staticmethod 71 | def clean_content((url, content)): 72 | try: 73 | blocks = content_extractor.analyze(content, blocks=True) 74 | content = ''.join(etree.tostring(b.features['block_start_element']) for b in blocks) 75 | if len(content) < CommonCrawlArticles.THRESHOLD_CONTENT_SZ: 76 | yield url, content 77 | except (BlockifyError, etree.SerialisationError): 78 | pass 79 | 80 | @staticmethod 81 | def parse_article(content): 82 | soup = BeautifulSoup(content, 'lxml') 83 | 84 | for tag in soup.find_all(): 85 | if tag.name == 'a' and tag.attrs.get('href') and tag.text.strip(): 86 | tag.attrs = {'href': tag.attrs['href']} 87 | else: 88 | tag.unwrap() 89 | 90 | return 
soup.encode_contents().decode('utf-8').strip() 91 | 92 | @staticmethod 93 | def extract_links(content): 94 | links = [] 95 | offset = 0 96 | for match in LINKS_RE.finditer(content): 97 | target = match.group(1) 98 | anchor = match.group(2) 99 | start = match.start() - offset 100 | offset += len(match.group())-len(anchor) 101 | links.append((target, slice(start, start+len(anchor)))) 102 | 103 | return LINKS_RE.sub(r'\2', content), links 104 | 105 | def build(self, corpus): 106 | return corpus\ 107 | .map(lambda item: (item['_id'], item['content']))\ 108 | .flatMap(self.clean_content)\ 109 | .mapValues(self.parse_article)\ 110 | .mapValues(self.extract_links) 111 | -------------------------------------------------------------------------------- /sift/corpora/redirects.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import ujson as json 3 | 4 | from sift.dataset import Model, DocumentModel 5 | from sift.util import trim_link_protocol, iter_sent_spans, ngrams 6 | 7 | from sift import logging 8 | log = logging.getLogger() 9 | 10 | class MapRedirects(Model): 11 | """ Map redirects """ 12 | def __init__(self, *args, **kwargs): 13 | self.from_path = kwargs.pop('from_path') 14 | self.to_path = kwargs.pop('to_path') 15 | 16 | def prepare(self, sc): 17 | return { 18 | "from_rds": self.load(sc, self.from_path).cache(), 19 | "to_rds": self.load(sc, self.to_path).cache() 20 | } 21 | 22 | @staticmethod 23 | def map_redirects(source, target): 24 | return source\ 25 | .map(lambda (s, t): (t, s))\ 26 | .leftOuterJoin(target)\ 27 | .map(lambda (t, (s, r)): (s, r or t))\ 28 | .distinct() 29 | 30 | def build(self, from_rds, to_rds): 31 | # map source of destination kb 32 | # e.g. (a > b) and (a > c) becomes (b > c) 33 | mapped_to = to_rds\ 34 | .leftOuterJoin(from_rds)\ 35 | .map(lambda (s, (t, f)): (f or s, t))\ 36 | 37 | # map target of origin kb 38 | # e.g. 
(a > b) and (b > c) becomes (a > c) 39 | mapped_from = from_rds\ 40 | .map(lambda (s, t): (t, s))\ 41 | .leftOuterJoin(mapped_to)\ 42 | .map(lambda (t, (s, r)): (s, r))\ 43 | .filter(lambda (s, t): t) 44 | 45 | rds = (mapped_from + mapped_to).distinct() 46 | rds.cache() 47 | 48 | log.info('Resolving transitive mappings over %i redirects...', rds.count()) 49 | rds = self.map_redirects(rds, rds) 50 | 51 | log.info('Resolved %i redirects...', rds.count()) 52 | return rds 53 | 54 | @staticmethod 55 | def load(sc, path, fmt=json): 56 | log.info('Using redirects: %s', path) 57 | return sc\ 58 | .textFile(path)\ 59 | .map(fmt.loads)\ 60 | .map(lambda r: (r['_id'], r['target'])) 61 | 62 | def format_items(self, model): 63 | return model\ 64 | .map(lambda (source, target): { 65 | '_id': source, 66 | 'target': target 67 | }) 68 | 69 | @classmethod 70 | def add_arguments(cls, p): 71 | super(MapRedirects, cls).add_arguments(p) 72 | p.add_argument('from_path', metavar='FROM_REDIRECTS_PATH') 73 | p.add_argument('to_path', metavar='TO_REDIRECTS_PATH') 74 | return p 75 | 76 | class RedirectDocuments(DocumentModel): 77 | """ Map links in a corpus via a set of redirects """ 78 | def __init__(self, **kwargs): 79 | self.redirect_path = kwargs.pop('redirects_path') 80 | super(RedirectDocuments, self).__init__(**kwargs) 81 | 82 | def prepare(self, sc): 83 | params = super(RedirectDocuments, self).prepare(sc) 84 | params['redirects'] = self.load(sc, self.redirect_path).cache() 85 | return params 86 | 87 | def build(self, corpus, redirects): 88 | articles = corpus.map(lambda d: (d['_id'], d)) 89 | 90 | def map_doc_links(doc, rds): 91 | for l in doc['links']: 92 | l['target'] = rds[l['target']] 93 | return doc 94 | 95 | return corpus\ 96 | .map(lambda d: (d['_id'], set(l['target'] for l in d['links'])))\ 97 | .flatMap(lambda (pid, links): [(t, pid) for t in links])\ 98 | .leftOuterJoin(redirects)\ 99 | .map(lambda (t, (pid, r)): (pid, (t, r if r else t)))\ 100 | .groupByKey()\ 101 | .mapValues(dict)\ 102 | .join(articles)\ 103 | .map(lambda (pid, (rds, doc)): map_doc_links(doc, rds)) 104 | 105 | def format_items(self, model): 106 | return model 107 | 108 | @classmethod 109 | def add_arguments(cls, p): 110 | super(RedirectDocuments, cls).add_arguments(p) 111 | p.add_argument('redirects_path', metavar='REDIRECTS_PATH') 112 | return p 113 | -------------------------------------------------------------------------------- /sift/corpora/wikicorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Based on wikicorpus.py from Gensim: 5 | # https://github.com/piskvorky/gensim/blob/develop/gensim/corpora/wikicorpus.py 6 | # Credits: 7 | # Radim Rehurek 8 | # Lars Buitinck 9 | 10 | import re 11 | import xml.etree.cElementTree as ET 12 | 13 | from htmlentitydefs import name2codepoint 14 | 15 | wikilink_prefix = 'en.wikipedia.org/wiki/' 16 | 17 | RE_P0 = re.compile('', re.DOTALL | re.UNICODE) # comments 18 | RE_P1 = re.compile(' ].*?)(|/>)', re.DOTALL | re.UNICODE) # footnotes 19 | RE_P2 = re.compile("(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", re.UNICODE) # links to languages 20 | RE_P3 = re.compile("{{([^}{]*)}}", re.DOTALL | re.UNICODE) # template 21 | RE_P4 = re.compile("{{([^}]*)}}", re.DOTALL | re.UNICODE) # template 22 | RE_P5 = re.compile('\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description 23 | RE_P6 = re.compile("\[\[:?([^][]*)\|([^][]*)\]\]", re.DOTALL | re.UNICODE) # simplify links, keep 
description 24 | RE_P6_ex = re.compile("\[\[:?([^][]*)\]\]", re.DOTALL | re.UNICODE) # links without description 25 | RE_P7 = re.compile('\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images 26 | RE_P8 = re.compile('\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files 27 | RE_P9 = re.compile(' ].*?)(|/>)', re.DOTALL | re.UNICODE) # outside links 28 | RE_P10 = re.compile(' ].*?)(|/>)', re.DOTALL | re.UNICODE) # math content 29 | RE_P11 = re.compile('<(.*?)>', re.DOTALL | re.UNICODE) # all other tags 30 | RE_P12 = re.compile('\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting 31 | RE_P13 = re.compile('\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting 32 | RE_P14 = re.compile('\[\[Category:[^][]*\]\]', re.UNICODE) # categories 33 | RE_P15 = re.compile('\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) 34 | 35 | RE_BI = re.compile(r"'''''([^']*?)'''''") 36 | RE_B = re.compile(r"'''(.*?)'''") 37 | RE_IQ = re.compile(r"''\"(.*?)\"''") 38 | RE_I = re.compile(r"''([^']*)''") 39 | RE_QQ = re.compile(r'""(.*?)""') 40 | RE_SECT = re.compile(r'(==+)\s*(.*?)\s*\1') 41 | RE_EMPTY_PARENS = re.compile(r' \(\s*\)') 42 | 43 | RE_HTML_ENT = re.compile("&#?(\w+);") 44 | 45 | def remove_markup((uri, text)): 46 | text = re.sub(RE_P2, "", text) 47 | 48 | # TODO: may be desirable to extract captions for files and images and insert them back into the document 49 | text = remove_template(text) 50 | text = extract_tag_content(text, [ 51 | re.compile('\[\[[fF]ile:(.*?)(\|[^\]\[]+?)*\|'), 52 | re.compile('\[\[[iI]mage:(.*?)(\|[^\]\[]+?)*\|') 53 | ]) 54 | 55 | # the wiki markup is recursive (markup inside markup etc) we deal with that by removing 56 | # markup in a loop, starting with inner-most expressions and working outwards as long as something changes. 
57 | iters = 0 58 | while True: 59 | old, iters = text, iters + 1 60 | text = re.sub(RE_P0, "", text) # remove comments 61 | text = re.sub(RE_P1, '', text) # remove footnotes 62 | text = re.sub(RE_P9, "", text) # remove outside links 63 | text = re.sub(RE_P10, "", text) # remove math content 64 | if iters == 1: 65 | text = re.sub(RE_P11, "", text) # remove all remaining tags 66 | 67 | # todo: extract sections 68 | text = re.sub(RE_SECT, '\\2', text) 69 | 70 | # inject link from the first bolded phrase as a mention of the article entity 71 | # this heuristic holds for the vast majority of pages and is a wiki standard 72 | text = re.sub(RE_B, '\\1' % uri, text, 1) 73 | 74 | text = re.sub(RE_P14, '', text) # remove categories 75 | 76 | # inject links 77 | text = re.sub(RE_P5, '\\3', text) # remove urls, keep description 78 | text = re.sub(RE_P6, '\\2' % wikilink_prefix, text) # simplify links, keep description only 79 | text = re.sub(RE_P6_ex, '\\1' % wikilink_prefix, text) 80 | # remove table markup 81 | text = text.replace('||', '\n|') # each table cell on a separate line 82 | text = re.sub(RE_P12, '\n', text) # remove formatting lines 83 | text = re.sub(RE_P13, '\n\\3', text) # leave only cell content 84 | # remove empty mark-up 85 | text = text.replace('[]', '') 86 | 87 | # formatting 88 | text = re.sub(RE_BI, r"\1", text) 89 | text = re.sub(RE_B, r"\1", text) 90 | text = re.sub(RE_IQ, r'"\1"', text) 91 | text = re.sub(RE_I, r'"\1"', text) 92 | text = re.sub(RE_QQ, r"\1", text) 93 | 94 | if old == text or iters > 2: # stop if nothing changed between two iterations or after a fixed number of iterations 95 | break 96 | 97 | text = re.sub(RE_EMPTY_PARENS, '', text) # remove empty parenthesis (usually left by stripped templates) 98 | text = text.replace('[', '').replace(']', '') # promote all remaining markup to plain text 99 | text = html_unescape(text.strip()) 100 | return (uri, text) 101 | 102 | def remove_template(s): 103 | # Find the start and end position of each template by finding the opening '{{' and closing '}}' 104 | n_open, n_close = 0, 0 105 | starts, ends = [], [] 106 | in_template = False 107 | prev_c = None 108 | for i, c in enumerate(iter(s)): 109 | if not in_template: 110 | if c == '{' and c == prev_c: 111 | starts.append(i - 1) 112 | in_template = True 113 | n_open = 1 114 | if in_template: 115 | if c == '{': 116 | n_open += 1 117 | elif c == '}': 118 | n_close += 1 119 | if n_open == n_close: 120 | ends.append(i) 121 | in_template = False 122 | n_open, n_close = 0, 0 123 | prev_c = c 124 | 125 | # Remove all the templates 126 | s = ''.join([s[end + 1:start] for start, end in 127 | zip(starts + [None], [-1] + ends)]) 128 | 129 | return s 130 | 131 | def extract_tag_content(s, tags, include_content=True): 132 | s = s.replace(u'\u2502','|') 133 | for t in tags: 134 | parts = [] 135 | last_match_end = None 136 | for match in t.finditer(s): 137 | parts.append(slice(last_match_end,match.start())) 138 | 139 | i = match.end() 140 | while True: 141 | next_open = s.find('[[', i) 142 | next_close = s.find(']]', i) 143 | if next_open == -1 or next_open > next_close: 144 | last_match_end = next_close 145 | break 146 | elif next_close == -1: 147 | # unbalanced tags in wikimarkup, bail! 
148 | last_match_end = i 149 | break 150 | i = next_close+2 151 | if include_content and match.end() != last_match_end: 152 | content = s[match.end():last_match_end].strip('] ') 153 | if content: 154 | parts.append(slice(match.end(),last_match_end)) 155 | if not content.endswith('.'): 156 | parts.append('.') 157 | last_match_end += 2 158 | parts.append(slice(last_match_end,None)) 159 | s = ''.join(s[p] if type(p) is slice else p for p in parts) 160 | 161 | return s 162 | 163 | def html_unescape(text): 164 | def replace(m): 165 | span, code = m.group(0), m.group(1) 166 | try: 167 | if span[1] == "#": 168 | return unichr(int(code[1:], 16)) if span[2] == "x" else unichr(int(code)) 169 | else: 170 | return unichr(name2codepoint[code]) 171 | except: 172 | return span 173 | return re.sub(RE_HTML_ENT, replace, text) 174 | 175 | def extract_page(content): 176 | e = ET.fromstring(content.encode('utf-8')) 177 | 178 | title = e.find('title').text 179 | ns = e.find('ns').text 180 | pageid = int(e.find('id').text) 181 | redirect_elem = e.find('redirect') 182 | redirect = None if redirect_elem == None else redirect_elem.attrib['title'] 183 | content = None if redirect != None else e.find('revision/text').text 184 | uri = wikilink_prefix+title.replace(' ', '_') 185 | 186 | return uri, ns, pageid, redirect, content 187 | 188 | def normalise_wikilink(s): 189 | s = s.replace(' ', '_').strip('_').strip() 190 | if s and s[0].islower(): 191 | s = s[0].upper() + s[1:] 192 | return s 193 | 194 | def normalise_link(s): 195 | if s.startswith(wikilink_prefix): 196 | s = wikilink_prefix + normalise_wikilink(s[len(wikilink_prefix):]) 197 | return s 198 | 199 | def extract_links(content): 200 | links_re = re.compile(r'(.+?)') 201 | 202 | links = [] 203 | offset = 0 204 | for match in list(links_re.finditer(content)): 205 | target = match.group(1) 206 | anchor = match.group(2) 207 | start = match.start() - offset 208 | offset += len(match.group())-len(anchor) 209 | links.append((normalise_link(target), slice(start, start+len(anchor)))) 210 | 211 | return links_re.sub(r'\2', content), links 212 | -------------------------------------------------------------------------------- /sift/corpora/wikidata.py: -------------------------------------------------------------------------------- 1 | import ujson as json 2 | 3 | from sift.corpora import wikicorpus 4 | from sift.dataset import ModelBuilder, Model, Relations 5 | 6 | from sift import logging 7 | log = logging.getLogger() 8 | 9 | ENTITY_PREFIX = 'Q' 10 | PREDICATE_PREFIX = 'P' 11 | 12 | class WikidataCorpus(ModelBuilder, Model): 13 | @staticmethod 14 | def iter_item_for_line(line): 15 | line = line.strip() 16 | if line != '[' and line != ']': 17 | yield json.loads(line.rstrip(',\n')) 18 | 19 | def build(self, sc, path): 20 | return sc\ 21 | .textFile(path)\ 22 | .flatMap(self.iter_item_for_line)\ 23 | .map(lambda i: (i['id'], i)) 24 | 25 | @staticmethod 26 | def format_item((wid, item)): 27 | return { 28 | '_id': wid, 29 | 'data': item 30 | } 31 | 32 | class WikidataRelations(ModelBuilder, Relations): 33 | """ Prepare a corpus of relations from wikidata """ 34 | @staticmethod 35 | def iter_relations_for_item(item): 36 | for pid, statements in item.get('claims', {}).iteritems(): 37 | for statement in statements: 38 | if statement['mainsnak'].get('snaktype') == 'value': 39 | datatype = statement['mainsnak'].get('datatype') 40 | if datatype == 'wikibase-item': 41 | yield pid, int(statement['mainsnak']['datavalue']['value']['numeric-id']) 42 | elif datatype == 'time': 43 | 
yield pid, statement['mainsnak']['datavalue']['value']['time'] 44 | elif datatype == 'string' or datatype == 'url': 45 | yield pid, statement['mainsnak']['datavalue']['value'] 46 | 47 | def build(self, corpus): 48 | entities = corpus\ 49 | .filter(lambda item: item['_id'].startswith(ENTITY_PREFIX)) 50 | 51 | entity_labels = entities\ 52 | .map(lambda item: (item['_id'], item['data'].get('labels', {}).get('en', {}).get('value', None)))\ 53 | .filter(lambda (pid, label): label)\ 54 | .map(lambda (pid, label): (int(pid[1:]), label)) 55 | 56 | wiki_entities = entities\ 57 | .map(lambda item: (item['data'].get('sitelinks', {}).get('enwiki', {}).get('title', None), item['data']))\ 58 | .filter(lambda (e, _): e)\ 59 | .cache() 60 | 61 | predicate_labels = corpus\ 62 | .filter(lambda item: item['_id'].startswith(PREDICATE_PREFIX))\ 63 | .map(lambda item: (item['_id'], item['data'].get('labels', {}).get('en', {}).get('value', None)))\ 64 | .filter(lambda (pid, label): label)\ 65 | .cache() 66 | 67 | relations = wiki_entities\ 68 | .flatMap(lambda (eid, item): ((pid, (value, eid)) for pid, value in self.iter_relations_for_item(item)))\ 69 | .join(predicate_labels)\ 70 | .map(lambda (pid, ((value, eid), label)): (value, (label, eid))) 71 | 72 | return relations\ 73 | .leftOuterJoin(entity_labels)\ 74 | .map(lambda (value, ((label, eid), value_label)): (eid, (label, value_label or value)))\ 75 | .groupByKey()\ 76 | .mapValues(dict) 77 | -------------------------------------------------------------------------------- /sift/corpora/wikipedia.py: -------------------------------------------------------------------------------- 1 | import ujson as json 2 | 3 | from sift.corpora import wikicorpus 4 | from sift.dataset import ModelBuilder, Model, Redirects, Documents 5 | 6 | from sift import logging 7 | log = logging.getLogger() 8 | 9 | class WikipediaCorpus(ModelBuilder, Model): 10 | def build(self, sc, path): 11 | PAGE_DELIMITER = "\n \n" 12 | PAGE_START = '\n' 13 | PAGE_END = '' 14 | return sc\ 15 | .newAPIHadoopFile( 16 | path, 17 | "org.apache.hadoop.mapreduce.lib.input.TextInputFormat", 18 | "org.apache.hadoop.io.LongWritable", 19 | "org.apache.hadoop.io.Text", 20 | conf = { "textinputformat.record.delimiter": PAGE_DELIMITER })\ 21 | .map(lambda (_, part): (part.find(PAGE_START), part))\ 22 | .filter(lambda (offset, _): offset >= 0)\ 23 | .map(lambda (offset, content): content[offset:]+PAGE_END)\ 24 | .map(wikicorpus.extract_page) 25 | 26 | @staticmethod 27 | def format_item((title, ns, pid, redirect, content)): 28 | return { 29 | '_id': title, 30 | 'pid': pid, 31 | 'namespace': ns, 32 | 'redirect': redirect, 33 | 'content': content 34 | } 35 | 36 | class WikipediaRedirects(ModelBuilder, Redirects): 37 | """ Extract a set of redirects from wikipedia """ 38 | def __init__(self, resolve_transitive=False): 39 | self.resolve_transitive = resolve_transitive 40 | 41 | def build(self, pages, verbose=False): 42 | pfx = wikicorpus.wikilink_prefix 43 | redirects = pages\ 44 | .filter(lambda page: page['redirect'] != None)\ 45 | .map(lambda page: (page['_id'], page['redirect']))\ 46 | .mapValues(wikicorpus.normalise_wikilink)\ 47 | .map(lambda (s, t): (s, pfx+t)) 48 | 49 | if self.resolve_transitive: 50 | redirects = redirects.cache() 51 | 52 | num_targets = redirects\ 53 | .map(lambda (k,v): v)\ 54 | .distinct()\ 55 | .count() 56 | 57 | redirects = redirects\ 58 | .map(lambda (s, t): (t, s)).leftOuterJoin(redirects)\ 59 | .map(lambda (target, (source, redirect)): (source, redirect or target)) 60 | 61 | if 
verbose: 62 | redirects = redirects.cache() 63 | final_num_targets = redirects.map(lambda (k,v): v).distinct().count() 64 | log.info('Resolved %i transitive redirects...', num_targets - final_num_targets) 65 | 66 | return redirects.distinct() 67 | 68 | class WikipediaArticles(ModelBuilder, Documents): 69 | """ Prepare a corpus of documents from wikipedia """ 70 | def build(self, corpus, redirects=None): 71 | articles = corpus\ 72 | .filter(lambda page: page['namespace'] == '0' and page['redirect'] == None and page['content'])\ 73 | .map(lambda page: (page['_id'], page['content']))\ 74 | .map(wikicorpus.remove_markup)\ 75 | .mapValues(wikicorpus.extract_links) 76 | 77 | if redirects: 78 | redirects = redirects.map(lambda r: (r['_id'], r['target'])) 79 | articles.cache() 80 | 81 | # redirect set is typically too large to be broadcasted for a map-side join 82 | articles = articles\ 83 | .flatMap(lambda (pid, (text, links)): ((t, (pid, span)) for t, span in links))\ 84 | .leftOuterJoin(redirects)\ 85 | .map(lambda (t, ((pid, span), r)): (pid, (r if r else t, span)))\ 86 | .groupByKey()\ 87 | .mapValues(list)\ 88 | .join(articles)\ 89 | .map(lambda (pid, (links, (text, _))): (pid, (text, links))) 90 | 91 | return articles 92 | -------------------------------------------------------------------------------- /sift/dataset.py: -------------------------------------------------------------------------------- 1 | import ujson as json 2 | 3 | class ModelBuilder(object): 4 | def __init__(self, *args, **kwargs): pass 5 | 6 | def __call__(self, *args, **kwargs): 7 | return self.build(*args, **kwargs).map(self.format_item) 8 | 9 | def build(self, *args, **kwargs): 10 | raise NotImplementedError 11 | 12 | class Model(object): 13 | @staticmethod 14 | def format_item(item): 15 | raise NotImplementedError 16 | 17 | @staticmethod 18 | def load(sc, path, fmt=json): 19 | return sc.textFile(path).map(json.loads) 20 | 21 | @staticmethod 22 | def save(m, path, fmt=json): 23 | m.map(json.dumps).saveAsTextFile(path, 'org.apache.hadoop.io.compress.GzipCodec') 24 | 25 | class Redirects(Model): 26 | @staticmethod 27 | def format_item((source, target)): 28 | return {'_id': source, 'target': target} 29 | 30 | class Vocab(Model): 31 | @staticmethod 32 | def format_item((term, (count, rank))): 33 | return { 34 | '_id': term, 35 | 'count': count, 36 | 'rank': rank 37 | } 38 | 39 | class Mentions(Model): 40 | @staticmethod 41 | def format_item((target, source, text, span)): 42 | return { 43 | '_id': target, 44 | 'source': source, 45 | 'text': text, 46 | 'span': span 47 | } 48 | 49 | class IndexedMentions(Model): 50 | @staticmethod 51 | def format_item((target, source, text, span)): 52 | return { 53 | '_id': target, 54 | 'source': source, 55 | 'sequence': text, 56 | 'span': span 57 | } 58 | 59 | class Documents(Model): 60 | @staticmethod 61 | def format_item((uri, (text, links))): 62 | return { 63 | '_id': uri, 64 | 'text': text, 65 | 'links': [{ 66 | 'target': target, 67 | 'start': span.start, 68 | 'stop': span.stop 69 | } for target, span in links] 70 | } 71 | 72 | class Relations(Model): 73 | @staticmethod 74 | def format_item((uri, relations)): 75 | return { 76 | '_id': uri, 77 | 'relations': relations 78 | } -------------------------------------------------------------------------------- /sift/format.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import ujson as json 3 | import msgpack 4 | import base64 5 | 6 | class ModelFormat(object): 7 | def 
__init__(self): 8 | pass 9 | def __call__(self, model): 10 | raise NotImplemented 11 | 12 | @classmethod 13 | def iter_options(cls): 14 | yield JsonFormat 15 | yield RedisFormat 16 | yield TsvFormat 17 | 18 | class TsvFormat(ModelFormat): 19 | """ Format model output as tab separated values """ 20 | @staticmethod 21 | def items_to_tsv(items): 22 | key_order = None 23 | for item in items: 24 | if key_order == None: 25 | key_order = [] 26 | if '_id' in item: 27 | key_order.append('_id') 28 | key_order += sorted(k for k in item.iterkeys() if k != '_id') 29 | 30 | # todo: proper field serialization and escapes 31 | yield u'\t'.join(unicode(item[k]) for k in key_order).encode('utf-8') 32 | 33 | def __call__(self, model): 34 | return model.mapPartitions(self.items_to_tsv) 35 | 36 | @classmethod 37 | def add_arguments(cls, p): 38 | p.set_defaults(fmtcls=cls) 39 | return p 40 | 41 | class JsonFormat(ModelFormat): 42 | """ Format model output as json """ 43 | def __call__(self, model): 44 | return model.map(json.dumps) 45 | 46 | @classmethod 47 | def add_arguments(cls, p): 48 | p.set_defaults(fmtcls=cls) 49 | return p 50 | 51 | class RedisFormat(ModelFormat): 52 | """ Format model output as redis protocol SET commands """ 53 | def __init__(self, prefix, serializer, field): 54 | if serializer == 'raw' and not field: 55 | raise Exception("Target field required for raw serializer") 56 | 57 | self.prefix = prefix 58 | self.field = field 59 | self.serializer = { 60 | 'json': json.dumps, 61 | 'msgpack': lambda o: base64.b64encode(msgpack.dumps(o)), 62 | 'pickle': lambda o: base64.b64encode(pickle.dumps(o, -1)), 63 | 'raw': lambda o: o 64 | }[serializer] 65 | 66 | def to_value(self, item): 67 | if self.field: 68 | item = unicode(item[self.field]) 69 | else: 70 | item.pop('_id', None) 71 | return self.serializer(item) 72 | 73 | def __call__(self, model): 74 | cmd = '\r\n'.join(["*3", "$3", "SET", "${}", "{}", "${}", "{}"])+'\r' 75 | return model\ 76 | .map(lambda i: ((self.prefix+i['_id'].replace('"','\\"')).encode('utf-8'), self.to_value(i)))\ 77 | .map(lambda (t, c): cmd.format(len(t), t, len(c), c)) 78 | 79 | @classmethod 80 | def add_arguments(cls, p): 81 | p.add_argument('--prefix', required=False, default='', metavar='PREFIX') 82 | p.add_argument('--serializer', choices=['json', 'pickle', 'msgpack', 'raw'], required=False, default='json', metavar='SERIALIZER') 83 | p.add_argument('--field', required=False, metavar='FIELD_TO_SERIALIZE') 84 | p.set_defaults(fmtcls=cls) 85 | return p 86 | -------------------------------------------------------------------------------- /sift/logging.py: -------------------------------------------------------------------------------- 1 | """ Logging Configuration """ 2 | from __future__ import absolute_import 3 | import logging 4 | 5 | def setup(): 6 | fmt = '%(asctime)s|%(levelname)s|%(module)s|%(message)s' 7 | logging.basicConfig(format=fmt) 8 | log = logging.getLogger('nel') 9 | log.setLevel(logging.DEBUG) 10 | 11 | def getLogger(): 12 | return logging.getLogger('nel') 13 | 14 | setup() -------------------------------------------------------------------------------- /sift/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andychisholm/sift/c25cc42ab4ad7f44838036c63d0be4b03767c16c/sift/models/__init__.py -------------------------------------------------------------------------------- /sift/models/embeddings.py: 
-------------------------------------------------------------------------------- 1 | from operator import add 2 | from itertools import chain 3 | 4 | from sift.models.text import EntityMentions 5 | from sift.util import ngrams 6 | from sift.dataset import ModelBuilder, Model 7 | 8 | from sift import logging 9 | log = logging.getLogger() 10 | 11 | class EntitySkipGramEmbeddings(ModelBuilder, Model): 12 | """ Learn distributed representations for words and entities in a corpus via skip-gram embedding """ 13 | def __init__( 14 | self, 15 | dimensions=100, 16 | min_word_count=500, 17 | min_entity_count=10, 18 | entity_prefix='en.wikipedia.org/wiki/', 19 | exclude_words=False, 20 | exclude_entities=False, 21 | workers=4, 22 | coalesce=None, 23 | *args, **kwargs): 24 | 25 | self.dimensions = dimensions 26 | self.min_word_count = min_word_count 27 | self.min_entity_count = min_entity_count 28 | self.filter_target = entity_prefix 29 | self.exclude_words = exclude_words 30 | self.exclude_entities = exclude_entities 31 | self.workers = workers 32 | self.coalesce = coalesce 33 | 34 | def get_trim_rule(self): 35 | from gensim.utils import RULE_KEEP, RULE_DISCARD 36 | def trim_rule(word, count, min_count): 37 | if not word.startswith(self.filter_target): 38 | return RULE_KEEP if count >= self.min_word_count else RULE_DISCARD 39 | else: 40 | return RULE_KEEP if count >= self.min_entity_count else RULE_DISCARD 41 | return RULE_KEEP 42 | return trim_rule 43 | 44 | def build(self, mentions): 45 | from gensim.models.word2vec import Word2Vec 46 | sentences = mentions\ 47 | .filter(lambda (target, source, text, span): target.startswith(self.filter_target))\ 48 | 49 | sentences = sentences\ 50 | .map(lambda (target, source, text, (s,e)): list(chain(ngrams(text[:s],1), [target], ngrams(text[e:],1)))) 51 | 52 | if self.coalesce: 53 | sentences = sentences.coalesce(self.coalesce) 54 | 55 | sentences = sentences.cache() 56 | 57 | model = Word2Vec(sample=1e-5, size=self.dimensions, workers=self.workers) 58 | 59 | log.info('Preparing corpus...') 60 | model.corpus_count = sentences.count() 61 | 62 | log.info('Computing vocab statistics...') 63 | term_counts = sentences\ 64 | .flatMap(lambda tokens: ((t, 1) for t in tokens))\ 65 | .reduceByKey(add)\ 66 | .filter(lambda (t, count): \ 67 | (t.startswith(self.filter_target) and count >= self.min_entity_count) or \ 68 | (count >= self.min_word_count)) 69 | 70 | model.raw_vocab = dict(term_counts.collect()) 71 | model.scale_vocab(trim_rule=self.get_trim_rule()) 72 | model.finalize_vocab() 73 | 74 | log.info('Training local word2vec model...') 75 | model.train(sentences.toLocalIterator()) 76 | 77 | log.info('Normalising embeddings...') 78 | model.init_sims(replace=True) 79 | 80 | total_entities = sum(1 if t.startswith(self.filter_target) else 0 for t in model.vocab.iterkeys()) 81 | total_words = len(model.vocab) - total_entities 82 | 83 | vocab_sz = 0 84 | if not self.exclude_entities: 85 | log.info('Including %i entity embeddings in exported vocab...', total_entities) 86 | vocab_sz += total_entities 87 | if not self.exclude_words: 88 | log.info('Including %i word embeddings in exported vocab...', total_words) 89 | vocab_sz += total_words 90 | 91 | log.info('Parallelizing %i learned embeddings...', vocab_sz) 92 | return mentions\ 93 | .context\ 94 | .parallelize( 95 | (t, model.syn0[vi.index].tolist()) 96 | for t, vi in model.vocab.iteritems() 97 | if (not self.exclude_entities and t.startswith(self.filter_target)) or 98 | (not self.exclude_words and not 
t.startswith(self.filter_target))) 99 | 100 | @staticmethod 101 | def format_item((entity, embedding)): 102 | return { 103 | '_id': entity, 104 | 'embedding': embedding 105 | } -------------------------------------------------------------------------------- /sift/models/links.py: -------------------------------------------------------------------------------- 1 | import ujson as json 2 | 3 | from operator import add 4 | from collections import Counter 5 | from itertools import chain 6 | 7 | from sift.dataset import ModelBuilder, Documents, Model 8 | from sift.util import trim_link_subsection, trim_link_protocol, ngrams 9 | 10 | from sift import logging 11 | log = logging.getLogger() 12 | 13 | class EntityCounts(ModelBuilder, Model): 14 | """ Inlink counts """ 15 | def __init__(self, min_count=1, filter_target=None): 16 | self.min_count = min_count 17 | self.filter_target = filter_target 18 | 19 | def build(self, docs): 20 | links = docs\ 21 | .flatMap(lambda d: d['links'])\ 22 | .map(lambda l: l['target'])\ 23 | .map(trim_link_subsection)\ 24 | .map(trim_link_protocol) 25 | 26 | if self.filter_target: 27 | links = links.filter(lambda l: l.startswith(self.filter_target)) 28 | 29 | return links\ 30 | .map(lambda l: (l, 1))\ 31 | .reduceByKey(add)\ 32 | .filter(lambda (t, c): c > self.min_count) 33 | 34 | @staticmethod 35 | def format_item((target, count)): 36 | return { 37 | '_id': target, 38 | 'count': count 39 | } 40 | 41 | class EntityNameCounts(ModelBuilder, Model): 42 | """ Entity counts by name """ 43 | def __init__(self, lowercase=False, filter_target=None): 44 | self.lowercase = lowercase 45 | self.filter_target = filter_target 46 | 47 | def iter_anchor_target_pairs(self, doc): 48 | for link in doc['links']: 49 | target = link['target'] 50 | target = trim_link_subsection(target) 51 | target = trim_link_protocol(target) 52 | 53 | anchor = doc['text'][link['start']:link['stop']].strip() 54 | 55 | if self.lowercase: 56 | anchor = anchor.lower() 57 | 58 | if anchor and target: 59 | yield anchor, target 60 | 61 | def build(self, docs): 62 | m = docs.flatMap(lambda d: self.iter_anchor_target_pairs(d)) 63 | 64 | if self.filter_target: 65 | m = m.filter(lambda (a, t): t.startswith(self.filter_target)) 66 | 67 | return m\ 68 | .groupByKey()\ 69 | .mapValues(Counter) 70 | 71 | @staticmethod 72 | def format_item((anchor, counts)): 73 | return { 74 | '_id': anchor, 75 | 'counts': dict(counts), 76 | 'total': sum(counts.itervalues()) 77 | } 78 | 79 | class NamePartCounts(ModelBuilder, Model): 80 | """ 81 | Occurrence counts for ngrams at different positions within link anchors. 
82 | 'B' - beginning of span 83 | 'E' - end of span 84 | 'I' - inside span 85 | 'O' - outside span 86 | """ 87 | def __init__(self, max_ngram=2, lowercase=False, filter_target=None): 88 | self.lowercase = lowercase 89 | self.filter_target = filter_target 90 | self.max_ngram = max_ngram 91 | 92 | def iter_anchors(self, doc): 93 | for link in doc['links']: 94 | anchor = doc['text'][link['start']:link['stop']].strip() 95 | if self.lowercase: 96 | anchor = anchor.lower() 97 | if anchor: 98 | yield anchor 99 | 100 | @staticmethod 101 | def iter_span_count_types(anchor, n): 102 | parts = list(ngrams(anchor, n, n)) 103 | if parts: 104 | yield parts[0], 'B' 105 | yield parts[-1], 'E' 106 | for i in xrange(1, len(parts)-1): 107 | yield parts[i], 'I' 108 | 109 | def build(self, docs): 110 | part_counts = docs\ 111 | .flatMap(self.iter_anchors)\ 112 | .flatMap(lambda a: chain.from_iterable(self.iter_span_count_types(a, i) for i in xrange(1, self.max_ngram+1)))\ 113 | .map(lambda p: (p, 1))\ 114 | .reduceByKey(add)\ 115 | .map(lambda ((term, spantype), count): (term, (spantype, count))) 116 | 117 | part_counts += docs\ 118 | .flatMap(lambda d: ngrams(d['text'], self.max_ngram))\ 119 | .map(lambda t: (t, 1))\ 120 | .reduceByKey(add)\ 121 | .filter(lambda (t, c): c > 1)\ 122 | .map(lambda (t, c): (t, ('O', c))) 123 | 124 | return part_counts\ 125 | .groupByKey()\ 126 | .mapValues(dict)\ 127 | .filter(lambda (t, cs): 'O' in cs and len(cs) > 1) 128 | 129 | @staticmethod 130 | def format_item((term, part_counts)): 131 | return { 132 | '_id': term, 133 | 'counts': dict(part_counts) 134 | } 135 | 136 | class EntityInlinks(ModelBuilder, Model): 137 | """ Inlink sets for each entity """ 138 | def build(self, docs): 139 | return docs\ 140 | .flatMap(lambda d: ((d['_id'], l) for l in set(l['target'] for l in d['links'])))\ 141 | .mapValues(trim_link_subsection)\ 142 | .mapValues(trim_link_protocol)\ 143 | .map(lambda (k, v): (v, k))\ 144 | .groupByKey()\ 145 | .mapValues(list) 146 | 147 | @staticmethod 148 | def format_item((target, inlinks)): 149 | return { 150 | '_id': target, 151 | 'inlinks': inlinks 152 | } 153 | 154 | class EntityVocab(ModelBuilder, Model): 155 | """ Generate unique indexes for entities in a corpus. 
""" 156 | def __init__(self, min_rank=0, max_rank=10000): 157 | self.min_rank = min_rank 158 | self.max_rank = max_rank 159 | 160 | def build(self, docs): 161 | log.info('Building entity vocab: df rank range=(%i, %i)', self.min_rank, self.max_rank) 162 | m = super(EntityVocab, self)\ 163 | .build(docs)\ 164 | .map(lambda (target, count): (count, target))\ 165 | .sortByKey(False)\ 166 | .zipWithIndex()\ 167 | .map(lambda ((df, t), idx): (t, (df, idx))) 168 | 169 | if self.min_rank != None: 170 | m = m.filter(lambda (t, (df, idx)): idx >= self.min_rank) 171 | if self.max_rank != None: 172 | m = m.filter(lambda (t, (df, idx)): idx < self.max_rank) 173 | return m 174 | 175 | @staticmethod 176 | def format_item((term, (f, idx))): 177 | return { 178 | '_id': term, 179 | 'count': f, 180 | 'rank': idx 181 | } 182 | 183 | @staticmethod 184 | def load(sc, path, fmt=json): 185 | log.info('Loading entity-index mapping: %s ...', path) 186 | return sc\ 187 | .textFile(path)\ 188 | .map(fmt.loads)\ 189 | .map(lambda r: (r['_id'], (r['count'], r['rank']))) 190 | 191 | class EntityComentions(ModelBuilder, Model): 192 | """ Entity comentions """ 193 | @staticmethod 194 | def iter_unique_links(doc): 195 | links = set() 196 | for l in doc['links']: 197 | link = trim_link_subsection(l['target']) 198 | link = trim_link_protocol(link) 199 | if link not in links: 200 | yield link 201 | links.add(link) 202 | 203 | def build(self, docs): 204 | return docs\ 205 | .map(lambda d: (d['_id'], list(self.iter_unique_links(d))))\ 206 | .filter(lambda (uri, es): es) 207 | 208 | @staticmethod 209 | def format_item((uri, es)): 210 | return { 211 | '_id': uri, 212 | 'entities': es 213 | } 214 | 215 | class MappedEntityComentions(EntityComentions): 216 | """ Entity comentions with entities mapped to a numeric index """ 217 | def build(self, docs, entity_vocab): 218 | ev = sc.broadcast(dict(ev.collect())) 219 | return super(MappedEntityComentions, self)\ 220 | .build(docs)\ 221 | .map(lambda (uri, es): (uri, [ev.value[e] for e in es if e in ev.value]))\ 222 | .filter(lambda (uri, es): es) -------------------------------------------------------------------------------- /sift/models/text.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy 3 | import ujson as json 4 | from bisect import bisect_left, bisect_right 5 | from operator import add 6 | from collections import Counter 7 | 8 | from sift.models.links import EntityVocab 9 | from sift.dataset import ModelBuilder, Documents, Model, Mentions, IndexedMentions, Vocab 10 | from sift.util import ngrams, iter_sent_spans, trim_link_subsection, trim_link_protocol 11 | 12 | from sift import logging 13 | log = logging.getLogger() 14 | 15 | class TermFrequencies(ModelBuilder, Model): 16 | """ Get term frequencies over a corpus """ 17 | def __init__(self, lowercase, max_ngram): 18 | self.lowercase = lowercase 19 | self.max_ngram = max_ngram 20 | 21 | def build(self, docs): 22 | m = docs.map(lambda d: d['text']) 23 | if self.lowercase: 24 | m = m.map(unicode.lower) 25 | 26 | return m\ 27 | .flatMap(lambda text: ngrams(text, self.max_ngram))\ 28 | .map(lambda t: (t, 1))\ 29 | .reduceByKey(add)\ 30 | .filter(lambda (k,v): v > 1) 31 | 32 | @staticmethod 33 | def format_item(self, (term, count)): 34 | return { 35 | '_id': term, 36 | 'count': count, 37 | } 38 | 39 | class EntityMentions(ModelBuilder, Mentions): 40 | """ Get aggregated sentence context around links in a corpus """ 41 | def __init__(self, sentence_window = 1, 
lowercase=False, normalize_url=True, strict_sentences=True): 42 | self.sentence_window = sentence_window 43 | self.lowercase = lowercase 44 | self.strict_sentences = strict_sentences 45 | self.normalize_url = normalize_url 46 | 47 | @staticmethod 48 | def iter_mentions(doc, window = 1, norm_url=True, strict=True): 49 | sent_spans = list(iter_sent_spans(doc['text'])) 50 | sent_offsets = [s.start for s in sent_spans] 51 | 52 | for link in doc['links']: 53 | # align the link span over sentence spans in the document 54 | # mention span may cross sentence bounds if sentence tokenisation is dodgy 55 | # if so, the entire span between bounding sentences will be used as context 56 | sent_start_idx = bisect_right(sent_offsets, link['start']) - 1 57 | sent_end_idx = bisect_left(sent_offsets, link['stop']) - 1 58 | 59 | lhs_offset = window / 2 60 | rhs_offset = (window - lhs_offset) - 1 61 | sent_start_idx = max(0, sent_start_idx - lhs_offset) 62 | sent_end_idx = min(len(sent_spans)-1, sent_end_idx + rhs_offset) 63 | sent_offset = sent_spans[sent_start_idx].start 64 | 65 | span = (link['start'] - sent_offset, link['stop'] - sent_offset) 66 | target = link['target'] 67 | if norm_url: 68 | target = trim_link_subsection(link['target']) 69 | target = trim_link_protocol(target) 70 | mention = doc['text'][sent_spans[sent_start_idx].start:sent_spans[sent_end_idx].stop] 71 | 72 | # filter out instances where the mention span is the entire sentence 73 | if span == (0, len(mention)): 74 | continue 75 | 76 | if strict: 77 | # filter out list item sentences 78 | sm = mention.strip() 79 | if not sm or sm.startswith('*') or sm[-1] not in '.!?"\'': 80 | continue 81 | 82 | yield target, doc['_id'], mention, span 83 | 84 | def build(self, docs): 85 | m = docs.flatMap(lambda d: self.iter_mentions(d, self.sentence_window, self.normalize_url, self.strict_sentences)) 86 | if self.lowercase: 87 | m = m.map(lambda (t, src, m, s): (t, src, m.lower(), s)) 88 | return m 89 | 90 | class IndexMappedMentions(EntityMentions, IndexedMentions): 91 | """ Entity mention corpus with terms mapped to numeric indexes """ 92 | def build(self, sc, docs, vocab): 93 | tv = sc.broadcast(dict(vocab.map(lambda r: (r['_id'], r['rank'])).collect())) 94 | return super(IndexMappedMentions, self)\ 95 | .build(docs)\ 96 | .map(lambda m: self.transform(m, tv)) 97 | 98 | @staticmethod 99 | def transform((target, source, text, span), vocab): 100 | vocab = vocab.value 101 | 102 | start, stop = span 103 | pre = list(ngrams(text[:start], 1)) 104 | ins = list(ngrams(text[start:stop], 1)) 105 | post = list(ngrams(text[stop:], 1)) 106 | indexes = [vocab.get(t, len(vocab)-1) for t in (pre+ins+post)] 107 | 108 | return target, source, indexes, (len(pre), len(pre)+len(ins)) 109 | 110 | class TermDocumentFrequencies(ModelBuilder): 111 | """ Get document frequencies for terms in a corpus """ 112 | def __init__(self, lowercase=False, max_ngram=1, min_df=2): 113 | self.lowercase = lowercase 114 | self.max_ngram = max_ngram 115 | self.min_df = min_df 116 | 117 | def build(self, docs): 118 | m = docs.map(lambda d: d['text']) 119 | if self.lowercase: 120 | m = m.map(lambda text: text.lower()) 121 | 122 | return m\ 123 | .flatMap(lambda text: set(ngrams(text, self.max_ngram)))\ 124 | .map(lambda t: (t, 1))\ 125 | .reduceByKey(add)\ 126 | .filter(lambda (k,v): v > self.min_df) 127 | 128 | class TermVocab(TermDocumentFrequencies, Vocab): 129 | """ Generate unique indexes for termed based on their document frequency ranking. 
""" 130 | def __init__(self, max_rank, min_rank=100, *args, **kwargs): 131 | self.max_rank = max_rank 132 | self.min_rank = min_rank 133 | super(TermVocab, self).__init__(*args, **kwargs) 134 | 135 | def build(self, docs): 136 | m = super(TermVocab, self)\ 137 | .build(docs)\ 138 | .map(lambda (t, df): (df, t))\ 139 | .sortByKey(False)\ 140 | .zipWithIndex()\ 141 | .map(lambda ((df, t), idx): (t, (df, idx))) 142 | 143 | if self.min_rank != None: 144 | m = m.filter(lambda (t, (df, idx)): idx >= self.min_rank) 145 | if self.max_rank != None: 146 | m = m.filter(lambda (t, (df, idx)): idx < self.max_rank) 147 | return m 148 | 149 | @staticmethod 150 | def format_item((term, (f, idx))): 151 | return { 152 | '_id': term, 153 | 'count': f, 154 | 'rank': idx 155 | } 156 | 157 | class TermIdfs(TermDocumentFrequencies, Model): 158 | """ Compute tf-idf weighted token counts over sentence contexts around links in a corpus """ 159 | def build(self, corpus): 160 | log.info('Counting documents in corpus...') 161 | N = float(corpus.count()) 162 | dfs = super(TermIdfs, self).build(corpus) 163 | 164 | log.info('Building idf model: N=%i', N) 165 | return dfs\ 166 | .map(lambda (term, (df, rank)): (term, df))\ 167 | .mapValues(lambda df: math.log(N/df)) 168 | 169 | @staticmethod 170 | def format_item((term, idf)): 171 | return { 172 | '_id': term, 173 | 'idf': idf, 174 | } 175 | 176 | class EntityMentionTermFrequency(ModelBuilder, Model): 177 | """ Compute tf-idf weighted token counts over sentence contexts around links in a corpus """ 178 | def __init__(self, max_ngram=1, normalize = True): 179 | self.max_ngram = max_ngram 180 | self.normalize = normalize 181 | 182 | def build(self, mentions, idfs): 183 | m = mentions\ 184 | .map(lambda (target, (span, text)): (target, text))\ 185 | .mapValues(lambda v: ngrams(v, self.max_ngram))\ 186 | .flatMap(lambda (target, tokens): (((target, t), 1) for t in tokens))\ 187 | .reduceByKey(add)\ 188 | .map(lambda ((target, token), count): (token, (target, count)))\ 189 | .leftOuterJoin(idfs)\ 190 | .filter(lambda (token, ((target, count), idf)): idf != None)\ 191 | .map(lambda (token, ((target, count), idf)): (target, (token, math.sqrt(count)*idf)))\ 192 | .groupByKey() 193 | 194 | return m.mapValues(self.normalize_counts if self.normalize else list) 195 | 196 | @staticmethod 197 | def normalize_counts(counts): 198 | norm = numpy.linalg.norm([v for _, v in counts]) 199 | return [(k, v/norm) for k, v in counts] 200 | 201 | @staticmethod 202 | def format_item((link, counts)): 203 | return { 204 | '_id': link, 205 | 'counts': dict(counts), 206 | } 207 | -------------------------------------------------------------------------------- /sift/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pattern import en 3 | 4 | # todo: use spacy tokenization 5 | def ngrams(text, max_n=1, min_n=1, strip_punctuation=True): 6 | pattern_args = {} if strip_punctuation else {'punctuation':''} 7 | for i in xrange(min_n-1,max_n): 8 | for n in en.ngrams(text, n=i+1, **pattern_args): 9 | yield ' '.join(n) 10 | 11 | 12 | # sentences can't end with a single lowercase letter 13 | SENT_NO_END_LC = "(?