├── .gitignore ├── COPYING.LESSER.txt ├── COPYING.txt ├── README.md ├── conf ├── semanticizer.memory.yml ├── semanticizer.redis.yml ├── semanticizer.trove.yml ├── semanticizer.uva.yml └── semanticizer.yml ├── doc ├── Makefile ├── Semanticizer.js ├── advanced.js ├── docs │ ├── Semanticizer.html │ ├── advanced.html │ ├── docco.css │ ├── learning.html │ └── public │ │ ├── fonts │ │ ├── aller-bold.eot │ │ ├── aller-bold.ttf │ │ ├── aller-bold.woff │ │ ├── aller-light.eot │ │ ├── aller-light.ttf │ │ ├── aller-light.woff │ │ ├── fleurons.eot │ │ ├── fleurons.ttf │ │ ├── fleurons.woff │ │ ├── novecento-bold.eot │ │ ├── novecento-bold.ttf │ │ └── novecento-bold.woff │ │ ├── images │ │ └── gray.png │ │ └── stylesheets │ │ └── normalize.css └── learning.js ├── semanticizer.svg ├── semanticizer ├── __init__.py ├── config.py ├── dbinsert │ ├── __init__.py │ └── __main__.py ├── processors │ ├── __init__.py │ ├── context.py │ ├── core.py │ ├── external.py │ ├── feature.py │ ├── features.py │ ├── image.py │ ├── learning.py │ ├── multiple.py │ ├── semanticize.py │ ├── semanticizer.py │ ├── stringUtils.py │ └── util.py ├── procpipeline.py ├── server │ ├── __init__.py │ └── __main__.py ├── util │ ├── __init__.py │ ├── online_learning.py │ ├── profiler.py │ ├── store_dataset.py │ └── timer.py └── wpm │ ├── __init__.py │ ├── data.py │ ├── db │ ├── __init__.py │ ├── inmemory.py │ ├── mongodb.py │ └── redisdb.py │ ├── load.py │ ├── namespace.py │ └── utils │ ├── __init__.py │ ├── emphasis_resolver.py │ ├── markup_stripper.py │ └── wikidumps.py ├── semanticizer_wsgi.py ├── setup.py └── test ├── TestConfig.py ├── TestInputdata.py ├── TestMain.py ├── TestProcpipeline.py ├── TestServer.py └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Editor cruft 2 | *.sw[op] 3 | *~ 4 | ._* 5 | .DS_Store 6 | 7 | *.pyc 8 | 9 | logs 10 | /log.txt 11 | 12 | # Packages 13 | *.egg 14 | *.egg-info 15 | dist 16 | build 17 | eggs 18 | parts 19 | bin 20 | var 21 | sdist 22 | develop-eggs 23 | .installed.cfg 24 | lib 25 | lib64 26 | __pycache__ 27 | 28 | # Installer logs 29 | pip-log.txt 30 | 31 | # Unit test / coverage reports 32 | .coverage 33 | .tox 34 | nosetests.xml 35 | 36 | # Translations 37 | *.mo 38 | 39 | # Mr Developer 40 | .mr.developer.cfg 41 | .project 42 | .pydevproject 43 | 44 | #netbeans 45 | /nbproject/ 46 | 47 | # MediaWiki dumps 48 | *.bz2 49 | *.gz 50 | *.sql 51 | *.xml 52 | -------------------------------------------------------------------------------- /COPYING.LESSER.txt: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 
21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 
93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. 
If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Semanticizer 2 | 3 | The Semanticizer is a web service application for semantic linking 4 | created in 2012 by [Daan Odijk](http://staff.science.uva.nl/~dodijk/) 5 | at [ILPS](http://ilps.science.uva.nl/) (University of Amsterdam). 6 | 7 | This project has since received contributions from (in alphabetical order): 8 | [Marc Bron](http://staff.science.uva.nl/~mbron/), 9 | [Lars Buitinck](http://staff.science.uva.nl/~buitinck/), 10 | [Bart van den Ende](http://www.bartvandenende.nl/), 11 | [David Graus](http://graus.nu/), 12 | [Tom Kenter](http://staff.science.uva.nl/~tkenter1/), 13 | [Evert Lammerts](http://www.evertlammerts.nl/), 14 | [Edgar Meij](http://edgar.meij.pro/), 15 | [Daan Odijk](http://staff.science.uva.nl/~dodijk/), 16 | [Anne Schuth](http://www.anneschuth.nl/) and 17 | [Isaac Sijaranamual](http://nl.linkedin.com/pub/isaac-sijaranamual/). 18 | 19 | The algorithms for this webservice were developed for and are described in 20 | an OAIR2013 publication on 21 | [Feeding the Second Screen](http://ilps.science.uva.nl/biblio/feeding-second-screen-semantic-linking-based-subtitles) 22 | by [Daan Odijk](http://staff.science.uva.nl/~dodijk/), 23 | [Edgar Meij](http://edgar.meij.pro/) and 24 | [Maarten de Rijke](http://staff.science.uva.nl/~mdr/). Part of this 25 | research was inspired by earlier ILPS publications: 26 | [Adding Semantics to Microblog Posts](http://ilps.science.uva.nl/biblio/adding-semantics-microblog-posts) 27 | and 28 | [Mapping Queries To The Linking Open Data Cloud](http://ilps.science.uva.nl/node/889). 29 | If you use this webservice for your own research, please include a 30 | reference to the OAIR2013 article or, alternatively, to any of these 31 | articles. 32 | 33 | The [online documentation](http://semanticize.uva.nl/doc/) describes 34 | how to use the Semanticizer Web API. This 35 | [REST](http://en.wikipedia.org/wiki/Representational_state_transfer)-like 36 | web service returns [JSON](http://www.json.org/) and is exposed to the 37 | public at http://semanticize.uva.nl/api/. Currently, an access key for 38 | the webservice is not needed. 39 | 40 | The [code](https://github.com/semanticize/semanticizer/) is released 41 | under the LGPL license (see below). If you have any questions, contact 42 | [Daan](http://staff.science.uva.nl/~dodijk/). 43 | 44 | If you want to dive into the code, start at `semanticizer/server/__main__.py`. 45 | 46 | 47 | ## Requirements 48 | 49 | 1. The software has been tested with Python 2.7.3 on Mac OS X 10.8 and 50 | Linux (RedHat EL5, Debian jessie/sid and Ubuntu 12.04). 51 | 52 | 2. 
The following Python modules need to be installed (using 53 | easy_install or pip): 54 | 55 | * nltk 56 | * leven 57 | * networkx 58 | * lxml 59 | * flask 60 | * redis (optional, see point 4) 61 | * scikit-learn (optional, see point 5) 62 | * scipy (optional, see point 5) 63 | * mock (optional, used by the tests) 64 | 65 | 3. A summary of a Wikipedia dump is needed. For this, download the 66 | [Wikipedia Miner CSV files](http://sourceforge.net/projects/wikipedia-miner/files/data/). 67 | 68 | 4. Copy one of the example config files in the `conf` folder (e.g. 69 | `semanticizer.memory.yml` or `semanticizer.redis.yml`) to `semanticizer.yml` in that folder and adapt it to your situation. You 70 | have the choice of loading all data into memory (use 71 | `semanticizer.memory.yml`) or into [Redis](http://redis.io/) using 72 | the following steps: 73 | 74 | 1. Copy `semanticizer.redis.yml` into `semanticizer.yml`. 75 | 76 | 2. A Redis server needs to be set up and running. 77 | 78 | 3. Load data into Redis: `python -m semanticizer.dbinsert [--language=<langcode>] [--output=/tmp/redisinsert.log]`. 79 | 80 | 4. Run the server using `python -m semanticizer.server`. 81 | 82 | 5. To work with the features you need to install the 83 | scikit-learn and scipy packages. Before installing scipy you need 84 | to have [swig](http://www.swig.org/download.html) installed; see 85 | its INSTALL file for instructions (configure, make, make 86 | install). Note that working with features is still under active 87 | development and therefore not fully documented and tested. 88 | 89 | ## License 90 | 91 | This program is free software: you can redistribute it and/or modify 92 | it under the terms of the GNU Lesser General Public License as 93 | published by the Free Software Foundation, either version 3 of the 94 | License, or (at your option) any later version. 95 | 96 | This program is distributed in the hope that it will be useful, but 97 | WITHOUT ANY WARRANTY; without even the implied warranty of 98 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 99 | Lesser General Public License for more details. 100 | 101 | You should have received a copy of the GNU Lesser General Public 102 | License along with this program. If not, see 103 | <http://www.gnu.org/licenses/>. 104 | -------------------------------------------------------------------------------- /conf/semanticizer.memory.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see .
13 | 14 | server: 15 | port: 5000 16 | host: 0.0.0.0 17 | 18 | wpm: 19 | languages: 20 | en: 21 | source: memory 22 | initparams: 23 | path: ./enwiki-20110722 24 | language: english 25 | threads: 16 26 | bdburl: http://wikipedia-miner.cms.waikato.ac.nz/services/exploreArticle 27 | 28 | linkprocs: 29 | features: false 30 | 31 | logging: 32 | verbose: true 33 | path: log.txt 34 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s' 35 | 36 | misc: 37 | tempdir: /tmp 38 | -------------------------------------------------------------------------------- /conf/semanticizer.redis.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | server: 15 | port: 5000 16 | host: 0.0.0.0 17 | use_reloader: true 18 | 19 | wpm: 20 | languages: 21 | en: 22 | source: redis 23 | initparams: 24 | path: ./enwiki-20110722 25 | language: english 26 | host: localhost 27 | port: 6379 28 | 29 | linkprocs: 30 | features: false 31 | 32 | logging: 33 | verbose: true 34 | path: log.txt 35 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s' 36 | 37 | misc: 38 | tempdir: /tmp 39 | -------------------------------------------------------------------------------- /conf/semanticizer.trove.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
13 | 14 | server: 15 | port: 5000 16 | host: 0.0.0.0 17 | 18 | wpm: 19 | languages: 20 | # memory backend 21 | nl: 22 | source: WpmDataInProc 23 | initparams: 24 | path: /zfs/ilps-plexer/wikipediaminer/nlwiki-20130318 25 | language: dutch 26 | # translation_languages should be a list of iso 639-2 language 27 | # codes 28 | translation_languages: [] 29 | # Redis backend 30 | # nl: 31 | # source: wpmdata_redis.WpmDataRedis 32 | # initparams: 33 | # host: localhost 34 | # port: 6379 35 | threads: 16 36 | bdburl: http://zookst13.science.uva.nl:8080/dutchsemcor/article 37 | 38 | semanticize: 39 | max_ngram_length: 12 40 | 41 | linkprocs: 42 | includefeatures: false 43 | 44 | logging: 45 | verbose: true 46 | path: log.txt 47 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s' 48 | 49 | misc: 50 | tempdir: /tmp 51 | -------------------------------------------------------------------------------- /conf/semanticizer.uva.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | server: 15 | port: 5000 16 | host: 0.0.0.0 17 | use_reloader: false 18 | 19 | wpm: 20 | languages: 21 | nl: 22 | source: WpmDataRedis 23 | initparams: 24 | host: zookst14.science.uva.nl 25 | port: 6379 26 | 27 | # Use the in-memory backend: this is faster than the Redis backend 28 | # but uses a lot more memory, especially if you intent to run 29 | # multiple semanticizers. 
30 | # nl: 31 | # source: WpmDataInProc 32 | # initparams: 33 | # path: /zfs/ilps-plexer/wikipediaminer/nlwiki-20111104 34 | # language: dutch 35 | # # translation_languages should be a list of iso 639-2 language 36 | # # codes 37 | # translation_languages: ["en", "fr", "de", "nl"] 38 | en: 39 | source: WpmDataRedis 40 | initparams: 41 | host: zookst14.science.uva.nl 42 | port: 6379 43 | es: 44 | source: WpmDataRedis 45 | initparams: 46 | host: zookst14.science.uva.nl 47 | port: 6379 48 | fr: 49 | source: WpmDataRedis 50 | initparams: 51 | host: zookst14.science.uva.nl 52 | port: 6379 53 | de: 54 | source: WpmDataRedis 55 | initparams: 56 | host: zookst14.science.uva.nl 57 | port: 6379 58 | threads: 16 59 | bdburl: http://zookst13.science.uva.nl:8080/dutchsemcor/article 60 | 61 | linkprocs: 62 | includefeatures: true 63 | 64 | learning: 65 | model_dir: /zfs/ilps-plexer/dodijk/semanticizer.models 66 | 67 | logging: 68 | verbose: true 69 | path: log.txt 70 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s' 71 | 72 | misc: 73 | tempdir: /tmp 74 | 75 | settings: 76 | vara: 77 | pre_filter: unique,senseProbability>0.01 78 | learning: coling-SP0.2-100.RandomForestClassifier-10-auto.pkl 79 | filter: unique,learningProbability>=0.5 80 | -------------------------------------------------------------------------------- /conf/semanticizer.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
13 | 14 | server: 15 | port: 8005 16 | host: 0.0.0.0 17 | use_reloader: false 18 | 19 | settings: 20 | include_categories: True 21 | include_definitions: True 22 | 23 | wpm: 24 | languages: 25 | #en: 26 | # source: redis 27 | # initparams: 28 | # path: /zfs/ilps-plexer/wikipediaminer/en 29 | # host: localhost 30 | # port: 6379 31 | # language: english 32 | # #translation_languages: ["nl", "fr", "de", "es"] # TODO: We should include all possible params in the config file [DG] 33 | 34 | nl: 35 | source: redis 36 | initparams: 37 | path: /zfs/ilps-plexer/wikipediaminer/nlwiki-latest 38 | host: localhost 39 | port: 6379 40 | language: nederlands 41 | #translation_languages: ["en", "fr", "de", "es"] 42 | 43 | linkprocs: 44 | features: false 45 | 46 | logging: 47 | verbose: true 48 | path: log.txt 49 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s' 50 | 51 | misc: 52 | tempdir: /tmp 53 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | doc: 2 | docco -l linear Semanticizer.js 3 | docco -l linear advanced.js 4 | docco -l linear learning.js 5 | 6 | all: doc 7 | 8 | publish: all 9 | rsync -av docs/ zookma:/datastore/applications/semanticize/doc/ 10 | 11 | watch: 12 | watch "*.js" 1s "make publish" 13 | -------------------------------------------------------------------------------- /doc/docs/docco.css: -------------------------------------------------------------------------------- 1 | /*--------------------- Typography ----------------------------*/ 2 | 3 | @font-face { 4 | font-family: 'aller-light'; 5 | src: url('public/fonts/aller-light.eot'); 6 | src: url('public/fonts/aller-light.eot?#iefix') format('embedded-opentype'), 7 | url('public/fonts/aller-light.woff') format('woff'), 8 | url('public/fonts/aller-light.ttf') format('truetype'); 9 | font-weight: normal; 10 | font-style: normal; 11 | } 12 | 13 | @font-face { 14 | font-family: 'aller-bold'; 15 | src: url('public/fonts/aller-bold.eot'); 16 | src: url('public/fonts/aller-bold.eot?#iefix') format('embedded-opentype'), 17 | url('public/fonts/aller-bold.woff') format('woff'), 18 | url('public/fonts/aller-bold.ttf') format('truetype'); 19 | font-weight: normal; 20 | font-style: normal; 21 | } 22 | 23 | @font-face { 24 | font-family: 'novecento-bold'; 25 | src: url('public/fonts/novecento-bold.eot'); 26 | src: url('public/fonts/novecento-bold.eot?#iefix') format('embedded-opentype'), 27 | url('public/fonts/novecento-bold.woff') format('woff'), 28 | url('public/fonts/novecento-bold.ttf') format('truetype'); 29 | font-weight: normal; 30 | font-style: normal; 31 | } 32 | 33 | @font-face { 34 | font-family: 'fleurons'; 35 | src: url('public/fonts/fleurons.eot'); 36 | src: url('public/fonts/fleurons.eot?#iefix') format('embedded-opentype'), 37 | url('public/fonts/fleurons.woff') format('woff'), 38 | url('public/fonts/fleurons.ttf') format('truetype'); 39 | font-weight: normal; 40 | font-style: normal; 41 | } 42 | 43 | /*--------------------- Base Styles ----------------------------*/ 44 | 45 | body { 46 | font-family: "aller-light"; 47 | background: url('public/images/gray.png') #fff; 48 | background-size: 322px; 49 | margin: 0; 50 | } 51 | 52 | hr { 53 | height: 1px; 54 | background: #ddd; 55 | border: 0; 56 | } 57 | 58 | h1, h2, h3, h4, h5, h6 { 59 | color: #112233; 60 | font-weight: normal; 61 | font-family: "novecento-bold"; 62 | text-transform: uppercase; 63 | 
line-height: 1em; 64 | margin-top: 50px; 65 | } 66 | h1 { 67 | margin: 0; 68 | text-align: center; 69 | } 70 | h2 { 71 | font-size: 1.3em; 72 | } 73 | h1:after { 74 | content: "8"; 75 | display: block; 76 | font-family: "fleurons"; 77 | color: #999; 78 | font-size: 80px; 79 | padding: 10px 0 25px; 80 | } 81 | 82 | a { 83 | color: #000; 84 | } 85 | 86 | b, strong { 87 | font-weight: normal; 88 | font-family: "aller-bold"; 89 | } 90 | 91 | blockquote { 92 | border-left: 5px solid #ccc; 93 | margin-left: 0; 94 | padding: 1px 0 1px 1em; 95 | } 96 | .page blockquote p { 97 | font-family: Menlo, Consolas, Monaco, monospace; 98 | font-size: 14px; line-height: 19px; 99 | color: #999; 100 | margin: 10px 0 0; 101 | white-space: pre-wrap; 102 | } 103 | 104 | pre, tt, code { 105 | font-family: Menlo, Consolas, Monaco, monospace; 106 | font-size: 12px; 107 | display: inline-block; 108 | border: 1px solid #EAEAEA; 109 | background: #f8f8f8; 110 | color: #555; 111 | padding: 0 5px; 112 | line-height: 20px; 113 | } 114 | .page pre { 115 | margin: 0; 116 | width: 608px; 117 | padding: 10px 15px; 118 | background: #fcfcfc; 119 | -moz-box-shadow: inset 0 0 10px rgba(0,0,0,0.1); 120 | -webkit-box-shadow: inset 0 0 10px rgba(0,0,0,0.1); 121 | box-shadow: inset 0 0 10px rgba(0,0,0,0.1); 122 | overflow-x: auto; 123 | } 124 | .page pre code { 125 | border: 0; 126 | padding: 0; 127 | background: transparent; 128 | } 129 | 130 | .fleur { 131 | font-family: "fleurons"; 132 | font-size: 100px; 133 | text-align: center; 134 | margin: 40px 0; 135 | color: #ccc; 136 | } 137 | 138 | /*--------------------- Layout ----------------------------*/ 139 | 140 | .container { 141 | width: 760px; 142 | margin: 0 auto; 143 | background: #fff; 144 | background: rgba(255,255,255, 0.4); 145 | overflow: hidden; 146 | } 147 | .page { 148 | width: 640px; 149 | padding: 30px; 150 | margin: 30px; 151 | background: #fff; 152 | font-size: 17px; 153 | line-height: 26px; 154 | } 155 | .page p { 156 | color: #30404f; 157 | margin: 26px 0; 158 | } 159 | 160 | ul.sections { 161 | list-style: none; 162 | padding:0 0 5px 0;; 163 | margin:0; 164 | } 165 | 166 | .page li p { 167 | margin: 12px 0; 168 | } 169 | 170 | .toc { 171 | max-height: 0; 172 | overflow: hidden; 173 | text-align: center; 174 | font-size: 13px; 175 | line-height: 20px; 176 | -moz-transition: max-height 1s; 177 | -webkit-transition: max-height 1s; 178 | transition: max-height 1s; 179 | } 180 | .header:hover .toc { 181 | max-height: 500px; 182 | } 183 | .toc h3 { 184 | margin-top: 20px; 185 | } 186 | .toc ol { 187 | margin: 0 0 20px 0; 188 | display: inline-block; 189 | text-align: left; 190 | list-style-type: upper-roman; 191 | } 192 | .toc li { 193 | font-family: 'novecento-bold'; 194 | } 195 | .toc li a { 196 | font-family: 'aller-light'; 197 | } 198 | 199 | 200 | /*---------------------- Syntax Highlighting -----------------------------*/ 201 | 202 | td.linenos { background-color: #f0f0f0; padding-right: 10px; } 203 | span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; } 204 | /* 205 | 206 | github.com style (c) Vasily Polovnyov 207 | 208 | */ 209 | 210 | pre code { 211 | display: block; padding: 0.5em; 212 | color: #000; 213 | background: #f8f8ff 214 | } 215 | 216 | pre .comment, 217 | pre .template_comment, 218 | pre .diff .header, 219 | pre .javadoc { 220 | color: #408080; 221 | font-style: italic 222 | } 223 | 224 | pre .keyword, 225 | pre .assignment, 226 | pre .literal, 227 | pre .css .rule .keyword, 228 | pre .winutils, 229 | pre .javascript .title, 230 | pre 
.lisp .title, 231 | pre .subst { 232 | color: #954121; 233 | /*font-weight: bold*/ 234 | } 235 | 236 | pre .number, 237 | pre .hexcolor { 238 | color: #40a070 239 | } 240 | 241 | pre .string, 242 | pre .tag .value, 243 | pre .phpdoc, 244 | pre .tex .formula { 245 | color: #219161; 246 | } 247 | 248 | pre .title, 249 | pre .id { 250 | color: #19469D; 251 | } 252 | pre .params { 253 | color: #00F; 254 | } 255 | 256 | pre .javascript .title, 257 | pre .lisp .title, 258 | pre .subst { 259 | font-weight: normal 260 | } 261 | 262 | pre .class .title, 263 | pre .haskell .label, 264 | pre .tex .command { 265 | color: #458; 266 | font-weight: bold 267 | } 268 | 269 | pre .tag, 270 | pre .tag .title, 271 | pre .rules .property, 272 | pre .django .tag .keyword { 273 | color: #000080; 274 | font-weight: normal 275 | } 276 | 277 | pre .attribute, 278 | pre .variable, 279 | pre .instancevar, 280 | pre .lisp .body { 281 | color: #008080 282 | } 283 | 284 | pre .regexp { 285 | color: #B68 286 | } 287 | 288 | pre .class { 289 | color: #458; 290 | font-weight: bold 291 | } 292 | 293 | pre .symbol, 294 | pre .ruby .symbol .string, 295 | pre .ruby .symbol .keyword, 296 | pre .ruby .symbol .keymethods, 297 | pre .lisp .keyword, 298 | pre .tex .special, 299 | pre .input_number { 300 | color: #990073 301 | } 302 | 303 | pre .builtin, 304 | pre .constructor, 305 | pre .built_in, 306 | pre .lisp .title { 307 | color: #0086b3 308 | } 309 | 310 | pre .preprocessor, 311 | pre .pi, 312 | pre .doctype, 313 | pre .shebang, 314 | pre .cdata { 315 | color: #999; 316 | font-weight: bold 317 | } 318 | 319 | pre .deletion { 320 | background: #fdd 321 | } 322 | 323 | pre .addition { 324 | background: #dfd 325 | } 326 | 327 | pre .diff .change { 328 | background: #0086b3 329 | } 330 | 331 | pre .chunk { 332 | color: #aaa 333 | } 334 | 335 | pre .tex .formula { 336 | opacity: 0.5; 337 | } 338 | -------------------------------------------------------------------------------- /doc/docs/public/fonts/aller-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-bold.eot -------------------------------------------------------------------------------- /doc/docs/public/fonts/aller-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-bold.ttf -------------------------------------------------------------------------------- /doc/docs/public/fonts/aller-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-bold.woff -------------------------------------------------------------------------------- /doc/docs/public/fonts/aller-light.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-light.eot -------------------------------------------------------------------------------- /doc/docs/public/fonts/aller-light.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-light.ttf -------------------------------------------------------------------------------- /doc/docs/public/fonts/aller-light.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-light.woff -------------------------------------------------------------------------------- /doc/docs/public/fonts/fleurons.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/fleurons.eot -------------------------------------------------------------------------------- /doc/docs/public/fonts/fleurons.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/fleurons.ttf -------------------------------------------------------------------------------- /doc/docs/public/fonts/fleurons.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/fleurons.woff -------------------------------------------------------------------------------- /doc/docs/public/fonts/novecento-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/novecento-bold.eot -------------------------------------------------------------------------------- /doc/docs/public/fonts/novecento-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/novecento-bold.ttf -------------------------------------------------------------------------------- /doc/docs/public/fonts/novecento-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/novecento-bold.woff -------------------------------------------------------------------------------- /doc/docs/public/images/gray.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/images/gray.png -------------------------------------------------------------------------------- /doc/docs/public/stylesheets/normalize.css: -------------------------------------------------------------------------------- 1 | /*! normalize.css v2.0.1 | MIT License | git.io/normalize */ 2 | 3 | /* ========================================================================== 4 | HTML5 display definitions 5 | ========================================================================== */ 6 | 7 | /* 8 | * Corrects `block` display not defined in IE 8/9. 
9 | */ 10 | 11 | article, 12 | aside, 13 | details, 14 | figcaption, 15 | figure, 16 | footer, 17 | header, 18 | hgroup, 19 | nav, 20 | section, 21 | summary { 22 | display: block; 23 | } 24 | 25 | /* 26 | * Corrects `inline-block` display not defined in IE 8/9. 27 | */ 28 | 29 | audio, 30 | canvas, 31 | video { 32 | display: inline-block; 33 | } 34 | 35 | /* 36 | * Prevents modern browsers from displaying `audio` without controls. 37 | * Remove excess height in iOS 5 devices. 38 | */ 39 | 40 | audio:not([controls]) { 41 | display: none; 42 | height: 0; 43 | } 44 | 45 | /* 46 | * Addresses styling for `hidden` attribute not present in IE 8/9. 47 | */ 48 | 49 | [hidden] { 50 | display: none; 51 | } 52 | 53 | /* ========================================================================== 54 | Base 55 | ========================================================================== */ 56 | 57 | /* 58 | * 1. Sets default font family to sans-serif. 59 | * 2. Prevents iOS text size adjust after orientation change, without disabling 60 | * user zoom. 61 | */ 62 | 63 | html { 64 | font-family: sans-serif; /* 1 */ 65 | -webkit-text-size-adjust: 100%; /* 2 */ 66 | -ms-text-size-adjust: 100%; /* 2 */ 67 | } 68 | 69 | /* 70 | * Removes default margin. 71 | */ 72 | 73 | body { 74 | margin: 0; 75 | } 76 | 77 | /* ========================================================================== 78 | Links 79 | ========================================================================== */ 80 | 81 | /* 82 | * Addresses `outline` inconsistency between Chrome and other browsers. 83 | */ 84 | 85 | a:focus { 86 | outline: thin dotted; 87 | } 88 | 89 | /* 90 | * Improves readability when focused and also mouse hovered in all browsers. 91 | */ 92 | 93 | a:active, 94 | a:hover { 95 | outline: 0; 96 | } 97 | 98 | /* ========================================================================== 99 | Typography 100 | ========================================================================== */ 101 | 102 | /* 103 | * Addresses `h1` font sizes within `section` and `article` in Firefox 4+, 104 | * Safari 5, and Chrome. 105 | */ 106 | 107 | h1 { 108 | font-size: 2em; 109 | } 110 | 111 | /* 112 | * Addresses styling not present in IE 8/9, Safari 5, and Chrome. 113 | */ 114 | 115 | abbr[title] { 116 | border-bottom: 1px dotted; 117 | } 118 | 119 | /* 120 | * Addresses style set to `bolder` in Firefox 4+, Safari 5, and Chrome. 121 | */ 122 | 123 | b, 124 | strong { 125 | font-weight: bold; 126 | } 127 | 128 | /* 129 | * Addresses styling not present in Safari 5 and Chrome. 130 | */ 131 | 132 | dfn { 133 | font-style: italic; 134 | } 135 | 136 | /* 137 | * Addresses styling not present in IE 8/9. 138 | */ 139 | 140 | mark { 141 | background: #ff0; 142 | color: #000; 143 | } 144 | 145 | 146 | /* 147 | * Corrects font family set oddly in Safari 5 and Chrome. 148 | */ 149 | 150 | code, 151 | kbd, 152 | pre, 153 | samp { 154 | font-family: monospace, serif; 155 | font-size: 1em; 156 | } 157 | 158 | /* 159 | * Improves readability of pre-formatted text in all browsers. 160 | */ 161 | 162 | pre { 163 | white-space: pre; 164 | white-space: pre-wrap; 165 | word-wrap: break-word; 166 | } 167 | 168 | /* 169 | * Sets consistent quote types. 170 | */ 171 | 172 | q { 173 | quotes: "\201C" "\201D" "\2018" "\2019"; 174 | } 175 | 176 | /* 177 | * Addresses inconsistent and variable font size in all browsers. 178 | */ 179 | 180 | small { 181 | font-size: 80%; 182 | } 183 | 184 | /* 185 | * Prevents `sub` and `sup` affecting `line-height` in all browsers. 
186 | */ 187 | 188 | sub, 189 | sup { 190 | font-size: 75%; 191 | line-height: 0; 192 | position: relative; 193 | vertical-align: baseline; 194 | } 195 | 196 | sup { 197 | top: -0.5em; 198 | } 199 | 200 | sub { 201 | bottom: -0.25em; 202 | } 203 | 204 | /* ========================================================================== 205 | Embedded content 206 | ========================================================================== */ 207 | 208 | /* 209 | * Removes border when inside `a` element in IE 8/9. 210 | */ 211 | 212 | img { 213 | border: 0; 214 | } 215 | 216 | /* 217 | * Corrects overflow displayed oddly in IE 9. 218 | */ 219 | 220 | svg:not(:root) { 221 | overflow: hidden; 222 | } 223 | 224 | /* ========================================================================== 225 | Figures 226 | ========================================================================== */ 227 | 228 | /* 229 | * Addresses margin not present in IE 8/9 and Safari 5. 230 | */ 231 | 232 | figure { 233 | margin: 0; 234 | } 235 | 236 | /* ========================================================================== 237 | Forms 238 | ========================================================================== */ 239 | 240 | /* 241 | * Define consistent border, margin, and padding. 242 | */ 243 | 244 | fieldset { 245 | border: 1px solid #c0c0c0; 246 | margin: 0 2px; 247 | padding: 0.35em 0.625em 0.75em; 248 | } 249 | 250 | /* 251 | * 1. Corrects color not being inherited in IE 8/9. 252 | * 2. Remove padding so people aren't caught out if they zero out fieldsets. 253 | */ 254 | 255 | legend { 256 | border: 0; /* 1 */ 257 | padding: 0; /* 2 */ 258 | } 259 | 260 | /* 261 | * 1. Corrects font family not being inherited in all browsers. 262 | * 2. Corrects font size not being inherited in all browsers. 263 | * 3. Addresses margins set differently in Firefox 4+, Safari 5, and Chrome 264 | */ 265 | 266 | button, 267 | input, 268 | select, 269 | textarea { 270 | font-family: inherit; /* 1 */ 271 | font-size: 100%; /* 2 */ 272 | margin: 0; /* 3 */ 273 | } 274 | 275 | /* 276 | * Addresses Firefox 4+ setting `line-height` on `input` using `!important` in 277 | * the UA stylesheet. 278 | */ 279 | 280 | button, 281 | input { 282 | line-height: normal; 283 | } 284 | 285 | /* 286 | * 1. Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio` 287 | * and `video` controls. 288 | * 2. Corrects inability to style clickable `input` types in iOS. 289 | * 3. Improves usability and consistency of cursor style between image-type 290 | * `input` and others. 291 | */ 292 | 293 | button, 294 | html input[type="button"], /* 1 */ 295 | input[type="reset"], 296 | input[type="submit"] { 297 | -webkit-appearance: button; /* 2 */ 298 | cursor: pointer; /* 3 */ 299 | } 300 | 301 | /* 302 | * Re-set default cursor for disabled elements. 303 | */ 304 | 305 | button[disabled], 306 | input[disabled] { 307 | cursor: default; 308 | } 309 | 310 | /* 311 | * 1. Addresses box sizing set to `content-box` in IE 8/9. 312 | * 2. Removes excess padding in IE 8/9. 313 | */ 314 | 315 | input[type="checkbox"], 316 | input[type="radio"] { 317 | box-sizing: border-box; /* 1 */ 318 | padding: 0; /* 2 */ 319 | } 320 | 321 | /* 322 | * 1. Addresses `appearance` set to `searchfield` in Safari 5 and Chrome. 323 | * 2. Addresses `box-sizing` set to `border-box` in Safari 5 and Chrome 324 | * (include `-moz` to future-proof). 
325 | */ 326 | 327 | input[type="search"] { 328 | -webkit-appearance: textfield; /* 1 */ 329 | -moz-box-sizing: content-box; 330 | -webkit-box-sizing: content-box; /* 2 */ 331 | box-sizing: content-box; 332 | } 333 | 334 | /* 335 | * Removes inner padding and search cancel button in Safari 5 and Chrome 336 | * on OS X. 337 | */ 338 | 339 | input[type="search"]::-webkit-search-cancel-button, 340 | input[type="search"]::-webkit-search-decoration { 341 | -webkit-appearance: none; 342 | } 343 | 344 | /* 345 | * Removes inner padding and border in Firefox 4+. 346 | */ 347 | 348 | button::-moz-focus-inner, 349 | input::-moz-focus-inner { 350 | border: 0; 351 | padding: 0; 352 | } 353 | 354 | /* 355 | * 1. Removes default vertical scrollbar in IE 8/9. 356 | * 2. Improves readability and alignment in all browsers. 357 | */ 358 | 359 | textarea { 360 | overflow: auto; /* 1 */ 361 | vertical-align: top; /* 2 */ 362 | } 363 | 364 | /* ========================================================================== 365 | Tables 366 | ========================================================================== */ 367 | 368 | /* 369 | * Remove most spacing between table cells. 370 | */ 371 | 372 | table { 373 | border-collapse: collapse; 374 | border-spacing: 0; 375 | } -------------------------------------------------------------------------------- /semanticizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/__init__.py -------------------------------------------------------------------------------- /semanticizer/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | """ 15 | This module is responsible for loading all possible configuration params and 16 | their defaults, overwriting the defaults by reading values from a given config 17 | file, then overwriting these values to whatever's been passed as argument. 
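A minimal usage sketch (illustrative only: the ('server', 'port') keys come
from the bundled example configs, and the default of 5000 is just an example
fallback):

    from semanticizer.config import load_config, config_get

    # load_config() also parses the command-line flags (-p, -v, -s, -c),
    # so values given on the command line override the config file.
    config = load_config('../conf/semanticizer.yml')
    port = config_get(('server', 'port'), default=5000, config=config)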
18 | """ 19 | import yaml 20 | import sys 21 | import argparse 22 | import traceback 23 | import os 24 | 25 | def load_config(path='../conf/semanticizer.yml'): 26 | 27 | #add command line args 28 | parser = argparse.ArgumentParser(description=""" 29 | Run sematicizer.""") 30 | 31 | parser.add_argument("-p", "--port", help="Port number ") 32 | parser.add_argument("-v", "--verbose", help="Verbose ") 33 | parser.add_argument("-s", "--host", help="Host ip address ") 34 | parser.add_argument("-c", "--config", help="Config file ") 35 | 36 | args = parser.parse_args() 37 | 38 | if args.config != None: 39 | path = args.config 40 | 41 | if not path.startswith("/"): 42 | path = os.path.join(os.path.dirname(__file__), path) 43 | 44 | configYaml = yaml.load(file(path)) 45 | 46 | if args.port != None: 47 | configYaml["server"]["port"] = int(args.port) 48 | 49 | if args.verbose != None: 50 | configYaml["logging"]["verbose"] = str2bool(args.verbose) 51 | 52 | if args.host != None: 53 | configYaml["server"]["host"] = args.host 54 | 55 | return configYaml 56 | 57 | def str2bool(v): 58 | return v.lower() in ("yes", "true", "t", "1") 59 | 60 | def config_get(keys=(), default=None, config=None): 61 | """ 62 | Allows user to access configuration variables and arguments. The function 63 | takes the variable name as its input, and returns the value or None is it 64 | isn't set. 65 | 66 | @param keys: The name of the configuration parameter to fetch. (Optional) 67 | @param default: The default value to return if the key is not found. 68 | @param config: dictionary to represent config. If None, load_config is 69 | called. 70 | @return: The value for the given parameter if name was set and valid, \ 71 | the default value if invalid or None if no default value was set. 72 | """ 73 | if config is None: 74 | config = load_config() 75 | 76 | if isinstance(keys, basestring): 77 | keys = [keys] 78 | 79 | pointer = config 80 | for key in keys: 81 | if not key in pointer: 82 | if default is not None: 83 | return default 84 | else: 85 | raise KeyError('Could not find %s in configuration' % key) 86 | pointer = pointer[key] 87 | 88 | index = 0 89 | return pointer 90 | -------------------------------------------------------------------------------- /semanticizer/dbinsert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/dbinsert/__init__.py -------------------------------------------------------------------------------- /semanticizer/dbinsert/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
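"""Load a Wikipedia Miner summary into the configured database backend
(Redis or MongoDB, depending on the per-language 'source' in the config).

Illustrative invocation, mirroring the usage note at the bottom of this
module (the language code and log path are examples, not defaults):

    python -m semanticizer.dbinsert --language=en --output=/tmp/redisinsert.log

Without --language, every language configured under wpm.languages is loaded.
"""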
13 | 14 | import yaml 15 | import sys 16 | import getopt 17 | 18 | from ..wpm.load import WpmLoader 19 | from ..config import config_get 20 | 21 | def load_wpm_data(datasource, langcode, settings, **kwargs): 22 | if datasource == "redis": 23 | from ..wpm.db.redisdb import RedisDB 24 | db = RedisDB(**kwargs) 25 | WpmLoader(db, langcode, settings, **kwargs) 26 | elif datasource == "mongo": 27 | from ..wpm.db.mongodb import MongoDB 28 | db = MongoDB(**kwargs) 29 | WpmLoader(db, langcode, settings, **kwargs) 30 | else: 31 | raise ValueError("No %s backend for language %s" % (datasource, langcode)) 32 | 33 | 34 | 35 | ## 36 | ## usage 37 | ## python -m semanticizer.dbinsert --language= --output=/tmp/redisinsert.log 38 | if __name__ == '__main__': 39 | configYaml = yaml.load(file('conf/semanticizer.yml')) 40 | wpm_languages = config_get(('wpm', 'languages'), None, configYaml) 41 | settings = config_get("settings", {}, configYaml) 42 | try: 43 | opts, args = getopt.getopt(sys.argv[1:], 'l:o:', ['language=', 'output=']) 44 | except getopt.GetoptError: 45 | usage() 46 | sys.exit(2) 47 | 48 | showprogress = True 49 | output = None 50 | language = None 51 | 52 | for opt, arg in opts: 53 | if opt in ('-l', '--language'): 54 | language = arg 55 | elif opt in ('-o', '--output'): 56 | output = arg 57 | 58 | if output: 59 | f = open(output, "w+") 60 | sys.stdout = f 61 | showprogress = False 62 | 63 | #if language code is specified only import that language 64 | if language and wpm_languages[language]: 65 | load_wpm_data(wpm_languages[language]['source'], language, settings, progress=showprogress, **wpm_languages[language]['initparams']) 66 | #else important all languages in the config file 67 | else: 68 | for langcode, langconfig in wpm_languages.iteritems(): 69 | load_wpm_data(langconfig['source'], langcode, settings, progress=showprogress, **langconfig['initparams']) 70 | 71 | -------------------------------------------------------------------------------- /semanticizer/processors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/processors/__init__.py -------------------------------------------------------------------------------- /semanticizer/processors/context.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import networkx 15 | from networkx.algorithms.centrality import degree_centrality 16 | 17 | from multiprocessing import Pool 18 | 19 | def pagerank_worker(graph, page_ranked): 20 | print "Pagerank on graph with %d nodes and %d edges." 
\ 21 | % (len(graph.nodes()), \ 22 | len(graph.edges())) 23 | for node in graph.nodes(): 24 | page_ranked.setdefault(node, 1) 25 | 26 | from networkx.algorithms.link_analysis import pagerank 27 | from time import time 28 | 29 | try: 30 | start = time() 31 | page_ranked = pagerank(graph, max_iter=1000, nstart=page_ranked) # 0.2-1.5s for #node = 2500 32 | print "Pagerank took: %f seconds" % (time()-start) 33 | except ZeroDivisionError: 34 | print "ZeroDivisionError in pagerank" 35 | 36 | page_ranked_sorted = sorted(page_ranked.items(), key=lambda x: x[1], reverse=True) 37 | print page_ranked_sorted[:4] 38 | 39 | pool = Pool() 40 | 41 | class contextGraph: 42 | def __init__(self, label, threshold_function, threshold, min_t): 43 | self.graph = networkx.Graph() 44 | self.page_ranked = {} 45 | self.chunk = -1 46 | self.feature_label = "CONTEXT_" + label.upper() 47 | 48 | self.threshold_function = threshold_function 49 | self.threshold = threshold 50 | self.min_t = min_t 51 | 52 | def to_dict_of_dicts(self): 53 | return networkx.convert.to_dict_of_dicts(self.graph) 54 | 55 | def add_chunk(self): 56 | self.chunk += 1 57 | self.page_ranked.setdefault("[Chunk%d]" % self.chunk, 0) 58 | if self.chunk > 0: 59 | self.graph.add_edge("[Chunk%d]" % self.chunk, \ 60 | "[Chunk%d]" % (self.chunk-1), t=self.chunk) 61 | 62 | def add_link(self, link): 63 | assert link.has_key("title") 64 | assert link.has_key(self.threshold_function) 65 | assert link.has_key("label") 66 | 67 | if link[self.threshold_function] < self.threshold: return 68 | 69 | label_text = "[%d-%s]" % (self.chunk, link["label"]) 70 | self.page_ranked.setdefault(link["title"], 1) 71 | self.page_ranked.setdefault(label_text, 0) 72 | self.graph.add_edge(label_text, link["title"], t=self.chunk) # weight=senseProbability 73 | self.graph.add_edge(label_text, "[Chunk%d]" % self.chunk, t=self.chunk) 74 | 75 | def prepare_features(self): 76 | self.clean_graph(self.chunk-self.min_t) 77 | 78 | self.pagerank_result = pool.apply_async(pagerank_worker, (self.graph, self.page_ranked,)) 79 | 80 | # def degree_centrality_worker(): 81 | # self.degree_centralities = degree_centrality(self.graph) 82 | # 83 | # self.degree_centrality_thread = Thread(target=degree_centrality_worker) 84 | # self.degree_centrality_thread.start() 85 | 86 | self.degree_centrality_result = pool.apply_async(degree_centrality, (self.graph,)) 87 | 88 | def compute_features(self, title): 89 | # self.degree_centrality_thread.join() 90 | # self.pagerank_thread.join() 91 | self.degree_centralities = self.degree_centrality_result.get() 92 | self.pagerank_result.wait() 93 | 94 | features = {} 95 | features[self.feature_label + "_DEGREE"] = 0 96 | features[self.feature_label + "_PAGERANK"] = 0 97 | features[self.feature_label + "_PAGERANK_NORMALIZED"] = 0 98 | features[self.feature_label + "_DEGREE_CENTRALITY"] = 0 99 | if title in self.page_ranked: 100 | features[self.feature_label + "_PAGERANK"] = self.page_ranked[title] 101 | features[self.feature_label + "_PAGERANK_NORMALIZED"] = \ 102 | len(self.graph.nodes()) * self.page_ranked[title] 103 | if title in self.degree_centralities: 104 | features[self.feature_label + "_DEGREE"] = \ 105 | self.graph.degree(title) 106 | features[self.feature_label + "_DEGREE_CENTRALITY"] = \ 107 | self.degree_centralities[title] 108 | return features 109 | 110 | def clean_graph(self, min_t): 111 | # Remove edges with a t lower than min_t 112 | for edge in self.graph.edges(): 113 | if self.graph[edge[0]][edge[1]]["t"] < min_t: 114 | self.graph.remove_edge(edge[0], 
edge[1]) 115 | # Remove nodes that have become disconnected 116 | for node in self.graph.nodes(): 117 | if self.graph.degree(node) == 0: 118 | self.graph.remove_node(node) 119 | del self.page_ranked[node] 120 | 121 | def pagerank(self): 122 | # from networkx.algorithms.link_analysis import pagerank_scipy 123 | # from networkx.algorithms.link_analysis import pagerank_numpy 124 | from networkx.algorithms.link_analysis import pagerank 125 | from time import time 126 | try: 127 | start = time() 128 | # pagerank(graph, max_iter=1000) # 1.7s for #nodes = 2500 129 | pagerank(self.graph, max_iter=1000, nstart=self.page_ranked) # 0.2-1.5s for #node = 2500 130 | # pagerank_scipy(graph) # 1.0s for #nodes = 2500 131 | # pagerank_numpy(graph) # > 30s if #nodes > 1000 132 | print "Pagerank took: %f seconds" % (time()-start) 133 | except ZeroDivisionError: 134 | print "ZeroDivisionError in pagerank" 135 | 136 | page_ranked_sorted = sorted(self.page_ranked.items(), key=lambda x: x[1], reverse=True) 137 | print page_ranked_sorted[:4] 138 | 139 | # from networkx.algorithms.centrality import * 140 | 141 | # start = time() 142 | # degree_centrality = degree_centrality(graph) # 0.003s for 1500 nodes 143 | # print "Degree centrality took: %f seconds" % (time()-start) 144 | # 145 | # start = time() 146 | # closeness_centrality = closeness_centrality(graph) # 4s for 1500 nodes 147 | # print "Closeness centrality took: %f seconds" % (time()-start) 148 | # 149 | # start = time() 150 | # betweenness_centrality = betweenness_centrality(graph) # 18s for 1500 nodes 151 | # print "Betweenness centrality took: %f seconds" % (time()-start) 152 | 153 | return self.page_ranked 154 | -------------------------------------------------------------------------------- /semanticizer/processors/core.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | class LinksProcessor: 15 | '''A LinksProcessor takes a set of links, a text and a language code to 16 | produce or process links. Processing is done in two steps, a preprocessing 17 | step and a processing step. 
''' 18 | 19 | def preprocess(self, links, text, settings): 20 | return (links, text, settings) 21 | 22 | def process(self, links, text, settings): 23 | return (links, text, settings) 24 | 25 | def postprocess(self, links, text, settings): 26 | return (links, text, settings) 27 | 28 | def inspect(self): 29 | return {} 30 | 31 | class SettingsProcessor(LinksProcessor): 32 | def __init__(self, settings): 33 | self.settings = settings 34 | 35 | def preprocess(self, links, text, settings): 36 | if "settings" in settings and settings["settings"] in self.settings: 37 | for k, v in self.settings[settings["settings"]].iteritems(): 38 | if k not in settings: 39 | settings[k] = v 40 | del settings["settings"] 41 | return (links, text, settings) 42 | def inspect(self): 43 | return {self.__class__.__name__: self.settings} 44 | 45 | class FilterProcessor(LinksProcessor): 46 | def __init__(self): 47 | self.context_links = {} 48 | 49 | def preprocess(self, links, text, settings): 50 | if settings.has_key("prefilter"): 51 | links = self.filter_links(settings["prefilter"].split(","), links, settings) 52 | 53 | return (links, text, settings) 54 | 55 | def postprocess(self, links, text, settings): 56 | if "filter" in settings: 57 | links = self.filter_links(settings["filter"].split(","), 58 | links, settings) 59 | 60 | return (links, text, settings) 61 | 62 | def filter_links(self, filters, links, settings): 63 | filters_gte = [fltr.split(">=") for fltr in filters if ">=" in fltr] 64 | filters_gt = [fltr.split(">") for fltr in filters \ 65 | if ">" in fltr and not ">=" in fltr] 66 | 67 | filter_unique = ("unique" in filters) and "context" in settings 68 | 69 | if len(filters_gte) == 0 and len(filters_gt) == 0 \ 70 | and not filter_unique: 71 | return links 72 | 73 | filtered_links = [] 74 | # Q: why do we not apply the gt filter if a gte filter fails? 75 | for link in links: 76 | skip = False 77 | for fltr in filters_gte: 78 | if not link[fltr[0]] >= float(fltr[1]): 79 | skip = True 80 | break 81 | else: 82 | for fltr in filters_gt: 83 | if not link[fltr[0]] > float(fltr[1]): 84 | skip = True 85 | break 86 | 87 | if filter_unique: 88 | self.context_links.setdefault(settings["context"], {}) 89 | if link["title"] in self.context_links[settings["context"]]: 90 | skip = True 91 | 92 | if not skip: 93 | filtered_links.append(link) 94 | 95 | if filter_unique: 96 | self.context_links[settings["context"]][link["title"]] = link 97 | 98 | print "Filtered %d links to %d" % (len(links), len(filtered_links)) 99 | 100 | return filtered_links 101 | 102 | def inspect(self): 103 | return {self.__class__.__name__: self.context_links} -------------------------------------------------------------------------------- /semanticizer/processors/external.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. 
If not, see . 13 | 14 | from Queue import Queue, Empty 15 | from threading import Thread 16 | 17 | import urllib2 18 | 19 | import datetime 20 | import shelve 21 | import os 22 | from copy import deepcopy 23 | 24 | from .core import LinksProcessor 25 | from ..wpm.data import wpm_dumps 26 | from ..wpm.utils import get_relatedness 27 | 28 | 29 | class ArticlesProcessor(LinksProcessor): 30 | def __init__(self, langcodes, pickledir): 31 | self.langcodes = langcodes 32 | self.article_template = { 33 | "article_id": -1, 34 | "article_title": "", 35 | "Definition": "", 36 | "InLinks": [], 37 | "OutLinks": [], 38 | "Labels": [], 39 | "Images": [], 40 | "ParentCategories": [] 41 | } 42 | 43 | def preprocess(self, links, text, settings): 44 | if not "article" in settings and not "features" in settings and not \ 45 | "learning" in settings and "multi" not in settings: 46 | return (links, text, settings) 47 | if not settings["langcode"] in self.langcodes: 48 | return (links, text, settings) 49 | 50 | return (links, text, settings) 51 | 52 | def process(self, links, text, settings): 53 | if not "article" in settings and not "features" in settings and not \ 54 | "learning" in settings: 55 | return (links, text, settings) 56 | if not settings["langcode"] in self.langcodes: 57 | return (links, text, settings) 58 | 59 | wpm = wpm_dumps[settings["langcode"]] 60 | 61 | if "article" in settings: 62 | parts = settings["article"].lower().split(',') 63 | else: 64 | parts = [key.lower() for key in self.article_template.keys()] 65 | 66 | titles = [link["title"] for link in links] 67 | ids = [link["id"] for link in links] 68 | articles = wpm.get_articles(*ids) 69 | 70 | for link, id, title, article in zip(links, ids, titles, articles): 71 | 72 | link.update(deepcopy(self.article_template)) 73 | 74 | link["article_title"] = title 75 | link["article_id"] = id 76 | 77 | inlinks = article["InLinks"] 78 | if inlinks and (not parts or 'inlinks' in parts): 79 | if not parts or 'relatedness' in parts: 80 | for inlink in inlinks: 81 | title = wpm.get_item_title(inlink) 82 | relatedness = get_relatedness(inlinks, wpm.get_item_inlinks(inlink) ) 83 | link["InLinks"].append( {"title":title, "id":int(inlink), "relatedness":relatedness} ) 84 | else: 85 | link["InLinks"] = [{ "id":int(inlink) } for inlink in inlinks] 86 | 87 | outlinks = article["OutLinks"] 88 | if outlinks and (not parts or 'outlinks' in parts): 89 | if not parts or 'relatedness' in parts: 90 | for outlink in outlinks: 91 | title = wpm.get_item_title(outlink) 92 | relatedness = get_relatedness(outlinks, wpm.get_item_outlinks(outlink) ) 93 | link["OutLinks"].append( {"title":title, "id":int(outlink), "relatedness":relatedness} ) 94 | else: 95 | link["OutLinks"] = [{ "id":int(outlink) } for outlink in outlinks] 96 | 97 | if not parts or 'categories' in parts: 98 | categories = wpm.get_item_categories( link["article_id"] ) 99 | if categories: 100 | for category in categories: 101 | title = wpm.get_item_title(category) 102 | link["ParentCategories"].append( {"title":title, "id":int(category)} ) 103 | 104 | if not parts or 'definition' in parts: 105 | definition = wpm.get_item_definition(link["article_id"]) 106 | if definition: 107 | link["Definition"] = definition 108 | 109 | if article["Labels"] and "labels" in parts: 110 | link["Labels"] = article["Labels"] 111 | 112 | return (links, text, settings) 113 | 114 | def postprocess(self, links, text, settings): 115 | if "article" in settings and len(settings["article"]) == 0: 116 | return (links, text, settings) 117 | 
remove = [key.lower() for key in self.article_template.keys()] 118 | remove.extend(["fromtitle", "fromredirect"]) 119 | if "article" in settings: 120 | for label in settings["article"].replace(";", ",").split(","): 121 | if label.lower() in remove: 122 | remove.remove(label) 123 | for link in links: 124 | for label in link.keys(): 125 | if label.lower() in remove: 126 | del link[label] 127 | 128 | return (links, text, settings) 129 | 130 | 131 | class StatisticsProcessor(LinksProcessor): 132 | def __init__(self, langcodes, num_of_threads, pickledir): 133 | self.num_of_threads = num_of_threads 134 | self.WIKIPEDIA_STATS_URL = {} 135 | self.wikipedia_statistics_cache = {} 136 | for langcode in langcodes: 137 | self.WIKIPEDIA_STATS_URL[langcode] = \ 138 | "http://stats.grok.se/json/" \ 139 | + langcode \ 140 | + "/%d%02d/%s" # 201001/De%20Jakhalzen 141 | 142 | pickle_root = os.path.join(pickledir, langcode) 143 | if not os.path.isdir(pickle_root): 144 | os.makedirs(pickle_root) 145 | self.wikipedia_statistics_cache[langcode] = \ 146 | shelve.open(os.path.join(pickle_root, \ 147 | 'wikipedia_statistics_cache.db')) 148 | print "Loaded %d sets of statistics for %s from cache." \ 149 | % (len(self.wikipedia_statistics_cache[langcode]), langcode) 150 | 151 | def inspect(self): 152 | return {self.__class__.__name__: self.WIKIPEDIA_STATS_URL} 153 | 154 | def preprocess(self, links, text, settings): 155 | if "wikistats" not in settings: 156 | return (links, text, settings) 157 | 158 | now = self.get_timestamp(settings) 159 | 160 | def worker(): 161 | while True: 162 | try: 163 | (year, month, article) = queue.get_nowait() 164 | self.wikipedia_page_views(year, month, 165 | article, settings["langcode"]) 166 | queue.task_done() 167 | except Empty: 168 | break 169 | 170 | queue = Queue() 171 | for _ in set([link["title"] for link in links]): 172 | day = now 173 | for _ in range(14): 174 | queue.put((day.year, day.month, article)) 175 | day += timedelta(days=28) 176 | 177 | for _ in range(self.num_of_threads): 178 | t = Thread(target=worker) 179 | t.daemon = True 180 | t.start() 181 | 182 | def process(self, links, text, settings): 183 | if "wikistats" not in settings: 184 | return (links, text, settings) 185 | 186 | now = self.get_timestamp(settings) 187 | 188 | self.queue.join() 189 | 190 | for link in links: 191 | features = {"WIKISTATSDAY": 0, 192 | "WIKISTATSWK": 0, 193 | "WIKISTATS4WK": 0, 194 | "WIKISTATSYEAR": 0, 195 | "WIKISTATSDAYOFWK": 0, 196 | "WIKISTATSWKOF4WK": 0, 197 | "WIKISTATS4WKOFYEAR": 0 198 | } 199 | 200 | self.feature_WIKISTATSDAY(datetime, link["title"], features, now) 201 | self.feature_WIKISTATSWK(datetime, link["title"], features, now) 202 | self.feature_WIKISTATS4WK(datetime, link["title"], features, now) 203 | self.feature_WIKISTATSYEAR(datetime, link["title"], features, now) 204 | self.feature_WIKISTATSTRENDS(features) 205 | 206 | del features["WIKISTATSDAY"] 207 | 208 | link["features"].update(features) 209 | 210 | for langcode, cache in self.wikipedia_statistics_cache.iteritems(): 211 | print "Saving %d sets of statistics for %s from cache." 
\ 212 | % (len(cache), langcode) 213 | cache.sync() 214 | 215 | return (links, text, settings) 216 | 217 | def get_timestamp(self, settings): 218 | # Should be more robust against unexpected values 219 | if len(settings["wikistats"]) > 0: 220 | return datetime.datetime.fromtimestamp(int(settings["wikistats"])) 221 | else: 222 | return datetime.datetime.now() 223 | 224 | def wikipedia_page_views(self, year, month, article, langcode): 225 | url = self.WIKIPEDIA_STATS_URL[langcode] % (year, month, article) 226 | url = url.encode('utf-8') 227 | if url in self.wikipedia_statistics_cache[langcode]: 228 | resultJson = self.wikipedia_statistics_cache[langcode][url] 229 | else: 230 | try: 231 | request = urllib2.urlopen(url, timeout=1) 232 | resultJson = request.read() 233 | except urllib2.URLError: 234 | try: 235 | request = urllib2.urlopen(url) 236 | resultJson = request.read() 237 | except urllib2.URLError: 238 | request = urllib2.urlopen(url) 239 | resultJson = request.read() 240 | 241 | self.wikipedia_statistics_cache[langcode][url] = resultJson 242 | 243 | from json import loads 244 | result = loads(resultJson) 245 | 246 | return result 247 | 248 | def feature_WIKISTATSDAY(self, datetime, article, features, now): 249 | day = now 250 | day += timedelta(days=-1) 251 | monthly_views = self.wikipedia_page_views(day.year, 252 | day.month, article) 253 | views = monthly_views["daily_views"][self.date_format % \ 254 | (day.year, day.month, day.day)] 255 | features["WIKISTATSDAY"] = views 256 | 257 | def feature_WIKISTATSWK(self, datetime, article, features, now): 258 | day = now 259 | for _ in range(7): 260 | day += timedelta(days=-1) 261 | monthly_views = self.wikipedia_page_views(day.year, 262 | day.month, article) 263 | views = \ 264 | monthly_views["daily_views"][self.date_format % \ 265 | (day.year, day.month, day.day)] 266 | features["WIKISTATSWK"] += views 267 | 268 | def feature_WIKISTATS4WK(self, datetime, article, features, now): 269 | day = now 270 | for _ in range(28): 271 | day += timedelta(days=-1) 272 | monthly_views = self.wikipedia_page_views(day.year, 273 | day.month, article) 274 | views = monthly_views["daily_views"][self.date_format % \ 275 | (day.year, day.month, day.day)] 276 | features["WIKISTATS4WK"] += views 277 | 278 | def feature_WIKISTATSYEAR(self, datetime, article, features, now): 279 | day = now 280 | for _ in range(365): 281 | day += timedelta(days=-1) 282 | monthly_views = self.wikipedia_page_views(day.year, 283 | day.month, article) 284 | views = monthly_views["daily_views"][self.date_format % \ 285 | (day.year, day.month, day.day)] 286 | features["WIKISTATSYEAR"] += views 287 | 288 | def feature_WIKISTATSTRENDS(self, features): 289 | if features["WIKISTATSWK"] > 0: 290 | features["WIKISTATSDAYOFWK"] = \ 291 | float(features["WIKISTATSDAY"]) / features["WIKISTATSWK"] 292 | if features["WIKISTATS4WK"] > 0: 293 | features["WIKISTATSWKOF4WK"] = \ 294 | float(features["WIKISTATSWK"]) / features["WIKISTATS4WK"] 295 | if features["WIKISTATSYEAR"] > 0: 296 | features["WIKISTATS4WKOFYEAR"] = \ 297 | float(features["WIKISTATS4WK"]) / features["WIKISTATSYEAR"] 298 | -------------------------------------------------------------------------------- /semanticizer/processors/feature.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. 
This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import collections 15 | from math import log 16 | import cPickle as pickle 17 | import os 18 | import re 19 | 20 | from leven import levenshtein 21 | 22 | from . import stringUtils 23 | from ..wpm.data import wpm_dumps 24 | 25 | class anchorFeatures: 26 | def __init__(self, langcode): 27 | self.wpm = wpm_dumps[langcode] 28 | self.wikipediaArticleCount = int(self.wpm.get_stat("articleCount")) #970139 29 | self.wikipediaCategoryCount = int(self.wpm.get_stat("categoryCount")) #63108 30 | 31 | def feature_LEN(self, lnk): 32 | return len(re.findall(stringUtils.reTokenPattern, lnk["label"])) 33 | 34 | def feature_IDF_title(self, lnk): 35 | score = self.wpm.get_title_ngram_score(lnk["label"]) 36 | if not score == None: 37 | in_title_count = int(score) 38 | else: 39 | in_title_count = 0 40 | return log(float(self.wikipediaArticleCount) / \ 41 | (float(in_title_count) + 0.00001)) 42 | 43 | def feature_IDF_anchor(self, lnk): 44 | return log(float(self.wikipediaArticleCount) / \ 45 | (float(lnk["linkDocCount"]) + 0.00001)) 46 | 47 | def feature_IDF_content(self, lnk): 48 | return log(float(self.wikipediaArticleCount) / \ 49 | (float(lnk["docCount"]) + 0.00001)) 50 | 51 | def feature_KEYPHRASENESS(self, lnk): 52 | return float(lnk["linkDocCount"]) / (float(lnk["docCount"]) + 0.00001) 53 | 54 | def feature_LINKPROB(self, lnk): 55 | return float(lnk["linkOccCount"]) / (float(lnk["occCount"]) + 0.00001) 56 | 57 | def feature_SNIL(self, lnk): 58 | SNIL = 0 59 | 60 | words = lnk["label"].split() 61 | for n in range(1, len(words) + 1): 62 | for i in range(0, len(words) - n): 63 | ngram = " ".join(words[i:i + n]) 64 | if not self.wpm.get_item_id(ngram) == None: 65 | SNIL += 1 66 | return SNIL 67 | 68 | def feature_SNCL(self, lnk): 69 | SNCL = 0 70 | 71 | words = lnk["label"].split() 72 | for n in range(1, len(words) + 1): 73 | for i in range(0, len(words) - n): 74 | ngram = " ".join(words[i:i + n]) 75 | score = self.wpm.get_title_ngram_score(ngram) 76 | if not score == None: 77 | SNCL += int(score) 78 | return SNCL 79 | 80 | def feature_NORMALIZATION(self, lnk): 81 | edit = levenshtein(unicode(lnk["label"]), unicode(lnk["text"])) 82 | return float(edit) / len(lnk["text"]) 83 | 84 | def compute_anchor_features(self, lnk): 85 | return {'LEN': self.feature_LEN(lnk), 86 | 'IDF_title': self.feature_IDF_title(lnk), 87 | 'IDF_anchor': self.feature_IDF_anchor(lnk), 88 | 'IDF_content': self.feature_IDF_content(lnk), 89 | 'KEYPHRASENESS': self.feature_KEYPHRASENESS(lnk), 90 | 'LINKPROB': self.feature_LINKPROB(lnk), 91 | 'SNIL': self.feature_SNIL(lnk), 92 | 'SNCL': self.feature_SNCL(lnk), 93 | 'NORMALIZATION': self.feature_NORMALIZATION(lnk) 94 | } 95 | 96 | 97 | class articleFeatures: 98 | def __init__(self): 99 | self.re_non_word_chars = re.compile(r'(?u)\W+', re.UNICODE) 100 | 101 | def feature_INLINKS(self, lnk): 102 | if "InLinks" not in lnk: 103 | 
return 0 104 | return len(lnk["InLinks"]) 105 | 106 | def feature_OUTLINKS(self, lnk): 107 | if "OutLinks" not in lnk: 108 | return 0 109 | return len(lnk["OutLinks"]) 110 | 111 | def feature_REDIRECT(self, lnk): 112 | # Should be fromRedirect but bug in Wikipedia Miner 113 | if "fromTitle" in lnk and lnk["fromTitle"]: 114 | return 1 115 | return 0 116 | 117 | def feature_TF(self, lnk, re_label_text, features): 118 | aMatches = re.findall(re_label_text, lnk['title']) 119 | features["TF_title"] = float(len(aMatches)) 120 | 121 | text = " " 122 | if "Definition" in lnk: 123 | if lnk["Definition"] and len(lnk["Definition"]): 124 | text = re.sub(r"<.*?>", "", lnk["Definition"]) 125 | text = re.sub(r"^[|\- }]*", "", text) 126 | 127 | while len(text) and (text[0] == "."): 128 | text = text[1:].strip() 129 | 130 | # Very rarely articles do not have a Definition text (or a dummy one 131 | # like "----") 132 | if len(text) == 0: 133 | features["TF_sentence"] = 0 134 | features["TF_paragraph"] = 0 135 | features["POS_first_in_paragraph"] = 1 136 | else: 137 | # Sentence is first sentence 138 | sentence = text.split('.')[0] 139 | 140 | aMatches = re.findall(re_label_text, sentence) 141 | features["TF_sentence"] = float(len(aMatches)) 142 | 143 | aMatches = re.findall(re_label_text, text) 144 | features["TF_paragraph"] = float(len(aMatches)) 145 | 146 | if len(aMatches): 147 | features["POS_first_in_paragraph"] = \ 148 | float(re.search(re_label_text, text).start()) 149 | else: 150 | features["POS_first_in_paragraph"] = 1 151 | 152 | def feature_TITLE(self, lnk, re_label_text, features): 153 | label_text = unicode(lnk["label"]) 154 | 155 | re_title = stringUtils.ngramToPattern(lnk['title']) 156 | article_title = unicode(lnk['title']) 157 | 158 | features["NCT"] = 0 if re.search(re_title, label_text) is None \ 159 | else 1 160 | 161 | features["TCN"] = 0 \ 162 | if re.search(re_label_text, article_title) is None else 1 163 | 164 | features["TEN"] = 1 if article_title == label_text else 0 165 | 166 | # Irritatingly enough, split() can give you empty values as last 167 | # element 168 | split_label = self.re_non_word_chars.split(label_text) 169 | if split_label[-1] == '': 170 | split_label.pop() 171 | split_title = self.re_non_word_chars.split(article_title) 172 | if split_title[-1] == '': 173 | split_title.pop() 174 | 175 | # I: True if the title of the candidate begins with the the query 176 | # (e.g. 
"Cambridge, Massachusetts" and "Cambridge" ) 177 | features["SUBSTRING_MATCH_1"] = 1 \ 178 | if split_title[0] == split_label[0] else 0 179 | 180 | # II: True if the title of the candidate ends with the the query 181 | # (e.g: "Venice-Simplon Orient Express" and "Orient Express") 182 | features["SUBSTRING_MATCH_2"] = 1 \ 183 | if split_title[-1] == split_label[-1] else 0 184 | 185 | # collections.Counter() converts an array to a dict of words 186 | # and their frequencies 187 | cSplitLabel = collections.Counter(split_label) 188 | cSplitTitle = collections.Counter(split_title) 189 | 190 | # Number of shared words between the title of the candidate and 191 | # the query 192 | features['WORD_MATCH'] = len(list(cSplitLabel & cSplitTitle)) 193 | 194 | # Number of different words between the title of the candidate 195 | # and the query 196 | features['WORD_MISS'] = len(split_label) + len(split_title) \ 197 | - (2 * features['WORD_MATCH']) 198 | 199 | # Levenshtein distance between query and title of the candidate 200 | features["EDIT_DISTANCE"] = levenshtein(label_text, article_title) 201 | 202 | def feature_COMMONNESS(self, lnk, features): 203 | features["COMMONNESS"] = lnk["priorProbability"] 204 | 205 | def compute_article_features(self, lnk): 206 | features = { 207 | 'INLINKS': self.feature_INLINKS(lnk), 208 | 'OUTLINKS': self.feature_OUTLINKS(lnk), 209 | 'REDIRECT': self.feature_REDIRECT(lnk) 210 | } 211 | 212 | re_label_text = stringUtils.ngramToPattern(lnk["label"]) 213 | 214 | self.feature_TF(lnk, re_label_text, features) 215 | self.feature_TITLE(lnk, re_label_text, features) 216 | self.feature_COMMONNESS(lnk, features) 217 | 218 | return features 219 | 220 | ### TK: Ik heb nog wat extra features gemaakt die kijken hoe vaak 221 | ### inlink anchors en inlink/outlink titels voorkomen in de 222 | ### referentietekst en de zogenaamde aposition in de titel 223 | ### ('actress' in 'Sue Johnson (actress)') 224 | ### 225 | ### 'NR_OF_MATCHING_INLINK_ANCHORS', 'NR_OF_MATCHING_INLINK_TITLES', 226 | ### 'NR_OF_MATCHING_OUTLINK_TITLES', 'APOSITION' 227 | ### 228 | ### Dat is er nu niet zo makkelijk in te bouwen omdat we hier geen 229 | ### toegang hebben tot de referentietekst. Maar die features van David 230 | ### gaan dat ook zeker nodig hebben! 231 | ### 232 | ### Maar goed, ik heb ze nu nog even weg gelaten... 233 | 234 | if __name__ == "__main__": 235 | # Some settings 236 | langcode = "en" 237 | wikipediaminer_root = '/zfs/ilps-plexer/wikipediaminer/enwiki-20111007/' 238 | pickledir = "/Users/evertlammerts/semanticizer/pickles/" 239 | 240 | # Test data 241 | link = {"label": "Alabama", 242 | "linkDocCount": 10, # Al deze waardes slaan nergens op natuurlijk, 243 | "docCount": 20, # maar ok... 
244 | "linkOccCount": 100, 245 | "occCount": 200, 246 | "commonness": 0.12345 247 | } 248 | 249 | # Article 250 | article_url = '' # Wordt niet gebruikt nu 251 | fh_article_xml = open("unitTest.article.xml", "r") 252 | article_xml = fh_article_xml.read() 253 | fh_article_xml.close() 254 | article = ElementTree.fromstring(article_xml).find("Response") 255 | 256 | # Initialize the objects 257 | print "Initializing anchor features" 258 | anchor_features = anchorFeatures(langcode) 259 | print "Initializing concept features" 260 | concept_features = conceptFeatures(langcode, wikipediaminer_root, 261 | article_url) 262 | print "Initializing anchor/concept features" 263 | anchor_concept_features = anchorConceptFeatures() 264 | print "Initializing statistics features" 265 | statistics_features = statisticsFeatures(langcode) 266 | 267 | print "Start calculating" 268 | test_features = { 269 | "anchor": anchor_features.compute_anchor_features(link), 270 | "concept": concept_features.compute_concept_features(article), 271 | "anchor_concept": \ 272 | anchor_concept_features.compute_anchor_concept_features(link, article), 273 | "statistics": statistics_features.compute_statistics_features(article), 274 | } 275 | 276 | print "%s" % test_features 277 | -------------------------------------------------------------------------------- /semanticizer/processors/features.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | from collections import defaultdict 15 | 16 | from . import feature as features 17 | from . 
import context 18 | 19 | from .core import LinksProcessor 20 | 21 | class FeaturesProcessor(LinksProcessor): 22 | def __init__(self, langcodes): 23 | self.features = {} 24 | for langcode in langcodes: 25 | self.features[langcode] = features.anchorFeatures(langcode) 26 | 27 | def process(self, links, text, settings): 28 | if not "features" in settings and not "learning" in settings: 29 | return (links, text, settings) 30 | if not settings["langcode"] in self.features: 31 | return (links, text, settings) 32 | 33 | featuresets = self.features[settings["langcode"]] 34 | 35 | for link in links: 36 | link.setdefault("features", {}) 37 | link["features"].update(featuresets.compute_anchor_features(link)) 38 | 39 | return (links, text, settings) 40 | 41 | def inspect(self): 42 | return {self.__class__.__name__: self.features.keys()} 43 | 44 | 45 | class ArticleFeaturesProcessor(LinksProcessor): 46 | def __init__(self): 47 | self.features = features.articleFeatures() 48 | 49 | def process(self, links, text, settings): 50 | if not "features" in settings and not "learning" in settings: 51 | return (links, text, settings) 52 | # Check if ArticleProcessor has run 53 | 54 | for link in links: 55 | link.setdefault("features", {}) 56 | link["features"].update( 57 | self.features.compute_article_features(link) 58 | ) 59 | 60 | return (links, text, settings) 61 | 62 | def inspect(self): 63 | return {self.__class__.__name__: str(self.features)} 64 | 65 | 66 | class ContextFeaturesProcessor(LinksProcessor): 67 | def __init__(self): 68 | self.context_features = {} 69 | self.context_text = defaultdict(list) 70 | self.context_id_pattern = "%s:%d" 71 | 72 | def new_context(self, context_label): 73 | self.context_features[context_label] = { 74 | "SP0.2-100": context.contextGraph("SP0.2-100", "senseProbability", 75 | 0.2, 100) 76 | } 77 | 78 | def preprocess(self, links, text, settings): 79 | if "context" in settings: 80 | settings["context_id"] = self.context_id_pattern % \ 81 | (settings["context"], len(self.context_text[settings["context"]])) 82 | self.context_text[settings["context"]].append(text) 83 | 84 | return (links, text, settings) 85 | 86 | def process(self, links, text, settings): 87 | if not "context" in settings or "skip_context_features" in settings or \ 88 | (not "features" in settings and not "learning" in settings): 89 | return (links, text, settings) 90 | 91 | # Create context_features if it does not exist 92 | if settings["context"] not in self.context_features: 93 | self.new_context(settings["context"]) 94 | 95 | # For each set of context features 96 | for label in self.context_features[settings["context"]]: 97 | # Create a new chunk 98 | self.context_features[settings["context"]][label].add_chunk() 99 | graph = self.context_features[settings["context"]][label] 100 | # Add each link to graph and prepare features 101 | for link in links: 102 | graph.add_link(link) 103 | graph.prepare_features() 104 | 105 | # Compute context features for each link 106 | for link in links: 107 | link["features"].update(graph.compute_features(link["title"])) 108 | 109 | return (links, text, settings) 110 | 111 | def inspect(self): 112 | context = {} 113 | for context_label, features in self.context_features.iteritems(): 114 | context[context_label] = {"text": self.context_text[context_label]} 115 | for label, context_graph in features.iteritems(): 116 | graph = {"page_ranked": context_graph.page_ranked, 117 | "graph": context_graph.to_dict_of_dicts(), 118 | "chunk": context_graph.chunk} 119 | 
context[context_label][label] = graph 120 | 121 | return {self.__class__.__name__: context} 122 | -------------------------------------------------------------------------------- /semanticizer/processors/image.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | from Queue import Queue, Empty 15 | from threading import Thread 16 | 17 | import urllib2, re 18 | 19 | from .core import LinksProcessor 20 | 21 | class AddImageProcessor(LinksProcessor): 22 | def postprocess(self, links, text, settings): 23 | if "image" in settings and "langcode" in settings: 24 | links = add_image_url(links, settings["langcode"]) 25 | return (links, text, settings) 26 | 27 | image_url_cache = {} 28 | 29 | def add_image_url(links, langcode): 30 | urls = [link["url"].replace(".wikipedia.org/", ".m.wikipedia.org/") \ 31 | for link in links] 32 | 33 | print "Getting images for %d Wikipedia pages" % len(urls) 34 | get_image_urls(urls) 35 | for link, url in zip(links, urls): 36 | if url in image_url_cache: 37 | print link["title"], "->", image_url_cache[url] 38 | link["image_url"] = image_url_cache[url] 39 | 40 | return links 41 | 42 | IMG_DIMENSION_PATTERN = '' 43 | IMG_URL_PATTERN = '' 44 | 45 | BLACKLISTED_IMAGE_URLS = ('http://upload.wikimedia.org/wikipedia/en/f/f4/Ambox_content.png', 46 | 'http://upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png', 47 | 'http://upload.wikimedia.org/wikipedia/en/thumb/f/f2/Edit-clear.svg/40px-Edit-clear.svg.png', 48 | 'http://upload.wikimedia.org/wikipedia/commons/thumb/f/f8/Wiktionary-logo-en.svg/37px-Wiktionary-logo-en.svg.png', 49 | 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/a4/Text_document_with_red_question_mark.svg/40px-Text_document_with_red_question_mark.svg.png') 50 | 51 | def convert_image_url(image): 52 | if image.startswith("//"): 53 | image = "http:" + image 54 | elif image.startswith("/"): 55 | image = "http://" + url.split("/")[2] + image 56 | return image 57 | 58 | def get_image_urls(urls, num_of_threads=8, min_dimension=36): 59 | def worker(): 60 | while True: 61 | try: 62 | url = queue.get_nowait() 63 | try: 64 | page = urllib2.urlopen(url, timeout=1).read() 65 | except: 66 | page = "" 67 | images = re.findall("", page) 68 | 69 | # Filter Wikipedia images 70 | images = [img for img in images if " id=" not in img \ 71 | and " title=" not in img] 72 | image = None 73 | for img in images: 74 | match = re.match(IMG_DIMENSION_PATTERN, img) 75 | if match == None: continue 76 | dimension = max([int(value) for value in match.groups()]) 77 | if dimension >= min_dimension: # Do not use fallback: or image == None: 78 | match = re.match(IMG_URL_PATTERN, img) 79 | if match != None and len(match.groups()) > 0: 80 | image_url = convert_image_url(match.groups()[0]) 81 | if 
image_url in BLACKLISTED_IMAGE_URLS: continue 82 | image = image_url 83 | # if dimension >= min_dimension: 84 | break 85 | 86 | image_url_cache[url] = image 87 | 88 | queue.task_done() 89 | except Empty: 90 | break 91 | 92 | queue = Queue() 93 | for url in urls: 94 | queue.put(url) 95 | 96 | for i in range(min(num_of_threads, len(urls))): 97 | t = Thread(target=worker) 98 | t.daemon = True 99 | t.start() 100 | 101 | queue.join() 102 | -------------------------------------------------------------------------------- /semanticizer/processors/multiple.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import collections 15 | 16 | from .core import LinksProcessor 17 | 18 | 19 | class MultipleEntityFeaturesProcessor(LinksProcessor): 20 | 21 | def process(self, links, text, settings): 22 | self.link_dict = {} 23 | self.labels = [] 24 | 25 | if 'multi' not in settings: 26 | return (links, text, settings) 27 | 28 | # First run through links to fill dict 29 | for link in links: 30 | self.link_dict.setdefault(link['id'], []) \ 31 | .append([link['label'], link['senseProbability'], 32 | link['priorProbability'], link['linkProbability']]) 33 | self.labels.append(link['label']) 34 | link['features'] = {} 35 | 36 | # Second run to calculate features 37 | for link in links: 38 | if 'tier1' in settings['multi']: 39 | features = self.FEATURE_tier_one_overlap(link, self.labels) 40 | link['features'].update(features) 41 | if 'outlinks' in settings['multi']: 42 | features = self.FEATURE_linked_entity_overlap(link['label'], 43 | link['OutLinks'], 44 | 'outlinks') 45 | link['features'].update(features) 46 | if 'inlinks' in settings['multi']: 47 | features = self.FEATURE_linked_entity_overlap(link['label'], 48 | link['InLinks'], 49 | 'inlinks') 50 | link['features'].update(features) 51 | 52 | return (links, text, settings) 53 | 54 | def FEATURE_tier_one_overlap(self, link, labels): 55 | """ 56 | Perform simple 'list intersect' 57 | To find matching labels of candidate 58 | """ 59 | 60 | tier_one = [link['title']] + [label['title'] for label in \ 61 | link['Labels']] 62 | tier_one = [(anchor, link['id']) for anchor in \ 63 | list((collections.Counter(tier_one) & \ 64 | collections.Counter(self.labels)).elements())] 65 | 66 | return_list = [] 67 | for l, i in tier_one: 68 | 69 | if i in self.link_dict: 70 | for label, senseProb, priorProb, cmns in self.link_dict[i]: 71 | if label == anchor: 72 | return_list.append((l, i, senseProb, priorProb, cmns)) 73 | if return_list: 74 | return self.calculate_features(return_list, 1, 'tier_one') 75 | 76 | else: 77 | return {} 78 | 79 | def FEATURE_linked_entity_overlap(self, current_label, linked_entities, 80 | features): 81 | """ 82 | IN: json of {in,out}-link_ids 83 | Check if they occur in doc dict 84 | if they do, see if 
they are referred to 85 | by a different label. 86 | """ 87 | 88 | # Find stuff 89 | result_list = [] 90 | for link in linked_entities: 91 | if str(link['id']) in self.link_dict: 92 | link_label = self.link_dict[str(link['id'])] 93 | for sub_link in link_label: 94 | if current_label != sub_link[0]: 95 | result_list.append((sub_link[0], link['id'], 96 | sub_link[1], sub_link[2], 97 | sub_link[3])) 98 | # Calculate features 99 | if result_list: 100 | return self.calculate_features(result_list, len(linked_entities), 101 | features) 102 | else: 103 | return {} 104 | 105 | def calculate_features(self, results, max_entities, features): 106 | """ 107 | Given result list in format: 108 | label, wiki_id, senseProb, priorProb, commonness 109 | 'Unzip' lists and create feature vectors. 110 | """ 111 | 112 | label_list, id_list, sense_list, prior_list, cmns_list = \ 113 | ([l for l, w, s, p, c in results], 114 | [w for l, w, s, p, c in results], 115 | [s for l, w, s, p, c in results], 116 | [p for l, w, s, p, c in results], 117 | [c for l, w, s, p, c in results]) 118 | 119 | if features == 'outlinks': 120 | PREFIX = 'ME_OUT_' 121 | elif features == 'inlinks': 122 | PREFIX = 'ME_IN_' 123 | elif features == 'tier_one': 124 | PREFIX = 'ME_T1_' 125 | 126 | return {PREFIX + 'label_overlap': len(label_list), 127 | PREFIX + 'label_unique': len(set(label_list)), 128 | PREFIX + 'entity_overlap': len(id_list), 129 | PREFIX + 'entity_unique': len(set(id_list)), 130 | PREFIX + 'entity_proportion': float(len(set(id_list))) / \ 131 | float(max_entities), 132 | PREFIX + 'sense_prob_sum': sum(sense_list), 133 | PREFIX + 'prior_prob_sum': sum(prior_list), 134 | PREFIX + 'cmns_sum': sum(cmns_list)} 135 | -------------------------------------------------------------------------------- /semanticizer/processors/semanticize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
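# [Editor's note] Illustrative sketch only -- not part of the original module.
# The Semanticizer defined below scores every candidate sense of an n-gram with
# three probabilities derived from the WPM count fields (cntlinkdoc, cnttextdoc,
# cntlinkocc); the real code additionally guards against zero counts. A minimal
# standalone version of those formulas, using hypothetical counts, is:

def _link_scores(entity, sense):
    """entity and sense are dicts carrying the WPM count fields used below."""
    # fraction of documents containing the label in which it appears as a link
    link_probability = float(entity['cntlinkdoc']) / entity['cnttextdoc']
    # fraction of documents containing the label that link to this sense
    sense_probability = float(sense['cntlinkdoc']) / entity['cnttextdoc']
    # fraction of the label's link occurrences that point to this sense
    prior_probability = float(sense['cntlinkocc']) / entity['cntlinkocc']
    return link_probability, sense_probability, prior_probability

# _link_scores({'cntlinkdoc': 10, 'cnttextdoc': 40, 'cntlinkocc': 12},
#              {'cntlinkdoc': 8, 'cntlinkocc': 9})   # -> (0.25, 0.2, 0.75)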
13 | 14 | from nltk.util import ngrams as nltk_ngrams 15 | import re 16 | import urllib 17 | 18 | from ..wpm import utils as wpmutil 19 | from ..wpm.data import wpm_dumps 20 | 21 | tokenize = re.compile(r'\w+(?:[.,\']\w+)*|[^\w\s]+', 22 | re.UNICODE | re.MULTILINE | re.DOTALL).findall 23 | 24 | 25 | class Semanticizer: 26 | def __init__(self, language_code, sense_probability_threshold, 27 | max_ngram_length=None, debug=False): 28 | """constructor""" 29 | self.language_code = language_code 30 | self.sense_probability_threshold = sense_probability_threshold 31 | self.wikipedia_url_template = 'http://%s.wikipedia.org/wiki/%s' 32 | self.wpm = wpm_dumps[language_code] 33 | self.title_page = {} # This needs to be removed 34 | self.max_ngram_length = max_ngram_length 35 | self.debug = debug 36 | 37 | def semanticize(self, sentence, normalize_dash=True, 38 | normalize_accents=True, normalize_lower=False, 39 | translations=True, counts=False, 40 | largest_matching=False, 41 | sense_probability_threshold=None): 42 | if sense_probability_threshold == None: 43 | sense_probability_threshold = self.sense_probability_threshold 44 | result = {"links": []} 45 | ngrams = set() 46 | token_lists = [tokenize(sentence), 47 | tokenize(sentence.replace('-', ' ')), 48 | tokenize(sentence.replace('.', ' ')), 49 | tokenize(sentence.replace('.', ''))] 50 | 51 | # get all ngrams for this sentence, limit to max_ngram_length 52 | # if applicable 53 | for token_list in token_lists: 54 | max_len = len(token_list) + 1 55 | if self.max_ngram_length is not None: 56 | max_len = min(max_len, self.max_ngram_length) 57 | 58 | for n in range(1, max_len): 59 | for ngram in nltk_ngrams(token_list, n): 60 | ngrams.add(' '.join(ngram)) 61 | 62 | normal_ngrams = map(wpmutil.normalize, ngrams) 63 | exist = self.wpm.normalized_entities_exist(normal_ngrams) 64 | 65 | for i, (ngram, normal_ngram) in enumerate(zip(ngrams, normal_ngrams)): 66 | if exist[i]: 67 | normalized_ngram = wpmutil.normalize(ngram, normalize_dash, 68 | normalize_accents, 69 | normalize_lower) 70 | anchors = self.wpm.get_all_entities(normal_ngram) 71 | for anchor in anchors: 72 | normalized_anchor = wpmutil.normalize(anchor, normalize_dash, 73 | normalize_accents, 74 | normalize_lower) 75 | if normalized_ngram == normalized_anchor: 76 | if self.debug and not self.wpm.entity_exists(anchor): 77 | raise LookupError("Data corrupted, cannot " 78 | + "find %s in the database" \ 79 | % anchor) 80 | entity = self.wpm.get_entity_data(anchor) 81 | senses = [(sense, self.wpm.get_sense_data(anchor, str(sense))) for sense in entity['senses']] 82 | if largest_matching: senses = sorted(senses, key=lambda (_, d): -d['cntlinkdoc'])[:1] 83 | for sense, sense_data in senses: 84 | if sense_data: 85 | if entity['cnttextocc'] == 0: 86 | link_probability = 0 87 | sense_probability = 0 88 | else: 89 | link_probability = float(entity['cntlinkdoc']) / entity['cnttextdoc'] 90 | sense_probability = float(sense_data['cntlinkdoc']) / entity['cnttextdoc'] 91 | if sense_probability > sense_probability_threshold: 92 | title = unicode(self.wpm.get_item_title(str(sense))) 93 | url = self.wikipedia_url_template \ 94 | % (self.language_code, 95 | urllib.quote(title.encode('utf-8'))) 96 | if entity['cntlinkocc'] == 0: 97 | prior_probability = 0 98 | else: 99 | prior_probability = float(sense_data['cntlinkocc']) / entity['cntlinkocc'] 100 | link = { 101 | "label": anchor, 102 | "text": ngram, 103 | "title": title, 104 | "id": sense, 105 | "url": url, 106 | "linkProbability": link_probability, 107 | 
"senseProbability": sense_probability, 108 | "priorProbability": prior_probability 109 | } 110 | if translations: 111 | link["translations"] = {self.language_code: 112 | {"title": title, 113 | "url": url}} 114 | if self.wpm.sense_has_trnsl(str(sense)): 115 | for lang in self.wpm.get_trnsl_langs(str(sense)): 116 | trnsl = self.wpm.get_sense_trnsl(str(sense), lang) 117 | link["translations"][lang] = { 118 | 'title': unicode(trnsl), 119 | 'url': self.wikipedia_url_template % (lang, urllib.quote(unicode(trnsl).encode('utf-8'))) 120 | } 121 | if counts: 122 | link["occCount"] = entity['cnttextocc'] 123 | link["docCount"] = entity['cnttextdoc'] 124 | link["linkOccCount"] = entity['cntlinkocc'] 125 | link["linkDocCount"] = entity['cntlinkdoc'] 126 | link["senseOccCount"] = int(sense_data['cntlinkocc']) 127 | link["senseDocCount"] = int(sense_data['cntlinkdoc']) 128 | link['fromTitle'] = sense_data['from_title'] 129 | link['fromRedirect'] = sense_data['from_redir'] 130 | result["links"].append(link) 131 | 132 | if largest_matching: 133 | available_text = wpmutil.normalize(sentence, normalize_dash, normalize_accents, normalize_lower) 134 | for link in sorted(result["links"], key=lambda link: -link["priorProbability"]/2-len(link["label"])): 135 | normalized_label = wpmutil.normalize(link["label"], normalize_dash, normalize_accents, normalize_lower) 136 | if normalized_label in available_text: 137 | available_text = available_text.replace(normalized_label, "") 138 | else: result["links"].remove(link) 139 | return result 140 | -------------------------------------------------------------------------------- /semanticizer/processors/semanticizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | """ 15 | The Processor wrapping Semanticizer 16 | """ 17 | from nltk.tokenize.punkt import PunktSentenceTokenizer 18 | 19 | from .core import LinksProcessor 20 | from .semanticize import Semanticizer 21 | 22 | 23 | class SemanticizeProcessor(LinksProcessor): 24 | """Processor handling the semanticizing""" 25 | 26 | def __init__(self, debug=False): 27 | """Set the class variables""" 28 | self.langcodes = [] 29 | self.semanticizers = {} 30 | self.debug = debug 31 | 32 | def load_languages(self, langcodes, max_ngram_length=None): 33 | """Save the languages and load the semanticizer""" 34 | self.langcodes = langcodes 35 | for langcode in langcodes: 36 | self.semanticizers[langcode] = Semanticizer(langcode, None, 37 | max_ngram_length, 38 | self.debug) 39 | 40 | def preprocess(self, links, text, settings): 41 | """ 42 | Semanticize the given text and return the links, text, and 43 | settings. 
44 | """ 45 | links = [] 46 | if "langcode" in settings and settings["langcode"] in self.semanticizers: 47 | translations = "translations" in settings 48 | normalize_dash = not("normalize" in settings and \ 49 | not "dash" in settings["normalize"]) 50 | normalize_accents = not("normalize" in settings and \ 51 | not "accents" in settings["normalize"]) 52 | normalize_lower = "normalize" in settings and \ 53 | "lower" in settings["normalize"] 54 | lower_confidence_bound = "lowerConfidenceBound" in settings 55 | largest_matching = "largestMatching" in settings 56 | 57 | if "split_sentences" in settings: 58 | sentences = PunktSentenceTokenizer().tokenize(text) 59 | else: 60 | sentences = [text] 61 | 62 | sem = self.semanticizers[settings["langcode"]] 63 | for sentence in sentences: 64 | results = sem.semanticize(sentence, counts=True, 65 | normalize_dash=normalize_dash, 66 | normalize_accents=normalize_accents, 67 | normalize_lower=normalize_lower, 68 | largest_matching=largest_matching, 69 | lower_confidence_bound=lower_confidence_bound, 70 | translations=translations, 71 | sense_probability_threshold=-1) 72 | 73 | links.extend(results["links"]) 74 | 75 | return (links, text, settings) 76 | 77 | def postprocess(self, links, text, settings): 78 | """ 79 | Remove counts from links 80 | @todo: why do this here? In Semanticizer.semanticize there's already \ 81 | a check being done on whether counts should be included. 82 | """ 83 | if not "counts" in settings: 84 | for link in links: 85 | for key in link.keys(): 86 | if key.endswith("Count"): 87 | del link[key] 88 | 89 | return (links, text, settings) 90 | 91 | def inspect(self): 92 | """Return the loaded languages""" 93 | return {self.__class__.__name__: self.langcodes} 94 | -------------------------------------------------------------------------------- /semanticizer/processors/stringUtils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 4 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 5 | # General Public License as published by the Free Software Foundation, either 6 | # version 3 of the License, or (at your option) any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, but WITHOUT 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 11 | # for more details. 12 | # 13 | # You should have received a copy of the GNU Lesser General Public License 14 | # along with this program. If not, see . 15 | 16 | import re 17 | 18 | # Kan ook met de hand...:(\A|\s|\'|"|\.|\,|:|;|!|\?) 19 | # (?=(\s|\'|"|\.|\,|:|;|!|\?|\'s|\Z) 20 | # reNonWordChars = re.compile('(?u)\W+', re.UNICODE) 21 | 22 | # We took the reg exp from scikit-learn: 23 | # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py 24 | reTokenPattern = re.compile(r"(?u)\b\w\w+\b", re.UNICODE) 25 | 26 | def ngramToPattern(sNgram): 27 | return ngramsToPattern([sNgram]) 28 | 29 | def ngramsToPattern(aNgrams): 30 | #import sys 31 | #print >> sys.stderr, "n-grams: '%s'" % aNgrams 32 | try: 33 | # So this reads, inside out: 34 | # Replace all white space by a single space and re.escape that. 
35 | # Replace the (by now escaped) spaces by '\s+'s and join the different 36 | # n-grams by pipes ('|') 37 | # 38 | sNgrams = '|'.join([re.escape(re.sub('\s+', ' ', x)).replace('\\ ', '\s+') for x in aNgrams]) 39 | reNgrams = re.compile('((\A|\W)(' + sNgrams + ')(?=\W|\Z))', 40 | flags=re.UNICODE|re.IGNORECASE) 41 | except OverflowError: 42 | # Some articles have such a ridiculous number of inlink anchors that 43 | # the regular expression gets too big. 44 | # This doesn't happen if we make it a bit stricter.... 45 | # So, if that happens we make the same expression but we do not replace 46 | # the spaces by \s+'s 47 | sNgrams = '|'.join([re.escape(re.sub('\s+', ' ', x)) for x in aNgrams]) 48 | reNgrams = re.compile('((\A|\W)(' + sNgrams + ')(?=\W|\Z))', 49 | flags=re.UNICODE|re.IGNORECASE) 50 | return reNgrams 51 | 52 | # For one word 53 | def findNgramInText(sNgram, sText): 54 | return findNgramsInText([sNgram], sText) 55 | 56 | # For several words 57 | def findNgramsInText(aNgrams, sText): 58 | # A check beforehand because an empty array will lead to a pattern that 59 | # matches empty lines, double spaces, etc.... 60 | if len(aNgrams) == 0: 61 | return [] 62 | return re.findall(ngramsToPattern(aNgrams), sText) 63 | 64 | if __name__ == "__main__": 65 | sText = u"aap noot mies\nwim jüf duif “Noot” roos ühalloü" 66 | 67 | aMatches = findNgramInText(u'aap', sText) 68 | print "%s" % aMatches 69 | 70 | aMatches = findNgramInText(u'hallo', sText) 71 | print "%s" % aMatches 72 | 73 | aMatches = findNgramsInText([u'mies wim', u'noot'], sText) 74 | print "%s" % aMatches 75 | -------------------------------------------------------------------------------- /semanticizer/processors/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
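# [Editor's note] Illustrative sketch only -- not part of the original module.
# The stringUtils helpers above turn a list of n-grams into a single
# alternation pattern (each n-gram escaped, whitespace made flexible, the whole
# match anchored on non-word boundaries) and match it against a text. A
# condensed, runnable version of that idea, using made-up input strings:

import re

def ngrams_to_pattern(ngrams):
    # escape each n-gram, let any run of whitespace match '\s+', join with '|'
    alternation = '|'.join(
        re.escape(re.sub(r'\s+', ' ', ng)).replace('\\ ', r'\s+') for ng in ngrams)
    # require a non-word character (or string boundary) on both sides of a hit
    return re.compile(r'((\A|\W)(' + alternation + r')(?=\W|\Z))',
                      flags=re.UNICODE | re.IGNORECASE)

# ngrams_to_pattern([u'mies wim', u'noot']).findall(u'aap noot mies wim zus')
# returns one (match, boundary, ngram) tuple per hit, roughly:
# [(u' noot', u' ', u'noot'), (u' mies wim', u' ', u'mies wim')]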
13 | 14 | import os, yaml 15 | import sklearn.metrics 16 | import sklearn.externals.joblib 17 | 18 | def compute_metrics(labels, scores, threshold=0.5): 19 | metrics = {} 20 | # Sort according to score 21 | scores, labels = zip(*sorted(zip(scores, labels))) 22 | predictions = [score >= threshold for score in scores] 23 | # Classification metrics 24 | metrics["precision"], metrics["recall"], metrics["f1"], support = \ 25 | sklearn.metrics.precision_recall_fscore_support(labels, predictions, \ 26 | average="weighted") 27 | metrics["accuracy"] = sklearn.metrics.accuracy_score(labels, predictions) 28 | metrics["zeroOneLoss"] = sklearn.metrics.zero_one_loss(labels, predictions) 29 | # Rank-based metrics 30 | metrics["averagePrecision"] = \ 31 | sklearn.metrics.average_precision_score(labels, scores) 32 | metrics["ROC AUC"] = sklearn.metrics.roc_auc_score(labels, scores) 33 | # R-precision 34 | r_labels = labels[-support:] 35 | r_predictions = [True for label in r_labels] 36 | metrics["rPrecision"] = \ 37 | sklearn.metrics.precision_score(r_labels, r_predictions) 38 | return metrics 39 | 40 | class ModelStore(): 41 | def __init__(self, model_dir): 42 | self.model_dir = model_dir 43 | self.model_cache = {} 44 | 45 | def load_model(self, modelname): 46 | if modelname.endswith(".pkl"): 47 | return self.load_model(modelname[:-4]) 48 | 49 | if modelname in self.model_cache: 50 | return self.model_cache[modelname] 51 | 52 | modelfile = os.path.join(self.model_dir, modelname) 53 | model = sklearn.externals.joblib.load(modelfile + ".pkl") 54 | 55 | description = {"name": modelname, "source": modelfile + ".pkl"} 56 | if os.path.exists(modelfile + ".yaml"): 57 | description.update(yaml.load(file(modelfile + ".yaml"))) 58 | 59 | if os.path.exists(modelfile + ".preprocessor.pkl"): 60 | preprocessor = sklearn.externals.joblib.load(modelfile + \ 61 | ".preprocessor.pkl") 62 | else: 63 | preprocessor = None 64 | 65 | self.model_cache[modelname] = (model, description, preprocessor) 66 | return (model, description, preprocessor) 67 | 68 | def save_model(self, model, modelname, description=None, preprocessor=None): 69 | if modelname.endswith(".pkl"): 70 | modelname = modelname[:-4] 71 | 72 | modelfile = os.path.join(self.model_dir, modelname) 73 | sklearn.externals.joblib.dump(model, modelfile + ".pkl") 74 | 75 | if preprocessor: 76 | sklearn.externals.joblib.dump(preprocessor, \ 77 | modelfile + ".preprocessor.pkl") 78 | 79 | if description != None: 80 | with open(modelfile + ".yaml", 'w') as out: 81 | out.write(yaml.dump(description)) 82 | else: 83 | description = {} 84 | 85 | description.update({"name": modelname, "source": modelfile + ".pkl"}) 86 | self.model_cache[modelname] = (model, description, preprocessor) 87 | 88 | def _convert_dict(self, data, skip=[]): 89 | """Helper function that convert the values of dictionary to int/float. 90 | Optionally you can skip a list of values.""" 91 | converted_data = {} 92 | for k,v in data.iteritems(): 93 | if k in skip: continue 94 | try: 95 | converted_data[k] = int("".join(v)) 96 | except ValueError: 97 | try: 98 | converted_data[k] = float("".join(v)) 99 | except ValueError: 100 | converted_data[k] = v 101 | return converted_data 102 | 103 | def create_model(self, settings, skip_settings=[]): 104 | if not "classifier" in settings: 105 | raise ValueError("Expecting a classifier in settings.") 106 | if not "." 
in settings["classifier"]: 107 | raise ValueError("Expecting a package in classifier settings.") 108 | 109 | classifier = settings["classifier"].split(".")[-1] 110 | package = ".".join(settings["classifier"].split(".")[:-1]) 111 | 112 | preprocessor_settings = dict([(key, value) for key, value \ 113 | in settings.iteritems() \ 114 | if key.startswith("preprocessor.")]) 115 | 116 | skip_settings.extend(["classifier", "preprocessor"]) 117 | skip_settings.extend(preprocessor_settings.keys()) 118 | arguments = self._convert_dict(settings, skip_settings) 119 | model = self._create_instance(package, classifier, **arguments) 120 | 121 | if "preprocessor" in settings: 122 | if not "." in settings["preprocessor"]: 123 | raise ValueError("Expecting a package in preprocessor settings.") 124 | 125 | preprocessor_classname = settings["preprocessor"].split(".")[-1] 126 | preprocessor_package = ".".join(settings["preprocessor"].split(".")[:-1]) 127 | 128 | preprocessor_settings = dict([(".".join(key.split(".")[1:]), value)\ 129 | for key, value \ 130 | in preprocessor_settings.iteritems()]) 131 | preprocessor_arguments = self._convert_dict(preprocessor_settings) 132 | preprocessor = self._create_instance(preprocessor_package, \ 133 | preprocessor_classname, \ 134 | **preprocessor_arguments) 135 | else: 136 | preprocessor = None 137 | 138 | return model, preprocessor 139 | 140 | def _create_instance(self, package, classname, *args, **kwargs): 141 | # Import package module 142 | package_module = __import__(package, globals(), locals(), \ 143 | [str(classname)], -1) 144 | # Class instance 145 | package_class = getattr(package_module, classname) 146 | 147 | instance = package_class(*args, **kwargs) 148 | 149 | return instance 150 | -------------------------------------------------------------------------------- /semanticizer/procpipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import time 15 | import logging 16 | 17 | from .processors.core import SettingsProcessor, FilterProcessor 18 | from .processors.semanticizer import SemanticizeProcessor 19 | from .processors.features import FeaturesProcessor, ArticleFeaturesProcessor, \ 20 | ContextFeaturesProcessor 21 | from .processors.multiple import MultipleEntityFeaturesProcessor 22 | from .processors.external import ArticlesProcessor, StatisticsProcessor 23 | from .processors.learning import LearningProcessor 24 | from .processors.image import AddImageProcessor 25 | 26 | from .config import config_get 27 | 28 | 29 | def build(langcodes, use_features=False, debug=False): 30 | """ 31 | Initialize the pipeline. 
32 | 33 | @param wikipedia_ids: A list with all loaded wikipedia ids 34 | @return: The pipeline 35 | @todo: See todo at _load_languages 36 | """ 37 | logging.getLogger().info("Initializing pipeline") 38 | pipeline = [] 39 | if 'max_ngram_length' in config_get('semanticize', {}): 40 | max_ngram_length = config_get(('semanticize', 'max_ngram_length')) 41 | else: 42 | max_ngram_length = None 43 | semanticize_processor = _load_semanticize_processor(langcodes, 44 | max_ngram_length, 45 | debug=debug) 46 | settings = config_get("settings", {}) 47 | pipeline.append(("Settings", SettingsProcessor(settings))) 48 | pipeline.append(("Semanticize", semanticize_processor)) 49 | pipeline.append(("Filter", FilterProcessor())) 50 | if use_features: 51 | _load_features(pipeline, langcodes) 52 | else: 53 | _load_articles(pipeline, langcodes) 54 | pipeline.append(("AddImage", AddImageProcessor())) 55 | logging.getLogger().info("Done initializing pipeline") 56 | return pipeline 57 | 58 | 59 | def _load_semanticize_processor(langcodes, max_ngram_length=None, debug=False): 60 | """ 61 | Load the Semanticizer. 62 | 63 | @param wikipedia_ids: A list with all loaded wikipedia ids 64 | @return: a configured instance of SemanticizeProcessor 65 | @see: processors.SemanticizeProcessor 66 | """ 67 | logging.getLogger().info("Loading semanticizer") 68 | semanticize_processor = SemanticizeProcessor(debug=debug) 69 | start = time.time() 70 | logging.getLogger().info("Loading semanticizers for langcode(s) " 71 | + ", ".join(langcodes)) 72 | semanticize_processor.load_languages(langcodes, max_ngram_length) 73 | logging.getLogger().info("Loading semanticizers took %.2f seconds." \ 74 | % (time.time() - start)) 75 | logging.getLogger().info("Done loading semanticizer") 76 | return semanticize_processor 77 | 78 | 79 | def _load_features(pipeline, langcodes): 80 | """ 81 | Load all features into the pipeline 82 | 83 | @param pipeline: A reference to the pipeline 84 | @param semanticize_processor: A reference to the semanticize processor 85 | @param wikipedia_ids: Wikipedia ids & data 86 | """ 87 | logging.getLogger().info("Loading features") 88 | start = time.time() 89 | pipeline.append(("Features", 90 | FeaturesProcessor(langcodes))) 91 | _load_articles(pipeline, langcodes) 92 | pipeline.append(("Statistics", 93 | StatisticsProcessor(langcodes, 94 | config_get(('wpm', 'threads'), 1), 95 | config_get(('misc', 'tempdir'))))) 96 | pipeline.append(("ArticleFeatures", ArticleFeaturesProcessor())) 97 | pipeline.append(("MultipleFeatures", MultipleEntityFeaturesProcessor())) 98 | pipeline.append(("ContextFeatures", ContextFeaturesProcessor())) 99 | logging.getLogger().info("Loading features took %.2f seconds." \ 100 | % (time.time() - start)) 101 | model_dir = config_get(('learning', 'model_dir'), \ 102 | config_get(('misc', 'tempdir'))) 103 | pipeline.append(("Learning", LearningProcessor(model_dir))) 104 | logging.getLogger().info("Done loading features") 105 | 106 | def _load_articles(pipeline, langcodes): 107 | pipeline.append(("Articles", 108 | ArticlesProcessor(langcodes, config_get(('misc', 'tempdir'))))) 109 | -------------------------------------------------------------------------------- /semanticizer/server/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. 
This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | # Can do without ujson and simplejson, but speeds up considerably. 15 | try: 16 | import ujson 17 | except ImportError: 18 | pass 19 | try: 20 | import simplejson as json 21 | except ImportError: 22 | import json 23 | 24 | import re 25 | from flask import Flask, Response, request, abort 26 | 27 | from uuid import uuid4 28 | 29 | class Server(object): 30 | """ 31 | The HTTP server that will serve the complete namespace 32 | """ 33 | 34 | APPLICATION_JSON="application/json" 35 | 36 | def __init__(self): 37 | """ 38 | Initialize the server. The constructor creates the initial Flask server 39 | object. 40 | """ 41 | self.app = Flask(__name__) 42 | 43 | def set_debug(self, debug=None, debug_log_format=None): 44 | """ 45 | Set Flask server debug parameters. 46 | 47 | @param debug: Enable or disable debug mode 48 | @param debug_log_format: Set the logformat string for the server 49 | """ 50 | if not debug is None: 51 | self.app.debug = debug 52 | if not debug_log_format is None: 53 | self.app.debug_log_format = debug_log_format 54 | 55 | def _json_dumps(self, o, pretty=False): 56 | """ 57 | Util function to create json dumps based on an object. 58 | 59 | @param o: Object to transform 60 | @param pretty: Whether or not to prettify the JSON 61 | @return: The JSON string 62 | """ 63 | if not pretty and "ujson" in locals(): 64 | return ujson.dumps(o) 65 | elif not pretty: 66 | return json.dumps(o) 67 | else: 68 | return json.dumps(o, indent=4) 69 | 70 | def _get_text_from_request(self): 71 | """ 72 | Util function to get the param called "text" from the current request 73 | 74 | @return: the value of "text" 75 | """ 76 | content_type = request.headers['Content-Type'] if 'Content-Type' in request.headers else "text/plain" 77 | if request.method == "POST": 78 | if content_type == 'application/x-www-form-urlencoded': 79 | return request.form['text'] 80 | elif content_type == 'text/plain': 81 | return request.data 82 | else: 83 | abort(Response("Unsupported Content Type, use: text/plain\n", 84 | status=415)) 85 | elif "text" in request.args: 86 | return request.args["text"] 87 | else: 88 | abort(Response("No text provided, use: POST or GET with attribute \ 89 | 'text'\n", status=400)) 90 | 91 | def _get_values_from_request(self, values=None): 92 | """ 93 | Util function to get the values from the current request 94 | 95 | @param values: initial dictionary of values 96 | @return: a dictionary of values 97 | """ 98 | if not values: 99 | values = {} 100 | for key, value in request.values.iteritems(): 101 | assert key not in values 102 | values[key] = value 103 | 104 | return values 105 | 106 | def setup_route_semanticize(self, langcodes): 107 | """ 108 | Setup the /semanticize/ namespace. 109 | 110 | @param langcodes: The languages supported for semanticizing. 
111 | """ 112 | self.langcodes = langcodes 113 | self.app.add_url_rule("/semanticize/", "_semanticize", 114 | self._semanticize_handler, methods=["GET", "POST"]) 115 | self.app.add_url_rule("/semanticize", "_semanticize_usage", 116 | self._semanticize_usage, 117 | methods=["GET", "POST"]) 118 | 119 | def setup_route_inspect(self): 120 | """ 121 | Setup the /inspect namespace. 122 | 123 | @param pipeline: The pipeline of processors to inspect. 124 | """ 125 | self.app.add_url_rule("/inspect", "_inspect", 126 | self._inspect, methods=["GET"]) 127 | 128 | def setup_route_feedback(self): 129 | """ 130 | Setup the /feedback namespace. 131 | 132 | @param pipeline: The pipeline of processors to feed back to. 133 | """ 134 | hex = "[a-fA-F0-9]" 135 | pattern = "hex{8}-hex{4}-hex{4}-hex{4}-hex{12}".replace("hex", hex) 136 | self.request_id_pattern = re.compile(pattern) 137 | self.app.add_url_rule("/feedback/", "_feedback", 138 | self._feedback, methods=["GET", "POST"]) 139 | self.app.add_url_rule("/evaluate/", "_evaluate", 140 | self._evaluate, methods=["GET", "POST"]) 141 | self.app.add_url_rule("/evaluate", "_evaluate", 142 | self._evaluate, methods=["GET", "POST"]) 143 | self.app.add_url_rule("/learn/", "_learn", 144 | self._learn, methods=["GET", "POST"]) 145 | 146 | def setup_all_routes(self, pipeline, langcodes): 147 | """ 148 | Convenience function to start all namespaces at once. 149 | 150 | @param pipeline: The pipeline of processors 151 | """ 152 | self.pipeline = pipeline 153 | self.setup_route_semanticize(langcodes) 154 | self.setup_route_inspect() 155 | self.setup_route_feedback() 156 | 157 | def start(self, host, port, use_reloader=False): 158 | """ 159 | Wrapper for the Flask run() function. Will start the HTTP server with 160 | all initialized namespaces. 161 | 162 | @param host: The hostname to bind on 163 | @param port: The port to bind on 164 | """ 165 | print "Server started on %s:%d" % (host, port) 166 | self.app.run(host, port, self.app.debug, use_reloader=use_reloader) 167 | 168 | def _semanticize_usage(self): 169 | """ 170 | The function handling the /semanticize namespace. Returns the available 171 | languages. 172 | 173 | @return: The body of the response, in this case a json formatted list \ 174 | of links and their relevance 175 | @see: _semanticize 176 | """ 177 | 178 | json = self._json_dumps({"languages": self.langcodes}, 179 | "pretty" in request.args) 180 | 181 | return Response(json, mimetype=Server.APPLICATION_JSON) 182 | 183 | def _semanticize_handler(self, langcode): 184 | """ 185 | The function handling the /semanticize/ namespace. It uses 186 | the chain-of-command pattern to run all processors, using the 187 | corresponding preprocess, process, and postprocess steps. 188 | 189 | @param langcode: The language to use in the semanticizing 190 | @return: The body of the response, in this case a json formatted list \ 191 | of links and their relevance 192 | """ 193 | self.app.logger.debug("Semanticizing: start") 194 | text = self._get_text_from_request() 195 | self.app.logger.debug("Semanticizing text: " + text) 196 | 197 | settings = self._get_values_from_request({"langcode": langcode}) 198 | settings["request_id"] = str(uuid4()) 199 | 200 | sem_result = self._semanticize(langcode, settings, text) 201 | sem_result["request_id"] = settings["request_id"] 202 | json = self._json_dumps(sem_result, "pretty" in settings) 203 | 204 | self.app.logger.debug("Semanticizing: Created %d characters of JSON " 205 | "for request id %s." 
\ 206 | % (len(json), sem_result["request_id"])) 207 | return Response(json, mimetype=Server.APPLICATION_JSON) 208 | 209 | def _semanticize(self, langcode, settings, text): 210 | """ 211 | Method that performs the actual semantization. 212 | """ 213 | links = [] 214 | 215 | for function in ("preprocess", "process", "postprocess"): 216 | for step, processor in self.pipeline: 217 | self.app.logger.debug("Semanticizing: %s for step %s" \ 218 | % (function, step)) 219 | (links, text, settings) = getattr(processor, function)(links, 220 | text, 221 | settings 222 | ) 223 | self.app.logger.debug("Semanticizing: %s pipeline with %d steps \ 224 | done" % (function, len(self.pipeline))) 225 | 226 | result = {"links": links, "text": text} 227 | 228 | return result 229 | 230 | def _inspect(self): 231 | """ 232 | Function that handles the /inspect namespace. Will print the settings 233 | used by the different processors. 234 | 235 | @return: The body of the response, in this case a json formatted \ 236 | string containing all found settings. 237 | """ 238 | inspect = {} 239 | for _, processor in self.pipeline: 240 | inspect.update(processor.inspect()) 241 | return Response(self._json_dumps(inspect, pretty=True), 242 | mimetype=Server.APPLICATION_JSON) 243 | 244 | def _feedback(self, context_path): 245 | """ 246 | Function that handles the /feedback namespace. Will process the 247 | feedback in supported processors in the pipeline. 248 | """ 249 | context_parts = context_path.split("/") 250 | if len(context_parts) == 0: 251 | raise ValueError("No context for feedback is provided!") 252 | 253 | request_id_match = self.request_id_pattern.match(context_parts[-1]) 254 | if request_id_match: 255 | request_id = request_id_match.string 256 | context_parts.pop() 257 | else: 258 | request_id = None 259 | 260 | context = "/".join(context_parts) if len(context_parts) else None 261 | feedback = request.values 262 | for processor_name, processor in self.pipeline: 263 | if "feedback" in processor.__class__.__dict__: 264 | self.app.logger.debug("Feeding feedback for request_id %s in " 265 | "context %s to %s." % 266 | (request_id, context, processor_name)) 267 | processor.feedback(request_id, context, feedback) 268 | 269 | return "OK" 270 | 271 | def _evaluate(self, context_path=""): 272 | """ 273 | Function that handles the /evaluate namespace. Will evaluate a metric based 274 | on the feedback in supported processors in the pipeline. 275 | """ 276 | evaluation = {} 277 | for processor_name, processor in self.pipeline: 278 | if "evaluate" in processor.__class__.__dict__: 279 | self.app.logger.debug("Evaluating %s in %s." % 280 | (context_path, processor_name)) 281 | evaluation.update(processor.evaluate(context_path, 282 | request.values)) 283 | 284 | return Response(self._json_dumps(evaluation, pretty=True), 285 | mimetype=Server.APPLICATION_JSON) 286 | 287 | def _learn(self, name): 288 | """ 289 | Function that handles the /learn namespace. Will learn based on the 290 | feedback in supported processors in the pipeline. 291 | """ 292 | for processor_name, processor in self.pipeline: 293 | if "learn" in processor.__class__.__dict__: 294 | self.app.logger.debug("Learning %s in %s." 
% 295 | (name, processor_name)) 296 | processor.learn(name, request.values) 297 | 298 | return "OK" 299 | -------------------------------------------------------------------------------- /semanticizer/server/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import logging 15 | from logging.handlers import TimedRotatingFileHandler 16 | 17 | from .. import procpipeline 18 | from ..config import config_get 19 | from ..server import Server 20 | from ..wpm.data import init_datasource 21 | 22 | 23 | def start_server(langcodes, 24 | host, 25 | port, 26 | use_reloader, 27 | verbose=False, 28 | logformat='[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s', 29 | use_features=False, 30 | debug=False): 31 | """ 32 | Start a SemanticizerFlaskServer with all processors loaded into the 33 | pipeline. 34 | 35 | @param verbose: Set whether the Flask server should be verbose 36 | @param logformat: The logformat used by the Flask server 37 | """ 38 | # Initialize the pipeline 39 | pipeline = procpipeline.build(langcodes, use_features, debug=debug) 40 | # Create the FlaskServer 41 | logging.getLogger().info("Setting up server") 42 | server = Server() 43 | server.set_debug(verbose, logformat) 44 | # Setup all available routes / namespaces for the HTTP server 45 | server.setup_all_routes(pipeline, langcodes) 46 | logging.getLogger().info("Done setting up server, now starting...") 47 | # And finally, start the thing 48 | server.start(host, port, use_reloader) 49 | 50 | def init_logging(log, verbose, logformat): 51 | """ 52 | A convencience function that initializes the logging framework by setting 53 | the path to the log, verbosity, and the logformat. 
54 | """ 55 | file_handler = TimedRotatingFileHandler(log, when='midnight') 56 | file_handler.setFormatter(logging.Formatter(logformat)) 57 | stream_handler = logging.StreamHandler() 58 | stream_handler.setFormatter(logging.Formatter(logformat)) 59 | if verbose == True: 60 | file_handler.setLevel(logging.DEBUG) 61 | stream_handler.setLevel(logging.DEBUG) 62 | logging.getLogger().setLevel(logging.DEBUG) 63 | logging.getLogger().addHandler(file_handler) 64 | logging.getLogger().addHandler(stream_handler) 65 | 66 | 67 | def main(): 68 | # Init the logger 69 | init_logging(config_get(('logging', 'path'), 'log.txt'), 70 | config_get(('logging', 'verbose'), False), 71 | config_get(('logging', 'format'), None)) 72 | 73 | # Set the datasource and init it 74 | wpmlangs = config_get(('wpm', 'languages')) 75 | settings = config_get(('settings'), {}) 76 | init_datasource(wpmlangs, settings) 77 | 78 | # Start the server 79 | try: 80 | start_server(config_get(('wpm', 'languages')).keys(), 81 | config_get(('server', 'host'), '0.0.0.0'), 82 | config_get(('server', 'port'), 5000), 83 | config_get(('server', 'use_reloader'), False), 84 | config_get(('logging', 'verbose'), False), 85 | config_get(('logging', 'format'), None), 86 | config_get(('linkprocs', 'features'), False), 87 | config_get(('server', 'debug'), False)) 88 | except ValueError as e: 89 | logging.getLogger().fatal("Error running Semanticizer server: %s" \ 90 | % e.message) 91 | raise 92 | 93 | 94 | if __name__ == '__main__': 95 | main() 96 | -------------------------------------------------------------------------------- /semanticizer/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/util/__init__.py -------------------------------------------------------------------------------- /semanticizer/util/online_learning.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
13 | 14 | import os, re, argparse, urllib, urllib2, json 15 | from collections import defaultdict 16 | from timer import Timer 17 | from random import choice, shuffle 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser( 21 | description='Online learn a classifier.', 22 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 23 | 24 | parser.add_argument('classifier', metavar='classifier', 25 | help='a classifier to train') 26 | parser.add_argument('datafiles', metavar='file', nargs='+', 27 | help='a set of datafiles to process') 28 | 29 | group = parser.add_argument_group('Semanticizer') 30 | group.add_argument('--url', default='http://localhost:5000/', 31 | help='URL where the semanticizer webservice is running') 32 | 33 | group = parser.add_argument_group('Learning') 34 | group.add_argument('--learn', nargs=2, action='append', 35 | metavar=('setting', 'value'), 36 | default=[('context', 'EMPTY')], 37 | help='Setting for the learn call') 38 | group.add_argument('--model-prefix', metavar='prefix', 39 | default='Online.', 40 | help='Prefix to add to the modelname') 41 | group.add_argument('--iterations', metavar='number', 42 | default=50, type=int, 43 | help='Number of iterations for learning.f') 44 | 45 | group = parser.add_argument_group('Context') 46 | group.add_argument('--context-pattern', nargs=2, 47 | metavar=('pattern', 'replacement'), 48 | default=('^(?:.*/)*(.*?)(?:\.txt)?$', '\g<1>'), 49 | help='Pattern to generate context from filename') 50 | group.add_argument('--context-prefix', 51 | metavar='prefix', default='', 52 | help='Prefix to add to the context') 53 | 54 | group = parser.add_argument_group('Output') 55 | group.add_argument('--output', default=None, 56 | help='Filename for the output') 57 | 58 | args = parser.parse_args() 59 | args.learn.append(('classifier', args.classifier)) 60 | return args 61 | 62 | def online_learning(args): 63 | results = defaultdict(list) 64 | 65 | shuffle(args.datafiles) 66 | for filenr, filename in enumerate(args.datafiles): 67 | assert os.path.exists(filename) 68 | context = args.context_prefix + re.sub(args.context_pattern[0], \ 69 | args.context_pattern[1], \ 70 | filename) 71 | 72 | modelname = args.model_prefix + context.replace('/', '.') 73 | learn_url = args.url + 'learn/' + modelname 74 | url_data = urllib.urlencode(args.learn) 75 | 76 | print "Initializing model", modelname, 77 | print urllib2.urlopen(learn_url, url_data).read() 78 | 79 | train_files = [f for f in args.datafiles if f != filename] 80 | for i in range(args.iterations): 81 | print "%02d/%02d" % (filenr+1, len(args.datafiles)), 82 | print "%03d/%03d" % (i+1, args.iterations), 83 | train_filename = choice(train_files) 84 | #with Timer("Learning for %s" % train_filename, 'timer'): 85 | train_context = args.context_prefix + \ 86 | re.sub(args.context_pattern[0], \ 87 | args.context_pattern[1], train_filename) 88 | 89 | url_data = urllib.urlencode({"context": train_context}) 90 | print "Training", modelname, "on", train_context, 91 | print urllib2.urlopen(learn_url, url_data).read() 92 | 93 | evaluate_url = args.url + 'evaluate/' + context 94 | url_data = urllib.urlencode({"model": modelname}) 95 | result = json.loads(urllib2.urlopen(evaluate_url, url_data).read()) 96 | print "%.4f %.4f %.4f" % \ 97 | (result["macro_metrics"]["accuracy"], 98 | result["macro_metrics"]["averagePrecision"], 99 | result["macro_metrics"]["rPrecision"]) 100 | results[filename].append(result) 101 | 102 | if args.output: 103 | with open(args.output, 'w') as out: 104 | 
out.write(json.dumps(results)) 105 | 106 | if __name__ == '__main__': 107 | args = parse_args() 108 | 109 | with Timer("Online learning %d files" % len(args.datafiles), 'timer'): 110 | online_learning(args) 111 | -------------------------------------------------------------------------------- /semanticizer/util/profiler.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import pstats 15 | import sys 16 | 17 | if __name__ == '__main__': 18 | stats = pstats.Stats(sys.argv[1]) 19 | stats.sort_stats('time') 20 | stats.print_stats(.01) 21 | stats.print_callers(.01) 22 | -------------------------------------------------------------------------------- /semanticizer/util/store_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import sys, os, re, argparse, urllib, urllib2, json 15 | from collections import defaultdict 16 | from timer import Timer 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description='Process and store a dataset.', 21 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 22 | 23 | parser.add_argument('datafiles', metavar='file', nargs='+', 24 | help='a set of datafiles to process') 25 | 26 | group = parser.add_argument_group('Semanticizer') 27 | group.add_argument('--url', default='http://localhost:5000/', 28 | help='URL where the semanticizer webservice is running') 29 | group.add_argument('--language', metavar='langcode', 30 | default='en', 31 | help='Language of the semanticizer (2 letters, eg. 
en)') 32 | group.add_argument('--semanticize', nargs=2, action='append', 33 | metavar=('setting', 'value'), 34 | default=[('save', "true")], 35 | help='Setting for the semanticizer call') 36 | 37 | group = parser.add_argument_group('Feedback') 38 | group.add_argument('--feedback', nargs=3, action='append', 39 | metavar=('type', 'pattern', 'replacement'), 40 | help='Pattern to generate feedback filenames ' 41 | '(default: positive "\\.txt$" ".positives.txt")') 42 | group.add_argument('--default', 43 | default='negative', metavar='type', 44 | help='Default type of feedback') 45 | group.add_argument('--no-default', action='store_true', 46 | help='Do not use default feedback') 47 | 48 | group = parser.add_argument_group('Context') 49 | group.add_argument('--context-pattern', nargs=2, 50 | metavar=('pattern', 'replacement'), 51 | default=('^(?:.*/)*(.*?)(?:\.txt)?$', '\g<1>'), 52 | help='Pattern to generate context from filename') 53 | group.add_argument('--context-prefix', 54 | metavar='prefix', default='', 55 | help='Prefix to add to the context') 56 | 57 | args = parser.parse_args() 58 | if not args.feedback: 59 | args.feedback = [('positive', '\.txt$', '.positives.txt')] 60 | 61 | return args 62 | 63 | def store_dataset(args): 64 | semanticize_url = '%ssemanticize/%s' % (args.url, args.language) 65 | request_ids = defaultdict(list) 66 | for filename in args.datafiles: 67 | assert os.path.exists(filename) 68 | context = args.context_prefix + re.sub(args.context_pattern[0], \ 69 | args.context_pattern[1], \ 70 | filename) 71 | 72 | with Timer("Semanticizing %s" % filename, 'timer'): 73 | with open(filename) as file: 74 | lines = file.readlines() 75 | print "Read %d lines from %s." % (len(lines), filename) 76 | 77 | for line in lines: 78 | data = [("context", context), ("text", line.strip())] 79 | data.extend(args.semanticize) 80 | url_data = urllib.urlencode(data) 81 | result = json.loads(urllib2.urlopen(semanticize_url, 82 | url_data).read()) 83 | print "Request %s: %d links" % \ 84 | (result["request_id"], len(result["links"])) 85 | request_ids[filename].append(result["request_id"]) 86 | 87 | with Timer("Feedback for %s" % context, 'timer'): 88 | feedback = [] 89 | for (feedback_type, pattern, replacement) in args.feedback: 90 | feedback_filename = re.sub(pattern, replacement, filename) 91 | if not os.path.exists(feedback_filename): 92 | print feedback_filename, "does not exist" 93 | continue 94 | with open(feedback_filename) as file: 95 | lines = file.readlines() 96 | print "Read %d lines of %s feedback from %s." % \ 97 | (len(lines), feedback_type, feedback_filename) 98 | for line in lines: 99 | feedback.append((feedback_type, line.strip())) 100 | 101 | if not args.no_default: 102 | feedback.append(("default", args.default)) 103 | 104 | feedback_url = args.url + 'feedback/' + context 105 | url_data = urllib.urlencode(feedback) 106 | result = urllib2.urlopen(feedback_url, url_data).read() 107 | print "%d items of feedback for %s: %s" % \ 108 | (len(feedback), context, result) 109 | 110 | if __name__ == '__main__': 111 | args = parse_args() 112 | 113 | with Timer("Storing %d files" % len(args.datafiles), 'timer'): 114 | store_dataset(args) 115 | -------------------------------------------------------------------------------- /semanticizer/util/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. 
This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import time 15 | 16 | class Timer(object): 17 | """Convience method to time activities. Can be used as context.""" 18 | 19 | def __init__(self, activity, name=None): 20 | self.name = name 21 | self.activity = activity 22 | self.tstart = time.time() 23 | 24 | def __del__(self): 25 | if self.name: print '[%s]' % self.name, 26 | print self.activity, 27 | print 'took %s seconds.' % (time.time() - self.tstart) 28 | 29 | def __enter__(self): 30 | pass 31 | 32 | def __exit__(self, type, value, traceback): 33 | pass 34 | -------------------------------------------------------------------------------- /semanticizer/wpm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/wpm/__init__.py -------------------------------------------------------------------------------- /semanticizer/wpm/data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
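The Timer above is used as a context manager elsewhere in this package (online_learning.py and store_dataset.py); a minimal sketch of that pattern, with an illustrative activity name:

# Hypothetical usage of util/timer.py; note that the report is printed from
# __del__, i.e. when the Timer object is garbage-collected after the block,
# not from __exit__.
from semanticizer.util.timer import Timer

with Timer("Storing 3 files", 'timer'):
    pass  # the work being timed goes here
# prints something like: [timer] Storing 3 files took 0.000123 seconds.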
13 | import json 14 | 15 | from .load import WpmLoader 16 | from .namespace import WpmNS 17 | 18 | wpm_dumps = {} 19 | 20 | def init_datasource(wpm_languages, settings): 21 | """Set the datasource and init it""" 22 | for langcode, langconfig in wpm_languages.iteritems(): 23 | load_wpm_data(langconfig['source'], langcode, settings, **langconfig['initparams']) 24 | 25 | def load_wpm_data(datasource, langcode, settings, **kwargs): 26 | if datasource == "redis": 27 | from .db.redisdb import RedisDB 28 | db = RedisDB(**kwargs) 29 | elif datasource == "memory": 30 | from .db.inmemory import MemoryDB 31 | db = MemoryDB() 32 | elif datasource == "mongo": 33 | from .db.mongodb import MongoDB 34 | db = MongoDB() 35 | #load wpm data into memory 36 | WpmLoader(db, langcode, settings, **kwargs) 37 | else: 38 | raise ValueError("Unknown backend {}".format(datasource)) 39 | wpm_dumps[langcode] = WpmData(db, langcode) 40 | 41 | 42 | class WpmData: 43 | 44 | def __init__(self, db, langcode): 45 | 46 | #set database [memory or redis] 47 | self.db = db 48 | 49 | #get current db version 50 | self.version = self.db.get(langcode+":version") 51 | 52 | #load correct NameSpace 53 | self.ns = WpmNS(db, langcode, self.version) 54 | 55 | def entity_exists(self, entity): 56 | return self.exists(self.ns.label(entity)) 57 | 58 | def normalized_entities_exist(self, entities): 59 | with self.db.pipeline() as pipe: 60 | for e in entities: 61 | pipe.exists(self.ns.normalized(e)) 62 | return pipe.execute() 63 | 64 | def get_all_entities(self, normalized_entity): 65 | return self.db.smembers(self.ns.normalized(normalized_entity)) 66 | 67 | def get_entity_data(self, entity): 68 | entity_data = self.db.lrange(self.ns.label(entity) , 0, -1) 69 | senses = [] 70 | if len(entity_data) > 4: 71 | senses = entity_data[4:] 72 | return {'cntlinkocc': int(entity_data[0]), 73 | 'cntlinkdoc': int(entity_data[1]), 74 | 'cnttextocc': int(entity_data[2]), 75 | 'cnttextdoc': int(entity_data[3]), 76 | 'senses': senses} 77 | 78 | def get_sense_data(self, entity, sense): 79 | sense_data = self.db.lrange(self.ns.label_sense(entity, sense), 0, -1) 80 | return {'cntlinkocc': int(sense_data[0]), 81 | 'cntlinkdoc': int(sense_data[1]), 82 | 'from_title': sense_data[2], 83 | 'from_redir': sense_data[3]} 84 | 85 | def get_item_id(self, title): 86 | return self.db.get(self.ns.page_id(title)) 87 | 88 | def get_item_ids(self, *titles): 89 | with self.db.pipeline() as pipe: 90 | for title in titles: 91 | pipe.get(self.ns.page_id(title)) 92 | return pipe.execute() 93 | 94 | def get_item_title(self, pid): 95 | return self.db.get(self.ns.page_title(pid)) 96 | 97 | def get_item_inlinks(self, pid): 98 | return self.db.lrange(self.ns.page_inlinks(pid), 0, -1) 99 | 100 | def get_item_outlinks(self, pid): 101 | return self.db.lrange(self.ns.page_outlinks(pid), 0, -1) 102 | 103 | def get_item_categories(self, pid): 104 | return self.db.get(self.ns.page_categories(pid)) 105 | 106 | def get_item_definition(self, pid): 107 | return self.db.get(self.ns.page_definition(pid)) 108 | 109 | def get_item_labels(self, pid): 110 | json_labels = self.db.lrange(self.ns.page_labels(pid), 0, -1) 111 | results = [] 112 | for json_label in json_labels: 113 | label = json.loads(json_label) 114 | results.append({ 115 | 'title': label[0], 116 | 'occurances': label[1], 117 | 'fromRedirect': label[2], 118 | 'fromTitle': label[3], 119 | 'isPrimary': label[4], 120 | 'proportion': label[5] 121 | }) 122 | return results 123 | 124 | def sense_has_trnsl(self, sid): 125 | return 
self.db.exists(self.ns.translation_sense(sid)) 126 | 127 | def get_trnsl_langs(self, sid): 128 | return self.db.lrange(self.ns.translation_sense(sid), 0, -1) 129 | 130 | def get_sense_trnsl(self, sid, lang): 131 | return self.db.get(self.ns.translation_sense_language(sid, lang)) 132 | 133 | def get_wikipedia_name(self): 134 | path = self.db.get(self.ns.wiki_path()) 135 | if path[-1] == '/': 136 | return path.split('/')[-2] 137 | return path.split('/')[-1] 138 | 139 | def get_data_path(self): 140 | return self.db.get(self.ns.wiki_path()) 141 | 142 | def get_lang_name(self): 143 | return self.db.get(self.ns.wiki_language_name()) 144 | 145 | def get_title_ngram_score(self, title): 146 | nr_of_tokens = len(title.split()) 147 | return self.db.zscore(self.ns.ngramscore(str(nr_of_tokens)), title) 148 | 149 | def get_stat(self, value): 150 | return self.db.get(self.ns.wiki_stats(value)) 151 | 152 | def get_articles(self, *pids): 153 | pipe = self.db.pipeline() 154 | for pid in pids: 155 | pipe.lrange(self.ns.page_inlinks(pid), 0, -1) 156 | pipe.lrange(self.ns.page_outlinks(pid), 0, -1) 157 | pipe.lrange(self.ns.page_labels(pid), 0, -1) 158 | data = pipe.execute() 159 | 160 | results = [] 161 | for i in xrange(0, len(data)-1, 3): 162 | labels = [] 163 | json_labels = data[i+2] 164 | for json_label in json_labels: 165 | label = json.loads(json_label) 166 | labels.append({ 167 | 'title': label[0], 168 | 'occurances': label[1], 169 | 'fromRedirect': label[2], 170 | 'fromTitle': label[3], 171 | 'isPrimary': label[4], 172 | 'proportion': label[5] 173 | }) 174 | result = { 175 | "InLinks":data[i], 176 | "OutLinks":data[i+1], 177 | "Labels":labels 178 | } 179 | results.append(result) 180 | return results 181 | -------------------------------------------------------------------------------- /semanticizer/wpm/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/wpm/db/__init__.py -------------------------------------------------------------------------------- /semanticizer/wpm/db/inmemory.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
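A minimal sketch of reading from a loaded dump through the WpmData accessors above; it assumes init_datasource() has already populated wpm_dumps for 'en', and the title and page id are illustrative.

# Hypothetical lookups against a loaded English dump.
from semanticizer.wpm.data import wpm_dumps

wpm = wpm_dumps['en']                       # WpmData created by init_datasource()
pid = wpm.get_item_id('Amsterdam')          # title -> page id
title = wpm.get_item_title(pid)             # page id -> title
entity = wpm.get_entity_data('Amsterdam')   # cntlinkocc/cntlinkdoc/... plus senses
# get_articles() batches the three lrange calls per page id through one pipeline
# round trip and regroups the results per article:
inlinks = wpm.get_articles(pid)[0]["InLinks"]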
13 | 14 | 15 | 16 | class MemoryDB: 17 | #store all data in memory instead of redis, mimic redis functions 18 | def __init__(self, **kwargs): 19 | self.cache = dict() 20 | 21 | def pipeline(self, **kwargs): 22 | return Pipe(self.cache) 23 | 24 | def exists(self, key): 25 | return key in self.cache 26 | 27 | def keys(self, key): 28 | key = key.replace("*", "") 29 | # simple abstraction of redis wildcard key search, only valid for startswith equivalent search which should be sufficient, probably faster then full regular expression search over keys 30 | return [k for k in self.cache.iterkeys() if k.startswith(key)] 31 | 32 | def get(self, key): 33 | return self.cache[key] 34 | 35 | def set(self, key, value): 36 | self.cache[key] = value 37 | return True 38 | 39 | def smembers(self, key): 40 | return self.get(key) 41 | 42 | def sismember(self, key, value): 43 | return value in self.cache[key] 44 | 45 | def sadd(self, key, *values): 46 | self.cache.setdefault(key, set()).update(values) 47 | return [True]*len(values) 48 | 49 | def lrange(self, key, start=0, end=-1): 50 | data = self.cache.get(key, list()) 51 | if end < -1: 52 | return data[start:end+1] 53 | elif end == -1: 54 | return data[start:] 55 | else: 56 | return data[start:end] 57 | 58 | def rpush(self, key, *values): 59 | self.cache.setdefault(key, []).extend(values) 60 | return [True]*len(values) 61 | 62 | def zscore(self, key, value): 63 | return self.cache[key][value] 64 | 65 | def zincrby(self, key, value, amount=1): 66 | # in case value does not exist init 67 | if not value in self.cache.setdefault(key, {}): 68 | self.cache[key][value] = amount 69 | else: 70 | self.cache[key][value] += amount 71 | return self.cache[key][value] 72 | 73 | def delete(self,*keys): 74 | for key in keys: 75 | self.cache.pop(key, None) 76 | return True 77 | 78 | 79 | #proxy all returns to pipe class 80 | class Proxy(object): 81 | def __getattribute__(self,name): 82 | attr = object.__getattribute__(self, name) 83 | if hasattr(attr, '__call__') and name not in ["execute", "reset"]: 84 | def newfunc(*args, **kwargs): 85 | result = attr(*args, **kwargs) 86 | self.results.append(result) 87 | return True 88 | return newfunc 89 | else: 90 | return attr 91 | 92 | #implicity add a decorator Proxy to all functions of MemoryDB to fetch all returns and output them on execute 93 | class Pipe(Proxy, MemoryDB): 94 | def __init__(self, cache): 95 | self.reset() 96 | self.cache = cache 97 | 98 | def __enter__(self): 99 | return self 100 | 101 | def __exit__(self, exc_type, exc_value, traceback): 102 | self.reset() 103 | 104 | def __del__(self): 105 | try: 106 | self.reset() 107 | except Exception: 108 | pass 109 | 110 | def __len__(self): 111 | return len(self.results) 112 | 113 | def reset(self): 114 | self.results = [] 115 | 116 | def execute(self): 117 | results = self.results 118 | self.reset() 119 | return results -------------------------------------------------------------------------------- /semanticizer/wpm/db/mongodb.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 
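A minimal sketch of the pipeline behaviour that MemoryDB above mimics: inside a pipeline every proxied call immediately returns True, and the real return values come back, in call order, from execute(). The key name is illustrative.

# Hypothetical use of the in-memory backend and its redis-style pipeline.
from semanticizer.wpm.db.inmemory import MemoryDB

db = MemoryDB()
db.rpush('en:1:page:123:inlinks', '4', '5')
with db.pipeline() as pipe:
    pipe.exists('en:1:page:123:inlinks')         # queued; the call itself returns True
    pipe.lrange('en:1:page:123:inlinks', 0, -1)
    results = pipe.execute()                     # [True, ['4', '5']]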
5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | from pymongo import MongoClient 15 | 16 | class MongoDB: 17 | def __init__(self, host='localhost', port=27017, **kwargs): 18 | global client 19 | client = MongoClient(host, port) 20 | 21 | def pipeline(self, **kwargs): 22 | return Pipe() 23 | 24 | def exists(self, key): 25 | item = client.sem.wpm.find_one( {"_id": key}) 26 | return False if not item else True 27 | 28 | def keys(self, key): 29 | item = client.sem.wpm.find( {"_id": "/"+key+"/"}) 30 | return [] if not item else item 31 | 32 | def get(self, key): 33 | item = client.sem.wpm.find_one( {"_id": key}) 34 | return item['value'] 35 | 36 | def set(self, key, value): 37 | client.sem.wpm.save( {"_id":key, "value": value}) 38 | return True 39 | 40 | def smembers(self, key): 41 | return self.get(key) 42 | 43 | def sismember(self, key, value): 44 | item = client.sem.wpm.find_one( {"_id": key}) 45 | return False if not item else value in item['value'] 46 | 47 | def sadd(self, key, *values): 48 | item = client.sem.wpm.find_one( {"_id": key}) 49 | svalue = set(values) if not item else set(list(item['value']) + list(values)) 50 | client.sem.wpm.update( {"_id":key},{'$set':{'value': list(svalue)}},upsert=True, multi=False) 51 | return [True]*len(values) 52 | 53 | def lrange(self, key, start, end): 54 | item = client.sem.wpm.find_one( {"_id": key}) 55 | return [] if not item else value in item['value'][start:end] 56 | 57 | def rpush(self, key, *values): 58 | item = client.sem.wpm.find_one( {"_id": key}) 59 | lvalue = list(values) if not item else list(item['value']) + list(values) 60 | client.sem.wpm.update( {"_id":key},{'$set':{'value': lvalue}},upsert=True, multi=False) 61 | return [True]*len(values) 62 | 63 | def zscore(self, key, value): 64 | item = client.sem.wpm.find_one( {"_id": key}) 65 | subkey = ":"+str(value)+":" 66 | if not item: 67 | return None 68 | if not subkey in item: 69 | return None 70 | return item[subkey] 71 | 72 | def zincrby(self, key, value, ammount=1): 73 | client.sem.wpm.update( {"_id":key},{'$inc':{":"+str(value)+":": 1}},upsert=True, multi=False) 74 | return True 75 | 76 | def delete(self,*keys): 77 | for key in keys: 78 | client.sem.wpm.remove({"_id":key}) 79 | return True 80 | 81 | #proxy all returns to pipe class 82 | class Proxy(object): 83 | def __getattribute__(self,name): 84 | attr = object.__getattribute__(self, name) 85 | if hasattr(attr, '__call__'): 86 | def newfunc(*args, **kwargs): 87 | result = attr(*args, **kwargs) 88 | self.results.append(result) 89 | return True 90 | return newfunc 91 | else: 92 | return attr 93 | 94 | #implicity add a decorator Proxy to all functions of MongoDB to fetch all returns and output them on execute 95 | class Pipe(Proxy, MongoDB): 96 | def __init__(self): 97 | self.reset() 98 | 99 | def __enter__(self): 100 | return self 101 | 102 | def __exit__(self, exc_type, exc_value, traceback): 103 | self.reset() 104 | 105 | def __del__(self): 106 | try: 107 | self.reset() 108 | except Exception: 109 | pass 110 | 111 | def __len__(self): 112 | return len(self.results) 113 | 114 | def reset(self): 115 | self.results = [] 116 | 117 | def execute(self): 118 | results = self.results 
119 | self.reset() 120 | return results -------------------------------------------------------------------------------- /semanticizer/wpm/db/redisdb.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import redis 15 | 16 | def RedisDB(host='localhost', port=6379, **kwargs): 17 | return redis.StrictRedis(host=host, port=port, db=0, decode_responses=True) -------------------------------------------------------------------------------- /semanticizer/wpm/namespace.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | class WpmNS: 15 | def __init__(self, db, langcode, version=None): 16 | self.sep = ':' 17 | self.lc = langcode 18 | self.db = db 19 | self.manual_version = version 20 | 21 | def version (self): 22 | if self.manual_version: 23 | return self.manual_version 24 | version = self.db.get(self.db_version()) 25 | if not version: 26 | raise Exception("No database version") 27 | return version 28 | 29 | def db_version(self): 30 | """ 31 | key 32 | :db:version 33 | value 34 | string(cache version) 35 | """ 36 | return self.sep.join( (self.lc, "db", "version") ) 37 | 38 | def wiki_language_name(self): 39 | """ 40 | key 41 | ::wiki:lname 42 | value 43 | string(wiki name) 44 | """ 45 | return self.sep.join( (self.lc, self.version(), "wiki", "lname") ) 46 | 47 | def wiki_path(self): 48 | """ 49 | key 50 | ::wiki:path 51 | value 52 | string(wiki path) 53 | """ 54 | return self.sep.join( (self.lc, self.version(), "wiki", "path") ) 55 | 56 | def wiki_stats(self, statName): 57 | """ 58 | key 59 | ::wiki:stats: 60 | value 61 | string(stats) 62 | """ 63 | return self.sep.join( (self.lc, self.version(), "wiki", "stats", statName) ) 64 | 65 | def label(self, name): 66 | """ 67 | key 68 | ::label: 69 | value 70 | list( LinkOccCount, LinkDocCount, TextOccCount, TextDocCount, SenseId, SenseId, ..) 
71 | """ 72 | return self.sep.join( (self.lc, self.version(), "label", name) ) 73 | 74 | def label_sense(self, name, senseid): 75 | """ 76 | key 77 | ::label:: 78 | value 79 | list( sLinkDocCount, sLinkOccCount, FromTitle, FromRedirect) 80 | """ 81 | return self.sep.join( (self.lc, self.version(), "label", name, senseid) ) 82 | 83 | def normalized(self, name): 84 | """ 85 | key 86 | ::norm: 87 | value 88 | set( name, name, ... ) 89 | """ 90 | return self.sep.join( (self.lc, self.version(), "norm", name) ) 91 | 92 | def translation_sense(self, senseid): 93 | """ 94 | key 95 | ::trnsl: 96 | value 97 | list( langcode, langcode, ... ) 98 | """ 99 | return self.sep.join( (self.lc, self.version(), "trnsl", senseid) ) 100 | 101 | def translation_sense_language(self, senseid, langcode): 102 | """ 103 | key 104 | ::trnsl:: 105 | value 106 | string(name) 107 | """ 108 | return self.sep.join( (self.lc, self.version(), "trnsl", senseid, langcode) ) 109 | 110 | def page_id(self, name): 111 | """ 112 | key 113 | ::page:id 114 | value 115 | string(id) 116 | """ 117 | return self.sep.join( (self.lc, self.version(), "page", "id", name) ) 118 | 119 | def page_title(self, pageid): 120 | """ 121 | key 122 | ::page::name 123 | value 124 | string(name) 125 | """ 126 | return self.sep.join( (self.lc, self.version(), "page", pageid, "name") ) 127 | 128 | def page_labels(self, pageid): 129 | """ 130 | key 131 | ::page::labels 132 | value 133 | list( json([title, occurances, fromRedirect, fromTitle isPrimary, proportion]), ...) 134 | """ 135 | return self.sep.join( (self.lc, self.version(), "page", pageid, "labels") ) 136 | 137 | def page_definition(self, pageid): 138 | """ 139 | key 140 | ::page::definition 141 | value 142 | string(synopsis) 143 | """ 144 | return self.sep.join( (self.lc, self.version(), "page", pageid, "definition") ) 145 | 146 | def page_inlinks(self, pageid): 147 | """ 148 | key 149 | ::page::inlinks 150 | value 151 | list( pageid, pageid, ... ) 152 | """ 153 | return self.sep.join( (self.lc, self.version(), "page", pageid, "inlinks") ) 154 | 155 | 156 | def page_outlinks(self, pageid): 157 | """ 158 | key 159 | ::page::outlinks 160 | value 161 | list( pageid, pageid, ... ) 162 | """ 163 | return self.sep.join( (self.lc, self.version(), "page", pageid, "outlinks") ) 164 | 165 | def page_categories(self, pageid): 166 | """ 167 | key 168 | ::page::categories 169 | value 170 | list( category, category, ... ) 171 | """ 172 | return self.sep.join( (self.lc, self.version(), "page", pageid, "categories") ) 173 | 174 | 175 | def ngramscore(self, n): 176 | """ 177 | key 178 | ::grms 179 | value 180 | zset([words{score}, [...]])translation_sense 181 | """ 182 | return self.sep.join( (self.lc, self.version(), "%sgrms" % n) ) 183 | -------------------------------------------------------------------------------- /semanticizer/wpm/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 
10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import math 15 | import re 16 | import unicodedata 17 | import sys 18 | 19 | from .markup_stripper import MarkupStripper 20 | 21 | dump_filenames = { 22 | 'translations': 'translations.csv', 23 | 'stats': 'stats.csv', 24 | 'labels': 'label.csv', 25 | 'pages': 'page.csv', 26 | 'pageLabels': 'pageLabel.csv', 27 | 'pageCategories': 'articleParents.csv', 28 | 'inlinks': 'pageLinkIn.csv', 29 | 'outlinks': 'pageLinkOut.csv' 30 | } 31 | 32 | 33 | def normalize(raw, dash=True, accents=True, lower=True): 34 | """Replaces hyphens with spaces, removes accents, lower cases and 35 | strips the input text. 36 | 37 | All steps, except for the strip(), can be disabled with the 38 | optional arguments. 39 | """ 40 | text = raw 41 | if dash: 42 | text = text.replace('-', ' ') 43 | if accents: 44 | text = remove_accents(text) 45 | if lower: 46 | text = text.lower() 47 | text = text.strip() 48 | return text if len(text) else raw 49 | 50 | 51 | def remove_accents(input_str): 52 | """Replaces accented characters in the input with their 53 | non-accented counterpart.""" 54 | if isinstance(input_str, str): 55 | input_unicode = input_str.decode(errors="ignore") 56 | else: 57 | input_unicode = input_str 58 | nkfd_form = unicodedata.normalize('NFKD', input_unicode) 59 | return u"".join([c for c in nkfd_form if not unicodedata.combining(c)]) 60 | 61 | 62 | def check_dump_path(path, settings): 63 | """ 64 | Checks whether a path exists and raises an error if it doesn't. 65 | 66 | @param path: The pathname to check 67 | @raise IOError: If the path doesn't exist or isn't readbale 68 | """ 69 | import os 70 | import glob 71 | pathlist = [os.path.normpath(path) + os.sep, 72 | os.path.normpath(os.path.abspath(path)) + os.sep] 73 | for fullpath in pathlist: 74 | print "Checking " + fullpath 75 | if os.path.exists(fullpath): 76 | for filetype, filename in dump_filenames.iteritems(): 77 | if os.path.isfile(fullpath + filename) == True: 78 | print "Found " + fullpath + filename 79 | else: 80 | raise IOError("Cannot find " + fullpath + filename) 81 | if settings.get("include_definitions", True): 82 | wiki = glob.glob(fullpath + '*-pages-articles.xml') 83 | if len(wiki) > 0: 84 | print "Found " + wiki[0] 85 | else: 86 | raise IOError("Cannot find wiki *-pages-articles.xml") 87 | return fullpath 88 | else: 89 | print fullpath + " doesn't exist" 90 | raise IOError("Cannot find " + path) 91 | 92 | 93 | def get_relatedness(linksA, linksB): 94 | """ 95 | Compare relatedness of 2 articles based on in or outlinks. 
96 |
97 | @param linksA: in or out links of article A
98 | @param linksB: in or out links of article B
99 | """
100 | if not linksA or not linksB:
101 | return 0.0
102 |
103 | if linksA == linksB:
104 | return 1.0
105 |
106 | intersection = 0
107 | indexA = 0
108 | indexB = 0
109 |
110 | while indexA < len(linksA) or indexB < len(linksB):
111 | useA = False
112 | useB = False
113 |
114 | linkA = None
115 | linkB = None
116 |
117 | if indexA < len(linksA):
118 | linkA = linksA[indexA]
119 |
120 | if indexB < len(linksB):
121 | linkB = linksB[indexB]
122 |
123 | if linkA and linkB and linkA == linkB:
124 | useA = True
125 | useB = True
126 | intersection += 1
127 | else:
128 | if linkA and (not linkB or linkA < linkB):
129 | useA = True
130 | # NOTE: the original Java implementation also counts a direct link
131 | # from article A to article B here, but the article ids are not passed to this port.
132 | else:
133 | useB = True
134 | # (likewise for a direct link from article B to article A)
135 |
136 |
137 | if useA:
138 | indexA += 1
139 | if useB:
140 | indexB += 1
141 |
142 | googleMeasure = None
143 |
144 | if intersection == 0:
145 | googleMeasure = 1.0
146 | else:
147 | a = math.log(len(linksA))
148 | b = math.log(len(linksB))
149 | ab = math.log(intersection)
150 | m = math.log(len(linksA) + len(linksB))  # stand-in for the log of the total article count, which is not available here
151 | googleMeasure = (max(a, b) - ab) / (m - min(a, b))
152 |
153 | #normalize
154 | if googleMeasure is None:  # 0 means maximally related, so only bail out if unset
155 | return 0
156 | if googleMeasure >= 1:
157 | return 0
158 |
159 | return 1 - googleMeasure
160 |
161 | def generate_markup_definition(markup):
162 | """
163 | Strip wiki markup and convert some wiki tags to html
164 |
165 | @param markup: wiki markup
166 | """
167 | stripper = MarkupStripper()
168 |
169 | # strip markup
170 | markup = re.sub("={2,}(.+)={2,}", "\n", markup) #clear section headings completely - not just formatting, but content as well.
171 | markup = stripper.strip_all_but_internal_links_and_emphasis(markup)
172 | markup = stripper.strip_non_article_internal_links(markup)
173 | markup = stripper.strip_excess_newlines(markup)
174 |
175 | # convert wiki tags to html
176 | markup = stripper.emphasisResolver.resolve_emphasis(markup)
177 |
178 | # todo convert links
179 | #...
180 |
181 | # slice markup to definition
182 | fp = ""
183 | pos = 0
184 | p = re.compile("\n\n", re.DOTALL)
185 | for m in p.finditer(markup):
186 | fp = markup[0:pos]
187 | if (pos > 150):
188 | break
189 | pos = m.start()+2
190 | fp = re.sub("\n", " ", fp)
191 | fp = re.sub("\\s+", " ", fp) #turn all whitespace into spaces, and collapse them.
192 | fp = fp.strip()
193 |
194 | return fp
195 |
196 | def cli_progress(current, total, bar_length=40):
197 | """
198 | Shows a progress bar in the CLI.
199 |
200 | @param current: int of current step
201 | @param total: int of total steps
202 | @param bar_length: length of the progressbar in cli window
203 | """
204 | percent = float(current) / total
205 | hashes = '#' * int(round(percent * bar_length))
206 | spaces = ' ' * (bar_length - len(hashes))
207 | sys.stdout.write("\rPercent: [{0}] {1}%".format(hashes + spaces, int(round(percent * 100))))
208 | sys.stdout.flush()
209 | -------------------------------------------------------------------------------- /semanticizer/wpm/utils/emphasis_resolver.py: --------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import re
15 |
16 | # This replaces MediaWiki syntax for '''bold''' and ''italic'' text with the equivalent html markup.
17 | class EmphasisResolver:
18 | def resolve_emphasis(self, text):
19 | sb = []
20 | for line in text.split("\n"):
21 | sb.append(self.resolve_line(line))
22 | sb.append("\n")
23 |
24 | result = "".join(sb)
25 | result = result[:-1]
26 | return result
27 |
28 | # This is a direct translation of the php function doAllQuotes used by the original MediaWiki software.
29 | #
30 | # @param line the line to resolve emphasis within
31 | # @return the line, with all emphasis markup resolved to html tags
32 | #
33 | def resolve_line(self, line):
34 |
35 | #print "Resolving line '" + line + "'"
36 |
37 | arr = self.get_splits("$"+line)
38 | if len(arr) <= 1:
39 | return line
40 |
41 | # First, do some preliminary work. This may shift some apostrophes from
42 | # being mark-up to being text. It also counts the number of occurrences
43 | # of bold and italics mark-ups.
44 |
45 | numBold = 0
46 | numItalics = 0
47 |
48 | for i, value in enumerate(arr):
49 | if (i % 2 == 1):
50 | # If there are ever four apostrophes, assume the first is supposed to
51 | # be text, and the remaining three constitute mark-up for bold text.
52 | if (len(arr[i]) == 4):
53 | arr[i-1] = arr[i-1] + "'"
54 | arr[i] = self.get_filled_string(3)
55 | elif len(arr[i]) > 5:
56 | # If there are more than 5 apostrophes in a row, assume they're all
57 | # text except for the last 5.
58 | arr[i-1] = arr[i-1] + self.get_filled_string(len(arr[i])-5)
59 | arr[i] = self.get_filled_string(5)
60 |
61 | size = len(arr[i])
62 | if size == 2:
63 | numItalics += 1
64 | elif size == 3:
65 | numBold += 1
66 | elif size == 5:
67 | numItalics += 1
68 | numBold += 1
69 |
70 | # If there is an odd number of both bold and italics, it is likely
71 | # that one of the bold ones was meant to be an apostrophe followed
72 | # by italics. Which one we cannot know for certain, but it is more
73 | # likely to be one that has a single-letter word before it.
74 | if (numBold%2==1) and (numItalics%2==1):
75 | i = 0
76 | firstSingleLetterWord = -1
77 | firstMultiLetterWord = -1
78 | firstSpace = -1
79 |
80 | for r in arr:
81 | if i%2==1 and len(r)==3:
82 | x1 = arr[i-1][len(arr[i-1])-1]
83 | x2 = arr[i-1][len(arr[i-1])-2]
84 | if x1==' ':
85 | if firstSpace == -1:
86 | firstSpace = i
87 | elif x2==' ':
88 | if firstSingleLetterWord == -1:
89 | firstSingleLetterWord = i
90 | else:
91 | if firstMultiLetterWord == -1:
92 | firstMultiLetterWord = i
93 |
94 | i += 1
95 |
96 | # If there is a single-letter word, use it!
97 | if firstSingleLetterWord > -1:
98 | arr[firstSingleLetterWord] = "''"
99 | arr[firstSingleLetterWord-1] = arr[firstSingleLetterWord-1] + "'"
100 | elif firstMultiLetterWord > -1:
101 | # If not, but there's a multi-letter word, use that one.
102 | arr[firstMultiLetterWord] = "''"
103 | arr[firstMultiLetterWord-1] = arr[firstMultiLetterWord-1] + "'"
104 | elif firstSpace > -1:
105 | # ... otherwise use the first one that has neither.
106 | # (notice that it is possible for all three to be -1 if, for example,
107 | # there is only one pentuple-apostrophe in the line)
108 | arr[firstSpace] = "''"
109 | arr[firstSpace-1] = arr[firstSpace-1] + "'"
110 |
111 | # Now let's actually convert our apostrophic mush to HTML!
112 |
113 | output = []
114 | buffer = []
115 | state = ""
116 | i = 0
117 | for r in arr:
118 | if i%2==0:
119 | if state == "both":
120 | buffer.append(r)
121 | else:
122 | output.append(r)
123 | else:
124 | if len(r) == 2:
125 | if state == "i":
126 | output.append("</i>")
127 | state = ""
128 | elif state == "bi":
129 | output.append("</i>")
130 | state = "b"
131 | elif state == "ib":
132 | output.append("</b></i><b>")
133 | state = "b"
134 | elif state == "both":
135 | output.append("<b><i>")
136 | output.append("".join(buffer))
137 | output.append("</i>")
138 | state = "b"
139 | else:
140 | # $state can be "b" or ""
141 | output.append("<i>")
142 | state = state + "i"
143 | elif len(r) == 3:
144 | if state == "b":
145 | output.append("</b>")
146 | state = ""
147 | elif state == "bi":
148 | output.append("</i></b><i>")
149 | state = "i"
150 | elif state == "ib":
151 | output.append("</b>")
152 | state = "i"
153 | elif state == "both":
154 | output.append("<i><b>")
155 | output.append("".join(buffer))
156 | output.append("</b>")
157 | state = "i"
158 | else:
159 | # $state can be "i" or ""
160 | output.append("<b>")
161 | state = state + "b"
162 | elif len(r) == 5:
163 | if state == "b":
164 | output.append("</b><i>")
165 | state = "i"
166 | elif state == "i":
167 | output.append("</i><b>")
168 | state = "b"
169 | elif state == "bi":
170 | output.append("</i></b>")
171 | state = ""
172 | elif state == "ib":
173 | output.append("</b></i>")
174 | state = ""
175 | elif state == "both":
176 | output.append("<i><b>")
177 | output.append("".join(buffer))
178 | output.append("</b></i>")
179 | state = ""
180 | else:
181 | # ($state == "")
182 | buffer = []
183 | state = "both"
184 | i += 1
185 |
186 |
187 | # Now close all remaining tags. Notice that the order is important.
188 | if state == "b" or state == "ib":
189 | output.append("</b>")
190 |
191 | if state == "i" or state == "bi" or state == "ib":
192 | output.append("</i>")
193 | if state == "bi":
194 | output.append("</b>")
195 |
196 | # There might be lonely ''''', so make sure we have a buffer
197 | if state == "both" and len(buffer) > 0:
198 | output.append("<b><i>")
199 | output.append("".join(buffer))
200 | output.append("</i></b>")
201 |
202 | #remove leading $
203 | output = "".join(output)
204 |
205 | return output[1:]
206 |
207 |
208 |
209 | # Does the same job as php function preg_split
210 | def get_splits(self, text):
211 | #return re.split("\\'{2,}", text)
212 | splits = []
213 | lastCopyIndex = 0
214 | p = re.compile("\\'{2,}")
215 |
216 | for m in p.finditer(text):
217 | if m.start() > lastCopyIndex:
218 | splits.append( text[lastCopyIndex: m.start()] )
219 | splits.append( m.group() )
220 | lastCopyIndex = m.end()
221 |
222 | if lastCopyIndex < len(text):
223 | splits.append(text[lastCopyIndex:])
224 |
225 | return splits
226 |
227 |
228 | def get_filled_string(self, length):
229 | sb = []
230 | for i in xrange(0, length):
231 | sb.append("'")
232 | return "".join(sb)
233 |
234 | ## EmphasisResolver testing using
235 | ## python -m semanticizer.wpm.utils.emphasis_resolver
236 | if __name__ == '__main__':
237 | er = EmphasisResolver()
238 | markup = "'''War''' is an openly declared state of organized [[violent]] [[Group conflict|conflict]], typified by extreme [[aggression]], [[societal]] disruption, and high [[Mortality rate|mortality]].
As a behavior pattern, warlike tendencies are found in many [[primate]] species, including [[humans]], and also found in many [[ant]] species. The set of techniques used by a group to carry out war is known as '''warfare'''." 239 | #markup = "Parsing '''MediaWiki''''s syntax for '''bold''' and ''italic'' markup is a '''''deceptively''' difficult'' task. Whoever came up with the markup scheme should be '''shot'''." ; 240 | print er.resolve_emphasis(markup) -------------------------------------------------------------------------------- /semanticizer/wpm/utils/wikidumps.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 University of Amsterdam 2 | # Copyright 2014 Netherlands eScience Center 3 | # Written by Lars Buitinck. 4 | 5 | """Parsing utilities for Wikipedia database dumps.""" 6 | 7 | from __future__ import print_function 8 | 9 | import re 10 | import xml.etree.ElementTree as etree # don't use LXML, it's slower (!) 11 | 12 | 13 | def _get_namespace(tag): 14 | try: 15 | namespace = re.match(r"^{(.*?)}", tag).group(1) 16 | except AttributeError: 17 | namespace = '' 18 | if not namespace.startswith("http://www.mediawiki.org/xml/export-"): 19 | raise ValueError("namespace %r not recognized as MediaWiki dump" 20 | % namespace) 21 | return namespace 22 | 23 | 24 | def extract_pages(f): 25 | """Extract pages from Wikimedia database dump. 26 | 27 | Parameters 28 | ---------- 29 | f : file-like or str 30 | Handle on Wikimedia article dump. May be any type supported by 31 | etree.iterparse. 32 | 33 | Returns 34 | ------- 35 | pages : iterable over (int, string, string) 36 | Generates (page_id, title, content) triples. 37 | In Python 2.x, may produce either str or unicode strings. 38 | """ 39 | elems = (elem for _, elem in etree.iterparse(f, events=["end"])) 40 | 41 | # We can't rely on the namespace for database dumps, since it's changed 42 | # it every time a small modification to the format is made. So, determine 43 | # those from the first element we find, which will be part of the metadata, 44 | # and construct element paths. 45 | elem = next(elems) 46 | namespace = _get_namespace(elem.tag) 47 | ns_mapping = {"ns": namespace} 48 | page_tag = "{%(ns)s}page" % ns_mapping 49 | text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping 50 | id_path = "./{%(ns)s}id" % ns_mapping 51 | title_path = "./{%(ns)s}title" % ns_mapping 52 | 53 | for elem in elems: 54 | if elem.tag == page_tag: 55 | text = elem.find(text_path).text 56 | if text is None: 57 | # Empty article; these occur in Wikinews dumps. 58 | continue 59 | yield (int(elem.find(id_path).text), 60 | elem.find(title_path).text, 61 | text) 62 | 63 | # Prune the element tree, as per 64 | # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ 65 | # We do this only for s, since we need to inspect the 66 | # ./revision/text element. That shouldn't matter since the pages 67 | # comprise the bulk of the file. 
68 | elem.clear() 69 | 70 | 71 | if __name__ == "__main__": 72 | # Test; will write article info + prefix of content to stdout 73 | import sys 74 | 75 | if len(sys.argv) > 1: 76 | print("usage: %s; will read from standard input" % sys.argv[0], 77 | file=sys.stderr) 78 | sys.exit(1) 79 | 80 | for pageid, title, text in extract_pages(sys.stdin): 81 | title = title.encode("utf-8") 82 | text = text[:40].replace("\n", "_").encode("utf-8") 83 | print("%d '%s' (%s)" % (pageid, title, text)) 84 | -------------------------------------------------------------------------------- /semanticizer_wsgi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 4 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 5 | # General Public License as published by the Free Software Foundation, either 6 | # version 3 of the License, or (at your option) any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, but WITHOUT 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 11 | # for more details. 12 | # 13 | # You should have received a copy of the GNU Lesser General Public License 14 | # along with this program. If not, see . 15 | 16 | """ Semanticizer (WSGI version) 17 | 18 | A stripped down, WSGI compatible, version of the semanticizer. 19 | 20 | Usage: 21 | gunicorn --bind 0.0.0.0:5001 --workers 4 semanticizer_wsgi:application 22 | or 23 | uwsgi --http :5001 --master --processes 4 --wsgi-file semanticizer_wsgi.py 24 | 25 | """ 26 | 27 | import re 28 | from semanticizer.config import config_get 29 | settings = config_get(('settings'), {}) 30 | 31 | # Can do without ujson and simplejson, but speeds up considerably. 32 | try: 33 | import ujson 34 | except ImportError: 35 | pass 36 | try: 37 | import simplejson as json 38 | except ImportError: 39 | import json 40 | 41 | from flask import Flask, Response, request 42 | 43 | from semanticizer import procpipeline 44 | from semanticizer.config import config_get 45 | from semanticizer.wpm.data import init_datasource 46 | 47 | 48 | wpm_languages = config_get(('wpm', 'languages')) 49 | init_datasource(wpm_languages, settings) 50 | PIPELINE = procpipeline.build(wpm_languages) 51 | 52 | # WSGI application! 53 | application = Flask(__name__) 54 | application.debug = True 55 | 56 | 57 | APPLICATION_JSON = "application/json" 58 | 59 | # RegExens for CleanTweet 60 | CLEAN_TWEET = \ 61 | {'user': re.compile(r"(@\w+)"), 62 | 'url': re.compile(r"(http://[a-zA-Z0-9_=\-\.\?&/#]+)"), 63 | 'punctuation': re.compile(r"[-!\"#\$%&'\(\)\*\+,\.\/:;<=>\?@\[\\\]\^_`\{\|\}~]+"), 64 | 'retweet': re.compile(r"(\bRT\b)") 65 | } 66 | 67 | 68 | @application.route('/') 69 | def hello_world(): 70 | """Hello World!""" 71 | return 'Hello World!\n' 72 | 73 | 74 | @application.route('/semanticize/', methods=['GET', 'POST']) 75 | def _semanticize_handler(langcode): 76 | """ 77 | The function handling the /semanticize/ namespace. It uses 78 | the chain-of-command pattern to run all processors, using the 79 | corresponding preprocess, process, and postprocess steps. 
80 | 81 | @param langcode: The language to use in the semanticizing 82 | @return: The body of the response, in this case a json formatted list \ 83 | of links and their relevance 84 | """ 85 | # self.application.logger.debug("Semanticizing: start") 86 | text = _get_text_from_request() 87 | 88 | # self.application.logger.debug("Semanticizing text: " + text) 89 | settings = {"langcode": langcode} 90 | for key, value in request.values.iteritems(): 91 | assert key not in settings 92 | settings[key] = value 93 | 94 | sem_result = _semanticize(langcode, settings, text) 95 | json = _json_dumps(sem_result, "pretty" in settings) 96 | 97 | # self.application.logger.debug("Semanticizing: Created %d characters of JSON." \ 98 | # % len(json)) 99 | return Response(json, mimetype=APPLICATION_JSON) 100 | 101 | 102 | @application.route('/cleantweet', methods=['GET', 'POST']) 103 | def _cleantweet(): 104 | """ 105 | The function that handles the /cleantweet namespace. Will use regular 106 | expressions to completely clean a given tweet. 107 | 108 | @return: The body of the response, in this case a json formatted \ 109 | string containing the cleaned tweet. 110 | """ 111 | text = _get_text_from_request() 112 | clean_text = cleantweet(text) 113 | 114 | return _json_dumps({"cleaned_text": clean_text}) 115 | 116 | 117 | def cleantweet(text): 118 | """ 119 | Tweet cleaner/tokenizer. 120 | 121 | Uses regular expressions to completely clean, and tokenize, a 122 | given tweet. 123 | """ 124 | 125 | for cleaner in ['user', 'url', 'punctuation', 'retweet']: 126 | text = CLEAN_TWEET[cleaner].sub(" ", text) 127 | text = " ".join([w for w in re.split(r'\s+', text) if len(w) > 1]) 128 | 129 | return text 130 | 131 | 132 | def _semanticize(langcode, settings, text): 133 | """ 134 | Method that performs the actual semantization. 135 | """ 136 | links = [] 137 | 138 | for function in ("preprocess", "process", "postprocess"): 139 | for step, processor in PIPELINE: 140 | # self.application.logger.debug("Semanticizing: %s for step %s" \ 141 | # % (function, step)) 142 | (links, text, settings) = getattr(processor, function)(links, 143 | text, 144 | settings 145 | ) 146 | # self.application.logger.debug("Semanticizing: %s pipeline with %d steps \ 147 | # done" % (function, len(self.pipeline))) 148 | 149 | result = {"links": links, "text": text} 150 | 151 | return result 152 | 153 | 154 | def _json_dumps(obj, pretty=False): 155 | """ 156 | Util function to create json dumps based on an object. 
157 | 158 | @param o: Object to transform 159 | @param pretty: Whether or not to prettify the JSON 160 | @return: The JSON string 161 | """ 162 | if not pretty and "ujson" in locals(): 163 | return ujson.dumps(obj) 164 | elif not pretty: 165 | return json.dumps(obj) 166 | else: 167 | return json.dumps(obj, indent=4) 168 | 169 | def _get_text_from_request(): 170 | """ 171 | Util function to get the param called "text" from the current request 172 | 173 | @return: the value of "text" 174 | """ 175 | 176 | return request.values['text'] 177 | # content_type = request.headers['Content-Type'] 178 | # if request.method == "POST": 179 | # if content_type == 'application/x-www-form-urlencoded': 180 | # return request.form['text'] 181 | # elif content_type == 'text/plain': 182 | # return request.data 183 | # else: 184 | # abort(Response("Unsupported Content Type, use: text/plain\n", 185 | # status=415)) 186 | # elif "text" in request.args: 187 | # return request.args["text"] 188 | # else: 189 | # abort(Response("No text provided, use: POST or GET with attribute \ 190 | # 'text'\n", status=400)) 191 | 192 | 193 | if __name__ == '__main__': 194 | application.run() 195 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | pkgs = (["semanticizer"] + 6 | ["semanticizer." + sub for sub in ("processors", "redisinsert", 7 | "server", "util", "wpm")]) 8 | 9 | setup( 10 | name="semanticizer", 11 | description="Entity Linking for the masses", 12 | packages=pkgs, 13 | classifiers=[ 14 | "Intended Audience :: Science/Research", 15 | "Topic :: Scientific/Engineering", 16 | "Topic :: Scientific/Engineering :: Information Analysis", 17 | "Topic :: Text Processing", 18 | ], 19 | install_requires=[ 20 | "flask", 21 | "mock", 22 | "leven", 23 | "lxml", 24 | "networkx", 25 | "numpy", 26 | "redis>=2.8.0", 27 | "scikit-learn", 28 | "simplejson", 29 | ], 30 | ) 31 | -------------------------------------------------------------------------------- /test/TestConfig.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
13 | 14 | ''' 15 | Testsuite for the config.py module 16 | ''' 17 | # Disable check for calling protected members 18 | # pylint: disable-msg=W0212 19 | # Disable check for naming conventions that disturb setUp and tearDown 20 | # pylint: disable-msg=C0103 21 | # Disable check for too many public methods 22 | # pylint: disable-msg=R0904 23 | 24 | import unittest 25 | import config 26 | from os import remove 27 | from argparse import ArgumentTypeError 28 | from argparse import ArgumentParser 29 | from tempfile import mkstemp 30 | from ConfigParser import MissingSectionHeaderError 31 | 32 | 33 | class Test(unittest.TestCase): 34 | """Testclass for config.py""" 35 | 36 | def setUp(self): 37 | """setup the test by creating a tempfile and a test config""" 38 | self.tmpfile, self.tmpfilename = mkstemp() 39 | self.testconfig = { 40 | 'port': 6000, 41 | 'lmpath': self.tmpfilename, 42 | 'verbose': None 43 | } 44 | 45 | def tearDown(self): 46 | """Tear down by removing the tempfile created during setup""" 47 | remove(self.tmpfilename) 48 | 49 | def test_readable_path(self): 50 | """Test the function that guarantees a path given in the config 51 | is readable""" 52 | valid_path = '/' 53 | invalid_path = '/invalid/path' 54 | self.assertTrue( 55 | config._readable_path(valid_path).endswith(valid_path), 56 | "_readable_path returns an unexpected value for %s" \ 57 | % valid_path) 58 | self.assertRaises(ArgumentTypeError, 59 | config._readable_path, 60 | invalid_path) 61 | 62 | def test_writable_file(self): 63 | """Test the function that guarantees a path given in the config 64 | is writable""" 65 | valid_file = self.tmpfilename 66 | invalid_file = '/test/test/invalid' 67 | self.assertTrue( 68 | config._writable_file(valid_file).endswith(valid_file), 69 | "_writable_file returns an unexpected value for %s" \ 70 | % valid_file) 71 | self.assertRaises(ArgumentTypeError, 72 | config._writable_file, 73 | invalid_file) 74 | 75 | def test_valid_absolute_url(self): 76 | """Test the function that guarantees a value given in the config 77 | is a valid URL""" 78 | valid_url = 'http://www.google.com:890/something?param=1&else=2' 79 | invalid_url = 'ha//%st||al}avista' 80 | self.assertEqual( 81 | config._valid_absolute_url(valid_url), 82 | valid_url, 83 | "_valid_absolute_url returns an unexpected value for %s" \ 84 | % valid_url) 85 | self.assertRaises(ArgumentTypeError, 86 | config._valid_absolute_url, 87 | invalid_url) 88 | 89 | def test_get_conf_vals(self): 90 | """Test the params are being parsed as we expect""" 91 | # the expected result after parsing the config 92 | result = ["--lmpath", self.tmpfilename, "--port", "6000", "--verbose"] 93 | # writing a random line to the config file and test that ConfigParser 94 | # raises a MissingSectionHeaderError 95 | tmpfile = open(self.tmpfilename, 'w') 96 | tmpfile.write("somekey = somevalue\n") 97 | tmpfile.close() 98 | self.assertRaises(MissingSectionHeaderError, 99 | config._get_conf_vals, 100 | self.tmpfilename) 101 | # writing valid values to the config file and comparing the result to 102 | # what we expect 103 | tmpfile = open(self.tmpfilename, 'w') 104 | tmpfile.write("[generic]\n") 105 | for key, value in self.testconfig.iteritems(): 106 | if value: 107 | tmpfile.write(key + " = " + str(value) + "\n") 108 | else: 109 | tmpfile.write(key + "\n") 110 | tmpfile.close() 111 | self.assertEqual(config._get_conf_vals(self.tmpfilename), 112 | result, 113 | "_get_conf_vals doesn't create the expected list: ") 114 | 115 | def test_get_arg_parser(self): 116 | """Test we 
get a valid ArgumentParser""" 117 | self.assertIsInstance(config._get_arg_parser(), 118 | ArgumentParser, 119 | "_get_arg_parser doesn't return an instance of \ 120 | ArgumentParser") 121 | 122 | def test_set_data_and_set_conf(self): 123 | """Test the set_data and set_conf functions""" 124 | # generate and set data 125 | configuration = [] 126 | for key, value in self.testconfig.iteritems(): 127 | configuration += ["--" + key] 128 | if value: 129 | configuration += [str(value)] 130 | config.set_data(configuration) 131 | # check we can read back the data we set 132 | self.assertEqual(config.config_get(("server","port")), 133 | 6000, 134 | "can't find argument values set by set_data") 135 | self.assertEqual(config.config_get(("logging", "verbose")), 136 | True, 137 | "can't find argument values set by set_data") 138 | # check that the system exits when we give unrecognized arguments 139 | config.set_data("--some values --that --dont --exist".split()) 140 | self.assertRaises(SystemExit, config._set_conf) 141 | 142 | def test_config_get(self): 143 | """Test the most important function of the config module: config_get""" 144 | # generate and set data 145 | configuration = [] 146 | for key, value in self.testconfig.iteritems(): 147 | configuration += ["--" + key] 148 | if value: 149 | configuration += [str(value)] 150 | config.set_data(configuration) 151 | # check we can read back the data we set 152 | config.config_get(('server', 'port')) 153 | self.assertEqual(config.config_get(('server', 'port')), 154 | 6000, 155 | "can't find argument values set by set_data") 156 | self.assertEqual(config.config_get("nonexisting", None), 157 | None, 158 | "config_get doesn't return None on a nonexisting param") 159 | 160 | 161 | if __name__ == "__main__": 162 | #import sys;sys.argv = ['', 'Test.testName'] 163 | unittest.main() 164 | -------------------------------------------------------------------------------- /test/TestInputdata.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
13 | 14 | ''' 15 | Testsuite for the init.Initializer module 16 | ''' 17 | import unittest 18 | import os 19 | import inputdata 20 | 21 | from tempfile import mkstemp 22 | from textcat import NGram 23 | 24 | 25 | class Test(unittest.TestCase): 26 | 27 | def setUp(self): 28 | self.tmpfile, self.tmpfilename = mkstemp() 29 | 30 | def test_load_textcat(self): 31 | # Initialize 32 | invalid_lm_dir = os.path.dirname(self.tmpfilename) 33 | valid_lm_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 34 | "../LM.lrej2011") 35 | 36 | # ++++++++++++++++++++++++++++ 37 | # ++++++++ Run tests +++++++++ 38 | # ++++++++++++++++++++++++++++ 39 | 40 | # Fail if lm_dir isn't set 41 | self.assertRaises(TypeError, inputdata.load_textcat) 42 | 43 | # Fail if lm_dir is invalid 44 | self.assertRaises(ValueError, inputdata.load_textcat, invalid_lm_dir) 45 | 46 | # Return an NGram object if lm_dir is valid 47 | self.assertIsInstance(inputdata.load_textcat(valid_lm_dir), NGram, 48 | "_load_textcat with %s should result in a" \ 49 | % valid_lm_dir + "valid_lm_dir NGram instance." 50 | + "Does the path contain valid lm files?") 51 | 52 | 53 | if __name__ == "__main__": 54 | #import sys;sys.argv = ['', 'Test.testName'] 55 | unittest.main() 56 | -------------------------------------------------------------------------------- /test/TestMain.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | ''' 15 | Created on 13 Apr 2013 16 | 17 | @author: evert 18 | ''' 19 | import unittest 20 | 21 | 22 | class Test(unittest.TestCase): 23 | 24 | def setUp(self): 25 | pass 26 | 27 | def tearDown(self): 28 | pass 29 | 30 | @unittest.skip("not yet implemented") 31 | def test_start_server(self): 32 | pass 33 | 34 | @unittest.skip("not yet implemented") 35 | def test_init_logging(self): 36 | pass 37 | 38 | 39 | if __name__ == "__main__": 40 | #import sys;sys.argv = ['', 'Test.testName'] 41 | unittest.main() 42 | -------------------------------------------------------------------------------- /test/TestProcpipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 
10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | ''' 15 | Created on 13 Apr 2013 16 | 17 | @author: evert 18 | ''' 19 | import unittest 20 | import procpipeline 21 | 22 | from mock import patch 23 | 24 | 25 | class Test(unittest.TestCase): 26 | 27 | def setUp(self): 28 | pass 29 | 30 | def tearDown(self): 31 | pass 32 | 33 | def test_build(self): 34 | pass 35 | 36 | @patch('procpipeline.SemanticizeProcessor', autospec=True, create=True) 37 | def test_load_semanticize_processor(self, mock): 38 | # Initialize 39 | 40 | # ++++++++++++++++++++++++++++ 41 | # ++++++++ Run tests +++++++++ 42 | # ++++++++++++++++++++++++++++ 43 | 44 | # Running with wikipedia_ids as None throws an AttributeException 45 | # because we access attributes 46 | self.assertRaises(AttributeError, 47 | procpipeline._load_semanticize_processor, 48 | None) 49 | 50 | # Running with a dict of zero wikipedia_ids should work fine 51 | assert procpipeline._load_semanticize_processor(dict()) 52 | 53 | # use the mocked-out SemanticizeProcessor 54 | print procpipeline._load_semanticize_processor( 55 | {'me': ['hey', 'later'], 56 | 'you': ['hi', 'bye']}) 57 | 58 | @unittest.skip("not yet implemented") 59 | def test_load_features(self): 60 | # Initialize 61 | 62 | # ++++++++++++++++++++++++++++ 63 | # ++++++++ Run tests +++++++++ 64 | # ++++++++++++++++++++++++++++ 65 | pass 66 | 67 | 68 | if __name__ == "__main__": 69 | #import sys;sys.argv = ['', 'Test.testName'] 70 | unittest.main() 71 | -------------------------------------------------------------------------------- /test/TestServer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
13 | 14 | ''' 15 | Created on 13 Apr 2013 16 | 17 | @author: evert 18 | ''' 19 | import unittest 20 | 21 | 22 | class Test(unittest.TestCase): 23 | 24 | def setUp(self): 25 | pass 26 | 27 | def tearDown(self): 28 | pass 29 | 30 | @unittest.skip("not yet implemented") 31 | def test_set_debug(self): 32 | pass 33 | 34 | @unittest.skip("not yet implemented") 35 | def test_json_dumps(self): 36 | pass 37 | 38 | @unittest.skip("not yet implemented") 39 | def test_get_text_from_request(self): 40 | pass 41 | 42 | @unittest.skip("not yet implemented") 43 | def test_setup_route_semanticize(self): 44 | pass 45 | 46 | def test_setup_route_language(self): 47 | pass 48 | 49 | @unittest.skip("not yet implemented") 50 | def test_setup_route_inspect(self): 51 | pass 52 | 53 | @unittest.skip("not yet implemented") 54 | def test_setup_all_routes(self): 55 | pass 56 | 57 | @unittest.skip("not yet implemented") 58 | def test_start(self): 59 | pass 60 | 61 | @unittest.skip("not yet implemented") 62 | def test_autolang_semanticize(self): 63 | pass 64 | 65 | @unittest.skip("not yet implemented") 66 | def test_semanticize(self): 67 | pass 68 | 69 | @unittest.skip("not yet implemented") 70 | def test_remove_stopwords(self): 71 | pass 72 | 73 | @unittest.skip("not yet implemented") 74 | def test_cleantweet(self): 75 | pass 76 | 77 | @unittest.skip("not yet implemented") 78 | def test_language(self): 79 | pass 80 | 81 | @unittest.skip("not yet implemented") 82 | def test_inspect(self): 83 | pass 84 | 85 | 86 | if __name__ == "__main__": 87 | #import sys;sys.argv = ['', 'Test.testName'] 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/test/__init__.py --------------------------------------------------------------------------------
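
A minimal client sketch for the WSGI service in semanticizer_wsgi.py, useful as a quick smoke test once the Wikipedia Miner data for a language has been loaded (see the conf/*.yml examples). The host, the port (5001, as in the gunicorn example in the module docstring), the language code, and the file name client_example.py are assumptions that depend on the actual deployment, not part of the repository.

# client_example.py -- hypothetical helper, not part of the repository.
# Assumes the semanticizer WSGI app is reachable on localhost:5001 and that
# data for the 'en' language code has been loaded.
import json
import urllib
import urllib2


def semanticize(text, langcode="en", host="http://localhost:5001"):
    """POST `text` to /semanticize/<langcode> and return the parsed JSON."""
    data = urllib.urlencode({"text": text})
    response = urllib2.urlopen("%s/semanticize/%s" % (host, langcode), data)
    return json.loads(response.read())


if __name__ == "__main__":
    result = semanticize("Amsterdam is the capital of the Netherlands.")
    # The response is a dict with the original "text" and a list of "links".
    for link in result["links"]:
        print link

Because the handler reads its input through Flask's request.values, the same helper also works with a GET query string instead of a form-encoded POST body.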