├── .gitignore
├── COPYING.LESSER.txt
├── COPYING.txt
├── README.md
├── conf
│   ├── semanticizer.memory.yml
│   ├── semanticizer.redis.yml
│   ├── semanticizer.trove.yml
│   ├── semanticizer.uva.yml
│   └── semanticizer.yml
├── doc
│   ├── Makefile
│   ├── Semanticizer.js
│   ├── advanced.js
│   ├── docs
│   │   ├── Semanticizer.html
│   │   ├── advanced.html
│   │   ├── docco.css
│   │   ├── learning.html
│   │   └── public
│   │       ├── fonts
│   │       │   ├── aller-bold.eot
│   │       │   ├── aller-bold.ttf
│   │       │   ├── aller-bold.woff
│   │       │   ├── aller-light.eot
│   │       │   ├── aller-light.ttf
│   │       │   ├── aller-light.woff
│   │       │   ├── fleurons.eot
│   │       │   ├── fleurons.ttf
│   │       │   ├── fleurons.woff
│   │       │   ├── novecento-bold.eot
│   │       │   ├── novecento-bold.ttf
│   │       │   └── novecento-bold.woff
│   │       ├── images
│   │       │   └── gray.png
│   │       └── stylesheets
│   │           └── normalize.css
│   └── learning.js
├── semanticizer.svg
├── semanticizer
│   ├── __init__.py
│   ├── config.py
│   ├── dbinsert
│   │   ├── __init__.py
│   │   └── __main__.py
│   ├── processors
│   │   ├── __init__.py
│   │   ├── context.py
│   │   ├── core.py
│   │   ├── external.py
│   │   ├── feature.py
│   │   ├── features.py
│   │   ├── image.py
│   │   ├── learning.py
│   │   ├── multiple.py
│   │   ├── semanticize.py
│   │   ├── semanticizer.py
│   │   ├── stringUtils.py
│   │   └── util.py
│   ├── procpipeline.py
│   ├── server
│   │   ├── __init__.py
│   │   └── __main__.py
│   ├── util
│   │   ├── __init__.py
│   │   ├── online_learning.py
│   │   ├── profiler.py
│   │   ├── store_dataset.py
│   │   └── timer.py
│   └── wpm
│       ├── __init__.py
│       ├── data.py
│       ├── db
│       │   ├── __init__.py
│       │   ├── inmemory.py
│       │   ├── mongodb.py
│       │   └── redisdb.py
│       ├── load.py
│       ├── namespace.py
│       └── utils
│           ├── __init__.py
│           ├── emphasis_resolver.py
│           ├── markup_stripper.py
│           └── wikidumps.py
├── semanticizer_wsgi.py
├── setup.py
└── test
    ├── TestConfig.py
    ├── TestInputdata.py
    ├── TestMain.py
    ├── TestProcpipeline.py
    ├── TestServer.py
    └── __init__.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Editor cruft
2 | *.sw[op]
3 | *~
4 | ._*
5 | .DS_Store
6 |
7 | *.pyc
8 |
9 | logs
10 | /log.txt
11 |
12 | # Packages
13 | *.egg
14 | *.egg-info
15 | dist
16 | build
17 | eggs
18 | parts
19 | bin
20 | var
21 | sdist
22 | develop-eggs
23 | .installed.cfg
24 | lib
25 | lib64
26 | __pycache__
27 |
28 | # Installer logs
29 | pip-log.txt
30 |
31 | # Unit test / coverage reports
32 | .coverage
33 | .tox
34 | nosetests.xml
35 |
36 | # Translations
37 | *.mo
38 |
39 | # Mr Developer
40 | .mr.developer.cfg
41 | .project
42 | .pydevproject
43 |
44 | #netbeans
45 | /nbproject/
46 |
47 | # MediaWiki dumps
48 | *.bz2
49 | *.gz
50 | *.sql
51 | *.xml
52 |
--------------------------------------------------------------------------------
/COPYING.LESSER.txt:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
 4 | Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Semanticizer
2 |
3 | The Semanticizer is a web service application for semantic linking
4 | created in 2012 by [Daan Odijk](http://staff.science.uva.nl/~dodijk/)
5 | at [ILPS](http://ilps.science.uva.nl/) (University of Amsterdam).
6 |
 7 | This project has since received contributions from (in alphabetical order):
8 | [Marc Bron](http://staff.science.uva.nl/~mbron/),
9 | [Lars Buitinck](http://staff.science.uva.nl/~buitinck/),
10 | [Bart van den Ende](http://www.bartvandenende.nl/),
11 | [David Graus](http://graus.nu/),
12 | [Tom Kenter](http://staff.science.uva.nl/~tkenter1/),
13 | [Evert Lammerts](http://www.evertlammerts.nl/),
14 | [Edgar Meij](http://edgar.meij.pro/),
15 | [Daan Odijk](http://staff.science.uva.nl/~dodijk/),
16 | [Anne Schuth](http://www.anneschuth.nl/) and
17 | [Isaac Sijaranamual](http://nl.linkedin.com/pub/isaac-sijaranamual/).
18 |
19 | The algorithms for this web service were developed for and are described in
20 | an OAIR 2013 publication on
21 | [Feeding the Second Screen](http://ilps.science.uva.nl/biblio/feeding-second-screen-semantic-linking-based-subtitles)
22 | by [Daan Odijk](http://staff.science.uva.nl/~dodijk/),
23 | [Edgar Meij](http://edgar.meij.pro/) and
24 | [Maarten de Rijke](http://staff.science.uva.nl/~mdr/). Part of this
25 | research was inspired by earlier ILPS publications:
26 | [Adding Semantics to Microblog Posts](http://ilps.science.uva.nl/biblio/adding-semantics-microblog-posts)
27 | and
28 | [Mapping Queries To The Linking Open Data Cloud](http://ilps.science.uva.nl/node/889).
29 | If you use this web service for your own research, please include a
30 | reference to the OAIR 2013 article or, alternatively, to any of these
31 | articles.
32 |
33 | The [online documentation](http://semanticize.uva.nl/doc/) describes
34 | how to use the Semanticizer Web API. This
35 | [REST](http://en.wikipedia.org/wiki/Representational_state_transfer)-like
36 | web service returns [JSON](http://www.json.org/) and is exposed to the
37 | public at http://semanticize.uva.nl/api/. Currently, no access key is
38 | needed to use the web service.
39 |
40 | The [code](https://github.com/semanticize/semanticizer/) is released
41 | under the LGPL license (see below). If you have any questions, contact
42 | [Daan](http://staff.science.uva.nl/~dodijk/).
43 |
44 | If you want to dive into the code, start at `semanticizer/server/__main__.py`.
45 |
46 |
47 | ## Requirements
48 |
49 | 1. The software has been tested with Python 2.7.3 on Mac OS X 10.8 and
50 | Linux (RedHat EL5, Debian jessie/sid and Ubuntu 12.04).
51 |
52 | 2. The following Python modules need to be installed (using
53 | easy_install or pip):
54 |
55 | * nltk
56 | * leven
57 | * networkx
58 | * lxml
59 | * flask
60 | * redis (optional, see point 4)
61 | * scikit-learn (optional, see point 5)
62 | * scipy (optional, see point 5)
63 | * mock (optional, used by the tests)
64 |
65 | 3. A summary of a Wikipedia dump is needed. For this, download the
66 | [Wikipedia Miner CSV files](http://sourceforge.net/projects/wikipedia-miner/files/data/).
67 |
68 | 4. Copy one of the two example config files (`semanticizer.memory.yml` or
69 | `semanticizer.redis.yml`) in the `conf` folder to `semanticizer.yml` in
70 | that folder and adapt it to your situation. You can either load all data
71 | into memory (use `semanticizer.memory.yml`) or into [Redis](http://redis.io/)
72 | using the following steps (see the consolidated shell sketch after this list):
73 |
74 | 1. Copy `semanticizer.redis.yml` into `semanticizer.yml`.
75 |
76 | 2. A Redis server needs to be set up and running.
77 |
78 | 3. Load data into Redis: `python -m semanticizer.dbinsert [--language=] [--output=/tmp/redisinsert.log]`.
79 |
80 | 4. Run the server using `python -m semanticizer.server`.
81 |
82 | 5. In order to work with features, you need to install the
83 | scikit-learn and scipy packages. Before installing scipy, you need
84 | to have [swig](http://www.swig.org/download.html) installed; see
85 | its INSTALL file for instructions (configure, make, make
86 | install). Note that working with features is still under active
87 | development and is therefore not yet fully documented or tested.
88 |
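A consolidated sketch of the Redis-backed setup from point 4 above (the
language code, dump path and log file are placeholders; adjust them to your
own configuration):

    # copy and adapt the example config
    cp conf/semanticizer.redis.yml conf/semanticizer.yml
    # edit conf/semanticizer.yml: point wpm.languages.<lang>.initparams.path
    # at your unpacked Wikipedia Miner CSV files

    # with a Redis server running, load the data
    python -m semanticizer.dbinsert --language=en --output=/tmp/redisinsert.log

    # start the web service (host/port are read from conf/semanticizer.yml)
    python -m semanticizer.server
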
89 | ## License
90 |
91 | This program is free software: you can redistribute it and/or modify
92 | it under the terms of the GNU Lesser General Public License as
93 | published by the Free Software Foundation, either version 3 of the
94 | License, or (at your option) any later version.
95 |
96 | This program is distributed in the hope that it will be useful, but
97 | WITHOUT ANY WARRANTY; without even the implied warranty of
98 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
99 | Lesser General Public License for more details.
100 |
101 | You should have received a copy of the GNU Lesser General Public
102 | License along with this program. If not, see
103 | <http://www.gnu.org/licenses/>.
104 |
--------------------------------------------------------------------------------
/conf/semanticizer.memory.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | server:
15 | port: 5000
16 | host: 0.0.0.0
17 |
18 | wpm:
19 | languages:
20 | en:
21 | source: memory
22 | initparams:
23 | path: ./enwiki-20110722
24 | language: english
25 | threads: 16
26 | bdburl: http://wikipedia-miner.cms.waikato.ac.nz/services/exploreArticle
27 |
28 | linkprocs:
29 | features: false
30 |
31 | logging:
32 | verbose: true
33 | path: log.txt
34 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s'
35 |
36 | misc:
37 | tempdir: /tmp
38 |
--------------------------------------------------------------------------------
/conf/semanticizer.redis.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | server:
15 | port: 5000
16 | host: 0.0.0.0
17 | use_reloader: true
18 |
19 | wpm:
20 | languages:
21 | en:
22 | source: redis
23 | initparams:
24 | path: ./enwiki-20110722
25 | language: english
26 | host: localhost
27 | port: 6379
28 |
29 | linkprocs:
30 | features: false
31 |
32 | logging:
33 | verbose: true
34 | path: log.txt
35 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s'
36 |
37 | misc:
38 | tempdir: /tmp
39 |
--------------------------------------------------------------------------------
/conf/semanticizer.trove.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | server:
15 | port: 5000
16 | host: 0.0.0.0
17 |
18 | wpm:
19 | languages:
20 | # memory backend
21 | nl:
22 | source: WpmDataInProc
23 | initparams:
24 | path: /zfs/ilps-plexer/wikipediaminer/nlwiki-20130318
25 | language: dutch
26 | # translation_languages should be a list of iso 639-2 language
27 | # codes
28 | translation_languages: []
29 | # Redis backend
30 | # nl:
31 | # source: wpmdata_redis.WpmDataRedis
32 | # initparams:
33 | # host: localhost
34 | # port: 6379
35 | threads: 16
36 | bdburl: http://zookst13.science.uva.nl:8080/dutchsemcor/article
37 |
38 | semanticize:
39 | max_ngram_length: 12
40 |
41 | linkprocs:
42 | includefeatures: false
43 |
44 | logging:
45 | verbose: true
46 | path: log.txt
47 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s'
48 |
49 | misc:
50 | tempdir: /tmp
51 |
--------------------------------------------------------------------------------
/conf/semanticizer.uva.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | server:
15 | port: 5000
16 | host: 0.0.0.0
17 | use_reloader: false
18 |
19 | wpm:
20 | languages:
21 | nl:
22 | source: WpmDataRedis
23 | initparams:
24 | host: zookst14.science.uva.nl
25 | port: 6379
26 |
27 | # Use the in-memory backend: this is faster than the Redis backend
28 | # but uses a lot more memory, especially if you intend to run
29 | # multiple semanticizers.
30 | # nl:
31 | # source: WpmDataInProc
32 | # initparams:
33 | # path: /zfs/ilps-plexer/wikipediaminer/nlwiki-20111104
34 | # language: dutch
35 | # # translation_languages should be a list of iso 639-2 language
36 | # # codes
37 | # translation_languages: ["en", "fr", "de", "nl"]
38 | en:
39 | source: WpmDataRedis
40 | initparams:
41 | host: zookst14.science.uva.nl
42 | port: 6379
43 | es:
44 | source: WpmDataRedis
45 | initparams:
46 | host: zookst14.science.uva.nl
47 | port: 6379
48 | fr:
49 | source: WpmDataRedis
50 | initparams:
51 | host: zookst14.science.uva.nl
52 | port: 6379
53 | de:
54 | source: WpmDataRedis
55 | initparams:
56 | host: zookst14.science.uva.nl
57 | port: 6379
58 | threads: 16
59 | bdburl: http://zookst13.science.uva.nl:8080/dutchsemcor/article
60 |
61 | linkprocs:
62 | includefeatures: true
63 |
64 | learning:
65 | model_dir: /zfs/ilps-plexer/dodijk/semanticizer.models
66 |
67 | logging:
68 | verbose: true
69 | path: log.txt
70 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s'
71 |
72 | misc:
73 | tempdir: /tmp
74 |
75 | settings:
76 | vara:
77 | pre_filter: unique,senseProbability>0.01
78 | learning: coling-SP0.2-100.RandomForestClassifier-10-auto.pkl
79 | filter: unique,learningProbability>=0.5
80 |
--------------------------------------------------------------------------------
/conf/semanticizer.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | server:
15 | port: 8005
16 | host: 0.0.0.0
17 | use_reloader: false
18 |
19 | settings:
20 | include_categories: True
21 | include_definitions: True
22 |
23 | wpm:
24 | languages:
25 | #en:
26 | # source: redis
27 | # initparams:
28 | # path: /zfs/ilps-plexer/wikipediaminer/en
29 | # host: localhost
30 | # port: 6379
31 | # language: english
32 | # #translation_languages: ["nl", "fr", "de", "es"] # TODO: We should include all possible params in the config file [DG]
33 |
34 | nl:
35 | source: redis
36 | initparams:
37 | path: /zfs/ilps-plexer/wikipediaminer/nlwiki-latest
38 | host: localhost
39 | port: 6379
40 | language: nederlands
41 | #translation_languages: ["en", "fr", "de", "es"]
42 |
43 | linkprocs:
44 | features: false
45 |
46 | logging:
47 | verbose: true
48 | path: log.txt
49 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s'
50 |
51 | misc:
52 | tempdir: /tmp
53 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | doc:
2 | docco -l linear Semanticizer.js
3 | docco -l linear advanced.js
4 | docco -l linear learning.js
5 |
6 | all: doc
7 |
8 | publish: all
9 | rsync -av docs/ zookma:/datastore/applications/semanticize/doc/
10 |
11 | watch:
12 | watch "*.js" 1s "make publish"
13 |
--------------------------------------------------------------------------------
/doc/docs/docco.css:
--------------------------------------------------------------------------------
1 | /*--------------------- Typography ----------------------------*/
2 |
3 | @font-face {
4 | font-family: 'aller-light';
5 | src: url('public/fonts/aller-light.eot');
6 | src: url('public/fonts/aller-light.eot?#iefix') format('embedded-opentype'),
7 | url('public/fonts/aller-light.woff') format('woff'),
8 | url('public/fonts/aller-light.ttf') format('truetype');
9 | font-weight: normal;
10 | font-style: normal;
11 | }
12 |
13 | @font-face {
14 | font-family: 'aller-bold';
15 | src: url('public/fonts/aller-bold.eot');
16 | src: url('public/fonts/aller-bold.eot?#iefix') format('embedded-opentype'),
17 | url('public/fonts/aller-bold.woff') format('woff'),
18 | url('public/fonts/aller-bold.ttf') format('truetype');
19 | font-weight: normal;
20 | font-style: normal;
21 | }
22 |
23 | @font-face {
24 | font-family: 'novecento-bold';
25 | src: url('public/fonts/novecento-bold.eot');
26 | src: url('public/fonts/novecento-bold.eot?#iefix') format('embedded-opentype'),
27 | url('public/fonts/novecento-bold.woff') format('woff'),
28 | url('public/fonts/novecento-bold.ttf') format('truetype');
29 | font-weight: normal;
30 | font-style: normal;
31 | }
32 |
33 | @font-face {
34 | font-family: 'fleurons';
35 | src: url('public/fonts/fleurons.eot');
36 | src: url('public/fonts/fleurons.eot?#iefix') format('embedded-opentype'),
37 | url('public/fonts/fleurons.woff') format('woff'),
38 | url('public/fonts/fleurons.ttf') format('truetype');
39 | font-weight: normal;
40 | font-style: normal;
41 | }
42 |
43 | /*--------------------- Base Styles ----------------------------*/
44 |
45 | body {
46 | font-family: "aller-light";
47 | background: url('public/images/gray.png') #fff;
48 | background-size: 322px;
49 | margin: 0;
50 | }
51 |
52 | hr {
53 | height: 1px;
54 | background: #ddd;
55 | border: 0;
56 | }
57 |
58 | h1, h2, h3, h4, h5, h6 {
59 | color: #112233;
60 | font-weight: normal;
61 | font-family: "novecento-bold";
62 | text-transform: uppercase;
63 | line-height: 1em;
64 | margin-top: 50px;
65 | }
66 | h1 {
67 | margin: 0;
68 | text-align: center;
69 | }
70 | h2 {
71 | font-size: 1.3em;
72 | }
73 | h1:after {
74 | content: "8";
75 | display: block;
76 | font-family: "fleurons";
77 | color: #999;
78 | font-size: 80px;
79 | padding: 10px 0 25px;
80 | }
81 |
82 | a {
83 | color: #000;
84 | }
85 |
86 | b, strong {
87 | font-weight: normal;
88 | font-family: "aller-bold";
89 | }
90 |
91 | blockquote {
92 | border-left: 5px solid #ccc;
93 | margin-left: 0;
94 | padding: 1px 0 1px 1em;
95 | }
96 | .page blockquote p {
97 | font-family: Menlo, Consolas, Monaco, monospace;
98 | font-size: 14px; line-height: 19px;
99 | color: #999;
100 | margin: 10px 0 0;
101 | white-space: pre-wrap;
102 | }
103 |
104 | pre, tt, code {
105 | font-family: Menlo, Consolas, Monaco, monospace;
106 | font-size: 12px;
107 | display: inline-block;
108 | border: 1px solid #EAEAEA;
109 | background: #f8f8f8;
110 | color: #555;
111 | padding: 0 5px;
112 | line-height: 20px;
113 | }
114 | .page pre {
115 | margin: 0;
116 | width: 608px;
117 | padding: 10px 15px;
118 | background: #fcfcfc;
119 | -moz-box-shadow: inset 0 0 10px rgba(0,0,0,0.1);
120 | -webkit-box-shadow: inset 0 0 10px rgba(0,0,0,0.1);
121 | box-shadow: inset 0 0 10px rgba(0,0,0,0.1);
122 | overflow-x: auto;
123 | }
124 | .page pre code {
125 | border: 0;
126 | padding: 0;
127 | background: transparent;
128 | }
129 |
130 | .fleur {
131 | font-family: "fleurons";
132 | font-size: 100px;
133 | text-align: center;
134 | margin: 40px 0;
135 | color: #ccc;
136 | }
137 |
138 | /*--------------------- Layout ----------------------------*/
139 |
140 | .container {
141 | width: 760px;
142 | margin: 0 auto;
143 | background: #fff;
144 | background: rgba(255,255,255, 0.4);
145 | overflow: hidden;
146 | }
147 | .page {
148 | width: 640px;
149 | padding: 30px;
150 | margin: 30px;
151 | background: #fff;
152 | font-size: 17px;
153 | line-height: 26px;
154 | }
155 | .page p {
156 | color: #30404f;
157 | margin: 26px 0;
158 | }
159 |
160 | ul.sections {
161 | list-style: none;
162 | padding:0 0 5px 0;;
163 | margin:0;
164 | }
165 |
166 | .page li p {
167 | margin: 12px 0;
168 | }
169 |
170 | .toc {
171 | max-height: 0;
172 | overflow: hidden;
173 | text-align: center;
174 | font-size: 13px;
175 | line-height: 20px;
176 | -moz-transition: max-height 1s;
177 | -webkit-transition: max-height 1s;
178 | transition: max-height 1s;
179 | }
180 | .header:hover .toc {
181 | max-height: 500px;
182 | }
183 | .toc h3 {
184 | margin-top: 20px;
185 | }
186 | .toc ol {
187 | margin: 0 0 20px 0;
188 | display: inline-block;
189 | text-align: left;
190 | list-style-type: upper-roman;
191 | }
192 | .toc li {
193 | font-family: 'novecento-bold';
194 | }
195 | .toc li a {
196 | font-family: 'aller-light';
197 | }
198 |
199 |
200 | /*---------------------- Syntax Highlighting -----------------------------*/
201 |
202 | td.linenos { background-color: #f0f0f0; padding-right: 10px; }
203 | span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
204 | /*
205 |
206 | github.com style (c) Vasily Polovnyov
207 |
208 | */
209 |
210 | pre code {
211 | display: block; padding: 0.5em;
212 | color: #000;
213 | background: #f8f8ff
214 | }
215 |
216 | pre .comment,
217 | pre .template_comment,
218 | pre .diff .header,
219 | pre .javadoc {
220 | color: #408080;
221 | font-style: italic
222 | }
223 |
224 | pre .keyword,
225 | pre .assignment,
226 | pre .literal,
227 | pre .css .rule .keyword,
228 | pre .winutils,
229 | pre .javascript .title,
230 | pre .lisp .title,
231 | pre .subst {
232 | color: #954121;
233 | /*font-weight: bold*/
234 | }
235 |
236 | pre .number,
237 | pre .hexcolor {
238 | color: #40a070
239 | }
240 |
241 | pre .string,
242 | pre .tag .value,
243 | pre .phpdoc,
244 | pre .tex .formula {
245 | color: #219161;
246 | }
247 |
248 | pre .title,
249 | pre .id {
250 | color: #19469D;
251 | }
252 | pre .params {
253 | color: #00F;
254 | }
255 |
256 | pre .javascript .title,
257 | pre .lisp .title,
258 | pre .subst {
259 | font-weight: normal
260 | }
261 |
262 | pre .class .title,
263 | pre .haskell .label,
264 | pre .tex .command {
265 | color: #458;
266 | font-weight: bold
267 | }
268 |
269 | pre .tag,
270 | pre .tag .title,
271 | pre .rules .property,
272 | pre .django .tag .keyword {
273 | color: #000080;
274 | font-weight: normal
275 | }
276 |
277 | pre .attribute,
278 | pre .variable,
279 | pre .instancevar,
280 | pre .lisp .body {
281 | color: #008080
282 | }
283 |
284 | pre .regexp {
285 | color: #B68
286 | }
287 |
288 | pre .class {
289 | color: #458;
290 | font-weight: bold
291 | }
292 |
293 | pre .symbol,
294 | pre .ruby .symbol .string,
295 | pre .ruby .symbol .keyword,
296 | pre .ruby .symbol .keymethods,
297 | pre .lisp .keyword,
298 | pre .tex .special,
299 | pre .input_number {
300 | color: #990073
301 | }
302 |
303 | pre .builtin,
304 | pre .constructor,
305 | pre .built_in,
306 | pre .lisp .title {
307 | color: #0086b3
308 | }
309 |
310 | pre .preprocessor,
311 | pre .pi,
312 | pre .doctype,
313 | pre .shebang,
314 | pre .cdata {
315 | color: #999;
316 | font-weight: bold
317 | }
318 |
319 | pre .deletion {
320 | background: #fdd
321 | }
322 |
323 | pre .addition {
324 | background: #dfd
325 | }
326 |
327 | pre .diff .change {
328 | background: #0086b3
329 | }
330 |
331 | pre .chunk {
332 | color: #aaa
333 | }
334 |
335 | pre .tex .formula {
336 | opacity: 0.5;
337 | }
338 |
--------------------------------------------------------------------------------
/doc/docs/public/fonts/aller-bold.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-bold.eot
--------------------------------------------------------------------------------
/doc/docs/public/fonts/aller-bold.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-bold.ttf
--------------------------------------------------------------------------------
/doc/docs/public/fonts/aller-bold.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-bold.woff
--------------------------------------------------------------------------------
/doc/docs/public/fonts/aller-light.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-light.eot
--------------------------------------------------------------------------------
/doc/docs/public/fonts/aller-light.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-light.ttf
--------------------------------------------------------------------------------
/doc/docs/public/fonts/aller-light.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-light.woff
--------------------------------------------------------------------------------
/doc/docs/public/fonts/fleurons.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/fleurons.eot
--------------------------------------------------------------------------------
/doc/docs/public/fonts/fleurons.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/fleurons.ttf
--------------------------------------------------------------------------------
/doc/docs/public/fonts/fleurons.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/fleurons.woff
--------------------------------------------------------------------------------
/doc/docs/public/fonts/novecento-bold.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/novecento-bold.eot
--------------------------------------------------------------------------------
/doc/docs/public/fonts/novecento-bold.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/novecento-bold.ttf
--------------------------------------------------------------------------------
/doc/docs/public/fonts/novecento-bold.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/novecento-bold.woff
--------------------------------------------------------------------------------
/doc/docs/public/images/gray.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/images/gray.png
--------------------------------------------------------------------------------
/doc/docs/public/stylesheets/normalize.css:
--------------------------------------------------------------------------------
1 | /*! normalize.css v2.0.1 | MIT License | git.io/normalize */
2 |
3 | /* ==========================================================================
4 | HTML5 display definitions
5 | ========================================================================== */
6 |
7 | /*
8 | * Corrects `block` display not defined in IE 8/9.
9 | */
10 |
11 | article,
12 | aside,
13 | details,
14 | figcaption,
15 | figure,
16 | footer,
17 | header,
18 | hgroup,
19 | nav,
20 | section,
21 | summary {
22 | display: block;
23 | }
24 |
25 | /*
26 | * Corrects `inline-block` display not defined in IE 8/9.
27 | */
28 |
29 | audio,
30 | canvas,
31 | video {
32 | display: inline-block;
33 | }
34 |
35 | /*
36 | * Prevents modern browsers from displaying `audio` without controls.
37 | * Remove excess height in iOS 5 devices.
38 | */
39 |
40 | audio:not([controls]) {
41 | display: none;
42 | height: 0;
43 | }
44 |
45 | /*
46 | * Addresses styling for `hidden` attribute not present in IE 8/9.
47 | */
48 |
49 | [hidden] {
50 | display: none;
51 | }
52 |
53 | /* ==========================================================================
54 | Base
55 | ========================================================================== */
56 |
57 | /*
58 | * 1. Sets default font family to sans-serif.
59 | * 2. Prevents iOS text size adjust after orientation change, without disabling
60 | * user zoom.
61 | */
62 |
63 | html {
64 | font-family: sans-serif; /* 1 */
65 | -webkit-text-size-adjust: 100%; /* 2 */
66 | -ms-text-size-adjust: 100%; /* 2 */
67 | }
68 |
69 | /*
70 | * Removes default margin.
71 | */
72 |
73 | body {
74 | margin: 0;
75 | }
76 |
77 | /* ==========================================================================
78 | Links
79 | ========================================================================== */
80 |
81 | /*
82 | * Addresses `outline` inconsistency between Chrome and other browsers.
83 | */
84 |
85 | a:focus {
86 | outline: thin dotted;
87 | }
88 |
89 | /*
90 | * Improves readability when focused and also mouse hovered in all browsers.
91 | */
92 |
93 | a:active,
94 | a:hover {
95 | outline: 0;
96 | }
97 |
98 | /* ==========================================================================
99 | Typography
100 | ========================================================================== */
101 |
102 | /*
103 | * Addresses `h1` font sizes within `section` and `article` in Firefox 4+,
104 | * Safari 5, and Chrome.
105 | */
106 |
107 | h1 {
108 | font-size: 2em;
109 | }
110 |
111 | /*
112 | * Addresses styling not present in IE 8/9, Safari 5, and Chrome.
113 | */
114 |
115 | abbr[title] {
116 | border-bottom: 1px dotted;
117 | }
118 |
119 | /*
120 | * Addresses style set to `bolder` in Firefox 4+, Safari 5, and Chrome.
121 | */
122 |
123 | b,
124 | strong {
125 | font-weight: bold;
126 | }
127 |
128 | /*
129 | * Addresses styling not present in Safari 5 and Chrome.
130 | */
131 |
132 | dfn {
133 | font-style: italic;
134 | }
135 |
136 | /*
137 | * Addresses styling not present in IE 8/9.
138 | */
139 |
140 | mark {
141 | background: #ff0;
142 | color: #000;
143 | }
144 |
145 |
146 | /*
147 | * Corrects font family set oddly in Safari 5 and Chrome.
148 | */
149 |
150 | code,
151 | kbd,
152 | pre,
153 | samp {
154 | font-family: monospace, serif;
155 | font-size: 1em;
156 | }
157 |
158 | /*
159 | * Improves readability of pre-formatted text in all browsers.
160 | */
161 |
162 | pre {
163 | white-space: pre;
164 | white-space: pre-wrap;
165 | word-wrap: break-word;
166 | }
167 |
168 | /*
169 | * Sets consistent quote types.
170 | */
171 |
172 | q {
173 | quotes: "\201C" "\201D" "\2018" "\2019";
174 | }
175 |
176 | /*
177 | * Addresses inconsistent and variable font size in all browsers.
178 | */
179 |
180 | small {
181 | font-size: 80%;
182 | }
183 |
184 | /*
185 | * Prevents `sub` and `sup` affecting `line-height` in all browsers.
186 | */
187 |
188 | sub,
189 | sup {
190 | font-size: 75%;
191 | line-height: 0;
192 | position: relative;
193 | vertical-align: baseline;
194 | }
195 |
196 | sup {
197 | top: -0.5em;
198 | }
199 |
200 | sub {
201 | bottom: -0.25em;
202 | }
203 |
204 | /* ==========================================================================
205 | Embedded content
206 | ========================================================================== */
207 |
208 | /*
209 | * Removes border when inside `a` element in IE 8/9.
210 | */
211 |
212 | img {
213 | border: 0;
214 | }
215 |
216 | /*
217 | * Corrects overflow displayed oddly in IE 9.
218 | */
219 |
220 | svg:not(:root) {
221 | overflow: hidden;
222 | }
223 |
224 | /* ==========================================================================
225 | Figures
226 | ========================================================================== */
227 |
228 | /*
229 | * Addresses margin not present in IE 8/9 and Safari 5.
230 | */
231 |
232 | figure {
233 | margin: 0;
234 | }
235 |
236 | /* ==========================================================================
237 | Forms
238 | ========================================================================== */
239 |
240 | /*
241 | * Define consistent border, margin, and padding.
242 | */
243 |
244 | fieldset {
245 | border: 1px solid #c0c0c0;
246 | margin: 0 2px;
247 | padding: 0.35em 0.625em 0.75em;
248 | }
249 |
250 | /*
251 | * 1. Corrects color not being inherited in IE 8/9.
252 | * 2. Remove padding so people aren't caught out if they zero out fieldsets.
253 | */
254 |
255 | legend {
256 | border: 0; /* 1 */
257 | padding: 0; /* 2 */
258 | }
259 |
260 | /*
261 | * 1. Corrects font family not being inherited in all browsers.
262 | * 2. Corrects font size not being inherited in all browsers.
263 | * 3. Addresses margins set differently in Firefox 4+, Safari 5, and Chrome
264 | */
265 |
266 | button,
267 | input,
268 | select,
269 | textarea {
270 | font-family: inherit; /* 1 */
271 | font-size: 100%; /* 2 */
272 | margin: 0; /* 3 */
273 | }
274 |
275 | /*
276 | * Addresses Firefox 4+ setting `line-height` on `input` using `!important` in
277 | * the UA stylesheet.
278 | */
279 |
280 | button,
281 | input {
282 | line-height: normal;
283 | }
284 |
285 | /*
286 | * 1. Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio`
287 | * and `video` controls.
288 | * 2. Corrects inability to style clickable `input` types in iOS.
289 | * 3. Improves usability and consistency of cursor style between image-type
290 | * `input` and others.
291 | */
292 |
293 | button,
294 | html input[type="button"], /* 1 */
295 | input[type="reset"],
296 | input[type="submit"] {
297 | -webkit-appearance: button; /* 2 */
298 | cursor: pointer; /* 3 */
299 | }
300 |
301 | /*
302 | * Re-set default cursor for disabled elements.
303 | */
304 |
305 | button[disabled],
306 | input[disabled] {
307 | cursor: default;
308 | }
309 |
310 | /*
311 | * 1. Addresses box sizing set to `content-box` in IE 8/9.
312 | * 2. Removes excess padding in IE 8/9.
313 | */
314 |
315 | input[type="checkbox"],
316 | input[type="radio"] {
317 | box-sizing: border-box; /* 1 */
318 | padding: 0; /* 2 */
319 | }
320 |
321 | /*
322 | * 1. Addresses `appearance` set to `searchfield` in Safari 5 and Chrome.
323 | * 2. Addresses `box-sizing` set to `border-box` in Safari 5 and Chrome
324 | * (include `-moz` to future-proof).
325 | */
326 |
327 | input[type="search"] {
328 | -webkit-appearance: textfield; /* 1 */
329 | -moz-box-sizing: content-box;
330 | -webkit-box-sizing: content-box; /* 2 */
331 | box-sizing: content-box;
332 | }
333 |
334 | /*
335 | * Removes inner padding and search cancel button in Safari 5 and Chrome
336 | * on OS X.
337 | */
338 |
339 | input[type="search"]::-webkit-search-cancel-button,
340 | input[type="search"]::-webkit-search-decoration {
341 | -webkit-appearance: none;
342 | }
343 |
344 | /*
345 | * Removes inner padding and border in Firefox 4+.
346 | */
347 |
348 | button::-moz-focus-inner,
349 | input::-moz-focus-inner {
350 | border: 0;
351 | padding: 0;
352 | }
353 |
354 | /*
355 | * 1. Removes default vertical scrollbar in IE 8/9.
356 | * 2. Improves readability and alignment in all browsers.
357 | */
358 |
359 | textarea {
360 | overflow: auto; /* 1 */
361 | vertical-align: top; /* 2 */
362 | }
363 |
364 | /* ==========================================================================
365 | Tables
366 | ========================================================================== */
367 |
368 | /*
369 | * Remove most spacing between table cells.
370 | */
371 |
372 | table {
373 | border-collapse: collapse;
374 | border-spacing: 0;
375 | }
--------------------------------------------------------------------------------
/semanticizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/__init__.py
--------------------------------------------------------------------------------
/semanticizer/config.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | """
15 | This module is responsible for loading all possible configuration params and
16 | their defaults, overwriting the defaults by reading values from a given config
17 | file, then overriding those values with whatever has been passed as arguments.
18 | """
19 | import yaml
20 | import sys
21 | import argparse
22 | import traceback
23 | import os
24 |
25 | def load_config(path='../conf/semanticizer.yml'):
26 |
27 | #add command line args
28 | parser = argparse.ArgumentParser(description="""
29 | Run semanticizer.""")
30 |
31 | parser.add_argument("-p", "--port", help="Port number ")
32 | parser.add_argument("-v", "--verbose", help="Verbose ")
33 | parser.add_argument("-s", "--host", help="Host ip address ")
34 | parser.add_argument("-c", "--config", help="Config file ")
35 |
36 | args = parser.parse_args()
37 |
38 | if args.config != None:
39 | path = args.config
40 |
41 | if not path.startswith("/"):
42 | path = os.path.join(os.path.dirname(__file__), path)
43 |
44 | configYaml = yaml.load(file(path))
45 |
46 | if args.port != None:
47 | configYaml["server"]["port"] = int(args.port)
48 |
49 | if args.verbose != None:
50 | configYaml["logging"]["verbose"] = str2bool(args.verbose)
51 |
52 | if args.host != None:
53 | configYaml["server"]["host"] = args.host
54 |
55 | return configYaml
56 |
57 | def str2bool(v):
58 | return v.lower() in ("yes", "true", "t", "1")
59 |
60 | def config_get(keys=(), default=None, config=None):
61 | """
62 | Allows user to access configuration variables and arguments. The function
63 | takes the variable name as its input, and returns the value or None if it
64 | isn't set.
65 |
66 | @param keys: The name of the configuration parameter to fetch. (Optional)
67 | @param default: The default value to return if the key is not found.
68 | @param config: dictionary to represent config. If None, load_config is
69 | called.
70 | @return: The value for the given parameter if name was set and valid, \
71 | the default value if invalid or None if no default value was set.
72 | """
73 | if config is None:
74 | config = load_config()
75 |
76 | if isinstance(keys, basestring):
77 | keys = [keys]
78 |
79 | pointer = config
80 | for key in keys:
81 | if not key in pointer:
82 | if default is not None:
83 | return default
84 | else:
85 | raise KeyError('Could not find %s in configuration' % key)
86 | pointer = pointer[key]
87 |
88 | index = 0
89 | return pointer
90 |
--------------------------------------------------------------------------------
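
A minimal usage sketch for the `load_config`/`config_get` helpers above (not
part of the repository; the key names follow the example YAML files in `conf/`):

    # Hypothetical usage; assumes a conf/semanticizer.yml like the examples.
    from semanticizer.config import load_config, config_get

    config = load_config()  # parses command-line args, then the YAML file

    # Nested keys are passed as a tuple; the default avoids a KeyError.
    port = config_get(('server', 'port'), 5000, config)
    languages = config_get(('wpm', 'languages'), {}, config)
    verbose = config_get(('logging', 'verbose'), False, config)

    print("Serving %d language(s) on port %d (verbose=%s)"
          % (len(languages), port, verbose))
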
/semanticizer/dbinsert/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/dbinsert/__init__.py
--------------------------------------------------------------------------------
/semanticizer/dbinsert/__main__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import yaml
15 | import sys
16 | import getopt
17 |
18 | from ..wpm.load import WpmLoader
19 | from ..config import config_get
20 |
21 | def load_wpm_data(datasource, langcode, settings, **kwargs):
22 | if datasource == "redis":
23 | from ..wpm.db.redisdb import RedisDB
24 | db = RedisDB(**kwargs)
25 | WpmLoader(db, langcode, settings, **kwargs)
26 | elif datasource == "mongo":
27 | from ..wpm.db.mongodb import MongoDB
28 | db = MongoDB(**kwargs)
29 | WpmLoader(db, langcode, settings, **kwargs)
30 | else:
31 | raise ValueError("No %s backend for language %s" % (datasource, langcode))
32 |
33 |
34 |
35 | ##
36 | ## usage
37 | ## python -m semanticizer.dbinsert --language= --output=/tmp/redisinsert.log
38 | if __name__ == '__main__':
39 | configYaml = yaml.load(file('conf/semanticizer.yml'))
40 | wpm_languages = config_get(('wpm', 'languages'), None, configYaml)
41 | settings = config_get("settings", {}, configYaml)
42 | try:
43 | opts, args = getopt.getopt(sys.argv[1:], 'l:o:', ['language=', 'output='])
44 | except getopt.GetoptError:
45 | print "usage: python -m semanticizer.dbinsert --language= --output=/tmp/redisinsert.log"
46 | sys.exit(2)
47 |
48 | showprogress = True
49 | output = None
50 | language = None
51 |
52 | for opt, arg in opts:
53 | if opt in ('-l', '--language'):
54 | language = arg
55 | elif opt in ('-o', '--output'):
56 | output = arg
57 |
58 | if output:
59 | f = open(output, "w+")
60 | sys.stdout = f
61 | showprogress = False
62 |
63 | #if language code is specified only import that language
64 | if language and wpm_languages[language]:
65 | load_wpm_data(wpm_languages[language]['source'], language, settings, progress=showprogress, **wpm_languages[language]['initparams'])
66 | #else import all languages in the config file
67 | else:
68 | for langcode, langconfig in wpm_languages.iteritems():
69 | load_wpm_data(langconfig['source'], langcode, settings, progress=showprogress, **langconfig['initparams'])
70 |
71 |
--------------------------------------------------------------------------------
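
A sketch of invoking `load_wpm_data` directly, mirroring what the `__main__`
block above does for a single language (the path, language, host and port
values are placeholders taken from the example `conf/semanticizer.redis.yml`):

    # Hypothetical direct call; normally run as `python -m semanticizer.dbinsert`.
    from semanticizer.dbinsert.__main__ import load_wpm_data

    settings = {}  # the optional top-level `settings` section of the config
    load_wpm_data("redis", "en", settings,
                  progress=True,                 # print progress to stdout
                  path="./enwiki-20110722",      # Wikipedia Miner CSV directory
                  language="english",
                  host="localhost", port=6379)   # Redis connection parameters
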
/semanticizer/processors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/processors/__init__.py
--------------------------------------------------------------------------------
/semanticizer/processors/context.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import networkx
15 | from networkx.algorithms.centrality import degree_centrality
16 |
17 | from multiprocessing import Pool
18 |
19 | def pagerank_worker(graph, page_ranked):
20 | print "Pagerank on graph with %d nodes and %d edges." \
21 | % (len(graph.nodes()), \
22 | len(graph.edges()))
23 | for node in graph.nodes():
24 | page_ranked.setdefault(node, 1)
25 |
26 | from networkx.algorithms.link_analysis import pagerank
27 | from time import time
28 |
29 | try:
30 | start = time()
31 | page_ranked = pagerank(graph, max_iter=1000, nstart=page_ranked) # 0.2-1.5s for #node = 2500
32 | print "Pagerank took: %f seconds" % (time()-start)
33 | except ZeroDivisionError:
34 | print "ZeroDivisionError in pagerank"
35 |
36 | page_ranked_sorted = sorted(page_ranked.items(), key=lambda x: x[1], reverse=True)
37 | print page_ranked_sorted[:4]
38 |
39 | pool = Pool()
40 |
41 | class contextGraph:
42 | def __init__(self, label, threshold_function, threshold, min_t):
43 | self.graph = networkx.Graph()
44 | self.page_ranked = {}
45 | self.chunk = -1
46 | self.feature_label = "CONTEXT_" + label.upper()
47 |
48 | self.threshold_function = threshold_function
49 | self.threshold = threshold
50 | self.min_t = min_t
51 |
52 | def to_dict_of_dicts(self):
53 | return networkx.convert.to_dict_of_dicts(self.graph)
54 |
55 | def add_chunk(self):
56 | self.chunk += 1
57 | self.page_ranked.setdefault("[Chunk%d]" % self.chunk, 0)
58 | if self.chunk > 0:
59 | self.graph.add_edge("[Chunk%d]" % self.chunk, \
60 | "[Chunk%d]" % (self.chunk-1), t=self.chunk)
61 |
62 | def add_link(self, link):
63 | assert link.has_key("title")
64 | assert link.has_key(self.threshold_function)
65 | assert link.has_key("label")
66 |
67 | if link[self.threshold_function] < self.threshold: return
68 |
69 | label_text = "[%d-%s]" % (self.chunk, link["label"])
70 | self.page_ranked.setdefault(link["title"], 1)
71 | self.page_ranked.setdefault(label_text, 0)
72 | self.graph.add_edge(label_text, link["title"], t=self.chunk) # weight=senseProbability
73 | self.graph.add_edge(label_text, "[Chunk%d]" % self.chunk, t=self.chunk)
74 |
75 | def prepare_features(self):
76 | self.clean_graph(self.chunk-self.min_t)
77 |
78 | self.pagerank_result = pool.apply_async(pagerank_worker, (self.graph, self.page_ranked,))
79 |
80 | # def degree_centrality_worker():
81 | # self.degree_centralities = degree_centrality(self.graph)
82 | #
83 | # self.degree_centrality_thread = Thread(target=degree_centrality_worker)
84 | # self.degree_centrality_thread.start()
85 |
86 | self.degree_centrality_result = pool.apply_async(degree_centrality, (self.graph,))
87 |
88 | def compute_features(self, title):
89 | # self.degree_centrality_thread.join()
90 | # self.pagerank_thread.join()
91 | self.degree_centralities = self.degree_centrality_result.get()
92 | self.pagerank_result.wait()
93 |
94 | features = {}
95 | features[self.feature_label + "_DEGREE"] = 0
96 | features[self.feature_label + "_PAGERANK"] = 0
97 | features[self.feature_label + "_PAGERANK_NORMALIZED"] = 0
98 | features[self.feature_label + "_DEGREE_CENTRALITY"] = 0
99 | if title in self.page_ranked:
100 | features[self.feature_label + "_PAGERANK"] = self.page_ranked[title]
101 | features[self.feature_label + "_PAGERANK_NORMALIZED"] = \
102 | len(self.graph.nodes()) * self.page_ranked[title]
103 | if title in self.degree_centralities:
104 | features[self.feature_label + "_DEGREE"] = \
105 | self.graph.degree(title)
106 | features[self.feature_label + "_DEGREE_CENTRALITY"] = \
107 | self.degree_centralities[title]
108 | return features
109 |
110 | def clean_graph(self, min_t):
111 | # Remove edges with a t lower than min_t
112 | for edge in self.graph.edges():
113 | if self.graph[edge[0]][edge[1]]["t"] < min_t:
114 | self.graph.remove_edge(edge[0], edge[1])
115 | # Remove nodes that have become disconnected
116 | for node in self.graph.nodes():
117 | if self.graph.degree(node) == 0:
118 | self.graph.remove_node(node)
119 | del self.page_ranked[node]
120 |
121 | def pagerank(self):
122 | # from networkx.algorithms.link_analysis import pagerank_scipy
123 | # from networkx.algorithms.link_analysis import pagerank_numpy
124 | from networkx.algorithms.link_analysis import pagerank
125 | from time import time
126 | try:
127 | start = time()
128 | # pagerank(graph, max_iter=1000) # 1.7s for #nodes = 2500
129 |             self.page_ranked = pagerank(self.graph, max_iter=1000, nstart=self.page_ranked) # 0.2-1.5s for #nodes = 2500
130 | # pagerank_scipy(graph) # 1.0s for #nodes = 2500
131 | # pagerank_numpy(graph) # > 30s if #nodes > 1000
132 | print "Pagerank took: %f seconds" % (time()-start)
133 | except ZeroDivisionError:
134 | print "ZeroDivisionError in pagerank"
135 |
136 | page_ranked_sorted = sorted(self.page_ranked.items(), key=lambda x: x[1], reverse=True)
137 | print page_ranked_sorted[:4]
138 |
139 | # from networkx.algorithms.centrality import *
140 |
141 | # start = time()
142 | # degree_centrality = degree_centrality(graph) # 0.003s for 1500 nodes
143 | # print "Degree centrality took: %f seconds" % (time()-start)
144 | #
145 | # start = time()
146 | # closeness_centrality = closeness_centrality(graph) # 4s for 1500 nodes
147 | # print "Closeness centrality took: %f seconds" % (time()-start)
148 | #
149 | # start = time()
150 | # betweenness_centrality = betweenness_centrality(graph) # 18s for 1500 nodes
151 | # print "Betweenness centrality took: %f seconds" % (time()-start)
152 |
153 | return self.page_ranked
154 |
--------------------------------------------------------------------------------
/semanticizer/processors/core.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | class LinksProcessor:
15 |     '''A LinksProcessor takes a set of links, a text and a language code to
16 |     produce or process links. Processing is done in three steps: a
17 |     preprocessing step, a processing step and a postprocessing step.'''
18 |
19 | def preprocess(self, links, text, settings):
20 | return (links, text, settings)
21 |
22 | def process(self, links, text, settings):
23 | return (links, text, settings)
24 |
25 | def postprocess(self, links, text, settings):
26 | return (links, text, settings)
27 |
28 | def inspect(self):
29 | return {}
30 |
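# The three-phase contract described above can be illustrated with a minimal
# sketch of a custom processor; the class below is hypothetical and not part of
# this module:
#
#     class LowercaseLabelProcessor(LinksProcessor):
#         def process(self, links, text, settings):
#             for link in links:
#                 link["label"] = link["label"].lower()
#             return (links, text, settings)
#
# Every phase receives and returns the same (links, text, settings) tuple, so
# processors can be chained freely in a pipeline.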
31 | class SettingsProcessor(LinksProcessor):
32 | def __init__(self, settings):
33 | self.settings = settings
34 |
35 | def preprocess(self, links, text, settings):
36 | if "settings" in settings and settings["settings"] in self.settings:
37 | for k, v in self.settings[settings["settings"]].iteritems():
38 | if k not in settings:
39 | settings[k] = v
40 | del settings["settings"]
41 | return (links, text, settings)
42 | def inspect(self):
43 | return {self.__class__.__name__: self.settings}
44 |
45 | class FilterProcessor(LinksProcessor):
46 | def __init__(self):
47 | self.context_links = {}
48 |
49 | def preprocess(self, links, text, settings):
50 | if settings.has_key("prefilter"):
51 | links = self.filter_links(settings["prefilter"].split(","), links, settings)
52 |
53 | return (links, text, settings)
54 |
55 | def postprocess(self, links, text, settings):
56 | if "filter" in settings:
57 | links = self.filter_links(settings["filter"].split(","),
58 | links, settings)
59 |
60 | return (links, text, settings)
61 |
62 | def filter_links(self, filters, links, settings):
63 | filters_gte = [fltr.split(">=") for fltr in filters if ">=" in fltr]
64 | filters_gt = [fltr.split(">") for fltr in filters \
65 | if ">" in fltr and not ">=" in fltr]
66 |
67 | filter_unique = ("unique" in filters) and "context" in settings
68 |
69 | if len(filters_gte) == 0 and len(filters_gt) == 0 \
70 | and not filter_unique:
71 | return links
72 |
73 | filtered_links = []
74 | # Q: why do we not apply the gt filter if a gte filter fails?
75 | for link in links:
76 | skip = False
77 | for fltr in filters_gte:
78 | if not link[fltr[0]] >= float(fltr[1]):
79 | skip = True
80 | break
81 | else:
82 | for fltr in filters_gt:
83 | if not link[fltr[0]] > float(fltr[1]):
84 | skip = True
85 | break
86 |
87 | if filter_unique:
88 | self.context_links.setdefault(settings["context"], {})
89 | if link["title"] in self.context_links[settings["context"]]:
90 | skip = True
91 |
92 | if not skip:
93 | filtered_links.append(link)
94 |
95 | if filter_unique:
96 | self.context_links[settings["context"]][link["title"]] = link
97 |
98 | print "Filtered %d links to %d" % (len(links), len(filtered_links))
99 |
100 | return filtered_links
101 |
102 | def inspect(self):
103 | return {self.__class__.__name__: self.context_links}
--------------------------------------------------------------------------------
/semanticizer/processors/external.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | from Queue import Queue, Empty
15 | from threading import Thread
16 |
17 | import urllib2
18 | import datetime
19 | from datetime import timedelta
20 | import shelve
21 | import os
22 | from copy import deepcopy
23 |
24 | from .core import LinksProcessor
25 | from ..wpm.data import wpm_dumps
26 | from ..wpm.utils import get_relatedness
27 |
28 |
29 | class ArticlesProcessor(LinksProcessor):
30 | def __init__(self, langcodes, pickledir):
31 | self.langcodes = langcodes
32 | self.article_template = {
33 | "article_id": -1,
34 | "article_title": "",
35 | "Definition": "",
36 | "InLinks": [],
37 | "OutLinks": [],
38 | "Labels": [],
39 | "Images": [],
40 | "ParentCategories": []
41 | }
42 |
43 | def preprocess(self, links, text, settings):
44 | if not "article" in settings and not "features" in settings and not \
45 | "learning" in settings and "multi" not in settings:
46 | return (links, text, settings)
47 | if not settings["langcode"] in self.langcodes:
48 | return (links, text, settings)
49 |
50 | return (links, text, settings)
51 |
52 | def process(self, links, text, settings):
53 | if not "article" in settings and not "features" in settings and not \
54 | "learning" in settings:
55 | return (links, text, settings)
56 | if not settings["langcode"] in self.langcodes:
57 | return (links, text, settings)
58 |
59 | wpm = wpm_dumps[settings["langcode"]]
60 |
61 | if "article" in settings:
62 | parts = settings["article"].lower().split(',')
63 | else:
64 | parts = [key.lower() for key in self.article_template.keys()]
65 |
66 | titles = [link["title"] for link in links]
67 | ids = [link["id"] for link in links]
68 | articles = wpm.get_articles(*ids)
69 |
70 | for link, id, title, article in zip(links, ids, titles, articles):
71 |
72 | link.update(deepcopy(self.article_template))
73 |
74 | link["article_title"] = title
75 | link["article_id"] = id
76 |
77 | inlinks = article["InLinks"]
78 | if inlinks and (not parts or 'inlinks' in parts):
79 | if not parts or 'relatedness' in parts:
80 | for inlink in inlinks:
81 | title = wpm.get_item_title(inlink)
82 | relatedness = get_relatedness(inlinks, wpm.get_item_inlinks(inlink) )
83 | link["InLinks"].append( {"title":title, "id":int(inlink), "relatedness":relatedness} )
84 | else:
85 | link["InLinks"] = [{ "id":int(inlink) } for inlink in inlinks]
86 |
87 | outlinks = article["OutLinks"]
88 | if outlinks and (not parts or 'outlinks' in parts):
89 | if not parts or 'relatedness' in parts:
90 | for outlink in outlinks:
91 | title = wpm.get_item_title(outlink)
92 | relatedness = get_relatedness(outlinks, wpm.get_item_outlinks(outlink) )
93 | link["OutLinks"].append( {"title":title, "id":int(outlink), "relatedness":relatedness} )
94 | else:
95 | link["OutLinks"] = [{ "id":int(outlink) } for outlink in outlinks]
96 |
97 | if not parts or 'categories' in parts:
98 | categories = wpm.get_item_categories( link["article_id"] )
99 | if categories:
100 | for category in categories:
101 | title = wpm.get_item_title(category)
102 | link["ParentCategories"].append( {"title":title, "id":int(category)} )
103 |
104 | if not parts or 'definition' in parts:
105 | definition = wpm.get_item_definition(link["article_id"])
106 | if definition:
107 | link["Definition"] = definition
108 |
109 | if article["Labels"] and "labels" in parts:
110 | link["Labels"] = article["Labels"]
111 |
112 | return (links, text, settings)
113 |
114 | def postprocess(self, links, text, settings):
115 | if "article" in settings and len(settings["article"]) == 0:
116 | return (links, text, settings)
117 | remove = [key.lower() for key in self.article_template.keys()]
118 | remove.extend(["fromtitle", "fromredirect"])
119 | if "article" in settings:
120 | for label in settings["article"].replace(";", ",").split(","):
121 | if label.lower() in remove:
122 | remove.remove(label)
123 | for link in links:
124 | for label in link.keys():
125 | if label.lower() in remove:
126 | del link[label]
127 |
128 | return (links, text, settings)
129 |
130 |
131 | class StatisticsProcessor(LinksProcessor):
132 | def __init__(self, langcodes, num_of_threads, pickledir):
133 | self.num_of_threads = num_of_threads
134 | self.WIKIPEDIA_STATS_URL = {}
135 | self.wikipedia_statistics_cache = {}
136 | for langcode in langcodes:
137 | self.WIKIPEDIA_STATS_URL[langcode] = \
138 | "http://stats.grok.se/json/" \
139 | + langcode \
140 | + "/%d%02d/%s" # 201001/De%20Jakhalzen
141 |
142 | pickle_root = os.path.join(pickledir, langcode)
143 | if not os.path.isdir(pickle_root):
144 | os.makedirs(pickle_root)
145 | self.wikipedia_statistics_cache[langcode] = \
146 | shelve.open(os.path.join(pickle_root, \
147 | 'wikipedia_statistics_cache.db'))
148 | print "Loaded %d sets of statistics for %s from cache." \
149 | % (len(self.wikipedia_statistics_cache[langcode]), langcode)
150 |
151 | def inspect(self):
152 | return {self.__class__.__name__: self.WIKIPEDIA_STATS_URL}
153 |
154 | def preprocess(self, links, text, settings):
155 | if "wikistats" not in settings:
156 | return (links, text, settings)
157 |
158 | now = self.get_timestamp(settings)
159 |
160 | def worker():
161 | while True:
162 | try:
163 | (year, month, article) = queue.get_nowait()
164 | self.wikipedia_page_views(year, month,
165 | article, settings["langcode"])
166 | queue.task_done()
167 | except Empty:
168 | break
169 |
170 |         queue = self.queue = Queue()  # keep a reference so process() can join it
171 |         for article in set([link["title"] for link in links]):
172 | day = now
173 | for _ in range(14):
174 | queue.put((day.year, day.month, article))
175 |                 day += timedelta(days=-28)  # step back ~4 weeks per monthly request
176 |
177 | for _ in range(self.num_of_threads):
178 | t = Thread(target=worker)
179 | t.daemon = True
180 | t.start()
181 |
182 | def process(self, links, text, settings):
183 | if "wikistats" not in settings:
184 | return (links, text, settings)
185 |
186 | now = self.get_timestamp(settings)
187 |
188 | self.queue.join()
189 |
190 | for link in links:
191 | features = {"WIKISTATSDAY": 0,
192 | "WIKISTATSWK": 0,
193 | "WIKISTATS4WK": 0,
194 | "WIKISTATSYEAR": 0,
195 | "WIKISTATSDAYOFWK": 0,
196 | "WIKISTATSWKOF4WK": 0,
197 | "WIKISTATS4WKOFYEAR": 0
198 | }
199 |
200 | self.feature_WIKISTATSDAY(datetime, link["title"], features, now)
201 | self.feature_WIKISTATSWK(datetime, link["title"], features, now)
202 | self.feature_WIKISTATS4WK(datetime, link["title"], features, now)
203 | self.feature_WIKISTATSYEAR(datetime, link["title"], features, now)
204 | self.feature_WIKISTATSTRENDS(features)
205 |
206 | del features["WIKISTATSDAY"]
207 |
208 | link["features"].update(features)
209 |
210 | for langcode, cache in self.wikipedia_statistics_cache.iteritems():
211 | print "Saving %d sets of statistics for %s from cache." \
212 | % (len(cache), langcode)
213 | cache.sync()
214 |
215 | return (links, text, settings)
216 |
217 | def get_timestamp(self, settings):
218 | # Should be more robust against unexpected values
219 | if len(settings["wikistats"]) > 0:
220 | return datetime.datetime.fromtimestamp(int(settings["wikistats"]))
221 | else:
222 | return datetime.datetime.now()
223 |
224 | def wikipedia_page_views(self, year, month, article, langcode):
225 | url = self.WIKIPEDIA_STATS_URL[langcode] % (year, month, article)
226 | url = url.encode('utf-8')
227 | if url in self.wikipedia_statistics_cache[langcode]:
228 | resultJson = self.wikipedia_statistics_cache[langcode][url]
229 | else:
230 | try:
231 | request = urllib2.urlopen(url, timeout=1)
232 | resultJson = request.read()
233 | except urllib2.URLError:
234 | try:
235 | request = urllib2.urlopen(url)
236 | resultJson = request.read()
237 | except urllib2.URLError:
238 | request = urllib2.urlopen(url)
239 | resultJson = request.read()
240 |
241 | self.wikipedia_statistics_cache[langcode][url] = resultJson
242 |
243 | from json import loads
244 | result = loads(resultJson)
245 |
246 | return result
247 |
248 | def feature_WIKISTATSDAY(self, datetime, article, features, now):
249 | day = now
250 | day += timedelta(days=-1)
251 | monthly_views = self.wikipedia_page_views(day.year,
252 | day.month, article)
253 | views = monthly_views["daily_views"][self.date_format % \
254 | (day.year, day.month, day.day)]
255 | features["WIKISTATSDAY"] = views
256 |
257 | def feature_WIKISTATSWK(self, datetime, article, features, now):
258 | day = now
259 | for _ in range(7):
260 | day += timedelta(days=-1)
261 | monthly_views = self.wikipedia_page_views(day.year,
262 | day.month, article)
263 | views = \
264 | monthly_views["daily_views"][self.date_format % \
265 | (day.year, day.month, day.day)]
266 | features["WIKISTATSWK"] += views
267 |
268 | def feature_WIKISTATS4WK(self, datetime, article, features, now):
269 | day = now
270 | for _ in range(28):
271 | day += timedelta(days=-1)
272 | monthly_views = self.wikipedia_page_views(day.year,
273 | day.month, article)
274 | views = monthly_views["daily_views"][self.date_format % \
275 | (day.year, day.month, day.day)]
276 | features["WIKISTATS4WK"] += views
277 |
278 | def feature_WIKISTATSYEAR(self, datetime, article, features, now):
279 | day = now
280 | for _ in range(365):
281 | day += timedelta(days=-1)
282 | monthly_views = self.wikipedia_page_views(day.year,
283 | day.month, article)
284 | views = monthly_views["daily_views"][self.date_format % \
285 | (day.year, day.month, day.day)]
286 | features["WIKISTATSYEAR"] += views
287 |
288 | def feature_WIKISTATSTRENDS(self, features):
289 | if features["WIKISTATSWK"] > 0:
290 | features["WIKISTATSDAYOFWK"] = \
291 | float(features["WIKISTATSDAY"]) / features["WIKISTATSWK"]
292 | if features["WIKISTATS4WK"] > 0:
293 | features["WIKISTATSWKOF4WK"] = \
294 | float(features["WIKISTATSWK"]) / features["WIKISTATS4WK"]
295 | if features["WIKISTATSYEAR"] > 0:
296 | features["WIKISTATS4WKOFYEAR"] = \
297 | float(features["WIKISTATS4WK"]) / features["WIKISTATSYEAR"]
298 |
--------------------------------------------------------------------------------
/semanticizer/processors/feature.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import collections
15 | from math import log
16 | import cPickle as pickle
17 | import os
18 | import re
19 |
20 | from leven import levenshtein
21 |
22 | from . import stringUtils
23 | from ..wpm.data import wpm_dumps
24 |
25 | class anchorFeatures:
26 | def __init__(self, langcode):
27 | self.wpm = wpm_dumps[langcode]
28 | self.wikipediaArticleCount = int(self.wpm.get_stat("articleCount")) #970139
29 | self.wikipediaCategoryCount = int(self.wpm.get_stat("categoryCount")) #63108
30 |
31 | def feature_LEN(self, lnk):
32 | return len(re.findall(stringUtils.reTokenPattern, lnk["label"]))
33 |
34 | def feature_IDF_title(self, lnk):
35 | score = self.wpm.get_title_ngram_score(lnk["label"])
36 | if not score == None:
37 | in_title_count = int(score)
38 | else:
39 | in_title_count = 0
40 | return log(float(self.wikipediaArticleCount) / \
41 | (float(in_title_count) + 0.00001))
42 |
43 | def feature_IDF_anchor(self, lnk):
44 | return log(float(self.wikipediaArticleCount) / \
45 | (float(lnk["linkDocCount"]) + 0.00001))
46 |
47 | def feature_IDF_content(self, lnk):
48 | return log(float(self.wikipediaArticleCount) / \
49 | (float(lnk["docCount"]) + 0.00001))
50 |
51 | def feature_KEYPHRASENESS(self, lnk):
52 | return float(lnk["linkDocCount"]) / (float(lnk["docCount"]) + 0.00001)
53 |
54 | def feature_LINKPROB(self, lnk):
55 | return float(lnk["linkOccCount"]) / (float(lnk["occCount"]) + 0.00001)
56 |
57 | def feature_SNIL(self, lnk):
58 | SNIL = 0
59 |
60 | words = lnk["label"].split()
61 | for n in range(1, len(words) + 1):
62 |             for i in range(0, len(words) - n + 1):
63 | ngram = " ".join(words[i:i + n])
64 | if not self.wpm.get_item_id(ngram) == None:
65 | SNIL += 1
66 | return SNIL
67 |
68 | def feature_SNCL(self, lnk):
69 | SNCL = 0
70 |
71 | words = lnk["label"].split()
72 | for n in range(1, len(words) + 1):
73 |             for i in range(0, len(words) - n + 1):
74 | ngram = " ".join(words[i:i + n])
75 | score = self.wpm.get_title_ngram_score(ngram)
76 | if not score == None:
77 | SNCL += int(score)
78 | return SNCL
79 |
80 | def feature_NORMALIZATION(self, lnk):
81 | edit = levenshtein(unicode(lnk["label"]), unicode(lnk["text"]))
82 | return float(edit) / len(lnk["text"])
83 |
84 | def compute_anchor_features(self, lnk):
85 | return {'LEN': self.feature_LEN(lnk),
86 | 'IDF_title': self.feature_IDF_title(lnk),
87 | 'IDF_anchor': self.feature_IDF_anchor(lnk),
88 | 'IDF_content': self.feature_IDF_content(lnk),
89 | 'KEYPHRASENESS': self.feature_KEYPHRASENESS(lnk),
90 | 'LINKPROB': self.feature_LINKPROB(lnk),
91 | 'SNIL': self.feature_SNIL(lnk),
92 | 'SNCL': self.feature_SNCL(lnk),
93 | 'NORMALIZATION': self.feature_NORMALIZATION(lnk)
94 | }
95 |
96 |
97 | class articleFeatures:
98 | def __init__(self):
99 | self.re_non_word_chars = re.compile(r'(?u)\W+', re.UNICODE)
100 |
101 | def feature_INLINKS(self, lnk):
102 | if "InLinks" not in lnk:
103 | return 0
104 | return len(lnk["InLinks"])
105 |
106 | def feature_OUTLINKS(self, lnk):
107 | if "OutLinks" not in lnk:
108 | return 0
109 | return len(lnk["OutLinks"])
110 |
111 | def feature_REDIRECT(self, lnk):
112 | # Should be fromRedirect but bug in Wikipedia Miner
113 | if "fromTitle" in lnk and lnk["fromTitle"]:
114 | return 1
115 | return 0
116 |
117 | def feature_TF(self, lnk, re_label_text, features):
118 | aMatches = re.findall(re_label_text, lnk['title'])
119 | features["TF_title"] = float(len(aMatches))
120 |
121 | text = " "
122 | if "Definition" in lnk:
123 | if lnk["Definition"] and len(lnk["Definition"]):
124 | text = re.sub(r"<.*?>", "", lnk["Definition"])
125 | text = re.sub(r"^[|\- }]*", "", text)
126 |
127 | while len(text) and (text[0] == "."):
128 | text = text[1:].strip()
129 |
130 | # Very rarely articles do not have a Definition text (or a dummy one
131 | # like "----")
132 | if len(text) == 0:
133 | features["TF_sentence"] = 0
134 | features["TF_paragraph"] = 0
135 | features["POS_first_in_paragraph"] = 1
136 | else:
137 | # Sentence is first sentence
138 | sentence = text.split('.')[0]
139 |
140 | aMatches = re.findall(re_label_text, sentence)
141 | features["TF_sentence"] = float(len(aMatches))
142 |
143 | aMatches = re.findall(re_label_text, text)
144 | features["TF_paragraph"] = float(len(aMatches))
145 |
146 | if len(aMatches):
147 | features["POS_first_in_paragraph"] = \
148 | float(re.search(re_label_text, text).start())
149 | else:
150 | features["POS_first_in_paragraph"] = 1
151 |
152 | def feature_TITLE(self, lnk, re_label_text, features):
153 | label_text = unicode(lnk["label"])
154 |
155 | re_title = stringUtils.ngramToPattern(lnk['title'])
156 | article_title = unicode(lnk['title'])
157 |
158 | features["NCT"] = 0 if re.search(re_title, label_text) is None \
159 | else 1
160 |
161 | features["TCN"] = 0 \
162 | if re.search(re_label_text, article_title) is None else 1
163 |
164 | features["TEN"] = 1 if article_title == label_text else 0
165 |
166 | # Irritatingly enough, split() can give you empty values as last
167 | # element
168 | split_label = self.re_non_word_chars.split(label_text)
169 | if split_label[-1] == '':
170 | split_label.pop()
171 | split_title = self.re_non_word_chars.split(article_title)
172 | if split_title[-1] == '':
173 | split_title.pop()
174 |
175 |         # I: True if the title of the candidate begins with the query
176 | # (e.g. "Cambridge, Massachusetts" and "Cambridge" )
177 | features["SUBSTRING_MATCH_1"] = 1 \
178 | if split_title[0] == split_label[0] else 0
179 |
180 |         # II: True if the title of the candidate ends with the query
181 | # (e.g: "Venice-Simplon Orient Express" and "Orient Express")
182 | features["SUBSTRING_MATCH_2"] = 1 \
183 | if split_title[-1] == split_label[-1] else 0
184 |
185 | # collections.Counter() converts an array to a dict of words
186 | # and their frequencies
187 | cSplitLabel = collections.Counter(split_label)
188 | cSplitTitle = collections.Counter(split_title)
189 |
190 | # Number of shared words between the title of the candidate and
191 | # the query
192 | features['WORD_MATCH'] = len(list(cSplitLabel & cSplitTitle))
193 |
194 | # Number of different words between the title of the candidate
195 | # and the query
196 | features['WORD_MISS'] = len(split_label) + len(split_title) \
197 | - (2 * features['WORD_MATCH'])
198 |
199 | # Levenshtein distance between query and title of the candidate
200 | features["EDIT_DISTANCE"] = levenshtein(label_text, article_title)
201 |
202 | def feature_COMMONNESS(self, lnk, features):
203 | features["COMMONNESS"] = lnk["priorProbability"]
204 |
205 | def compute_article_features(self, lnk):
206 | features = {
207 | 'INLINKS': self.feature_INLINKS(lnk),
208 | 'OUTLINKS': self.feature_OUTLINKS(lnk),
209 | 'REDIRECT': self.feature_REDIRECT(lnk)
210 | }
211 |
212 | re_label_text = stringUtils.ngramToPattern(lnk["label"])
213 |
214 | self.feature_TF(lnk, re_label_text, features)
215 | self.feature_TITLE(lnk, re_label_text, features)
216 | self.feature_COMMONNESS(lnk, features)
217 |
218 | return features
219 |
220 | ### TK: I have built some extra features that look at how often
221 | ### inlink anchors and inlink/outlink titles occur in the
222 | ### reference text, and at the so-called apposition in the title
223 | ### ('actress' in 'Sue Johnson (actress)')
224 | ###
225 | ### 'NR_OF_MATCHING_INLINK_ANCHORS', 'NR_OF_MATCHING_INLINK_TITLES',
226 | ### 'NR_OF_MATCHING_OUTLINK_TITLES', 'APOSITION'
227 | ###
228 | ### Those are not so easy to add here right now, because we have no
229 | ### access to the reference text at this point. But David's features
230 | ### are certainly going to need that as well!
231 | ###
232 | ### Anyway, I have left them out for now...
233 |
234 | if __name__ == "__main__":
235 | # Some settings
236 | langcode = "en"
237 | wikipediaminer_root = '/zfs/ilps-plexer/wikipediaminer/enwiki-20111007/'
238 | pickledir = "/Users/evertlammerts/semanticizer/pickles/"
239 |
240 | # Test data
241 | link = {"label": "Alabama",
242 |             "linkDocCount": 10, # None of these values make any sense,
243 |             "docCount": 20, # of course, but fine...
244 | "linkOccCount": 100,
245 | "occCount": 200,
246 | "commonness": 0.12345
247 | }
248 |
249 | # Article
250 |     article_url = '' # Not used at the moment
251 | fh_article_xml = open("unitTest.article.xml", "r")
252 | article_xml = fh_article_xml.read()
253 | fh_article_xml.close()
254 | article = ElementTree.fromstring(article_xml).find("Response")
255 |
256 | # Initialize the objects
257 | print "Initializing anchor features"
258 | anchor_features = anchorFeatures(langcode)
259 | print "Initializing concept features"
260 | concept_features = conceptFeatures(langcode, wikipediaminer_root,
261 | article_url)
262 | print "Initializing anchor/concept features"
263 | anchor_concept_features = anchorConceptFeatures()
264 | print "Initializing statistics features"
265 | statistics_features = statisticsFeatures(langcode)
266 |
267 | print "Start calculating"
268 | test_features = {
269 | "anchor": anchor_features.compute_anchor_features(link),
270 | "concept": concept_features.compute_concept_features(article),
271 | "anchor_concept": \
272 | anchor_concept_features.compute_anchor_concept_features(link, article),
273 | "statistics": statistics_features.compute_statistics_features(article),
274 | }
275 |
276 | print "%s" % test_features
277 |
--------------------------------------------------------------------------------
/semanticizer/processors/features.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | from collections import defaultdict
15 |
16 | from . import feature as features
17 | from . import context
18 |
19 | from .core import LinksProcessor
20 |
21 | class FeaturesProcessor(LinksProcessor):
22 | def __init__(self, langcodes):
23 | self.features = {}
24 | for langcode in langcodes:
25 | self.features[langcode] = features.anchorFeatures(langcode)
26 |
27 | def process(self, links, text, settings):
28 | if not "features" in settings and not "learning" in settings:
29 | return (links, text, settings)
30 | if not settings["langcode"] in self.features:
31 | return (links, text, settings)
32 |
33 | featuresets = self.features[settings["langcode"]]
34 |
35 | for link in links:
36 | link.setdefault("features", {})
37 | link["features"].update(featuresets.compute_anchor_features(link))
38 |
39 | return (links, text, settings)
40 |
41 | def inspect(self):
42 | return {self.__class__.__name__: self.features.keys()}
43 |
44 |
45 | class ArticleFeaturesProcessor(LinksProcessor):
46 | def __init__(self):
47 | self.features = features.articleFeatures()
48 |
49 | def process(self, links, text, settings):
50 | if not "features" in settings and not "learning" in settings:
51 | return (links, text, settings)
52 | # Check if ArticleProcessor has run
53 |
54 | for link in links:
55 | link.setdefault("features", {})
56 | link["features"].update(
57 | self.features.compute_article_features(link)
58 | )
59 |
60 | return (links, text, settings)
61 |
62 | def inspect(self):
63 | return {self.__class__.__name__: str(self.features)}
64 |
65 |
66 | class ContextFeaturesProcessor(LinksProcessor):
67 | def __init__(self):
68 | self.context_features = {}
69 | self.context_text = defaultdict(list)
70 | self.context_id_pattern = "%s:%d"
71 |
72 | def new_context(self, context_label):
73 | self.context_features[context_label] = {
74 | "SP0.2-100": context.contextGraph("SP0.2-100", "senseProbability",
75 | 0.2, 100)
76 | }
77 |
78 | def preprocess(self, links, text, settings):
79 | if "context" in settings:
80 | settings["context_id"] = self.context_id_pattern % \
81 | (settings["context"], len(self.context_text[settings["context"]]))
82 | self.context_text[settings["context"]].append(text)
83 |
84 | return (links, text, settings)
85 |
86 | def process(self, links, text, settings):
87 | if not "context" in settings or "skip_context_features" in settings or \
88 | (not "features" in settings and not "learning" in settings):
89 | return (links, text, settings)
90 |
91 | # Create context_features if it does not exist
92 | if settings["context"] not in self.context_features:
93 | self.new_context(settings["context"])
94 |
95 | # For each set of context features
96 | for label in self.context_features[settings["context"]]:
97 | # Create a new chunk
98 | self.context_features[settings["context"]][label].add_chunk()
99 | graph = self.context_features[settings["context"]][label]
100 | # Add each link to graph and prepare features
101 | for link in links:
102 | graph.add_link(link)
103 | graph.prepare_features()
104 |
105 | # Compute context features for each link
106 | for link in links:
107 | link["features"].update(graph.compute_features(link["title"]))
108 |
109 | return (links, text, settings)
110 |
111 | def inspect(self):
112 | context = {}
113 | for context_label, features in self.context_features.iteritems():
114 | context[context_label] = {"text": self.context_text[context_label]}
115 | for label, context_graph in features.iteritems():
116 | graph = {"page_ranked": context_graph.page_ranked,
117 | "graph": context_graph.to_dict_of_dicts(),
118 | "chunk": context_graph.chunk}
119 | context[context_label][label] = graph
120 |
121 | return {self.__class__.__name__: context}
122 |
--------------------------------------------------------------------------------
/semanticizer/processors/image.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | from Queue import Queue, Empty
15 | from threading import Thread
16 |
17 | import urllib2, re
18 |
19 | from .core import LinksProcessor
20 |
21 | class AddImageProcessor(LinksProcessor):
22 | def postprocess(self, links, text, settings):
23 | if "image" in settings and "langcode" in settings:
24 | links = add_image_url(links, settings["langcode"])
25 | return (links, text, settings)
26 |
27 | image_url_cache = {}
28 |
29 | def add_image_url(links, langcode):
30 | urls = [link["url"].replace(".wikipedia.org/", ".m.wikipedia.org/") \
31 | for link in links]
32 |
33 | print "Getting images for %d Wikipedia pages" % len(urls)
34 | get_image_urls(urls)
35 | for link, url in zip(links, urls):
36 | if url in image_url_cache:
37 | print link["title"], "->", image_url_cache[url]
38 | link["image_url"] = image_url_cache[url]
39 |
40 | return links
41 |
42 | IMG_DIMENSION_PATTERN = '<img [^>]*width="(\d+)"[^>]*height="(\d+)"'  # width/height attributes of an <img> tag
43 | IMG_URL_PATTERN = '<img [^>]*src="([^"]+)"'  # src attribute of an <img> tag
44 | 
45 | BLACKLISTED_IMAGE_URLS = ('http://upload.wikimedia.org/wikipedia/en/f/f4/Ambox_content.png',
46 | 'http://upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png',
47 | 'http://upload.wikimedia.org/wikipedia/en/thumb/f/f2/Edit-clear.svg/40px-Edit-clear.svg.png',
48 | 'http://upload.wikimedia.org/wikipedia/commons/thumb/f/f8/Wiktionary-logo-en.svg/37px-Wiktionary-logo-en.svg.png',
49 | 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/a4/Text_document_with_red_question_mark.svg/40px-Text_document_with_red_question_mark.svg.png')
50 |
51 | def convert_image_url(image, url):
52 | if image.startswith("//"):
53 | image = "http:" + image
54 | elif image.startswith("/"):
55 | image = "http://" + url.split("/")[2] + image
56 | return image
57 |
58 | def get_image_urls(urls, num_of_threads=8, min_dimension=36):
59 | def worker():
60 | while True:
61 | try:
62 | url = queue.get_nowait()
63 | try:
64 | page = urllib2.urlopen(url, timeout=1).read()
65 | except:
66 | page = ""
67 |                 images = re.findall("<img[^>]*>", page)
68 |
69 | # Filter Wikipedia images
70 | images = [img for img in images if " id=" not in img \
71 | and " title=" not in img]
72 | image = None
73 | for img in images:
74 | match = re.match(IMG_DIMENSION_PATTERN, img)
75 | if match == None: continue
76 | dimension = max([int(value) for value in match.groups()])
77 | if dimension >= min_dimension: # Do not use fallback: or image == None:
78 | match = re.match(IMG_URL_PATTERN, img)
79 | if match != None and len(match.groups()) > 0:
80 |                             image_url = convert_image_url(match.groups()[0], url)
81 | if image_url in BLACKLISTED_IMAGE_URLS: continue
82 | image = image_url
83 | # if dimension >= min_dimension:
84 | break
85 |
86 | image_url_cache[url] = image
87 |
88 | queue.task_done()
89 | except Empty:
90 | break
91 |
92 | queue = Queue()
93 | for url in urls:
94 | queue.put(url)
95 |
96 | for i in range(min(num_of_threads, len(urls))):
97 | t = Thread(target=worker)
98 | t.daemon = True
99 | t.start()
100 |
101 | queue.join()
102 |
--------------------------------------------------------------------------------
/semanticizer/processors/multiple.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import collections
15 |
16 | from .core import LinksProcessor
17 |
18 |
19 | class MultipleEntityFeaturesProcessor(LinksProcessor):
20 |
21 | def process(self, links, text, settings):
22 | self.link_dict = {}
23 | self.labels = []
24 |
25 | if 'multi' not in settings:
26 | return (links, text, settings)
27 |
28 | # First run through links to fill dict
29 | for link in links:
30 | self.link_dict.setdefault(link['id'], []) \
31 | .append([link['label'], link['senseProbability'],
32 | link['priorProbability'], link['linkProbability']])
33 | self.labels.append(link['label'])
34 | link['features'] = {}
35 |
36 | # Second run to calculate features
37 | for link in links:
38 | if 'tier1' in settings['multi']:
39 | features = self.FEATURE_tier_one_overlap(link, self.labels)
40 | link['features'].update(features)
41 | if 'outlinks' in settings['multi']:
42 | features = self.FEATURE_linked_entity_overlap(link['label'],
43 | link['OutLinks'],
44 | 'outlinks')
45 | link['features'].update(features)
46 | if 'inlinks' in settings['multi']:
47 | features = self.FEATURE_linked_entity_overlap(link['label'],
48 | link['InLinks'],
49 | 'inlinks')
50 | link['features'].update(features)
51 |
52 | return (links, text, settings)
53 |
54 | def FEATURE_tier_one_overlap(self, link, labels):
55 | """
56 | Perform simple 'list intersect'
57 | To find matching labels of candidate
58 | """
59 |
60 | tier_one = [link['title']] + [label['title'] for label in \
61 | link['Labels']]
62 | tier_one = [(anchor, link['id']) for anchor in \
63 | list((collections.Counter(tier_one) & \
64 | collections.Counter(self.labels)).elements())]
65 |
66 | return_list = []
67 | for l, i in tier_one:
68 |
69 | if i in self.link_dict:
70 | for label, senseProb, priorProb, cmns in self.link_dict[i]:
71 |                     if label == l:
72 | return_list.append((l, i, senseProb, priorProb, cmns))
73 | if return_list:
74 | return self.calculate_features(return_list, 1, 'tier_one')
75 |
76 | else:
77 | return {}
78 |
79 | def FEATURE_linked_entity_overlap(self, current_label, linked_entities,
80 | features):
81 | """
82 | IN: json of {in,out}-link_ids
83 | Check if they occur in doc dict
84 | if they do, see if they are referred to
85 | by a different label.
86 | """
87 |
88 | # Find stuff
89 | result_list = []
90 | for link in linked_entities:
91 | if str(link['id']) in self.link_dict:
92 | link_label = self.link_dict[str(link['id'])]
93 | for sub_link in link_label:
94 | if current_label != sub_link[0]:
95 | result_list.append((sub_link[0], link['id'],
96 | sub_link[1], sub_link[2],
97 | sub_link[3]))
98 | # Calculate features
99 | if result_list:
100 | return self.calculate_features(result_list, len(linked_entities),
101 | features)
102 | else:
103 | return {}
104 |
105 | def calculate_features(self, results, max_entities, features):
106 | """
107 | Given result list in format:
108 | label, wiki_id, senseProb, priorProb, commonness
109 | 'Unzip' lists and create feature vectors.
110 | """
111 |
112 | label_list, id_list, sense_list, prior_list, cmns_list = \
113 | ([l for l, w, s, p, c in results],
114 | [w for l, w, s, p, c in results],
115 | [s for l, w, s, p, c in results],
116 | [p for l, w, s, p, c in results],
117 | [c for l, w, s, p, c in results])
118 |
119 | if features == 'outlinks':
120 | PREFIX = 'ME_OUT_'
121 | elif features == 'inlinks':
122 | PREFIX = 'ME_IN_'
123 | elif features == 'tier_one':
124 | PREFIX = 'ME_T1_'
125 |
126 | return {PREFIX + 'label_overlap': len(label_list),
127 | PREFIX + 'label_unique': len(set(label_list)),
128 | PREFIX + 'entity_overlap': len(id_list),
129 | PREFIX + 'entity_unique': len(set(id_list)),
130 | PREFIX + 'entity_proportion': float(len(set(id_list))) / \
131 | float(max_entities),
132 | PREFIX + 'sense_prob_sum': sum(sense_list),
133 | PREFIX + 'prior_prob_sum': sum(prior_list),
134 | PREFIX + 'cmns_sum': sum(cmns_list)}
135 |
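# Illustrative example (made-up values) of the result format consumed by
# calculate_features: given
#
#     results = [("obama", 534366, 0.9, 0.8, 0.7),
#                ("barack obama", 534366, 0.6, 0.5, 0.4)]
#
# and features == 'tier_one', the returned dict would contain
# 'ME_T1_label_overlap': 2, 'ME_T1_entity_unique': 1, and the summed
# probability features 'ME_T1_sense_prob_sum', 'ME_T1_prior_prob_sum' and
# 'ME_T1_cmns_sum'.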
--------------------------------------------------------------------------------
/semanticizer/processors/semanticize.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | from nltk.util import ngrams as nltk_ngrams
15 | import re
16 | import urllib
17 |
18 | from ..wpm import utils as wpmutil
19 | from ..wpm.data import wpm_dumps
20 |
21 | tokenize = re.compile(r'\w+(?:[.,\']\w+)*|[^\w\s]+',
22 | re.UNICODE | re.MULTILINE | re.DOTALL).findall
23 |
24 |
25 | class Semanticizer:
26 | def __init__(self, language_code, sense_probability_threshold,
27 | max_ngram_length=None, debug=False):
28 | """constructor"""
29 | self.language_code = language_code
30 | self.sense_probability_threshold = sense_probability_threshold
31 | self.wikipedia_url_template = 'http://%s.wikipedia.org/wiki/%s'
32 | self.wpm = wpm_dumps[language_code]
33 | self.title_page = {} # This needs to be removed
34 | self.max_ngram_length = max_ngram_length
35 | self.debug = debug
36 |
37 |     def semanticize(self, sentence, normalize_dash=True,
38 |                     normalize_accents=True, normalize_lower=False,
39 |                     translations=True, counts=False,
40 |                     largest_matching=False, lower_confidence_bound=False,
41 |                     sense_probability_threshold=None):
42 | if sense_probability_threshold == None:
43 | sense_probability_threshold = self.sense_probability_threshold
44 | result = {"links": []}
45 | ngrams = set()
46 | token_lists = [tokenize(sentence),
47 | tokenize(sentence.replace('-', ' ')),
48 | tokenize(sentence.replace('.', ' ')),
49 | tokenize(sentence.replace('.', ''))]
50 |
51 | # get all ngrams for this sentence, limit to max_ngram_length
52 | # if applicable
53 | for token_list in token_lists:
54 | max_len = len(token_list) + 1
55 | if self.max_ngram_length is not None:
56 | max_len = min(max_len, self.max_ngram_length)
57 |
58 | for n in range(1, max_len):
59 | for ngram in nltk_ngrams(token_list, n):
60 | ngrams.add(' '.join(ngram))
61 |
62 | normal_ngrams = map(wpmutil.normalize, ngrams)
63 | exist = self.wpm.normalized_entities_exist(normal_ngrams)
64 |
65 | for i, (ngram, normal_ngram) in enumerate(zip(ngrams, normal_ngrams)):
66 | if exist[i]:
67 | normalized_ngram = wpmutil.normalize(ngram, normalize_dash,
68 | normalize_accents,
69 | normalize_lower)
70 | anchors = self.wpm.get_all_entities(normal_ngram)
71 | for anchor in anchors:
72 | normalized_anchor = wpmutil.normalize(anchor, normalize_dash,
73 | normalize_accents,
74 | normalize_lower)
75 | if normalized_ngram == normalized_anchor:
76 | if self.debug and not self.wpm.entity_exists(anchor):
77 | raise LookupError("Data corrupted, cannot "
78 | + "find %s in the database" \
79 | % anchor)
80 | entity = self.wpm.get_entity_data(anchor)
81 | senses = [(sense, self.wpm.get_sense_data(anchor, str(sense))) for sense in entity['senses']]
82 | if largest_matching: senses = sorted(senses, key=lambda (_, d): -d['cntlinkdoc'])[:1]
83 | for sense, sense_data in senses:
84 | if sense_data:
85 | if entity['cnttextocc'] == 0:
86 | link_probability = 0
87 | sense_probability = 0
88 | else:
89 | link_probability = float(entity['cntlinkdoc']) / entity['cnttextdoc']
90 | sense_probability = float(sense_data['cntlinkdoc']) / entity['cnttextdoc']
91 | if sense_probability > sense_probability_threshold:
92 | title = unicode(self.wpm.get_item_title(str(sense)))
93 | url = self.wikipedia_url_template \
94 | % (self.language_code,
95 | urllib.quote(title.encode('utf-8')))
96 | if entity['cntlinkocc'] == 0:
97 | prior_probability = 0
98 | else:
99 | prior_probability = float(sense_data['cntlinkocc']) / entity['cntlinkocc']
100 | link = {
101 | "label": anchor,
102 | "text": ngram,
103 | "title": title,
104 | "id": sense,
105 | "url": url,
106 | "linkProbability": link_probability,
107 | "senseProbability": sense_probability,
108 | "priorProbability": prior_probability
109 | }
110 | if translations:
111 | link["translations"] = {self.language_code:
112 | {"title": title,
113 | "url": url}}
114 | if self.wpm.sense_has_trnsl(str(sense)):
115 | for lang in self.wpm.get_trnsl_langs(str(sense)):
116 | trnsl = self.wpm.get_sense_trnsl(str(sense), lang)
117 | link["translations"][lang] = {
118 | 'title': unicode(trnsl),
119 | 'url': self.wikipedia_url_template % (lang, urllib.quote(unicode(trnsl).encode('utf-8')))
120 | }
121 | if counts:
122 | link["occCount"] = entity['cnttextocc']
123 | link["docCount"] = entity['cnttextdoc']
124 | link["linkOccCount"] = entity['cntlinkocc']
125 | link["linkDocCount"] = entity['cntlinkdoc']
126 | link["senseOccCount"] = int(sense_data['cntlinkocc'])
127 | link["senseDocCount"] = int(sense_data['cntlinkdoc'])
128 | link['fromTitle'] = sense_data['from_title']
129 | link['fromRedirect'] = sense_data['from_redir']
130 | result["links"].append(link)
131 |
132 | if largest_matching:
133 | available_text = wpmutil.normalize(sentence, normalize_dash, normalize_accents, normalize_lower)
134 | for link in sorted(result["links"], key=lambda link: -link["priorProbability"]/2-len(link["label"])):
135 | normalized_label = wpmutil.normalize(link["label"], normalize_dash, normalize_accents, normalize_lower)
136 | if normalized_label in available_text:
137 | available_text = available_text.replace(normalized_label, "")
138 | else: result["links"].remove(link)
139 | return result
140 |
--------------------------------------------------------------------------------
/semanticizer/processors/semanticizer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | """
15 | The Processor wrapping Semanticizer
16 | """
17 | from nltk.tokenize.punkt import PunktSentenceTokenizer
18 |
19 | from .core import LinksProcessor
20 | from .semanticize import Semanticizer
21 |
22 |
23 | class SemanticizeProcessor(LinksProcessor):
24 | """Processor handling the semanticizing"""
25 |
26 | def __init__(self, debug=False):
27 | """Set the class variables"""
28 | self.langcodes = []
29 | self.semanticizers = {}
30 | self.debug = debug
31 |
32 | def load_languages(self, langcodes, max_ngram_length=None):
33 | """Save the languages and load the semanticizer"""
34 | self.langcodes = langcodes
35 | for langcode in langcodes:
36 | self.semanticizers[langcode] = Semanticizer(langcode, None,
37 | max_ngram_length,
38 | self.debug)
39 |
40 | def preprocess(self, links, text, settings):
41 | """
42 | Semanticize the given text and return the links, text, and
43 | settings.
44 | """
45 | links = []
46 | if "langcode" in settings and settings["langcode"] in self.semanticizers:
47 | translations = "translations" in settings
48 | normalize_dash = not("normalize" in settings and \
49 | not "dash" in settings["normalize"])
50 | normalize_accents = not("normalize" in settings and \
51 | not "accents" in settings["normalize"])
52 | normalize_lower = "normalize" in settings and \
53 | "lower" in settings["normalize"]
54 | lower_confidence_bound = "lowerConfidenceBound" in settings
55 | largest_matching = "largestMatching" in settings
56 |
57 | if "split_sentences" in settings:
58 | sentences = PunktSentenceTokenizer().tokenize(text)
59 | else:
60 | sentences = [text]
61 |
62 | sem = self.semanticizers[settings["langcode"]]
63 | for sentence in sentences:
64 | results = sem.semanticize(sentence, counts=True,
65 | normalize_dash=normalize_dash,
66 | normalize_accents=normalize_accents,
67 | normalize_lower=normalize_lower,
68 | largest_matching=largest_matching,
69 | lower_confidence_bound=lower_confidence_bound,
70 | translations=translations,
71 | sense_probability_threshold=-1)
72 |
73 | links.extend(results["links"])
74 |
75 | return (links, text, settings)
76 |
77 | def postprocess(self, links, text, settings):
78 | """
79 | Remove counts from links
80 | @todo: why do this here? In Semanticizer.semanticize there's already \
81 | a check being done on whether counts should be included.
82 | """
83 | if not "counts" in settings:
84 | for link in links:
85 | for key in link.keys():
86 | if key.endswith("Count"):
87 | del link[key]
88 |
89 | return (links, text, settings)
90 |
91 | def inspect(self):
92 | """Return the loaded languages"""
93 | return {self.__class__.__name__: self.langcodes}
94 |
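# Illustrative usage sketch (assumes the Wikipedia Miner dump for "en" has been
# loaded into wpm_dumps):
#
#     processor = SemanticizeProcessor()
#     processor.load_languages(["en"])
#     links, text, settings = processor.preprocess(
#         [], u"Amsterdam is the capital of the Netherlands.",
#         {"langcode": "en"})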
--------------------------------------------------------------------------------
/semanticizer/processors/stringUtils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
4 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
5 | # General Public License as published by the Free Software Foundation, either
6 | # version 3 of the License, or (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful, but WITHOUT
9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
11 | # for more details.
12 | #
13 | # You should have received a copy of the GNU Lesser General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 |
16 | import re
17 |
18 | # Could also be done by hand...: (\A|\s|\'|"|\.|\,|:|;|!|\?)
19 | # (?=(\s|\'|"|\.|\,|:|;|!|\?|\'s|\Z)
20 | # reNonWordChars = re.compile('(?u)\W+', re.UNICODE)
21 |
22 | # We took the reg exp from scikit-learn:
23 | # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py
24 | reTokenPattern = re.compile(r"(?u)\b\w\w+\b", re.UNICODE)
25 |
26 | def ngramToPattern(sNgram):
27 | return ngramsToPattern([sNgram])
28 |
29 | def ngramsToPattern(aNgrams):
30 | #import sys
31 | #print >> sys.stderr, "n-grams: '%s'" % aNgrams
32 | try:
33 | # So this reads, inside out:
34 | # Replace all white space by a single space and re.escape that.
35 | # Replace the (by now escaped) spaces by '\s+'s and join the different
36 | # n-grams by pipes ('|')
37 | #
38 | sNgrams = '|'.join([re.escape(re.sub('\s+', ' ', x)).replace('\\ ', '\s+') for x in aNgrams])
39 | reNgrams = re.compile('((\A|\W)(' + sNgrams + ')(?=\W|\Z))',
40 | flags=re.UNICODE|re.IGNORECASE)
41 | except OverflowError:
42 | # Some articles have such a ridiculous number of inlink anchors that
43 | # the regular expression gets too big.
44 | # This doesn't happen if we make it a bit stricter....
45 | # So, if that happens we make the same expression but we do not replace
46 | # the spaces by \s+'s
47 | sNgrams = '|'.join([re.escape(re.sub('\s+', ' ', x)) for x in aNgrams])
48 | reNgrams = re.compile('((\A|\W)(' + sNgrams + ')(?=\W|\Z))',
49 | flags=re.UNICODE|re.IGNORECASE)
50 | return reNgrams
51 |
52 | # For one word
53 | def findNgramInText(sNgram, sText):
54 | return findNgramsInText([sNgram], sText)
55 |
56 | # For several words
57 | def findNgramsInText(aNgrams, sText):
58 | # A check beforehand because an empty array will lead to a pattern that
59 | # matches empty lines, double spaces, etc....
60 | if len(aNgrams) == 0:
61 | return []
62 | return re.findall(ngramsToPattern(aNgrams), sText)
63 |
64 | if __name__ == "__main__":
65 | sText = u"aap noot mies\nwim jüf duif “Noot” roos ühalloü"
66 |
67 | aMatches = findNgramInText(u'aap', sText)
68 | print "%s" % aMatches
69 |
70 | aMatches = findNgramInText(u'hallo', sText)
71 | print "%s" % aMatches
72 |
73 | aMatches = findNgramsInText([u'mies wim', u'noot'], sText)
74 | print "%s" % aMatches
75 |
--------------------------------------------------------------------------------
/semanticizer/processors/util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import os, yaml
15 | import sklearn.metrics
16 | import sklearn.externals.joblib
17 |
18 | def compute_metrics(labels, scores, threshold=0.5):
19 | metrics = {}
20 | # Sort according to score
21 | scores, labels = zip(*sorted(zip(scores, labels)))
22 | predictions = [score >= threshold for score in scores]
23 | # Classification metrics
24 | metrics["precision"], metrics["recall"], metrics["f1"], support = \
25 | sklearn.metrics.precision_recall_fscore_support(labels, predictions, \
26 | average="weighted")
27 | metrics["accuracy"] = sklearn.metrics.accuracy_score(labels, predictions)
28 | metrics["zeroOneLoss"] = sklearn.metrics.zero_one_loss(labels, predictions)
29 | # Rank-based metrics
30 | metrics["averagePrecision"] = \
31 | sklearn.metrics.average_precision_score(labels, scores)
32 | metrics["ROC AUC"] = sklearn.metrics.roc_auc_score(labels, scores)
33 | # R-precision
34 | r_labels = labels[-support:]
35 | r_predictions = [True for label in r_labels]
36 | metrics["rPrecision"] = \
37 | sklearn.metrics.precision_score(r_labels, r_predictions)
38 | return metrics
39 |
40 | class ModelStore():
41 | def __init__(self, model_dir):
42 | self.model_dir = model_dir
43 | self.model_cache = {}
44 |
45 | def load_model(self, modelname):
46 | if modelname.endswith(".pkl"):
47 | return self.load_model(modelname[:-4])
48 |
49 | if modelname in self.model_cache:
50 | return self.model_cache[modelname]
51 |
52 | modelfile = os.path.join(self.model_dir, modelname)
53 | model = sklearn.externals.joblib.load(modelfile + ".pkl")
54 |
55 | description = {"name": modelname, "source": modelfile + ".pkl"}
56 | if os.path.exists(modelfile + ".yaml"):
57 | description.update(yaml.load(file(modelfile + ".yaml")))
58 |
59 | if os.path.exists(modelfile + ".preprocessor.pkl"):
60 | preprocessor = sklearn.externals.joblib.load(modelfile + \
61 | ".preprocessor.pkl")
62 | else:
63 | preprocessor = None
64 |
65 | self.model_cache[modelname] = (model, description, preprocessor)
66 | return (model, description, preprocessor)
67 |
68 | def save_model(self, model, modelname, description=None, preprocessor=None):
69 | if modelname.endswith(".pkl"):
70 | modelname = modelname[:-4]
71 |
72 | modelfile = os.path.join(self.model_dir, modelname)
73 | sklearn.externals.joblib.dump(model, modelfile + ".pkl")
74 |
75 | if preprocessor:
76 | sklearn.externals.joblib.dump(preprocessor, \
77 | modelfile + ".preprocessor.pkl")
78 |
79 | if description != None:
80 | with open(modelfile + ".yaml", 'w') as out:
81 | out.write(yaml.dump(description))
82 | else:
83 | description = {}
84 |
85 | description.update({"name": modelname, "source": modelfile + ".pkl"})
86 | self.model_cache[modelname] = (model, description, preprocessor)
87 |
88 | def _convert_dict(self, data, skip=[]):
89 | """Helper function that convert the values of dictionary to int/float.
90 | Optionally you can skip a list of values."""
91 | converted_data = {}
92 | for k,v in data.iteritems():
93 | if k in skip: continue
94 | try:
95 | converted_data[k] = int("".join(v))
96 | except ValueError:
97 | try:
98 | converted_data[k] = float("".join(v))
99 | except ValueError:
100 | converted_data[k] = v
101 | return converted_data
102 |
103 | def create_model(self, settings, skip_settings=[]):
104 | if not "classifier" in settings:
105 | raise ValueError("Expecting a classifier in settings.")
106 | if not "." in settings["classifier"]:
107 | raise ValueError("Expecting a package in classifier settings.")
108 |
109 | classifier = settings["classifier"].split(".")[-1]
110 | package = ".".join(settings["classifier"].split(".")[:-1])
111 |
112 | preprocessor_settings = dict([(key, value) for key, value \
113 | in settings.iteritems() \
114 | if key.startswith("preprocessor.")])
115 |
116 | skip_settings.extend(["classifier", "preprocessor"])
117 | skip_settings.extend(preprocessor_settings.keys())
118 | arguments = self._convert_dict(settings, skip_settings)
119 | model = self._create_instance(package, classifier, **arguments)
120 |
121 | if "preprocessor" in settings:
122 | if not "." in settings["preprocessor"]:
123 | raise ValueError("Expecting a package in preprocessor settings.")
124 |
125 | preprocessor_classname = settings["preprocessor"].split(".")[-1]
126 | preprocessor_package = ".".join(settings["preprocessor"].split(".")[:-1])
127 |
128 | preprocessor_settings = dict([(".".join(key.split(".")[1:]), value)\
129 | for key, value \
130 | in preprocessor_settings.iteritems()])
131 | preprocessor_arguments = self._convert_dict(preprocessor_settings)
132 | preprocessor = self._create_instance(preprocessor_package, \
133 | preprocessor_classname, \
134 | **preprocessor_arguments)
135 | else:
136 | preprocessor = None
137 |
138 | return model, preprocessor
139 |
140 | def _create_instance(self, package, classname, *args, **kwargs):
141 | # Import package module
142 | package_module = __import__(package, globals(), locals(), \
143 | [str(classname)], -1)
144 | # Class instance
145 | package_class = getattr(package_module, classname)
146 |
147 | instance = package_class(*args, **kwargs)
148 |
149 | return instance
150 |
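An illustrative usage sketch for the ModelStore above (not part of util.py; the
model directory, settings and training data are made up):

    from semanticizer.processors.util import ModelStore

    store = ModelStore("/tmp/models")                    # hypothetical directory, must exist
    model, preprocessor = store.create_model({
        "classifier": "sklearn.naive_bayes.GaussianNB",  # package.ClassName, as create_model expects
    })
    model.fit([[0.1, 0.2], [0.8, 0.9]], [False, True])   # toy training data
    store.save_model(model, "toy-model", description={"task": "demo"})
    model, description, preprocessor = store.load_model("toy-model.pkl")
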
--------------------------------------------------------------------------------
/semanticizer/procpipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import time
15 | import logging
16 |
17 | from .processors.core import SettingsProcessor, FilterProcessor
18 | from .processors.semanticizer import SemanticizeProcessor
19 | from .processors.features import FeaturesProcessor, ArticleFeaturesProcessor, \
20 | ContextFeaturesProcessor
21 | from .processors.multiple import MultipleEntityFeaturesProcessor
22 | from .processors.external import ArticlesProcessor, StatisticsProcessor
23 | from .processors.learning import LearningProcessor
24 | from .processors.image import AddImageProcessor
25 |
26 | from .config import config_get
27 |
28 |
29 | def build(langcodes, use_features=False, debug=False):
30 | """
31 | Initialize the pipeline.
32 |
33 |     @param langcodes: The language codes to load semanticizers for
34 | @return: The pipeline
35 | @todo: See todo at _load_languages
36 | """
37 | logging.getLogger().info("Initializing pipeline")
38 | pipeline = []
39 | if 'max_ngram_length' in config_get('semanticize', {}):
40 | max_ngram_length = config_get(('semanticize', 'max_ngram_length'))
41 | else:
42 | max_ngram_length = None
43 | semanticize_processor = _load_semanticize_processor(langcodes,
44 | max_ngram_length,
45 | debug=debug)
46 | settings = config_get("settings", {})
47 | pipeline.append(("Settings", SettingsProcessor(settings)))
48 | pipeline.append(("Semanticize", semanticize_processor))
49 | pipeline.append(("Filter", FilterProcessor()))
50 | if use_features:
51 | _load_features(pipeline, langcodes)
52 | else:
53 | _load_articles(pipeline, langcodes)
54 | pipeline.append(("AddImage", AddImageProcessor()))
55 | logging.getLogger().info("Done initializing pipeline")
56 | return pipeline
57 |
58 |
59 | def _load_semanticize_processor(langcodes, max_ngram_length=None, debug=False):
60 | """
61 | Load the Semanticizer.
62 |
63 |     @param langcodes: The language codes to load semanticizers for
64 | @return: a configured instance of SemanticizeProcessor
65 | @see: processors.SemanticizeProcessor
66 | """
67 | logging.getLogger().info("Loading semanticizer")
68 | semanticize_processor = SemanticizeProcessor(debug=debug)
69 | start = time.time()
70 | logging.getLogger().info("Loading semanticizers for langcode(s) "
71 | + ", ".join(langcodes))
72 | semanticize_processor.load_languages(langcodes, max_ngram_length)
73 | logging.getLogger().info("Loading semanticizers took %.2f seconds." \
74 | % (time.time() - start))
75 | logging.getLogger().info("Done loading semanticizer")
76 | return semanticize_processor
77 |
78 |
79 | def _load_features(pipeline, langcodes):
80 | """
81 | Load all features into the pipeline
82 |
83 | @param pipeline: A reference to the pipeline
84 |     @param langcodes: The language codes for which the feature and
85 |                       statistics processors are loaded
86 | """
87 | logging.getLogger().info("Loading features")
88 | start = time.time()
89 | pipeline.append(("Features",
90 | FeaturesProcessor(langcodes)))
91 | _load_articles(pipeline, langcodes)
92 | pipeline.append(("Statistics",
93 | StatisticsProcessor(langcodes,
94 | config_get(('wpm', 'threads'), 1),
95 | config_get(('misc', 'tempdir')))))
96 | pipeline.append(("ArticleFeatures", ArticleFeaturesProcessor()))
97 | pipeline.append(("MultipleFeatures", MultipleEntityFeaturesProcessor()))
98 | pipeline.append(("ContextFeatures", ContextFeaturesProcessor()))
99 | logging.getLogger().info("Loading features took %.2f seconds." \
100 | % (time.time() - start))
101 | model_dir = config_get(('learning', 'model_dir'), \
102 | config_get(('misc', 'tempdir')))
103 | pipeline.append(("Learning", LearningProcessor(model_dir)))
104 | logging.getLogger().info("Done loading features")
105 |
106 | def _load_articles(pipeline, langcodes):
107 | pipeline.append(("Articles",
108 | ArticlesProcessor(langcodes, config_get(('misc', 'tempdir')))))
109 |
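For orientation, a sketch of how the (name, processor) pairs returned by build()
are driven; this mirrors the chain-of-command loop in server/__init__.py below
and assumes the configuration and WPM data have already been initialized:

    from semanticizer import procpipeline

    pipeline = procpipeline.build(["en"], use_features=False)
    links, text, settings = [], u"Some input text", {"langcode": "en"}
    for function in ("preprocess", "process", "postprocess"):
        for name, processor in pipeline:
            links, text, settings = getattr(processor, function)(links, text, settings)
    print "%d links" % len(links)
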
--------------------------------------------------------------------------------
/semanticizer/server/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | # Can do without ujson and simplejson, but speeds up considerably.
15 | try:
16 | import ujson
17 | except ImportError:
18 | pass
19 | try:
20 | import simplejson as json
21 | except ImportError:
22 | import json
23 |
24 | import re
25 | from flask import Flask, Response, request, abort
26 |
27 | from uuid import uuid4
28 |
29 | class Server(object):
30 | """
31 | The HTTP server that will serve the complete namespace
32 | """
33 |
34 | APPLICATION_JSON="application/json"
35 |
36 | def __init__(self):
37 | """
38 | Initialize the server. The constructor creates the initial Flask server
39 | object.
40 | """
41 | self.app = Flask(__name__)
42 |
43 | def set_debug(self, debug=None, debug_log_format=None):
44 | """
45 | Set Flask server debug parameters.
46 |
47 | @param debug: Enable or disable debug mode
48 | @param debug_log_format: Set the logformat string for the server
49 | """
50 | if not debug is None:
51 | self.app.debug = debug
52 | if not debug_log_format is None:
53 | self.app.debug_log_format = debug_log_format
54 |
55 | def _json_dumps(self, o, pretty=False):
56 | """
57 | Util function to create json dumps based on an object.
58 |
59 | @param o: Object to transform
60 | @param pretty: Whether or not to prettify the JSON
61 | @return: The JSON string
62 | """
63 |         if not pretty and "ujson" in globals():
64 | return ujson.dumps(o)
65 | elif not pretty:
66 | return json.dumps(o)
67 | else:
68 | return json.dumps(o, indent=4)
69 |
70 | def _get_text_from_request(self):
71 | """
72 | Util function to get the param called "text" from the current request
73 |
74 | @return: the value of "text"
75 | """
76 | content_type = request.headers['Content-Type'] if 'Content-Type' in request.headers else "text/plain"
77 | if request.method == "POST":
78 | if content_type == 'application/x-www-form-urlencoded':
79 | return request.form['text']
80 | elif content_type == 'text/plain':
81 | return request.data
82 | else:
83 | abort(Response("Unsupported Content Type, use: text/plain\n",
84 | status=415))
85 | elif "text" in request.args:
86 | return request.args["text"]
87 | else:
88 | abort(Response("No text provided, use: POST or GET with attribute \
89 | 'text'\n", status=400))
90 |
91 | def _get_values_from_request(self, values=None):
92 | """
93 | Util function to get the values from the current request
94 |
95 | @param values: initial dictionary of values
96 | @return: a dictionary of values
97 | """
98 | if not values:
99 | values = {}
100 | for key, value in request.values.iteritems():
101 | assert key not in values
102 | values[key] = value
103 |
104 | return values
105 |
106 | def setup_route_semanticize(self, langcodes):
107 | """
108 |         Setup the /semanticize/<langcode> namespace.
109 |
110 | @param langcodes: The languages supported for semanticizing.
111 | """
112 | self.langcodes = langcodes
113 | self.app.add_url_rule("/semanticize/", "_semanticize",
114 | self._semanticize_handler, methods=["GET", "POST"])
115 | self.app.add_url_rule("/semanticize", "_semanticize_usage",
116 | self._semanticize_usage,
117 | methods=["GET", "POST"])
118 |
119 | def setup_route_inspect(self):
120 | """
121 | Setup the /inspect namespace.
122 |
123 |         The pipeline set via setup_all_routes() is inspected.
124 | """
125 | self.app.add_url_rule("/inspect", "_inspect",
126 | self._inspect, methods=["GET"])
127 |
128 | def setup_route_feedback(self):
129 | """
130 |         Setup the /feedback, /evaluate and /learn namespaces.
131 | 
132 |         Feedback is passed to the pipeline set via setup_all_routes().
133 | """
134 | hex = "[a-fA-F0-9]"
135 | pattern = "hex{8}-hex{4}-hex{4}-hex{4}-hex{12}".replace("hex", hex)
136 | self.request_id_pattern = re.compile(pattern)
137 | self.app.add_url_rule("/feedback/", "_feedback",
138 | self._feedback, methods=["GET", "POST"])
139 | self.app.add_url_rule("/evaluate/", "_evaluate",
140 | self._evaluate, methods=["GET", "POST"])
141 | self.app.add_url_rule("/evaluate", "_evaluate",
142 | self._evaluate, methods=["GET", "POST"])
143 | self.app.add_url_rule("/learn/", "_learn",
144 | self._learn, methods=["GET", "POST"])
145 |
146 | def setup_all_routes(self, pipeline, langcodes):
147 | """
148 | Convenience function to start all namespaces at once.
149 |
150 | @param pipeline: The pipeline of processors
151 | """
152 | self.pipeline = pipeline
153 | self.setup_route_semanticize(langcodes)
154 | self.setup_route_inspect()
155 | self.setup_route_feedback()
156 |
157 | def start(self, host, port, use_reloader=False):
158 | """
159 | Wrapper for the Flask run() function. Will start the HTTP server with
160 | all initialized namespaces.
161 |
162 | @param host: The hostname to bind on
163 | @param port: The port to bind on
164 | """
165 | print "Server started on %s:%d" % (host, port)
166 | self.app.run(host, port, self.app.debug, use_reloader=use_reloader)
167 |
168 | def _semanticize_usage(self):
169 | """
170 | The function handling the /semanticize namespace. Returns the available
171 | languages.
172 |
173 | @return: The body of the response, in this case a json formatted list \
174 | of links and their relevance
175 | @see: _semanticize
176 | """
177 |
178 | json = self._json_dumps({"languages": self.langcodes},
179 | "pretty" in request.args)
180 |
181 | return Response(json, mimetype=Server.APPLICATION_JSON)
182 |
183 | def _semanticize_handler(self, langcode):
184 | """
185 |         The function handling the /semanticize/<langcode> namespace. It uses
186 | the chain-of-command pattern to run all processors, using the
187 | corresponding preprocess, process, and postprocess steps.
188 |
189 | @param langcode: The language to use in the semanticizing
190 | @return: The body of the response, in this case a json formatted list \
191 | of links and their relevance
192 | """
193 | self.app.logger.debug("Semanticizing: start")
194 | text = self._get_text_from_request()
195 | self.app.logger.debug("Semanticizing text: " + text)
196 |
197 | settings = self._get_values_from_request({"langcode": langcode})
198 | settings["request_id"] = str(uuid4())
199 |
200 | sem_result = self._semanticize(langcode, settings, text)
201 | sem_result["request_id"] = settings["request_id"]
202 | json = self._json_dumps(sem_result, "pretty" in settings)
203 |
204 | self.app.logger.debug("Semanticizing: Created %d characters of JSON "
205 | "for request id %s." \
206 | % (len(json), sem_result["request_id"]))
207 | return Response(json, mimetype=Server.APPLICATION_JSON)
208 |
209 | def _semanticize(self, langcode, settings, text):
210 | """
211 | Method that performs the actual semantization.
212 | """
213 | links = []
214 |
215 | for function in ("preprocess", "process", "postprocess"):
216 | for step, processor in self.pipeline:
217 | self.app.logger.debug("Semanticizing: %s for step %s" \
218 | % (function, step))
219 | (links, text, settings) = getattr(processor, function)(links,
220 | text,
221 | settings
222 | )
223 | self.app.logger.debug("Semanticizing: %s pipeline with %d steps \
224 | done" % (function, len(self.pipeline)))
225 |
226 | result = {"links": links, "text": text}
227 |
228 | return result
229 |
230 | def _inspect(self):
231 | """
232 | Function that handles the /inspect namespace. Will print the settings
233 | used by the different processors.
234 |
235 | @return: The body of the response, in this case a json formatted \
236 | string containing all found settings.
237 | """
238 | inspect = {}
239 | for _, processor in self.pipeline:
240 | inspect.update(processor.inspect())
241 | return Response(self._json_dumps(inspect, pretty=True),
242 | mimetype=Server.APPLICATION_JSON)
243 |
244 | def _feedback(self, context_path):
245 | """
246 | Function that handles the /feedback namespace. Will process the
247 | feedback in supported processors in the pipeline.
248 | """
249 | context_parts = context_path.split("/")
250 | if len(context_parts) == 0:
251 | raise ValueError("No context for feedback is provided!")
252 |
253 | request_id_match = self.request_id_pattern.match(context_parts[-1])
254 | if request_id_match:
255 | request_id = request_id_match.string
256 | context_parts.pop()
257 | else:
258 | request_id = None
259 |
260 | context = "/".join(context_parts) if len(context_parts) else None
261 | feedback = request.values
262 | for processor_name, processor in self.pipeline:
263 | if "feedback" in processor.__class__.__dict__:
264 | self.app.logger.debug("Feeding feedback for request_id %s in "
265 | "context %s to %s." %
266 | (request_id, context, processor_name))
267 | processor.feedback(request_id, context, feedback)
268 |
269 | return "OK"
270 |
271 | def _evaluate(self, context_path=""):
272 | """
273 | Function that handles the /evaluate namespace. Will evaluate a metric based
274 | on the feedback in supported processors in the pipeline.
275 | """
276 | evaluation = {}
277 | for processor_name, processor in self.pipeline:
278 | if "evaluate" in processor.__class__.__dict__:
279 | self.app.logger.debug("Evaluating %s in %s." %
280 | (context_path, processor_name))
281 | evaluation.update(processor.evaluate(context_path,
282 | request.values))
283 |
284 | return Response(self._json_dumps(evaluation, pretty=True),
285 | mimetype=Server.APPLICATION_JSON)
286 |
287 | def _learn(self, name):
288 | """
289 | Function that handles the /learn namespace. Will learn based on the
290 | feedback in supported processors in the pipeline.
291 | """
292 | for processor_name, processor in self.pipeline:
293 | if "learn" in processor.__class__.__dict__:
294 | self.app.logger.debug("Learning %s in %s." %
295 | (name, processor_name))
296 | processor.learn(name, request.values)
297 |
298 | return "OK"
299 |
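A client-side sketch of the /semanticize/<langcode> route defined above (the
server address and input text are assumptions; store_dataset.py further down
makes the same call):

    import urllib, urllib2, json

    url = "http://localhost:5000/semanticize/en"   # assumes an 'en' model is loaded
    data = urllib.urlencode({"text": "Amsterdam is the capital of the Netherlands."})
    result = json.loads(urllib2.urlopen(url, data).read())
    print result["request_id"], len(result["links"])
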
--------------------------------------------------------------------------------
/semanticizer/server/__main__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import logging
15 | from logging.handlers import TimedRotatingFileHandler
16 |
17 | from .. import procpipeline
18 | from ..config import config_get
19 | from ..server import Server
20 | from ..wpm.data import init_datasource
21 |
22 |
23 | def start_server(langcodes,
24 | host,
25 | port,
26 | use_reloader,
27 | verbose=False,
28 | logformat='[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s',
29 | use_features=False,
30 | debug=False):
31 | """
32 | Start a SemanticizerFlaskServer with all processors loaded into the
33 | pipeline.
34 |
35 | @param verbose: Set whether the Flask server should be verbose
36 | @param logformat: The logformat used by the Flask server
37 | """
38 | # Initialize the pipeline
39 | pipeline = procpipeline.build(langcodes, use_features, debug=debug)
40 | # Create the FlaskServer
41 | logging.getLogger().info("Setting up server")
42 | server = Server()
43 | server.set_debug(verbose, logformat)
44 | # Setup all available routes / namespaces for the HTTP server
45 | server.setup_all_routes(pipeline, langcodes)
46 | logging.getLogger().info("Done setting up server, now starting...")
47 | # And finally, start the thing
48 | server.start(host, port, use_reloader)
49 |
50 | def init_logging(log, verbose, logformat):
51 | """
52 |     A convenience function that initializes the logging framework by setting
53 | the path to the log, verbosity, and the logformat.
54 | """
55 | file_handler = TimedRotatingFileHandler(log, when='midnight')
56 | file_handler.setFormatter(logging.Formatter(logformat))
57 | stream_handler = logging.StreamHandler()
58 | stream_handler.setFormatter(logging.Formatter(logformat))
59 | if verbose == True:
60 | file_handler.setLevel(logging.DEBUG)
61 | stream_handler.setLevel(logging.DEBUG)
62 | logging.getLogger().setLevel(logging.DEBUG)
63 | logging.getLogger().addHandler(file_handler)
64 | logging.getLogger().addHandler(stream_handler)
65 |
66 |
67 | def main():
68 | # Init the logger
69 | init_logging(config_get(('logging', 'path'), 'log.txt'),
70 | config_get(('logging', 'verbose'), False),
71 | config_get(('logging', 'format'), None))
72 |
73 | # Set the datasource and init it
74 | wpmlangs = config_get(('wpm', 'languages'))
75 | settings = config_get(('settings'), {})
76 | init_datasource(wpmlangs, settings)
77 |
78 | # Start the server
79 | try:
80 | start_server(config_get(('wpm', 'languages')).keys(),
81 | config_get(('server', 'host'), '0.0.0.0'),
82 | config_get(('server', 'port'), 5000),
83 | config_get(('server', 'use_reloader'), False),
84 | config_get(('logging', 'verbose'), False),
85 | config_get(('logging', 'format'), None),
86 | config_get(('linkprocs', 'features'), False),
87 | config_get(('server', 'debug'), False))
88 | except ValueError as e:
89 | logging.getLogger().fatal("Error running Semanticizer server: %s" \
90 | % e.message)
91 | raise
92 |
93 |
94 | if __name__ == '__main__':
95 | main()
96 |
--------------------------------------------------------------------------------
/semanticizer/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/util/__init__.py
--------------------------------------------------------------------------------
/semanticizer/util/online_learning.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import os, re, argparse, urllib, urllib2, json
15 | from collections import defaultdict
16 | from timer import Timer
17 | from random import choice, shuffle
18 |
19 | def parse_args():
20 | parser = argparse.ArgumentParser(
21 | description='Online learn a classifier.',
22 | formatter_class=argparse.ArgumentDefaultsHelpFormatter)
23 |
24 | parser.add_argument('classifier', metavar='classifier',
25 | help='a classifier to train')
26 | parser.add_argument('datafiles', metavar='file', nargs='+',
27 | help='a set of datafiles to process')
28 |
29 | group = parser.add_argument_group('Semanticizer')
30 | group.add_argument('--url', default='http://localhost:5000/',
31 | help='URL where the semanticizer webservice is running')
32 |
33 | group = parser.add_argument_group('Learning')
34 | group.add_argument('--learn', nargs=2, action='append',
35 | metavar=('setting', 'value'),
36 | default=[('context', 'EMPTY')],
37 | help='Setting for the learn call')
38 | group.add_argument('--model-prefix', metavar='prefix',
39 | default='Online.',
40 | help='Prefix to add to the modelname')
41 | group.add_argument('--iterations', metavar='number',
42 | default=50, type=int,
43 |                         help='Number of iterations for learning.')
44 |
45 | group = parser.add_argument_group('Context')
46 | group.add_argument('--context-pattern', nargs=2,
47 | metavar=('pattern', 'replacement'),
48 | default=('^(?:.*/)*(.*?)(?:\.txt)?$', '\g<1>'),
49 | help='Pattern to generate context from filename')
50 | group.add_argument('--context-prefix',
51 | metavar='prefix', default='',
52 | help='Prefix to add to the context')
53 |
54 | group = parser.add_argument_group('Output')
55 | group.add_argument('--output', default=None,
56 | help='Filename for the output')
57 |
58 | args = parser.parse_args()
59 | args.learn.append(('classifier', args.classifier))
60 | return args
61 |
62 | def online_learning(args):
63 | results = defaultdict(list)
64 |
65 | shuffle(args.datafiles)
66 | for filenr, filename in enumerate(args.datafiles):
67 | assert os.path.exists(filename)
68 | context = args.context_prefix + re.sub(args.context_pattern[0], \
69 | args.context_pattern[1], \
70 | filename)
71 |
72 | modelname = args.model_prefix + context.replace('/', '.')
73 | learn_url = args.url + 'learn/' + modelname
74 | url_data = urllib.urlencode(args.learn)
75 |
76 | print "Initializing model", modelname,
77 | print urllib2.urlopen(learn_url, url_data).read()
78 |
79 | train_files = [f for f in args.datafiles if f != filename]
80 | for i in range(args.iterations):
81 | print "%02d/%02d" % (filenr+1, len(args.datafiles)),
82 | print "%03d/%03d" % (i+1, args.iterations),
83 | train_filename = choice(train_files)
84 | #with Timer("Learning for %s" % train_filename, 'timer'):
85 | train_context = args.context_prefix + \
86 | re.sub(args.context_pattern[0], \
87 | args.context_pattern[1], train_filename)
88 |
89 | url_data = urllib.urlencode({"context": train_context})
90 | print "Training", modelname, "on", train_context,
91 | print urllib2.urlopen(learn_url, url_data).read()
92 |
93 | evaluate_url = args.url + 'evaluate/' + context
94 | url_data = urllib.urlencode({"model": modelname})
95 | result = json.loads(urllib2.urlopen(evaluate_url, url_data).read())
96 | print "%.4f %.4f %.4f" % \
97 | (result["macro_metrics"]["accuracy"],
98 | result["macro_metrics"]["averagePrecision"],
99 | result["macro_metrics"]["rPrecision"])
100 | results[filename].append(result)
101 |
102 | if args.output:
103 | with open(args.output, 'w') as out:
104 | out.write(json.dumps(results))
105 |
106 | if __name__ == '__main__':
107 | args = parse_args()
108 |
109 | with Timer("Online learning %d files" % len(args.datafiles), 'timer'):
110 | online_learning(args)
111 |
--------------------------------------------------------------------------------
/semanticizer/util/profiler.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import pstats
15 | import sys
16 |
17 | if __name__ == '__main__':
18 | stats = pstats.Stats(sys.argv[1])
19 | stats.sort_stats('time')
20 | stats.print_stats(.01)
21 | stats.print_callers(.01)
22 |
--------------------------------------------------------------------------------
/semanticizer/util/store_dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import sys, os, re, argparse, urllib, urllib2, json
15 | from collections import defaultdict
16 | from timer import Timer
17 |
18 | def parse_args():
19 | parser = argparse.ArgumentParser(
20 | description='Process and store a dataset.',
21 | formatter_class=argparse.ArgumentDefaultsHelpFormatter)
22 |
23 | parser.add_argument('datafiles', metavar='file', nargs='+',
24 | help='a set of datafiles to process')
25 |
26 | group = parser.add_argument_group('Semanticizer')
27 | group.add_argument('--url', default='http://localhost:5000/',
28 | help='URL where the semanticizer webservice is running')
29 | group.add_argument('--language', metavar='langcode',
30 | default='en',
31 |                        help='Language of the semanticizer (2 letters, e.g. en)')
32 | group.add_argument('--semanticize', nargs=2, action='append',
33 | metavar=('setting', 'value'),
34 | default=[('save', "true")],
35 | help='Setting for the semanticizer call')
36 |
37 | group = parser.add_argument_group('Feedback')
38 | group.add_argument('--feedback', nargs=3, action='append',
39 | metavar=('type', 'pattern', 'replacement'),
40 | help='Pattern to generate feedback filenames '
41 | '(default: positive "\\.txt$" ".positives.txt")')
42 | group.add_argument('--default',
43 | default='negative', metavar='type',
44 | help='Default type of feedback')
45 | group.add_argument('--no-default', action='store_true',
46 | help='Do not use default feedback')
47 |
48 | group = parser.add_argument_group('Context')
49 | group.add_argument('--context-pattern', nargs=2,
50 | metavar=('pattern', 'replacement'),
51 | default=('^(?:.*/)*(.*?)(?:\.txt)?$', '\g<1>'),
52 | help='Pattern to generate context from filename')
53 | group.add_argument('--context-prefix',
54 | metavar='prefix', default='',
55 | help='Prefix to add to the context')
56 |
57 | args = parser.parse_args()
58 | if not args.feedback:
59 | args.feedback = [('positive', '\.txt$', '.positives.txt')]
60 |
61 | return args
62 |
63 | def store_dataset(args):
64 | semanticize_url = '%ssemanticize/%s' % (args.url, args.language)
65 | request_ids = defaultdict(list)
66 | for filename in args.datafiles:
67 | assert os.path.exists(filename)
68 | context = args.context_prefix + re.sub(args.context_pattern[0], \
69 | args.context_pattern[1], \
70 | filename)
71 |
72 | with Timer("Semanticizing %s" % filename, 'timer'):
73 | with open(filename) as file:
74 | lines = file.readlines()
75 | print "Read %d lines from %s." % (len(lines), filename)
76 |
77 | for line in lines:
78 | data = [("context", context), ("text", line.strip())]
79 | data.extend(args.semanticize)
80 | url_data = urllib.urlencode(data)
81 | result = json.loads(urllib2.urlopen(semanticize_url,
82 | url_data).read())
83 | print "Request %s: %d links" % \
84 | (result["request_id"], len(result["links"]))
85 | request_ids[filename].append(result["request_id"])
86 |
87 | with Timer("Feedback for %s" % context, 'timer'):
88 | feedback = []
89 | for (feedback_type, pattern, replacement) in args.feedback:
90 | feedback_filename = re.sub(pattern, replacement, filename)
91 | if not os.path.exists(feedback_filename):
92 | print feedback_filename, "does not exist"
93 | continue
94 | with open(feedback_filename) as file:
95 | lines = file.readlines()
96 | print "Read %d lines of %s feedback from %s." % \
97 | (len(lines), feedback_type, feedback_filename)
98 | for line in lines:
99 | feedback.append((feedback_type, line.strip()))
100 |
101 | if not args.no_default:
102 | feedback.append(("default", args.default))
103 |
104 | feedback_url = args.url + 'feedback/' + context
105 | url_data = urllib.urlencode(feedback)
106 | result = urllib2.urlopen(feedback_url, url_data).read()
107 | print "%d items of feedback for %s: %s" % \
108 | (len(feedback), context, result)
109 |
110 | if __name__ == '__main__':
111 | args = parse_args()
112 |
113 | with Timer("Storing %d files" % len(args.datafiles), 'timer'):
114 | store_dataset(args)
115 |
--------------------------------------------------------------------------------
/semanticizer/util/timer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import time
15 |
16 | class Timer(object):
17 | """Convience method to time activities. Can be used as context."""
18 |
19 | def __init__(self, activity, name=None):
20 | self.name = name
21 | self.activity = activity
22 | self.tstart = time.time()
23 |
24 | def __del__(self):
25 | if self.name: print '[%s]' % self.name,
26 | print self.activity,
27 | print 'took %s seconds.' % (time.time() - self.tstart)
28 |
29 | def __enter__(self):
30 | pass
31 |
32 | def __exit__(self, type, value, traceback):
33 | pass
34 |
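A small usage sketch for Timer (illustrative only; the message is printed from
__del__, i.e. right after the with-block once the object is garbage-collected):

    import time
    from semanticizer.util.timer import Timer

    with Timer("Sleeping", "demo"):
        time.sleep(0.1)
    # prints something like: [demo] Sleeping took 0.100140094757 seconds.
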
--------------------------------------------------------------------------------
/semanticizer/wpm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/wpm/__init__.py
--------------------------------------------------------------------------------
/semanticizer/wpm/data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 | import json
14 |
15 | from .load import WpmLoader
16 | from .namespace import WpmNS
17 |
18 | wpm_dumps = {}
19 |
20 | def init_datasource(wpm_languages, settings):
21 | """Set the datasource and init it"""
22 | for langcode, langconfig in wpm_languages.iteritems():
23 | load_wpm_data(langconfig['source'], langcode, settings, **langconfig['initparams'])
24 |
25 | def load_wpm_data(datasource, langcode, settings, **kwargs):
26 | if datasource == "redis":
27 | from .db.redisdb import RedisDB
28 | db = RedisDB(**kwargs)
29 | elif datasource == "memory":
30 | from .db.inmemory import MemoryDB
31 | db = MemoryDB()
32 | elif datasource == "mongo":
33 | from .db.mongodb import MongoDB
34 | db = MongoDB()
35 | #load wpm data into memory
36 | WpmLoader(db, langcode, settings, **kwargs)
37 | else:
38 | raise ValueError("Unknown backend {}".format(datasource))
39 | wpm_dumps[langcode] = WpmData(db, langcode)
40 |
41 |
42 | class WpmData:
43 |
44 | def __init__(self, db, langcode):
45 |
46 | #set database [memory or redis]
47 | self.db = db
48 |
49 | #get current db version
50 | self.version = self.db.get(langcode+":version")
51 |
52 | #load correct NameSpace
53 | self.ns = WpmNS(db, langcode, self.version)
54 |
55 | def entity_exists(self, entity):
56 |         return self.db.exists(self.ns.label(entity))
57 |
58 | def normalized_entities_exist(self, entities):
59 | with self.db.pipeline() as pipe:
60 | for e in entities:
61 | pipe.exists(self.ns.normalized(e))
62 | return pipe.execute()
63 |
64 | def get_all_entities(self, normalized_entity):
65 | return self.db.smembers(self.ns.normalized(normalized_entity))
66 |
67 | def get_entity_data(self, entity):
68 | entity_data = self.db.lrange(self.ns.label(entity) , 0, -1)
69 | senses = []
70 | if len(entity_data) > 4:
71 | senses = entity_data[4:]
72 | return {'cntlinkocc': int(entity_data[0]),
73 | 'cntlinkdoc': int(entity_data[1]),
74 | 'cnttextocc': int(entity_data[2]),
75 | 'cnttextdoc': int(entity_data[3]),
76 | 'senses': senses}
77 |
78 | def get_sense_data(self, entity, sense):
79 | sense_data = self.db.lrange(self.ns.label_sense(entity, sense), 0, -1)
80 | return {'cntlinkocc': int(sense_data[0]),
81 | 'cntlinkdoc': int(sense_data[1]),
82 | 'from_title': sense_data[2],
83 | 'from_redir': sense_data[3]}
84 |
85 | def get_item_id(self, title):
86 | return self.db.get(self.ns.page_id(title))
87 |
88 | def get_item_ids(self, *titles):
89 | with self.db.pipeline() as pipe:
90 | for title in titles:
91 | pipe.get(self.ns.page_id(title))
92 | return pipe.execute()
93 |
94 | def get_item_title(self, pid):
95 | return self.db.get(self.ns.page_title(pid))
96 |
97 | def get_item_inlinks(self, pid):
98 | return self.db.lrange(self.ns.page_inlinks(pid), 0, -1)
99 |
100 | def get_item_outlinks(self, pid):
101 | return self.db.lrange(self.ns.page_outlinks(pid), 0, -1)
102 |
103 | def get_item_categories(self, pid):
104 | return self.db.get(self.ns.page_categories(pid))
105 |
106 | def get_item_definition(self, pid):
107 | return self.db.get(self.ns.page_definition(pid))
108 |
109 | def get_item_labels(self, pid):
110 | json_labels = self.db.lrange(self.ns.page_labels(pid), 0, -1)
111 | results = []
112 | for json_label in json_labels:
113 | label = json.loads(json_label)
114 | results.append({
115 | 'title': label[0],
116 | 'occurances': label[1],
117 | 'fromRedirect': label[2],
118 | 'fromTitle': label[3],
119 | 'isPrimary': label[4],
120 | 'proportion': label[5]
121 | })
122 | return results
123 |
124 | def sense_has_trnsl(self, sid):
125 | return self.db.exists(self.ns.translation_sense(sid))
126 |
127 | def get_trnsl_langs(self, sid):
128 | return self.db.lrange(self.ns.translation_sense(sid), 0, -1)
129 |
130 | def get_sense_trnsl(self, sid, lang):
131 | return self.db.get(self.ns.translation_sense_language(sid, lang))
132 |
133 | def get_wikipedia_name(self):
134 | path = self.db.get(self.ns.wiki_path())
135 | if path[-1] == '/':
136 | return path.split('/')[-2]
137 | return path.split('/')[-1]
138 |
139 | def get_data_path(self):
140 | return self.db.get(self.ns.wiki_path())
141 |
142 | def get_lang_name(self):
143 | return self.db.get(self.ns.wiki_language_name())
144 |
145 | def get_title_ngram_score(self, title):
146 | nr_of_tokens = len(title.split())
147 | return self.db.zscore(self.ns.ngramscore(str(nr_of_tokens)), title)
148 |
149 | def get_stat(self, value):
150 | return self.db.get(self.ns.wiki_stats(value))
151 |
152 | def get_articles(self, *pids):
153 | pipe = self.db.pipeline()
154 | for pid in pids:
155 | pipe.lrange(self.ns.page_inlinks(pid), 0, -1)
156 | pipe.lrange(self.ns.page_outlinks(pid), 0, -1)
157 | pipe.lrange(self.ns.page_labels(pid), 0, -1)
158 | data = pipe.execute()
159 |
160 | results = []
161 | for i in xrange(0, len(data)-1, 3):
162 | labels = []
163 | json_labels = data[i+2]
164 | for json_label in json_labels:
165 | label = json.loads(json_label)
166 | labels.append({
167 | 'title': label[0],
168 | 'occurances': label[1],
169 | 'fromRedirect': label[2],
170 | 'fromTitle': label[3],
171 | 'isPrimary': label[4],
172 | 'proportion': label[5]
173 | })
174 | result = {
175 | "InLinks":data[i],
176 | "OutLinks":data[i+1],
177 | "Labels":labels
178 | }
179 | results.append(result)
180 | return results
181 |
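An illustrative lookup flow against WpmData (assumes init_datasource() has been
called for an 'en' configuration and that the label and title used here exist
in the loaded dump):

    from semanticizer.wpm.data import wpm_dumps

    wpm = wpm_dumps["en"]
    if wpm.entity_exists(u"amsterdam"):
        entity = wpm.get_entity_data(u"amsterdam")
        for sense in entity["senses"]:
            print sense, wpm.get_sense_data(u"amsterdam", sense)
    page_id = wpm.get_item_id(u"Amsterdam")
    print wpm.get_item_definition(page_id)
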
--------------------------------------------------------------------------------
/semanticizer/wpm/db/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/wpm/db/__init__.py
--------------------------------------------------------------------------------
/semanticizer/wpm/db/inmemory.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 |
15 |
16 | class MemoryDB:
17 | #store all data in memory instead of redis, mimic redis functions
18 | def __init__(self, **kwargs):
19 | self.cache = dict()
20 |
21 | def pipeline(self, **kwargs):
22 | return Pipe(self.cache)
23 |
24 | def exists(self, key):
25 | return key in self.cache
26 |
27 | def keys(self, key):
28 | key = key.replace("*", "")
29 |         # Simple abstraction of redis wildcard key search: only prefix (startswith) matching is supported, which is sufficient here and probably faster than a full regular-expression scan over the keys.
30 | return [k for k in self.cache.iterkeys() if k.startswith(key)]
31 |
32 | def get(self, key):
33 | return self.cache[key]
34 |
35 | def set(self, key, value):
36 | self.cache[key] = value
37 | return True
38 |
39 | def smembers(self, key):
40 | return self.get(key)
41 |
42 | def sismember(self, key, value):
43 | return value in self.cache[key]
44 |
45 | def sadd(self, key, *values):
46 | self.cache.setdefault(key, set()).update(values)
47 | return [True]*len(values)
48 |
49 |     def lrange(self, key, start=0, end=-1):
50 |         # redis lrange treats `end` as inclusive; -1 means through the last element
51 |         data = self.cache.get(key, list())
52 |         if end == -1:
53 |             return data[start:]
54 |         else:
55 |             # Python slices exclude the end index, hence the +1
56 |             return data[start:end + 1]
57 |
58 | def rpush(self, key, *values):
59 | self.cache.setdefault(key, []).extend(values)
60 | return [True]*len(values)
61 |
62 | def zscore(self, key, value):
63 | return self.cache[key][value]
64 |
65 | def zincrby(self, key, value, amount=1):
66 | # in case value does not exist init
67 | if not value in self.cache.setdefault(key, {}):
68 | self.cache[key][value] = amount
69 | else:
70 | self.cache[key][value] += amount
71 | return self.cache[key][value]
72 |
73 | def delete(self,*keys):
74 | for key in keys:
75 | self.cache.pop(key, None)
76 | return True
77 |
78 |
79 | #proxy all returns to pipe class
80 | class Proxy(object):
81 | def __getattribute__(self,name):
82 | attr = object.__getattribute__(self, name)
83 | if hasattr(attr, '__call__') and name not in ["execute", "reset"]:
84 | def newfunc(*args, **kwargs):
85 | result = attr(*args, **kwargs)
86 | self.results.append(result)
87 | return True
88 | return newfunc
89 | else:
90 | return attr
91 |
92 | # implicitly wrap all MemoryDB methods via Proxy so their return values are collected and returned on execute()
93 | class Pipe(Proxy, MemoryDB):
94 | def __init__(self, cache):
95 | self.reset()
96 | self.cache = cache
97 |
98 | def __enter__(self):
99 | return self
100 |
101 | def __exit__(self, exc_type, exc_value, traceback):
102 | self.reset()
103 |
104 | def __del__(self):
105 | try:
106 | self.reset()
107 | except Exception:
108 | pass
109 |
110 | def __len__(self):
111 | return len(self.results)
112 |
113 | def reset(self):
114 | self.results = []
115 |
116 | def execute(self):
117 | results = self.results
118 | self.reset()
119 | return results
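
A short sketch of the redis-like surface MemoryDB mimics, including the
pipeline/execute pattern used throughout wpm/data.py (keys and values made up):

    from semanticizer.wpm.db.inmemory import MemoryDB

    db = MemoryDB()
    db.rpush("en:1:page:42:inlinks", "1", "2", "3")
    print db.lrange("en:1:page:42:inlinks", 0, -1)   # ['1', '2', '3']

    with db.pipeline() as pipe:
        pipe.exists("en:1:page:42:inlinks")
        pipe.exists("en:1:page:43:inlinks")
        print pipe.execute()                         # [True, False]
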
--------------------------------------------------------------------------------
/semanticizer/wpm/db/mongodb.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | from pymongo import MongoClient
15 |
16 | class MongoDB:
17 | def __init__(self, host='localhost', port=27017, **kwargs):
18 | global client
19 | client = MongoClient(host, port)
20 |
21 | def pipeline(self, **kwargs):
22 | return Pipe()
23 |
24 | def exists(self, key):
25 | item = client.sem.wpm.find_one( {"_id": key})
26 | return False if not item else True
27 |
28 |     def keys(self, key):
29 |         items = client.sem.wpm.find( {"_id": {"$regex": "^" + key.replace("*", "")}})
30 |         return [item["_id"] for item in items]
31 |
32 | def get(self, key):
33 | item = client.sem.wpm.find_one( {"_id": key})
34 | return item['value']
35 |
36 | def set(self, key, value):
37 | client.sem.wpm.save( {"_id":key, "value": value})
38 | return True
39 |
40 | def smembers(self, key):
41 | return self.get(key)
42 |
43 | def sismember(self, key, value):
44 | item = client.sem.wpm.find_one( {"_id": key})
45 | return False if not item else value in item['value']
46 |
47 | def sadd(self, key, *values):
48 | item = client.sem.wpm.find_one( {"_id": key})
49 | svalue = set(values) if not item else set(list(item['value']) + list(values))
50 | client.sem.wpm.update( {"_id":key},{'$set':{'value': list(svalue)}},upsert=True, multi=False)
51 | return [True]*len(values)
52 |
53 |     def lrange(self, key, start, end=-1):
54 |         item = client.sem.wpm.find_one( {"_id": key})
55 |         return [] if not item else (item['value'][start:] if end == -1 else item['value'][start:end + 1])
56 |
57 | def rpush(self, key, *values):
58 | item = client.sem.wpm.find_one( {"_id": key})
59 | lvalue = list(values) if not item else list(item['value']) + list(values)
60 | client.sem.wpm.update( {"_id":key},{'$set':{'value': lvalue}},upsert=True, multi=False)
61 | return [True]*len(values)
62 |
63 | def zscore(self, key, value):
64 | item = client.sem.wpm.find_one( {"_id": key})
65 | subkey = ":"+str(value)+":"
66 | if not item:
67 | return None
68 | if not subkey in item:
69 | return None
70 | return item[subkey]
71 |
72 |     def zincrby(self, key, value, amount=1):
73 |         client.sem.wpm.update( {"_id":key},{'$inc':{":"+str(value)+":": amount}},upsert=True, multi=False)
74 |         return True
75 |
76 | def delete(self,*keys):
77 | for key in keys:
78 | client.sem.wpm.remove({"_id":key})
79 | return True
80 |
81 | #proxy all returns to pipe class
82 | class Proxy(object):
83 | def __getattribute__(self,name):
84 | attr = object.__getattribute__(self, name)
85 | if hasattr(attr, '__call__'):
86 | def newfunc(*args, **kwargs):
87 | result = attr(*args, **kwargs)
88 | self.results.append(result)
89 | return True
90 | return newfunc
91 | else:
92 | return attr
93 |
94 | # implicitly wrap all MongoDB methods via Proxy so their return values are collected and returned on execute()
95 | class Pipe(Proxy, MongoDB):
96 | def __init__(self):
97 | self.reset()
98 |
99 | def __enter__(self):
100 | return self
101 |
102 | def __exit__(self, exc_type, exc_value, traceback):
103 | self.reset()
104 |
105 | def __del__(self):
106 | try:
107 | self.reset()
108 | except Exception:
109 | pass
110 |
111 | def __len__(self):
112 | return len(self.results)
113 |
114 | def reset(self):
115 | self.results = []
116 |
117 | def execute(self):
118 | results = self.results
119 | self.reset()
120 | return results
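
The same surface, backed by MongoDB documents of the form {"_id": key, "value": ...};
a sketch assuming a local mongod on the default port (sem.wpm is the
database/collection hard-coded above):

    from semanticizer.wpm.db.mongodb import MongoDB

    db = MongoDB(host="localhost", port=27017)
    db.set("en:1:wiki:lname", "English")
    print db.get("en:1:wiki:lname")                  # English
    db.rpush("en:1:page:42:inlinks", "1", "2")
    print db.lrange("en:1:page:42:inlinks", 0, -1)   # [u'1', u'2']
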
--------------------------------------------------------------------------------
/semanticizer/wpm/db/redisdb.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import redis
15 |
16 | def RedisDB(host='localhost', port=6379, **kwargs):
17 | return redis.StrictRedis(host=host, port=port, db=0, decode_responses=True)
--------------------------------------------------------------------------------
/semanticizer/wpm/namespace.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | class WpmNS:
15 | def __init__(self, db, langcode, version=None):
16 | self.sep = ':'
17 | self.lc = langcode
18 | self.db = db
19 | self.manual_version = version
20 |
21 |     def version(self):
22 | if self.manual_version:
23 | return self.manual_version
24 | version = self.db.get(self.db_version())
25 | if not version:
26 | raise Exception("No database version")
27 | return version
28 |
29 | def db_version(self):
30 | """
31 | key
32 |         <langcode>:db:version
33 | value
34 | string(cache version)
35 | """
36 | return self.sep.join( (self.lc, "db", "version") )
37 |
38 | def wiki_language_name(self):
39 | """
40 | key
41 |         <langcode>:<version>:wiki:lname
42 | value
43 | string(wiki name)
44 | """
45 | return self.sep.join( (self.lc, self.version(), "wiki", "lname") )
46 |
47 | def wiki_path(self):
48 | """
49 | key
50 |         <langcode>:<version>:wiki:path
51 | value
52 | string(wiki path)
53 | """
54 | return self.sep.join( (self.lc, self.version(), "wiki", "path") )
55 |
56 | def wiki_stats(self, statName):
57 | """
58 | key
59 |         <langcode>:<version>:wiki:stats:<statName>
60 | value
61 | string(stats)
62 | """
63 | return self.sep.join( (self.lc, self.version(), "wiki", "stats", statName) )
64 |
65 | def label(self, name):
66 | """
67 | key
68 |         <langcode>:<version>:label:<name>
69 | value
70 | list( LinkOccCount, LinkDocCount, TextOccCount, TextDocCount, SenseId, SenseId, ..)
71 | """
72 | return self.sep.join( (self.lc, self.version(), "label", name) )
73 |
74 | def label_sense(self, name, senseid):
75 | """
76 | key
77 |         <langcode>:<version>:label:<name>:<senseid>
78 | value
79 | list( sLinkDocCount, sLinkOccCount, FromTitle, FromRedirect)
80 | """
81 | return self.sep.join( (self.lc, self.version(), "label", name, senseid) )
82 |
83 | def normalized(self, name):
84 | """
85 | key
86 |         <langcode>:<version>:norm:<name>
87 | value
88 | set( name, name, ... )
89 | """
90 | return self.sep.join( (self.lc, self.version(), "norm", name) )
91 |
92 | def translation_sense(self, senseid):
93 | """
94 | key
95 |         <langcode>:<version>:trnsl:<senseid>
96 | value
97 | list( langcode, langcode, ... )
98 | """
99 | return self.sep.join( (self.lc, self.version(), "trnsl", senseid) )
100 |
101 | def translation_sense_language(self, senseid, langcode):
102 | """
103 | key
104 |         <langcode>:<version>:trnsl:<senseid>:<langcode>
105 | value
106 | string(name)
107 | """
108 | return self.sep.join( (self.lc, self.version(), "trnsl", senseid, langcode) )
109 |
110 | def page_id(self, name):
111 | """
112 | key
113 |         <langcode>:<version>:page:id:<name>
114 | value
115 | string(id)
116 | """
117 | return self.sep.join( (self.lc, self.version(), "page", "id", name) )
118 |
119 | def page_title(self, pageid):
120 | """
121 | key
122 |         <langcode>:<version>:page:<pageid>:name
123 | value
124 | string(name)
125 | """
126 | return self.sep.join( (self.lc, self.version(), "page", pageid, "name") )
127 |
128 | def page_labels(self, pageid):
129 | """
130 | key
131 |         <langcode>:<version>:page:<pageid>:labels
132 | value
133 | list( json([title, occurances, fromRedirect, fromTitle isPrimary, proportion]), ...)
134 | """
135 | return self.sep.join( (self.lc, self.version(), "page", pageid, "labels") )
136 |
137 | def page_definition(self, pageid):
138 | """
139 | key
140 |         <langcode>:<version>:page:<pageid>:definition
141 | value
142 | string(synopsis)
143 | """
144 | return self.sep.join( (self.lc, self.version(), "page", pageid, "definition") )
145 |
146 | def page_inlinks(self, pageid):
147 | """
148 | key
149 |         <langcode>:<version>:page:<pageid>:inlinks
150 | value
151 | list( pageid, pageid, ... )
152 | """
153 | return self.sep.join( (self.lc, self.version(), "page", pageid, "inlinks") )
154 |
155 |
156 | def page_outlinks(self, pageid):
157 | """
158 | key
159 |         <langcode>:<version>:page:<pageid>:outlinks
160 | value
161 | list( pageid, pageid, ... )
162 | """
163 | return self.sep.join( (self.lc, self.version(), "page", pageid, "outlinks") )
164 |
165 | def page_categories(self, pageid):
166 | """
167 | key
168 |         <langcode>:<version>:page:<pageid>:categories
169 | value
170 | list( category, category, ... )
171 | """
172 | return self.sep.join( (self.lc, self.version(), "page", pageid, "categories") )
173 |
174 |
175 | def ngramscore(self, n):
176 | """
177 | key
178 |         <langcode>:<version>:<n>grms
179 |         value
180 |         zset( title -> ngram score, ... )
181 | """
182 | return self.sep.join( (self.lc, self.version(), "%sgrms" % n) )
183 |
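A sketch of the keys the namespace helpers above generate (using MemoryDB so no
Redis server is needed; the language code, cache version and ids are made up):

    from semanticizer.wpm.db.inmemory import MemoryDB
    from semanticizer.wpm.namespace import WpmNS

    ns = WpmNS(MemoryDB(), "en", version="1")
    print ns.db_version()          # en:db:version
    print ns.label(u"amsterdam")   # en:1:label:amsterdam
    print ns.page_inlinks("42")    # en:1:page:42:inlinks
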
--------------------------------------------------------------------------------
/semanticizer/wpm/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import math
15 | import re
16 | import unicodedata
17 | import sys
18 |
19 | from .markup_stripper import MarkupStripper
20 |
21 | dump_filenames = {
22 | 'translations': 'translations.csv',
23 | 'stats': 'stats.csv',
24 | 'labels': 'label.csv',
25 | 'pages': 'page.csv',
26 | 'pageLabels': 'pageLabel.csv',
27 | 'pageCategories': 'articleParents.csv',
28 | 'inlinks': 'pageLinkIn.csv',
29 | 'outlinks': 'pageLinkOut.csv'
30 | }
31 |
32 |
33 | def normalize(raw, dash=True, accents=True, lower=True):
34 | """Replaces hyphens with spaces, removes accents, lower cases and
35 | strips the input text.
36 |
37 | All steps, except for the strip(), can be disabled with the
38 | optional arguments.
39 | """
40 | text = raw
41 | if dash:
42 | text = text.replace('-', ' ')
43 | if accents:
44 | text = remove_accents(text)
45 | if lower:
46 | text = text.lower()
47 | text = text.strip()
48 | return text if len(text) else raw
49 |
50 |
51 | def remove_accents(input_str):
52 | """Replaces accented characters in the input with their
53 | non-accented counterpart."""
54 | if isinstance(input_str, str):
55 | input_unicode = input_str.decode(errors="ignore")
56 | else:
57 | input_unicode = input_str
58 | nkfd_form = unicodedata.normalize('NFKD', input_unicode)
59 | return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
60 |
61 |
62 | def check_dump_path(path, settings):
63 | """
64 | Checks whether a path exists and raises an error if it doesn't.
65 |
66 | @param path: The pathname to check
67 | @raise IOError: If the path doesn't exist or isn't readable
68 | """
69 | import os
70 | import glob
71 | pathlist = [os.path.normpath(path) + os.sep,
72 | os.path.normpath(os.path.abspath(path)) + os.sep]
73 | for fullpath in pathlist:
74 | print "Checking " + fullpath
75 | if os.path.exists(fullpath):
76 | for filetype, filename in dump_filenames.iteritems():
77 | if os.path.isfile(fullpath + filename) == True:
78 | print "Found " + fullpath + filename
79 | else:
80 | raise IOError("Cannot find " + fullpath + filename)
81 | if settings.get("include_definitions", True):
82 | wiki = glob.glob(fullpath + '*-pages-articles.xml')
83 | if len(wiki) > 0:
84 | print "Found " + wiki[0]
85 | else:
86 | raise IOError("Cannot find wiki *-pages-articles.xml")
87 | return fullpath
88 | else:
89 | print fullpath + " doesn't exist"
90 | raise IOError("Cannot find " + path)
91 |
92 |
93 | def get_relatedness(linksA, linksB):
94 | """
95 | Compare relatedness of 2 articles based on in or outlinks.
96 |
97 | @param linksA: in or out links of article A
98 | @param linksB: in or out links of article B
99 | """
100 | if not linksA or not linksB:
101 | return 0.0
102 |
103 | if linksA == linksB:
104 | return 1.0
105 |
106 | intersection = 0
107 | indexA = 0
108 | indexB = 0
109 |
110 | while indexA < len(linksA) or indexB < len(linksB):
111 | useA = False
112 | useB = False
113 |
114 | linkA = None
115 | linkB = None
116 |
117 | if indexA < len(linksA):
118 | linkA = linksA[indexA]
119 |
120 | if indexB < len(linksB):
121 | linkB = linksB[indexB]
122 |
123 | if linkA and linkB and linkA == linkB:
124 | useA = True
125 | useB = True
126 | intersection += 1
127 | else:
128 | if linkA and (not linkB or linkA < linkB):
129 | # linkA does not occur in linksB; skip it. (The upstream WikipediaMiner
130 | # code also counts a direct link to the other article here, which would
131 | # require the article ids that this helper does not receive.)
132 | useA = True
133 | else:
134 | # linkB does not occur in linksA; skip it.
135 | useB = True
136 |
137 | if useA:
138 | indexA += 1
139 | if useB:
140 | indexB += 1
141 |
142 | googleMeasure = None
143 | 
144 | if intersection == 0:
145 | googleMeasure = 1.0
146 | else:
147 | a = math.log(len(linksA))
148 | b = math.log(len(linksB))
149 | ab = math.log(intersection)
150 | m = math.log(len(linksA) + len(linksB))  # stand-in for log(total article count)
151 | googleMeasure = (max(a, b) - ab) / (m - min(a, b))
152 |
153 | #normalize
154 | if not googleMeasure:
155 | return 0
156 | if googleMeasure >= 1:
157 | return 0
158 |
159 | return 1 - googleMeasure
160 |
161 | def generate_markup_definition(markup):
162 | """
163 | Strip wiki markup and convert some wiki tags to html
164 |
165 | @param markup: wiki markup
166 | """
167 | stripper = MarkupStripper()
168 |
169 | # strip markup
170 | markup = re.sub("={2,}(.+)={2,}", "\n", markup) #clear section headings completely - not just formatting, but content as well.
171 | markup = stripper.strip_all_but_internal_links_and_emphasis(markup)
172 | markup = stripper.strip_non_article_internal_links(markup)
173 | markup = stripper.strip_excess_newlines(markup)
174 |
175 | # convert wiki tags to html
176 | markup = stripper.emphasisResolver.resolve_emphasis(markup)
177 |
178 | # todo convert links
179 | #...
180 |
181 | # slice markup down to a definition: keep whole leading paragraphs until the prefix exceeds ~150 characters
182 | fp = ""
183 | pos = 0
184 | p = re.compile("\n\n", re.DOTALL)
185 | for m in p.finditer(markup):
186 | fp = markup[0:pos]
187 | if (pos > 150):
188 | break
189 | pos = m.start()+2
190 | fp = re.sub("\n", " ", fp)
191 | fp = re.sub("\\s+", " ", fp) #turn all whitespace into spaces, and collapse them.
192 | fp = fp.strip()
193 |
194 | return fp
195 |
196 | def cli_progress(current, total, bar_length=40):
197 | """
198 | shows progressbar in CLI
199 |
200 | @param current: int of current step
201 | @param total: int of total steps
202 | @param bar_length: length of the progressbar in cli window
203 | """
204 | percent = float(current) / total
205 | hashes = '#' * int(round(percent * bar_length))
206 | spaces = ' ' * (bar_length - len(hashes))
207 | sys.stdout.write("\rPercent: [{0}] {1}%".format(hashes + spaces, int(round(percent * 100))))
208 | sys.stdout.flush()
209 |
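210 | 
211 | ## Example usage of the helpers in this module; the sample values below are
212 | ## illustrative only.
213 | if __name__ == '__main__':
214 | print normalize("Sao-Paulo ") # "sao paulo"
215 | print remove_accents(u"S\u00e3o Paulo") # u"Sao Paulo"
216 | # linksA and linksB must be sorted; prints a relatedness score in [0, 1]
217 | print get_relatedness([1, 2, 3, 4], [2, 3, 4, 5])
218 | cli_progress(5, 10) # draws a half-filled progress bar
219 | print ""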
--------------------------------------------------------------------------------
/semanticizer/wpm/utils/emphasis_resolver.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import re
15 |
16 | # This replaces MediaWiki syntax for '''bold''' and ''italic'' text with the equivalent html markup.
17 | class EmphasisResolver:
18 | def resolve_emphasis(self, text):
19 | sb = []
20 | for line in text.split("\n"):
21 | sb.append(self.resolve_line(line))
22 | sb.append("\n")
23 |
24 | result = "".join(sb)
25 | result = result[:-1]
26 | return result
27 |
28 | # This is a direct translation of the php function doAllQuotes used by the original MediaWiki software.
29 | #
30 | # @param line the line to resolve emphasis within
31 | # @return the line, with all emphasis markup resolved to html tags
32 | #
33 | def resolve_line(self, line):
34 |
35 | #print "Resolving line '" + line + "'"
36 |
37 | arr = self.get_splits("$"+line)
38 | if len(arr) <= 1:
39 | return line
40 |
41 | # First, do some preliminary work. This may shift some apostrophes from
42 | # being mark-up to being text. It also counts the number of occurrences
43 | # of bold and italics mark-ups.
44 |
45 | numBold = 0
46 | numItalics = 0
47 |
48 | for i, value in enumerate(arr):
49 | if (i % 2 == 1):
50 | # If there are ever four apostrophes, assume the first is supposed to
51 | # be text, and the remaining three constitute mark-up for bold text.
52 | if (len(arr[i]) == 4):
53 | arr[i-1] = arr[i-1] + "'" ;
54 | arr[i] = self.get_filled_string(3) ;
55 | elif len(arr[i]) > 5:
56 | # If there are more than 5 apostrophes in a row, assume they're all
57 | # text except for the last 5.
58 | arr[i-1] = arr[i-1] + self.get_filled_string(len(arr[i])-5)
59 | arr[i] = self.get_filled_string(5)
60 |
61 | size = len(arr[i])
62 | if size == 2:
63 | numItalics +=1
64 | elif size == 3:
65 | numBold+=1
66 | elif size == 5:
67 | numItalics +=1
68 | numBold +=1
69 |
70 | # If there is an odd number of both bold and italics, it is likely
71 | # that one of the bold ones was meant to be an apostrophe followed
72 | # by italics. Which one we cannot know for certain, but it is more
73 | # likely to be one that has a single-letter word before it.
74 | if (numBold%2==1) and (numItalics%2==1):
75 | i= 0
76 | firstSingleLetterWord = -1
77 | firstMultiLetterWord = -1
78 | firstSpace = -1
79 |
80 | for r in arr:
81 | if i%2==1 and len(r)==3:
82 | x1 = arr[i-1][len(arr[i-1])-1]
83 | x2 = arr[i-1][len(arr[i-1])-2]
84 | if x1==' ':
85 | if firstSpace == -1:
86 | firstSpace = i ;
87 | elif x2==' ':
88 | if firstSingleLetterWord == -1:
89 | firstSingleLetterWord = i
90 | else:
91 | if firstMultiLetterWord == -1:
92 | firstMultiLetterWord = i
93 |
94 | i += 1
95 |
96 | # If there is a single-letter word, use it!
97 | if firstSingleLetterWord > -1:
98 | arr[firstSingleLetterWord] = "''"
99 | arr[firstSingleLetterWord-1] = arr[firstSingleLetterWord-1] + "'"
100 | elif firstMultiLetterWord > -1:
101 | # If not, but there's a multi-letter word, use that one.
102 | arr[firstMultiLetterWord] = "''"
103 | arr[firstMultiLetterWord-1] = arr[firstMultiLetterWord-1] + "'"
104 | elif firstSpace > -1:
105 | # ... otherwise use the first one that has neither.
106 | # (notice that it is possible for all three to be -1 if, for example,
107 | # there is only one pentuple-apostrophe in the line)
108 | arr[firstSpace] = "''"
109 | arr[firstSpace-1] = arr[firstSpace-1] + "'"
110 |
111 | # Now let's actually convert our apostrophic mush to HTML!
112 |
113 | output = []
114 | buffer = []
115 | state = "" ;
116 | i = 0
117 | for r in arr:
118 | if i%2==0:
119 | if state == "both":
120 | buffer.append(r)
121 | else:
122 | output.append(r)
123 | else:
124 | if len(r) == 2:
125 | if state == "i":
126 | output.append("</i>")
127 | state = ""
128 | elif state == "bi":
129 | output.append("</i>")
130 | state = "b"
131 | elif state =="ib":
132 | output.append("</b></i><b>");
133 | state = "b";
134 | elif state =="both":
135 | output.append("<b><i>") ;
136 | output.append("".join(buffer))
137 | output.append("</i>") ;
138 | state = "b";
139 | else:
140 | # $state can be "b" or ""
141 | output.append("<i>")
142 | state = state + "i"
143 | elif len(r) == 3:
144 | if state == "b":
145 | output.append("</b>")
146 | state = ""
147 | elif state == "bi":
148 | output.append("</i></b><i>")
149 | state = "i"
150 | elif state =="ib":
151 | output.append("</b>");
152 | state = "i";
153 | elif state =="both":
154 | output.append("<i><b>") ;
155 | output.append("".join(buffer))
156 | output.append("</b>") ;
157 | state = "i";
158 | else:
159 | # $state can be "i" or ""
160 | output.append("<b>")
161 | state = state + "b"
162 | elif len(r) == 5:
163 | if state == "b":
164 | output.append("</b><i>")
165 | state = "i"
166 | elif state == "i":
167 | output.append("</i><b>")
168 | state = "b"
169 | elif state =="bi":
170 | output.append("</i></b>");
171 | state = "";
172 | elif state =="ib":
173 | output.append("</b></i>") ;
174 | state = "";
175 | elif state =="both":
176 | output.append("<i><b>") ;
177 | output.append("".join(buffer))
178 | output.append("</b></i>") ;
179 | state = "";
180 | else:
181 | # ($state == "")
182 | buffer = []
183 | state = "both"
184 | i += 1
185 | 
186 | 
187 | # Now close all remaining tags. Notice that the order is important.
188 | if state == "b" or state == "ib":
189 | output.append("</b>")
190 | 
191 | if state == "i" or state == "bi" or state == "ib":
192 | output.append("</i>")
193 | if state == "bi":
194 | output.append("</b>")
195 | 
196 | # There might be lonely ''''', so make sure we have a buffer
197 | if state == "both" and len(buffer) > 0:
198 | output.append("<b><i>")
199 | output.append("".join(buffer))
200 | output.append("</i></b>")
201 |
202 | #remove leading $
203 | output = output[1:]
204 |
205 | return "".join(output)
206 |
207 |
208 |
209 | # Does the same job as php function preg_split
210 | def get_splits(self, text):
211 | #return re.split("\\'{2,}", text)
212 | splits = []
213 | lastCopyIndex = 0
214 | p = re.compile("\\'{2,}")
215 |
216 | for m in p.finditer(text):
217 | if m.start() > lastCopyIndex:
218 | splits.append( text[lastCopyIndex: m.start()] )
219 | splits.append( m.group() )
220 | lastCopyIndex = m.end()
221 |
222 | if lastCopyIndex < len(text):
223 | splits.append(text[lastCopyIndex:])
224 |
225 | return splits
226 |
227 |
228 | def get_filled_string(self, length):
229 | sb = []
230 | for i in xrange(0,length):
231 | sb.append("'")
232 | return "".join(sb)
233 |
234 | ## EmphasisResolver testing using
235 | ## python -m semanticizer.wpm.utils.emphasis_resolver
236 | if __name__ == '__main__':
237 | er = EmphasisResolver()
238 | markup = "'''War''' is an openly declared state of organized [[violent]] [[Group conflict|conflict]], typified by extreme [[aggression]], [[societal]] disruption, and high [[Mortality rate|mortality]]. As a behavior pattern, warlike tendencies are found in many [[primate]] species, including [[humans]], and also found in many [[ant]] species. The set of techniques used by a group to carry out war is known as '''warfare'''."
239 | #markup = "Parsing '''MediaWiki''''s syntax for '''bold''' and ''italic'' markup is a '''''deceptively''' difficult'' task. Whoever came up with the markup scheme should be '''shot'''." ;
240 | print er.resolve_emphasis(markup)
--------------------------------------------------------------------------------
/semanticizer/wpm/utils/wikidumps.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012 University of Amsterdam
2 | # Copyright 2014 Netherlands eScience Center
3 | # Written by Lars Buitinck.
4 |
5 | """Parsing utilities for Wikipedia database dumps."""
6 |
7 | from __future__ import print_function
8 |
9 | import re
10 | import xml.etree.ElementTree as etree # don't use LXML, it's slower (!)
11 |
12 |
13 | def _get_namespace(tag):
14 | try:
15 | namespace = re.match(r"^{(.*?)}", tag).group(1)
16 | except AttributeError:
17 | namespace = ''
18 | if not namespace.startswith("http://www.mediawiki.org/xml/export-"):
19 | raise ValueError("namespace %r not recognized as MediaWiki dump"
20 | % namespace)
21 | return namespace
22 |
23 |
24 | def extract_pages(f):
25 | """Extract pages from Wikimedia database dump.
26 |
27 | Parameters
28 | ----------
29 | f : file-like or str
30 | Handle on Wikimedia article dump. May be any type supported by
31 | etree.iterparse.
32 |
33 | Returns
34 | -------
35 | pages : iterable over (int, string, string)
36 | Generates (page_id, title, content) triples.
37 | In Python 2.x, may produce either str or unicode strings.
38 | """
39 | elems = (elem for _, elem in etree.iterparse(f, events=["end"]))
40 |
41 | # We can't rely on a fixed namespace for database dumps, since it changes
42 | # every time a small modification to the format is made. So, determine the
43 | # namespace from the first element we find, which will be part of the
44 | # metadata, and construct element paths from it.
45 | elem = next(elems)
46 | namespace = _get_namespace(elem.tag)
47 | ns_mapping = {"ns": namespace}
48 | page_tag = "{%(ns)s}page" % ns_mapping
49 | text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
50 | id_path = "./{%(ns)s}id" % ns_mapping
51 | title_path = "./{%(ns)s}title" % ns_mapping
52 |
53 | for elem in elems:
54 | if elem.tag == page_tag:
55 | text = elem.find(text_path).text
56 | if text is None:
57 | # Empty article; these occur in Wikinews dumps.
58 | continue
59 | yield (int(elem.find(id_path).text),
60 | elem.find(title_path).text,
61 | text)
62 |
63 | # Prune the element tree, as per
64 | # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
65 | # We do this only for <page>s, since we need to inspect the
66 | # ./revision/text element. That shouldn't matter since the pages
67 | # comprise the bulk of the file.
68 | elem.clear()
69 |
70 |
71 | if __name__ == "__main__":
72 | # Test; will write article info + prefix of content to stdout
73 | import sys
74 |
75 | if len(sys.argv) > 1:
76 | print("usage: %s; will read from standard input" % sys.argv[0],
77 | file=sys.stderr)
78 | sys.exit(1)
79 |
80 | for pageid, title, text in extract_pages(sys.stdin):
81 | title = title.encode("utf-8")
82 | text = text[:40].replace("\n", "_").encode("utf-8")
83 | print("%d '%s' (%s)" % (pageid, title, text))
84 |
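85 | # Reading straight from a compressed dump also works, since etree.iterparse
86 | # accepts any file-like object (the filename below is only an example):
87 | #
88 | # import bz2
89 | # for pageid, title, text in extract_pages(bz2.BZ2File("enwiki-pages-articles.xml.bz2")):
90 | # print(pageid, title)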
--------------------------------------------------------------------------------
/semanticizer_wsgi.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
4 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
5 | # General Public License as published by the Free Software Foundation, either
6 | # version 3 of the License, or (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful, but WITHOUT
9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
11 | # for more details.
12 | #
13 | # You should have received a copy of the GNU Lesser General Public License
14 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 |
16 | """ Semanticizer (WSGI version)
17 |
18 | A stripped down, WSGI compatible, version of the semanticizer.
19 |
20 | Usage:
21 | gunicorn --bind 0.0.0.0:5001 --workers 4 semanticizer_wsgi:application
22 | or
23 | uwsgi --http :5001 --master --processes 4 --wsgi-file semanticizer_wsgi.py
24 |
25 | """
26 |
27 | import re
28 | from semanticizer.config import config_get
29 | settings = config_get(('settings'), {})
30 |
31 | # Can do without ujson and simplejson, but speeds up considerably.
32 | try:
33 | import ujson
34 | except ImportError:
35 | pass
36 | try:
37 | import simplejson as json
38 | except ImportError:
39 | import json
40 |
41 | from flask import Flask, Response, request
42 |
43 | from semanticizer import procpipeline
44 | from semanticizer.config import config_get
45 | from semanticizer.wpm.data import init_datasource
46 |
47 |
48 | wpm_languages = config_get(('wpm', 'languages'))
49 | init_datasource(wpm_languages, settings)
50 | PIPELINE = procpipeline.build(wpm_languages)
51 |
52 | # WSGI application!
53 | application = Flask(__name__)
54 | application.debug = True
55 |
56 |
57 | APPLICATION_JSON = "application/json"
58 |
59 | # RegExens for CleanTweet
60 | CLEAN_TWEET = \
61 | {'user': re.compile(r"(@\w+)"),
62 | 'url': re.compile(r"(http://[a-zA-Z0-9_=\-\.\?&/#]+)"),
63 | 'punctuation': re.compile(r"[-!\"#\$%&'\(\)\*\+,\.\/:;<=>\?@\[\\\]\^_`\{\|\}~]+"),
64 | 'retweet': re.compile(r"(\bRT\b)")
65 | }
66 |
67 |
68 | @application.route('/')
69 | def hello_world():
70 | """Hello World!"""
71 | return 'Hello World!\n'
72 |
73 |
74 | @application.route('/semanticize/<langcode>', methods=['GET', 'POST'])
75 | def _semanticize_handler(langcode):
76 | """
77 | The function handling the /semanticize/<langcode> namespace. It uses
78 | the chain-of-command pattern to run all processors, using the
79 | corresponding preprocess, process, and postprocess steps.
80 |
81 | @param langcode: The language to use in the semanticizing
82 | @return: The body of the response, in this case a json formatted list \
83 | of links and their relevance
84 | """
85 | # self.application.logger.debug("Semanticizing: start")
86 | text = _get_text_from_request()
87 |
88 | # self.application.logger.debug("Semanticizing text: " + text)
89 | settings = {"langcode": langcode}
90 | for key, value in request.values.iteritems():
91 | assert key not in settings
92 | settings[key] = value
93 |
94 | sem_result = _semanticize(langcode, settings, text)
95 | json = _json_dumps(sem_result, "pretty" in settings)
96 |
97 | # self.application.logger.debug("Semanticizing: Created %d characters of JSON." \
98 | # % len(json))
99 | return Response(json, mimetype=APPLICATION_JSON)
100 |
101 |
102 | @application.route('/cleantweet', methods=['GET', 'POST'])
103 | def _cleantweet():
104 | """
105 | The function that handles the /cleantweet namespace. Will use regular
106 | expressions to completely clean a given tweet.
107 |
108 | @return: The body of the response, in this case a json formatted \
109 | string containing the cleaned tweet.
110 | """
111 | text = _get_text_from_request()
112 | clean_text = cleantweet(text)
113 |
114 | return _json_dumps({"cleaned_text": clean_text})
115 |
116 |
117 | def cleantweet(text):
118 | """
119 | Tweet cleaner/tokenizer.
120 |
121 | Uses regular expressions to completely clean, and tokenize, a
122 | given tweet.
123 | """
124 |
125 | for cleaner in ['user', 'url', 'punctuation', 'retweet']:
126 | text = CLEAN_TWEET[cleaner].sub(" ", text)
127 | text = " ".join([w for w in re.split(r'\s+', text) if len(w) > 1])
128 |
129 | return text
130 |
131 |
132 | def _semanticize(langcode, settings, text):
133 | """
134 | Method that performs the actual semanticizing.
135 | """
136 | links = []
137 |
138 | for function in ("preprocess", "process", "postprocess"):
139 | for step, processor in PIPELINE:
140 | # self.application.logger.debug("Semanticizing: %s for step %s" \
141 | # % (function, step))
142 | (links, text, settings) = getattr(processor, function)(links,
143 | text,
144 | settings
145 | )
146 | # self.application.logger.debug("Semanticizing: %s pipeline with %d steps \
147 | # done" % (function, len(self.pipeline)))
148 |
149 | result = {"links": links, "text": text}
150 |
151 | return result
152 |
153 |
154 | def _json_dumps(obj, pretty=False):
155 | """
156 | Util function to create json dumps based on an object.
157 |
158 | @param obj: Object to transform
159 | @param pretty: Whether or not to prettify the JSON
160 | @return: The JSON string
161 | """
162 | if not pretty and "ujson" in globals():
163 | return ujson.dumps(obj)
164 | elif not pretty:
165 | return json.dumps(obj)
166 | else:
167 | return json.dumps(obj, indent=4)
168 |
169 | def _get_text_from_request():
170 | """
171 | Util function to get the param called "text" from the current request
172 |
173 | @return: the value of "text"
174 | """
175 |
176 | return request.values['text']
177 | # content_type = request.headers['Content-Type']
178 | # if request.method == "POST":
179 | # if content_type == 'application/x-www-form-urlencoded':
180 | # return request.form['text']
181 | # elif content_type == 'text/plain':
182 | # return request.data
183 | # else:
184 | # abort(Response("Unsupported Content Type, use: text/plain\n",
185 | # status=415))
186 | # elif "text" in request.args:
187 | # return request.args["text"]
188 | # else:
189 | # abort(Response("No text provided, use: POST or GET with attribute \
190 | # 'text'\n", status=400))
191 |
192 |
193 | if __name__ == '__main__':
194 | application.run()
195 |
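196 | # Example request against a running instance (assumes the server listens on
197 | # port 5001, as in the gunicorn command above, and that English Wikipedia
198 | # data has been loaded for the "en" language code):
199 | #
200 | # curl --data "text=Amsterdam is the capital of the Netherlands" \
201 | # http://localhost:5001/semanticize/en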
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from setuptools import setup
4 |
5 | pkgs = (["semanticizer"] +
6 | ["semanticizer." + sub for sub in ("processors", "redisinsert",
7 | "server", "util", "wpm")])
8 |
9 | setup(
10 | name="semanticizer",
11 | description="Entity Linking for the masses",
12 | packages=pkgs,
13 | classifiers=[
14 | "Intended Audience :: Science/Research",
15 | "Topic :: Scientific/Engineering",
16 | "Topic :: Scientific/Engineering :: Information Analysis",
17 | "Topic :: Text Processing",
18 | ],
19 | install_requires=[
20 | "flask",
21 | "mock",
22 | "leven",
23 | "lxml",
24 | "networkx",
25 | "numpy",
26 | "redis>=2.8.0",
27 | "scikit-learn",
28 | "simplejson",
29 | ],
30 | )
31 |
--------------------------------------------------------------------------------
/test/TestConfig.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | '''
15 | Testsuite for the config.py module
16 | '''
17 | # Disable check for calling protected members
18 | # pylint: disable-msg=W0212
19 | # Disable check for naming conventions that disturb setUp and tearDown
20 | # pylint: disable-msg=C0103
21 | # Disable check for too many public methods
22 | # pylint: disable-msg=R0904
23 |
24 | import unittest
25 | import config
26 | from os import remove
27 | from argparse import ArgumentTypeError
28 | from argparse import ArgumentParser
29 | from tempfile import mkstemp
30 | from ConfigParser import MissingSectionHeaderError
31 |
32 |
33 | class Test(unittest.TestCase):
34 | """Testclass for config.py"""
35 |
36 | def setUp(self):
37 | """setup the test by creating a tempfile and a test config"""
38 | self.tmpfile, self.tmpfilename = mkstemp()
39 | self.testconfig = {
40 | 'port': 6000,
41 | 'lmpath': self.tmpfilename,
42 | 'verbose': None
43 | }
44 |
45 | def tearDown(self):
46 | """Tear down by removing the tempfile created during setup"""
47 | remove(self.tmpfilename)
48 |
49 | def test_readable_path(self):
50 | """Test the function that guarantees a path given in the config
51 | is readable"""
52 | valid_path = '/'
53 | invalid_path = '/invalid/path'
54 | self.assertTrue(
55 | config._readable_path(valid_path).endswith(valid_path),
56 | "_readable_path returns an unexpected value for %s" \
57 | % valid_path)
58 | self.assertRaises(ArgumentTypeError,
59 | config._readable_path,
60 | invalid_path)
61 |
62 | def test_writable_file(self):
63 | """Test the function that guarantees a path given in the config
64 | is writable"""
65 | valid_file = self.tmpfilename
66 | invalid_file = '/test/test/invalid'
67 | self.assertTrue(
68 | config._writable_file(valid_file).endswith(valid_file),
69 | "_writable_file returns an unexpected value for %s" \
70 | % valid_file)
71 | self.assertRaises(ArgumentTypeError,
72 | config._writable_file,
73 | invalid_file)
74 |
75 | def test_valid_absolute_url(self):
76 | """Test the function that guarantees a value given in the config
77 | is a valid URL"""
78 | valid_url = 'http://www.google.com:890/something?param=1&else=2'
79 | invalid_url = 'ha//%st||al}avista'
80 | self.assertEqual(
81 | config._valid_absolute_url(valid_url),
82 | valid_url,
83 | "_valid_absolute_url returns an unexpected value for %s" \
84 | % valid_url)
85 | self.assertRaises(ArgumentTypeError,
86 | config._valid_absolute_url,
87 | invalid_url)
88 |
89 | def test_get_conf_vals(self):
90 | """Test the params are being parsed as we expect"""
91 | # the expected result after parsing the config
92 | result = ["--lmpath", self.tmpfilename, "--port", "6000", "--verbose"]
93 | # writing a random line to the config file and test that ConfigParser
94 | # raises a MissingSectionHeaderError
95 | tmpfile = open(self.tmpfilename, 'w')
96 | tmpfile.write("somekey = somevalue\n")
97 | tmpfile.close()
98 | self.assertRaises(MissingSectionHeaderError,
99 | config._get_conf_vals,
100 | self.tmpfilename)
101 | # writing valid values to the config file and comparing the result to
102 | # what we expect
103 | tmpfile = open(self.tmpfilename, 'w')
104 | tmpfile.write("[generic]\n")
105 | for key, value in self.testconfig.iteritems():
106 | if value:
107 | tmpfile.write(key + " = " + str(value) + "\n")
108 | else:
109 | tmpfile.write(key + "\n")
110 | tmpfile.close()
111 | self.assertEqual(config._get_conf_vals(self.tmpfilename),
112 | result,
113 | "_get_conf_vals doesn't create the expected list: ")
114 |
115 | def test_get_arg_parser(self):
116 | """Test we get a valid ArgumentParser"""
117 | self.assertIsInstance(config._get_arg_parser(),
118 | ArgumentParser,
119 | "_get_arg_parser doesn't return an instance of \
120 | ArgumentParser")
121 |
122 | def test_set_data_and_set_conf(self):
123 | """Test the set_data and set_conf functions"""
124 | # generate and set data
125 | configuration = []
126 | for key, value in self.testconfig.iteritems():
127 | configuration += ["--" + key]
128 | if value:
129 | configuration += [str(value)]
130 | config.set_data(configuration)
131 | # check we can read back the data we set
132 | self.assertEqual(config.config_get(("server","port")),
133 | 6000,
134 | "can't find argument values set by set_data")
135 | self.assertEqual(config.config_get(("logging", "verbose")),
136 | True,
137 | "can't find argument values set by set_data")
138 | # check that the system exits when we give unrecognized arguments
139 | config.set_data("--some values --that --dont --exist".split())
140 | self.assertRaises(SystemExit, config._set_conf)
141 |
142 | def test_config_get(self):
143 | """Test the most important function of the config module: config_get"""
144 | # generate and set data
145 | configuration = []
146 | for key, value in self.testconfig.iteritems():
147 | configuration += ["--" + key]
148 | if value:
149 | configuration += [str(value)]
150 | config.set_data(configuration)
151 | # check we can read back the data we set
152 | config.config_get(('server', 'port'))
153 | self.assertEqual(config.config_get(('server', 'port')),
154 | 6000,
155 | "can't find argument values set by set_data")
156 | self.assertEqual(config.config_get("nonexisting", None),
157 | None,
158 | "config_get doesn't return None on a nonexisting param")
159 |
160 |
161 | if __name__ == "__main__":
162 | #import sys;sys.argv = ['', 'Test.testName']
163 | unittest.main()
164 |
--------------------------------------------------------------------------------
/test/TestInputdata.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | '''
15 | Testsuite for the init.Initializer module
16 | '''
17 | import unittest
18 | import os
19 | import inputdata
20 |
21 | from tempfile import mkstemp
22 | from textcat import NGram
23 |
24 |
25 | class Test(unittest.TestCase):
26 |
27 | def setUp(self):
28 | self.tmpfile, self.tmpfilename = mkstemp()
29 |
30 | def test_load_textcat(self):
31 | # Initialize
32 | invalid_lm_dir = os.path.dirname(self.tmpfilename)
33 | valid_lm_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
34 | "../LM.lrej2011")
35 |
36 | # ++++++++++++++++++++++++++++
37 | # ++++++++ Run tests +++++++++
38 | # ++++++++++++++++++++++++++++
39 |
40 | # Fail if lm_dir isn't set
41 | self.assertRaises(TypeError, inputdata.load_textcat)
42 |
43 | # Fail if lm_dir is invalid
44 | self.assertRaises(ValueError, inputdata.load_textcat, invalid_lm_dir)
45 |
46 | # Return an NGram object if lm_dir is valid
47 | self.assertIsInstance(inputdata.load_textcat(valid_lm_dir), NGram,
48 | "_load_textcat with %s should result in a" \
49 | % valid_lm_dir + "valid_lm_dir NGram instance."
50 | + "Does the path contain valid lm files?")
51 |
52 |
53 | if __name__ == "__main__":
54 | #import sys;sys.argv = ['', 'Test.testName']
55 | unittest.main()
56 |
--------------------------------------------------------------------------------
/test/TestMain.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | '''
15 | Created on 13 Apr 2013
16 |
17 | @author: evert
18 | '''
19 | import unittest
20 |
21 |
22 | class Test(unittest.TestCase):
23 |
24 | def setUp(self):
25 | pass
26 |
27 | def tearDown(self):
28 | pass
29 |
30 | @unittest.skip("not yet implemented")
31 | def test_start_server(self):
32 | pass
33 |
34 | @unittest.skip("not yet implemented")
35 | def test_init_logging(self):
36 | pass
37 |
38 |
39 | if __name__ == "__main__":
40 | #import sys;sys.argv = ['', 'Test.testName']
41 | unittest.main()
42 |
--------------------------------------------------------------------------------
/test/TestProcpipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | '''
15 | Created on 13 Apr 2013
16 |
17 | @author: evert
18 | '''
19 | import unittest
20 | import procpipeline
21 |
22 | from mock import patch
23 |
24 |
25 | class Test(unittest.TestCase):
26 |
27 | def setUp(self):
28 | pass
29 |
30 | def tearDown(self):
31 | pass
32 |
33 | def test_build(self):
34 | pass
35 |
36 | @patch('procpipeline.SemanticizeProcessor', autospec=True, create=True)
37 | def test_load_semanticize_processor(self, mock):
38 | # Initialize
39 |
40 | # ++++++++++++++++++++++++++++
41 | # ++++++++ Run tests +++++++++
42 | # ++++++++++++++++++++++++++++
43 |
44 | # Running with wikipedia_ids as None throws an AttributeError
45 | # because we access attributes
46 | self.assertRaises(AttributeError,
47 | procpipeline._load_semanticize_processor,
48 | None)
49 |
50 | # Running with a dict of zero wikipedia_ids should work fine
51 | assert procpipeline._load_semanticize_processor(dict())
52 |
53 | # use the mocked-out SemanticizeProcessor
54 | print procpipeline._load_semanticize_processor(
55 | {'me': ['hey', 'later'],
56 | 'you': ['hi', 'bye']})
57 |
58 | @unittest.skip("not yet implemented")
59 | def test_load_features(self):
60 | # Initialize
61 |
62 | # ++++++++++++++++++++++++++++
63 | # ++++++++ Run tests +++++++++
64 | # ++++++++++++++++++++++++++++
65 | pass
66 |
67 |
68 | if __name__ == "__main__":
69 | #import sys;sys.argv = ['', 'Test.testName']
70 | unittest.main()
71 |
--------------------------------------------------------------------------------
/test/TestServer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | '''
15 | Created on 13 Apr 2013
16 |
17 | @author: evert
18 | '''
19 | import unittest
20 |
21 |
22 | class Test(unittest.TestCase):
23 |
24 | def setUp(self):
25 | pass
26 |
27 | def tearDown(self):
28 | pass
29 |
30 | @unittest.skip("not yet implemented")
31 | def test_set_debug(self):
32 | pass
33 |
34 | @unittest.skip("not yet implemented")
35 | def test_json_dumps(self):
36 | pass
37 |
38 | @unittest.skip("not yet implemented")
39 | def test_get_text_from_request(self):
40 | pass
41 |
42 | @unittest.skip("not yet implemented")
43 | def test_setup_route_semanticize(self):
44 | pass
45 |
46 | def test_setup_route_language(self):
47 | pass
48 |
49 | @unittest.skip("not yet implemented")
50 | def test_setup_route_inspect(self):
51 | pass
52 |
53 | @unittest.skip("not yet implemented")
54 | def test_setup_all_routes(self):
55 | pass
56 |
57 | @unittest.skip("not yet implemented")
58 | def test_start(self):
59 | pass
60 |
61 | @unittest.skip("not yet implemented")
62 | def test_autolang_semanticize(self):
63 | pass
64 |
65 | @unittest.skip("not yet implemented")
66 | def test_semanticize(self):
67 | pass
68 |
69 | @unittest.skip("not yet implemented")
70 | def test_remove_stopwords(self):
71 | pass
72 |
73 | @unittest.skip("not yet implemented")
74 | def test_cleantweet(self):
75 | pass
76 |
77 | @unittest.skip("not yet implemented")
78 | def test_language(self):
79 | pass
80 |
81 | @unittest.skip("not yet implemented")
82 | def test_inspect(self):
83 | pass
84 |
85 |
86 | if __name__ == "__main__":
87 | #import sys;sys.argv = ['', 'Test.testName']
88 | unittest.main()
89 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/test/__init__.py
--------------------------------------------------------------------------------