├── .gitignore ├── COPYING.LESSER.txt ├── COPYING.txt ├── README.md ├── conf ├── semanticizer.memory.yml ├── semanticizer.redis.yml ├── semanticizer.trove.yml ├── semanticizer.uva.yml └── semanticizer.yml ├── doc ├── Makefile ├── Semanticizer.js ├── advanced.js ├── docs │ ├── Semanticizer.html │ ├── advanced.html │ ├── docco.css │ ├── learning.html │ └── public │ │ ├── fonts │ │ ├── aller-bold.eot │ │ ├── aller-bold.ttf │ │ ├── aller-bold.woff │ │ ├── aller-light.eot │ │ ├── aller-light.ttf │ │ ├── aller-light.woff │ │ ├── fleurons.eot │ │ ├── fleurons.ttf │ │ ├── fleurons.woff │ │ ├── novecento-bold.eot │ │ ├── novecento-bold.ttf │ │ └── novecento-bold.woff │ │ ├── images │ │ └── gray.png │ │ └── stylesheets │ │ └── normalize.css └── learning.js ├── semanticizer.svg ├── semanticizer ├── __init__.py ├── config.py ├── dbinsert │ ├── __init__.py │ └── __main__.py ├── processors │ ├── __init__.py │ ├── context.py │ ├── core.py │ ├── external.py │ ├── feature.py │ ├── features.py │ ├── image.py │ ├── learning.py │ ├── multiple.py │ ├── semanticize.py │ ├── semanticizer.py │ ├── stringUtils.py │ └── util.py ├── procpipeline.py ├── server │ ├── __init__.py │ └── __main__.py ├── util │ ├── __init__.py │ ├── online_learning.py │ ├── profiler.py │ ├── store_dataset.py │ └── timer.py └── wpm │ ├── __init__.py │ ├── data.py │ ├── db │ ├── __init__.py │ ├── inmemory.py │ ├── mongodb.py │ └── redisdb.py │ ├── load.py │ ├── namespace.py │ └── utils │ ├── __init__.py │ ├── emphasis_resolver.py │ ├── markup_stripper.py │ └── wikidumps.py ├── semanticizer_wsgi.py ├── setup.py └── test ├── TestConfig.py ├── TestInputdata.py ├── TestMain.py ├── TestProcpipeline.py ├── TestServer.py └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Editor cruft 2 | *.sw[op] 3 | *~ 4 | ._* 5 | .DS_Store 6 | 7 | *.pyc 8 | 9 | logs 10 | /log.txt 11 | 12 | # Packages 13 | *.egg 14 | *.egg-info 15 | dist 16 | build 17 | eggs 18 | parts 19 | bin 20 | var 21 | sdist 22 | develop-eggs 23 | .installed.cfg 24 | lib 25 | lib64 26 | __pycache__ 27 | 28 | # Installer logs 29 | pip-log.txt 30 | 31 | # Unit test / coverage reports 32 | .coverage 33 | .tox 34 | nosetests.xml 35 | 36 | # Translations 37 | *.mo 38 | 39 | # Mr Developer 40 | .mr.developer.cfg 41 | .project 42 | .pydevproject 43 | 44 | #netbeans 45 | /nbproject/ 46 | 47 | # MediaWiki dumps 48 | *.bz2 49 | *.gz 50 | *.sql 51 | *.xml 52 | -------------------------------------------------------------------------------- /COPYING.LESSER.txt: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 
21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 
93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. 
If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Semanticizer 2 | 3 | The Semanticizer is a web service application for semantic linking 4 | created in 2012 by [Daan Odijk](http://staff.science.uva.nl/~dodijk/) 5 | at [ILPS](http://ilps.science.uva.nl/) (University of Amsterdam). 6 | 7 | This project has since received contributions from (in alphabetical order): 8 | [Marc Bron](http://staff.science.uva.nl/~mbron/), 9 | [Lars Buitinck](http://staff.science.uva.nl/~buitinck/), 10 | [Bart van den Ende](http://www.bartvandenende.nl/), 11 | [David Graus](http://graus.nu/), 12 | [Tom Kenter](http://staff.science.uva.nl/~tkenter1/), 13 | [Evert Lammerts](http://www.evertlammerts.nl/), 14 | [Edgar Meij](http://edgar.meij.pro/), 15 | [Daan Odijk](http://staff.science.uva.nl/~dodijk/), 16 | [Anne Schuth](http://www.anneschuth.nl/) and 17 | [Isaac Sijaranamual](http://nl.linkedin.com/pub/isaac-sijaranamual/). 18 | 19 | The algorithms for this webservice were developed for and are described in 20 | an OAIR2013 publication on 21 | [Feeding the Second Screen](http://ilps.science.uva.nl/biblio/feeding-second-screen-semantic-linking-based-subtitles) 22 | by [Daan Odijk](http://staff.science.uva.nl/~dodijk/), 23 | [Edgar Meij](http://edgar.meij.pro/) and 24 | [Maarten de Rijke](http://staff.science.uva.nl/~mdr/). Part of this 25 | research was inspired by earlier ILPS publications: 26 | [Adding Semantics to Microblog Posts](http://ilps.science.uva.nl/biblio/adding-semantics-microblog-posts) 27 | and 28 | [Mapping Queries To The Linking Open Data Cloud](http://ilps.science.uva.nl/node/889). 29 | If you use this webservice for your own research, please include a 30 | reference to the OAIR2013 article or, alternatively, to any of these 31 | articles. 32 | 33 | The [online documentation](http://semanticize.uva.nl/doc/) describes 34 | how to use the Semanticizer Web API. This 35 | [REST](http://en.wikipedia.org/wiki/Representational_state_transfer)-like 36 | web service returns [JSON](http://www.json.org/) and is exposed to the 37 | public at http://semanticize.uva.nl/api/. Currently, an access key for 38 | the webservice is not needed. 39 | 40 | The [code](https://github.com/semanticize/semanticizer/) is released 41 | under the LGPL license (see below). If you have any questions, contact 42 | [Daan](http://staff.science.uva.nl/~dodijk/). 43 | 44 | If you want to dive into the code, start at `semanticizer/server/__main__.py`. 45 | 46 | 47 | ## Requirements 48 | 49 | 1. The software has been tested with Python 2.7.3 on Mac OS X 10.8 and 50 | Linux (RedHat EL5, Debian jessie/sid and Ubuntu 12.04). 51 | 52 | 2. 
The following Python modules need to be installed (using 53 | easy_install or pip): 54 | 55 | * nltk 56 | * leven 57 | * networkx 58 | * lxml 59 | * flask 60 | * redis (optional, see point 4) 61 | * scikit-learn (optional, see point 5) 62 | * scipy (optional, see point 5) 63 | * mock (optional, used by the tests) 64 | 65 | 3. A summary of a Wikipedia dump is needed. For this, download the 66 | [Wikipedia Miner CSV files](http://sourceforge.net/projects/wikipedia-miner/files/data/). 67 | 68 | 4. Copy one of the example config files in the `conf` folder (e.g. 69 | `semanticizer.memory.yml` or `semanticizer.redis.yml`) to `semanticizer.yml` in that folder and adapt it to your situation. You 70 | have the choice of loading all data into memory (use 71 | `semanticizer.memory.yml`) or into [Redis](http://redis.io/) using 72 | the following steps: 73 | 74 | 1. Copy `semanticizer.redis.yml` into `semanticizer.yml`. 75 | 76 | 2. A Redis server needs to be set up and running. 77 | 78 | 3. Load data into Redis: `python -m semanticizer.dbinsert [--language=<langcode>] [--output=/tmp/redisinsert.log]`. 79 | 80 | 4. Run the server using `python -m semanticizer.server`. 81 | 82 | 5. To work with the features you need to install the 83 | scikit-learn and scipy packages. Before installing scipy you need 84 | to have [swig](http://www.swig.org/download.html) installed; see 85 | its INSTALL file for instructions (configure, make, make 86 | install). Note that working with features is still under active 87 | development and therefore not fully documented and tested. 88 | 89 | ## License 90 | 91 | This program is free software: you can redistribute it and/or modify 92 | it under the terms of the GNU Lesser General Public License as 93 | published by the Free Software Foundation, either version 3 of the 94 | License, or (at your option) any later version. 95 | 96 | This program is distributed in the hope that it will be useful, but 97 | WITHOUT ANY WARRANTY; without even the implied warranty of 98 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 99 | Lesser General Public License for more details. 100 | 101 | You should have received a copy of the GNU Lesser General Public 102 | License along with this program. If not, see 103 | <http://www.gnu.org/licenses/>. 104 | -------------------------------------------------------------------------------- /conf/semanticizer.memory.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see .
13 | 14 | server: 15 | port: 5000 16 | host: 0.0.0.0 17 | 18 | wpm: 19 | languages: 20 | en: 21 | source: memory 22 | initparams: 23 | path: ./enwiki-20110722 24 | language: english 25 | threads: 16 26 | bdburl: http://wikipedia-miner.cms.waikato.ac.nz/services/exploreArticle 27 | 28 | linkprocs: 29 | features: false 30 | 31 | logging: 32 | verbose: true 33 | path: log.txt 34 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s' 35 | 36 | misc: 37 | tempdir: /tmp 38 | -------------------------------------------------------------------------------- /conf/semanticizer.redis.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | server: 15 | port: 5000 16 | host: 0.0.0.0 17 | use_reloader: true 18 | 19 | wpm: 20 | languages: 21 | en: 22 | source: redis 23 | initparams: 24 | path: ./enwiki-20110722 25 | language: english 26 | host: localhost 27 | port: 6379 28 | 29 | linkprocs: 30 | features: false 31 | 32 | logging: 33 | verbose: true 34 | path: log.txt 35 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s' 36 | 37 | misc: 38 | tempdir: /tmp 39 | -------------------------------------------------------------------------------- /conf/semanticizer.trove.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
13 | 14 | server: 15 | port: 5000 16 | host: 0.0.0.0 17 | 18 | wpm: 19 | languages: 20 | # memory backend 21 | nl: 22 | source: WpmDataInProc 23 | initparams: 24 | path: /zfs/ilps-plexer/wikipediaminer/nlwiki-20130318 25 | language: dutch 26 | # translation_languages should be a list of iso 639-2 language 27 | # codes 28 | translation_languages: [] 29 | # Redis backend 30 | # nl: 31 | # source: wpmdata_redis.WpmDataRedis 32 | # initparams: 33 | # host: localhost 34 | # port: 6379 35 | threads: 16 36 | bdburl: http://zookst13.science.uva.nl:8080/dutchsemcor/article 37 | 38 | semanticize: 39 | max_ngram_length: 12 40 | 41 | linkprocs: 42 | includefeatures: false 43 | 44 | logging: 45 | verbose: true 46 | path: log.txt 47 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s' 48 | 49 | misc: 50 | tempdir: /tmp 51 | -------------------------------------------------------------------------------- /conf/semanticizer.uva.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | server: 15 | port: 5000 16 | host: 0.0.0.0 17 | use_reloader: false 18 | 19 | wpm: 20 | languages: 21 | nl: 22 | source: WpmDataRedis 23 | initparams: 24 | host: zookst14.science.uva.nl 25 | port: 6379 26 | 27 | # Use the in-memory backend: this is faster than the Redis backend 28 | # but uses a lot more memory, especially if you intent to run 29 | # multiple semanticizers. 
30 | # nl: 31 | # source: WpmDataInProc 32 | # initparams: 33 | # path: /zfs/ilps-plexer/wikipediaminer/nlwiki-20111104 34 | # language: dutch 35 | # # translation_languages should be a list of iso 639-2 language 36 | # # codes 37 | # translation_languages: ["en", "fr", "de", "nl"] 38 | en: 39 | source: WpmDataRedis 40 | initparams: 41 | host: zookst14.science.uva.nl 42 | port: 6379 43 | es: 44 | source: WpmDataRedis 45 | initparams: 46 | host: zookst14.science.uva.nl 47 | port: 6379 48 | fr: 49 | source: WpmDataRedis 50 | initparams: 51 | host: zookst14.science.uva.nl 52 | port: 6379 53 | de: 54 | source: WpmDataRedis 55 | initparams: 56 | host: zookst14.science.uva.nl 57 | port: 6379 58 | threads: 16 59 | bdburl: http://zookst13.science.uva.nl:8080/dutchsemcor/article 60 | 61 | linkprocs: 62 | includefeatures: true 63 | 64 | learning: 65 | model_dir: /zfs/ilps-plexer/dodijk/semanticizer.models 66 | 67 | logging: 68 | verbose: true 69 | path: log.txt 70 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s' 71 | 72 | misc: 73 | tempdir: /tmp 74 | 75 | settings: 76 | vara: 77 | pre_filter: unique,senseProbability>0.01 78 | learning: coling-SP0.2-100.RandomForestClassifier-10-auto.pkl 79 | filter: unique,learningProbability>=0.5 80 | -------------------------------------------------------------------------------- /conf/semanticizer.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
13 | 14 | server: 15 | port: 8005 16 | host: 0.0.0.0 17 | use_reloader: false 18 | 19 | settings: 20 | include_categories: True 21 | include_definitions: True 22 | 23 | wpm: 24 | languages: 25 | #en: 26 | # source: redis 27 | # initparams: 28 | # path: /zfs/ilps-plexer/wikipediaminer/en 29 | # host: localhost 30 | # port: 6379 31 | # language: english 32 | # #translation_languages: ["nl", "fr", "de", "es"] # TODO: We should include all possible params in the config file [DG] 33 | 34 | nl: 35 | source: redis 36 | initparams: 37 | path: /zfs/ilps-plexer/wikipediaminer/nlwiki-latest 38 | host: localhost 39 | port: 6379 40 | language: nederlands 41 | #translation_languages: ["en", "fr", "de", "es"] 42 | 43 | linkprocs: 44 | features: false 45 | 46 | logging: 47 | verbose: true 48 | path: log.txt 49 | format: '[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s' 50 | 51 | misc: 52 | tempdir: /tmp 53 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | doc: 2 | docco -l linear Semanticizer.js 3 | docco -l linear advanced.js 4 | docco -l linear learning.js 5 | 6 | all: doc 7 | 8 | publish: all 9 | rsync -av docs/ zookma:/datastore/applications/semanticize/doc/ 10 | 11 | watch: 12 | watch "*.js" 1s "make publish" 13 | -------------------------------------------------------------------------------- /doc/docs/docco.css: -------------------------------------------------------------------------------- 1 | /*--------------------- Typography ----------------------------*/ 2 | 3 | @font-face { 4 | font-family: 'aller-light'; 5 | src: url('public/fonts/aller-light.eot'); 6 | src: url('public/fonts/aller-light.eot?#iefix') format('embedded-opentype'), 7 | url('public/fonts/aller-light.woff') format('woff'), 8 | url('public/fonts/aller-light.ttf') format('truetype'); 9 | font-weight: normal; 10 | font-style: normal; 11 | } 12 | 13 | @font-face { 14 | font-family: 'aller-bold'; 15 | src: url('public/fonts/aller-bold.eot'); 16 | src: url('public/fonts/aller-bold.eot?#iefix') format('embedded-opentype'), 17 | url('public/fonts/aller-bold.woff') format('woff'), 18 | url('public/fonts/aller-bold.ttf') format('truetype'); 19 | font-weight: normal; 20 | font-style: normal; 21 | } 22 | 23 | @font-face { 24 | font-family: 'novecento-bold'; 25 | src: url('public/fonts/novecento-bold.eot'); 26 | src: url('public/fonts/novecento-bold.eot?#iefix') format('embedded-opentype'), 27 | url('public/fonts/novecento-bold.woff') format('woff'), 28 | url('public/fonts/novecento-bold.ttf') format('truetype'); 29 | font-weight: normal; 30 | font-style: normal; 31 | } 32 | 33 | @font-face { 34 | font-family: 'fleurons'; 35 | src: url('public/fonts/fleurons.eot'); 36 | src: url('public/fonts/fleurons.eot?#iefix') format('embedded-opentype'), 37 | url('public/fonts/fleurons.woff') format('woff'), 38 | url('public/fonts/fleurons.ttf') format('truetype'); 39 | font-weight: normal; 40 | font-style: normal; 41 | } 42 | 43 | /*--------------------- Base Styles ----------------------------*/ 44 | 45 | body { 46 | font-family: "aller-light"; 47 | background: url('public/images/gray.png') #fff; 48 | background-size: 322px; 49 | margin: 0; 50 | } 51 | 52 | hr { 53 | height: 1px; 54 | background: #ddd; 55 | border: 0; 56 | } 57 | 58 | h1, h2, h3, h4, h5, h6 { 59 | color: #112233; 60 | font-weight: normal; 61 | font-family: "novecento-bold"; 62 | text-transform: uppercase; 63 | 
line-height: 1em; 64 | margin-top: 50px; 65 | } 66 | h1 { 67 | margin: 0; 68 | text-align: center; 69 | } 70 | h2 { 71 | font-size: 1.3em; 72 | } 73 | h1:after { 74 | content: "8"; 75 | display: block; 76 | font-family: "fleurons"; 77 | color: #999; 78 | font-size: 80px; 79 | padding: 10px 0 25px; 80 | } 81 | 82 | a { 83 | color: #000; 84 | } 85 | 86 | b, strong { 87 | font-weight: normal; 88 | font-family: "aller-bold"; 89 | } 90 | 91 | blockquote { 92 | border-left: 5px solid #ccc; 93 | margin-left: 0; 94 | padding: 1px 0 1px 1em; 95 | } 96 | .page blockquote p { 97 | font-family: Menlo, Consolas, Monaco, monospace; 98 | font-size: 14px; line-height: 19px; 99 | color: #999; 100 | margin: 10px 0 0; 101 | white-space: pre-wrap; 102 | } 103 | 104 | pre, tt, code { 105 | font-family: Menlo, Consolas, Monaco, monospace; 106 | font-size: 12px; 107 | display: inline-block; 108 | border: 1px solid #EAEAEA; 109 | background: #f8f8f8; 110 | color: #555; 111 | padding: 0 5px; 112 | line-height: 20px; 113 | } 114 | .page pre { 115 | margin: 0; 116 | width: 608px; 117 | padding: 10px 15px; 118 | background: #fcfcfc; 119 | -moz-box-shadow: inset 0 0 10px rgba(0,0,0,0.1); 120 | -webkit-box-shadow: inset 0 0 10px rgba(0,0,0,0.1); 121 | box-shadow: inset 0 0 10px rgba(0,0,0,0.1); 122 | overflow-x: auto; 123 | } 124 | .page pre code { 125 | border: 0; 126 | padding: 0; 127 | background: transparent; 128 | } 129 | 130 | .fleur { 131 | font-family: "fleurons"; 132 | font-size: 100px; 133 | text-align: center; 134 | margin: 40px 0; 135 | color: #ccc; 136 | } 137 | 138 | /*--------------------- Layout ----------------------------*/ 139 | 140 | .container { 141 | width: 760px; 142 | margin: 0 auto; 143 | background: #fff; 144 | background: rgba(255,255,255, 0.4); 145 | overflow: hidden; 146 | } 147 | .page { 148 | width: 640px; 149 | padding: 30px; 150 | margin: 30px; 151 | background: #fff; 152 | font-size: 17px; 153 | line-height: 26px; 154 | } 155 | .page p { 156 | color: #30404f; 157 | margin: 26px 0; 158 | } 159 | 160 | ul.sections { 161 | list-style: none; 162 | padding:0 0 5px 0;; 163 | margin:0; 164 | } 165 | 166 | .page li p { 167 | margin: 12px 0; 168 | } 169 | 170 | .toc { 171 | max-height: 0; 172 | overflow: hidden; 173 | text-align: center; 174 | font-size: 13px; 175 | line-height: 20px; 176 | -moz-transition: max-height 1s; 177 | -webkit-transition: max-height 1s; 178 | transition: max-height 1s; 179 | } 180 | .header:hover .toc { 181 | max-height: 500px; 182 | } 183 | .toc h3 { 184 | margin-top: 20px; 185 | } 186 | .toc ol { 187 | margin: 0 0 20px 0; 188 | display: inline-block; 189 | text-align: left; 190 | list-style-type: upper-roman; 191 | } 192 | .toc li { 193 | font-family: 'novecento-bold'; 194 | } 195 | .toc li a { 196 | font-family: 'aller-light'; 197 | } 198 | 199 | 200 | /*---------------------- Syntax Highlighting -----------------------------*/ 201 | 202 | td.linenos { background-color: #f0f0f0; padding-right: 10px; } 203 | span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; } 204 | /* 205 | 206 | github.com style (c) Vasily Polovnyov 207 | 208 | */ 209 | 210 | pre code { 211 | display: block; padding: 0.5em; 212 | color: #000; 213 | background: #f8f8ff 214 | } 215 | 216 | pre .comment, 217 | pre .template_comment, 218 | pre .diff .header, 219 | pre .javadoc { 220 | color: #408080; 221 | font-style: italic 222 | } 223 | 224 | pre .keyword, 225 | pre .assignment, 226 | pre .literal, 227 | pre .css .rule .keyword, 228 | pre .winutils, 229 | pre .javascript .title, 230 | pre 
.lisp .title, 231 | pre .subst { 232 | color: #954121; 233 | /*font-weight: bold*/ 234 | } 235 | 236 | pre .number, 237 | pre .hexcolor { 238 | color: #40a070 239 | } 240 | 241 | pre .string, 242 | pre .tag .value, 243 | pre .phpdoc, 244 | pre .tex .formula { 245 | color: #219161; 246 | } 247 | 248 | pre .title, 249 | pre .id { 250 | color: #19469D; 251 | } 252 | pre .params { 253 | color: #00F; 254 | } 255 | 256 | pre .javascript .title, 257 | pre .lisp .title, 258 | pre .subst { 259 | font-weight: normal 260 | } 261 | 262 | pre .class .title, 263 | pre .haskell .label, 264 | pre .tex .command { 265 | color: #458; 266 | font-weight: bold 267 | } 268 | 269 | pre .tag, 270 | pre .tag .title, 271 | pre .rules .property, 272 | pre .django .tag .keyword { 273 | color: #000080; 274 | font-weight: normal 275 | } 276 | 277 | pre .attribute, 278 | pre .variable, 279 | pre .instancevar, 280 | pre .lisp .body { 281 | color: #008080 282 | } 283 | 284 | pre .regexp { 285 | color: #B68 286 | } 287 | 288 | pre .class { 289 | color: #458; 290 | font-weight: bold 291 | } 292 | 293 | pre .symbol, 294 | pre .ruby .symbol .string, 295 | pre .ruby .symbol .keyword, 296 | pre .ruby .symbol .keymethods, 297 | pre .lisp .keyword, 298 | pre .tex .special, 299 | pre .input_number { 300 | color: #990073 301 | } 302 | 303 | pre .builtin, 304 | pre .constructor, 305 | pre .built_in, 306 | pre .lisp .title { 307 | color: #0086b3 308 | } 309 | 310 | pre .preprocessor, 311 | pre .pi, 312 | pre .doctype, 313 | pre .shebang, 314 | pre .cdata { 315 | color: #999; 316 | font-weight: bold 317 | } 318 | 319 | pre .deletion { 320 | background: #fdd 321 | } 322 | 323 | pre .addition { 324 | background: #dfd 325 | } 326 | 327 | pre .diff .change { 328 | background: #0086b3 329 | } 330 | 331 | pre .chunk { 332 | color: #aaa 333 | } 334 | 335 | pre .tex .formula { 336 | opacity: 0.5; 337 | } 338 | -------------------------------------------------------------------------------- /doc/docs/public/fonts/aller-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-bold.eot -------------------------------------------------------------------------------- /doc/docs/public/fonts/aller-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-bold.ttf -------------------------------------------------------------------------------- /doc/docs/public/fonts/aller-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-bold.woff -------------------------------------------------------------------------------- /doc/docs/public/fonts/aller-light.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-light.eot -------------------------------------------------------------------------------- /doc/docs/public/fonts/aller-light.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-light.ttf -------------------------------------------------------------------------------- /doc/docs/public/fonts/aller-light.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/aller-light.woff -------------------------------------------------------------------------------- /doc/docs/public/fonts/fleurons.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/fleurons.eot -------------------------------------------------------------------------------- /doc/docs/public/fonts/fleurons.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/fleurons.ttf -------------------------------------------------------------------------------- /doc/docs/public/fonts/fleurons.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/fleurons.woff -------------------------------------------------------------------------------- /doc/docs/public/fonts/novecento-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/novecento-bold.eot -------------------------------------------------------------------------------- /doc/docs/public/fonts/novecento-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/novecento-bold.ttf -------------------------------------------------------------------------------- /doc/docs/public/fonts/novecento-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/fonts/novecento-bold.woff -------------------------------------------------------------------------------- /doc/docs/public/images/gray.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/doc/docs/public/images/gray.png -------------------------------------------------------------------------------- /doc/docs/public/stylesheets/normalize.css: -------------------------------------------------------------------------------- 1 | /*! normalize.css v2.0.1 | MIT License | git.io/normalize */ 2 | 3 | /* ========================================================================== 4 | HTML5 display definitions 5 | ========================================================================== */ 6 | 7 | /* 8 | * Corrects `block` display not defined in IE 8/9. 
9 | */ 10 | 11 | article, 12 | aside, 13 | details, 14 | figcaption, 15 | figure, 16 | footer, 17 | header, 18 | hgroup, 19 | nav, 20 | section, 21 | summary { 22 | display: block; 23 | } 24 | 25 | /* 26 | * Corrects `inline-block` display not defined in IE 8/9. 27 | */ 28 | 29 | audio, 30 | canvas, 31 | video { 32 | display: inline-block; 33 | } 34 | 35 | /* 36 | * Prevents modern browsers from displaying `audio` without controls. 37 | * Remove excess height in iOS 5 devices. 38 | */ 39 | 40 | audio:not([controls]) { 41 | display: none; 42 | height: 0; 43 | } 44 | 45 | /* 46 | * Addresses styling for `hidden` attribute not present in IE 8/9. 47 | */ 48 | 49 | [hidden] { 50 | display: none; 51 | } 52 | 53 | /* ========================================================================== 54 | Base 55 | ========================================================================== */ 56 | 57 | /* 58 | * 1. Sets default font family to sans-serif. 59 | * 2. Prevents iOS text size adjust after orientation change, without disabling 60 | * user zoom. 61 | */ 62 | 63 | html { 64 | font-family: sans-serif; /* 1 */ 65 | -webkit-text-size-adjust: 100%; /* 2 */ 66 | -ms-text-size-adjust: 100%; /* 2 */ 67 | } 68 | 69 | /* 70 | * Removes default margin. 71 | */ 72 | 73 | body { 74 | margin: 0; 75 | } 76 | 77 | /* ========================================================================== 78 | Links 79 | ========================================================================== */ 80 | 81 | /* 82 | * Addresses `outline` inconsistency between Chrome and other browsers. 83 | */ 84 | 85 | a:focus { 86 | outline: thin dotted; 87 | } 88 | 89 | /* 90 | * Improves readability when focused and also mouse hovered in all browsers. 91 | */ 92 | 93 | a:active, 94 | a:hover { 95 | outline: 0; 96 | } 97 | 98 | /* ========================================================================== 99 | Typography 100 | ========================================================================== */ 101 | 102 | /* 103 | * Addresses `h1` font sizes within `section` and `article` in Firefox 4+, 104 | * Safari 5, and Chrome. 105 | */ 106 | 107 | h1 { 108 | font-size: 2em; 109 | } 110 | 111 | /* 112 | * Addresses styling not present in IE 8/9, Safari 5, and Chrome. 113 | */ 114 | 115 | abbr[title] { 116 | border-bottom: 1px dotted; 117 | } 118 | 119 | /* 120 | * Addresses style set to `bolder` in Firefox 4+, Safari 5, and Chrome. 121 | */ 122 | 123 | b, 124 | strong { 125 | font-weight: bold; 126 | } 127 | 128 | /* 129 | * Addresses styling not present in Safari 5 and Chrome. 130 | */ 131 | 132 | dfn { 133 | font-style: italic; 134 | } 135 | 136 | /* 137 | * Addresses styling not present in IE 8/9. 138 | */ 139 | 140 | mark { 141 | background: #ff0; 142 | color: #000; 143 | } 144 | 145 | 146 | /* 147 | * Corrects font family set oddly in Safari 5 and Chrome. 148 | */ 149 | 150 | code, 151 | kbd, 152 | pre, 153 | samp { 154 | font-family: monospace, serif; 155 | font-size: 1em; 156 | } 157 | 158 | /* 159 | * Improves readability of pre-formatted text in all browsers. 160 | */ 161 | 162 | pre { 163 | white-space: pre; 164 | white-space: pre-wrap; 165 | word-wrap: break-word; 166 | } 167 | 168 | /* 169 | * Sets consistent quote types. 170 | */ 171 | 172 | q { 173 | quotes: "\201C" "\201D" "\2018" "\2019"; 174 | } 175 | 176 | /* 177 | * Addresses inconsistent and variable font size in all browsers. 178 | */ 179 | 180 | small { 181 | font-size: 80%; 182 | } 183 | 184 | /* 185 | * Prevents `sub` and `sup` affecting `line-height` in all browsers. 
186 | */ 187 | 188 | sub, 189 | sup { 190 | font-size: 75%; 191 | line-height: 0; 192 | position: relative; 193 | vertical-align: baseline; 194 | } 195 | 196 | sup { 197 | top: -0.5em; 198 | } 199 | 200 | sub { 201 | bottom: -0.25em; 202 | } 203 | 204 | /* ========================================================================== 205 | Embedded content 206 | ========================================================================== */ 207 | 208 | /* 209 | * Removes border when inside `a` element in IE 8/9. 210 | */ 211 | 212 | img { 213 | border: 0; 214 | } 215 | 216 | /* 217 | * Corrects overflow displayed oddly in IE 9. 218 | */ 219 | 220 | svg:not(:root) { 221 | overflow: hidden; 222 | } 223 | 224 | /* ========================================================================== 225 | Figures 226 | ========================================================================== */ 227 | 228 | /* 229 | * Addresses margin not present in IE 8/9 and Safari 5. 230 | */ 231 | 232 | figure { 233 | margin: 0; 234 | } 235 | 236 | /* ========================================================================== 237 | Forms 238 | ========================================================================== */ 239 | 240 | /* 241 | * Define consistent border, margin, and padding. 242 | */ 243 | 244 | fieldset { 245 | border: 1px solid #c0c0c0; 246 | margin: 0 2px; 247 | padding: 0.35em 0.625em 0.75em; 248 | } 249 | 250 | /* 251 | * 1. Corrects color not being inherited in IE 8/9. 252 | * 2. Remove padding so people aren't caught out if they zero out fieldsets. 253 | */ 254 | 255 | legend { 256 | border: 0; /* 1 */ 257 | padding: 0; /* 2 */ 258 | } 259 | 260 | /* 261 | * 1. Corrects font family not being inherited in all browsers. 262 | * 2. Corrects font size not being inherited in all browsers. 263 | * 3. Addresses margins set differently in Firefox 4+, Safari 5, and Chrome 264 | */ 265 | 266 | button, 267 | input, 268 | select, 269 | textarea { 270 | font-family: inherit; /* 1 */ 271 | font-size: 100%; /* 2 */ 272 | margin: 0; /* 3 */ 273 | } 274 | 275 | /* 276 | * Addresses Firefox 4+ setting `line-height` on `input` using `!important` in 277 | * the UA stylesheet. 278 | */ 279 | 280 | button, 281 | input { 282 | line-height: normal; 283 | } 284 | 285 | /* 286 | * 1. Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio` 287 | * and `video` controls. 288 | * 2. Corrects inability to style clickable `input` types in iOS. 289 | * 3. Improves usability and consistency of cursor style between image-type 290 | * `input` and others. 291 | */ 292 | 293 | button, 294 | html input[type="button"], /* 1 */ 295 | input[type="reset"], 296 | input[type="submit"] { 297 | -webkit-appearance: button; /* 2 */ 298 | cursor: pointer; /* 3 */ 299 | } 300 | 301 | /* 302 | * Re-set default cursor for disabled elements. 303 | */ 304 | 305 | button[disabled], 306 | input[disabled] { 307 | cursor: default; 308 | } 309 | 310 | /* 311 | * 1. Addresses box sizing set to `content-box` in IE 8/9. 312 | * 2. Removes excess padding in IE 8/9. 313 | */ 314 | 315 | input[type="checkbox"], 316 | input[type="radio"] { 317 | box-sizing: border-box; /* 1 */ 318 | padding: 0; /* 2 */ 319 | } 320 | 321 | /* 322 | * 1. Addresses `appearance` set to `searchfield` in Safari 5 and Chrome. 323 | * 2. Addresses `box-sizing` set to `border-box` in Safari 5 and Chrome 324 | * (include `-moz` to future-proof). 
325 | */ 326 | 327 | input[type="search"] { 328 | -webkit-appearance: textfield; /* 1 */ 329 | -moz-box-sizing: content-box; 330 | -webkit-box-sizing: content-box; /* 2 */ 331 | box-sizing: content-box; 332 | } 333 | 334 | /* 335 | * Removes inner padding and search cancel button in Safari 5 and Chrome 336 | * on OS X. 337 | */ 338 | 339 | input[type="search"]::-webkit-search-cancel-button, 340 | input[type="search"]::-webkit-search-decoration { 341 | -webkit-appearance: none; 342 | } 343 | 344 | /* 345 | * Removes inner padding and border in Firefox 4+. 346 | */ 347 | 348 | button::-moz-focus-inner, 349 | input::-moz-focus-inner { 350 | border: 0; 351 | padding: 0; 352 | } 353 | 354 | /* 355 | * 1. Removes default vertical scrollbar in IE 8/9. 356 | * 2. Improves readability and alignment in all browsers. 357 | */ 358 | 359 | textarea { 360 | overflow: auto; /* 1 */ 361 | vertical-align: top; /* 2 */ 362 | } 363 | 364 | /* ========================================================================== 365 | Tables 366 | ========================================================================== */ 367 | 368 | /* 369 | * Remove most spacing between table cells. 370 | */ 371 | 372 | table { 373 | border-collapse: collapse; 374 | border-spacing: 0; 375 | } -------------------------------------------------------------------------------- /semanticizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/__init__.py -------------------------------------------------------------------------------- /semanticizer/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | """ 15 | This module is responsible for loading all possible configuration params and 16 | their defaults, overwriting the defaults by reading values from a given config 17 | file, then overwriting these values to whatever's been passed as argument. 
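A minimal usage sketch (illustrative only: the ('server', 'port') keys come
from the bundled example configs, and the default of 5000 is just an example
fallback):

    from semanticizer.config import load_config, config_get

    # load_config() also parses the command-line flags (-p, -v, -s, -c),
    # so values given on the command line override the config file.
    config = load_config('../conf/semanticizer.yml')
    port = config_get(('server', 'port'), default=5000, config=config)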
18 | """ 19 | import yaml 20 | import sys 21 | import argparse 22 | import traceback 23 | import os 24 | 25 | def load_config(path='../conf/semanticizer.yml'): 26 | 27 | #add command line args 28 | parser = argparse.ArgumentParser(description=""" 29 | Run sematicizer.""") 30 | 31 | parser.add_argument("-p", "--port", help="Port number ") 32 | parser.add_argument("-v", "--verbose", help="Verbose ") 33 | parser.add_argument("-s", "--host", help="Host ip address ") 34 | parser.add_argument("-c", "--config", help="Config file ") 35 | 36 | args = parser.parse_args() 37 | 38 | if args.config != None: 39 | path = args.config 40 | 41 | if not path.startswith("/"): 42 | path = os.path.join(os.path.dirname(__file__), path) 43 | 44 | configYaml = yaml.load(file(path)) 45 | 46 | if args.port != None: 47 | configYaml["server"]["port"] = int(args.port) 48 | 49 | if args.verbose != None: 50 | configYaml["logging"]["verbose"] = str2bool(args.verbose) 51 | 52 | if args.host != None: 53 | configYaml["server"]["host"] = args.host 54 | 55 | return configYaml 56 | 57 | def str2bool(v): 58 | return v.lower() in ("yes", "true", "t", "1") 59 | 60 | def config_get(keys=(), default=None, config=None): 61 | """ 62 | Allows user to access configuration variables and arguments. The function 63 | takes the variable name as its input, and returns the value or None is it 64 | isn't set. 65 | 66 | @param keys: The name of the configuration parameter to fetch. (Optional) 67 | @param default: The default value to return if the key is not found. 68 | @param config: dictionary to represent config. If None, load_config is 69 | called. 70 | @return: The value for the given parameter if name was set and valid, \ 71 | the default value if invalid or None if no default value was set. 72 | """ 73 | if config is None: 74 | config = load_config() 75 | 76 | if isinstance(keys, basestring): 77 | keys = [keys] 78 | 79 | pointer = config 80 | for key in keys: 81 | if not key in pointer: 82 | if default is not None: 83 | return default 84 | else: 85 | raise KeyError('Could not find %s in configuration' % key) 86 | pointer = pointer[key] 87 | 88 | index = 0 89 | return pointer 90 | -------------------------------------------------------------------------------- /semanticizer/dbinsert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/dbinsert/__init__.py -------------------------------------------------------------------------------- /semanticizer/dbinsert/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
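"""Load a Wikipedia Miner summary into the configured database backend
(Redis or MongoDB, depending on the per-language 'source' in the config).

Illustrative invocation, mirroring the usage note at the bottom of this
module (the language code and log path are examples, not defaults):

    python -m semanticizer.dbinsert --language=en --output=/tmp/redisinsert.log

Without --language, every language configured under wpm.languages is loaded.
"""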
13 | 14 | import yaml 15 | import sys 16 | import getopt 17 | 18 | from ..wpm.load import WpmLoader 19 | from ..config import config_get 20 | 21 | def load_wpm_data(datasource, langcode, settings, **kwargs): 22 | if datasource == "redis": 23 | from ..wpm.db.redisdb import RedisDB 24 | db = RedisDB(**kwargs) 25 | WpmLoader(db, langcode, settings, **kwargs) 26 | elif datasource == "mongo": 27 | from ..wpm.db.mongodb import MongoDB 28 | db = MongoDB(**kwargs) 29 | WpmLoader(db, langcode, settings, **kwargs) 30 | else: 31 | raise ValueError("No %s backend for language %s" % (datasource, langcode)) 32 | 33 | 34 | 35 | ## 36 | ## usage 37 | ## python -m semanticizer.dbinsert --language= --output=/tmp/redisinsert.log 38 | if __name__ == '__main__': 39 | configYaml = yaml.load(file('conf/semanticizer.yml')) 40 | wpm_languages = config_get(('wpm', 'languages'), None, configYaml) 41 | settings = config_get("settings", {}, configYaml) 42 | try: 43 | opts, args = getopt.getopt(sys.argv[1:], 'l:o:', ['language=', 'output=']) 44 | except getopt.GetoptError: 45 | usage() 46 | sys.exit(2) 47 | 48 | showprogress = True 49 | output = None 50 | language = None 51 | 52 | for opt, arg in opts: 53 | if opt in ('-l', '--language'): 54 | language = arg 55 | elif opt in ('-o', '--output'): 56 | output = arg 57 | 58 | if output: 59 | f = open(output, "w+") 60 | sys.stdout = f 61 | showprogress = False 62 | 63 | #if language code is specified only import that language 64 | if language and wpm_languages[language]: 65 | load_wpm_data(wpm_languages[language]['source'], language, settings, progress=showprogress, **wpm_languages[language]['initparams']) 66 | #else important all languages in the config file 67 | else: 68 | for langcode, langconfig in wpm_languages.iteritems(): 69 | load_wpm_data(langconfig['source'], langcode, settings, progress=showprogress, **langconfig['initparams']) 70 | 71 | -------------------------------------------------------------------------------- /semanticizer/processors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/processors/__init__.py -------------------------------------------------------------------------------- /semanticizer/processors/context.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import networkx 15 | from networkx.algorithms.centrality import degree_centrality 16 | 17 | from multiprocessing import Pool 18 | 19 | def pagerank_worker(graph, page_ranked): 20 | print "Pagerank on graph with %d nodes and %d edges." 
\ 21 | % (len(graph.nodes()), \ 22 | len(graph.edges())) 23 | for node in graph.nodes(): 24 | page_ranked.setdefault(node, 1) 25 | 26 | from networkx.algorithms.link_analysis import pagerank 27 | from time import time 28 | 29 | try: 30 | start = time() 31 | page_ranked = pagerank(graph, max_iter=1000, nstart=page_ranked) # 0.2-1.5s for #node = 2500 32 | print "Pagerank took: %f seconds" % (time()-start) 33 | except ZeroDivisionError: 34 | print "ZeroDivisionError in pagerank" 35 | 36 | page_ranked_sorted = sorted(page_ranked.items(), key=lambda x: x[1], reverse=True) 37 | print page_ranked_sorted[:4] 38 | 39 | pool = Pool() 40 | 41 | class contextGraph: 42 | def __init__(self, label, threshold_function, threshold, min_t): 43 | self.graph = networkx.Graph() 44 | self.page_ranked = {} 45 | self.chunk = -1 46 | self.feature_label = "CONTEXT_" + label.upper() 47 | 48 | self.threshold_function = threshold_function 49 | self.threshold = threshold 50 | self.min_t = min_t 51 | 52 | def to_dict_of_dicts(self): 53 | return networkx.convert.to_dict_of_dicts(self.graph) 54 | 55 | def add_chunk(self): 56 | self.chunk += 1 57 | self.page_ranked.setdefault("[Chunk%d]" % self.chunk, 0) 58 | if self.chunk > 0: 59 | self.graph.add_edge("[Chunk%d]" % self.chunk, \ 60 | "[Chunk%d]" % (self.chunk-1), t=self.chunk) 61 | 62 | def add_link(self, link): 63 | assert link.has_key("title") 64 | assert link.has_key(self.threshold_function) 65 | assert link.has_key("label") 66 | 67 | if link[self.threshold_function] < self.threshold: return 68 | 69 | label_text = "[%d-%s]" % (self.chunk, link["label"]) 70 | self.page_ranked.setdefault(link["title"], 1) 71 | self.page_ranked.setdefault(label_text, 0) 72 | self.graph.add_edge(label_text, link["title"], t=self.chunk) # weight=senseProbability 73 | self.graph.add_edge(label_text, "[Chunk%d]" % self.chunk, t=self.chunk) 74 | 75 | def prepare_features(self): 76 | self.clean_graph(self.chunk-self.min_t) 77 | 78 | self.pagerank_result = pool.apply_async(pagerank_worker, (self.graph, self.page_ranked,)) 79 | 80 | # def degree_centrality_worker(): 81 | # self.degree_centralities = degree_centrality(self.graph) 82 | # 83 | # self.degree_centrality_thread = Thread(target=degree_centrality_worker) 84 | # self.degree_centrality_thread.start() 85 | 86 | self.degree_centrality_result = pool.apply_async(degree_centrality, (self.graph,)) 87 | 88 | def compute_features(self, title): 89 | # self.degree_centrality_thread.join() 90 | # self.pagerank_thread.join() 91 | self.degree_centralities = self.degree_centrality_result.get() 92 | self.pagerank_result.wait() 93 | 94 | features = {} 95 | features[self.feature_label + "_DEGREE"] = 0 96 | features[self.feature_label + "_PAGERANK"] = 0 97 | features[self.feature_label + "_PAGERANK_NORMALIZED"] = 0 98 | features[self.feature_label + "_DEGREE_CENTRALITY"] = 0 99 | if title in self.page_ranked: 100 | features[self.feature_label + "_PAGERANK"] = self.page_ranked[title] 101 | features[self.feature_label + "_PAGERANK_NORMALIZED"] = \ 102 | len(self.graph.nodes()) * self.page_ranked[title] 103 | if title in self.degree_centralities: 104 | features[self.feature_label + "_DEGREE"] = \ 105 | self.graph.degree(title) 106 | features[self.feature_label + "_DEGREE_CENTRALITY"] = \ 107 | self.degree_centralities[title] 108 | return features 109 | 110 | def clean_graph(self, min_t): 111 | # Remove edges with a t lower than min_t 112 | for edge in self.graph.edges(): 113 | if self.graph[edge[0]][edge[1]]["t"] < min_t: 114 | self.graph.remove_edge(edge[0], 
edge[1]) 115 | # Remove nodes that have become disconnected 116 | for node in self.graph.nodes(): 117 | if self.graph.degree(node) == 0: 118 | self.graph.remove_node(node) 119 | del self.page_ranked[node] 120 | 121 | def pagerank(self): 122 | # from networkx.algorithms.link_analysis import pagerank_scipy 123 | # from networkx.algorithms.link_analysis import pagerank_numpy 124 | from networkx.algorithms.link_analysis import pagerank 125 | from time import time 126 | try: 127 | start = time() 128 | # pagerank(graph, max_iter=1000) # 1.7s for #nodes = 2500 129 | pagerank(self.graph, max_iter=1000, nstart=self.page_ranked) # 0.2-1.5s for #node = 2500 130 | # pagerank_scipy(graph) # 1.0s for #nodes = 2500 131 | # pagerank_numpy(graph) # > 30s if #nodes > 1000 132 | print "Pagerank took: %f seconds" % (time()-start) 133 | except ZeroDivisionError: 134 | print "ZeroDivisionError in pagerank" 135 | 136 | page_ranked_sorted = sorted(self.page_ranked.items(), key=lambda x: x[1], reverse=True) 137 | print page_ranked_sorted[:4] 138 | 139 | # from networkx.algorithms.centrality import * 140 | 141 | # start = time() 142 | # degree_centrality = degree_centrality(graph) # 0.003s for 1500 nodes 143 | # print "Degree centrality took: %f seconds" % (time()-start) 144 | # 145 | # start = time() 146 | # closeness_centrality = closeness_centrality(graph) # 4s for 1500 nodes 147 | # print "Closeness centrality took: %f seconds" % (time()-start) 148 | # 149 | # start = time() 150 | # betweenness_centrality = betweenness_centrality(graph) # 18s for 1500 nodes 151 | # print "Betweenness centrality took: %f seconds" % (time()-start) 152 | 153 | return self.page_ranked 154 | -------------------------------------------------------------------------------- /semanticizer/processors/core.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | class LinksProcessor: 15 | '''A LinksProcessor takes a set of links, a text and a language code to 16 | produce or process links. Processing is done in two steps, a preprocessing 17 | step and a processing step. 
''' 18 | 19 | def preprocess(self, links, text, settings): 20 | return (links, text, settings) 21 | 22 | def process(self, links, text, settings): 23 | return (links, text, settings) 24 | 25 | def postprocess(self, links, text, settings): 26 | return (links, text, settings) 27 | 28 | def inspect(self): 29 | return {} 30 | 31 | class SettingsProcessor(LinksProcessor): 32 | def __init__(self, settings): 33 | self.settings = settings 34 | 35 | def preprocess(self, links, text, settings): 36 | if "settings" in settings and settings["settings"] in self.settings: 37 | for k, v in self.settings[settings["settings"]].iteritems(): 38 | if k not in settings: 39 | settings[k] = v 40 | del settings["settings"] 41 | return (links, text, settings) 42 | def inspect(self): 43 | return {self.__class__.__name__: self.settings} 44 | 45 | class FilterProcessor(LinksProcessor): 46 | def __init__(self): 47 | self.context_links = {} 48 | 49 | def preprocess(self, links, text, settings): 50 | if settings.has_key("prefilter"): 51 | links = self.filter_links(settings["prefilter"].split(","), links, settings) 52 | 53 | return (links, text, settings) 54 | 55 | def postprocess(self, links, text, settings): 56 | if "filter" in settings: 57 | links = self.filter_links(settings["filter"].split(","), 58 | links, settings) 59 | 60 | return (links, text, settings) 61 | 62 | def filter_links(self, filters, links, settings): 63 | filters_gte = [fltr.split(">=") for fltr in filters if ">=" in fltr] 64 | filters_gt = [fltr.split(">") for fltr in filters \ 65 | if ">" in fltr and not ">=" in fltr] 66 | 67 | filter_unique = ("unique" in filters) and "context" in settings 68 | 69 | if len(filters_gte) == 0 and len(filters_gt) == 0 \ 70 | and not filter_unique: 71 | return links 72 | 73 | filtered_links = [] 74 | # Q: why do we not apply the gt filter if a gte filter fails? 75 | for link in links: 76 | skip = False 77 | for fltr in filters_gte: 78 | if not link[fltr[0]] >= float(fltr[1]): 79 | skip = True 80 | break 81 | else: 82 | for fltr in filters_gt: 83 | if not link[fltr[0]] > float(fltr[1]): 84 | skip = True 85 | break 86 | 87 | if filter_unique: 88 | self.context_links.setdefault(settings["context"], {}) 89 | if link["title"] in self.context_links[settings["context"]]: 90 | skip = True 91 | 92 | if not skip: 93 | filtered_links.append(link) 94 | 95 | if filter_unique: 96 | self.context_links[settings["context"]][link["title"]] = link 97 | 98 | print "Filtered %d links to %d" % (len(links), len(filtered_links)) 99 | 100 | return filtered_links 101 | 102 | def inspect(self): 103 | return {self.__class__.__name__: self.context_links} -------------------------------------------------------------------------------- /semanticizer/processors/external.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. 
If not, see . 13 | 14 | from Queue import Queue, Empty 15 | from threading import Thread 16 | 17 | import urllib2 18 | 19 | import datetime 20 | import shelve 21 | import os 22 | from copy import deepcopy 23 | 24 | from .core import LinksProcessor 25 | from ..wpm.data import wpm_dumps 26 | from ..wpm.utils import get_relatedness 27 | 28 | 29 | class ArticlesProcessor(LinksProcessor): 30 | def __init__(self, langcodes, pickledir): 31 | self.langcodes = langcodes 32 | self.article_template = { 33 | "article_id": -1, 34 | "article_title": "", 35 | "Definition": "", 36 | "InLinks": [], 37 | "OutLinks": [], 38 | "Labels": [], 39 | "Images": [], 40 | "ParentCategories": [] 41 | } 42 | 43 | def preprocess(self, links, text, settings): 44 | if not "article" in settings and not "features" in settings and not \ 45 | "learning" in settings and "multi" not in settings: 46 | return (links, text, settings) 47 | if not settings["langcode"] in self.langcodes: 48 | return (links, text, settings) 49 | 50 | return (links, text, settings) 51 | 52 | def process(self, links, text, settings): 53 | if not "article" in settings and not "features" in settings and not \ 54 | "learning" in settings: 55 | return (links, text, settings) 56 | if not settings["langcode"] in self.langcodes: 57 | return (links, text, settings) 58 | 59 | wpm = wpm_dumps[settings["langcode"]] 60 | 61 | if "article" in settings: 62 | parts = settings["article"].lower().split(',') 63 | else: 64 | parts = [key.lower() for key in self.article_template.keys()] 65 | 66 | titles = [link["title"] for link in links] 67 | ids = [link["id"] for link in links] 68 | articles = wpm.get_articles(*ids) 69 | 70 | for link, id, title, article in zip(links, ids, titles, articles): 71 | 72 | link.update(deepcopy(self.article_template)) 73 | 74 | link["article_title"] = title 75 | link["article_id"] = id 76 | 77 | inlinks = article["InLinks"] 78 | if inlinks and (not parts or 'inlinks' in parts): 79 | if not parts or 'relatedness' in parts: 80 | for inlink in inlinks: 81 | title = wpm.get_item_title(inlink) 82 | relatedness = get_relatedness(inlinks, wpm.get_item_inlinks(inlink) ) 83 | link["InLinks"].append( {"title":title, "id":int(inlink), "relatedness":relatedness} ) 84 | else: 85 | link["InLinks"] = [{ "id":int(inlink) } for inlink in inlinks] 86 | 87 | outlinks = article["OutLinks"] 88 | if outlinks and (not parts or 'outlinks' in parts): 89 | if not parts or 'relatedness' in parts: 90 | for outlink in outlinks: 91 | title = wpm.get_item_title(outlink) 92 | relatedness = get_relatedness(outlinks, wpm.get_item_outlinks(outlink) ) 93 | link["OutLinks"].append( {"title":title, "id":int(outlink), "relatedness":relatedness} ) 94 | else: 95 | link["OutLinks"] = [{ "id":int(outlink) } for outlink in outlinks] 96 | 97 | if not parts or 'categories' in parts: 98 | categories = wpm.get_item_categories( link["article_id"] ) 99 | if categories: 100 | for category in categories: 101 | title = wpm.get_item_title(category) 102 | link["ParentCategories"].append( {"title":title, "id":int(category)} ) 103 | 104 | if not parts or 'definition' in parts: 105 | definition = wpm.get_item_definition(link["article_id"]) 106 | if definition: 107 | link["Definition"] = definition 108 | 109 | if article["Labels"] and "labels" in parts: 110 | link["Labels"] = article["Labels"] 111 | 112 | return (links, text, settings) 113 | 114 | def postprocess(self, links, text, settings): 115 | if "article" in settings and len(settings["article"]) == 0: 116 | return (links, text, settings) 117 | 
remove = [key.lower() for key in self.article_template.keys()] 118 | remove.extend(["fromtitle", "fromredirect"]) 119 | if "article" in settings: 120 | for label in settings["article"].replace(";", ",").split(","): 121 | if label.lower() in remove: 122 | remove.remove(label) 123 | for link in links: 124 | for label in link.keys(): 125 | if label.lower() in remove: 126 | del link[label] 127 | 128 | return (links, text, settings) 129 | 130 | 131 | class StatisticsProcessor(LinksProcessor): 132 | def __init__(self, langcodes, num_of_threads, pickledir): 133 | self.num_of_threads = num_of_threads 134 | self.WIKIPEDIA_STATS_URL = {} 135 | self.wikipedia_statistics_cache = {} 136 | for langcode in langcodes: 137 | self.WIKIPEDIA_STATS_URL[langcode] = \ 138 | "http://stats.grok.se/json/" \ 139 | + langcode \ 140 | + "/%d%02d/%s" # 201001/De%20Jakhalzen 141 | 142 | pickle_root = os.path.join(pickledir, langcode) 143 | if not os.path.isdir(pickle_root): 144 | os.makedirs(pickle_root) 145 | self.wikipedia_statistics_cache[langcode] = \ 146 | shelve.open(os.path.join(pickle_root, \ 147 | 'wikipedia_statistics_cache.db')) 148 | print "Loaded %d sets of statistics for %s from cache." \ 149 | % (len(self.wikipedia_statistics_cache[langcode]), langcode) 150 | 151 | def inspect(self): 152 | return {self.__class__.__name__: self.WIKIPEDIA_STATS_URL} 153 | 154 | def preprocess(self, links, text, settings): 155 | if "wikistats" not in settings: 156 | return (links, text, settings) 157 | 158 | now = self.get_timestamp(settings) 159 | 160 | def worker(): 161 | while True: 162 | try: 163 | (year, month, article) = queue.get_nowait() 164 | self.wikipedia_page_views(year, month, 165 | article, settings["langcode"]) 166 | queue.task_done() 167 | except Empty: 168 | break 169 | 170 | queue = Queue() 171 | for _ in set([link["title"] for link in links]): 172 | day = now 173 | for _ in range(14): 174 | queue.put((day.year, day.month, article)) 175 | day += timedelta(days=28) 176 | 177 | for _ in range(self.num_of_threads): 178 | t = Thread(target=worker) 179 | t.daemon = True 180 | t.start() 181 | 182 | def process(self, links, text, settings): 183 | if "wikistats" not in settings: 184 | return (links, text, settings) 185 | 186 | now = self.get_timestamp(settings) 187 | 188 | self.queue.join() 189 | 190 | for link in links: 191 | features = {"WIKISTATSDAY": 0, 192 | "WIKISTATSWK": 0, 193 | "WIKISTATS4WK": 0, 194 | "WIKISTATSYEAR": 0, 195 | "WIKISTATSDAYOFWK": 0, 196 | "WIKISTATSWKOF4WK": 0, 197 | "WIKISTATS4WKOFYEAR": 0 198 | } 199 | 200 | self.feature_WIKISTATSDAY(datetime, link["title"], features, now) 201 | self.feature_WIKISTATSWK(datetime, link["title"], features, now) 202 | self.feature_WIKISTATS4WK(datetime, link["title"], features, now) 203 | self.feature_WIKISTATSYEAR(datetime, link["title"], features, now) 204 | self.feature_WIKISTATSTRENDS(features) 205 | 206 | del features["WIKISTATSDAY"] 207 | 208 | link["features"].update(features) 209 | 210 | for langcode, cache in self.wikipedia_statistics_cache.iteritems(): 211 | print "Saving %d sets of statistics for %s from cache." 
\ 212 | % (len(cache), langcode) 213 | cache.sync() 214 | 215 | return (links, text, settings) 216 | 217 | def get_timestamp(self, settings): 218 | # Should be more robust against unexpected values 219 | if len(settings["wikistats"]) > 0: 220 | return datetime.datetime.fromtimestamp(int(settings["wikistats"])) 221 | else: 222 | return datetime.datetime.now() 223 | 224 | def wikipedia_page_views(self, year, month, article, langcode): 225 | url = self.WIKIPEDIA_STATS_URL[langcode] % (year, month, article) 226 | url = url.encode('utf-8') 227 | if url in self.wikipedia_statistics_cache[langcode]: 228 | resultJson = self.wikipedia_statistics_cache[langcode][url] 229 | else: 230 | try: 231 | request = urllib2.urlopen(url, timeout=1) 232 | resultJson = request.read() 233 | except urllib2.URLError: 234 | try: 235 | request = urllib2.urlopen(url) 236 | resultJson = request.read() 237 | except urllib2.URLError: 238 | request = urllib2.urlopen(url) 239 | resultJson = request.read() 240 | 241 | self.wikipedia_statistics_cache[langcode][url] = resultJson 242 | 243 | from json import loads 244 | result = loads(resultJson) 245 | 246 | return result 247 | 248 | def feature_WIKISTATSDAY(self, datetime, article, features, now): 249 | day = now 250 | day += timedelta(days=-1) 251 | monthly_views = self.wikipedia_page_views(day.year, 252 | day.month, article) 253 | views = monthly_views["daily_views"][self.date_format % \ 254 | (day.year, day.month, day.day)] 255 | features["WIKISTATSDAY"] = views 256 | 257 | def feature_WIKISTATSWK(self, datetime, article, features, now): 258 | day = now 259 | for _ in range(7): 260 | day += timedelta(days=-1) 261 | monthly_views = self.wikipedia_page_views(day.year, 262 | day.month, article) 263 | views = \ 264 | monthly_views["daily_views"][self.date_format % \ 265 | (day.year, day.month, day.day)] 266 | features["WIKISTATSWK"] += views 267 | 268 | def feature_WIKISTATS4WK(self, datetime, article, features, now): 269 | day = now 270 | for _ in range(28): 271 | day += timedelta(days=-1) 272 | monthly_views = self.wikipedia_page_views(day.year, 273 | day.month, article) 274 | views = monthly_views["daily_views"][self.date_format % \ 275 | (day.year, day.month, day.day)] 276 | features["WIKISTATS4WK"] += views 277 | 278 | def feature_WIKISTATSYEAR(self, datetime, article, features, now): 279 | day = now 280 | for _ in range(365): 281 | day += timedelta(days=-1) 282 | monthly_views = self.wikipedia_page_views(day.year, 283 | day.month, article) 284 | views = monthly_views["daily_views"][self.date_format % \ 285 | (day.year, day.month, day.day)] 286 | features["WIKISTATSYEAR"] += views 287 | 288 | def feature_WIKISTATSTRENDS(self, features): 289 | if features["WIKISTATSWK"] > 0: 290 | features["WIKISTATSDAYOFWK"] = \ 291 | float(features["WIKISTATSDAY"]) / features["WIKISTATSWK"] 292 | if features["WIKISTATS4WK"] > 0: 293 | features["WIKISTATSWKOF4WK"] = \ 294 | float(features["WIKISTATSWK"]) / features["WIKISTATS4WK"] 295 | if features["WIKISTATSYEAR"] > 0: 296 | features["WIKISTATS4WKOFYEAR"] = \ 297 | float(features["WIKISTATS4WK"]) / features["WIKISTATSYEAR"] 298 | -------------------------------------------------------------------------------- /semanticizer/processors/feature.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. 
This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import collections 15 | from math import log 16 | import cPickle as pickle 17 | import os 18 | import re 19 | 20 | from leven import levenshtein 21 | 22 | from . import stringUtils 23 | from ..wpm.data import wpm_dumps 24 | 25 | class anchorFeatures: 26 | def __init__(self, langcode): 27 | self.wpm = wpm_dumps[langcode] 28 | self.wikipediaArticleCount = int(self.wpm.get_stat("articleCount")) #970139 29 | self.wikipediaCategoryCount = int(self.wpm.get_stat("categoryCount")) #63108 30 | 31 | def feature_LEN(self, lnk): 32 | return len(re.findall(stringUtils.reTokenPattern, lnk["label"])) 33 | 34 | def feature_IDF_title(self, lnk): 35 | score = self.wpm.get_title_ngram_score(lnk["label"]) 36 | if not score == None: 37 | in_title_count = int(score) 38 | else: 39 | in_title_count = 0 40 | return log(float(self.wikipediaArticleCount) / \ 41 | (float(in_title_count) + 0.00001)) 42 | 43 | def feature_IDF_anchor(self, lnk): 44 | return log(float(self.wikipediaArticleCount) / \ 45 | (float(lnk["linkDocCount"]) + 0.00001)) 46 | 47 | def feature_IDF_content(self, lnk): 48 | return log(float(self.wikipediaArticleCount) / \ 49 | (float(lnk["docCount"]) + 0.00001)) 50 | 51 | def feature_KEYPHRASENESS(self, lnk): 52 | return float(lnk["linkDocCount"]) / (float(lnk["docCount"]) + 0.00001) 53 | 54 | def feature_LINKPROB(self, lnk): 55 | return float(lnk["linkOccCount"]) / (float(lnk["occCount"]) + 0.00001) 56 | 57 | def feature_SNIL(self, lnk): 58 | SNIL = 0 59 | 60 | words = lnk["label"].split() 61 | for n in range(1, len(words) + 1): 62 | for i in range(0, len(words) - n): 63 | ngram = " ".join(words[i:i + n]) 64 | if not self.wpm.get_item_id(ngram) == None: 65 | SNIL += 1 66 | return SNIL 67 | 68 | def feature_SNCL(self, lnk): 69 | SNCL = 0 70 | 71 | words = lnk["label"].split() 72 | for n in range(1, len(words) + 1): 73 | for i in range(0, len(words) - n): 74 | ngram = " ".join(words[i:i + n]) 75 | score = self.wpm.get_title_ngram_score(ngram) 76 | if not score == None: 77 | SNCL += int(score) 78 | return SNCL 79 | 80 | def feature_NORMALIZATION(self, lnk): 81 | edit = levenshtein(unicode(lnk["label"]), unicode(lnk["text"])) 82 | return float(edit) / len(lnk["text"]) 83 | 84 | def compute_anchor_features(self, lnk): 85 | return {'LEN': self.feature_LEN(lnk), 86 | 'IDF_title': self.feature_IDF_title(lnk), 87 | 'IDF_anchor': self.feature_IDF_anchor(lnk), 88 | 'IDF_content': self.feature_IDF_content(lnk), 89 | 'KEYPHRASENESS': self.feature_KEYPHRASENESS(lnk), 90 | 'LINKPROB': self.feature_LINKPROB(lnk), 91 | 'SNIL': self.feature_SNIL(lnk), 92 | 'SNCL': self.feature_SNCL(lnk), 93 | 'NORMALIZATION': self.feature_NORMALIZATION(lnk) 94 | } 95 | 96 | 97 | class articleFeatures: 98 | def __init__(self): 99 | self.re_non_word_chars = re.compile(r'(?u)\W+', re.UNICODE) 100 | 101 | def feature_INLINKS(self, lnk): 102 | if "InLinks" not in lnk: 103 | 
return 0 104 | return len(lnk["InLinks"]) 105 | 106 | def feature_OUTLINKS(self, lnk): 107 | if "OutLinks" not in lnk: 108 | return 0 109 | return len(lnk["OutLinks"]) 110 | 111 | def feature_REDIRECT(self, lnk): 112 | # Should be fromRedirect but bug in Wikipedia Miner 113 | if "fromTitle" in lnk and lnk["fromTitle"]: 114 | return 1 115 | return 0 116 | 117 | def feature_TF(self, lnk, re_label_text, features): 118 | aMatches = re.findall(re_label_text, lnk['title']) 119 | features["TF_title"] = float(len(aMatches)) 120 | 121 | text = " " 122 | if "Definition" in lnk: 123 | if lnk["Definition"] and len(lnk["Definition"]): 124 | text = re.sub(r"<.*?>", "", lnk["Definition"]) 125 | text = re.sub(r"^[|\- }]*", "", text) 126 | 127 | while len(text) and (text[0] == "."): 128 | text = text[1:].strip() 129 | 130 | # Very rarely articles do not have a Definition text (or a dummy one 131 | # like "----") 132 | if len(text) == 0: 133 | features["TF_sentence"] = 0 134 | features["TF_paragraph"] = 0 135 | features["POS_first_in_paragraph"] = 1 136 | else: 137 | # Sentence is first sentence 138 | sentence = text.split('.')[0] 139 | 140 | aMatches = re.findall(re_label_text, sentence) 141 | features["TF_sentence"] = float(len(aMatches)) 142 | 143 | aMatches = re.findall(re_label_text, text) 144 | features["TF_paragraph"] = float(len(aMatches)) 145 | 146 | if len(aMatches): 147 | features["POS_first_in_paragraph"] = \ 148 | float(re.search(re_label_text, text).start()) 149 | else: 150 | features["POS_first_in_paragraph"] = 1 151 | 152 | def feature_TITLE(self, lnk, re_label_text, features): 153 | label_text = unicode(lnk["label"]) 154 | 155 | re_title = stringUtils.ngramToPattern(lnk['title']) 156 | article_title = unicode(lnk['title']) 157 | 158 | features["NCT"] = 0 if re.search(re_title, label_text) is None \ 159 | else 1 160 | 161 | features["TCN"] = 0 \ 162 | if re.search(re_label_text, article_title) is None else 1 163 | 164 | features["TEN"] = 1 if article_title == label_text else 0 165 | 166 | # Irritatingly enough, split() can give you empty values as last 167 | # element 168 | split_label = self.re_non_word_chars.split(label_text) 169 | if split_label[-1] == '': 170 | split_label.pop() 171 | split_title = self.re_non_word_chars.split(article_title) 172 | if split_title[-1] == '': 173 | split_title.pop() 174 | 175 | # I: True if the title of the candidate begins with the the query 176 | # (e.g. 
"Cambridge, Massachusetts" and "Cambridge" ) 177 | features["SUBSTRING_MATCH_1"] = 1 \ 178 | if split_title[0] == split_label[0] else 0 179 | 180 | # II: True if the title of the candidate ends with the the query 181 | # (e.g: "Venice-Simplon Orient Express" and "Orient Express") 182 | features["SUBSTRING_MATCH_2"] = 1 \ 183 | if split_title[-1] == split_label[-1] else 0 184 | 185 | # collections.Counter() converts an array to a dict of words 186 | # and their frequencies 187 | cSplitLabel = collections.Counter(split_label) 188 | cSplitTitle = collections.Counter(split_title) 189 | 190 | # Number of shared words between the title of the candidate and 191 | # the query 192 | features['WORD_MATCH'] = len(list(cSplitLabel & cSplitTitle)) 193 | 194 | # Number of different words between the title of the candidate 195 | # and the query 196 | features['WORD_MISS'] = len(split_label) + len(split_title) \ 197 | - (2 * features['WORD_MATCH']) 198 | 199 | # Levenshtein distance between query and title of the candidate 200 | features["EDIT_DISTANCE"] = levenshtein(label_text, article_title) 201 | 202 | def feature_COMMONNESS(self, lnk, features): 203 | features["COMMONNESS"] = lnk["priorProbability"] 204 | 205 | def compute_article_features(self, lnk): 206 | features = { 207 | 'INLINKS': self.feature_INLINKS(lnk), 208 | 'OUTLINKS': self.feature_OUTLINKS(lnk), 209 | 'REDIRECT': self.feature_REDIRECT(lnk) 210 | } 211 | 212 | re_label_text = stringUtils.ngramToPattern(lnk["label"]) 213 | 214 | self.feature_TF(lnk, re_label_text, features) 215 | self.feature_TITLE(lnk, re_label_text, features) 216 | self.feature_COMMONNESS(lnk, features) 217 | 218 | return features 219 | 220 | ### TK: Ik heb nog wat extra features gemaakt die kijken hoe vaak 221 | ### inlink anchors en inlink/outlink titels voorkomen in de 222 | ### referentietekst en de zogenaamde aposition in de titel 223 | ### ('actress' in 'Sue Johnson (actress)') 224 | ### 225 | ### 'NR_OF_MATCHING_INLINK_ANCHORS', 'NR_OF_MATCHING_INLINK_TITLES', 226 | ### 'NR_OF_MATCHING_OUTLINK_TITLES', 'APOSITION' 227 | ### 228 | ### Dat is er nu niet zo makkelijk in te bouwen omdat we hier geen 229 | ### toegang hebben tot de referentietekst. Maar die features van David 230 | ### gaan dat ook zeker nodig hebben! 231 | ### 232 | ### Maar goed, ik heb ze nu nog even weg gelaten... 233 | 234 | if __name__ == "__main__": 235 | # Some settings 236 | langcode = "en" 237 | wikipediaminer_root = '/zfs/ilps-plexer/wikipediaminer/enwiki-20111007/' 238 | pickledir = "/Users/evertlammerts/semanticizer/pickles/" 239 | 240 | # Test data 241 | link = {"label": "Alabama", 242 | "linkDocCount": 10, # Al deze waardes slaan nergens op natuurlijk, 243 | "docCount": 20, # maar ok... 
244 | "linkOccCount": 100, 245 | "occCount": 200, 246 | "commonness": 0.12345 247 | } 248 | 249 | # Article 250 | article_url = '' # Wordt niet gebruikt nu 251 | fh_article_xml = open("unitTest.article.xml", "r") 252 | article_xml = fh_article_xml.read() 253 | fh_article_xml.close() 254 | article = ElementTree.fromstring(article_xml).find("Response") 255 | 256 | # Initialize the objects 257 | print "Initializing anchor features" 258 | anchor_features = anchorFeatures(langcode) 259 | print "Initializing concept features" 260 | concept_features = conceptFeatures(langcode, wikipediaminer_root, 261 | article_url) 262 | print "Initializing anchor/concept features" 263 | anchor_concept_features = anchorConceptFeatures() 264 | print "Initializing statistics features" 265 | statistics_features = statisticsFeatures(langcode) 266 | 267 | print "Start calculating" 268 | test_features = { 269 | "anchor": anchor_features.compute_anchor_features(link), 270 | "concept": concept_features.compute_concept_features(article), 271 | "anchor_concept": \ 272 | anchor_concept_features.compute_anchor_concept_features(link, article), 273 | "statistics": statistics_features.compute_statistics_features(article), 274 | } 275 | 276 | print "%s" % test_features 277 | -------------------------------------------------------------------------------- /semanticizer/processors/features.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | from collections import defaultdict 15 | 16 | from . import feature as features 17 | from . 
import context 18 | 19 | from .core import LinksProcessor 20 | 21 | class FeaturesProcessor(LinksProcessor): 22 | def __init__(self, langcodes): 23 | self.features = {} 24 | for langcode in langcodes: 25 | self.features[langcode] = features.anchorFeatures(langcode) 26 | 27 | def process(self, links, text, settings): 28 | if not "features" in settings and not "learning" in settings: 29 | return (links, text, settings) 30 | if not settings["langcode"] in self.features: 31 | return (links, text, settings) 32 | 33 | featuresets = self.features[settings["langcode"]] 34 | 35 | for link in links: 36 | link.setdefault("features", {}) 37 | link["features"].update(featuresets.compute_anchor_features(link)) 38 | 39 | return (links, text, settings) 40 | 41 | def inspect(self): 42 | return {self.__class__.__name__: self.features.keys()} 43 | 44 | 45 | class ArticleFeaturesProcessor(LinksProcessor): 46 | def __init__(self): 47 | self.features = features.articleFeatures() 48 | 49 | def process(self, links, text, settings): 50 | if not "features" in settings and not "learning" in settings: 51 | return (links, text, settings) 52 | # Check if ArticleProcessor has run 53 | 54 | for link in links: 55 | link.setdefault("features", {}) 56 | link["features"].update( 57 | self.features.compute_article_features(link) 58 | ) 59 | 60 | return (links, text, settings) 61 | 62 | def inspect(self): 63 | return {self.__class__.__name__: str(self.features)} 64 | 65 | 66 | class ContextFeaturesProcessor(LinksProcessor): 67 | def __init__(self): 68 | self.context_features = {} 69 | self.context_text = defaultdict(list) 70 | self.context_id_pattern = "%s:%d" 71 | 72 | def new_context(self, context_label): 73 | self.context_features[context_label] = { 74 | "SP0.2-100": context.contextGraph("SP0.2-100", "senseProbability", 75 | 0.2, 100) 76 | } 77 | 78 | def preprocess(self, links, text, settings): 79 | if "context" in settings: 80 | settings["context_id"] = self.context_id_pattern % \ 81 | (settings["context"], len(self.context_text[settings["context"]])) 82 | self.context_text[settings["context"]].append(text) 83 | 84 | return (links, text, settings) 85 | 86 | def process(self, links, text, settings): 87 | if not "context" in settings or "skip_context_features" in settings or \ 88 | (not "features" in settings and not "learning" in settings): 89 | return (links, text, settings) 90 | 91 | # Create context_features if it does not exist 92 | if settings["context"] not in self.context_features: 93 | self.new_context(settings["context"]) 94 | 95 | # For each set of context features 96 | for label in self.context_features[settings["context"]]: 97 | # Create a new chunk 98 | self.context_features[settings["context"]][label].add_chunk() 99 | graph = self.context_features[settings["context"]][label] 100 | # Add each link to graph and prepare features 101 | for link in links: 102 | graph.add_link(link) 103 | graph.prepare_features() 104 | 105 | # Compute context features for each link 106 | for link in links: 107 | link["features"].update(graph.compute_features(link["title"])) 108 | 109 | return (links, text, settings) 110 | 111 | def inspect(self): 112 | context = {} 113 | for context_label, features in self.context_features.iteritems(): 114 | context[context_label] = {"text": self.context_text[context_label]} 115 | for label, context_graph in features.iteritems(): 116 | graph = {"page_ranked": context_graph.page_ranked, 117 | "graph": context_graph.to_dict_of_dicts(), 118 | "chunk": context_graph.chunk} 119 | 
context[context_label][label] = graph 120 | 121 | return {self.__class__.__name__: context} 122 | -------------------------------------------------------------------------------- /semanticizer/processors/image.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | from Queue import Queue, Empty 15 | from threading import Thread 16 | 17 | import urllib2, re 18 | 19 | from .core import LinksProcessor 20 | 21 | class AddImageProcessor(LinksProcessor): 22 | def postprocess(self, links, text, settings): 23 | if "image" in settings and "langcode" in settings: 24 | links = add_image_url(links, settings["langcode"]) 25 | return (links, text, settings) 26 | 27 | image_url_cache = {} 28 | 29 | def add_image_url(links, langcode): 30 | urls = [link["url"].replace(".wikipedia.org/", ".m.wikipedia.org/") \ 31 | for link in links] 32 | 33 | print "Getting images for %d Wikipedia pages" % len(urls) 34 | get_image_urls(urls) 35 | for link, url in zip(links, urls): 36 | if url in image_url_cache: 37 | print link["title"], "->", image_url_cache[url] 38 | link["image_url"] = image_url_cache[url] 39 | 40 | return links 41 | 42 | IMG_DIMENSION_PATTERN = '' 43 | IMG_URL_PATTERN = '' 44 | 45 | BLACKLISTED_IMAGE_URLS = ('http://upload.wikimedia.org/wikipedia/en/f/f4/Ambox_content.png', 46 | 'http://upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png', 47 | 'http://upload.wikimedia.org/wikipedia/en/thumb/f/f2/Edit-clear.svg/40px-Edit-clear.svg.png', 48 | 'http://upload.wikimedia.org/wikipedia/commons/thumb/f/f8/Wiktionary-logo-en.svg/37px-Wiktionary-logo-en.svg.png', 49 | 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/a4/Text_document_with_red_question_mark.svg/40px-Text_document_with_red_question_mark.svg.png') 50 | 51 | def convert_image_url(image): 52 | if image.startswith("//"): 53 | image = "http:" + image 54 | elif image.startswith("/"): 55 | image = "http://" + url.split("/")[2] + image 56 | return image 57 | 58 | def get_image_urls(urls, num_of_threads=8, min_dimension=36): 59 | def worker(): 60 | while True: 61 | try: 62 | url = queue.get_nowait() 63 | try: 64 | page = urllib2.urlopen(url, timeout=1).read() 65 | except: 66 | page = "" 67 | images = re.findall("", page) 68 | 69 | # Filter Wikipedia images 70 | images = [img for img in images if " id=" not in img \ 71 | and " title=" not in img] 72 | image = None 73 | for img in images: 74 | match = re.match(IMG_DIMENSION_PATTERN, img) 75 | if match == None: continue 76 | dimension = max([int(value) for value in match.groups()]) 77 | if dimension >= min_dimension: # Do not use fallback: or image == None: 78 | match = re.match(IMG_URL_PATTERN, img) 79 | if match != None and len(match.groups()) > 0: 80 | image_url = convert_image_url(match.groups()[0]) 81 | if 
image_url in BLACKLISTED_IMAGE_URLS: continue 82 | image = image_url 83 | # if dimension >= min_dimension: 84 | break 85 | 86 | image_url_cache[url] = image 87 | 88 | queue.task_done() 89 | except Empty: 90 | break 91 | 92 | queue = Queue() 93 | for url in urls: 94 | queue.put(url) 95 | 96 | for i in range(min(num_of_threads, len(urls))): 97 | t = Thread(target=worker) 98 | t.daemon = True 99 | t.start() 100 | 101 | queue.join() 102 | -------------------------------------------------------------------------------- /semanticizer/processors/multiple.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import collections 15 | 16 | from .core import LinksProcessor 17 | 18 | 19 | class MultipleEntityFeaturesProcessor(LinksProcessor): 20 | 21 | def process(self, links, text, settings): 22 | self.link_dict = {} 23 | self.labels = [] 24 | 25 | if 'multi' not in settings: 26 | return (links, text, settings) 27 | 28 | # First run through links to fill dict 29 | for link in links: 30 | self.link_dict.setdefault(link['id'], []) \ 31 | .append([link['label'], link['senseProbability'], 32 | link['priorProbability'], link['linkProbability']]) 33 | self.labels.append(link['label']) 34 | link['features'] = {} 35 | 36 | # Second run to calculate features 37 | for link in links: 38 | if 'tier1' in settings['multi']: 39 | features = self.FEATURE_tier_one_overlap(link, self.labels) 40 | link['features'].update(features) 41 | if 'outlinks' in settings['multi']: 42 | features = self.FEATURE_linked_entity_overlap(link['label'], 43 | link['OutLinks'], 44 | 'outlinks') 45 | link['features'].update(features) 46 | if 'inlinks' in settings['multi']: 47 | features = self.FEATURE_linked_entity_overlap(link['label'], 48 | link['InLinks'], 49 | 'inlinks') 50 | link['features'].update(features) 51 | 52 | return (links, text, settings) 53 | 54 | def FEATURE_tier_one_overlap(self, link, labels): 55 | """ 56 | Perform simple 'list intersect' 57 | To find matching labels of candidate 58 | """ 59 | 60 | tier_one = [link['title']] + [label['title'] for label in \ 61 | link['Labels']] 62 | tier_one = [(anchor, link['id']) for anchor in \ 63 | list((collections.Counter(tier_one) & \ 64 | collections.Counter(self.labels)).elements())] 65 | 66 | return_list = [] 67 | for l, i in tier_one: 68 | 69 | if i in self.link_dict: 70 | for label, senseProb, priorProb, cmns in self.link_dict[i]: 71 | if label == anchor: 72 | return_list.append((l, i, senseProb, priorProb, cmns)) 73 | if return_list: 74 | return self.calculate_features(return_list, 1, 'tier_one') 75 | 76 | else: 77 | return {} 78 | 79 | def FEATURE_linked_entity_overlap(self, current_label, linked_entities, 80 | features): 81 | """ 82 | IN: json of {in,out}-link_ids 83 | Check if they occur in doc dict 84 | if they do, see if 
they are referred to 85 | by a different label. 86 | """ 87 | 88 | # Find stuff 89 | result_list = [] 90 | for link in linked_entities: 91 | if str(link['id']) in self.link_dict: 92 | link_label = self.link_dict[str(link['id'])] 93 | for sub_link in link_label: 94 | if current_label != sub_link[0]: 95 | result_list.append((sub_link[0], link['id'], 96 | sub_link[1], sub_link[2], 97 | sub_link[3])) 98 | # Calculate features 99 | if result_list: 100 | return self.calculate_features(result_list, len(linked_entities), 101 | features) 102 | else: 103 | return {} 104 | 105 | def calculate_features(self, results, max_entities, features): 106 | """ 107 | Given result list in format: 108 | label, wiki_id, senseProb, priorProb, commonness 109 | 'Unzip' lists and create feature vectors. 110 | """ 111 | 112 | label_list, id_list, sense_list, prior_list, cmns_list = \ 113 | ([l for l, w, s, p, c in results], 114 | [w for l, w, s, p, c in results], 115 | [s for l, w, s, p, c in results], 116 | [p for l, w, s, p, c in results], 117 | [c for l, w, s, p, c in results]) 118 | 119 | if features == 'outlinks': 120 | PREFIX = 'ME_OUT_' 121 | elif features == 'inlinks': 122 | PREFIX = 'ME_IN_' 123 | elif features == 'tier_one': 124 | PREFIX = 'ME_T1_' 125 | 126 | return {PREFIX + 'label_overlap': len(label_list), 127 | PREFIX + 'label_unique': len(set(label_list)), 128 | PREFIX + 'entity_overlap': len(id_list), 129 | PREFIX + 'entity_unique': len(set(id_list)), 130 | PREFIX + 'entity_proportion': float(len(set(id_list))) / \ 131 | float(max_entities), 132 | PREFIX + 'sense_prob_sum': sum(sense_list), 133 | PREFIX + 'prior_prob_sum': sum(prior_list), 134 | PREFIX + 'cmns_sum': sum(cmns_list)} 135 | -------------------------------------------------------------------------------- /semanticizer/processors/semanticize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
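# [Editor's note] Illustrative sketch only -- not part of the original module.
# The Semanticizer defined below scores every candidate sense of an n-gram with
# three probabilities derived from the WPM count fields (cntlinkdoc, cnttextdoc,
# cntlinkocc); the real code additionally guards against zero counts. A minimal
# standalone version of those formulas, using hypothetical counts, is:

def _link_scores(entity, sense):
    """entity and sense are dicts carrying the WPM count fields used below."""
    # fraction of documents containing the label in which it appears as a link
    link_probability = float(entity['cntlinkdoc']) / entity['cnttextdoc']
    # fraction of documents containing the label that link to this sense
    sense_probability = float(sense['cntlinkdoc']) / entity['cnttextdoc']
    # fraction of the label's link occurrences that point to this sense
    prior_probability = float(sense['cntlinkocc']) / entity['cntlinkocc']
    return link_probability, sense_probability, prior_probability

# _link_scores({'cntlinkdoc': 10, 'cnttextdoc': 40, 'cntlinkocc': 12},
#              {'cntlinkdoc': 8, 'cntlinkocc': 9})   # -> (0.25, 0.2, 0.75)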
13 | 14 | from nltk.util import ngrams as nltk_ngrams 15 | import re 16 | import urllib 17 | 18 | from ..wpm import utils as wpmutil 19 | from ..wpm.data import wpm_dumps 20 | 21 | tokenize = re.compile(r'\w+(?:[.,\']\w+)*|[^\w\s]+', 22 | re.UNICODE | re.MULTILINE | re.DOTALL).findall 23 | 24 | 25 | class Semanticizer: 26 | def __init__(self, language_code, sense_probability_threshold, 27 | max_ngram_length=None, debug=False): 28 | """constructor""" 29 | self.language_code = language_code 30 | self.sense_probability_threshold = sense_probability_threshold 31 | self.wikipedia_url_template = 'http://%s.wikipedia.org/wiki/%s' 32 | self.wpm = wpm_dumps[language_code] 33 | self.title_page = {} # This needs to be removed 34 | self.max_ngram_length = max_ngram_length 35 | self.debug = debug 36 | 37 | def semanticize(self, sentence, normalize_dash=True, 38 | normalize_accents=True, normalize_lower=False, 39 | translations=True, counts=False, 40 | largest_matching=False, 41 | sense_probability_threshold=None): 42 | if sense_probability_threshold == None: 43 | sense_probability_threshold = self.sense_probability_threshold 44 | result = {"links": []} 45 | ngrams = set() 46 | token_lists = [tokenize(sentence), 47 | tokenize(sentence.replace('-', ' ')), 48 | tokenize(sentence.replace('.', ' ')), 49 | tokenize(sentence.replace('.', ''))] 50 | 51 | # get all ngrams for this sentence, limit to max_ngram_length 52 | # if applicable 53 | for token_list in token_lists: 54 | max_len = len(token_list) + 1 55 | if self.max_ngram_length is not None: 56 | max_len = min(max_len, self.max_ngram_length) 57 | 58 | for n in range(1, max_len): 59 | for ngram in nltk_ngrams(token_list, n): 60 | ngrams.add(' '.join(ngram)) 61 | 62 | normal_ngrams = map(wpmutil.normalize, ngrams) 63 | exist = self.wpm.normalized_entities_exist(normal_ngrams) 64 | 65 | for i, (ngram, normal_ngram) in enumerate(zip(ngrams, normal_ngrams)): 66 | if exist[i]: 67 | normalized_ngram = wpmutil.normalize(ngram, normalize_dash, 68 | normalize_accents, 69 | normalize_lower) 70 | anchors = self.wpm.get_all_entities(normal_ngram) 71 | for anchor in anchors: 72 | normalized_anchor = wpmutil.normalize(anchor, normalize_dash, 73 | normalize_accents, 74 | normalize_lower) 75 | if normalized_ngram == normalized_anchor: 76 | if self.debug and not self.wpm.entity_exists(anchor): 77 | raise LookupError("Data corrupted, cannot " 78 | + "find %s in the database" \ 79 | % anchor) 80 | entity = self.wpm.get_entity_data(anchor) 81 | senses = [(sense, self.wpm.get_sense_data(anchor, str(sense))) for sense in entity['senses']] 82 | if largest_matching: senses = sorted(senses, key=lambda (_, d): -d['cntlinkdoc'])[:1] 83 | for sense, sense_data in senses: 84 | if sense_data: 85 | if entity['cnttextocc'] == 0: 86 | link_probability = 0 87 | sense_probability = 0 88 | else: 89 | link_probability = float(entity['cntlinkdoc']) / entity['cnttextdoc'] 90 | sense_probability = float(sense_data['cntlinkdoc']) / entity['cnttextdoc'] 91 | if sense_probability > sense_probability_threshold: 92 | title = unicode(self.wpm.get_item_title(str(sense))) 93 | url = self.wikipedia_url_template \ 94 | % (self.language_code, 95 | urllib.quote(title.encode('utf-8'))) 96 | if entity['cntlinkocc'] == 0: 97 | prior_probability = 0 98 | else: 99 | prior_probability = float(sense_data['cntlinkocc']) / entity['cntlinkocc'] 100 | link = { 101 | "label": anchor, 102 | "text": ngram, 103 | "title": title, 104 | "id": sense, 105 | "url": url, 106 | "linkProbability": link_probability, 107 | 
"senseProbability": sense_probability, 108 | "priorProbability": prior_probability 109 | } 110 | if translations: 111 | link["translations"] = {self.language_code: 112 | {"title": title, 113 | "url": url}} 114 | if self.wpm.sense_has_trnsl(str(sense)): 115 | for lang in self.wpm.get_trnsl_langs(str(sense)): 116 | trnsl = self.wpm.get_sense_trnsl(str(sense), lang) 117 | link["translations"][lang] = { 118 | 'title': unicode(trnsl), 119 | 'url': self.wikipedia_url_template % (lang, urllib.quote(unicode(trnsl).encode('utf-8'))) 120 | } 121 | if counts: 122 | link["occCount"] = entity['cnttextocc'] 123 | link["docCount"] = entity['cnttextdoc'] 124 | link["linkOccCount"] = entity['cntlinkocc'] 125 | link["linkDocCount"] = entity['cntlinkdoc'] 126 | link["senseOccCount"] = int(sense_data['cntlinkocc']) 127 | link["senseDocCount"] = int(sense_data['cntlinkdoc']) 128 | link['fromTitle'] = sense_data['from_title'] 129 | link['fromRedirect'] = sense_data['from_redir'] 130 | result["links"].append(link) 131 | 132 | if largest_matching: 133 | available_text = wpmutil.normalize(sentence, normalize_dash, normalize_accents, normalize_lower) 134 | for link in sorted(result["links"], key=lambda link: -link["priorProbability"]/2-len(link["label"])): 135 | normalized_label = wpmutil.normalize(link["label"], normalize_dash, normalize_accents, normalize_lower) 136 | if normalized_label in available_text: 137 | available_text = available_text.replace(normalized_label, "") 138 | else: result["links"].remove(link) 139 | return result 140 | -------------------------------------------------------------------------------- /semanticizer/processors/semanticizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | """ 15 | The Processor wrapping Semanticizer 16 | """ 17 | from nltk.tokenize.punkt import PunktSentenceTokenizer 18 | 19 | from .core import LinksProcessor 20 | from .semanticize import Semanticizer 21 | 22 | 23 | class SemanticizeProcessor(LinksProcessor): 24 | """Processor handling the semanticizing""" 25 | 26 | def __init__(self, debug=False): 27 | """Set the class variables""" 28 | self.langcodes = [] 29 | self.semanticizers = {} 30 | self.debug = debug 31 | 32 | def load_languages(self, langcodes, max_ngram_length=None): 33 | """Save the languages and load the semanticizer""" 34 | self.langcodes = langcodes 35 | for langcode in langcodes: 36 | self.semanticizers[langcode] = Semanticizer(langcode, None, 37 | max_ngram_length, 38 | self.debug) 39 | 40 | def preprocess(self, links, text, settings): 41 | """ 42 | Semanticize the given text and return the links, text, and 43 | settings. 
44 | """ 45 | links = [] 46 | if "langcode" in settings and settings["langcode"] in self.semanticizers: 47 | translations = "translations" in settings 48 | normalize_dash = not("normalize" in settings and \ 49 | not "dash" in settings["normalize"]) 50 | normalize_accents = not("normalize" in settings and \ 51 | not "accents" in settings["normalize"]) 52 | normalize_lower = "normalize" in settings and \ 53 | "lower" in settings["normalize"] 54 | lower_confidence_bound = "lowerConfidenceBound" in settings 55 | largest_matching = "largestMatching" in settings 56 | 57 | if "split_sentences" in settings: 58 | sentences = PunktSentenceTokenizer().tokenize(text) 59 | else: 60 | sentences = [text] 61 | 62 | sem = self.semanticizers[settings["langcode"]] 63 | for sentence in sentences: 64 | results = sem.semanticize(sentence, counts=True, 65 | normalize_dash=normalize_dash, 66 | normalize_accents=normalize_accents, 67 | normalize_lower=normalize_lower, 68 | largest_matching=largest_matching, 69 | lower_confidence_bound=lower_confidence_bound, 70 | translations=translations, 71 | sense_probability_threshold=-1) 72 | 73 | links.extend(results["links"]) 74 | 75 | return (links, text, settings) 76 | 77 | def postprocess(self, links, text, settings): 78 | """ 79 | Remove counts from links 80 | @todo: why do this here? In Semanticizer.semanticize there's already \ 81 | a check being done on whether counts should be included. 82 | """ 83 | if not "counts" in settings: 84 | for link in links: 85 | for key in link.keys(): 86 | if key.endswith("Count"): 87 | del link[key] 88 | 89 | return (links, text, settings) 90 | 91 | def inspect(self): 92 | """Return the loaded languages""" 93 | return {self.__class__.__name__: self.langcodes} 94 | -------------------------------------------------------------------------------- /semanticizer/processors/stringUtils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 4 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 5 | # General Public License as published by the Free Software Foundation, either 6 | # version 3 of the License, or (at your option) any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, but WITHOUT 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 11 | # for more details. 12 | # 13 | # You should have received a copy of the GNU Lesser General Public License 14 | # along with this program. If not, see . 15 | 16 | import re 17 | 18 | # Kan ook met de hand...:(\A|\s|\'|"|\.|\,|:|;|!|\?) 19 | # (?=(\s|\'|"|\.|\,|:|;|!|\?|\'s|\Z) 20 | # reNonWordChars = re.compile('(?u)\W+', re.UNICODE) 21 | 22 | # We took the reg exp from scikit-learn: 23 | # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py 24 | reTokenPattern = re.compile(r"(?u)\b\w\w+\b", re.UNICODE) 25 | 26 | def ngramToPattern(sNgram): 27 | return ngramsToPattern([sNgram]) 28 | 29 | def ngramsToPattern(aNgrams): 30 | #import sys 31 | #print >> sys.stderr, "n-grams: '%s'" % aNgrams 32 | try: 33 | # So this reads, inside out: 34 | # Replace all white space by a single space and re.escape that. 
35 | # Replace the (by now escaped) spaces by '\s+'s and join the different 36 | # n-grams by pipes ('|') 37 | # 38 | sNgrams = '|'.join([re.escape(re.sub('\s+', ' ', x)).replace('\\ ', '\s+') for x in aNgrams]) 39 | reNgrams = re.compile('((\A|\W)(' + sNgrams + ')(?=\W|\Z))', 40 | flags=re.UNICODE|re.IGNORECASE) 41 | except OverflowError: 42 | # Some articles have such a ridiculous number of inlink anchors that 43 | # the regular expression gets too big. 44 | # This doesn't happen if we make it a bit stricter.... 45 | # So, if that happens we make the same expression but we do not replace 46 | # the spaces by \s+'s 47 | sNgrams = '|'.join([re.escape(re.sub('\s+', ' ', x)) for x in aNgrams]) 48 | reNgrams = re.compile('((\A|\W)(' + sNgrams + ')(?=\W|\Z))', 49 | flags=re.UNICODE|re.IGNORECASE) 50 | return reNgrams 51 | 52 | # For one word 53 | def findNgramInText(sNgram, sText): 54 | return findNgramsInText([sNgram], sText) 55 | 56 | # For several words 57 | def findNgramsInText(aNgrams, sText): 58 | # A check beforehand because an empty array will lead to a pattern that 59 | # matches empty lines, double spaces, etc.... 60 | if len(aNgrams) == 0: 61 | return [] 62 | return re.findall(ngramsToPattern(aNgrams), sText) 63 | 64 | if __name__ == "__main__": 65 | sText = u"aap noot mies\nwim jüf duif “Noot” roos ühalloü" 66 | 67 | aMatches = findNgramInText(u'aap', sText) 68 | print "%s" % aMatches 69 | 70 | aMatches = findNgramInText(u'hallo', sText) 71 | print "%s" % aMatches 72 | 73 | aMatches = findNgramsInText([u'mies wim', u'noot'], sText) 74 | print "%s" % aMatches 75 | -------------------------------------------------------------------------------- /semanticizer/processors/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
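# [Editor's note] Illustrative sketch only -- not part of the original module.
# The stringUtils helpers above turn a list of n-grams into a single
# alternation pattern (each n-gram escaped, whitespace made flexible, the whole
# match anchored on non-word boundaries) and match it against a text. A
# condensed, runnable version of that idea, using made-up input strings:

import re

def ngrams_to_pattern(ngrams):
    # escape each n-gram, let any run of whitespace match '\s+', join with '|'
    alternation = '|'.join(
        re.escape(re.sub(r'\s+', ' ', ng)).replace('\\ ', r'\s+') for ng in ngrams)
    # require a non-word character (or string boundary) on both sides of a hit
    return re.compile(r'((\A|\W)(' + alternation + r')(?=\W|\Z))',
                      flags=re.UNICODE | re.IGNORECASE)

# ngrams_to_pattern([u'mies wim', u'noot']).findall(u'aap noot mies wim zus')
# returns one (match, boundary, ngram) tuple per hit, roughly:
# [(u' noot', u' ', u'noot'), (u' mies wim', u' ', u'mies wim')]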
13 | 14 | import os, yaml 15 | import sklearn.metrics 16 | import sklearn.externals.joblib 17 | 18 | def compute_metrics(labels, scores, threshold=0.5): 19 | metrics = {} 20 | # Sort according to score 21 | scores, labels = zip(*sorted(zip(scores, labels))) 22 | predictions = [score >= threshold for score in scores] 23 | # Classification metrics 24 | metrics["precision"], metrics["recall"], metrics["f1"], support = \ 25 | sklearn.metrics.precision_recall_fscore_support(labels, predictions, \ 26 | average="weighted") 27 | metrics["accuracy"] = sklearn.metrics.accuracy_score(labels, predictions) 28 | metrics["zeroOneLoss"] = sklearn.metrics.zero_one_loss(labels, predictions) 29 | # Rank-based metrics 30 | metrics["averagePrecision"] = \ 31 | sklearn.metrics.average_precision_score(labels, scores) 32 | metrics["ROC AUC"] = sklearn.metrics.roc_auc_score(labels, scores) 33 | # R-precision 34 | r_labels = labels[-support:] 35 | r_predictions = [True for label in r_labels] 36 | metrics["rPrecision"] = \ 37 | sklearn.metrics.precision_score(r_labels, r_predictions) 38 | return metrics 39 | 40 | class ModelStore(): 41 | def __init__(self, model_dir): 42 | self.model_dir = model_dir 43 | self.model_cache = {} 44 | 45 | def load_model(self, modelname): 46 | if modelname.endswith(".pkl"): 47 | return self.load_model(modelname[:-4]) 48 | 49 | if modelname in self.model_cache: 50 | return self.model_cache[modelname] 51 | 52 | modelfile = os.path.join(self.model_dir, modelname) 53 | model = sklearn.externals.joblib.load(modelfile + ".pkl") 54 | 55 | description = {"name": modelname, "source": modelfile + ".pkl"} 56 | if os.path.exists(modelfile + ".yaml"): 57 | description.update(yaml.load(file(modelfile + ".yaml"))) 58 | 59 | if os.path.exists(modelfile + ".preprocessor.pkl"): 60 | preprocessor = sklearn.externals.joblib.load(modelfile + \ 61 | ".preprocessor.pkl") 62 | else: 63 | preprocessor = None 64 | 65 | self.model_cache[modelname] = (model, description, preprocessor) 66 | return (model, description, preprocessor) 67 | 68 | def save_model(self, model, modelname, description=None, preprocessor=None): 69 | if modelname.endswith(".pkl"): 70 | modelname = modelname[:-4] 71 | 72 | modelfile = os.path.join(self.model_dir, modelname) 73 | sklearn.externals.joblib.dump(model, modelfile + ".pkl") 74 | 75 | if preprocessor: 76 | sklearn.externals.joblib.dump(preprocessor, \ 77 | modelfile + ".preprocessor.pkl") 78 | 79 | if description != None: 80 | with open(modelfile + ".yaml", 'w') as out: 81 | out.write(yaml.dump(description)) 82 | else: 83 | description = {} 84 | 85 | description.update({"name": modelname, "source": modelfile + ".pkl"}) 86 | self.model_cache[modelname] = (model, description, preprocessor) 87 | 88 | def _convert_dict(self, data, skip=[]): 89 | """Helper function that convert the values of dictionary to int/float. 90 | Optionally you can skip a list of values.""" 91 | converted_data = {} 92 | for k,v in data.iteritems(): 93 | if k in skip: continue 94 | try: 95 | converted_data[k] = int("".join(v)) 96 | except ValueError: 97 | try: 98 | converted_data[k] = float("".join(v)) 99 | except ValueError: 100 | converted_data[k] = v 101 | return converted_data 102 | 103 | def create_model(self, settings, skip_settings=[]): 104 | if not "classifier" in settings: 105 | raise ValueError("Expecting a classifier in settings.") 106 | if not "." 
in settings["classifier"]: 107 | raise ValueError("Expecting a package in classifier settings.") 108 | 109 | classifier = settings["classifier"].split(".")[-1] 110 | package = ".".join(settings["classifier"].split(".")[:-1]) 111 | 112 | preprocessor_settings = dict([(key, value) for key, value \ 113 | in settings.iteritems() \ 114 | if key.startswith("preprocessor.")]) 115 | 116 | skip_settings.extend(["classifier", "preprocessor"]) 117 | skip_settings.extend(preprocessor_settings.keys()) 118 | arguments = self._convert_dict(settings, skip_settings) 119 | model = self._create_instance(package, classifier, **arguments) 120 | 121 | if "preprocessor" in settings: 122 | if not "." in settings["preprocessor"]: 123 | raise ValueError("Expecting a package in preprocessor settings.") 124 | 125 | preprocessor_classname = settings["preprocessor"].split(".")[-1] 126 | preprocessor_package = ".".join(settings["preprocessor"].split(".")[:-1]) 127 | 128 | preprocessor_settings = dict([(".".join(key.split(".")[1:]), value)\ 129 | for key, value \ 130 | in preprocessor_settings.iteritems()]) 131 | preprocessor_arguments = self._convert_dict(preprocessor_settings) 132 | preprocessor = self._create_instance(preprocessor_package, \ 133 | preprocessor_classname, \ 134 | **preprocessor_arguments) 135 | else: 136 | preprocessor = None 137 | 138 | return model, preprocessor 139 | 140 | def _create_instance(self, package, classname, *args, **kwargs): 141 | # Import package module 142 | package_module = __import__(package, globals(), locals(), \ 143 | [str(classname)], -1) 144 | # Class instance 145 | package_class = getattr(package_module, classname) 146 | 147 | instance = package_class(*args, **kwargs) 148 | 149 | return instance 150 | -------------------------------------------------------------------------------- /semanticizer/procpipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import time 15 | import logging 16 | 17 | from .processors.core import SettingsProcessor, FilterProcessor 18 | from .processors.semanticizer import SemanticizeProcessor 19 | from .processors.features import FeaturesProcessor, ArticleFeaturesProcessor, \ 20 | ContextFeaturesProcessor 21 | from .processors.multiple import MultipleEntityFeaturesProcessor 22 | from .processors.external import ArticlesProcessor, StatisticsProcessor 23 | from .processors.learning import LearningProcessor 24 | from .processors.image import AddImageProcessor 25 | 26 | from .config import config_get 27 | 28 | 29 | def build(langcodes, use_features=False, debug=False): 30 | """ 31 | Initialize the pipeline. 
32 | 33 | @param wikipedia_ids: A list with all loaded wikipedia ids 34 | @return: The pipeline 35 | @todo: See todo at _load_languages 36 | """ 37 | logging.getLogger().info("Initializing pipeline") 38 | pipeline = [] 39 | if 'max_ngram_length' in config_get('semanticize', {}): 40 | max_ngram_length = config_get(('semanticize', 'max_ngram_length')) 41 | else: 42 | max_ngram_length = None 43 | semanticize_processor = _load_semanticize_processor(langcodes, 44 | max_ngram_length, 45 | debug=debug) 46 | settings = config_get("settings", {}) 47 | pipeline.append(("Settings", SettingsProcessor(settings))) 48 | pipeline.append(("Semanticize", semanticize_processor)) 49 | pipeline.append(("Filter", FilterProcessor())) 50 | if use_features: 51 | _load_features(pipeline, langcodes) 52 | else: 53 | _load_articles(pipeline, langcodes) 54 | pipeline.append(("AddImage", AddImageProcessor())) 55 | logging.getLogger().info("Done initializing pipeline") 56 | return pipeline 57 | 58 | 59 | def _load_semanticize_processor(langcodes, max_ngram_length=None, debug=False): 60 | """ 61 | Load the Semanticizer. 62 | 63 | @param wikipedia_ids: A list with all loaded wikipedia ids 64 | @return: a configured instance of SemanticizeProcessor 65 | @see: processors.SemanticizeProcessor 66 | """ 67 | logging.getLogger().info("Loading semanticizer") 68 | semanticize_processor = SemanticizeProcessor(debug=debug) 69 | start = time.time() 70 | logging.getLogger().info("Loading semanticizers for langcode(s) " 71 | + ", ".join(langcodes)) 72 | semanticize_processor.load_languages(langcodes, max_ngram_length) 73 | logging.getLogger().info("Loading semanticizers took %.2f seconds." \ 74 | % (time.time() - start)) 75 | logging.getLogger().info("Done loading semanticizer") 76 | return semanticize_processor 77 | 78 | 79 | def _load_features(pipeline, langcodes): 80 | """ 81 | Load all features into the pipeline 82 | 83 | @param pipeline: A reference to the pipeline 84 | @param semanticize_processor: A reference to the semanticize processor 85 | @param wikipedia_ids: Wikipedia ids & data 86 | """ 87 | logging.getLogger().info("Loading features") 88 | start = time.time() 89 | pipeline.append(("Features", 90 | FeaturesProcessor(langcodes))) 91 | _load_articles(pipeline, langcodes) 92 | pipeline.append(("Statistics", 93 | StatisticsProcessor(langcodes, 94 | config_get(('wpm', 'threads'), 1), 95 | config_get(('misc', 'tempdir'))))) 96 | pipeline.append(("ArticleFeatures", ArticleFeaturesProcessor())) 97 | pipeline.append(("MultipleFeatures", MultipleEntityFeaturesProcessor())) 98 | pipeline.append(("ContextFeatures", ContextFeaturesProcessor())) 99 | logging.getLogger().info("Loading features took %.2f seconds." \ 100 | % (time.time() - start)) 101 | model_dir = config_get(('learning', 'model_dir'), \ 102 | config_get(('misc', 'tempdir'))) 103 | pipeline.append(("Learning", LearningProcessor(model_dir))) 104 | logging.getLogger().info("Done loading features") 105 | 106 | def _load_articles(pipeline, langcodes): 107 | pipeline.append(("Articles", 108 | ArticlesProcessor(langcodes, config_get(('misc', 'tempdir'))))) 109 | -------------------------------------------------------------------------------- /semanticizer/server/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. 
This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | # Can do without ujson and simplejson, but speeds up considerably. 15 | try: 16 | import ujson 17 | except ImportError: 18 | pass 19 | try: 20 | import simplejson as json 21 | except ImportError: 22 | import json 23 | 24 | import re 25 | from flask import Flask, Response, request, abort 26 | 27 | from uuid import uuid4 28 | 29 | class Server(object): 30 | """ 31 | The HTTP server that will serve the complete namespace 32 | """ 33 | 34 | APPLICATION_JSON="application/json" 35 | 36 | def __init__(self): 37 | """ 38 | Initialize the server. The constructor creates the initial Flask server 39 | object. 40 | """ 41 | self.app = Flask(__name__) 42 | 43 | def set_debug(self, debug=None, debug_log_format=None): 44 | """ 45 | Set Flask server debug parameters. 46 | 47 | @param debug: Enable or disable debug mode 48 | @param debug_log_format: Set the logformat string for the server 49 | """ 50 | if not debug is None: 51 | self.app.debug = debug 52 | if not debug_log_format is None: 53 | self.app.debug_log_format = debug_log_format 54 | 55 | def _json_dumps(self, o, pretty=False): 56 | """ 57 | Util function to create json dumps based on an object. 58 | 59 | @param o: Object to transform 60 | @param pretty: Whether or not to prettify the JSON 61 | @return: The JSON string 62 | """ 63 | if not pretty and "ujson" in locals(): 64 | return ujson.dumps(o) 65 | elif not pretty: 66 | return json.dumps(o) 67 | else: 68 | return json.dumps(o, indent=4) 69 | 70 | def _get_text_from_request(self): 71 | """ 72 | Util function to get the param called "text" from the current request 73 | 74 | @return: the value of "text" 75 | """ 76 | content_type = request.headers['Content-Type'] if 'Content-Type' in request.headers else "text/plain" 77 | if request.method == "POST": 78 | if content_type == 'application/x-www-form-urlencoded': 79 | return request.form['text'] 80 | elif content_type == 'text/plain': 81 | return request.data 82 | else: 83 | abort(Response("Unsupported Content Type, use: text/plain\n", 84 | status=415)) 85 | elif "text" in request.args: 86 | return request.args["text"] 87 | else: 88 | abort(Response("No text provided, use: POST or GET with attribute \ 89 | 'text'\n", status=400)) 90 | 91 | def _get_values_from_request(self, values=None): 92 | """ 93 | Util function to get the values from the current request 94 | 95 | @param values: initial dictionary of values 96 | @return: a dictionary of values 97 | """ 98 | if not values: 99 | values = {} 100 | for key, value in request.values.iteritems(): 101 | assert key not in values 102 | values[key] = value 103 | 104 | return values 105 | 106 | def setup_route_semanticize(self, langcodes): 107 | """ 108 | Setup the /semanticize/ namespace. 109 | 110 | @param langcodes: The languages supported for semanticizing. 
111 | """ 112 | self.langcodes = langcodes 113 | self.app.add_url_rule("/semanticize/", "_semanticize", 114 | self._semanticize_handler, methods=["GET", "POST"]) 115 | self.app.add_url_rule("/semanticize", "_semanticize_usage", 116 | self._semanticize_usage, 117 | methods=["GET", "POST"]) 118 | 119 | def setup_route_inspect(self): 120 | """ 121 | Setup the /inspect namespace. 122 | 123 | @param pipeline: The pipeline of processors to inspect. 124 | """ 125 | self.app.add_url_rule("/inspect", "_inspect", 126 | self._inspect, methods=["GET"]) 127 | 128 | def setup_route_feedback(self): 129 | """ 130 | Setup the /feedback namespace. 131 | 132 | @param pipeline: The pipeline of processors to feed back to. 133 | """ 134 | hex = "[a-fA-F0-9]" 135 | pattern = "hex{8}-hex{4}-hex{4}-hex{4}-hex{12}".replace("hex", hex) 136 | self.request_id_pattern = re.compile(pattern) 137 | self.app.add_url_rule("/feedback/", "_feedback", 138 | self._feedback, methods=["GET", "POST"]) 139 | self.app.add_url_rule("/evaluate/", "_evaluate", 140 | self._evaluate, methods=["GET", "POST"]) 141 | self.app.add_url_rule("/evaluate", "_evaluate", 142 | self._evaluate, methods=["GET", "POST"]) 143 | self.app.add_url_rule("/learn/", "_learn", 144 | self._learn, methods=["GET", "POST"]) 145 | 146 | def setup_all_routes(self, pipeline, langcodes): 147 | """ 148 | Convenience function to start all namespaces at once. 149 | 150 | @param pipeline: The pipeline of processors 151 | """ 152 | self.pipeline = pipeline 153 | self.setup_route_semanticize(langcodes) 154 | self.setup_route_inspect() 155 | self.setup_route_feedback() 156 | 157 | def start(self, host, port, use_reloader=False): 158 | """ 159 | Wrapper for the Flask run() function. Will start the HTTP server with 160 | all initialized namespaces. 161 | 162 | @param host: The hostname to bind on 163 | @param port: The port to bind on 164 | """ 165 | print "Server started on %s:%d" % (host, port) 166 | self.app.run(host, port, self.app.debug, use_reloader=use_reloader) 167 | 168 | def _semanticize_usage(self): 169 | """ 170 | The function handling the /semanticize namespace. Returns the available 171 | languages. 172 | 173 | @return: The body of the response, in this case a json formatted list \ 174 | of links and their relevance 175 | @see: _semanticize 176 | """ 177 | 178 | json = self._json_dumps({"languages": self.langcodes}, 179 | "pretty" in request.args) 180 | 181 | return Response(json, mimetype=Server.APPLICATION_JSON) 182 | 183 | def _semanticize_handler(self, langcode): 184 | """ 185 | The function handling the /semanticize/ namespace. It uses 186 | the chain-of-command pattern to run all processors, using the 187 | corresponding preprocess, process, and postprocess steps. 188 | 189 | @param langcode: The language to use in the semanticizing 190 | @return: The body of the response, in this case a json formatted list \ 191 | of links and their relevance 192 | """ 193 | self.app.logger.debug("Semanticizing: start") 194 | text = self._get_text_from_request() 195 | self.app.logger.debug("Semanticizing text: " + text) 196 | 197 | settings = self._get_values_from_request({"langcode": langcode}) 198 | settings["request_id"] = str(uuid4()) 199 | 200 | sem_result = self._semanticize(langcode, settings, text) 201 | sem_result["request_id"] = settings["request_id"] 202 | json = self._json_dumps(sem_result, "pretty" in settings) 203 | 204 | self.app.logger.debug("Semanticizing: Created %d characters of JSON " 205 | "for request id %s." 
\ 206 | % (len(json), sem_result["request_id"])) 207 | return Response(json, mimetype=Server.APPLICATION_JSON) 208 | 209 | def _semanticize(self, langcode, settings, text): 210 | """ 211 | Method that performs the actual semantization. 212 | """ 213 | links = [] 214 | 215 | for function in ("preprocess", "process", "postprocess"): 216 | for step, processor in self.pipeline: 217 | self.app.logger.debug("Semanticizing: %s for step %s" \ 218 | % (function, step)) 219 | (links, text, settings) = getattr(processor, function)(links, 220 | text, 221 | settings 222 | ) 223 | self.app.logger.debug("Semanticizing: %s pipeline with %d steps \ 224 | done" % (function, len(self.pipeline))) 225 | 226 | result = {"links": links, "text": text} 227 | 228 | return result 229 | 230 | def _inspect(self): 231 | """ 232 | Function that handles the /inspect namespace. Will print the settings 233 | used by the different processors. 234 | 235 | @return: The body of the response, in this case a json formatted \ 236 | string containing all found settings. 237 | """ 238 | inspect = {} 239 | for _, processor in self.pipeline: 240 | inspect.update(processor.inspect()) 241 | return Response(self._json_dumps(inspect, pretty=True), 242 | mimetype=Server.APPLICATION_JSON) 243 | 244 | def _feedback(self, context_path): 245 | """ 246 | Function that handles the /feedback namespace. Will process the 247 | feedback in supported processors in the pipeline. 248 | """ 249 | context_parts = context_path.split("/") 250 | if len(context_parts) == 0: 251 | raise ValueError("No context for feedback is provided!") 252 | 253 | request_id_match = self.request_id_pattern.match(context_parts[-1]) 254 | if request_id_match: 255 | request_id = request_id_match.string 256 | context_parts.pop() 257 | else: 258 | request_id = None 259 | 260 | context = "/".join(context_parts) if len(context_parts) else None 261 | feedback = request.values 262 | for processor_name, processor in self.pipeline: 263 | if "feedback" in processor.__class__.__dict__: 264 | self.app.logger.debug("Feeding feedback for request_id %s in " 265 | "context %s to %s." % 266 | (request_id, context, processor_name)) 267 | processor.feedback(request_id, context, feedback) 268 | 269 | return "OK" 270 | 271 | def _evaluate(self, context_path=""): 272 | """ 273 | Function that handles the /evaluate namespace. Will evaluate a metric based 274 | on the feedback in supported processors in the pipeline. 275 | """ 276 | evaluation = {} 277 | for processor_name, processor in self.pipeline: 278 | if "evaluate" in processor.__class__.__dict__: 279 | self.app.logger.debug("Evaluating %s in %s." % 280 | (context_path, processor_name)) 281 | evaluation.update(processor.evaluate(context_path, 282 | request.values)) 283 | 284 | return Response(self._json_dumps(evaluation, pretty=True), 285 | mimetype=Server.APPLICATION_JSON) 286 | 287 | def _learn(self, name): 288 | """ 289 | Function that handles the /learn namespace. Will learn based on the 290 | feedback in supported processors in the pipeline. 291 | """ 292 | for processor_name, processor in self.pipeline: 293 | if "learn" in processor.__class__.__dict__: 294 | self.app.logger.debug("Learning %s in %s." 
% 295 | (name, processor_name)) 296 | processor.learn(name, request.values) 297 | 298 | return "OK" 299 | -------------------------------------------------------------------------------- /semanticizer/server/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import logging 15 | from logging.handlers import TimedRotatingFileHandler 16 | 17 | from .. import procpipeline 18 | from ..config import config_get 19 | from ..server import Server 20 | from ..wpm.data import init_datasource 21 | 22 | 23 | def start_server(langcodes, 24 | host, 25 | port, 26 | use_reloader, 27 | verbose=False, 28 | logformat='[%(asctime)-15s][%(levelname)s][%(module)s][%(pathname)s:%(lineno)d]: %(message)s', 29 | use_features=False, 30 | debug=False): 31 | """ 32 | Start a SemanticizerFlaskServer with all processors loaded into the 33 | pipeline. 34 | 35 | @param verbose: Set whether the Flask server should be verbose 36 | @param logformat: The logformat used by the Flask server 37 | """ 38 | # Initialize the pipeline 39 | pipeline = procpipeline.build(langcodes, use_features, debug=debug) 40 | # Create the FlaskServer 41 | logging.getLogger().info("Setting up server") 42 | server = Server() 43 | server.set_debug(verbose, logformat) 44 | # Setup all available routes / namespaces for the HTTP server 45 | server.setup_all_routes(pipeline, langcodes) 46 | logging.getLogger().info("Done setting up server, now starting...") 47 | # And finally, start the thing 48 | server.start(host, port, use_reloader) 49 | 50 | def init_logging(log, verbose, logformat): 51 | """ 52 | A convencience function that initializes the logging framework by setting 53 | the path to the log, verbosity, and the logformat. 
54 | """ 55 | file_handler = TimedRotatingFileHandler(log, when='midnight') 56 | file_handler.setFormatter(logging.Formatter(logformat)) 57 | stream_handler = logging.StreamHandler() 58 | stream_handler.setFormatter(logging.Formatter(logformat)) 59 | if verbose == True: 60 | file_handler.setLevel(logging.DEBUG) 61 | stream_handler.setLevel(logging.DEBUG) 62 | logging.getLogger().setLevel(logging.DEBUG) 63 | logging.getLogger().addHandler(file_handler) 64 | logging.getLogger().addHandler(stream_handler) 65 | 66 | 67 | def main(): 68 | # Init the logger 69 | init_logging(config_get(('logging', 'path'), 'log.txt'), 70 | config_get(('logging', 'verbose'), False), 71 | config_get(('logging', 'format'), None)) 72 | 73 | # Set the datasource and init it 74 | wpmlangs = config_get(('wpm', 'languages')) 75 | settings = config_get(('settings'), {}) 76 | init_datasource(wpmlangs, settings) 77 | 78 | # Start the server 79 | try: 80 | start_server(config_get(('wpm', 'languages')).keys(), 81 | config_get(('server', 'host'), '0.0.0.0'), 82 | config_get(('server', 'port'), 5000), 83 | config_get(('server', 'use_reloader'), False), 84 | config_get(('logging', 'verbose'), False), 85 | config_get(('logging', 'format'), None), 86 | config_get(('linkprocs', 'features'), False), 87 | config_get(('server', 'debug'), False)) 88 | except ValueError as e: 89 | logging.getLogger().fatal("Error running Semanticizer server: %s" \ 90 | % e.message) 91 | raise 92 | 93 | 94 | if __name__ == '__main__': 95 | main() 96 | -------------------------------------------------------------------------------- /semanticizer/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/util/__init__.py -------------------------------------------------------------------------------- /semanticizer/util/online_learning.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
13 | 14 | import os, re, argparse, urllib, urllib2, json 15 | from collections import defaultdict 16 | from timer import Timer 17 | from random import choice, shuffle 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser( 21 | description='Online learn a classifier.', 22 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 23 | 24 | parser.add_argument('classifier', metavar='classifier', 25 | help='a classifier to train') 26 | parser.add_argument('datafiles', metavar='file', nargs='+', 27 | help='a set of datafiles to process') 28 | 29 | group = parser.add_argument_group('Semanticizer') 30 | group.add_argument('--url', default='http://localhost:5000/', 31 | help='URL where the semanticizer webservice is running') 32 | 33 | group = parser.add_argument_group('Learning') 34 | group.add_argument('--learn', nargs=2, action='append', 35 | metavar=('setting', 'value'), 36 | default=[('context', 'EMPTY')], 37 | help='Setting for the learn call') 38 | group.add_argument('--model-prefix', metavar='prefix', 39 | default='Online.', 40 | help='Prefix to add to the modelname') 41 | group.add_argument('--iterations', metavar='number', 42 | default=50, type=int, 43 | help='Number of iterations for learning.f') 44 | 45 | group = parser.add_argument_group('Context') 46 | group.add_argument('--context-pattern', nargs=2, 47 | metavar=('pattern', 'replacement'), 48 | default=('^(?:.*/)*(.*?)(?:\.txt)?$', '\g<1>'), 49 | help='Pattern to generate context from filename') 50 | group.add_argument('--context-prefix', 51 | metavar='prefix', default='', 52 | help='Prefix to add to the context') 53 | 54 | group = parser.add_argument_group('Output') 55 | group.add_argument('--output', default=None, 56 | help='Filename for the output') 57 | 58 | args = parser.parse_args() 59 | args.learn.append(('classifier', args.classifier)) 60 | return args 61 | 62 | def online_learning(args): 63 | results = defaultdict(list) 64 | 65 | shuffle(args.datafiles) 66 | for filenr, filename in enumerate(args.datafiles): 67 | assert os.path.exists(filename) 68 | context = args.context_prefix + re.sub(args.context_pattern[0], \ 69 | args.context_pattern[1], \ 70 | filename) 71 | 72 | modelname = args.model_prefix + context.replace('/', '.') 73 | learn_url = args.url + 'learn/' + modelname 74 | url_data = urllib.urlencode(args.learn) 75 | 76 | print "Initializing model", modelname, 77 | print urllib2.urlopen(learn_url, url_data).read() 78 | 79 | train_files = [f for f in args.datafiles if f != filename] 80 | for i in range(args.iterations): 81 | print "%02d/%02d" % (filenr+1, len(args.datafiles)), 82 | print "%03d/%03d" % (i+1, args.iterations), 83 | train_filename = choice(train_files) 84 | #with Timer("Learning for %s" % train_filename, 'timer'): 85 | train_context = args.context_prefix + \ 86 | re.sub(args.context_pattern[0], \ 87 | args.context_pattern[1], train_filename) 88 | 89 | url_data = urllib.urlencode({"context": train_context}) 90 | print "Training", modelname, "on", train_context, 91 | print urllib2.urlopen(learn_url, url_data).read() 92 | 93 | evaluate_url = args.url + 'evaluate/' + context 94 | url_data = urllib.urlencode({"model": modelname}) 95 | result = json.loads(urllib2.urlopen(evaluate_url, url_data).read()) 96 | print "%.4f %.4f %.4f" % \ 97 | (result["macro_metrics"]["accuracy"], 98 | result["macro_metrics"]["averagePrecision"], 99 | result["macro_metrics"]["rPrecision"]) 100 | results[filename].append(result) 101 | 102 | if args.output: 103 | with open(args.output, 'w') as out: 104 | 
out.write(json.dumps(results)) 105 | 106 | if __name__ == '__main__': 107 | args = parse_args() 108 | 109 | with Timer("Online learning %d files" % len(args.datafiles), 'timer'): 110 | online_learning(args) 111 | -------------------------------------------------------------------------------- /semanticizer/util/profiler.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import pstats 15 | import sys 16 | 17 | if __name__ == '__main__': 18 | stats = pstats.Stats(sys.argv[1]) 19 | stats.sort_stats('time') 20 | stats.print_stats(.01) 21 | stats.print_callers(.01) 22 | -------------------------------------------------------------------------------- /semanticizer/util/store_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import sys, os, re, argparse, urllib, urllib2, json 15 | from collections import defaultdict 16 | from timer import Timer 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description='Process and store a dataset.', 21 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 22 | 23 | parser.add_argument('datafiles', metavar='file', nargs='+', 24 | help='a set of datafiles to process') 25 | 26 | group = parser.add_argument_group('Semanticizer') 27 | group.add_argument('--url', default='http://localhost:5000/', 28 | help='URL where the semanticizer webservice is running') 29 | group.add_argument('--language', metavar='langcode', 30 | default='en', 31 | help='Language of the semanticizer (2 letters, eg. 
en)') 32 | group.add_argument('--semanticize', nargs=2, action='append', 33 | metavar=('setting', 'value'), 34 | default=[('save', "true")], 35 | help='Setting for the semanticizer call') 36 | 37 | group = parser.add_argument_group('Feedback') 38 | group.add_argument('--feedback', nargs=3, action='append', 39 | metavar=('type', 'pattern', 'replacement'), 40 | help='Pattern to generate feedback filenames ' 41 | '(default: positive "\\.txt$" ".positives.txt")') 42 | group.add_argument('--default', 43 | default='negative', metavar='type', 44 | help='Default type of feedback') 45 | group.add_argument('--no-default', action='store_true', 46 | help='Do not use default feedback') 47 | 48 | group = parser.add_argument_group('Context') 49 | group.add_argument('--context-pattern', nargs=2, 50 | metavar=('pattern', 'replacement'), 51 | default=('^(?:.*/)*(.*?)(?:\.txt)?$', '\g<1>'), 52 | help='Pattern to generate context from filename') 53 | group.add_argument('--context-prefix', 54 | metavar='prefix', default='', 55 | help='Prefix to add to the context') 56 | 57 | args = parser.parse_args() 58 | if not args.feedback: 59 | args.feedback = [('positive', '\.txt$', '.positives.txt')] 60 | 61 | return args 62 | 63 | def store_dataset(args): 64 | semanticize_url = '%ssemanticize/%s' % (args.url, args.language) 65 | request_ids = defaultdict(list) 66 | for filename in args.datafiles: 67 | assert os.path.exists(filename) 68 | context = args.context_prefix + re.sub(args.context_pattern[0], \ 69 | args.context_pattern[1], \ 70 | filename) 71 | 72 | with Timer("Semanticizing %s" % filename, 'timer'): 73 | with open(filename) as file: 74 | lines = file.readlines() 75 | print "Read %d lines from %s." % (len(lines), filename) 76 | 77 | for line in lines: 78 | data = [("context", context), ("text", line.strip())] 79 | data.extend(args.semanticize) 80 | url_data = urllib.urlencode(data) 81 | result = json.loads(urllib2.urlopen(semanticize_url, 82 | url_data).read()) 83 | print "Request %s: %d links" % \ 84 | (result["request_id"], len(result["links"])) 85 | request_ids[filename].append(result["request_id"]) 86 | 87 | with Timer("Feedback for %s" % context, 'timer'): 88 | feedback = [] 89 | for (feedback_type, pattern, replacement) in args.feedback: 90 | feedback_filename = re.sub(pattern, replacement, filename) 91 | if not os.path.exists(feedback_filename): 92 | print feedback_filename, "does not exist" 93 | continue 94 | with open(feedback_filename) as file: 95 | lines = file.readlines() 96 | print "Read %d lines of %s feedback from %s." % \ 97 | (len(lines), feedback_type, feedback_filename) 98 | for line in lines: 99 | feedback.append((feedback_type, line.strip())) 100 | 101 | if not args.no_default: 102 | feedback.append(("default", args.default)) 103 | 104 | feedback_url = args.url + 'feedback/' + context 105 | url_data = urllib.urlencode(feedback) 106 | result = urllib2.urlopen(feedback_url, url_data).read() 107 | print "%d items of feedback for %s: %s" % \ 108 | (len(feedback), context, result) 109 | 110 | if __name__ == '__main__': 111 | args = parse_args() 112 | 113 | with Timer("Storing %d files" % len(args.datafiles), 'timer'): 114 | store_dataset(args) 115 | -------------------------------------------------------------------------------- /semanticizer/util/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. 
This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import time 15 | 16 | class Timer(object): 17 | """Convience method to time activities. Can be used as context.""" 18 | 19 | def __init__(self, activity, name=None): 20 | self.name = name 21 | self.activity = activity 22 | self.tstart = time.time() 23 | 24 | def __del__(self): 25 | if self.name: print '[%s]' % self.name, 26 | print self.activity, 27 | print 'took %s seconds.' % (time.time() - self.tstart) 28 | 29 | def __enter__(self): 30 | pass 31 | 32 | def __exit__(self, type, value, traceback): 33 | pass 34 | -------------------------------------------------------------------------------- /semanticizer/wpm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/wpm/__init__.py -------------------------------------------------------------------------------- /semanticizer/wpm/data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
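The Timer above is used as a context manager elsewhere in this package (online_learning.py and store_dataset.py); a minimal sketch of that pattern, with an illustrative activity name:

# Hypothetical usage of util/timer.py; note that the report is printed from
# __del__, i.e. when the Timer object is garbage-collected after the block,
# not from __exit__.
from semanticizer.util.timer import Timer

with Timer("Storing 3 files", 'timer'):
    pass  # the work being timed goes here
# prints something like: [timer] Storing 3 files took 0.000123 seconds.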
13 | import json 14 | 15 | from .load import WpmLoader 16 | from .namespace import WpmNS 17 | 18 | wpm_dumps = {} 19 | 20 | def init_datasource(wpm_languages, settings): 21 | """Set the datasource and init it""" 22 | for langcode, langconfig in wpm_languages.iteritems(): 23 | load_wpm_data(langconfig['source'], langcode, settings, **langconfig['initparams']) 24 | 25 | def load_wpm_data(datasource, langcode, settings, **kwargs): 26 | if datasource == "redis": 27 | from .db.redisdb import RedisDB 28 | db = RedisDB(**kwargs) 29 | elif datasource == "memory": 30 | from .db.inmemory import MemoryDB 31 | db = MemoryDB() 32 | elif datasource == "mongo": 33 | from .db.mongodb import MongoDB 34 | db = MongoDB() 35 | #load wpm data into memory 36 | WpmLoader(db, langcode, settings, **kwargs) 37 | else: 38 | raise ValueError("Unknown backend {}".format(datasource)) 39 | wpm_dumps[langcode] = WpmData(db, langcode) 40 | 41 | 42 | class WpmData: 43 | 44 | def __init__(self, db, langcode): 45 | 46 | #set database [memory or redis] 47 | self.db = db 48 | 49 | #get current db version 50 | self.version = self.db.get(langcode+":version") 51 | 52 | #load correct NameSpace 53 | self.ns = WpmNS(db, langcode, self.version) 54 | 55 | def entity_exists(self, entity): 56 | return self.exists(self.ns.label(entity)) 57 | 58 | def normalized_entities_exist(self, entities): 59 | with self.db.pipeline() as pipe: 60 | for e in entities: 61 | pipe.exists(self.ns.normalized(e)) 62 | return pipe.execute() 63 | 64 | def get_all_entities(self, normalized_entity): 65 | return self.db.smembers(self.ns.normalized(normalized_entity)) 66 | 67 | def get_entity_data(self, entity): 68 | entity_data = self.db.lrange(self.ns.label(entity) , 0, -1) 69 | senses = [] 70 | if len(entity_data) > 4: 71 | senses = entity_data[4:] 72 | return {'cntlinkocc': int(entity_data[0]), 73 | 'cntlinkdoc': int(entity_data[1]), 74 | 'cnttextocc': int(entity_data[2]), 75 | 'cnttextdoc': int(entity_data[3]), 76 | 'senses': senses} 77 | 78 | def get_sense_data(self, entity, sense): 79 | sense_data = self.db.lrange(self.ns.label_sense(entity, sense), 0, -1) 80 | return {'cntlinkocc': int(sense_data[0]), 81 | 'cntlinkdoc': int(sense_data[1]), 82 | 'from_title': sense_data[2], 83 | 'from_redir': sense_data[3]} 84 | 85 | def get_item_id(self, title): 86 | return self.db.get(self.ns.page_id(title)) 87 | 88 | def get_item_ids(self, *titles): 89 | with self.db.pipeline() as pipe: 90 | for title in titles: 91 | pipe.get(self.ns.page_id(title)) 92 | return pipe.execute() 93 | 94 | def get_item_title(self, pid): 95 | return self.db.get(self.ns.page_title(pid)) 96 | 97 | def get_item_inlinks(self, pid): 98 | return self.db.lrange(self.ns.page_inlinks(pid), 0, -1) 99 | 100 | def get_item_outlinks(self, pid): 101 | return self.db.lrange(self.ns.page_outlinks(pid), 0, -1) 102 | 103 | def get_item_categories(self, pid): 104 | return self.db.get(self.ns.page_categories(pid)) 105 | 106 | def get_item_definition(self, pid): 107 | return self.db.get(self.ns.page_definition(pid)) 108 | 109 | def get_item_labels(self, pid): 110 | json_labels = self.db.lrange(self.ns.page_labels(pid), 0, -1) 111 | results = [] 112 | for json_label in json_labels: 113 | label = json.loads(json_label) 114 | results.append({ 115 | 'title': label[0], 116 | 'occurances': label[1], 117 | 'fromRedirect': label[2], 118 | 'fromTitle': label[3], 119 | 'isPrimary': label[4], 120 | 'proportion': label[5] 121 | }) 122 | return results 123 | 124 | def sense_has_trnsl(self, sid): 125 | return 
self.db.exists(self.ns.translation_sense(sid)) 126 | 127 | def get_trnsl_langs(self, sid): 128 | return self.db.lrange(self.ns.translation_sense(sid), 0, -1) 129 | 130 | def get_sense_trnsl(self, sid, lang): 131 | return self.db.get(self.ns.translation_sense_language(sid, lang)) 132 | 133 | def get_wikipedia_name(self): 134 | path = self.db.get(self.ns.wiki_path()) 135 | if path[-1] == '/': 136 | return path.split('/')[-2] 137 | return path.split('/')[-1] 138 | 139 | def get_data_path(self): 140 | return self.db.get(self.ns.wiki_path()) 141 | 142 | def get_lang_name(self): 143 | return self.db.get(self.ns.wiki_language_name()) 144 | 145 | def get_title_ngram_score(self, title): 146 | nr_of_tokens = len(title.split()) 147 | return self.db.zscore(self.ns.ngramscore(str(nr_of_tokens)), title) 148 | 149 | def get_stat(self, value): 150 | return self.db.get(self.ns.wiki_stats(value)) 151 | 152 | def get_articles(self, *pids): 153 | pipe = self.db.pipeline() 154 | for pid in pids: 155 | pipe.lrange(self.ns.page_inlinks(pid), 0, -1) 156 | pipe.lrange(self.ns.page_outlinks(pid), 0, -1) 157 | pipe.lrange(self.ns.page_labels(pid), 0, -1) 158 | data = pipe.execute() 159 | 160 | results = [] 161 | for i in xrange(0, len(data)-1, 3): 162 | labels = [] 163 | json_labels = data[i+2] 164 | for json_label in json_labels: 165 | label = json.loads(json_label) 166 | labels.append({ 167 | 'title': label[0], 168 | 'occurances': label[1], 169 | 'fromRedirect': label[2], 170 | 'fromTitle': label[3], 171 | 'isPrimary': label[4], 172 | 'proportion': label[5] 173 | }) 174 | result = { 175 | "InLinks":data[i], 176 | "OutLinks":data[i+1], 177 | "Labels":labels 178 | } 179 | results.append(result) 180 | return results 181 | -------------------------------------------------------------------------------- /semanticizer/wpm/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/semanticizer/wpm/db/__init__.py -------------------------------------------------------------------------------- /semanticizer/wpm/db/inmemory.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
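A minimal sketch of reading from a loaded dump through the WpmData accessors above; it assumes init_datasource() has already populated wpm_dumps for 'en', and the title and page id are illustrative.

# Hypothetical lookups against a loaded English dump.
from semanticizer.wpm.data import wpm_dumps

wpm = wpm_dumps['en']                       # WpmData created by init_datasource()
pid = wpm.get_item_id('Amsterdam')          # title -> page id
title = wpm.get_item_title(pid)             # page id -> title
entity = wpm.get_entity_data('Amsterdam')   # cntlinkocc/cntlinkdoc/... plus senses
# get_articles() batches the three lrange calls per page id through one pipeline
# round trip and regroups the results per article:
inlinks = wpm.get_articles(pid)[0]["InLinks"]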
13 | 14 | 15 | 16 | class MemoryDB: 17 | #store all data in memory instead of redis, mimic redis functions 18 | def __init__(self, **kwargs): 19 | self.cache = dict() 20 | 21 | def pipeline(self, **kwargs): 22 | return Pipe(self.cache) 23 | 24 | def exists(self, key): 25 | return key in self.cache 26 | 27 | def keys(self, key): 28 | key = key.replace("*", "") 29 | # simple abstraction of redis wildcard key search, only valid for startswith equivalent search which should be sufficient, probably faster then full regular expression search over keys 30 | return [k for k in self.cache.iterkeys() if k.startswith(key)] 31 | 32 | def get(self, key): 33 | return self.cache[key] 34 | 35 | def set(self, key, value): 36 | self.cache[key] = value 37 | return True 38 | 39 | def smembers(self, key): 40 | return self.get(key) 41 | 42 | def sismember(self, key, value): 43 | return value in self.cache[key] 44 | 45 | def sadd(self, key, *values): 46 | self.cache.setdefault(key, set()).update(values) 47 | return [True]*len(values) 48 | 49 | def lrange(self, key, start=0, end=-1): 50 | data = self.cache.get(key, list()) 51 | if end < -1: 52 | return data[start:end+1] 53 | elif end == -1: 54 | return data[start:] 55 | else: 56 | return data[start:end] 57 | 58 | def rpush(self, key, *values): 59 | self.cache.setdefault(key, []).extend(values) 60 | return [True]*len(values) 61 | 62 | def zscore(self, key, value): 63 | return self.cache[key][value] 64 | 65 | def zincrby(self, key, value, amount=1): 66 | # in case value does not exist init 67 | if not value in self.cache.setdefault(key, {}): 68 | self.cache[key][value] = amount 69 | else: 70 | self.cache[key][value] += amount 71 | return self.cache[key][value] 72 | 73 | def delete(self,*keys): 74 | for key in keys: 75 | self.cache.pop(key, None) 76 | return True 77 | 78 | 79 | #proxy all returns to pipe class 80 | class Proxy(object): 81 | def __getattribute__(self,name): 82 | attr = object.__getattribute__(self, name) 83 | if hasattr(attr, '__call__') and name not in ["execute", "reset"]: 84 | def newfunc(*args, **kwargs): 85 | result = attr(*args, **kwargs) 86 | self.results.append(result) 87 | return True 88 | return newfunc 89 | else: 90 | return attr 91 | 92 | #implicity add a decorator Proxy to all functions of MemoryDB to fetch all returns and output them on execute 93 | class Pipe(Proxy, MemoryDB): 94 | def __init__(self, cache): 95 | self.reset() 96 | self.cache = cache 97 | 98 | def __enter__(self): 99 | return self 100 | 101 | def __exit__(self, exc_type, exc_value, traceback): 102 | self.reset() 103 | 104 | def __del__(self): 105 | try: 106 | self.reset() 107 | except Exception: 108 | pass 109 | 110 | def __len__(self): 111 | return len(self.results) 112 | 113 | def reset(self): 114 | self.results = [] 115 | 116 | def execute(self): 117 | results = self.results 118 | self.reset() 119 | return results -------------------------------------------------------------------------------- /semanticizer/wpm/db/mongodb.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 
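A minimal sketch of the pipeline behaviour that MemoryDB above mimics: inside a pipeline every proxied call immediately returns True, and the real return values come back, in call order, from execute(). The key name is illustrative.

# Hypothetical use of the in-memory backend and its redis-style pipeline.
from semanticizer.wpm.db.inmemory import MemoryDB

db = MemoryDB()
db.rpush('en:1:page:123:inlinks', '4', '5')
with db.pipeline() as pipe:
    pipe.exists('en:1:page:123:inlinks')         # queued; the call itself returns True
    pipe.lrange('en:1:page:123:inlinks', 0, -1)
    results = pipe.execute()                     # [True, ['4', '5']]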
5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | from pymongo import MongoClient 15 | 16 | class MongoDB: 17 | def __init__(self, host='localhost', port=27017, **kwargs): 18 | global client 19 | client = MongoClient(host, port) 20 | 21 | def pipeline(self, **kwargs): 22 | return Pipe() 23 | 24 | def exists(self, key): 25 | item = client.sem.wpm.find_one( {"_id": key}) 26 | return False if not item else True 27 | 28 | def keys(self, key): 29 | item = client.sem.wpm.find( {"_id": "/"+key+"/"}) 30 | return [] if not item else item 31 | 32 | def get(self, key): 33 | item = client.sem.wpm.find_one( {"_id": key}) 34 | return item['value'] 35 | 36 | def set(self, key, value): 37 | client.sem.wpm.save( {"_id":key, "value": value}) 38 | return True 39 | 40 | def smembers(self, key): 41 | return self.get(key) 42 | 43 | def sismember(self, key, value): 44 | item = client.sem.wpm.find_one( {"_id": key}) 45 | return False if not item else value in item['value'] 46 | 47 | def sadd(self, key, *values): 48 | item = client.sem.wpm.find_one( {"_id": key}) 49 | svalue = set(values) if not item else set(list(item['value']) + list(values)) 50 | client.sem.wpm.update( {"_id":key},{'$set':{'value': list(svalue)}},upsert=True, multi=False) 51 | return [True]*len(values) 52 | 53 | def lrange(self, key, start, end): 54 | item = client.sem.wpm.find_one( {"_id": key}) 55 | return [] if not item else value in item['value'][start:end] 56 | 57 | def rpush(self, key, *values): 58 | item = client.sem.wpm.find_one( {"_id": key}) 59 | lvalue = list(values) if not item else list(item['value']) + list(values) 60 | client.sem.wpm.update( {"_id":key},{'$set':{'value': lvalue}},upsert=True, multi=False) 61 | return [True]*len(values) 62 | 63 | def zscore(self, key, value): 64 | item = client.sem.wpm.find_one( {"_id": key}) 65 | subkey = ":"+str(value)+":" 66 | if not item: 67 | return None 68 | if not subkey in item: 69 | return None 70 | return item[subkey] 71 | 72 | def zincrby(self, key, value, ammount=1): 73 | client.sem.wpm.update( {"_id":key},{'$inc':{":"+str(value)+":": 1}},upsert=True, multi=False) 74 | return True 75 | 76 | def delete(self,*keys): 77 | for key in keys: 78 | client.sem.wpm.remove({"_id":key}) 79 | return True 80 | 81 | #proxy all returns to pipe class 82 | class Proxy(object): 83 | def __getattribute__(self,name): 84 | attr = object.__getattribute__(self, name) 85 | if hasattr(attr, '__call__'): 86 | def newfunc(*args, **kwargs): 87 | result = attr(*args, **kwargs) 88 | self.results.append(result) 89 | return True 90 | return newfunc 91 | else: 92 | return attr 93 | 94 | #implicity add a decorator Proxy to all functions of MongoDB to fetch all returns and output them on execute 95 | class Pipe(Proxy, MongoDB): 96 | def __init__(self): 97 | self.reset() 98 | 99 | def __enter__(self): 100 | return self 101 | 102 | def __exit__(self, exc_type, exc_value, traceback): 103 | self.reset() 104 | 105 | def __del__(self): 106 | try: 107 | self.reset() 108 | except Exception: 109 | pass 110 | 111 | def __len__(self): 112 | return len(self.results) 113 | 114 | def reset(self): 115 | self.results = [] 116 | 117 | def execute(self): 118 | results = self.results 
119 | self.reset() 120 | return results -------------------------------------------------------------------------------- /semanticizer/wpm/db/redisdb.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import redis 15 | 16 | def RedisDB(host='localhost', port=6379, **kwargs): 17 | return redis.StrictRedis(host=host, port=port, db=0, decode_responses=True) -------------------------------------------------------------------------------- /semanticizer/wpm/namespace.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | class WpmNS: 15 | def __init__(self, db, langcode, version=None): 16 | self.sep = ':' 17 | self.lc = langcode 18 | self.db = db 19 | self.manual_version = version 20 | 21 | def version (self): 22 | if self.manual_version: 23 | return self.manual_version 24 | version = self.db.get(self.db_version()) 25 | if not version: 26 | raise Exception("No database version") 27 | return version 28 | 29 | def db_version(self): 30 | """ 31 | key 32 | :db:version 33 | value 34 | string(cache version) 35 | """ 36 | return self.sep.join( (self.lc, "db", "version") ) 37 | 38 | def wiki_language_name(self): 39 | """ 40 | key 41 | ::wiki:lname 42 | value 43 | string(wiki name) 44 | """ 45 | return self.sep.join( (self.lc, self.version(), "wiki", "lname") ) 46 | 47 | def wiki_path(self): 48 | """ 49 | key 50 | ::wiki:path 51 | value 52 | string(wiki path) 53 | """ 54 | return self.sep.join( (self.lc, self.version(), "wiki", "path") ) 55 | 56 | def wiki_stats(self, statName): 57 | """ 58 | key 59 | ::wiki:stats: 60 | value 61 | string(stats) 62 | """ 63 | return self.sep.join( (self.lc, self.version(), "wiki", "stats", statName) ) 64 | 65 | def label(self, name): 66 | """ 67 | key 68 | ::label: 69 | value 70 | list( LinkOccCount, LinkDocCount, TextOccCount, TextDocCount, SenseId, SenseId, ..) 
71 | """ 72 | return self.sep.join( (self.lc, self.version(), "label", name) ) 73 | 74 | def label_sense(self, name, senseid): 75 | """ 76 | key 77 | ::label:: 78 | value 79 | list( sLinkDocCount, sLinkOccCount, FromTitle, FromRedirect) 80 | """ 81 | return self.sep.join( (self.lc, self.version(), "label", name, senseid) ) 82 | 83 | def normalized(self, name): 84 | """ 85 | key 86 | ::norm: 87 | value 88 | set( name, name, ... ) 89 | """ 90 | return self.sep.join( (self.lc, self.version(), "norm", name) ) 91 | 92 | def translation_sense(self, senseid): 93 | """ 94 | key 95 | ::trnsl: 96 | value 97 | list( langcode, langcode, ... ) 98 | """ 99 | return self.sep.join( (self.lc, self.version(), "trnsl", senseid) ) 100 | 101 | def translation_sense_language(self, senseid, langcode): 102 | """ 103 | key 104 | ::trnsl:: 105 | value 106 | string(name) 107 | """ 108 | return self.sep.join( (self.lc, self.version(), "trnsl", senseid, langcode) ) 109 | 110 | def page_id(self, name): 111 | """ 112 | key 113 | ::page:id 114 | value 115 | string(id) 116 | """ 117 | return self.sep.join( (self.lc, self.version(), "page", "id", name) ) 118 | 119 | def page_title(self, pageid): 120 | """ 121 | key 122 | ::page::name 123 | value 124 | string(name) 125 | """ 126 | return self.sep.join( (self.lc, self.version(), "page", pageid, "name") ) 127 | 128 | def page_labels(self, pageid): 129 | """ 130 | key 131 | ::page::labels 132 | value 133 | list( json([title, occurances, fromRedirect, fromTitle isPrimary, proportion]), ...) 134 | """ 135 | return self.sep.join( (self.lc, self.version(), "page", pageid, "labels") ) 136 | 137 | def page_definition(self, pageid): 138 | """ 139 | key 140 | ::page::definition 141 | value 142 | string(synopsis) 143 | """ 144 | return self.sep.join( (self.lc, self.version(), "page", pageid, "definition") ) 145 | 146 | def page_inlinks(self, pageid): 147 | """ 148 | key 149 | ::page::inlinks 150 | value 151 | list( pageid, pageid, ... ) 152 | """ 153 | return self.sep.join( (self.lc, self.version(), "page", pageid, "inlinks") ) 154 | 155 | 156 | def page_outlinks(self, pageid): 157 | """ 158 | key 159 | ::page::outlinks 160 | value 161 | list( pageid, pageid, ... ) 162 | """ 163 | return self.sep.join( (self.lc, self.version(), "page", pageid, "outlinks") ) 164 | 165 | def page_categories(self, pageid): 166 | """ 167 | key 168 | ::page::categories 169 | value 170 | list( category, category, ... ) 171 | """ 172 | return self.sep.join( (self.lc, self.version(), "page", pageid, "categories") ) 173 | 174 | 175 | def ngramscore(self, n): 176 | """ 177 | key 178 | ::grms 179 | value 180 | zset([words{score}, [...]])translation_sense 181 | """ 182 | return self.sep.join( (self.lc, self.version(), "%sgrms" % n) ) 183 | -------------------------------------------------------------------------------- /semanticizer/wpm/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 
10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | import math 15 | import re 16 | import unicodedata 17 | import sys 18 | 19 | from .markup_stripper import MarkupStripper 20 | 21 | dump_filenames = { 22 | 'translations': 'translations.csv', 23 | 'stats': 'stats.csv', 24 | 'labels': 'label.csv', 25 | 'pages': 'page.csv', 26 | 'pageLabels': 'pageLabel.csv', 27 | 'pageCategories': 'articleParents.csv', 28 | 'inlinks': 'pageLinkIn.csv', 29 | 'outlinks': 'pageLinkOut.csv' 30 | } 31 | 32 | 33 | def normalize(raw, dash=True, accents=True, lower=True): 34 | """Replaces hyphens with spaces, removes accents, lower cases and 35 | strips the input text. 36 | 37 | All steps, except for the strip(), can be disabled with the 38 | optional arguments. 39 | """ 40 | text = raw 41 | if dash: 42 | text = text.replace('-', ' ') 43 | if accents: 44 | text = remove_accents(text) 45 | if lower: 46 | text = text.lower() 47 | text = text.strip() 48 | return text if len(text) else raw 49 | 50 | 51 | def remove_accents(input_str): 52 | """Replaces accented characters in the input with their 53 | non-accented counterpart.""" 54 | if isinstance(input_str, str): 55 | input_unicode = input_str.decode(errors="ignore") 56 | else: 57 | input_unicode = input_str 58 | nkfd_form = unicodedata.normalize('NFKD', input_unicode) 59 | return u"".join([c for c in nkfd_form if not unicodedata.combining(c)]) 60 | 61 | 62 | def check_dump_path(path, settings): 63 | """ 64 | Checks whether a path exists and raises an error if it doesn't. 65 | 66 | @param path: The pathname to check 67 | @raise IOError: If the path doesn't exist or isn't readbale 68 | """ 69 | import os 70 | import glob 71 | pathlist = [os.path.normpath(path) + os.sep, 72 | os.path.normpath(os.path.abspath(path)) + os.sep] 73 | for fullpath in pathlist: 74 | print "Checking " + fullpath 75 | if os.path.exists(fullpath): 76 | for filetype, filename in dump_filenames.iteritems(): 77 | if os.path.isfile(fullpath + filename) == True: 78 | print "Found " + fullpath + filename 79 | else: 80 | raise IOError("Cannot find " + fullpath + filename) 81 | if settings.get("include_definitions", True): 82 | wiki = glob.glob(fullpath + '*-pages-articles.xml') 83 | if len(wiki) > 0: 84 | print "Found " + wiki[0] 85 | else: 86 | raise IOError("Cannot find wiki *-pages-articles.xml") 87 | return fullpath 88 | else: 89 | print fullpath + " doesn't exist" 90 | raise IOError("Cannot find " + path) 91 | 92 | 93 | def get_relatedness(linksA, linksB): 94 | """ 95 | Compare relatedness of 2 articles based on in or outlinks. 
96 |
97 | @param linksA: in or out links of article A
98 | @param linksB: in or out links of article B
99 | """
100 | if not linksA or not linksB:
101 | return 0.0
102 |
103 | if linksA == linksB:
104 | return 1.0
105 |
106 | intersection = 0
107 | indexA = 0
108 | indexB = 0
109 |
110 | while indexA < len(linksA) or indexB < len(linksB):
111 | useA = False
112 | useB = False
113 |
114 | linkA = None
115 | linkB = None
116 |
117 | if indexA < len(linksA):
118 | linkA = linksA[indexA]
119 |
120 | if indexB < len(linksB):
121 | linkB = linksB[indexB]
122 |
123 | if linkA and linkB and linkA == linkB:
124 | useA = True
125 | useB = True
126 | intersection += 1
127 | else:
128 | if linkA and (not linkB or linkA < linkB):
129 | useA = True
130 | # NOTE: the original Java implementation also counts a direct link
131 | # from article A to article B here, but the article ids are not passed to this port.
132 | else:
133 | useB = True
134 | # (likewise for a direct link from article B to article A)
135 |
136 |
137 | if useA:
138 | indexA += 1
139 | if useB:
140 | indexB += 1
141 |
142 | googleMeasure = None
143 |
144 | if intersection == 0:
145 | googleMeasure = 1.0
146 | else:
147 | a = math.log(len(linksA))
148 | b = math.log(len(linksB))
149 | ab = math.log(intersection)
150 | m = math.log(len(linksA) + len(linksB))  # stand-in for the log of the total article count, which is not available here
151 | googleMeasure = (max(a, b) - ab) / (m - min(a, b))
152 |
153 | #normalize
154 | if googleMeasure is None:  # 0 means maximally related, so only bail out if unset
155 | return 0
156 | if googleMeasure >= 1:
157 | return 0
158 |
159 | return 1 - googleMeasure
160 |
161 | def generate_markup_definition(markup):
162 | """
163 | Strip wiki markup and convert some wiki tags to html
164 |
165 | @param markup: wiki markup
166 | """
167 | stripper = MarkupStripper()
168 |
169 | # strip markup
170 | markup = re.sub("={2,}(.+)={2,}", "\n", markup) #clear section headings completely - not just formatting, but content as well.
171 | markup = stripper.strip_all_but_internal_links_and_emphasis(markup)
172 | markup = stripper.strip_non_article_internal_links(markup)
173 | markup = stripper.strip_excess_newlines(markup)
174 |
175 | # convert wiki tags to html
176 | markup = stripper.emphasisResolver.resolve_emphasis(markup)
177 |
178 | # todo convert links
179 | #...
180 |
181 | # slice markup to definition
182 | fp = ""
183 | pos = 0
184 | p = re.compile("\n\n", re.DOTALL)
185 | for m in p.finditer(markup):
186 | fp = markup[0:pos]
187 | if (pos > 150):
188 | break
189 | pos = m.start()+2
190 | fp = re.sub("\n", " ", fp)
191 | fp = re.sub("\\s+", " ", fp) #turn all whitespace into spaces, and collapse them.
192 | fp = fp.strip()
193 |
194 | return fp
195 |
196 | def cli_progress(current, total, bar_length=40):
197 | """
198 | Shows a progress bar in the CLI.
199 |
200 | @param current: int of current step
201 | @param total: int of total steps
202 | @param bar_length: length of the progressbar in cli window
203 | """
204 | percent = float(current) / total
205 | hashes = '#' * int(round(percent * bar_length))
206 | spaces = ' ' * (bar_length - len(hashes))
207 | sys.stdout.write("\rPercent: [{0}] {1}%".format(hashes + spaces, int(round(percent * 100))))
208 | sys.stdout.flush()
209 | -------------------------------------------------------------------------------- /semanticizer/wpm/utils/emphasis_resolver.py: --------------------------------------------------------------------------------
1 | # Copyright 2012-2013, University of Amsterdam. This program is free software:
2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser
3 | # General Public License as published by the Free Software Foundation, either
4 | # version 3 of the License, or (at your option) any later version.
5 | #
6 | # This program is distributed in the hope that it will be useful, but WITHOUT
7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
9 | # for more details.
10 | #
11 | # You should have received a copy of the GNU Lesser General Public License
12 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
13 |
14 | import re
15 |
16 | # This replaces MediaWiki syntax for '''bold''' and ''italic'' text with the equivalent html markup.
17 | class EmphasisResolver:
18 | def resolve_emphasis(self, text):
19 | sb = []
20 | for line in text.split("\n"):
21 | sb.append(self.resolve_line(line))
22 | sb.append("\n")
23 |
24 | result = "".join(sb)
25 | result = result[:-1]
26 | return result
27 |
28 | # This is a direct translation of the php function doAllQuotes used by the original MediaWiki software.
29 | #
30 | # @param line the line to resolve emphasis within
31 | # @return the line, with all emphasis markup resolved to html tags
32 | #
33 | def resolve_line(self, line):
34 |
35 | #print "Resolving line '" + line + "'"
36 |
37 | arr = self.get_splits("$"+line)
38 | if len(arr) <= 1:
39 | return line
40 |
41 | # First, do some preliminary work. This may shift some apostrophes from
42 | # being mark-up to being text. It also counts the number of occurrences
43 | # of bold and italics mark-ups.
44 |
45 | numBold = 0
46 | numItalics = 0
47 |
48 | for i, value in enumerate(arr):
49 | if (i % 2 == 1):
50 | # If there are ever four apostrophes, assume the first is supposed to
51 | # be text, and the remaining three constitute mark-up for bold text.
52 | if (len(arr[i]) == 4):
53 | arr[i-1] = arr[i-1] + "'"
54 | arr[i] = self.get_filled_string(3)
55 | elif len(arr[i]) > 5:
56 | # If there are more than 5 apostrophes in a row, assume they're all
57 | # text except for the last 5.
58 | arr[i-1] = arr[i-1] + self.get_filled_string(len(arr[i])-5)
59 | arr[i] = self.get_filled_string(5)
60 |
61 | size = len(arr[i])
62 | if size == 2:
63 | numItalics += 1
64 | elif size == 3:
65 | numBold += 1
66 | elif size == 5:
67 | numItalics += 1
68 | numBold += 1
69 |
70 | # If there is an odd number of both bold and italics, it is likely
71 | # that one of the bold ones was meant to be an apostrophe followed
72 | # by italics. Which one we cannot know for certain, but it is more
73 | # likely to be one that has a single-letter word before it.
74 | if (numBold%2==1) and (numItalics%2==1):
75 | i = 0
76 | firstSingleLetterWord = -1
77 | firstMultiLetterWord = -1
78 | firstSpace = -1
79 |
80 | for r in arr:
81 | if i%2==1 and len(r)==3:
82 | x1 = arr[i-1][len(arr[i-1])-1]
83 | x2 = arr[i-1][len(arr[i-1])-2]
84 | if x1==' ':
85 | if firstSpace == -1:
86 | firstSpace = i
87 | elif x2==' ':
88 | if firstSingleLetterWord == -1:
89 | firstSingleLetterWord = i
90 | else:
91 | if firstMultiLetterWord == -1:
92 | firstMultiLetterWord = i
93 |
94 | i += 1
95 |
96 | # If there is a single-letter word, use it!
97 | if firstSingleLetterWord > -1:
98 | arr[firstSingleLetterWord] = "''"
99 | arr[firstSingleLetterWord-1] = arr[firstSingleLetterWord-1] + "'"
100 | elif firstMultiLetterWord > -1:
101 | # If not, but there's a multi-letter word, use that one.
102 | arr[firstMultiLetterWord] = "''"
103 | arr[firstMultiLetterWord-1] = arr[firstMultiLetterWord-1] + "'"
104 | elif firstSpace > -1:
105 | # ... otherwise use the first one that has neither.
106 | # (notice that it is possible for all three to be -1 if, for example,
107 | # there is only one pentuple-apostrophe in the line)
108 | arr[firstSpace] = "''"
109 | arr[firstSpace-1] = arr[firstSpace-1] + "'"
110 |
111 | # Now let's actually convert our apostrophic mush to HTML!
112 |
113 | output = []
114 | buffer = []
115 | state = ""
116 | i = 0
117 | for r in arr:
118 | if i%2==0:
119 | if state == "both":
120 | buffer.append(r)
121 | else:
122 | output.append(r)
123 | else:
124 | if len(r) == 2:
125 | if state == "i":
126 | output.append("</i>")
127 | state = ""
128 | elif state == "bi":
129 | output.append("</i>")
130 | state = "b"
131 | elif state == "ib":
132 | output.append("</b></i><b>")
133 | state = "b"
134 | elif state == "both":
135 | output.append("<b><i>")
136 | output.append("".join(buffer))
137 | output.append("</i>")
138 | state = "b"
139 | else:
140 | # $state can be "b" or ""
141 | output.append("<i>")
142 | state = state + "i"
143 | elif len(r) == 3:
144 | if state == "b":
145 | output.append("</b>")
146 | state = ""
147 | elif state == "bi":
148 | output.append("</i></b><i>")
149 | state = "i"
150 | elif state == "ib":
151 | output.append("</b>")
152 | state = "i"
153 | elif state == "both":
154 | output.append("<i><b>")
155 | output.append("".join(buffer))
156 | output.append("</b>")
157 | state = "i"
158 | else:
159 | # $state can be "i" or ""
160 | output.append("<b>")
161 | state = state + "b"
162 | elif len(r) == 5:
163 | if state == "b":
164 | output.append("</b><i>")
165 | state = "i"
166 | elif state == "i":
167 | output.append("</i><b>")
168 | state = "b"
169 | elif state == "bi":
170 | output.append("</i></b>")
171 | state = ""
172 | elif state == "ib":
173 | output.append("</b></i>")
174 | state = ""
175 | elif state == "both":
176 | output.append("<i><b>")
177 | output.append("".join(buffer))
178 | output.append("</b></i>")
179 | state = ""
180 | else:
181 | # ($state == "")
182 | buffer = []
183 | state = "both"
184 | i += 1
185 |
186 |
187 | # Now close all remaining tags. Notice that the order is important.
188 | if state == "b" or state == "ib":
189 | output.append("</b>")
190 |
191 | if state == "i" or state == "bi" or state == "ib":
192 | output.append("</i>")
193 | if state == "bi":
194 | output.append("</b>")
195 |
196 | # There might be lonely ''''', so make sure we have a buffer
197 | if state == "both" and len(buffer) > 0:
198 | output.append("<b><i>")
199 | output.append("".join(buffer))
200 | output.append("</i></b>")
201 |
202 | #remove leading $
203 | output = "".join(output)
204 |
205 | return output[1:]
206 |
207 |
208 |
209 | # Does the same job as php function preg_split
210 | def get_splits(self, text):
211 | #return re.split("\\'{2,}", text)
212 | splits = []
213 | lastCopyIndex = 0
214 | p = re.compile("\\'{2,}")
215 |
216 | for m in p.finditer(text):
217 | if m.start() > lastCopyIndex:
218 | splits.append( text[lastCopyIndex: m.start()] )
219 | splits.append( m.group() )
220 | lastCopyIndex = m.end()
221 |
222 | if lastCopyIndex < len(text):
223 | splits.append(text[lastCopyIndex:])
224 |
225 | return splits
226 |
227 |
228 | def get_filled_string(self, length):
229 | sb = []
230 | for i in xrange(0, length):
231 | sb.append("'")
232 | return "".join(sb)
233 |
234 | ## EmphasisResolver testing using
235 | ## python -m semanticizer.wpm.utils.emphasis_resolver
236 | if __name__ == '__main__':
237 | er = EmphasisResolver()
238 | markup = "'''War''' is an openly declared state of organized [[violent]] [[Group conflict|conflict]], typified by extreme [[aggression]], [[societal]] disruption, and high [[Mortality rate|mortality]].
As a behavior pattern, warlike tendencies are found in many [[primate]] species, including [[humans]], and also found in many [[ant]] species. The set of techniques used by a group to carry out war is known as '''warfare'''." 239 | #markup = "Parsing '''MediaWiki''''s syntax for '''bold''' and ''italic'' markup is a '''''deceptively''' difficult'' task. Whoever came up with the markup scheme should be '''shot'''." ; 240 | print er.resolve_emphasis(markup) -------------------------------------------------------------------------------- /semanticizer/wpm/utils/wikidumps.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 University of Amsterdam 2 | # Copyright 2014 Netherlands eScience Center 3 | # Written by Lars Buitinck. 4 | 5 | """Parsing utilities for Wikipedia database dumps.""" 6 | 7 | from __future__ import print_function 8 | 9 | import re 10 | import xml.etree.ElementTree as etree # don't use LXML, it's slower (!) 11 | 12 | 13 | def _get_namespace(tag): 14 | try: 15 | namespace = re.match(r"^{(.*?)}", tag).group(1) 16 | except AttributeError: 17 | namespace = '' 18 | if not namespace.startswith("http://www.mediawiki.org/xml/export-"): 19 | raise ValueError("namespace %r not recognized as MediaWiki dump" 20 | % namespace) 21 | return namespace 22 | 23 | 24 | def extract_pages(f): 25 | """Extract pages from Wikimedia database dump. 26 | 27 | Parameters 28 | ---------- 29 | f : file-like or str 30 | Handle on Wikimedia article dump. May be any type supported by 31 | etree.iterparse. 32 | 33 | Returns 34 | ------- 35 | pages : iterable over (int, string, string) 36 | Generates (page_id, title, content) triples. 37 | In Python 2.x, may produce either str or unicode strings. 38 | """ 39 | elems = (elem for _, elem in etree.iterparse(f, events=["end"])) 40 | 41 | # We can't rely on the namespace for database dumps, since it's changed 42 | # it every time a small modification to the format is made. So, determine 43 | # those from the first element we find, which will be part of the metadata, 44 | # and construct element paths. 45 | elem = next(elems) 46 | namespace = _get_namespace(elem.tag) 47 | ns_mapping = {"ns": namespace} 48 | page_tag = "{%(ns)s}page" % ns_mapping 49 | text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping 50 | id_path = "./{%(ns)s}id" % ns_mapping 51 | title_path = "./{%(ns)s}title" % ns_mapping 52 | 53 | for elem in elems: 54 | if elem.tag == page_tag: 55 | text = elem.find(text_path).text 56 | if text is None: 57 | # Empty article; these occur in Wikinews dumps. 58 | continue 59 | yield (int(elem.find(id_path).text), 60 | elem.find(title_path).text, 61 | text) 62 | 63 | # Prune the element tree, as per 64 | # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ 65 | # We do this only for s, since we need to inspect the 66 | # ./revision/text element. That shouldn't matter since the pages 67 | # comprise the bulk of the file. 
68 | elem.clear() 69 | 70 | 71 | if __name__ == "__main__": 72 | # Test; will write article info + prefix of content to stdout 73 | import sys 74 | 75 | if len(sys.argv) > 1: 76 | print("usage: %s; will read from standard input" % sys.argv[0], 77 | file=sys.stderr) 78 | sys.exit(1) 79 | 80 | for pageid, title, text in extract_pages(sys.stdin): 81 | title = title.encode("utf-8") 82 | text = text[:40].replace("\n", "_").encode("utf-8") 83 | print("%d '%s' (%s)" % (pageid, title, text)) 84 | -------------------------------------------------------------------------------- /semanticizer_wsgi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 4 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 5 | # General Public License as published by the Free Software Foundation, either 6 | # version 3 of the License, or (at your option) any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, but WITHOUT 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 11 | # for more details. 12 | # 13 | # You should have received a copy of the GNU Lesser General Public License 14 | # along with this program. If not, see . 15 | 16 | """ Semanticizer (WSGI version) 17 | 18 | A stripped down, WSGI compatible, version of the semanticizer. 19 | 20 | Usage: 21 | gunicorn --bind 0.0.0.0:5001 --workers 4 semanticizer_wsgi:application 22 | or 23 | uwsgi --http :5001 --master --processes 4 --wsgi-file semanticizer_wsgi.py 24 | 25 | """ 26 | 27 | import re 28 | from semanticizer.config import config_get 29 | settings = config_get(('settings'), {}) 30 | 31 | # Can do without ujson and simplejson, but speeds up considerably. 32 | try: 33 | import ujson 34 | except ImportError: 35 | pass 36 | try: 37 | import simplejson as json 38 | except ImportError: 39 | import json 40 | 41 | from flask import Flask, Response, request 42 | 43 | from semanticizer import procpipeline 44 | from semanticizer.config import config_get 45 | from semanticizer.wpm.data import init_datasource 46 | 47 | 48 | wpm_languages = config_get(('wpm', 'languages')) 49 | init_datasource(wpm_languages, settings) 50 | PIPELINE = procpipeline.build(wpm_languages) 51 | 52 | # WSGI application! 53 | application = Flask(__name__) 54 | application.debug = True 55 | 56 | 57 | APPLICATION_JSON = "application/json" 58 | 59 | # RegExens for CleanTweet 60 | CLEAN_TWEET = \ 61 | {'user': re.compile(r"(@\w+)"), 62 | 'url': re.compile(r"(http://[a-zA-Z0-9_=\-\.\?&/#]+)"), 63 | 'punctuation': re.compile(r"[-!\"#\$%&'\(\)\*\+,\.\/:;<=>\?@\[\\\]\^_`\{\|\}~]+"), 64 | 'retweet': re.compile(r"(\bRT\b)") 65 | } 66 | 67 | 68 | @application.route('/') 69 | def hello_world(): 70 | """Hello World!""" 71 | return 'Hello World!\n' 72 | 73 | 74 | @application.route('/semanticize/', methods=['GET', 'POST']) 75 | def _semanticize_handler(langcode): 76 | """ 77 | The function handling the /semanticize/ namespace. It uses 78 | the chain-of-command pattern to run all processors, using the 79 | corresponding preprocess, process, and postprocess steps. 
80 | 81 | @param langcode: The language to use in the semanticizing 82 | @return: The body of the response, in this case a json formatted list \ 83 | of links and their relevance 84 | """ 85 | # self.application.logger.debug("Semanticizing: start") 86 | text = _get_text_from_request() 87 | 88 | # self.application.logger.debug("Semanticizing text: " + text) 89 | settings = {"langcode": langcode} 90 | for key, value in request.values.iteritems(): 91 | assert key not in settings 92 | settings[key] = value 93 | 94 | sem_result = _semanticize(langcode, settings, text) 95 | json = _json_dumps(sem_result, "pretty" in settings) 96 | 97 | # self.application.logger.debug("Semanticizing: Created %d characters of JSON." \ 98 | # % len(json)) 99 | return Response(json, mimetype=APPLICATION_JSON) 100 | 101 | 102 | @application.route('/cleantweet', methods=['GET', 'POST']) 103 | def _cleantweet(): 104 | """ 105 | The function that handles the /cleantweet namespace. Will use regular 106 | expressions to completely clean a given tweet. 107 | 108 | @return: The body of the response, in this case a json formatted \ 109 | string containing the cleaned tweet. 110 | """ 111 | text = _get_text_from_request() 112 | clean_text = cleantweet(text) 113 | 114 | return _json_dumps({"cleaned_text": clean_text}) 115 | 116 | 117 | def cleantweet(text): 118 | """ 119 | Tweet cleaner/tokenizer. 120 | 121 | Uses regular expressions to completely clean, and tokenize, a 122 | given tweet. 123 | """ 124 | 125 | for cleaner in ['user', 'url', 'punctuation', 'retweet']: 126 | text = CLEAN_TWEET[cleaner].sub(" ", text) 127 | text = " ".join([w for w in re.split(r'\s+', text) if len(w) > 1]) 128 | 129 | return text 130 | 131 | 132 | def _semanticize(langcode, settings, text): 133 | """ 134 | Method that performs the actual semantization. 135 | """ 136 | links = [] 137 | 138 | for function in ("preprocess", "process", "postprocess"): 139 | for step, processor in PIPELINE: 140 | # self.application.logger.debug("Semanticizing: %s for step %s" \ 141 | # % (function, step)) 142 | (links, text, settings) = getattr(processor, function)(links, 143 | text, 144 | settings 145 | ) 146 | # self.application.logger.debug("Semanticizing: %s pipeline with %d steps \ 147 | # done" % (function, len(self.pipeline))) 148 | 149 | result = {"links": links, "text": text} 150 | 151 | return result 152 | 153 | 154 | def _json_dumps(obj, pretty=False): 155 | """ 156 | Util function to create json dumps based on an object. 
157 | 158 | @param o: Object to transform 159 | @param pretty: Whether or not to prettify the JSON 160 | @return: The JSON string 161 | """ 162 | if not pretty and "ujson" in locals(): 163 | return ujson.dumps(obj) 164 | elif not pretty: 165 | return json.dumps(obj) 166 | else: 167 | return json.dumps(obj, indent=4) 168 | 169 | def _get_text_from_request(): 170 | """ 171 | Util function to get the param called "text" from the current request 172 | 173 | @return: the value of "text" 174 | """ 175 | 176 | return request.values['text'] 177 | # content_type = request.headers['Content-Type'] 178 | # if request.method == "POST": 179 | # if content_type == 'application/x-www-form-urlencoded': 180 | # return request.form['text'] 181 | # elif content_type == 'text/plain': 182 | # return request.data 183 | # else: 184 | # abort(Response("Unsupported Content Type, use: text/plain\n", 185 | # status=415)) 186 | # elif "text" in request.args: 187 | # return request.args["text"] 188 | # else: 189 | # abort(Response("No text provided, use: POST or GET with attribute \ 190 | # 'text'\n", status=400)) 191 | 192 | 193 | if __name__ == '__main__': 194 | application.run() 195 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | pkgs = (["semanticizer"] + 6 | ["semanticizer." + sub for sub in ("processors", "redisinsert", 7 | "server", "util", "wpm")]) 8 | 9 | setup( 10 | name="semanticizer", 11 | description="Entity Linking for the masses", 12 | packages=pkgs, 13 | classifiers=[ 14 | "Intended Audience :: Science/Research", 15 | "Topic :: Scientific/Engineering", 16 | "Topic :: Scientific/Engineering :: Information Analysis", 17 | "Topic :: Text Processing", 18 | ], 19 | install_requires=[ 20 | "flask", 21 | "mock", 22 | "leven", 23 | "lxml", 24 | "networkx", 25 | "numpy", 26 | "redis>=2.8.0", 27 | "scikit-learn", 28 | "simplejson", 29 | ], 30 | ) 31 | -------------------------------------------------------------------------------- /test/TestConfig.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
13 | 14 | ''' 15 | Testsuite for the config.py module 16 | ''' 17 | # Disable check for calling protected members 18 | # pylint: disable-msg=W0212 19 | # Disable check for naming conventions that disturb setUp and tearDown 20 | # pylint: disable-msg=C0103 21 | # Disable check for too many public methods 22 | # pylint: disable-msg=R0904 23 | 24 | import unittest 25 | import config 26 | from os import remove 27 | from argparse import ArgumentTypeError 28 | from argparse import ArgumentParser 29 | from tempfile import mkstemp 30 | from ConfigParser import MissingSectionHeaderError 31 | 32 | 33 | class Test(unittest.TestCase): 34 | """Testclass for config.py""" 35 | 36 | def setUp(self): 37 | """setup the test by creating a tempfile and a test config""" 38 | self.tmpfile, self.tmpfilename = mkstemp() 39 | self.testconfig = { 40 | 'port': 6000, 41 | 'lmpath': self.tmpfilename, 42 | 'verbose': None 43 | } 44 | 45 | def tearDown(self): 46 | """Tear down by removing the tempfile created during setup""" 47 | remove(self.tmpfilename) 48 | 49 | def test_readable_path(self): 50 | """Test the function that guarantees a path given in the config 51 | is readable""" 52 | valid_path = '/' 53 | invalid_path = '/invalid/path' 54 | self.assertTrue( 55 | config._readable_path(valid_path).endswith(valid_path), 56 | "_readable_path returns an unexpected value for %s" \ 57 | % valid_path) 58 | self.assertRaises(ArgumentTypeError, 59 | config._readable_path, 60 | invalid_path) 61 | 62 | def test_writable_file(self): 63 | """Test the function that guarantees a path given in the config 64 | is writable""" 65 | valid_file = self.tmpfilename 66 | invalid_file = '/test/test/invalid' 67 | self.assertTrue( 68 | config._writable_file(valid_file).endswith(valid_file), 69 | "_writable_file returns an unexpected value for %s" \ 70 | % valid_file) 71 | self.assertRaises(ArgumentTypeError, 72 | config._writable_file, 73 | invalid_file) 74 | 75 | def test_valid_absolute_url(self): 76 | """Test the function that guarantees a value given in the config 77 | is a valid URL""" 78 | valid_url = 'http://www.google.com:890/something?param=1&else=2' 79 | invalid_url = 'ha//%st||al}avista' 80 | self.assertEqual( 81 | config._valid_absolute_url(valid_url), 82 | valid_url, 83 | "_valid_absolute_url returns an unexpected value for %s" \ 84 | % valid_url) 85 | self.assertRaises(ArgumentTypeError, 86 | config._valid_absolute_url, 87 | invalid_url) 88 | 89 | def test_get_conf_vals(self): 90 | """Test the params are being parsed as we expect""" 91 | # the expected result after parsing the config 92 | result = ["--lmpath", self.tmpfilename, "--port", "6000", "--verbose"] 93 | # writing a random line to the config file and test that ConfigParser 94 | # raises a MissingSectionHeaderError 95 | tmpfile = open(self.tmpfilename, 'w') 96 | tmpfile.write("somekey = somevalue\n") 97 | tmpfile.close() 98 | self.assertRaises(MissingSectionHeaderError, 99 | config._get_conf_vals, 100 | self.tmpfilename) 101 | # writing valid values to the config file and comparing the result to 102 | # what we expect 103 | tmpfile = open(self.tmpfilename, 'w') 104 | tmpfile.write("[generic]\n") 105 | for key, value in self.testconfig.iteritems(): 106 | if value: 107 | tmpfile.write(key + " = " + str(value) + "\n") 108 | else: 109 | tmpfile.write(key + "\n") 110 | tmpfile.close() 111 | self.assertEqual(config._get_conf_vals(self.tmpfilename), 112 | result, 113 | "_get_conf_vals doesn't create the expected list: ") 114 | 115 | def test_get_arg_parser(self): 116 | """Test we 
get a valid ArgumentParser""" 117 | self.assertIsInstance(config._get_arg_parser(), 118 | ArgumentParser, 119 | "_get_arg_parser doesn't return an instance of \ 120 | ArgumentParser") 121 | 122 | def test_set_data_and_set_conf(self): 123 | """Test the set_data and set_conf functions""" 124 | # generate and set data 125 | configuration = [] 126 | for key, value in self.testconfig.iteritems(): 127 | configuration += ["--" + key] 128 | if value: 129 | configuration += [str(value)] 130 | config.set_data(configuration) 131 | # check we can read back the data we set 132 | self.assertEqual(config.config_get(("server","port")), 133 | 6000, 134 | "can't find argument values set by set_data") 135 | self.assertEqual(config.config_get(("logging", "verbose")), 136 | True, 137 | "can't find argument values set by set_data") 138 | # check that the system exits when we give unrecognized arguments 139 | config.set_data("--some values --that --dont --exist".split()) 140 | self.assertRaises(SystemExit, config._set_conf) 141 | 142 | def test_config_get(self): 143 | """Test the most important function of the config module: config_get""" 144 | # generate and set data 145 | configuration = [] 146 | for key, value in self.testconfig.iteritems(): 147 | configuration += ["--" + key] 148 | if value: 149 | configuration += [str(value)] 150 | config.set_data(configuration) 151 | # check we can read back the data we set 152 | config.config_get(('server', 'port')) 153 | self.assertEqual(config.config_get(('server', 'port')), 154 | 6000, 155 | "can't find argument values set by set_data") 156 | self.assertEqual(config.config_get("nonexisting", None), 157 | None, 158 | "config_get doesn't return None on a nonexisting param") 159 | 160 | 161 | if __name__ == "__main__": 162 | #import sys;sys.argv = ['', 'Test.testName'] 163 | unittest.main() 164 | -------------------------------------------------------------------------------- /test/TestInputdata.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
13 | 14 | ''' 15 | Testsuite for the init.Initializer module 16 | ''' 17 | import unittest 18 | import os 19 | import inputdata 20 | 21 | from tempfile import mkstemp 22 | from textcat import NGram 23 | 24 | 25 | class Test(unittest.TestCase): 26 | 27 | def setUp(self): 28 | self.tmpfile, self.tmpfilename = mkstemp() 29 | 30 | def test_load_textcat(self): 31 | # Initialize 32 | invalid_lm_dir = os.path.dirname(self.tmpfilename) 33 | valid_lm_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 34 | "../LM.lrej2011") 35 | 36 | # ++++++++++++++++++++++++++++ 37 | # ++++++++ Run tests +++++++++ 38 | # ++++++++++++++++++++++++++++ 39 | 40 | # Fail if lm_dir isn't set 41 | self.assertRaises(TypeError, inputdata.load_textcat) 42 | 43 | # Fail if lm_dir is invalid 44 | self.assertRaises(ValueError, inputdata.load_textcat, invalid_lm_dir) 45 | 46 | # Return an NGram object if lm_dir is valid 47 | self.assertIsInstance(inputdata.load_textcat(valid_lm_dir), NGram, 48 | "_load_textcat with %s should result in a" \ 49 | % valid_lm_dir + "valid_lm_dir NGram instance." 50 | + "Does the path contain valid lm files?") 51 | 52 | 53 | if __name__ == "__main__": 54 | #import sys;sys.argv = ['', 'Test.testName'] 55 | unittest.main() 56 | -------------------------------------------------------------------------------- /test/TestMain.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | ''' 15 | Created on 13 Apr 2013 16 | 17 | @author: evert 18 | ''' 19 | import unittest 20 | 21 | 22 | class Test(unittest.TestCase): 23 | 24 | def setUp(self): 25 | pass 26 | 27 | def tearDown(self): 28 | pass 29 | 30 | @unittest.skip("not yet implemented") 31 | def test_start_server(self): 32 | pass 33 | 34 | @unittest.skip("not yet implemented") 35 | def test_init_logging(self): 36 | pass 37 | 38 | 39 | if __name__ == "__main__": 40 | #import sys;sys.argv = ['', 'Test.testName'] 41 | unittest.main() 42 | -------------------------------------------------------------------------------- /test/TestProcpipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 
10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 13 | 14 | ''' 15 | Created on 13 Apr 2013 16 | 17 | @author: evert 18 | ''' 19 | import unittest 20 | import procpipeline 21 | 22 | from mock import patch 23 | 24 | 25 | class Test(unittest.TestCase): 26 | 27 | def setUp(self): 28 | pass 29 | 30 | def tearDown(self): 31 | pass 32 | 33 | def test_build(self): 34 | pass 35 | 36 | @patch('procpipeline.SemanticizeProcessor', autospec=True, create=True) 37 | def test_load_semanticize_processor(self, mock): 38 | # Initialize 39 | 40 | # ++++++++++++++++++++++++++++ 41 | # ++++++++ Run tests +++++++++ 42 | # ++++++++++++++++++++++++++++ 43 | 44 | # Running with wikipedia_ids as None throws an AttributeException 45 | # because we access attributes 46 | self.assertRaises(AttributeError, 47 | procpipeline._load_semanticize_processor, 48 | None) 49 | 50 | # Running with a dict of zero wikipedia_ids should work fine 51 | assert procpipeline._load_semanticize_processor(dict()) 52 | 53 | # use the mocked-out SemanticizeProcessor 54 | print procpipeline._load_semanticize_processor( 55 | {'me': ['hey', 'later'], 56 | 'you': ['hi', 'bye']}) 57 | 58 | @unittest.skip("not yet implemented") 59 | def test_load_features(self): 60 | # Initialize 61 | 62 | # ++++++++++++++++++++++++++++ 63 | # ++++++++ Run tests +++++++++ 64 | # ++++++++++++++++++++++++++++ 65 | pass 66 | 67 | 68 | if __name__ == "__main__": 69 | #import sys;sys.argv = ['', 'Test.testName'] 70 | unittest.main() 71 | -------------------------------------------------------------------------------- /test/TestServer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012-2013, University of Amsterdam. This program is free software: 2 | # you can redistribute it and/or modify it under the terms of the GNU Lesser 3 | # General Public License as published by the Free Software Foundation, either 4 | # version 3 of the License, or (at your option) any later version. 5 | # 6 | # This program is distributed in the hope that it will be useful, but WITHOUT 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 9 | # for more details. 10 | # 11 | # You should have received a copy of the GNU Lesser General Public License 12 | # along with this program. If not, see . 
13 | 14 | ''' 15 | Created on 13 Apr 2013 16 | 17 | @author: evert 18 | ''' 19 | import unittest 20 | 21 | 22 | class Test(unittest.TestCase): 23 | 24 | def setUp(self): 25 | pass 26 | 27 | def tearDown(self): 28 | pass 29 | 30 | @unittest.skip("not yet implemented") 31 | def test_set_debug(self): 32 | pass 33 | 34 | @unittest.skip("not yet implemented") 35 | def test_json_dumps(self): 36 | pass 37 | 38 | @unittest.skip("not yet implemented") 39 | def test_get_text_from_request(self): 40 | pass 41 | 42 | @unittest.skip("not yet implemented") 43 | def test_setup_route_semanticize(self): 44 | pass 45 | 46 | def test_setup_route_language(self): 47 | pass 48 | 49 | @unittest.skip("not yet implemented") 50 | def test_setup_route_inspect(self): 51 | pass 52 | 53 | @unittest.skip("not yet implemented") 54 | def test_setup_all_routes(self): 55 | pass 56 | 57 | @unittest.skip("not yet implemented") 58 | def test_start(self): 59 | pass 60 | 61 | @unittest.skip("not yet implemented") 62 | def test_autolang_semanticize(self): 63 | pass 64 | 65 | @unittest.skip("not yet implemented") 66 | def test_semanticize(self): 67 | pass 68 | 69 | @unittest.skip("not yet implemented") 70 | def test_remove_stopwords(self): 71 | pass 72 | 73 | @unittest.skip("not yet implemented") 74 | def test_cleantweet(self): 75 | pass 76 | 77 | @unittest.skip("not yet implemented") 78 | def test_language(self): 79 | pass 80 | 81 | @unittest.skip("not yet implemented") 82 | def test_inspect(self): 83 | pass 84 | 85 | 86 | if __name__ == "__main__": 87 | #import sys;sys.argv = ['', 'Test.testName'] 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/semanticize/semanticizer/af97457cca2f0586a525e001dbb0513f96a916b2/test/__init__.py --------------------------------------------------------------------------------
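
A minimal client sketch for the WSGI service in semanticizer_wsgi.py, useful as a quick smoke test once the Wikipedia Miner data for a language has been loaded (see the conf/*.yml examples). The host, the port (5001, as in the gunicorn example in the module docstring), the language code, and the file name client_example.py are assumptions that depend on the actual deployment, not part of the repository.

# client_example.py -- hypothetical helper, not part of the repository.
# Assumes the semanticizer WSGI app is reachable on localhost:5001 and that
# data for the 'en' language code has been loaded.
import json
import urllib
import urllib2


def semanticize(text, langcode="en", host="http://localhost:5001"):
    """POST `text` to /semanticize/<langcode> and return the parsed JSON."""
    data = urllib.urlencode({"text": text})
    response = urllib2.urlopen("%s/semanticize/%s" % (host, langcode), data)
    return json.loads(response.read())


if __name__ == "__main__":
    result = semanticize("Amsterdam is the capital of the Netherlands.")
    # The response is a dict with the original "text" and a list of "links".
    for link in result["links"]:
        print link

Because the handler reads its input through Flask's request.values, the same helper also works with a GET query string instead of a form-encoded POST body.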