├── .gitignore ├── .travis.yml ├── LICENSE.md ├── Makefile ├── README.md ├── README.rst ├── bookwormDB ├── CreateDatabase.py ├── MetaParser.py ├── SQLAPI.py ├── __init__.py ├── benchmark.md ├── bin │ ├── dbbindings-flask.py │ ├── dbbindings.py │ └── logParser.py ├── bwExceptions.py ├── configuration.py ├── convertTSVtoJSONarray.py ├── countManager.py ├── general_API.py ├── json_schema.py ├── manager.py ├── mariaDB.py ├── multiprocessingHelp.py ├── schema_primitives.py ├── scripts │ ├── fast_featurecounter.sh │ └── mergecounted.awk ├── search_limits.py ├── sqliteKV.py ├── tokenizer.py ├── variableSet.py └── wsgi.py ├── demos ├── .ipynb_checkpoints │ ├── Reading Binary data-checkpoint.ipynb │ └── Untitled-checkpoint.ipynb └── Untitled.ipynb ├── setup.cfg ├── setup.py └── tests ├── setup.py ├── test_API.py ├── test_bookworm_files ├── field_descriptions.json ├── input.txt ├── jsoncatalog.txt └── test_bookworm_metadata.tsv ├── test_bookworm_files_unicode ├── field_descriptions.json ├── input.txt └── jsoncatalog.txt ├── test_config.py └── test_mysql.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.out 2 | *.log 3 | *.png 4 | *~ 5 | *# 6 | *.tmp 7 | kyoto* 8 | .DS_Store 9 | *.pyc 10 | *.RData 11 | HistoryDiss 12 | HistoryDiss/* 13 | *.tar.gz 14 | tmp.cron 15 | setup.sh 16 | startup.sh 17 | etc/numpy-1.6.2 18 | oldfiles 19 | files 20 | onefile.txt 21 | downloads 22 | newspapers.rdf 23 | extensions 24 | bookworm.cnf 25 | .# 26 | tmp.cron 27 | tmp.tsv 28 | tmp.txt 29 | tmp.txt 30 | .#* 31 | build 32 | old/* 33 | *~ 34 | APIkeys 35 | #* 36 | .#* 37 | .DS_Store 38 | *.cgi 39 | genderizer* 40 | *.pyc 41 | ~/.pypirc 42 | .pypirc 43 | bookwormDB.egg-info 44 | MANIFEST 45 | dist 46 | tests/test_bookworm_files/.bookworm/ 47 | tests/test_bookworm_files/http_server/ 48 | tests/test_bookworm_files_unicode/.bookworm 49 | *.bak 50 | tests/test_bookworm_metadata.tsv 51 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.6" 5 | 6 | dist: trusty 7 | 8 | services: 9 | - mysql 10 | 11 | addons: 12 | apt: 13 | packages: 14 | - mysql-server-5.6 15 | - mysql-client-core-5.6 16 | - mysql-client-5.6 17 | 18 | install: 19 | - pip install . 20 | 21 | script: 22 | - cd tests && python test_mysql.py && python test_API.py 23 | 24 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Benjamin Schmidt, Matt Nicklay, Billy Janitsch, and Erez Lieberman Aiden 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | 
-------------------------------------------------------------------------------- /Makefile: --------------------------------------------------------------------------------
1 | # This makefile handles some python package management activities.
2 | 
3 | 
4 | all: README.rst
5 | 
6 | README.rst: README.md
7 | 	pandoc -o $@ $<
8 | 
9 | clean:
10 | 	rm -rf dist
11 | 
12 | dist:
13 | 	python setup.py sdist
14 | 	python setup.py bdist_wheel
15 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | [![Travis Build Status](https://travis-ci.org/Bookworm-project/BookwormDB.svg?branch=master)](https://travis-ci.org/Bookworm-project/BookwormDB)
2 | 
3 | [BookwormDB](https://github.com/bookworm-project/BookwormDB "BookwormDB") is the main code repository for the Bookworm project. Given simply formatted files and metadata, it creates an efficient and easily queryable MySQL database that can make full use of all the metadata and lexical data in the original source. It also includes a powerful API for asking a variety of unigrammatic queries about that data.
4 | 
5 | A quick walkthrough is included below; other documentation is at [bookworm.culturomics.org](http://bookworm.culturomics.org) and in a [Bookworm Manual](http://bookworm-project.github.io/Docs) on this repository (editable at the repo [here](https://github.com/Bookworm-project/Docs)).
6 | 
7 | # Installation
8 | 
9 | Installation is tested on Ubuntu and OS X. It may work on other Unixes, but will probably not work on Windows.
10 | 
11 | 1. Install some dependencies: mysql or mariadb for the database.
12 | 2. Download the latest release, either by cloning this git repo or downloading a zip.
13 | 3. Navigate to the folder in the terminal, and type `pip install .`.
14 | 4. Type `bookworm --help` to confirm the executable has worked. If this doesn't work, file
15 | a bug report.
16 | 5. (No longer?) Type `bookworm config mysql` for some interactive prompts to allow Bookworm to edit MySQL databases on your server. (Note that this makes some other changes to your mysql configuration files; you may want to copy them first if you're using it for other things.)
17 | 
18 | ## Releases
19 | 
20 | The `master` branch is regularly tested on Travis; you are generally best off installing the latest version.
21 | 
22 | ## Related projects
23 | 
24 | This builds a database and implements the Bookworm API on a particular set of texts.
25 | 
26 | Some basic, widely appealing visualizations of the data are possible with the Bookworm [web app](https://github.com/bookworm-project/BookwormGUI "Bookworm web app"), which runs on top of the API.
27 | 
28 | A more wide-ranging set of visualizations, built on top of D3, is available in the [Bookworm D3 package](http://github.com/bmschmidt/BookwormD3).
29 | If you're looking to develop on top of Bookworm, that presents a much more flexible set of tools.
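
Developing against the API directly is also an option: any HTTP client can pull data from a running Bookworm server. A minimal sketch, assuming the walkthrough's `txtlab450` bookworm (built below) is live on port 10012 via `bookworm serve`, and that the third-party `requests` library is installed:

```python
import json
import requests

# The same query the walkthrough below issues from a web browser.
query = {
    "database": "txtlab450",
    "method": "data",
    "format": "json",
    "groups": ["date", "language"],
    "counttype": ["TextCount", "WordCount"],
}
# `bookworm serve` reads the query JSON from the q parameter.
resp = requests.get("http://localhost:10012/", params={"q": json.dumps(query)})
print(resp.text)
```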
30 | 
31 | ## Bookworms ##
32 | Here are a few Bookworms built using [BookwormDB](https://github.com/bookworm-project/BookwormDB "Bookworm"):
33 | 
34 | 1. [Open Library](http://bookworm.culturomics.org/OL/ "Open Library")
35 | 2. [ArXiv](http://bookworm.culturomics.org/arxiv/ "ArXiv")
36 | 3. [Chronicling America](http://arxiv.culturomics.org/ChronAm/ "Chronicling America")
37 | 4. [SSRN](http://bookworm.culturomics.org/ssrn/ "SSRN: Social Science Research Network")
38 | 5. [US Congress](http://bookworm.culturomics.org/congress/ "Bills in US Congress")
39 | 6. [Rate My Professor Gendered Language](http://benschmidt.org/profGender)
40 | 
41 | 
42 | ## Getting Started ##
43 | 
44 | ### Docker
45 | 
46 | We're working on docker containerization. Help appreciated. Contact `bs 145 at nyu dot edu`,
47 | no spaces involved.
48 | 
49 | ### Required MySQL Database ###
50 | 
51 | You must have a MySQL database set up that you can log into with admin access,
52 | probably with a `my.cnf` file at ~/.my.cnf. Depending on your platform, this
53 | can be a little tricky to set up.
54 | 
55 | Bookworm will automatically create a select-only user that handles web queries,
56 | preventing any malicious actions through the API.
57 | 
58 | There is a command `bookworm config mysql` that will interactively update
59 | certain settings in your global my.cnf. It may need to be run with admin privileges.
60 | 
61 | 
62 | Bookworm by default tries to log on with admin privileges with the following preferences:
63 | 
64 | ```
65 | [client]
66 | host = 127.0.0.1
67 | user = root
68 | password = ''
69 | 
70 | ```
71 | 
72 | But it also looks in several locations--`~/.my.cnf`, `~/my.cnf`, and `/etc/bookworm/admin.cnf`--for other passwords
73 | (I don't have an empty root password on my local MySQL server!).
74 | It updates the host, user, and password with values from each of those files,
75 | in that order, if they exist.
76 | 
77 | The command `bookworm config mysql-info` shows you what password and host it's
78 | trying to use.
79 | 
80 | In addition to the username and password, the host matters as well.
81 | Depending on setup, 'localhost' and '127.0.0.1' mean different things to mysql
82 | (the former is a socket, the latter a port). Depending on exactly how you're
83 | invoking mysql, you may need to use one or the other to communicate.
84 | For instance, your root account might not have login privileges through
85 | 127.0.0.1, just at localhost--it depends on exactly how the server is invoked.
86 | 
87 | To debug mysql permissions issues, type `mysql -u $USER -h 127.0.0.1 -p` at the prompt
88 | and enter your password. Once you have confirmed that this brings up a mysql prompt that
89 | can grant privileges, copy those credentials into a file at `~/.my.cnf` (or, if
90 | you're able, `/etc/bookworm/admin.cnf`)
91 | in the format given by `bookworm config mysql-info` (or the above block).
92 | 
93 | 
94 | 
95 | ## The query API
96 | 
97 | This distribution also includes two files, general_API.py and SQLAPI.py,
98 | which together constitute an implementation of the API for Bookworm, written in Python.
99 | It primarily implements the API on a MySQL database now,
100 | but includes classes for more easily implementing it on top of other platforms (such as Solr).
101 | 
102 | It is used with the [Bookworm GUI](https://github.com/Bookworm-project/BookwormGUI)
103 | and can also be used as a standalone tool to query data from your database.
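
For a concrete sense of what a standalone query looks like, here is a minimal sketch (it assumes the `txtlab450` bookworm built in the walkthrough below, and shells out to the CLI invocation described next):

```python
import json
import subprocess

# Count texts and words in the txtlab450 bookworm, grouped by language.
query = {
    "database": "txtlab450",
    "method": "data",
    "format": "json",
    "groups": ["language"],
    "counttype": ["TextCount", "WordCount"],
}
result = subprocess.run(["bookworm", "query", json.dumps(query)],
                        capture_output=True, text=True)
print(result.stdout)
```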
To run the API in its most basic form, type `bookworm query $string`,
105 | where $string is a json-formatted query. In general, query performance will be
106 | faster through bookworm's API server process, which you can start by typing `bookworm serve`
107 | and querying over port 10012.
108 | 
109 | While the point of the command-line tool `bookworm` is generally to *create* a Bookworm, the point of the query API is to retrieve results from it.
110 | 
111 | For a more interactive explanation of how the GUI works, see the [Vega-Bookworm project sandbox].
112 | 
113 | Walkthrough
114 | ===========
115 | 
116 | These are some instructions on how to build a bookworm.
117 | 
118 | We'll use a collection of 450 novels in 3 languages:
119 | 
120 | Piper, Andrew (2016): txtlab Multilingual Novels. figshare.
121 | 
122 | ### Download and unzip the files.
123 | 
124 | ```
125 | wget https://ndownloader.figshare.com/files/3686805
126 | wget https://ndownloader.figshare.com/files/3686778
127 | unzip 3686778
128 | 
129 | ```
130 | ### Create catalog and text files.
131 | 
132 | For this set, a simple python script suffices to build the
133 | two needed files, using the txtlab's files. Paste this into `parse.py`.
134 | 
135 | ```python
136 | import pandas as pd
137 | import json
138 | output = open("input.txt", "w")
139 | catalog = open("jsoncatalog.txt", "w")
140 | for book in pd.read_csv("3686805").to_dict(orient="records"):
141 |     try:
142 |         fulltext_lines = open(f"2_txtalb_Novel450/{book['filename']}").readlines()
143 |         # Bookworm reserves newline and tab characters, so they are stripped before writing.
144 |         fulltext = "\f".join(fulltext_lines)
145 |         fulltext = fulltext.replace("\r", " ").replace("\n", " ").replace("\t", " ")
146 |         book['filename'] = str(book['id'])
147 |         output.write(f"{book['filename']}\t{fulltext}\n")
148 |         book['searchstring'] = book['title'] + ' ' + book['author']
149 |         catalog.write(json.dumps(book) + "\n")
150 |     except FileNotFoundError:
151 |         # This dataset has errors!
152 |         continue
153 | ```
154 | 
155 | ```sh
156 | python parse.py
157 | ```
158 | 
159 | Create a bookworm.cnf file in the folder. (This isn't always necessary; usually
160 | it can just infer the database name from your current directory.)
161 | ```
162 | printf "[client]\ndatabase=txtlab450\n" > bookworm.cnf
163 | ```
164 | 
165 | ### Initialize the Bookworm
166 | 
167 | ```
168 | bookworm init
169 | bookworm build all
170 | ```
171 | 
172 | ### Required files
173 | 
174 | #### Required files 1: full text of each file with an identifier.
175 | 
176 | * `input.txt`
177 | 
178 | In this format, each line consists of the file's unique identifier, followed by a tab, followed by the **full text** of that file. Note that you'll have to strip out all newlines and returns from original documents. In the event that an identifier is used twice, behavior is undefined.
179 | 
180 | By changing the makefile, you can also do some more complex substitutions. (See the metadata parsers for an example of a Bookworm that directly reads hierarchical, bzipped directories without decompressing first.)
181 | 
182 | #### Required files 2: Metadata about each file.
183 | 
184 | * `jsoncatalog.txt` with one JSON object per line ("newline-delimited json" format).
185 | The keys represent shared metadata for each file; the values represent the entry for that particular document. There should be no newline or tab characters in this file.
186 | 
187 | In addition to the metadata you choose, two fields are required:
188 | 
189 | 1. 
A `searchstring` field that contains valid HTML which will be served to the user to identify the text.
190 |    * This can be a link, or simply a description of the field. If you have a URL where the text can be read, it's best to include it inside an `<a>` tag; otherwise, you can just put in any text field you want in the process of creating the jsoncatalog.txt file: something like author and title is good.
191 | 
192 | 2. A `filename` field that includes a unique identifier for the document (linked to the filename or the identifier, depending on your input format).
193 | 
194 | **Note that the python script above does both of these at once.**
195 | 
196 | #### Required Files 3: Metadata about the metadata.
197 | 
198 | Now create a file named `field_descriptions.json`, which is used to define the type of variable for each variable in `jsoncatalog.txt`.
199 | 
200 | Currently, you **do** have to include a `searchstring` definition in this, but **should not** include a filename definition.
201 | 
202 | ## Running ##
203 | 
204 | For a first run, you just want to use `bookworm init` to create the entire database (if you want to rebuild parts of a large bookworm--the metadata, for example--that is also possible).
205 | 
206 | ```
207 | bookworm init
208 | ```
209 | 
210 | This will walk you through the process of choosing a name for your database.
211 | 
212 | Then to build the bookworm, type
213 | 
214 | ```
215 | bookworm build all
216 | ```
217 | 
218 | Depending on the total number and average size of your texts,
219 | this could take a while. Sit back and relax.
220 | 
221 | Finally, you want to implement the API and see some results.
222 | 
223 | Type
224 | 
225 | ```
226 | bookworm serve
227 | ```
228 | 
229 | to start a process on port 10012 that responds to queries.
230 | This daemon must run continuously.
231 | 
232 | Then you can access query results over HTTP. Try visiting this page in a web browser:
233 | 
234 | `http://localhost:10012/?q={%22database%22:%22txtlab450%22,%22method%22:%22data%22,%22format%22:%22csv%22,%22groups%22:[%22date%22,%20%22language%22],%22counttype%22:[%22TextCount%22,%22WordCount%22]}`
235 | 
236 | 
237 | Once this works, you can use various libraries to query the endpoint,
238 | or create an HTML page that builds off the endpoint. See
239 | the (currently underdeveloped) Bookworm-Vega repository for some examples.
240 | 
241 | 
242 | ## Production servers
243 | 
244 | Serving from localhost:10012 won't work especially well in production contexts.
245 | Heavy-duty web servers do rate limiting and other things that the gunicorn process
246 | bookworm uses doesn't handle.
247 | 
248 | One strategy is to serve the web site (using bookworm-vega or something else)
249 | over port 80, while passing all CGI requests through to port 10012, where the
250 | bookworm server handles them. (Note that this may disable *other* CGI services
251 | on that particular server.)
252 | 
253 | This means it's possible to run the bookworm server anywhere, and then just
254 | forward the connection to your web host using ssh tunnels. (Note that doing so
255 | may be inefficient, because it adds an extra layer of packet encoding. I'm open
256 | to better solutions here.)
257 | 
258 | ### Apache
259 | 
260 | The steps for Apache are:
261 | 
262 | 1. Serve the Bookworm API over port 10012 (`bookworm serve`).
263 | 2. Install an Apache host on port 80.
264 | 3. Enable proxy servers and turn off any existing CGI.
**If you were previously using the CGI bookworm.**
266 | `sudo a2dismod cgi`
267 | `sudo a2enmod proxy proxy_ajp proxy_http rewrite deflate headers proxy_balancer proxy_connect proxy_html`
268 | 4. Add the following to your '/etc/apache2/sites-available/000-default.conf'
269 | (or whatever site from which you run your apache) to pass cgi-bin queries
270 | to the bookworm server.
271 | ```
272 | <Proxy *>
273 | Order deny,allow
274 | Allow from all
275 | </Proxy>
276 | ProxyPreserveHost On
277 | <Location "/cgi-bin">
278 | ProxyPass "http://127.0.0.1:10012/"
279 | ProxyPassReverse "http://127.0.0.1:10012/"
280 | </Location>
281 | ```
282 | 
-------------------------------------------------------------------------------- /README.rst: --------------------------------------------------------------------------------
1 | |Travis Build Status|
2 | 
3 | `BookwormDB <https://github.com/Bookworm-project/BookwormDB>`__ is the
4 | main code repository for the Bookworm project. Given simply formatted
5 | files and metadata, it creates an efficient and easily queryable MySQL
6 | database that can make full use of all the metadata and lexical data in
7 | the original source. It also includes a powerful API for asking a
8 | variety of unigrammatic queries about that data.
9 | 
10 | A quick walkthrough is included below; other documentation is at
11 | `bookworm.culturomics.org <http://bookworm.culturomics.org>`__ and in a `Bookworm
12 | Manual <http://bookworm-project.github.io/Docs>`__ on this repository
13 | (editable at the repo
14 | `here <https://github.com/Bookworm-project/Docs>`__).
15 | 
16 | Installation
17 | ============
18 | 
19 | Installation is tested on Ubuntu and OS X. It may work on other Unixes,
20 | but will not work on Windows.
21 | 
22 | 1. Install some dependencies; mysql or mariadb for databases, and GNU
23 |    parallel for parallel processing.
24 | 2. Download the latest release, either by cloning this git repo or
25 |    downloading a zip.
26 | 3. Navigate to the folder in the terminal, and type ``pip install .``.
27 | 
28 |    - If ``/usr/lib/cgi-bin`` is not writeable by your account, you may
29 |      need to type ``sudo pip install .``
30 | 
31 | 4. Type ``bookworm --help`` to confirm the executable has worked. If
32 |    this doesn't work, file a bug report.
33 | 5. Type ``bookworm config mysql`` for some interactive prompts to allow
34 |    Bookworm to edit MySQL databases on your server. (Note that this
35 |    makes some other changes to your mysql configuration files; you may
36 |    want to copy them first if you're using it for other things.)
37 | 
38 | Releases
39 | --------
40 | 
41 | The ``master`` branch is regularly tested on Travis; you are generally
42 | best off installing the latest version.
43 | 
44 | Related projects
45 | ----------------
46 | 
47 | This builds a database and implements the Bookworm API on a particular set
48 | of texts.
49 | 
50 | Some basic, widely appealing visualizations of the data are possible
51 | with the Bookworm `web
52 | app <https://github.com/bookworm-project/BookwormGUI>`__, which runs on
53 | top of the API.
54 | 
55 | A more wide-ranging set of visualizations is available, built on top of
56 | D3, in the `Bookworm D3
57 | package <http://github.com/bmschmidt/BookwormD3>`__. If you're looking
58 | to develop on top of Bookworm, that presents a much more flexible set of
59 | tools.
60 | 
61 | Bookworms
62 | ---------
63 | 
64 | Here are a few Bookworms built using
65 | `BookwormDB <https://github.com/Bookworm-project/BookwormDB>`__:
66 | 
67 | 1. `Open Library <http://bookworm.culturomics.org/OL/>`__
68 | 2. `ArXiv <http://bookworm.culturomics.org/arxiv/>`__
69 | 3. `Chronicling America <http://arxiv.culturomics.org/ChronAm/>`__
70 | 4. `SSRN <http://bookworm.culturomics.org/ssrn/>`__
71 | 5. `US Congress <http://bookworm.culturomics.org/congress/>`__
72 | 6. 
`Rate My Professor Gendered
73 |    Language <http://benschmidt.org/profGender>`__
74 | 
75 | Getting Started
76 | ---------------
77 | 
78 | Required MySQL Database
79 | ~~~~~~~~~~~~~~~~~~~~~~~
80 | 
81 | The hardest part about setting up Bookworm is properly configuring the
82 | MySQL installation. The easiest way to test out Bookworm on your home
83 | computer may be to use a VM running Ubuntu; installation is relatively
84 | easy using OS X, as well.
85 | 
86 | At the very least, there must be a MySQL user with permissions to insert
87 | + select data from all databases. The easiest way to handle this is to
88 | have a user with root access defined in your system-wide MySQL
89 | configuration files.
90 | 
91 | This creates a bit of a security risk, though, so we recommend 2 MySQL
92 | users: an admin user with the ability to create new databases (i.e.
93 | GRANT ALL) and a second user that is only able to select data from
94 | databases (i.e. GRANT SELECT). This is for security: your data is safer
95 | if the web user can't modify it at all.
96 | 
97 | Setting up databases automatically
98 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
99 | 
100 | Running ``bookworm config mysql`` will take care of most of these tasks
101 | through an interactive prompt; ``bookworm config --force mysql`` will
102 | come up with automatic answers for all questions, and is suitable in a
103 | scripting situation (like setting up a variety of VMs). The
104 | configuration script will ask for a variety of different passwords, and
105 | may request an administrative password from the machine.
106 | 
107 | Restart your MySQL server after the configuration script has run.
108 | 
109 | If you encounter problems in the config script, please feel free to post
110 | issues to the project github repo.
111 | 
112 | Setting up databases manually
113 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
114 | 
115 | If you have an existing MySQL configuration you do not want to risk
116 | hurting, you may want to proceed by hand.
117 | 
118 | First, create an admin user:
119 | 
120 | For example, create a user ``foobar`` with password ``mysecret`` and
121 | full access to all databases from ``localhost``:
122 | 
123 | .. code:: mysql
124 | 
125 |     CREATE USER 'foobar'@'localhost' IDENTIFIED BY 'mysecret';
126 |     GRANT ALL PRIVILEGES ON *.* TO 'foobar'@'localhost' WITH GRANT OPTION;
127 |     FLUSH PRIVILEGES;
128 | 
129 | Then put the credentials for that user in a file that will be
130 | automatically read when you start up MySQL. The best place for this is
131 | at ``~/.my.cnf``. If multiple users on your machine will be
132 | administering the bookworm, another recommended location is
133 | ``/etc/bookworm/admin.cnf``. In this example, that file would look like
134 | this:
135 | 
136 | ::
137 | 
138 |     [client]
139 |     user = 'foobar'
140 |     password = 'mysecret'
141 | 
142 | The second user would be the user that the API uses to get data for
143 | queries over the web. Note that this user has only "select" rights.
144 | 
145 | .. code:: mysql
146 | 
147 |     GRANT SELECT ON *.* TO 'bookworm_client'@'localhost' IDENTIFIED BY 'otherpassword';
148 |     FLUSH PRIVILEGES;
149 | 
150 | Then add a section to your **systemwide** ``my.cnf`` file (usually at
151 | ``/etc/mysql/my.cnf``, ``/etc/my.cnf``, or a similar location).
152 | 
153 | ::
154 | 
155 |     [client]
156 |     user = 'bookworm_client'
157 |     password = 'otherpassword'
158 | 
159 | With these settings in place, you're ready to begin building a Bookworm.
160 | See `the walkthrough <#walkthrough>`__ for a fuller example.
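
If you want to confirm that Bookworm will find working credentials, a
minimal sketch in Python (this assumes the MySQLdb module that Bookworm
itself depends on, and an admin file at one of the locations above;
adjust the path to wherever yours lives):

.. code:: python

    import MySQLdb

    # read_default_file points MySQLdb at the same [client] section
    # (user/password/host) that the bookworm command reads.
    conn = MySQLdb.connect(read_default_file="/etc/bookworm/admin.cnf")
    cur = conn.cursor()
    cur.execute("SELECT CURRENT_USER()")
    print(cur.fetchone())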
161 | 
162 | The query API
163 | -------------
164 | 
165 | This distribution also includes two files, general\_API.py and
166 | SQLAPI.py, which together constitute an implementation of the API for
167 | Bookworm, written in Python. It primarily implements the API on a MySQL
168 | database now, but includes classes for more easily implementing it on
169 | top of other platforms (such as Solr).
170 | 
171 | It is used with the `Bookworm
172 | GUI <https://github.com/Bookworm-project/BookwormGUI>`__ and can also be
173 | used as a standalone tool to query data from your database. To run the
174 | API in its most basic form, type ``bookworm query $string``, where
175 | $string is a json-formatted query.
176 | 
177 | An executable is bundled in the distro at
178 | ``bookwormDB/bin/dbbindings.py`` that, when placed in your cgi-bin
179 | folder, will serve the API to and from the web; when you install
180 | bookworm, it attempts to move this into a web directory for you.
181 | 
182 | While the point of the command-line tool ``bookworm`` is generally to
183 | *create* a Bookworm, the point of the query API is to retrieve results
184 | from it.
185 | 
186 | For a more interactive explanation of how the GUI works, see the `D3
187 | bookworm browser `__ (Sorry, this
188 | is broken for the moment).
189 | 
190 | Installing the API.
191 | ~~~~~~~~~~~~~~~~~~~
192 | 
193 | On most systems, ``pip install .`` in the ``bookwormDB`` dir should
194 | deposit a copy in an appropriate location on your system (such as
195 | ``/usr/lib/cgi-bin``).
196 | 
197 | If that doesn't work, just run
198 | ``cp bookwormDB/bin/dbbindings.py /usr/lib/cgi-bin`` (exact locations
199 | may vary) to put it in the correct location.
200 | 
201 | If using homebrew on OS X, the shebang at the beginning of
202 | ``dbbindings.py`` may be incorrect. (It will not load your installed
203 | python modules). Change it from ``#!/usr/bin/env python`` to
204 | ``#!/usr/local/bin/python``, and it should work. (Or you can fix the
205 | PYTHONPATH that apache uses as `described
206 | here `__, but
207 | that is considerably harder than just changing the bookworm code.)
208 | 
209 | Walkthrough
210 | ===========
211 | 
212 | These are some instructions on how to build a bookworm.
213 | 
214 | Indented bits tell you how to build one specific bookworm using `text
215 | from the summaries of
216 | bills `__ introduced
217 | in the US Congress from 1973 to the present day. The goal is to
218 | provide everything needed to build a Bookworm using publicly
219 | available data.
220 | 
221 | Get the Data
222 | ------------
223 | 
224 | First off, you need a collection of texts to analyze. Ideally this
225 | should be more than 1000 individual texts, with some year (or other
226 | time) description.
227 | 
228 | To download the congress data, Matt Nicklay has put together a
229 | script in another repo that will download everything you'll need.
230 | Clone that repo and run ``get_and_unzip_data.py`` to fetch and unzip
231 | the data:
232 | 
233 | ::
234 | 
235 |     git clone git://github.com/bmschmidt/congress_api
236 |     cd congress_api
237 |     python get_and_unzip_data.py
238 | 
239 | This will take a few minutes depending on your Internet connection
240 | and the speed of your computer. The ``get_and_unzip_data.py`` script
241 | simply downloads and unzips all the files in parallel using
242 | `multiprocessing `__.
243 | NOTE: Once fully unzipped, the files will take up just under 3GB of
244 | disk space.
245 | 
246 | Prep to Build Bookworm
247 | ----------------------
248 | 
249 | If you haven't already, install this repo on your system.
250 | 
251 | ::
252 | 
253 |     git clone git://github.com/Bookworm-project/BookwormDB
254 |     cd BookwormDB
255 |     python setup.py install
256 | 
257 | Required Files
258 | ~~~~~~~~~~~~~~
259 | 
260 | To build a bookworm, you need to create three files in the directory you
261 | plan to use. You can have whatever other files you want in the root
262 | directory. But these three names are reserved for bookworm use.
263 | 
264 | ::
265 | 
266 |     congress/
267 |     |   input.txt
268 |     |   jsoncatalog.txt
269 |     |   field_descriptions.json
270 | 
271 | Required files 1: input.txt:
272 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
273 | 
274 | The first is slightly more complicated than it appears. It contains the
275 | various files you'll be reading in as unicode text. These can be input
276 | in one of three ways.
277 | 
278 | The first, which will be faster in most cases, is as a *single file*.
279 | 
280 | - ``input.txt``
281 | 
282 | In this format, each line consists of the file's unique identifier,
283 | followed by a tab, followed by the **full text** of that file. Note that
284 | you'll have to strip out all newlines and returns from original
285 | documents. In the event that an identifier is used twice, behavior is
286 | undefined.
287 | 
288 | By changing the makefile, you can also do some more complex
289 | substitutions. (See the metadata parsers for an example of a Bookworm
290 | that directly reads hierarchical, bzipped directories without
291 | decompressing first).
292 | 
293 | **Format 2** is as a directory of files:
294 | 
295 | - ``input/``
296 | 
297 | This folder should contain a uniquely named .txt file for every item in
298 | your collection of texts that you want to build a bookworm around. The
299 | files may be stored in subdirectories: if so, their identifier key
300 | should include the full path to the file (but not the trailing '.txt').
301 | (NOTE: this is currently unimplemented)
302 | 
303 | **Format 3** is as a shell script named
304 | 
305 | - ``input_script``
306 | 
307 | That script, when executed, should output a stream formatted the same as
308 | input.txt. In some cases, this will allow you to save a lot of disk space
309 | and/or time. It must be executable and have a shebang on the first line
310 | designating the interpreter. (NOTE: currently unimplemented).
311 | 
312 | To build the congress API, we must create an ``input.txt`` file with
313 | raw text from summaries of bills introduced into Congress. Each line
314 | contains a unique ID and the text from the summary of a single bill.
315 | Then, we will create the ``jsoncatalog.txt`` file which will hold
316 | metadata for each bill, including a field that links each JSON
317 | object to a line in input.txt. Included in the
318 | `congress\_api <https://github.com/bmschmidt/congress_api>`__ repo is
319 | a script ``congress_parser.py`` which we'll run to create
320 | ``jsoncatalog.txt`` and the ``input.txt`` file.
321 | 
322 | ::
323 | 
324 |     cd congress_api
325 |     python congress_parser.py
326 | 
327 | Required files 2: Metadata about each file.
328 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
329 | 
330 | - ``jsoncatalog.txt`` with one JSON object per line. The keys represent
331 |   shared metadata for each file: the values represent the entry for
332 |   that particular document. There should be no new line or tab
333 |   characters in this file.
334 | 
335 | In addition to the metadata you choose, two fields are required:
336 | 
337 | 1. 
A ``searchstring`` field that contains valid HTML which will be 338 | served to the user to identify the text. 339 | 340 | - This can be a link, or simply a description of the field. If you have 341 | a URL where the text can be read, it's best to include it inside an 342 | tag: otherwise, you can just put in any text field you want in the 343 | process of creating the jsoncatalog.txt file: something like author 344 | and title is good. 345 | 346 | 2. A ``filename`` field that includes a unique identifier for the 347 | document (linked to the filename or the identifier, depending on your 348 | input format). 349 | 350 | Congress users have already created this file in the previous step. 351 | 352 | Required Files 3: Metadata about the metadata. 353 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 354 | 355 | Now create a file in the ``field_descriptions.json`` which is used to 356 | define the type of variable for each variable in ``jsoncatalog.txt``. 357 | 358 | Currently, you **do** have to include a ``searchstring`` definition in 359 | this, but **should not** include a filename definition. 360 | 361 | For the Congress demo, copy the following JSON object into 362 | ``field_descriptions.json``: 363 | 364 | .. code:: json 365 | 366 | [ 367 | {"field":"date","datatype":"time","type":"numeric","unique":true,"derived":[{"resolution":"month"}]}, 368 | {"field":"searchstring","datatype":"searchstring","type":"text","unique":true}, 369 | {"field":"enacted","datatype":"categorical","type":"text","unique":false}, 370 | {"field":"sponsor_state","datatype":"categorical","type":"text","unique":false}, 371 | {"field":"cosponsors_state","datatype":"categorical","type":"text","unique":false}, 372 | {"field":"chamber","datatype":"categorical","type":"text","unique":false} 373 | ] 374 | 375 | Everything should now be in place and we are ready to build the 376 | database. 377 | 378 | Running 379 | ------- 380 | 381 | For a first run, you just want to use ``bookworm init`` to create the 382 | entire database (if you want to rebuild parts of a large bookworm--the 383 | metadata, for example--that is also possible.) 384 | 385 | :: 386 | 387 | bookworm init 388 | 389 | This will walk you through the process of choosing a name for your 390 | database. 391 | 392 | Then to build the bookworm, type 393 | 394 | :: 395 | 396 | bookworm build all 397 | 398 | Depending on the total number and average size of your texts, this could 399 | take a while. Sit back and relax. 400 | 401 | Finally, you may want to set up a GUI. 402 | 403 | To test a local one over a python webserver, type 404 | 405 | :: 406 | 407 | bookworm serve 408 | 409 | Otherwise, you can type 410 | 411 | :: 412 | 413 | bookworm build linechartGUI 414 | 415 | General Workflow 416 | ~~~~~~~~~~~~~~~~ 417 | 418 | For reference, the general workflow of the Makefile is the following: 419 | 420 | 5. Build the directory structure in ``files/texts/``. 421 | 6. Derive ``.bookworm/metadata/field_descriptions_derived.json`` from 422 | ``.bookworm/metadata/field_descriptions.txt``. 423 | 7. Derive ``.bookworm/metadata/jsoncatalog_derived.txt`` from 424 | ``.bookworm/metadata/jsoncatalog.json``, respectively. 425 | 8. Create metadata catalog files in ``.bookworm/metadata/``. 426 | 9. Create a table with all words from the text files, and save the 427 | million most common for regular use. 428 | 10. Encode unigrams and bigrams from the texts into 429 | ``.bookworm/encoded`` 430 | 11. Load data into MySQL database. 431 | 12. 
Create temporary MySQL table and .json file that will be used by the
432 |     web app.
433 | 13. Create API settings.
434 | 
435 | Dependencies
436 | ============
437 | 
438 | - python 3.6+ (with modules):
439 | - nltk (recommended, to be required)
440 | - numpy
441 | - regex (to handle complicated Unicode regular expressions for
442 |   tokenization: ``easy_install regex``)
443 | - pandas (used by the API, not this precise set of scripts)
444 | - parallel (GNU parallel, in versions available from apt-get or
445 |   homebrew)
446 | - MySQL v. 5.6 (will work with 5.5, but future versions may require 5.6
447 |   for some functionality; MariaDB 10.0+ is also actively supported.
448 |   Some people have reported that it largely works with MySQL 5.1)
449 | - Apache or other webserver (for front end, if you don't just want to
450 |   run the simple version through ``bookworm serve`` that uses an
451 |   obscure port.)
452 | 
453 | .. |Travis Build Status| image:: https://travis-ci.org/Bookworm-project/BookwormDB.svg?branch=master
454 |    :target: https://travis-ci.org/Bookworm-project/BookwormDB
455 | 
-------------------------------------------------------------------------------- /bookwormDB/MetaParser.py: --------------------------------------------------------------------------------
1 | from __future__ import division
2 | from datetime import date
3 | import datetime
4 | import dateutil.parser
5 | import json
6 | import sys
7 | import os
8 | import logging
9 | from multiprocessing import Queue, Process
10 | from queue import Empty
11 | from .multiprocessingHelp import mp_stats, running_processes
12 | import time
13 | 
14 | 
15 | defaultDate = datetime.datetime(datetime.MINYEAR, 1, 1)
16 | 
17 | def DaysSinceZero(dateobj):
18 |     # Zero isn't a date, which Python knows but MySQL and JavaScript don't.
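    # Worked example: (date(2007, 10, 7) - date(1, 1, 1)).days == 732955; adding
    # 366 (MySQL treats the nonexistent year 0 as having 366 days) gives 733321,
    # which matches MySQL's TO_DAYS('2007-10-07').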
19 | return (dateobj - date(1,1,1)).days + 366 20 | 21 | def ParseFieldDescs(write = False): 22 | f = open('field_descriptions.json', 'r') 23 | try: 24 | fields = json.loads(f.read()) 25 | except ValueError: 26 | raise ValueError("Error parsing JSON: Check to make sure that your field_descriptions.json file is valid?") 27 | 28 | 29 | if write: 30 | derivedFile = open('.bookworm/metadata/field_descriptions_derived.json', 'w') 31 | 32 | output = [] 33 | 34 | fields_to_derive = [] 35 | 36 | for field in fields: 37 | if field["datatype"] == "time": 38 | if "derived" in field: 39 | fields_to_derive.append(field) 40 | else: 41 | output.append(field) 42 | else: 43 | output.append(field) 44 | 45 | for field in fields_to_derive: 46 | for derive in field["derived"]: 47 | if "aggregate" in derive: 48 | tmp = dict(datatype="time", type="integer", unique=True) 49 | tmp["field"] = '_'.join([field["field"], derive["resolution"], 50 | derive["aggregate"]]) 51 | output.append(tmp) 52 | else: 53 | tmp = dict(datatype="time", type="integer", unique=True) 54 | tmp["field"] = '_'.join([field["field"], derive["resolution"]]) 55 | output.append(tmp) 56 | if write: 57 | derivedFile.write(json.dumps(output)) 58 | derivedFile.close() 59 | 60 | return (fields_to_derive, fields) 61 | 62 | def parse_json_catalog(line_queue, processes, modulo): 63 | fields_to_derive, fields = ParseFieldDescs(write = False) 64 | 65 | if os.path.exists("jsoncatalog.txt"): 66 | mode = "json" 67 | fin = open("jsoncatalog.txt") 68 | 69 | if os.path.exists("catalog.csv"): 70 | mode = "csv" 71 | import csv 72 | fin = csv.DictReader("catalog.csv") 73 | 74 | for i, line in enumerate(fin): 75 | if i % processes != modulo: 76 | continue 77 | 78 | for char in ['\t', '\n']: 79 | line = line.replace(char, '') 80 | 81 | if mode == "json": 82 | try: 83 | line = json.loads(line) 84 | except: 85 | logging.warn("Couldn't parse catalog line {}".format(line)) 86 | continue 87 | 88 | for field in fields: 89 | # Smash together misidentified lists 90 | try: 91 | if field['unique'] and isinstance(line[field["field"]],list): 92 | line[field["field"]] = "--".join(line[field["field"]]) 93 | except KeyError: 94 | pass 95 | 96 | for field in fields_to_derive: 97 | 98 | """ 99 | Using fields_to_derive as a shorthand for dates--this may break 100 | if we get more ambitious about derived fields, 101 | but this whole metadata-parsing code needs to be refactored anyway. 102 | 103 | Note: this code is inefficient--it parses the same date multiple times. 104 | We should be parsing the date once and pulling 105 | derived fields out of that one parsing. 
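            For example, a field declared in field_descriptions.json as
            {"field": "date", "datatype": "time", "derived":
            [{"resolution": "month"}]} gets a derived "date_month" key
            added to each catalog line by the loop below.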
106 |             """
107 | 
108 |             try:
109 |                 if line[field["field"]]=="":
110 |                     # Use blankness as a proxy for unknown
111 |                     continue
112 | 
113 |                 time = dateutil.parser.parse(line[field["field"]],default = defaultDate)
114 |                 intent = [time.year,time.month,time.day]
115 |                 content = [str(item) for item in intent]
116 | 
117 |                 pass
118 |             except:
119 |                 """
120 |                 Fall back to parsing as strings
121 |                 """
122 |                 try:
123 |                     datem = line[field["field"]].split("T")[0]
124 |                     content = datem.split('-')
125 |                     intent = [int(item) for item in content]
126 |                 except KeyError:
127 |                     #It's OK not to have an entry for a time field
128 |                     continue
129 |                 except ValueError:
130 |                     # Thrown if fields are empty on taking the int value: treat as junk
131 |                     continue
132 |                 except AttributeError:
133 |                     """
134 |                     Happens if it's an integer, which is a forgiveable way
135 |                     to enter a year:
136 |                     """
137 |                     content = [str(line[field['field']])]
138 |                     intent = [line[field['field']]]
139 |             else:
140 |                 for derive in field["derived"]:
141 |                     try:
142 |                         if "aggregate" in derive:
143 |                             if derive["resolution"] == 'day' and \
144 |                                derive["aggregate"] == "year":
145 |                                 k = "%s_day_year" % field["field"]
146 |                                 dt = date(intent[0], intent[1], intent[2])
147 |                                 line[k] = dt.timetuple().tm_yday
148 |                             elif derive["resolution"] == 'day' and \
149 |                                  derive["aggregate"] == "month":
150 |                                 k = "%s_day_month" % field["field"]
151 |                                 line[k] = intent[2]
152 |                             elif derive["resolution"] == 'day' and \
153 |                                  derive["aggregate"] == "week":
154 |                                 k = "%s_day_week" % field["field"]
155 |                                 dt = date(intent[0], intent[1], intent[2])
156 |                                 # Python and javascript handle weekdays differently:
157 |                                 # Like JS, we want to begin on Sunday with zero
158 |                                 line[k] = dt.weekday() + 1
159 |                                 if (line[k]) == 7:
160 |                                     line[k] = 0
161 |                             elif derive["resolution"] == 'month' and \
162 |                                  derive["aggregate"] == "year":
163 |                                 k = "%s_month_year" % field["field"]
164 |                                 dt = date(1,intent[1],1)
165 |                                 line[k] = dt.timetuple().tm_yday
166 |                             elif derive["resolution"] == 'week' and \
167 |                                  derive["aggregate"] == "year":
168 |                                 dt = date(intent[0], intent[1], intent[2])
169 |                                 k = "%s_week_year" % field["field"]
170 |                                 line[k] = int(dt.timetuple().tm_yday/7)*7
171 |                             elif derive["resolution"] == 'hour' and \
172 |                                  derive["aggregate"] == "day":
173 |                                 k = "%s_hour_day" % field["field"]
174 |                                 line[k] = time.hour
175 |                             elif derive["resolution"] == 'minute' and \
176 |                                  derive["aggregate"] == "day":
177 |                                 k = "%s_minute_day" % field["field"]
178 |                                 line[k] = time.hour*60 + time.minute
179 |                             else:
180 |                                 logging.warning('Problem with aggregate resolution.')
181 |                                 continue
182 |                         else:
183 |                             if derive["resolution"] == 'year':
184 |                                 line["%s_year" % field["field"]] = intent[0]
185 |                             elif derive["resolution"] == 'month':
186 |                                 try:
187 |                                     k = "%s_month" % field["field"]
188 |                                     dt = date(intent[0], intent[1], 1)
189 |                                     line[k] = DaysSinceZero(dt)
190 |                                 except:
191 |                                     logging.warning("Problem with date fields\n")
192 |                                     pass
193 |                             elif derive['resolution'] == 'week':
194 |                                 k = "%s_week" % field['field']
195 |                                 dt = date(intent[0], intent[1], intent[2])
196 |                                 inttime = DaysSinceZero(dt)
197 |                                 time = int(inttime/7)*7
198 |                                 #Not starting on Sunday or anything funky like that. Actually, I don't know what we're starting on. Adding an integer here would fix that.
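                                # (DaysSinceZero values divisible by 7 fall on Saturdays --
                                # e.g. 733320 is Saturday 2007-10-06 -- so as written these
                                # week bins start on Saturday.)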
199 | line[k] = time 200 | elif derive['resolution'] == 'day': 201 | k = "%s_day" % field['field'] 202 | dt = date(intent[0], intent[1], intent[2]) 203 | inttime = DaysSinceZero(dt) 204 | line[k] = inttime 205 | else: 206 | logging.warning('Resolution %s currently not supported.' % (derive['resolution'])) 207 | continue 208 | except ValueError: 209 | # One of out a million Times articles threw this with 210 | # a year of like 111,203. It's not clear how best to 211 | # handle this. 212 | logging.warning("ERROR: %s " % line[field["field"]] + 213 | "did not convert to proper date. Moving on...") 214 | # raise 215 | pass 216 | except Exception as e: 217 | logging.warning('*'*50) 218 | logging.warning('ERROR: %s\nINFO: %s\n' % (str(e), e.__doc__)) 219 | logging.warning('*'*50) 220 | line.pop(field["field"]) 221 | try: 222 | el = json.dumps(line) 223 | line_queue.put((line["filename"], el)) 224 | except KeyError: 225 | logging.warning("No filename key in {}".format(line)) 226 | except: 227 | logging.warning("Error on {}".format(line)) 228 | raise 229 | logging.debug("Metadata thread done after {} lines".format(i)) 230 | 231 | 232 | def parse_catalog_multicore(): 233 | from .sqliteKV import KV 234 | cpus, _ = mp_stats() 235 | encoded_queue = Queue(10000) 236 | workers = [] 237 | 238 | for i in range(cpus): 239 | p = Process(target = parse_json_catalog, args = (encoded_queue, cpus, i)) 240 | p.start() 241 | workers.append(p) 242 | output = open(".bookworm/metadata/jsoncatalog_derived.txt", "w") 243 | 244 | bookids = KV(".bookworm/metadata/textids.sqlite") 245 | import sqlite3 246 | 247 | while True: 248 | try: 249 | filename, n = encoded_queue.get_nowait() 250 | output.write(n + "\n") 251 | ids = set() 252 | try: 253 | bookids.register(filename) 254 | except sqlite3.IntegrityError: 255 | if filename in ids: 256 | logging.warning("Duplicate key insertion {}".format(filename)) 257 | ids.add(filename) 258 | 259 | except Empty: 260 | if running_processes(workers): 261 | # Give it a sec to fill back up to avoid this thread taking up 262 | # a full processor. 263 | time.sleep(0.01) 264 | else: 265 | # We're done! 266 | break 267 | 268 | bookids.close() 269 | output.close() 270 | -------------------------------------------------------------------------------- /bookwormDB/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bookworm-project/BookwormDB/a7eb8482879143ffc6a0fb55a891f765d2aae383/bookwormDB/__init__.py -------------------------------------------------------------------------------- /bookwormDB/benchmark.md: -------------------------------------------------------------------------------- 1 | at 3000 files per batch, 100 seconds to load in the streets from the raw file: 2 | 3 | -------------------------------------------------------------------------------- /bookwormDB/bin/dbbindings-flask.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # So we load in the terms that allow the API implementation to happen for now. 
4 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall 5 | from flask import Flask, request, Response, jsonify 6 | import json 7 | import os 8 | 9 | app = Flask(__name__) 10 | 11 | 12 | @app.route('/') 13 | def index(): 14 | JSONinput = request.args.get('queryTerms') or request.args.get('query') 15 | if not JSONinput: 16 | return "Need query or queryTerms argument" 17 | return main(JSONinput) 18 | 19 | @app.route('/debug') 20 | def debug_api(): 21 | import logging 22 | logging.basicConfig(level=logging.INFO) 23 | JSONinput = request.args.get('queryTerms') or request.args.get('query') 24 | if not JSONinput: 25 | return "Need query or queryTerms argument" 26 | return main(JSONinput) 27 | 28 | @app.route('/debug/query') 29 | def debug_query(): 30 | JSONinput = request.args.get('queryTerms') or request.args.get('query') 31 | return JSONinput 32 | 33 | 34 | def main(JSONinput): 35 | 36 | query = json.loads(JSONinput) 37 | 38 | p = SQLAPIcall(query) 39 | result = p.execute() 40 | 41 | if (query['method'] == 'data' and 'format' in query and 42 | query['format'] == 'json'): 43 | # New format for response 44 | jresp = json.loads(result) 45 | resp = jsonify(jresp) 46 | if jresp['status'] == 'error': 47 | resp.status_code = jresp['code'] if 'code' in jresp else 500 48 | else: 49 | resp = Response(result) 50 | 51 | if query['method'] == "return_tsv": 52 | resp.headers['Content-Type'] = "text; charset=utf-8" 53 | resp.headers["Content-Disposition"] = "filename=Bookworm-data.txt" 54 | resp.headers["Pragma"] = "no-cache" 55 | resp.headers["Expires"] = 0 56 | elif query['method'] in ['return_json', 'return_pickle']: 57 | resp.headers['Content-Type'] = "text/html" 58 | 59 | resp.headers['Access-Control-Allow-Origin'] = '*' 60 | resp.headers['Access-Control-Allow-Methods'] = 'GET, POST, PUT, OPTIONS' 61 | resp.headers['Access-Control-Allow-Headers'] = 'Origin, Accept, '\ 62 | 'Content-Type, X-Requested-With, X-CSRF-Token' 63 | 64 | return resp 65 | 66 | if __name__ == '__main__': 67 | port = int(os.environ.get('PORT', 8080)) 68 | app.run(host='0.0.0.0', port=port, debug=True) 69 | -------------------------------------------------------------------------------- /bookwormDB/bin/dbbindings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # So we load in the terms that allow the API implementation to happen for now. 4 | from __future__ import print_function 5 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall 6 | import cgi 7 | import cgitb 8 | import json 9 | 10 | cgitb.enable() 11 | 12 | 13 | def headers(method, errorcode=False): 14 | 15 | print('Access-Control-Allow-Origin: *') 16 | print('Access-Control-Allow-Methods: GET, POST, PUT, OPTIONS') 17 | print('Access-Control-Allow-Headers: Origin, Accept, Content-Type, ' \ 18 | 'X-Requested-With, X-CSRF-Token') 19 | 20 | if errorcode: 21 | print("Status: %d" % errorcode) 22 | 23 | if method != "return_tsv": 24 | print("Content-type: text/html\n") 25 | 26 | elif method == "return_tsv": 27 | print("Content-type: text; charset=utf-8") 28 | print("Content-Disposition: filename=Bookworm-data.txt") 29 | print("Pragma: no-cache") 30 | print("Expires: 0\n") 31 | 32 | 33 | def debug(string): 34 | """ 35 | Makes it easier to debug through a web browser by handling the headers 36 | No calls should be permanently left in the code ever, or they will break 37 | things badly. 38 | """ 39 | print(headers('1')) 40 | print("
") 41 | print(string) 42 | print("
") 43 | 44 | 45 | def main(JSONinput): 46 | 47 | query = json.loads(JSONinput) 48 | # Set up the query. 49 | p = SQLAPIcall(query) 50 | 51 | # run the query. 52 | resp = p.execute() 53 | 54 | if query['method'] == 'data' and 'format' in query and query['format'] == 'json': 55 | try: 56 | resp = json.loads(resp) 57 | except: 58 | resp = dict(status="error", code=500, 59 | message="Internal error: server did not return json") 60 | 61 | # Print appropriate HTML headers 62 | if 'status' in resp and resp['status'] == 'error': 63 | code = resp['code'] if 'code' in resp else 500 64 | headers(query['method'], errorcode=code) 65 | else: 66 | headers(query['method']) 67 | print(json.dumps(resp)) 68 | else: 69 | headers(query['method']) 70 | print(resp) 71 | 72 | return True 73 | 74 | 75 | if __name__ == "__main__": 76 | form = cgi.FieldStorage() 77 | 78 | # Still supporting two names for the passed parameter. 79 | try: 80 | JSONinput = form["queryTerms"].value 81 | except KeyError: 82 | JSONinput = form["query"].value 83 | 84 | main(JSONinput) 85 | -------------------------------------------------------------------------------- /bookwormDB/bin/logParser.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from future import standard_library 3 | standard_library.install_aliases() 4 | import urllib.request, urllib.parse, urllib.error 5 | import os 6 | import re 7 | import gzip 8 | import json 9 | import sys 10 | 11 | files = os.listdir("/var/log/apache2") 12 | 13 | words = [] 14 | 15 | for file in files: 16 | reading = None 17 | if re.search("^access.log..*.gz", file): 18 | reading = gzip.open("/var/log/apache2/" + file) 19 | elif re.search("^access.log.*", file): 20 | reading = open("/var/log/apache2/" + file) 21 | else: 22 | continue 23 | sys.stderr.write(file + "\n") 24 | 25 | for line in reading: 26 | matches = re.findall(r"([0-9\.]+).*\[(.*)].*cgi-bin/dbbindings.py/?.query=([^ ]+)", line) 27 | for fullmatch in matches: 28 | t = dict() 29 | t['ip'] = fullmatch[0] 30 | match = fullmatch[2] 31 | try: 32 | data = json.loads(urllib.parse.unquote(match).decode('utf8')) 33 | except ValueError: 34 | continue 35 | try: 36 | if isinstance(data['search_limits'], dict): 37 | data['search_limits'] = [data['search_limits']] 38 | for setting in ['words_collation', 'database']: 39 | try: 40 | t[setting] = data[setting] 41 | except KeyError: 42 | t[setting] = "" 43 | for limit in data['search_limits']: 44 | p = dict() 45 | for constraint in ["word", "TV_show", "director"]: 46 | try: 47 | p[constraint] = p[constraint] + "," +\ 48 | (",".join(limit[constraint])) 49 | except KeyError: 50 | try: 51 | p[constraint] = (",".join(limit[constraint])) 52 | except KeyError: 53 | p[constraint] = "" 54 | for key in list(p.keys()): 55 | t[key] = p[key] 56 | vals = [t[key] for key in ('ip', 'database', 57 | 'words_collation', 'word', 58 | 'TV_show', 'director')] 59 | print("\t".join(vals).encode("utf-8")) 60 | 61 | except KeyError: 62 | raise 63 | 64 | print(len(words)) 65 | -------------------------------------------------------------------------------- /bookwormDB/bwExceptions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This is a stub exception to identify explicitly defined Bookworm Exception. 3 | 4 | The intended usage is to raise the exception with a dict that has an error 5 | message, and optionally a code that matches HTTP status codes. e.g. 
6 | 
7 | raise BookwormException({"message": "I'm a teapot", "code": 418})
8 | 
9 | or, more tidily, for longer messages:
10 | err = dict(message="I'm a teapot", code=418)
11 | raise BookwormException(err)
12 | 
13 | Code should be an int, not a string.
14 | '''
15 | 
16 | 
17 | class BookwormException(Exception):
18 |     pass
19 | 
-------------------------------------------------------------------------------- /bookwormDB/configuration.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | 
3 | from __future__ import print_function
4 | import configparser
5 | import os
6 | import sys
7 | import re
8 | import MySQLdb
9 | import argparse
10 | import getpass
11 | import subprocess
12 | import logging
13 | import uuid
14 | 
15 | def update():
16 |     ## Assemble list of all bookworms on the system.
17 | 
18 |     bookworms = [] ### ...
19 | 
20 |     ## Create on-disk versions of memory tables if 'fastcat_' does not exist.
21 | 
22 |     pass
23 | 
24 |     ## Allow "'bookworm'@'localhost' IDENTIFIED BY ''" to have select access on each bookworm.
25 | 
26 |     pass
27 | 
28 |     ## Print a message about enabling access.
29 | 
30 |     pass
31 | 
32 | 
33 | def create(ask_about_defaults=True, database=None):
34 |     """
35 |     Through interactive prompts at the command line, builds up a file at
36 |     bookworm.cnf that can be used to set preferences for the installation.
37 |     """
38 | 
39 |     if ask_about_defaults:
40 |         print("""
41 |         Welcome to Bookworm.
42 |         ~~~~~~~~~~~~~~~~~~~~
43 |         First off, let's build a configuration file. This will live
44 |         at bookworm.cnf in the current directory: if you mistype anything,
45 |         or want to change settings, edit it directly in that location.
46 | 
47 |         For each of the following entries, type the value you want, or hit
48 |         enter to accept the default:
49 | 
50 |         """)
51 |     else:
52 |         logging.info("Auto-generating config file.")
53 | 
54 |     """
55 |     First, we go to great efforts to find some sensible defaults.
56 |     Usually the user can just hit enter.
57 |     """
58 | 
59 |     systemConfigFile = configparser.ConfigParser(allow_no_value=True)
60 | 
61 |     defaults = dict()
62 |     # The default bookwormname is just the current location
63 | 
64 |     if database is None:
65 |         defaults['database'] = os.path.relpath(".", "..")
66 |     else:
67 |         defaults['database'] = database
68 | 
69 |     defaults["user"] = "bookworm"
70 |     defaults["password"] = ""
71 | 
72 |     config = configparser.ConfigParser()
73 | 
74 |     for section in ["client"]:
75 |         config.add_section(section)
76 | 
77 |     if ask_about_defaults:
78 |         database = input("What is the name of the bookworm [" + defaults['database'] + "]: ") or defaults['database']
79 |     else:
80 |         database = defaults['database']
81 | 
82 |     config.set("client", "database", re.sub(" ","_",database))
83 |     config.write(open("bookworm.cnf", "w"))
84 | 
85 | class Configfile(object):
86 |     def __init__(self, usertype, possible_locations=None, default=None, ask_about_defaults=True):
87 |         """
88 |         Initialize with the type of the user. The last encountered file on
89 |         the list is the one that will be used.
90 |         If default is set, a file will be created at that location if none
91 |         of the files in possible_locations exist.
92 | 
93 |         If ask_about_defaults is false, it will do a force installation.
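
        A hedged usage sketch (see default_locations_from_type below for
        the files each usertype reads):

            admin = Configfile("admin")
            print(admin.config.get("client", "user"))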
94 | """ 95 | 96 | if not usertype in ['read_only', 'admin']: 97 | raise NotImplementedError("Only read_only and admin supported") 98 | 99 | self.ask_about_defaults = ask_about_defaults 100 | 101 | logging.info("Creating configuration as " + usertype) 102 | 103 | self.usertype = usertype 104 | 105 | if possible_locations is None: 106 | possible_locations = self.default_locations_from_type(usertype) 107 | 108 | self.location = None 109 | 110 | self.config = configparser.ConfigParser(allow_no_value=True) 111 | 112 | if usertype=="admin": 113 | 114 | self.ensure_section("client") 115 | self.ensure_section("mysqld") 116 | 117 | self.config.set("client", "host", "localhost") 118 | self.config.set("client", "user", "root") 119 | self.config.set("client", "password", "") 120 | 121 | else: 122 | self.ensure_section("client") 123 | self.config.set("client", "host", "localhost") 124 | self.config.set("client", "user", "bookworm") 125 | self.config.set("client", "password", "") 126 | 127 | self.read_config_files(possible_locations) 128 | 129 | for string in possible_locations: 130 | if os.path.exists(string): 131 | self.location = string 132 | 133 | 134 | def read_config_files(self, used_files): 135 | 136 | try: 137 | self.config.read(used_files) 138 | except configparser.MissingSectionHeaderError: 139 | """ 140 | Some files throw this error if you have an empty 141 | my.cnf. This throws those out of the list, and tries again. 142 | """ 143 | for file in used_files: 144 | try: 145 | self.config.read(file) 146 | except configparser.MissingSectionHeaderError: 147 | used_files.remove(file) 148 | successes = self.config.read(used_files) 149 | 150 | 151 | 152 | def default_locations_from_type(self,usertype): 153 | """ 154 | The default locations for each usertype. 155 | Note that these are in ascending order of importance: 156 | so the preferred location for admin and read_only configuration 157 | is in /etc/bookworm/admin.cnf 158 | and /etc/bookworm/client.cnf 159 | """ 160 | 161 | if usertype=="admin": 162 | return [os.path.abspath(os.path.expanduser("~/.my.cnf")), 163 | os.path.abspath(os.path.expanduser("~/my.cnf")), 164 | "/etc/bookworm/admin.cnf"] 165 | if usertype == "read_only": 166 | return ["~/.bookworm-sql.cnf", "/etc/bookworm/client.cnf"] 167 | else: 168 | return [] 169 | 170 | def ensure_section(self,section): 171 | if not self.config.has_section(section): 172 | self.config.add_section(section) 173 | 174 | def set_bookworm_options(self): 175 | """ 176 | A number of specific MySQL changes to ensure fast queries on Bookworm. 177 | """ 178 | self.ensure_section("mysqld") 179 | 180 | mysqldoptions = {"### = =": "THIS FILE SHOULD GENERALLY BE PLACED AT /etc/mysql/my.cnf = = = ###", "max_allowed_packet":"512M","sort_buffer_size":"8M","read_buffer_size":"8M","read_rnd_buffer_size":"8M","bulk_insert_buffer_size":"512M","myisam_sort_buffer_size":"5512M","myisam_max_sort_file_size":"5500G","key_buffer_size":"2500M","query_cache_size":"32M","tmp_table_size":"1024M","max_heap_table_size":"2048M","character_set_server":"utf8","query_cache_type":"1","query_cache_limit":"8M"} 181 | 182 | for option in list(mysqldoptions.keys()): 183 | if not self.config.has_option("mysqld",option): 184 | self.config.set("mysqld", option, mysqldoptions[option]) 185 | else: 186 | if mysqldoptions[option] != self.config.get("mysqld",option): 187 | choice = input("Do you want to change the value for " + option + " from " + self.config.get("mysqld",option) + " to the bookworm-recommended " + mysqldoptions[option] + "? 
(y/N): ") 188 | if choice=="y": 189 | self.config.set("mysqld",option,mysqldoptions[option]) 190 | 191 | self.write_out() 192 | 193 | def write_out(self): 194 | """ 195 | Write out a new version of the configfile to stdout. 196 | The user is responsible for putting this somewhere it will 197 | affect the MySQL preferences 198 | """ 199 | self.config.write(sys.stdout) 200 | 201 | def recommend_my_cnf(known_loc = None): 202 | if known_loc is None: 203 | for loc in ["/usr/etc/my.cnf","/etc/mysql/my.cnf","/etc/my.cnf"]: 204 | if os.path.exists(loc): 205 | known_loc = loc 206 | if known_loc is None: 207 | raise FileNotFoundError("Could not find MySQL folder: pass one.") 208 | cnf = Configfile(usertype = 'admin', possible_locations = [known_loc]) 209 | cnf.set_bookworm_options() 210 | cnf.write_out() 211 | 212 | 213 | 214 | def apache(self = None): 215 | print(""" 216 | Instructions for Apache: 217 | 218 | 219 | First: Serve the Bookworm API over port 10012. (`bookworm serve`). 220 | 221 | Then: Install an Apache host on port 80. 222 | 223 | Then: enable proxy servers and turn off any existing cgi. 224 | 225 | # If you were previously using the CGI bookworm. 226 | `sudo a2dismod cgi` 227 | 228 | `sudo a2enmod proxy proxy_ajp proxy_http rewrite deflate headers proxy_balancer proxy_connect proxy_html` 229 | 230 | Then: Add the following to your '/etc/apache2/sites-available/000-default.conf' 231 | (or whatever site from which you run your apache. 232 | 233 | ~~~~~~~~~~~~~~~~ 234 | 235 | 236 | Order deny,allow 237 | Allow from all 238 | 239 | ProxyPreserveHost On 240 | 241 | ProxyPass "http://127.0.0.1:10012/" 242 | ProxyPassReverse "http://127.0.0.1:10012/" 243 | 244 | 245 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 246 | 247 | 248 | """) 249 | -------------------------------------------------------------------------------- /bookwormDB/convertTSVtoJSONarray.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def convertToJSON(filename, location): 4 | """ 5 | given a filename of a tsv, converts that into an ndjson 6 | file for Bookworm. 7 | """ 8 | input = open(filename) 9 | output = open(location, "w") 10 | headers = input.readline() 11 | headers = headers.rstrip("\n") 12 | headers = headers.rstrip("\r") 13 | headers = headers.rstrip("\n") 14 | headers = headers.rstrip("\r") 15 | headers = headers.split("\t") 16 | for line in input: 17 | line = line.rstrip("\n") 18 | line = line.rstrip("\r") 19 | line = line.rstrip("\n") 20 | line = line.rstrip("\r") 21 | values = line.split("\t") 22 | myobject = dict(list(zip(headers,values))) 23 | output.write(json.dumps(myobject) + "\n") 24 | output.close() 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /bookwormDB/countManager.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import bounter 4 | from collections import Counter 5 | from .tokenizer import Tokenizer, tokenBatches, PreTokenized 6 | from multiprocessing import Process, Queue, Pool 7 | from .multiprocessingHelp import mp_stats, running_processes 8 | import multiprocessing as mp 9 | import psutil 10 | import queue 11 | import logging 12 | import fileinput 13 | import time 14 | import csv 15 | 16 | cpus, memory = mp_stats() 17 | 18 | 19 | # Allocate half of available memory for the bounter, in megabytes. 
20 | memory = int(memory/1024/1024/2) 21 | 22 | # Use another third of the memory for storing worker counts; divided 23 | # by number of CPUS. 24 | # Assume 200 bytes per entry in python dict. 25 | 26 | QUEUE_POST_THRESH = int(memory / 3 * 1024 * 1024 / 200 / cpus) 27 | logging.debug("Ideal queue size is {}".format(QUEUE_POST_THRESH)) 28 | QUEUE_POST_THRESH = max([100000, QUEUE_POST_THRESH]) 29 | 30 | logging.info("Filling dicts to size {}".format(QUEUE_POST_THRESH)) 31 | 32 | import random 33 | import gzip 34 | 35 | def flush_counter(counter, qout): 36 | for k in ['', '\x00']: 37 | try: 38 | del counter[k] 39 | except KeyError: 40 | continue 41 | qout.put(counter) 42 | 43 | def counter(qout, i, fin, mode = "count"): 44 | """ 45 | # Counts words exactly in a separate process. 46 | # It runs in place. 47 | If mode is 'encode', this is called for a side-effect of writing 48 | files to disk. 49 | """ 50 | 51 | totals = 0 52 | errors = 0 53 | 54 | if mode == "count": 55 | counter = Counter() 56 | encoder = tokenBatches(['words']) 57 | 58 | if mode == "encode": 59 | encoder = tokenBatches(['unigrams', 'bigrams']) 60 | 61 | datatype = "raw" 62 | 63 | count_signals = [".unigrams", ".bigrams", ".trigrams", ".quadgrams"] 64 | for signal in count_signals: 65 | if signal in fin: 66 | datatype = signal.strip(".") 67 | if mode == "encode": 68 | encoder = tokenBatches([datatype]) 69 | 70 | if (fin.endswith(".gz")): 71 | fin = gzip.open(fin, 'rt') 72 | else: 73 | fin = open(fin) 74 | 75 | 76 | for ii, row in enumerate(fin): 77 | if ii % cpus != i: 78 | # Don't do anything on most lines. 79 | continue 80 | totals += 1 81 | try: 82 | (filename, text) = row.rstrip().split("\t",1) 83 | except ValueError: 84 | errors += 1 85 | continue 86 | 87 | if datatype == "raw": 88 | tokenizer = Tokenizer(text) 89 | else: 90 | tokenizer = PreTokenized(text, encoder.levels[0]) 91 | 92 | # When encoding 93 | if mode == "encode": 94 | encoder.encodeRow(filename, tokenizer, write_completed = True) 95 | continue 96 | 97 | # When building counts 98 | counter.update(tokenizer.counts("words")) 99 | 100 | # When the counter is long, post it to the master and clear it. 101 | if len(counter) > QUEUE_POST_THRESH: 102 | flush_counter(counter=counter, qout = qout) 103 | counter = Counter() 104 | 105 | # Cleanup. 
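    # (In "count" mode a partially-filled Counter still has to reach the
    # parent via the queue; in "encode" mode the encoder flushes its own buffers.)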
106 | if mode == "count": 107 | logging.debug("Flushing leftover counts from thread {}".format(i)) 108 | flush_counter(counter=counter, qout = qout) 109 | if totals > 0 and errors/totals > 0.01: 110 | logging.warning("Skipped {} rows without tabs".format(errors)) 111 | if mode == "encode": 112 | encoder.close() 113 | 114 | def create_counts(input): 115 | qout = Queue(cpus * 2) 116 | workers = [] 117 | logging.info("Spawning {} count processes on {}".format(cpus, input)) 118 | for i in range(cpus): 119 | p = Process(target = counter, args = (qout, i, input, "count")) 120 | p.start() 121 | workers.append(p) 122 | 123 | wordcounter = bounter.bounter(memory) 124 | 125 | while True: 126 | 127 | try: 128 | input_dict = qout.get_nowait() 129 | logging.debug("inputting queue of length {} from worker".format(len(input_dict))) 130 | wordcounter.update(input_dict) 131 | 132 | except queue.Empty: 133 | if running_processes(workers): 134 | time.sleep(1/100) 135 | else: 136 | break 137 | except ValueError: 138 | for k, v in input_dict.items(): 139 | print("'{}'\t'{}'".format(k, v)) 140 | wordcounter.update({k: v}) 141 | raise 142 | except TypeError: 143 | for k, v in input_dict.items(): 144 | print("'{}'\t'{}'".format(k, v)) 145 | wordcounter.update({k: v}) 146 | raise 147 | 148 | return wordcounter 149 | 150 | def create_wordlist(n, input, output): 151 | 152 | counter = create_counts(input) 153 | counter = sorted(list(counter.iteritems()), key = lambda x: -1 * x[1]) 154 | output = open(output, "w") 155 | for i, (k, v) in enumerate(counter): 156 | output.write("{}\t{}\t{}\n".format(i, k, v)) 157 | if i >= n: 158 | break 159 | output.close() 160 | def encode_words(wordlist, input = "input.txt"): 161 | qout = Queue(cpus * 2) 162 | workers = [] 163 | 164 | for i in range(cpus): 165 | p = Process(target = counter, args = (qout, i, input, "encode")) 166 | p.start() 167 | workers.append(p) 168 | 169 | while running_processes(workers): 170 | time.sleep(1/30) 171 | -------------------------------------------------------------------------------- /bookwormDB/general_API.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from pandas import merge 4 | from pandas import Series 5 | from pandas.io.sql import read_sql 6 | from pandas import concat 7 | from pandas import set_option 8 | from copy import deepcopy 9 | from collections import defaultdict 10 | from .SQLAPI import DbConnect 11 | from .SQLAPI import userquery 12 | from .mariaDB import Query 13 | from .bwExceptions import BookwormException 14 | import re 15 | import json 16 | import logging 17 | import numpy as np 18 | import csv 19 | import io 20 | 21 | 22 | """ 23 | The general API provides some functions for working with pandas to calculate 24 | bag-of-words summary statistics according to the API description. 25 | 26 | It is not bound to any particular backend: instead, a subset of 27 | methods in the API must be supported by subclassing APICall(). 28 | 29 | The only existing example of this is "SQLAPICall." 30 | """ 31 | 32 | # Some settings can be overridden here, if nowhere else. 33 | 34 | prefs = dict() 35 | 36 | def PMI(df, location, groups): 37 | """ 38 | A simple PMI calculation. Arguments: 39 | 40 | 'location': The field to calculate expected values for. 41 | 'groups': The metadata to sum up over.
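    As a sketch: with location "WordCount_x" and groups ["year"], each
    row's expected value is the grand total multiplied by P(year), and
    the return value is log(observed / expected), so zero means a group
    holds exactly its expected share.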
42 | 43 | """ 44 | copy = df.copy() 45 | total = df[[location]].sum() 46 | copy['expected'] = total[0] 47 | for i in range(len(groups)): 48 | new_name = groups[i] + "__r" 49 | renamer = dict() 50 | renamer[location] = new_name 51 | etc = (df[[groups[i], location]].groupby(groups[i]).sum()/total).rename(renamer, axis="columns") 52 | copy = merge(copy, etc, left_on = groups[i], right_index = True) 53 | copy["expected"] = copy["expected"] * copy[new_name] 54 | return np.log(copy[location]/copy["expected"]) 55 | 56 | def rle(input): 57 | """ 58 | Format a list as run-length encoding JSON. 59 | """ 60 | output = [input[0]] 61 | for item in input[1:]: 62 | if isinstance(output[-1], list) and output[-1][1] == item: 63 | output[-1][0] += 1 64 | elif output[-1] == item: 65 | output[-1] = [2, item] 66 | else: 67 | output.append(item) 68 | return output 69 | 70 | def DunningLog(df, a, b): 71 | from numpy import log as log 72 | destination = "Dunning" 73 | df[a] = df[a].replace(0, 0.5) 74 | df[b] = df[b].replace(0, 0.5) 75 | if a == "WordCount_x": 76 | # Dunning comparisons should be to the sums if counting: 77 | c = sum(df[a]) 78 | d = sum(df[b]) 79 | elif a == "TextCount_x": 80 | # The max count isn't necessarily the total number of books, 81 | # but it's a decent proxy. 82 | c = max(df[a]) 83 | d = max(df[b]) 84 | expectedRate = (df[a] + df[b]).divide(c+d) 85 | E1 = c*expectedRate 86 | E2 = d*expectedRate 87 | diff1 = log(df[a].divide(E1)) 88 | diff2 = log(df[b].divide(E2)) 89 | df[destination] = 2*(df[a].multiply(diff1) + df[b].multiply(diff2)) 90 | # A hack, but a useful one: encode the direction of the significance 91 | # in the sign, so negative values mark rows where the first count is underrepresented. 92 | difference = diff1 < diff2 93 | df.loc[difference, destination] = -1*df.loc[difference, destination] 94 | return df[destination] 95 | 96 | class Aggregator(object): 97 | """ 98 | We only collect "WordCount" and "TextCount" for each query, 99 | but there are a multitude of things you can do with those: 100 | basic things like frequency, all the way up to TF-IDF. 101 | 102 | """ 103 | def __init__(self, df, groups = None): 104 | self.df = df 105 | self.groups = groups 106 | 107 | def _aggregate(self, parameters): 108 | "Run the aggregation. Prefixed with an underscore so it doesn't show up in the dict."
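        # E.g. parameters = ["WordsPerMillion"] dispatches to self.WordsPerMillion(),
        # which adds a "WordsPerMillion" column derived from the merged _x/_y count columns.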
109 | 110 | parameters = set(map(str, parameters)) 111 | for parameter in parameters: 112 | getattr(self, parameter)() 113 | return self.df 114 | 115 | def WordCount(self): 116 | self.df["WordCount"] = self.df["WordCount_x"] 117 | 118 | def TextCount(self): 119 | self.df["TextCount"] = self.df["TextCount_x"] 120 | 121 | def WordsPerMillion(self): 122 | self.df["WordsPerMillion"] = (self.df["WordCount_x"].multiply(1000000)/ 123 | self.df["WordCount_y"]) 124 | def TotalWords(self): 125 | self.df["TotalWords"] = self.df["WordCount_y"] 126 | 127 | def SumWords(self): 128 | self.df["SumWords"] = self.df["WordCount_y"] + self.df["WordCount_x"] 129 | 130 | def WordsRatio(self): 131 | self.df["WordsRatio"] = self.df["WordCount_x"]/self.df["WordCount_y"] 132 | 133 | def TextPercent(self): 134 | self.df["TextPercent"] = 100*self.df["TextCount_x"].divide(self.df["TextCount_y"]) 135 | 136 | def TextRatio(self): 137 | self.df["TextRatio"] = self.df["TextCount_x"]/self.df["TextCount_y"] 138 | 139 | def TotalTexts(self): 140 | self.df["TotalTexts"] = self.df["TextCount_y"] 141 | 142 | def SumTexts(self): 143 | self.df["SumTexts"] = self.df["TextCount_y"] + self.df["TextCount_x"] 144 | 145 | def HitsPerText(self): 146 | self.df["HitsPerText"] = self.df["WordCount_x"]/self.df["TextCount_x"] 147 | 148 | def TextLength(self): 149 | self.df["TextLength"] = self.df["WordCount_y"]/self.df["TextCount_y"] 150 | 151 | def PMI_words(self): 152 | self.df["PMI_words"] = PMI(self.df, "WordCount_x", self.groups) 153 | 154 | def PMI_texts(self): 155 | self.df["PMI_texts"] = PMI(self.df, "TextCount_x", self.groups) 156 | 157 | def TFIDF(self): 158 | 159 | self.df["TF"] = self.df["WordCount_x"]/self.df["WordCount_y"] 160 | self.df["TFIDF"] = self.df["TF"] * np.log(self.df["TextCount_y"]/self.df['TextCount_x']) 161 | 162 | def Dunning(self): 163 | self.df["Dunning"] = DunningLog(self.df, "WordCount_x", "WordCount_y") 164 | 165 | 166 | def DunningTexts(self): 167 | self.df["DunningTexts"] = DunningLog(self.df, "TextCount_x", "TextCount_y") 168 | 169 | def rename(df, newkey): 170 | 171 | # Add "_x"- and "_y"-style suffixes to the count columns even when not explicitly needed. 172 | 173 | renamer = {} 174 | for k in ["WordCount", "TextCount"]: 175 | renamer[k] = k + "_" + newkey 176 | df.rename(index=str, columns=renamer, inplace = True) 177 | 178 | 179 | def intersectingNames(p1, p2, full=False): 180 | """ 181 | The list of intersecting column names between two DataFrame objects. 182 | 183 | 'full' lets you specify that you want to include the count values: 184 | Otherwise, they're kept separate for convenience in merges. 185 | """ 186 | exclude = set(['WordCount', 'TextCount']) 187 | names1 = set([column for column in p1.columns if column not in exclude]) 188 | names2 = [column for column in p2.columns if column not in exclude] 189 | if full: 190 | return list(names1.union(names2)) 191 | return list(names1.intersection(names2)) 192 | 193 | 194 | def need_comparison_query(count_types): 195 | """ 196 | Do we need a comparison query, i.e., any count types beyond the raw counts? 197 | """ 198 | needing_fields = [c for c in count_types if c not in ["WordCount","TextCount"]] 199 | return len(needing_fields) != 0 200 | 201 | def base_count_types(list_of_final_count_types): 202 | """ 203 | the final count types are calculated from some base types across both 204 | the local query and the superquery. 205 | 206 | These are not well optimized--I should go through and cut out unneeded ones for the more obscure count types.
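    For example, ["WordsPerMillion"] requires the raw "WordCount" from both
    the subquery and the superquery, so this function returns
    [["WordCount"], ["WordCount"]].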
207 | 208 | """ 209 | 210 | subq = set() 211 | superq = set() 212 | 213 | for count_name in list_of_final_count_types: 214 | if count_name in ["WordCount", "WordsPerMillion", "WordsRatio", 215 | "TotalWords", "SumWords", "Dunning", "PMI_words", "TextLength", "HitsPerText", "TFIDF"]: 216 | subq.add("WordCount") 217 | superq.add("WordCount") 218 | if count_name in ["TextCount", "TextPercent", "TextRatio", 219 | "TotalTexts", "SumTexts", "DunningTexts", "PMI_texts", 220 | "TextLength", "HitsPerText", "TFIDF"]: 221 | subq.add("TextCount") 222 | superq.add("TextCount") 223 | 224 | return [list(subq), list(superq)] 225 | 226 | 227 | def is_a_wordcount_field(string): 228 | if string in ["unigram", "bigram", "word"]: 229 | return True 230 | return False 231 | 232 | 233 | class APIcall(object): 234 | """ 235 | This is the base class from which more specific classes for actual 236 | methods can be dispatched. 237 | 238 | Without a "generate_pandas_frame" method, it won't run. 239 | """ 240 | def __init__(self, APIcall): 241 | 242 | """ 243 | Initialized with a dictionary un-JSONed from the API definition. 244 | """ 245 | 246 | self.query = APIcall 247 | self.idiot_proof_arrays() 248 | self.set_defaults() 249 | 250 | def set_defaults(self): 251 | query = self.query 252 | if "search_limits" not in query: 253 | self.query["search_limits"] = dict() 254 | if "unigram" in query["search_limits"]: 255 | # Hack: change somehow. You can't group on "word", just on 256 | # "unigram" 257 | query["search_limits"]["word"] = query["search_limits"]["unigram"] 258 | del query["search_limits"]["unigram"] 259 | 260 | def idiot_proof_arrays(self): 261 | for element in ['counttype', 'groups']: 262 | try: 263 | if not isinstance(self.query[element], list): 264 | self.query[element] = [self.query[element]] 265 | except KeyError: 266 | # It's OK if it's not there. 267 | pass 268 | 269 | def get_compare_limits(self): 270 | """ 271 | The compare limits are chosen in this order of preference: 272 | first, the "compare_limits" field, if explicitly specified; 273 | if not that, then the search limits with every asterisked term dropped; 274 | if not that, then the search limits without the word terms; 275 | if not that, then exactly the same as the search limits. 276 | """ 277 | 278 | if "compare_limits" in self.query: 279 | return self.query['compare_limits'] 280 | 281 | search_limits = self.query['search_limits'] 282 | compare_limits = deepcopy(search_limits) 283 | 284 | asterisked = False 285 | for limit in list(search_limits.keys()): 286 | if re.search(r'^\*', limit): 287 | search_limits[limit.replace('*', '')] = search_limits[limit] 288 | del search_limits[limit] 289 | del compare_limits[limit] 290 | asterisked = True 291 | 292 | if asterisked: 293 | return compare_limits 294 | 295 | # Next, try deleting the word term. 296 | 297 | for word_term in list(search_limits.keys()): 298 | if word_term in ['word', 'unigram', 'bigram']: 299 | del compare_limits[word_term] 300 | 301 | # Finally, whether it's deleted a word term or not, return it all.
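        # E.g. search_limits of {"word": ["test"], "year": [1900]} yields
        # compare_limits of {"year": [1900]}: the word constraint is dropped.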
302 | return compare_limits 303 | 304 | def data(self): 305 | if hasattr(self, "pandas_frame"): 306 | return self.pandas_frame 307 | else: 308 | self.pandas_frame = self.get_data_from_source() 309 | return self.pandas_frame 310 | 311 | def validate_query(self): 312 | self.ensure_query_has_required_fields() 313 | 314 | def ensure_query_has_required_fields(self): 315 | 316 | required_fields = ['counttype', 'groups', 'database'] 317 | if self.query['method'] in ['schema', 'search']: 318 | required_fields = ['database'] 319 | 320 | for field in required_fields: 321 | if field not in self.query: 322 | logging.error("Missing field: %s" % field) 323 | err = dict(message="Bad query. Missing \"%s\" field" % field, 324 | code=400) 325 | raise BookwormException(err) 326 | 327 | def prepare_search_and_compare_queries(self): 328 | 329 | 330 | 331 | call1 = deepcopy(self.query) 332 | call2 = deepcopy(call1) 333 | call2['search_limits'] = self.get_compare_limits() 334 | 335 | # The individual calls need only the base counts: not "Percentage of 336 | # Words," but just "WordCount" twice, and so forth 337 | 338 | call1['counttype'], call2['counttype'] = base_count_types(self.query['counttype']) 339 | 340 | # Drop out asterisks for that syntactic sugar. 341 | for limit in list(call1['search_limits'].keys()): 342 | if re.search(r'^\*', limit): 343 | call1['search_limits'][limit.replace('*', '')] = \ 344 | call1['search_limits'][limit] 345 | del call1['search_limits'][limit] 346 | 347 | for n, group in enumerate(self.query['groups']): 348 | if re.search(r'^\*', group): 349 | replacement = group.replace("*", "") 350 | call1['groups'][n] = replacement 351 | self.query['groups'][n] = replacement 352 | call2['groups'].remove(group) 353 | 354 | self.call1 = call1 355 | self.call2 = call2 356 | 357 | 358 | def get_data_from_source(self): 359 | """ 360 | Retrieves data from the backend, and calculates totals. 361 | 362 | Note that this method could be easily adapted to run on top of a Solr 363 | instance or something else, just by changing the bits in the middle 364 | where it handles storage_format. 365 | """ 366 | 367 | self.validate_query() 368 | 369 | if self.query['method'] in ['schema', 'search']: 370 | return self.generate_pandas_frame() 371 | 372 | self.prepare_search_and_compare_queries() 373 | 374 | """ 375 | This could use any method other than pandas_SQL: 376 | You'd just need to redefine "generate_pandas_frame" 377 | """ 378 | 379 | if not need_comparison_query(self.query['counttype']): 380 | df1 = self.generate_pandas_frame(self.call1) 381 | # rename(df1, "x") 382 | return df1[self.query['groups'] + self.query['counttype']] 383 | 384 | try: 385 | df1 = self.generate_pandas_frame(self.call1) 386 | rename(df1, "x") 387 | logging.debug(self.call2) 388 | df2 = self.generate_pandas_frame(self.call2) 389 | rename(df2, "y") 390 | 391 | except Exception as error: 392 | logging.exception("Database error") 393 | # One common error is putting in an inappropriate column 394 | try: 395 | column_search = re.search("Unknown column '(.+)' in 'field list'",str(error)).groups() 396 | if len(column_search) > 0: 397 | return Series({"status": "error", "message": "No field in database entry matching desired key `{}`".format(column_search[0])}) 398 | else: 399 | return Series({"status": "error", "message": "Database error. " 400 | "Try checking field names.","code":str(error)}) 401 | 402 | except: 403 | return Series({"status": "error", "message": "Unknown error. 
", 404 | "code":str(error)}) 405 | 406 | intersections = intersectingNames(df1, df2) 407 | 408 | """ 409 | Would this merge be faster with indexes? 410 | """ 411 | 412 | if len(intersections) > 0: 413 | merged = merge(df1, df2, on=intersections, how='outer') 414 | else: 415 | merged = df1.join(df2, lsuffix='_x', rsuffix='_y') 416 | 417 | merged = merged.fillna(int(0)) 418 | 419 | calculations = self.query['counttype'] 420 | gator = Aggregator(merged, self.query['groups']) 421 | calcced = gator._aggregate(calculations) 422 | # calcced = calculateAggregates(merged, calculations, self.query['groups']) 423 | 424 | calcced = calcced.fillna(int(0)) 425 | 426 | final_DataFrame = (calcced[self.query['groups'] + 427 | self.query['counttype']]) 428 | 429 | return final_DataFrame 430 | 431 | def execute(self): 432 | 433 | method = self.query['method'] 434 | logging.debug("Preparing to execute with method '{}'".format(method)) 435 | fmt = self.query['format'] if 'format' in self.query else False 436 | 437 | if method == 'data' or method == 'schema' or method == 'search': 438 | version = 2 439 | if fmt in ['json_c', 'search', 'html', 'csv', 'tsv']: 440 | version = 3 441 | else: 442 | version = 1 443 | 444 | if version == 1: 445 | # What to do with multiple search_limits 446 | if isinstance(self.query['search_limits'], list): 447 | if method in ["json", "return_json"]: 448 | self.query['method'] = 'data' 449 | self.query['format'] = 'json' 450 | return self.multi_execute(version=version) 451 | else: 452 | # Only return first search limit if not return in json 453 | self.query['search_limits'] = self.query['search_limits'][0] 454 | 455 | form = method[7:] if method[:6] == 'return' else method 456 | 457 | logging.warning("method == \"%s\" is deprecated. Use method=\"data\" " 458 | "with format=\"%s\" instead." 
% (method, form)) 459 | 460 | if method == "return_json" or method == "json": 461 | self.query['method'] = 'data' 462 | self.query['format'] = 'json' 463 | return self.return_json(version=1) 464 | 465 | elif method == "return_csv" or method == "csv": 466 | self.query['method'] = 'data' 467 | self.query['format'] = 'json' 468 | frame = self.data() 469 | return frame.to_csv(path_or_buf = None, sep="\t", encoding="utf8", index=False, 470 | quoting=csv.QUOTE_NONE, escapechar="\\") 471 | elif version >= 2: 472 | try: 473 | # What to do with multiple search_limits 474 | 475 | if isinstance(self.query['search_limits'], list): 476 | if fmt == "json" or version >= 3: 477 | frame = self.multi_execute(version = version) 478 | else: 479 | # Only return first search limit if not return in json 480 | self.query['search_limits'] = self.query['search_limits'][0] 481 | else: 482 | frame = self.data() 483 | 484 | if fmt == "json": 485 | return self.return_json(version=2) 486 | 487 | if fmt == "csv": 488 | return frame.to_csv(encoding="utf8", index=False) 489 | 490 | if fmt == "tsv": 491 | return frame.to_csv(sep="\t", encoding="utf8", index=False) 492 | 493 | if fmt == "feather": 494 | fout = io.BytesIO(b'') 495 | try: 496 | frame.to_feather(fout) 497 | except: 498 | logging.warning("You need the pyarrow package installed to export as feather.") 499 | raise 500 | fout.seek(0) 501 | return fout.read() 502 | 503 | if fmt == 'json_c': 504 | return self.return_rle_json(frame) 505 | 506 | if fmt == 'html': 507 | return self.html(frame) 508 | 509 | else: 510 | err = dict(status="error", code=200, 511 | message="Only formats in ['csv', 'tsv', 'json', 'feather', 'json_c', 'html']" 512 | " currently supported") 513 | return json.dumps(err) 514 | except BookwormException as e: 515 | # Error status codes are HTTP codes 516 | # http://www.restapitutorial.com/httpstatuscodes.html 517 | err = e.args[0] 518 | err['status'] = "error" 519 | return json.dumps(err) 520 | except Exception as ex: 521 | # General Uncaught error. 522 | logging.exception("{}".format(ex)) 523 | logging.exception("Database error") 524 | return json.dumps({"status": "error", "message": "Database error. " 525 | "Try checking field names."}) 526 | 527 | # Temporary catch-all pushes to the old methods: 528 | if method in ["returnPossibleFields", "search_results", 529 | "return_books", "schema"]: 530 | try: 531 | query = userquery(self.query) 532 | if method == "return_books": 533 | return query.execute() 534 | return json.dumps(query.execute()) 535 | except Exception as e: 536 | if len(e.args) > 1 and str(e.args[1]).startswith("Unknown database"): 537 | return "No such bookworm {}".format(str(e.args[1]).replace("Unknown database","")) 538 | except: 539 | return "General error" 540 | 541 | def multi_execute(self, version=1): 542 | 543 | """ 544 | Queries may define several search limits in an array 545 | if they use the return_json method.
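        A sketch of such a query (the year values are illustrative):

            {"search_limits": [{"year": 1900}, {"year": 1910}], ...}

        Under version 3, the per-limit result frames are stacked into one
        frame, and a "Search" column records the index (0, 1, ...) of the
        limit that produced each row.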
546 | """ 547 | 548 | if version <= 2: 549 | returnable = [] 550 | for limits in self.query['search_limits']: 551 | child = deepcopy(self.query) 552 | child['search_limits'] = limits 553 | q = self.__class__(child).return_json(raw_python_object=True, 554 | version=version) 555 | returnable.append(q) 556 | return self._prepare_response(returnable, version) 557 | 558 | if version == 3: 559 | for i, limits in enumerate(self.query['search_limits']): 560 | child = deepcopy(self.query) 561 | child['search_limits'] = limits 562 | f = self.__class__(child).data() 563 | f['Search'] = i 564 | if i == 0: 565 | frame = f 566 | else: 567 | frame = frame.append(f, ignore_index = True) 568 | return frame 569 | 570 | 571 | def html(self, data): 572 | """ 573 | Return data in column-oriented format with run-length encoding 574 | on duplicate values. 575 | """ 576 | 577 | if isinstance(data, Series) and 'status' in data: 578 | # If data has a status, Bookworm is trying to send us an error 579 | return data.to_json() 580 | 581 | set_option('display.max_colwidth', -1) 582 | return data.to_html(escape = False, index = False) 583 | 584 | 585 | def return_rle_json(self, data): 586 | """ 587 | Return data in column-oriented format with run-length encoding 588 | on duplicate values. 589 | """ 590 | 591 | if isinstance(data, Series) and 'status' in data: 592 | # If data has a status, Bookworm is trying to send us an error 593 | return data.to_json() 594 | 595 | output = {'status':'success', 'data':{}} 596 | 597 | for k in data: 598 | series = data[k] 599 | output['data'][k] = rle(data[k].tolist()) 600 | 601 | return json.dumps(output) 602 | 603 | 604 | def return_json(self, raw_python_object=False, version=1): 605 | ''' 606 | Get JSON data for a single search_limit. 607 | 608 | version: 1 returns just the data, using method = return_json. 609 | 2 formats the response according to the JSend spec. 610 | ''' 611 | query = self.query 612 | data = self.data() 613 | 614 | if isinstance(data, Series) and 'status' in data: 615 | # If data has a status, Bookworm is trying to send us an error 616 | return data.to_json() 617 | 618 | def fixNumpyType(input): 619 | # This is, weirdly, an occasional problem but not a constant one. 620 | if type(input) is np.int64: 621 | return int(input) 622 | else: 623 | return input 624 | 625 | # Define a recursive structure to hold the stuff. 626 | def tree(): 627 | return defaultdict(tree) 628 | returnt = tree() 629 | 630 | for row in data.itertuples(index=False): 631 | row = list(row) 632 | destination = returnt 633 | if len(row) == len(query['counttype']): 634 | returnt = [fixNumpyType(num) for num in row] 635 | while len(row) > len(query['counttype']): 636 | key = row.pop(0) 637 | if len(row) == len(query['counttype']): 638 | # Assign the elements. 639 | try: 640 | row = [ 641 | r if np.isfinite(row) 642 | else None 643 | for r in row 644 | ] 645 | except: 646 | logging.warning(row) 647 | pass 648 | destination[key] = row 649 | break 650 | # This bit of the loop is where we descend the recursive 651 | # dictionary. 
652 | destination = destination[key] 653 | if raw_python_object: 654 | return returnt 655 | else: 656 | return self._prepare_response(returnt, version) 657 | 658 | def _prepare_response(self, data, version=1): 659 | if version == 1: 660 | resp = data 661 | elif version == 2: 662 | resp = dict(status="success", data=data) 663 | else: 664 | resp = dict(status="error", 665 | data="Internal error: unknown response version") 666 | 667 | 668 | return json.dumps(resp) 669 | 670 | 671 | 672 | 673 | class oldSQLAPIcall(APIcall): 674 | """ 675 | To make a new backend for the API, you just need to extend the base API 676 | call class like this. 677 | 678 | This one is comically short because all the real work is done in the 679 | userquery object. 680 | 681 | But the point is, you need to define a function "generate_pandas_frame" 682 | that accepts an API call and returns a pandas frame. 683 | 684 | But that API call is more limited than the general API; you only need to 685 | support "WordCount" and "TextCount" methods. 686 | """ 687 | 688 | def generate_pandas_frame(self, call = None): 689 | """ 690 | 691 | This is a good example of the query that actually fetches the results. 692 | It creates some SQL, runs it, and returns it as a pandas DataFrame. 693 | 694 | The actual SQL production is handled by the userquery class, which uses 695 | more legacy code. 696 | 697 | """ 698 | 699 | if call is None: 700 | call = self.query 701 | 702 | con = DbConnect(prefs, self.query['database']) 703 | q = userquery(call).query() 704 | df = read_sql(q, con.db) 705 | return df 706 | 707 | class SQLAPIcall(APIcall): 708 | """ 709 | To make a new backend for the API, you just need to extend the base API 710 | call class like this. 711 | 712 | This one is comically short because all the real work is done in the 713 | userquery object. 714 | 715 | But the point is, you need to define a function "generate_pandas_frame" 716 | that accepts an API call and returns a pandas frame. 717 | 718 | But that API call is more limited than the general API; you only need to 719 | support "WordCount" and "TextCount" methods. 720 | """ 721 | 722 | def generate_pandas_frame(self, call = None): 723 | """ 724 | 725 | This is a good example of the query that actually fetches the results. 726 | It creates some SQL, runs it, and returns it as a pandas DataFrame. 727 | 728 | The actual SQL production is handled by the userquery class, which uses 729 | more legacy code.
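        A hedged usage sketch (the database name is only an example
        borrowed from the schema documentation):

            call = {"database": "federalist_bookworm", "method": "data",
                    "search_limits": {}, "groups": [], "counttype": ["WordCount"]}
            df = SQLAPIcall(call).generate_pandas_frame()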
730 | 731 | """ 732 | 733 | if call is None: 734 | call = self.query 735 | con = DbConnect(prefs, self.query['database']) 736 | q = Query(call).query() 737 | logging.debug("Preparing to execute {}".format(q)) 738 | df = read_sql(q, con.db) 739 | logging.debug("Query retrieved") 740 | return df 741 | 742 | -------------------------------------------------------------------------------- /bookwormDB/json_schema.py: -------------------------------------------------------------------------------- 1 | 2 | from .schema_primitives import * 3 | 4 | base_schema = { 5 | "definitions": { 6 | 7 | }, 8 | "type": "object", 9 | "title": "Bookworm Query Schema", 10 | "required": [ 11 | "database", 12 | "method", 13 | "format", 14 | "search_limits", 15 | "groups", 16 | "counttype" 17 | ], 18 | "properties": { 19 | "method": method_schema, 20 | "format": format_schema, 21 | "database": { 22 | "type": "string", 23 | "title": "The Database Schema", 24 | "description": "The name of the database to search in.", 25 | "examples": [ 26 | "federalist_bookworm", 27 | "hathipd" 28 | ], 29 | "pattern": "^([^ ]+)$" 30 | }, 31 | "search_limits": { 32 | "$id": "#/properties/search_limits", 33 | "type": "object", 34 | "description": "A set of constraints to create a corpus. If an array, each will be treated as a grouping field for results and a new key, 'Search,' will be returned." 35 | }, 36 | "compare_limits": { 37 | "$id": "#/properties/compare_limits", 38 | "type": "object", 39 | "description": "The definition of a full corpus against which to run comparisons. In general, this will be automatically inferred from the search_limits field by dropping the 'word' limit.", 40 | }, 41 | "groups": { 42 | "$id": "#/properties/groups", 43 | "type": "array", 44 | "items": { 45 | "$id": "#/properties/groups/items", 46 | "type": "string", 47 | "default": "", 48 | "examples": [ 49 | "author", 50 | "date_day_year" 51 | ], 52 | "pattern": "^(.*)$" 53 | } 54 | }, 55 | "counttype": counts_schema 56 | } 57 | } 58 | 59 | class DataQuerySchema(dict): 60 | """ 61 | A JSON schema for valid queries. 62 | """ 63 | def __init__(self, con): 64 | dict.__init__(self, base_schema) 65 | self.set_base_elements() 66 | 67 | def set_base_elements(self): 68 | pass 69 | 70 | def validate(self, query): 71 | pass 72 | -------------------------------------------------------------------------------- /bookwormDB/manager.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import re 3 | from subprocess import call 4 | from subprocess import Popen 5 | import logging 6 | import sys 7 | import os 8 | import bookwormDB 9 | import argparse 10 | 11 | """ 12 | This is the code that actually gets run from the command-line executable. 13 | 14 | The BookwormManager class defines some methods for controlling bookworm SQL instances 15 | and running upkeep operations; 16 | the run_arguments function pulls commands from the command line. Any useful new bookworm methods 17 | should be passed through run_arguments to work. 18 | 19 | 20 | Some modules, especially bookworm-specific ones, 21 | are imported inline in the code here--that substantially 22 | (as in, 1 second to 0.2 seconds) reduces startup time 23 | for the command-line executable, 24 | even though it's not best practice otherwise. 
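A typical session, then, chains the subcommands defined below, e.g.:

    bookworm --log-level info init
    bookworm build all

(Both subcommands are wired up in run_arguments at the bottom of this file.)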
25 | """ 26 | 27 | class BookwormManager(object): 28 | """ 29 | This class is passed some options that tell it the name of the bookworm it's working on; 30 | some of the methods here are the directly callable as the command line arguments. 31 | section'client' 32 | This is what calls the various other bookworm scripts, whether Python or not. 33 | """ 34 | 35 | def __init__(self, cnf_file=None, database=None): 36 | 37 | # This will likely be changed if it isn't None. 38 | import configparser 39 | 40 | self.basedir = None 41 | self.dbname = None 42 | for i in range(10): 43 | basedir = "../"*i 44 | if os.path.exists(basedir + ".bookworm"): 45 | self.basedir = basedir 46 | break 47 | if self.basedir==None: 48 | logging.debug("No bookworm directory found; hopefully this isn't a build call.") 49 | 50 | if cnf_file is not None: 51 | config = configparser.ConfigParser(allow_no_value=True) 52 | config.read([cnf_file]) 53 | if config.has_section("client"): 54 | """ 55 | Silently go along if the config doesn't exist. 56 | """ 57 | try: 58 | self.dbname = config.get("client", "database") 59 | except configParser.NoOptionError: 60 | pass 61 | 62 | # More specific options override the config file 63 | if database is not None: 64 | # Passed in dbname takes precedence over config file. 65 | self.dbname = database 66 | 67 | def config(self,args): 68 | """ 69 | Performs useful configuration tasks, such as setting up a MySQL installation. 70 | """ 71 | if args.target=="mysql": 72 | import bookwormDB.configuration 73 | bookwormDB.configuration.recommend_my_cnf() 74 | if args.target=="mysql-info": 75 | from bookwormDB.configuration import Configfile 76 | config = Configfile("admin") 77 | print("The admin configuration login currently being used should be the following.\n") 78 | config.write_out() 79 | if args.target=="apache": 80 | from bookwormDB.configuration import apache 81 | apache() 82 | 83 | def ftokenize(self, args): 84 | 85 | import bookwormDB.tokenizer 86 | 87 | """ 88 | Handle functions related to tokenization and encoding. 89 | 90 | Should eventually be able to accept arguments like "token-regex" 91 | and already-tokenized documents. 92 | """ 93 | 94 | if args.process == "encode": 95 | self.encoded(args) 96 | 97 | if args.process == "text_stream" or args.process == "token_stream": 98 | raise NotImplementedError("This feature has been removed") 99 | 100 | if args.process == "word_db": 101 | self.wordlist(args) 102 | 103 | def init(self, args): 104 | """ 105 | Initialize the current directory as a bookworm directory. 106 | """ 107 | # Create a configuration file 108 | if not args.force: 109 | if os.path.exists(".bookworm"): 110 | logging.error(""" 111 | You already have a folder named '.bookworm'. 112 | Probably you've already initialized a Bookworm here. 
113 | """) 114 | return 115 | if not os.path.exists("bookworm.cnf"): 116 | fout = open("bookworm.cnf", "w") 117 | if self.dbname: 118 | loc = self.dbname 119 | else: 120 | loc = os.path.relpath(".", "..") 121 | print("Configuring Bookworm named '{}'".format(loc)) 122 | print("Change the file at bookworm.cnf if this is undesirable".format(loc)) 123 | fout.write("[client]\ndatabase = {}\n".format(loc)) 124 | else: 125 | fout = open("bookworm.cnf", "w") 126 | loc = os.path.relpath(".", "..") 127 | print("Configuring Bookworm named '{}'".format(loc)) 128 | print("Change the file at bookworm.cnf if this is undesirable".format(loc)) 129 | fout.write("[client]\ndatabase = {}\n".format(loc)) 130 | 131 | def query(self, args): 132 | """ 133 | Run a query against the API from the command line. 134 | """ 135 | 136 | from bookwormDB.general_API import SQLAPIcall 137 | import json 138 | 139 | query = json.loads(args.APIcall) 140 | caller = SQLAPIcall(query) 141 | print(caller.execute()) 142 | 143 | def serve(self,args): 144 | 145 | """ 146 | Serve the api. 147 | """ 148 | 149 | from bookwormDB.wsgi import run 150 | run(args.bind, args.workers) 151 | 152 | import http.server 153 | from http.server import HTTPServer 154 | import shutil 155 | 156 | base_dir = args.dir 157 | base_cgi_dir = os.path.normpath(base_dir + "/" + "cgi-bin") 158 | d3_dir = os.path.normpath(base_dir + "/" + "D3") 159 | for dir in [base_dir,base_cgi_dir]: 160 | if not os.path.exists(dir): 161 | os.makedirs(dir) 162 | 163 | API = os.path.normpath(os.path.dirname(bookwormDB.__file__) + "/bin/dbbindings.py") 164 | if not os.path.exists(base_cgi_dir + "/" + API): 165 | shutil.copy(API, base_cgi_dir) 166 | 167 | if not os.path.exists(d3_dir): 168 | call(["git","clone","http://github.com/bmschmidt/BookwormD3",d3_dir]) 169 | 170 | # Use the Makefile to build the linechartGUI. This is a little Rube Goldberg-y. 171 | args.target="linechartGUI" 172 | 173 | raise TypeError("The line below this is nonsense") 174 | self.prep(args) 175 | 176 | os.chdir(base_dir) 177 | # Actually serve it. 178 | PORT = args.port 179 | 180 | httpd = HTTPServer(("", PORT), http.server.CGIHTTPRequestHandler) 181 | 182 | print("\n\n" + "****"*20) 183 | print("A local bookworm server is now running") 184 | print("You can now view some charts in a web-browser at http://localhost:%d/D3" % PORT) 185 | print("If you have a time variable, linecharts are at http://localhost:%d/%s" % (PORT,self.dbname)) 186 | print("Please note that this is not a very secure way: if you plan to put your bookworm") 187 | print("on the open web, consider using apache.") 188 | httpd.serve_forever() 189 | 190 | 191 | def extension(self,args): 192 | """ 193 | Creates (or updates) an extension 194 | """ 195 | 196 | if not os.path.exists(self.basedir + ".bookworm/extensions"): 197 | os.makedirs(self.basedir + ".bookworm/extensions") 198 | 199 | my_extension = Extension(args,basedir = self.basedir) 200 | my_extension.clone_or_pull() 201 | my_extension.make() 202 | 203 | def build(self, args): 204 | self.prep(args) 205 | 206 | def prep(self, args): 207 | """ 208 | This is a wrapper to all the functions define here: the purpose 209 | is to continue to allow access to internal methods in, for instance, 210 | the Makefile, without documenting all of them in separate functions. 211 | 212 | That's a little groaty, I know. 213 | """ 214 | logging.debug(args) 215 | 216 | getattr(self, args.goal)(args) 217 | 218 | def wordlist(self, args): 219 | """ 220 | Create a wordlist of the top 1.5 million words. 
221 | """ 222 | from .countManager import create_wordlist 223 | if os.path.exists(".bookworm/texts/wordlist/wordlist.txt"): 224 | return 225 | try: 226 | os.makedirs(".bookworm/texts/wordlist") 227 | except FileExistsError: 228 | pass 229 | 230 | input = "input.txt" 231 | if args.feature_counts: 232 | logging.info(args.feature_counts) 233 | input = [a for a in args.feature_counts if 'unigrams' in a][0] 234 | create_wordlist(n = 1.5e06, 235 | input = input, 236 | output = ".bookworm/texts/wordlist/wordlist.txt") 237 | 238 | def pristine(self, args): 239 | 240 | import bookwormDB.CreateDatabase 241 | bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname, variableFile=None) 242 | if self.dbname == "mysql": 243 | raise NameError("Don't try to delete the mysql database") 244 | bookworm.db.query("DROP DATABASE IF EXISTS {}".format(self.dbname)) 245 | 246 | def encoded(self, args): 247 | """ 248 | Using the wordlist and catalog, create encoded files. 249 | """ 250 | self.wordlist(args) 251 | self.derived_catalog(args) 252 | 253 | for k in ['unigrams', 'bigrams', 'trigrams', 'quadgrams', 'completed']: 254 | try: 255 | os.makedirs(".bookworm/texts/encoded/{}".format(k)) 256 | except FileExistsError: 257 | pass 258 | from .countManager import encode_words 259 | 260 | if args.feature_counts: 261 | for feature in args.feature_counts: 262 | encode_words(".bookworm/texts/wordlist/wordlist.txt", feature) 263 | else: 264 | encode_words(".bookworm/texts/wordlist/wordlist.txt", "input.txt") 265 | 266 | def all(self, args): 267 | self.preDatabaseMetadata(args) 268 | self.encoded(args) 269 | self.database_wordcounts(args) 270 | self.database_metadata(args) 271 | 272 | def preDatabaseMetadata(self, args=None, **kwargs): 273 | import os 274 | if not os.path.exists("field_descriptions.json"): 275 | self.guessAtFieldDescriptions() 276 | self.derived_catalog(args) 277 | import bookwormDB.CreateDatabase 278 | # Doesn't need a created database yet, just needs access 279 | # to some pieces. 280 | Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase() 281 | logging.info("Writing metadata to new catalog file...") 282 | Bookworm.variableSet.writeMetadata() 283 | 284 | # This creates helper files in the /metadata/ folder. 285 | 286 | def derived_catalog(self, args): 287 | 288 | if not os.path.exists(".bookworm/metadata"): 289 | os.makedirs(".bookworm/metadata") 290 | if os.path.exists(".bookworm/metadata/jsoncatalog_derived.txt"): 291 | return 292 | 293 | from bookwormDB.MetaParser import parse_catalog_multicore, ParseFieldDescs 294 | 295 | logging.debug("Preparing to write field descriptions") 296 | ParseFieldDescs(write = True) 297 | logging.debug("Preparing to write catalog") 298 | parse_catalog_multicore() 299 | 300 | def guessAtFieldDescriptions(self, args = None, **kwargs): 301 | 302 | """ 303 | Use a number of rules of thumb to automatically generate a field_descriptions.json file. 304 | This may bin some categories incorrectly (depending on names, for example it may treat dates 305 | as either categorical or time variables). 
306 | """ 307 | 308 | import bookwormDB.CreateDatabase 309 | import json 310 | Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname, variableFile=None) 311 | Bookworm.setVariables("jsoncatalog.txt", jsonDefinition=None) 312 | import os 313 | if not os.path.exists("field_descriptions.json"): 314 | output = open("field_descriptions.json","w") 315 | guess = json.dumps(Bookworm.variableSet.guessAtFieldDescriptions(), indent = 2) 316 | logging.warning("Creating guess for field descriptions at: {}".format(guess)) 317 | output.write(guess) 318 | else: 319 | logging.error(""" 320 | You already have a file at field_descriptions.json 321 | Dying rather than overwrite it. 322 | """) 323 | sys.exit() 324 | 325 | def reload_memory(self,args): 326 | import bookwormDB.CreateDatabase 327 | dbnames = [self.dbname] 328 | if args.all==True: 329 | dbnames = [] 330 | datahandler = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname,variableFile=None) 331 | cursor = datahandler.db.query("SELECT TABLE_SCHEMA FROM information_schema.tables WHERE TABLE_NAME='masterTableTable'") 332 | for row in cursor.fetchall(): 333 | dbnames.append(row[0]) 334 | logging.info("The following databases are bookworms to be reloaded:") 335 | for name in dbnames: 336 | logging.info("\t" + name) 337 | 338 | for database in dbnames: 339 | logging.info("Reloading memory tables for %s" %database) 340 | Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(database,variableFile=None) 341 | Bookworm.reloadMemoryTables(force=args.force) 342 | 343 | def database_metadata(self, args): 344 | import bookwormDB.CreateDatabase 345 | logging.debug("creating metadata db") 346 | Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname) 347 | Bookworm.variableSet.loadMetadata() 348 | 349 | logging.debug("creating metadata variable tables") 350 | 351 | # This creates a table in the database that makes the results of 352 | # field_descriptions accessible through the API, and updates the 353 | 354 | Bookworm.loadVariableDescriptionsIntoDatabase() 355 | 356 | 357 | Bookworm.create_fastcat_and_wordsheap_disk_tables() 358 | 359 | # The temporary memory tables are no longer automatically created on a build. 360 | # To create them, use `bookworm reload_memory`. 361 | # Bookworm.reloadMemoryTables() 362 | 363 | #print "adding cron job to automatically reload memory tables on launch" 364 | #print "(this assumes this machine is the MySQL server, which need not be the case)" 365 | #call(["sh","scripts/scheduleCronJob.sh"]) 366 | Bookworm.jsonify_data() # Create the self.dbname.json file in the root directory. 367 | Bookworm.create_API_settings() 368 | 369 | Bookworm.grantPrivileges() 370 | 371 | def add_metadata(self, args): 372 | import bookwormDB.CreateDatabase 373 | import bookwormDB.convertTSVtoJSONarray 374 | bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname,None) 375 | anchorField = args.key 376 | if args.format == "tsv": 377 | # TSV is just converted into JSON in a file at tmp.txt, and slurped in that way. 378 | if args.key is None: 379 | args.key = open(args.file).readline().split("\t")[0] 380 | f = "tmp.txt" 381 | bookwormDB.convertTSVtoJSONarray.convertToJSON(args.file, f) 382 | args.file = f 383 | 384 | bookworm.importNewFile(args.file, 385 | anchorField=args.key, 386 | jsonDefinition=args.field_descriptions) 387 | 388 | 389 | def database_wordcounts(self, args = None, **kwargs): 390 | """ 391 | Builds the wordcount components of the database. 
This will die 392 | if you can't connect to the database server. 393 | """ 394 | cmd_args = args 395 | import bookwormDB.CreateDatabase 396 | 397 | index = True 398 | reverse_index = True 399 | ingest = True 400 | newtable = True 401 | 402 | if cmd_args and hasattr(cmd_args, "index_only"): 403 | if cmd_args.index_only: 404 | ingest = False 405 | newtable = False 406 | else: 407 | index = not cmd_args.no_index 408 | newtable = not cmd_args.no_delete 409 | reverse_index = not cmd_args.no_reverse_index 410 | if not (newtable and ingest and index): 411 | logging.warning("database_wordcounts args not supported for bigrams yet.") 412 | 413 | Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname) 414 | Bookworm.load_word_list() 415 | Bookworm.create_unigram_book_counts(newtable=newtable, ingest=ingest, index=index, reverse_index=reverse_index) 416 | Bookworm.create_bigram_book_counts() 417 | 418 | class Extension(object): 419 | 420 | """ 421 | A bookworm extension. Initialized with an args object, 422 | which has the element url, the location of a clonable git repo. 423 | 424 | Because I don't want people to have to write extensions in python, 425 | they are built using `make`. 426 | """ 427 | 428 | def __init__(self,args,basedir="./"): 429 | self.args = args 430 | self.dir = basedir + ".bookworm/extensions/" + re.sub(".*/","",self.args.url) 431 | 432 | def clone_or_pull(self): 433 | if not os.path.exists(self.dir): 434 | logging.info("cloning git repo from " + self.args.url) 435 | call(["git","clone",self.args.url,self.dir]) 436 | else: 437 | logging.info("updating pre-existing git repo at " + self.dir) 438 | Popen(["git","pull"],cwd=self.dir) 439 | 440 | def make(self): 441 | logging.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 442 | logging.debug("Running make in " + self.dir) 443 | Popen(["make"], cwd=self.dir) 444 | 445 | # Initiate MySQL connection. 446 | 447 | 448 | # Pull a method from command line input. 449 | 450 | def run_arguments(): 451 | """ 452 | Parse the command line arguments and run them. 453 | 454 | The actual running is handled by an instance of the class `BookwormManager`, 455 | which calls all bookworm-related arguments; that, in turn, calls some specific 456 | methods to make things happen (the most important of which is the `BookwormDB` 457 | class, which is in charge of MySQL calls). 458 | 459 | I apologize for how ugly and linear this code is: it's not clear to me 460 | how to write pretty modular code with the argparse module. 461 | You just end up with a bunch of individual add argument lines that are full of random text. 462 | Refactoring pull requests welcome. 463 | """ 464 | 465 | parser = argparse.ArgumentParser(description='Build and maintain a Bookworm database.',prog="bookworm") 466 | parser.add_argument("--configuration","-c",help="The name of the configuration file to read options from: by default, 'bookworm.cnf' in the current directory.", default="bookworm.cnf") 467 | 468 | parser.add_argument("--database","-d",help="The name of the bookworm database in MySQL to connect to: by default, read from the active configuration file.", default=None) 469 | 470 | parser.add_argument("--log-level","-l", help="The logging detail to use for errors.
Default is 'warning', only significant problems; 'info' gives a fuller record, and 'debug' dumps many MySQL queries, etc.",choices=["warning","info","debug"],type=str.lower,default="warning") 471 | 472 | 473 | parser.add_argument("--feature-counts", action='append', 474 | help="Use pre-calculated feature counts rather than tokenizing complete text on the fly. Supply any number of single files per count level like 'input.unigrams', 'input.bigrams', etc.") 475 | 476 | parser.add_argument("--ngrams",nargs="+",default=["unigrams","bigrams"],help="What levels to parse with. Multiple arguments should be unquoted, separated by spaces. This option currently does nothing.") 477 | 478 | 479 | # Use subparsers to have an action syntax, like git. 480 | subparsers = parser.add_subparsers(title="action", help='The commands to run with Bookworm', dest="action") 481 | 482 | 483 | 484 | ############# build ################# 485 | build_parser = subparsers.add_parser("build",description = "Create files",help="""Build up the component parts of a Bookworm.\ 486 | 487 | If you specify something far along the line (for instance, the linechart GUI), it will\ 488 | build all prior files as well.""") 489 | 490 | build_parser.add_argument("target", help="The make target that you want to build. To build a full bookworm, type 'build all'.") 491 | 492 | # Grep out all possible targets from the Makefile 493 | 494 | ############# supplement ################# 495 | supplement_parser = subparsers.add_parser("add_metadata",help="""Supplement the\ 496 | metadata for an already-created Bookworm with new items. They can be keyed to any field already in the database.""") 497 | supplement_parser.add_argument("-f","--file",help="""The location of a file with additional metadata to incorporate into your bookworm.""",required=True) 498 | 499 | supplement_parser.add_argument( 500 | "--format", 501 | help="""The file format of the new metadata.\ 502 | Must be "json" or "tsv". For JSON, the format is the same as the default\ 503 | jsoncatalog.txt (a text file of json lines, each corresponding to a metadata field);\ 504 | for TSV, a tsv the first line of which is column names,\ 505 | and the first column of which is a shared key (like filename). The TSV format,\ 506 | particularly without field descriptions, is much easier to use, but doesn't\ 507 | permit multiple values for the same key.""", 508 | default="json",type=str.lower,choices=["tsv","json"]) 509 | 510 | supplement_parser.add_argument("--key",help="""The name of the key. If not specified and input type is TSV, the first column is used.""",default=None) 511 | supplement_parser.add_argument("--field_descriptions","-d",help="""A description of the new metadata in the format of "field_descriptions.json"; if empty, we'll just guess at some suitable values.""",default=None) 512 | 513 | ######### Reload Memory ############# 514 | memory_tables_parser = subparsers.add_parser("reload_memory",help="Reload the memory\ 515 | tables for the designated Bookworm; this must be done after every MySQL restart") 516 | memory_tables_parser.add_argument("--force-reload",dest="force",action="store_true", 517 | help="Force reload on all memory tables. Use\ 518 | '--skip-reload' for faster execution. Off by default\ 519 | .") 520 | memory_tables_parser.add_argument("--skip-reload",dest="force",action="store_false", 521 | help="Don't reload memory tables which have at least\ 522 | one entry in them. Significantly faster, but may produce\ 523 | bad results if the underlying tables have been\ 524 | changed.
Good for maintenance, bad for actively updated\ 525 | installations.") 526 | memory_tables_parser.set_defaults(force=False) 527 | memory_tables_parser.add_argument("--all",action="store_true",default=False, 528 | help="Search for all bookworm installations on\ 529 | the server, and reload memory tables for each of them.") 530 | 531 | 532 | ########## Clone and run extensions 533 | extensions_parser = subparsers.add_parser("extension", help="Install extensions to the current directory") 534 | extensions_parser.add_argument("url",help="A cloneable url for the extension you want to pull: passed as an argument to 'git clone', so it may use either the https protocol or the git protocol") 535 | 536 | 537 | ########## Run a query 538 | extensions_parser = subparsers.add_parser("query", help="Run a query using the Bookworm API") 539 | extensions_parser.add_argument("APIcall",help="The json-formatted query to be run.") 540 | 541 | 542 | ########## Build components 543 | extensions_parser = subparsers.add_parser("prep", help="Build individual components.", aliases = ['build']) 544 | extensions_subparsers = extensions_parser.add_subparsers(title="goal", help="The name of the target.", dest="goal") 545 | 546 | # Bookworm prep targets that allow additional args 547 | catalog_prep_parser = extensions_subparsers.add_parser("preDatabaseMetadata", 548 | help=getattr(BookwormManager, "preDatabaseMetadata").__doc__) 549 | 550 | word_ingest_parser = extensions_subparsers.add_parser("database_wordcounts", 551 | help=getattr(BookwormManager, "database_wordcounts").__doc__) 552 | word_ingest_parser.add_argument("--no-delete", action="store_true", help="Do not delete and rebuild the token tables. Useful for a partially finished ingest.") 553 | 554 | word_ingest_parser.add_argument("--no-reverse-index", action="store_true", help="When creating the table, choose not to index bookid/wordid/counts. This is useful for really large builds. Because this is specified at table creation time, it does nothing with --no-delete or --index-only.") 555 | 556 | word_ingest_parser.add_argument("--no-index", action="store_true", help="Do not re-enable keys after ingesting tokens. Only do this if you intend to manually enable keys or will run this command again.") 557 | 558 | word_ingest_parser.add_argument("--index-only", action="store_true", help="Only re-enable keys. Supersedes other flags.") 559 | 560 | # Bookworm prep targets that don't allow additional args 561 | for prep_arg in BookwormManager.__dict__.keys(): 562 | extensions_subparsers.add_parser(prep_arg, help=getattr(BookwormManager, prep_arg).__doc__) 563 | 564 | """ 565 | Some special functions 566 | """ 567 | 568 | init_parser = subparsers.add_parser("init",help="Initialize the current directory as a bookworm directory") 569 | init_parser.add_argument("--force","-f",help="Overwrite some existing files.",default=False,action="store_true") 570 | init_parser.add_argument("--yes","-y",help="Automatically use default values with no prompts",default=False,action="store_true") 571 | 572 | 573 | # Serve the current bookworm 574 | 575 | serve_parser = subparsers.add_parser("serve", 576 | help="Serve the bookworm. By default this is an API endpoint, " 577 | "served over gunicorn, or (not yet supported) a full installation. You might want to wrap " 578 | "the gunicorn endpoint behind a more powerful webserver like apache or nginx.") 579 | 580 | serve_parser.add_argument("--full-site", action = "store_true", help="Serve a webpage as well as a query endpoint?
Not active.") 581 | 582 | serve_parser.add_argument("--bind", "-b", default="10012", help="The port over which to serve the bookworm",type=int) 583 | 584 | serve_parser.add_argument("--workers", "-w", default="0", help="How many gunicorn worker threads to launch for the API. Reduce if you're seeing memory issues.",type=int) 585 | 586 | serve_parser.add_argument("--dir","-d",default="http_server",help="A filepath for a directory to serve from. Will be created if it does not exist.") 587 | 588 | 589 | 590 | # Configure the global server. 591 | configure_parser = subparsers.add_parser("config",help="Some helpers to configure a running bookworm, or to manage your server-wide configuration.") 592 | configure_parser.add_argument("target",help="The thing you want help configuring.",choices=["mysql", "mysql-info", "apache"]) 593 | configure_parser.add_argument("--users",nargs="+",choices=["admin","global","root"],help="The user levels you want to act on.",default=["admin","global"]) 594 | configure_parser.add_argument("--force","-f",help="Overwrite existing configurations in potentially bad ways.",action="store_true",default=False) 595 | 596 | # Call the function 597 | args = parser.parse_args() 598 | # Set the logging level based on the input. 599 | numeric_level = getattr(logging, args.log_level.upper(), None) 600 | if not isinstance(numeric_level, int): 601 | raise ValueError('Invalid log level: %s' % args.log_level) 602 | # While we're at it, log with line numbers 603 | FORMAT = "[%(filename)s:%(lineno)s-%(funcName)s() %(asctime)s.%(msecs)03d] %(message)s" 604 | logging.basicConfig(format=FORMAT, level=numeric_level, datefmt="%I:%M:%S") 605 | logging.info("Info logging enabled.") 606 | logging.info("Debug logging enabled.") 607 | 608 | # Create the bookworm 609 | my_bookworm = BookwormManager(args.configuration, args.database) 610 | 611 | # Call the current action with the arguments passed in. 612 | getattr(my_bookworm,args.action)(args) 613 | -------------------------------------------------------------------------------- /bookwormDB/multiprocessingHelp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import psutil 3 | import logging 4 | 5 | def mp_stats(): 6 | try: 7 | cpus = len(os.sched_getaffinity(0)) 8 | except AttributeError: 9 | # Should be better OS X support than this. 
10 |         cpus = 6
11 | 
12 |     # Use the amount of free system memory (psutil's 'free' field) for the bounter.
13 |     memory = int(psutil.virtual_memory()[4])
14 | 
15 |     if memory < 1024:
16 |         logging.warning("Not much memory to work with--vocab may be inexact")
17 | 
18 |     return (cpus, memory)
19 | 
20 | def running_processes(workerlist):
21 |     running = False
22 |     for worker in workerlist:
23 |         if worker.is_alive():
24 |             running = True
25 |         else:
26 |             code = worker.exitcode
27 |             if code > 0:
28 |                 raise RuntimeError("Process died with code {}".format(code))
29 |     return running
30 | 
--------------------------------------------------------------------------------
/bookwormDB/schema_primitives.py:
--------------------------------------------------------------------------------
1 | from .general_API import Aggregator
2 | 
3 | agg_keys = list(Aggregator.__dict__.keys())
4 | agg_keys = [k for k in agg_keys if not k.startswith("_")]
5 | counts_schema = {
6 |     "$id": "#/properties/counttype",
7 |     "type": "array",
8 |     "items": {
9 |         "$id": "#/properties/counttype/items",
10 |         "type": "string",
11 |         "default": "WordCount",
12 |         "enum": agg_keys,
13 |         "pattern": "^(.*)$"
14 |     }
15 | }
16 | 
17 | method_schema = {
18 |     "type": "string",
19 |     "title": "Return Method",
20 |     "default": "data",
21 |     "enum": [
22 |         "data",
23 |         "schema",
24 |         "search"
25 |     ],
26 |     "pattern": "^(.*)$"
27 | }
28 | 
29 | format_schema = {
30 |     "description": "The return format requested from the API.",
31 |     "type": "string",
32 |     "title": "The Format Schema",
33 |     "default": "json_c",
34 |     "enum": [
35 |         "json_c",
36 |         "csv",
37 |         "tsv",
38 |         "feather",
39 |         "json",
40 |         "html"
41 |     ]
42 | }
--------------------------------------------------------------------------------
/bookwormDB/scripts/fast_featurecounter.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Usage:
3 | # fast_featurecounter.sh [infile] [tmpdir] [blocksize] [outfile]
4 | 
5 | # Important: need to set the locale in order to sort properly
6 | export LC_ALL=C
7 | infile=$1
8 | # Explicitly set tmp directory to better manage disk needs
9 | tmpdir=$2
10 | blocksize=$3
11 | outfile=$4
12 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
13 | tmpfile=tmp1-$RANDOM.txt
14 | 
15 | mkdir -p $tmpdir
16 | cat $infile |\
17 |   parallel --block $blocksize -j95% --pipe --files --tempdir $tmpdir \
18 |   awk '{print\ \$2\,\ \$3}' "|" sort "|" awk -f $DIR/mergecounted.awk >$tmpfile
19 | 
20 | echo $tmpfile
21 | 
22 | # We've processed the files in one big batch, but in all likelihood there are still too many
23 | # of them to glob all together and sort. So merge in batches of 30 and dedupe again.
24 | cat $tmpfile | parallel --files --tempdir $tmpdir -Xn30 -j95% \
25 |   sort -m {} "|" awk -f $DIR/mergecounted.awk ";" rm {} |\
26 |   parallel -Xj1 sort -m {} "|" awk -f $DIR/mergecounted.awk ";" rm {} |\
27 |   sort -n -r -k2 | awk 'BEGIN {i=0}{i+=1;print i " " $1 " " $2}' >$outfile # Format for bookworm
28 | 
29 | rm $tmpfile
--------------------------------------------------------------------------------
/bookwormDB/scripts/mergecounted.awk:
--------------------------------------------------------------------------------
1 | #!/usr/bin/awk -f
2 | # Awk script to merge sorted "word count" lines (space-separated).
3 | # Awk is used here for speed.
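    | # Illustrative example (not taken from the pipeline): given the sorted input
    | #   apple 2
    | #   apple 3
    | #   banana 1
    | # this emits the merged counts
    | #   apple 5
    | #   banana 1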
4 | BEGIN {start = 1;} { word = $1;
5 |     if (last == word) { sum += $2; }
6 |     else {
7 |         if (!start) print last " " sum
8 |         else start = 0; last=word; sum = $2;
9 |     }
10 | } END { print last " " sum }
--------------------------------------------------------------------------------
/bookwormDB/search_limits.py:
--------------------------------------------------------------------------------
1 | import MySQLdb
2 | 
3 | def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_joiner = " OR "):
4 |     whereterm = []
5 |     # The general idea here is that we try to break everything in search_limits down to a list, and then create a whereterm on that joined by whatever the 'joiner' is ("AND" or "OR"), with the comparison as whatever comp is ("=", ">=", etc.).
6 |     # For more complicated bits, it gets recursive until the bits are all in terms of lists.
7 |     if joiner is None:
8 |         joiner = " AND "
9 |     for key in list(myhash.keys()):
10 |         values = myhash[key]
11 |         if isinstance(values, (str, bytes)) or isinstance(values, int) or isinstance(values, float):
12 |             # This is just a convenience for humans: you can pass a single value instead of a list,
13 |             # and it will be converted to a list for you.
14 |             values = [values]
15 |         # "$or" queries are special, since the default joiner is "AND"; this toggles it to "OR" for a sub-clause.
16 | 
17 |         if key == "$or" or key == "$OR":
18 |             local_set = []
19 |             for comparison in values:
20 |                 local_set.append(where_from_hash(comparison, comp=comp))
21 |             whereterm.append(" ( " + " OR ".join(local_set) + " )")
22 |         elif key == '$and' or key == "$AND":
23 |             for comparison in values:
24 |                 whereterm.append(where_from_hash(comparison, joiner=" AND ", comp=comp))
25 |         elif isinstance(values, dict):
26 |             if joiner is None:
27 |                 joiner = " AND "
28 |             # Certain function operators can use MySQL terms.
29 |             # These are the only cases where a dict can be passed as a limitation.
30 |             operations = {"$gt":">", "$ne":"!=", "$lt":"<",
31 |                           "$grep":" REGEXP ", "$gte":">=",
32 |                           "$lte":"<=", "$eq":"="}
33 | 
34 |             for operation in list(values.keys()):
35 |                 if operation == "$ne":
36 |                     # If you pass a lot of $ne values, they must *all* be false.
37 |                     subjoiner = " AND "
38 |                 else:
39 |                     subjoiner = " OR "
40 |                 whereterm.append(where_from_hash({key:values[operation]}, comp=operations[operation], list_joiner=subjoiner))
41 |         elif isinstance(values, list):
42 |             # And this is where the magic actually happens:
43 |             # the cases where the key is a string, and the target is a list.
44 |             if isinstance(values[0], dict):
45 |                 # If it's a list of dicts, then there's one thing that happens.
46 |                 # Currently all types are assumed to be the same:
47 |                 # you couldn't pass in, say, {"year":[{"$gte":1900}, 1898]} to
48 |                 # catch post-1898 years except for 1899. Not that you
49 |                 # should need to.
50 |                 for entry in values:
51 |                     whereterm.append(where_from_hash(entry))
52 |             else:
53 |                 # Note that about a third of the code is spent on escaping strings.
54 |                 if escapeStrings:
55 |                     if isinstance(values[0], (str, bytes)):
56 |                         quotesep = "'"
57 |                     else:
58 |                         quotesep = ""
59 | 
60 |                     def escape(value):
61 |                         # NOTE: stringifying the escape from MySQL; hopefully doesn't break too much.
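    |                         # MySQLdb.escape_string returns bytes under Python 3, hence the
    |                         # decode back to str below; it does not add the surrounding
    |                         # quotes (quotesep handles those). The original code called an
    |                         # undefined `to_unicode` helper here; plain str() is used instead.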
62 |                         return str(MySQLdb.escape_string(str(value)), 'utf-8')
63 |                 else:
64 |                     def escape(value):
65 |                         return str(value)
66 |                     quotesep = ""
67 | 
68 |                 joined = list_joiner.join([" ({}{}{}{}{}) ".format(key, comp, quotesep, escape(value), quotesep) for value in values])
69 |                 whereterm.append(" ( {} ) ".format(joined))
70 | 
71 |     if len(whereterm) > 1:
72 |         return "(" + joiner.join(whereterm) + ")"
73 |     else:
74 |         return whereterm[0]
75 | # This works pretty well, except that it requires very specific sorts of terms going in, I think.
76 | 
77 | 
78 | class Search_limits(dict):
79 |     def to_sql(self):
80 |         return where_from_hash(self)
81 |     def rkeys(self):
82 |         # Recursively return the SQL keys so we know what fields to work with.
83 |         keys = []
84 |         for k, v in self.items():
85 |             if not k.startswith("$"):
86 |                 keys.append(k)
87 |             elif isinstance(v, dict):
88 |                 for k in Search_limits(v).rkeys():
89 |                     keys.append(k)
90 |         return keys
91 |     def validate(self):
92 |         # Some tests to see if a query is valid
93 |         for k in self.keys():
94 |             pass
95 | 
--------------------------------------------------------------------------------
/bookwormDB/sqliteKV.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2018 Sylvain PULICANI
2 | # Super heavily changed by Ben Schmidt; the old version was a true
3 | # kv store, this one just autoincrements a lookup table.
4 | 
5 | # This should generally be thread safe for reads, but not for writes:
6 | # if multiple processes try to write at once, expect errors.
7 | 
8 | # This work is free. You can redistribute it and/or modify it under the
9 | # terms of the Do What The Fuck You Want To Public License, Version 2,
10 | # as published by Sam Hocevar. See the COPYING file for more details.
11 | 
12 | # sqlite_kv.py
13 | #
14 | # Python implementation of the SQLiteKV store.
15 | 
16 | import sqlite3
17 | 
18 | 
19 | class KV:
20 |     """
21 |     Python implementation of the SQLiteKV store, with additional methods
22 |     to make it more pythonic.
23 |     .. warning::
24 |         * The `close` method has to be called after use.
25 |         * The `delete` method is not yet implemented.
26 |     """
27 |     def __init__(self, dbfile):
28 |         """
29 |         Open a connection to the SQLite file. If it doesn't exist, create it
30 |         and add the needed tables.
31 |         """
32 |         self.conn = None
33 |         self.conn = sqlite3.connect(dbfile, detect_types=sqlite3.PARSE_DECLTYPES)
34 |         self.conn.row_factory = sqlite3.Row
35 | 
36 |         tables = [dict(r)['name'] for r in self.conn.execute(
37 |             "SELECT name FROM sqlite_master WHERE type='table'")]
38 | 
39 |         if 'keys' not in tables:
40 |             self.conn.execute("""CREATE TABLE keys(
41 |                 ID INTEGER PRIMARY KEY ASC,
42 |                 key TEXT UNIQUE NOT NULL)""")
43 | 
44 |             self.conn.execute("CREATE UNIQUE INDEX idx_keys ON keys(key)")
45 | 
46 | 
47 |     def close(self):
48 |         """
49 |         Properly close the database.
50 |         """
51 |         self.conn.commit()
52 |         self.conn.close()
53 | 
54 |     def __getitem__(self, key):
55 |         rows = self.conn.execute("""SELECT ID FROM keys
56 |                                     WHERE keys.key=(?)""", (key, ))
57 |         row = rows.fetchone()
58 |         if row is None:
59 |             raise KeyError(key)
60 |         return row['ID']
61 | 
62 |     def register(self, key):
63 |         self.conn.execute("INSERT INTO keys(key) VALUES (?)",
64 |                           (key, ))
65 | 
--------------------------------------------------------------------------------
/bookwormDB/tokenizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from __future__ import print_function
4 | import random
5 | import sys
6 | import os
7 | from .sqliteKV import KV
8 | import time
9 | import logging
10 | import numpy as np
11 | from pandas import read_csv
12 | from io import StringIO
13 | try: import regex as re  # The \p{Z} class in wordRegex needs the third-party `regex` module.
14 | except ImportError: import re
15 | """
16 | This section does a lot of work on tokenizing and aggregating wordcounts.
17 | """
18 | 
19 | # A module-level flag recording whether we've already warned about a unicode encoding error.
20 | haveWarnedUnicode = False
21 | # And the default regex is generated by a function on demand.
22 | bigregex = None
23 | 
24 | 
25 | def wordRegex():
26 |     """
27 |     I'm including the code to create the regex here, which makes it more readable.
28 |     Note that this operates on *unicode*: among other things, that means that it needs to be passed
29 |     a unicode-decoded string, and that we have to use the "regex" module instead of the "re" module.
30 |     """
31 |     global re
32 |     MasterExpression = r"\w+"
33 |     possessive = MasterExpression + r"'s"
34 |     numbers = r"(?:[\$])?\d+"
35 |     decimals = numbers + r"\.\d+"
36 |     abbreviation = r"(?:mr|ms|mrs|dr|prof|rev|rep|sen|st|sr|jr|ft|gen|adm|lt|col|etc)\."
37 |     sharps = r"[a-gjxA-GJX]#"
38 |     punctuators = r"[^\w\p{Z}]"
39 |     """
40 |     Note: this compiles looking for the most complicated words first, and as it goes on finds simpler and simpler forms.
41 |     """
42 |     bigregex = re.compile("|".join([decimals,possessive,numbers,abbreviation,sharps,punctuators,MasterExpression]),re.UNICODE|re.IGNORECASE)
43 |     return bigregex
44 | 
45 | 
46 | def readDictionaryFile(prefix=""):
47 |     look = dict()
48 |     for line in open(prefix + ".bookworm/texts/wordlist/wordlist.txt"):
49 |         line = line.rstrip("\n")
50 |         v, k, _ = line.split("\t")
51 |         look[k] = v
52 |     return look
53 | 
54 | def readIDfile(prefix=""):
55 |     if not os.path.exists(prefix + ".bookworm/metadata/textids.sqlite"):
56 |         raise FileNotFoundError("No textids DB: run `bookworm build textids`")
57 |     return KV(prefix + ".bookworm/metadata/textids.sqlite")
58 | 
59 | class tokenBatches(object):
60 |     """
61 |     A tokenBatches is a manager for tokenizers. Each one corresponds to
62 |     a reasonable number of texts to read into memory on a single processor:
63 |     during the initial loads, there will probably be one per core.
64 |     It doesn't store the original text, just the unigram and bigram tokenizations in its attached self.counts arrays.
65 | 
66 |     It writes out its data to a single file:
67 |     in this way, a batch of up to several hundred thousand individual files is grouped into a single file.
68 | 
69 |     It also has a method that encodes and writes its wordcounts into a tsv file appropriate for reading with mysql,
70 |     with 3-byte integer encoding for wordid and bookid.
71 |     """
72 | 
73 |     def __init__(self, levels=["unigrams","bigrams"]):
74 |         """
75 |         levels: the ngram levels (e.g. "unigrams", "bigrams") to write out.
76 |         """
77 |         self.id = '%030x' % random.randrange(16**30)
78 |         self.levels=levels
79 | 
80 | 
81 |         # placeholder to alert that createOutputFiles must be run.
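    |         # (encodeRow checks this and lazily calls createOutputFiles and
    |         # attachDictionaryAndID on its first invocation.)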
82 |         self.completedFile = None
83 | 
84 |     def createOutputFiles(self):
85 |         self.completedFile = open(".bookworm/texts/encoded/completed/" + self.id,"w")
86 |         self.outputFiles = dict()
87 |         for level in self.levels:
88 |             self.outputFiles[level] = open(".bookworm/texts/encoded/{}/{}.txt".format(level, self.id),"w")
89 | 
90 |     def attachDictionaryAndID(self):
91 |         self.dictionary = readDictionaryFile()
92 |         self.IDfile = readIDfile()
93 | 
94 | 
95 |     def close(self):
96 |         """
97 |         This test allows the creation of bookworms with fewer documents than requested
98 |         threads, which happens to be the case in the tests.
99 |         """
100 |         if self.completedFile is not None:
101 |             self.completedFile.close()
102 |             for v in self.outputFiles.values():
103 |                 v.close()
104 | 
105 |     def encodeRow(self,
106 |                   filename,
107 |                   tokenizer,
108 |                   write_completed=True
109 |                   ):
110 |         """
111 |         'filename': the document identifier (a key in the textids database)
112 |         'tokenizer': a Tokenizer or PreTokenized object
113 | 
114 |         """
115 |         if self.completedFile is None:
116 |             self.createOutputFiles()
117 |             self.attachDictionaryAndID()
118 | 
119 |         # The dictionary and ID lookup tables should be pre-attached.
120 |         dictionary = self.dictionary
121 |         IDfile = self.IDfile
122 | 
123 |         levels = None
124 |         """
125 |         if source=="raw_text":
126 |             parts = row.split("\t", 1)
127 |             filename = parts[0]
128 |             try:
129 |                 tokens = tokenizer(parts[1])
130 |             except IndexError:
131 |                 logging.warning("\nFound no tab in the input for '" + filename + "'...skipping row\n")
132 |             levels = self.levels
133 | 
134 |         if source == "countfile":
135 |             try:
136 |                 (filename, token, count) = row.split("\t")
137 |             except:
138 |                 logging.error("Can't find tab\n***************")
139 |                 logging.error(row)
140 |                 raise
141 |             tokens = preTokenized(token, count, self.levels[0])
142 |         """
143 | 
144 |         try:
145 |             textid = IDfile[filename]
146 |         except KeyError:
147 |             logging.warning("Warning: file " + filename + " not found in jsoncatalog.txt, not encoding")
148 |             return
149 | 
150 |         for level in self.levels:
151 |             outputFile = self.outputFiles[level]
152 |             output = []
153 | 
154 |             counts = tokenizer.counts(level)
155 | 
156 |             for wordset, count in counts.items():
157 |                 skip = False
158 |                 wordList = []
159 |                 for word in wordset:
160 |                     try:
161 |                         wordList.append(dictionary[word])
162 |                     except KeyError:
163 |                         """
164 |                         if any of the words to be included is not in the dictionary,
165 |                         we don't include the whole n-gram in the counts.
166 |                         """
167 |                         skip = True
168 |                 if not skip:
169 |                     wordids = "\t".join(wordList)
170 |                     output.append("{}\t{}\t{}".format(int(textid), wordids, count))
171 | 
172 |             try:
173 |                 if len(output) > 0:
174 |                     # The test is necessary because otherwise this prints a blank line.
175 |                     outputFile.write("\n".join(output) + "\n")
176 | 
177 |             except IOError as e:
178 |                 logging.exception(e)
179 | 
180 |         if write_completed:
181 |             self.completedFile.write(filename + "\n")
182 | 
183 | class Tokenizer(object):
184 |     """
185 |     A tokenizer is initialized with a single text string.
186 | 
187 |     It assumes that you have in namespace an object called "bigregex" which
188 |     identifies words.
189 | 
190 |     (I'd define it here, but it's a performance optimization to avoid compiling the large regex millions of times.)
191 | 
192 |     The general way to call it is to initialize it, and then for each desired set of counts call "tokenizer.counts("bigrams")" (or whatever).
193 | 
194 |     That returns a dictionary, whose keys are tuples of length 1 for unigrams, 2 for bigrams, etc., and whose values are counts for that ngram. The tuple form should allow faster parsing down the road.
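    | 
    |     For example (an illustrative sketch): Tokenizer("the cat sat").counts("bigrams")
    |     would return {("the", "cat"): 1, ("cat", "sat"): 1}.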
195 | 
196 |     """
197 | 
198 |     def __init__(self, string, tokenization_regex=None):
199 |         global haveWarnedUnicode
200 |         self.string = string
201 |         self.tokenization_regex = tokenization_regex
202 |         self.tokens = None
203 |     def tokenize(self):
204 |         """
205 |         This tries to return the pre-made tokenization:
206 |         if that doesn't exist, it creates it.
207 |         """
208 |         if self.tokens is not None:
209 |             return self.tokens
210 |         """
211 |         For speed, don't import until here.
212 |         """
213 |         tokenization_regex=self.tokenization_regex
214 |         global re
215 |         if re is None:
216 |             import regex as re
217 |         if tokenization_regex is None:
218 |             # by default, use the big regex.
219 |             global bigregex
220 |             if bigregex is None:
221 |                 bigregex = wordRegex()
222 |             tokenization_regex = bigregex
223 |         self.tokens = re.findall(tokenization_regex, self.string)
224 |         return self.tokens
225 | 
226 |     def ngrams(self, n, collapse = False):
227 |         """
228 |         All the ngrams in the text can be created as a tuple by zipping an arbitrary number of
229 |         copies of the text to itself.
230 |         """
231 | 
232 |         self.tokenize()
233 |         l = list(zip(*[self.tokens[i:] for i in range(n)]))
234 |         if collapse:
235 |             l = [" ".join(tupled) for tupled in l]
236 |         return l
237 | 
238 |     def unigrams(self):
239 |         return self.ngrams(1)
240 | 
241 |     def bigrams(self):
242 |         return self.ngrams(2)
243 | 
244 |     def trigrams(self):
245 |         return self.ngrams(3)
246 | 
247 |     def allgrams(self, max = 6):
248 |         output = []
249 |         for i in range(1, max + 1):
250 |             output.extend(self.ngrams(i, collapse = True))
251 |         return output
252 | 
253 |     def words(self):
254 |         """
255 |         1-grams have tuple keys, but words have index keys.
256 |         """
257 |         self.tokenize()
258 |         return self.tokens
259 | 
260 |     def counts(self, whichType):
261 | 
262 |         count = dict()
263 |         for gram in getattr(self,whichType)():
264 |             try:
265 |                 count[gram] += 1
266 |             except KeyError:
267 |                 count[gram] = 1
268 |         return count
269 | 
270 | 
271 | class PreTokenized(object):
272 |     """
273 |     This class is a little goofy: it mimics the behavior of a tokenizer
274 |     on data that's already been tokenized by something like
275 |     Google Ngrams or JStor Data for Research.
276 |     """
277 | 
278 |     def __init__(self, csv_string, level):
279 |         f = read_csv(StringIO(csv_string),
280 |                      lineterminator = "\f",
281 |                      # Ugh--we want 'NA' to be a word.
282 |                      dtype = {'word': str, 'counts': np.int64},
283 |                      keep_default_na=False,
284 |                      names = ["word", "counts"])
285 |         self.level = level
286 |         if level == 'words':
287 |             self.output = dict(zip(f.word, f.counts))
288 |         else:
289 |             self.output = dict(zip([tuple(w.split(" ")) for w in f.word], f.counts))
290 | 
291 |     def counts(self,level):
292 |         if level != self.level:
293 |             raise ValueError("PreTokenized was built for level '{}', not '{}'".format(self.level, level))
294 |         return self.output
295 | 
296 | 
297 | def getAlreadySeenList(folder):
298 |     # Load in a list of what's already been translated for that level.
299 |     # Returns a set.
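    |     # Each file in the completed/ directory lists one already-encoded
    |     # filename per line; union them into a single set for fast
    |     # membership tests.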
300 |     files = os.listdir(folder)
301 |     seen = set([])
302 |     for file in files:
303 |         for line in open(folder + "/" + file):
304 |             seen.add(line.rstrip("\n"))
305 |     return seen
306 | 
307 | def encode_text_stream():
308 |     seen = getAlreadySeenList(".bookworm/texts/encoded/completed")
309 |     tokenBatch = tokenBatches()
310 |     tokenBatch.attachDictionaryAndID()
311 |     for line in sys.stdin:
312 |         line = line.rstrip("\n")
313 |         filename, text = line.split("\t", 1)
314 |         if filename not in seen:
315 |             tokenBatch.encodeRow(filename, Tokenizer(text))  # encodeRow needs (filename, tokenizer)
316 | 
317 | # And printout again at the end
318 | 
319 | if __name__=="__main__":
320 |     encode_text_stream()
321 | 
--------------------------------------------------------------------------------
/bookwormDB/wsgi.py:
--------------------------------------------------------------------------------
1 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
2 | import json
3 | from urllib.parse import unquote
4 | import logging
5 | import multiprocessing
6 | import gunicorn.app.base
7 | from datetime import datetime
8 | 
9 | def content_type(query):
10 |     try:
11 |         format = query['format']
12 |     except (KeyError, TypeError):
13 |         return 'text/plain'
14 | 
15 |     if format == "json":
16 |         return "application/json"
17 | 
18 |     if format == "feather":
19 |         return "application/octet-stream"
20 | 
21 |     if format == "html":
22 |         return "text/html"
23 | 
24 |     return 'text/plain'
25 | 
26 | def application(environ, start_response, logfile = "bookworm_queries.log"):
27 |     # Starting with code from http://wsgi.tutorial.codepoint.net/parsing-the-request-post
28 |     try:
29 |         request_body_size = int(environ.get('CONTENT_LENGTH') or 0)
30 |     except (ValueError, TypeError):
31 |         request_body_size = 0
32 | 
33 |     # When the method is POST the variables will be sent
34 |     # in the HTTP request body, which is passed by the WSGI server
35 |     # in the file-like wsgi.input environment variable.
36 | 
37 |     q = environ.get('QUERY_STRING')
38 |     # environ.get never raises, so no try/except is needed here.
39 |     ip = environ.get('HTTP_X_FORWARDED_FOR')
40 |     # logging.debug("Request from {}".format(ip))
41 |     if ip is None:
42 |         ip = environ.get('REMOTE_ADDR')
43 | 
44 | 
45 |     query = unquote(q)
46 | 
47 |     headers = {
48 |         'Access-Control-Allow-Origin': '*',
49 |         'Access-Control-Allow-Methods': 'GET, POST, PUT, OPTIONS',
50 |         'Access-Control-Allow-Headers':
51 |         'Origin, Accept, Content-Type, X-Requested-With, X-CSRF-Token',
52 |         'charset': 'utf-8'
53 |     }
54 | 
55 | 
56 | 
57 |     logging.debug("Received query {}".format(query))
58 |     start = datetime.now()
59 | 
60 |     # Backward-compatibility: we used to force query to be a named
61 |     # argument. str.strip removes *characters*, not a prefix, so trim explicitly.
62 |     for prefix in ("query=", "queryTerms="):
63 |         if query.startswith(prefix): query = query[len(prefix):]
64 | 
65 |     try:
66 |         query = json.loads(query)
67 |         query['ip'] = ip
68 |     except ValueError:
69 |         # Invalid JSON gets a 400; WSGI status strings need a reason phrase.
70 |         status = '400 Bad Request'
71 |         start_response(status, list(headers.items()))
72 |         return [b'{"status":"error", "message": "You have passed invalid JSON to the Bookworm API"}']
73 | 
74 |     process = SQLAPIcall(query)
75 |     response_body = process.execute()
76 | 
77 |     # It might be binary already.
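    |     # content_type() maps the requested 'format' field to a MIME type;
    |     # 'feather' responses are already raw bytes, so only the other
    |     # formats get encoded to UTF-8 below.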
78 | headers['Content-type'] = content_type(query) 79 | 80 | if headers['Content-type'] != 'application/octet-stream': 81 | response_body = bytes(response_body, 'utf-8') 82 | 83 | headers['Content-Length'] = str(len(response_body)) 84 | status = '200 OK' 85 | start_response(status, list(headers.items())) 86 | 87 | query['time'] = start.timestamp() 88 | query['duration'] = datetime.now().timestamp() - start.timestamp() 89 | # This writing isn't thread-safe; but generally we're not getting more than a couple queries a second. 90 | with open(logfile, 'a') as fout: 91 | json.dump(query, fout) 92 | fout.write("\n") 93 | logging.debug("Writing to log: \n{}\n".format(json.dumps(query))) 94 | return [response_body] 95 | 96 | # Copied from the gunicorn docs. 97 | 98 | 99 | def number_of_workers(): 100 | return (multiprocessing.cpu_count() * 2) + 1 101 | 102 | class StandaloneApplication(gunicorn.app.base.BaseApplication): 103 | """ 104 | Superclassed to allow bookworm to do the running. 105 | """ 106 | def __init__(self, app, options=None): 107 | self.options = options or {} 108 | self.application = app 109 | super(StandaloneApplication, self).__init__() 110 | 111 | def load_config(self): 112 | config = dict([(key, value) for key, value in self.options.items() 113 | if key in self.cfg.settings and value is not None]) 114 | for key, value in config.items(): 115 | self.cfg.set(key.lower(), value) 116 | 117 | def load(self): 118 | return self.application 119 | 120 | def run(port = 10012, workers = number_of_workers()): 121 | if workers==0: 122 | workers = number_of_workers() 123 | 124 | options = { 125 | 'bind': '{}:{}'.format('127.0.0.1', port), 126 | 'workers': workers, 127 | } 128 | 129 | StandaloneApplication(application, options).run() 130 | 131 | -------------------------------------------------------------------------------- /demos/.ipynb_checkpoints/Reading Binary data-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook shows how to read the wordcount data directly from the mysql binary file. This is likely to be the fastest possible way to iterate over the whole thing. It will only work on Bookworms created under certain, undefined processor architectures. Probably anything you're likely to build, though, will work. We're not talking about Mac vs. Linux type differences, but things like default endianness in the processor." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "First we define the source. This is already a sign things are pretty out of hand." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 218, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "source = \"/drobo/mysql/hathipd\"" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 621, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import numpy as np\n", 37 | "import sys\n", 38 | "class BinaryBookworm():\n", 39 | " def __init__(self,source_dir):\n", 40 | " #self.file = open(source + \"/master_bookcounts.MYD\",\"rb\")\n", 41 | " self.memmap = np.memmap(source + \"/master_bookcounts.MYD\",\" 1000000:\n", 160 | " break\n", 161 | "print len(foo)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 562, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "[array([295779, 4986, 6], dtype=int32),\n", 175 | " array([295779, 187140, 1], dtype=int32),\n", 176 | " array([295779, 294054, 2], dtype=int32)]" 177 | ] 178 | }, 179 | "execution_count": 562, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "foo[:3]" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 184, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "ename": "StopIteration", 197 | "evalue": "", 198 | "output_type": "error", 199 | "traceback": [ 200 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 201 | "\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)", 202 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mparse_row\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreadable\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 203 | "\u001b[0;32m\u001b[0m in \u001b[0;36mparse_row\u001b[0;34m(readable)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreadable\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m9\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m\"\\x00\"\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m\"\\xff\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 204 | "\u001b[0;31mStopIteration\u001b[0m: " 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "while True:\n", 210 | " parse_row(readable)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 98, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 
| "read" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 100, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "a = np.ndarray(len(buf), np.dtype('>i1'), buf)\n", 233 | "e = np.zeros(len(buf) / 6, np.dtype('>i4'))\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 104, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [ 243 | { 244 | "ename": "ValueError", 245 | "evalue": "new type not compatible with array.", 246 | "output_type": "error", 247 | "traceback": [ 248 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 249 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 250 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'>i4'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'>i4'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 251 | "\u001b[0;31mValueError\u001b[0m: new type not compatible with array." 
252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "for i in range(3):\n", 257 | " e.view(dtype='>i4')[i + 1::4] = \\\n", 258 | " a.view(dtype='>i4')[i::3]\n" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 113, 264 | "metadata": { 265 | "collapsed": false 266 | }, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "'\\x01\\x00\\x00'" 272 | ] 273 | }, 274 | "execution_count": 113, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "foo[1:4]" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 112, 286 | "metadata": { 287 | "collapsed": false 288 | }, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | "" 294 | ] 295 | }, 296 | "execution_count": 112, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "a.data" 303 | ] 304 | } 305 | ], 306 | "metadata": { 307 | "kernelspec": { 308 | "display_name": "Python 2", 309 | "language": "python", 310 | "name": "python2" 311 | }, 312 | "language_info": { 313 | "codemirror_mode": { 314 | "name": "ipython", 315 | "version": 2 316 | }, 317 | "file_extension": ".py", 318 | "mimetype": "text/x-python", 319 | "name": "python", 320 | "nbconvert_exporter": "python", 321 | "pygments_lexer": "ipython2", 322 | "version": "2.7.12" 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 0 327 | } 328 | -------------------------------------------------------------------------------- /demos/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "% load_ext autoreload\n", 10 | "% autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 6, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from bookwormDB.mariaDB import Query" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 7, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "ename": "BookwormException", 29 | "evalue": "{'code': 400, 'message': 'You must specify a value for database'}", 30 | "output_type": "error", 31 | "traceback": [ 32 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 33 | "\u001b[0;31mBookwormException\u001b[0m Traceback (most recent call last)", 34 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m{\u001b[0m\u001b[0;34m\"plottype\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"pointchart\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"smoothingSpan\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"host\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"http://localhost:10012/\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"words_collation\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"Case_Sensitive\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"database\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"RMP\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"aesthetic\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"y\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"department\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"x\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"WordsPerMillion\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"color\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"gender\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"search_limits\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"word\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"brilliant\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"vega\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"title\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"The most STEM-happy senators\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"transform\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"filter\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"datum.WordsPerMillion > 130\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"groups\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"department\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"gender\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"counttype\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"WordsPerMillion\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"method\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"data\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"format\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"json_c\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mQuery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 35 | "\u001b[0;32m~/bookwormDB/bookwormDB/mariaDB.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, query_object, db, databaseScheme)\u001b[0m\n\u001b[1;32m 122\u001b[0m \u001b[0;31m# Certain constructions require a DB connection already available, so we just start it here, or use the one passed to it.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 124\u001b[0;31m \u001b[0mcheck_query\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery_object\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 125\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprefs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m'database'\u001b[0m\u001b[0;34m:\u001b[0m 
\u001b[0mquery_object\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'database'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 36 | "\u001b[0;32m~/bookwormDB/bookwormDB/mariaDB.py\u001b[0m in \u001b[0;36mcheck_query\u001b[0;34m(query)\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'database'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mquery\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 104\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mBookwormException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"code\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;36m400\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"message\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"You must specify a value for {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 105\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 37 | "\u001b[0;31mBookwormException\u001b[0m: {'code': 400, 'message': 'You must specify a value for database'}" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "test = {\n", 43 | " \"host\":\"http://localhost:10012/\",\"words_collation\":\"Case_Sensitive\",\n", 44 | " \"database\":\"RMP\",\"aesthetic\":{\"y\":\"department\",\"x\":\"WordsPerMillion\",\"color\":\"gender\"},\"search_limits\":{\"word\":[\"brilliant\"]},\"vega\":{\"title\":\"The most STEM-happy senators\",\"transform\":[{\"filter\":\"datum.WordsPerMillion > 130\"}]},\"groups\":[\"department\",\"gender\"],\"counttype\":[\"WordsPerMillion\"],\"method\":\"data\",\"format\":\"json_c\"}\n", 45 | "Query(test)" 46 | ] 47 | } 48 | ], 49 | "metadata": { 50 | "kernelspec": { 51 | "display_name": "Python 3", 52 | "language": "python", 53 | "name": "python3" 54 | }, 55 | "language_info": { 56 | "codemirror_mode": { 57 | "name": "ipython", 58 | "version": 3 59 | }, 60 | "file_extension": ".py", 61 | "mimetype": "text/x-python", 62 | "name": "python", 63 | "nbconvert_exporter": "python", 64 | "pygments_lexer": "ipython3", 65 | "version": "3.7.1" 66 | } 67 | }, 68 | "nbformat": 4, 69 | "nbformat_minor": 2 70 | } 71 | -------------------------------------------------------------------------------- /demos/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "% load_ext autoreload\n", 10 | "% autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 6, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from bookwormDB.mariaDB import Query" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 18, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stderr", 29 | "output_type": "stream", 30 | "text": [ 31 | "WARNING:root:'TRUE'\n" 32 | ] 33 | }, 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "\n", 39 | " SELECT sum(nwords) as WordCount, department, 
gender\n", 40 | " FROM departmentLookup_ NATURAL JOIN fastcat_ NATURAL JOIN genderLookup_ NATURAL JOIN ID_genderheap_\n", 41 | " WHERE\n", 42 | " TRUE \n", 43 | " AND \n", 44 | " TRUE \n", 45 | " AND TRUE \n", 46 | " GROUP BY department__id, gender__id\n", 47 | " \n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "test = {\n", 53 | " \"host\":\"http://localhost:10012/\",\"words_collation\":\"Case_Sensitive\",\n", 54 | " \"database\":\"RMP\", \"search_limits\":{},\n", 55 | " \"groups\":[\"department\",\"gender\"],\"counttype\":[\"WordCount\"],\"method\":\"data\",\"format\":\"json_c\"}\n", 56 | "print(Query(test).query())" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 14, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "ename": "NameError", 66 | "evalue": "name 'SqlFilter' is not defined", 67 | "output_type": "error", 68 | "traceback": [ 69 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 70 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 71 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mlexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlexers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMySqlLexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mlexer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_filter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSqlFilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhighlight\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformatters\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTerminalFormatter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 72 | "\u001b[0;31mNameError\u001b[0m: name 'SqlFilter' is not defined" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "!cd /drobo/" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 15, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "from bookwormDB import CreateDatabase\n", 87 | "z = CreateDatabase.BookwormSQLDatabase(\"RMP\")" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 16, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "ename": "FileNotFoundError", 97 | "evalue": "[Errno 2] No such file or directory: '.bookworm/metadata/field_descriptions_derived.json'", 98 | "output_type": "error", 99 | "traceback": [ 100 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 101 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 102 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mCreateDatabase\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mBookwormSQLDatabase\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"RMP\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 103 | "\u001b[0;32m~/bookwormDB/bookwormDB/CreateDatabase.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, dbname, variableFile)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
141\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mvariableFile\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 142\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetVariables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moriginFile\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvariableFile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mgrantPrivileges\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 104 | "\u001b[0;32m~/bookwormDB/bookwormDB/CreateDatabase.py\u001b[0m in \u001b[0;36msetVariables\u001b[0;34m(self, originFile, anchorField, jsonDefinition)\u001b[0m\n\u001b[1;32m 160\u001b[0m def setVariables(self, originFile, anchorField=\"bookid\",\n\u001b[1;32m 161\u001b[0m jsonDefinition=\".bookworm/metadata/field_descriptions_derived.json\"):\n\u001b[0;32m--> 162\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvariableSet\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvariableSet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moriginFile\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moriginFile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0manchorField\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0manchorField\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjsonDefinition\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjsonDefinition\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdb\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 163\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mimportNewFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0moriginFile\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0manchorField\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mjsonDefinition\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 105 | "\u001b[0;32m~/bookwormDB/bookwormDB/variableSet.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, originFile, anchorField, jsonDefinition, db)\u001b[0m\n\u001b[1;32m 500\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjsonDefinition\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mguessAtFieldDescriptions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 501\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 502\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjsonDefinition\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfin\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 503\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjsonDefinition\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfin\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 106 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '.bookworm/metadata/field_descriptions_derived.json'" 107 | ] 108 | } 109 | ], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 3", 116 | "language": "python", 117 | "name": "python3" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 3 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython3", 129 | "version": "3.7.1" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 2 134 | } 135 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # This flag says that the code is written to work on both Python 2 and Python 3 | # 3. If at all possible, it is good practice to do this. If you cannot, you 4 | # will need to generate wheels for each Python version that you support. 5 | universal=0 6 | 7 | [flake8] 8 | ignore= E231, E501 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | setup( 6 | name='bookwormDB', 7 | packages=["bookwormDB"], 8 | version='1.0', 9 | entry_points={ 10 | 'console_scripts': [ 11 | 'bookworm = bookwormDB.manager:run_arguments' 12 | ], 13 | }, 14 | description="Create, deploy, and serve a Bookworm instance.", 15 | long_description="\n".join(open("README.rst").readlines()), 16 | package_data={'bookwormDB':['etc/*','bin/*']}, 17 | url="http://github.com/Bookworm-Project", 18 | author="Benjamin Schmidt", 19 | author_email="bmschmidt@gmail.com", 20 | license="MIT", 21 | classifiers=[ 22 | 'Development Status :: 4 - Beta', 23 | 'Intended Audience :: Developers', 24 | 'Intended Audience :: Education', 25 | "Natural Language :: English", 26 | # Pick your license as you wish (should match "license" above) 27 | 'License :: OSI Approved :: MIT License', 28 | "Operating System :: Unix", 29 | # Specify the Python versions you support here. In particular, ensure 30 | # that you indicate whether you support Python 2, Python 3 or both. 
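        # (In practice this package targets Python 3 only: setup.cfg sets
        # universal=0 and CI runs the tests under Python 3.6.)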
31 |         'Programming Language :: Python :: 3.6',
32 |         'Programming Language :: Python :: 3.7',
33 |         "Topic :: Sociology :: History",
34 |         "Topic :: Text Processing :: Indexing",
35 |         "Topic :: Text Processing :: Linguistic"
36 |     ],
37 |     install_requires=["numpy","pandas","mysqlclient",
38 |                       "python-dateutil", "psutil", "bounter",
39 |                       "gunicorn"
40 |                       ]
41 | )
42 | 
--------------------------------------------------------------------------------
/tests/setup.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import bookwormDB
3 | import bookwormDB.CreateDatabase
4 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
5 | import logging
6 | import os
7 | from subprocess import call as call
8 | import sys
9 | import json
10 | from shutil import rmtree
11 | 
12 | def setup_bookworm():
13 |     """
14 |     Creates a test bookworm. Removes any existing database called "federalist_bookworm".
15 |     """
16 |     logging.info("\n\nTESTING BOOKWORM CREATION\n\n")
17 |     import MySQLdb
18 |     from warnings import filterwarnings
19 |     filterwarnings('ignore', category = MySQLdb.Warning)
20 | 
21 |     import bookwormDB.configuration
22 |     os.chdir(sys.path[0] + "/test_bookworm_files")
23 |     rmtree(".bookworm", ignore_errors = True)
24 | 
25 |     bookwormDB.configuration.create(ask_about_defaults=False, database="federalist_bookworm")
26 | 
27 |     db = bookwormDB.CreateDatabase.DB(dbname="mysql")
28 | 
29 |     try:
30 |         db.query("DROP DATABASE IF EXISTS federalist_bookworm")
31 |     except MySQLdb.OperationalError as e:
32 |         if e.args[0]==1008:  # 1008: the database doesn't exist.
33 |             pass
34 |         else:
35 |             print(e)
36 |             raise
37 |     except Exception as e:
38 |         """
39 |         This is some weird MariaDB exception. It sucks that I'm compensating for it here.
40 |         """
41 |         if e.args[0]=="Cannot load from mysql.proc. The table is probably corrupted":
42 |             pass
43 |         else:
44 |             print(e)
45 |             logging.warning("Some mysterious error in attempting to drop previous iterations: just try running it again?")
46 | 
47 |     call(["bookworm --log-level warning build all"],shell=True,cwd=sys.path[0] + "/test_bookworm_files")
48 | 
49 | 
50 | def setup_bookworm_unicode():
51 |     """
52 |     Creates a test bookworm. Removes any existing database called "unicode_test_bookworm".
53 |     """
54 |     logging.info("\n\nTESTING BOOKWORM CREATION\n\n")
55 |     import MySQLdb
56 |     from warnings import filterwarnings
57 |     filterwarnings('ignore', category = MySQLdb.Warning)
58 | 
59 |     import bookwormDB.configuration
60 |     os.chdir(sys.path[0] + "/test_bookworm_files_unicode")
61 |     rmtree(".bookworm", ignore_errors = True)
62 | 
63 |     bookwormDB.configuration.create(ask_about_defaults=False,database="unicode_test_bookworm")
64 | 
65 |     db = bookwormDB.CreateDatabase.DB(dbname="mysql")
66 | 
67 |     try:
68 |         db.query("DROP DATABASE IF EXISTS unicode_test_bookworm")
69 |     except MySQLdb.OperationalError as e:
70 |         if e.args[0]==1008:  # 1008: the database doesn't exist.
71 |             pass
72 |         else:
73 |             print(e)
74 |             raise
75 |     except Exception as e:
76 |         """
77 |         This is some weird MariaDB exception. It sucks that I'm compensating for it here.
78 |         """
79 |         if e.args[0]=="Cannot load from mysql.proc.
The table is probably corrupted":
80 |             pass
81 |         else:
82 |             logging.warning("Some mysterious error in attempting to drop previous iterations: just try running it again?")
83 | 
84 |     call(["bookworm --log-level warning build all"],
85 |          shell=True,
86 |          cwd=sys.path[0] + "/test_bookworm_files_unicode")
87 | 
88 | 
89 | if __name__=="__main__":
90 |     setup_bookworm()
91 |     setup_bookworm_unicode()
92 | 
--------------------------------------------------------------------------------
/tests/test_API.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from builtins import range
4 | from builtins import object
5 | import unittest
6 | import bookwormDB
7 | import bookwormDB.CreateDatabase
8 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
9 | import logging
10 | import os
11 | from subprocess import call as call
12 | import sys
13 | import json
14 | from setup import setup_bookworm, setup_bookworm_unicode
15 | 
16 | class Bookworm_SQL_Creation(unittest.TestCase):
17 | 
18 |     def test_bookworm_files_exist(self):
19 |         bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase("federalist_bookworm")
20 |         db = bookworm.db
21 |         db.query("USE federalist_bookworm")
22 |         wordCount = db.query("SELECT SUM(nwords) FROM fastcat_").fetchall()[0][0]
23 |         # This should be 212,081, but I don't want the tests to start failing when
24 |         # we change the tokenization rules or miscellaneous things about encoding.
25 |         self.assertTrue(wordCount>100000)
26 |         """
27 |         Then we test whether the API can make queries on that bookworm.
28 |         """
29 | 
30 |     def test_API(self):
31 |         from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
32 |         import json
33 | 
34 |         query = {
35 |                 "database":"federalist_bookworm",
36 |                 "search_limits":{},
37 |                 "counttype":"TextPercent",
38 |                 "groups":["author"],
39 |                 "method":"data", "format":"json"
40 |         }
41 | 
42 |         m = json.loads(SQLAPIcall(query).execute())['data']
43 |         self.assertEqual(len(m),5)
44 | 
45 | 
46 |     def test_multiword_search(self):
47 |         from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
48 |         import json
49 | 
50 |         query = {
51 |                 "database":"federalist_bookworm",
52 |                 "search_limits":{"word":["on","upon"]},
53 |                 "counttype":"TextPercent",
54 |                 "method":"data", "format":"json",
55 |                 "groups": []
56 |         }
57 | 
58 |         m = json.loads(SQLAPIcall(query).execute())['data']
59 |         self.assertTrue(m[0] > 33)
60 | 
61 |     def test_ne_with_one_entry(self):
62 |         from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
63 |         import json
64 | 
65 |         query = {
66 |                 "database":"federalist_bookworm",
67 |                 "search_limits":{
68 |                     "author": {"$ne": ["HAMILTON"]}
69 |                 },
70 |                 "counttype":"TextPercent",
71 |                 "groups":["author"],
72 |                 "method":"data", "format":"json"
73 |         }
74 | 
75 |         m = json.loads(SQLAPIcall(query).execute())['data']
76 |         self.assertTrue(len(m)==4)
77 | 
78 |     def test_ne_with_two_entries(self):
79 |         from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
80 |         import json
81 | 
82 |         query = {
83 |                 "database":"federalist_bookworm",
84 |                 "search_limits":{
85 |                     "author": {"$ne": ["HAMILTON","DISPUTED"]}
86 |                 },
87 |                 "counttype":"TextPercent",
88 |                 "groups":["author"],
89 |                 "method":"data", "format":"json"
90 |         }
91 | 
92 |         m = json.loads(SQLAPIcall(query).execute())['data']
93 |         self.assertTrue(len(m)==3)
94 | 
95 | 
"search_limits":{ 103 | "author": {"$ne": ["HAMILTON","DISPUTED"]} 104 | }, 105 | "counttype":"TextPercent", 106 | "groups":["author"], 107 | "method":"data", "format":"json" 108 | } 109 | 110 | m = json.loads(SQLAPIcall(query).execute())['data'] 111 | self.assertTrue(len(m)==3) 112 | 113 | 114 | def test_or_with_two_entries(self): 115 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall 116 | import json 117 | 118 | query = { 119 | "database":"federalist_bookworm", 120 | "search_limits":{ 121 | "$or": [ 122 | {"author": ["HAMILTON"]}, 123 | {"author": ["DISPUTED"]} 124 | ] 125 | }, 126 | "counttype":"TextCount", 127 | "groups":["author"], 128 | "method":"data", "format":"json" 129 | } 130 | 131 | m = json.loads(SQLAPIcall(query).execute())['data'] 132 | self.assertEqual(len(m),2) 133 | 134 | def test_lte_and_gte(self): 135 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall 136 | import json 137 | 138 | query = { 139 | "database":"federalist_bookworm", 140 | "search_limits":{ 141 | "fedNumber":{"$lte":10,"$gte":5} 142 | }, 143 | "counttype":"TextCount", 144 | "groups":["fedNumber"], 145 | "method":"data", "format":"json" 146 | } 147 | 148 | m = json.loads(SQLAPIcall(query).execute())['data'] 149 | self.assertTrue(len(m)==6) 150 | 151 | def test_and_with_two_entries(self): 152 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall 153 | import json 154 | 155 | query = { 156 | "database":"federalist_bookworm", 157 | "search_limits":{ 158 | "$and": [ 159 | {"author": ["HAMILTON"]}, 160 | {"fedNumber":[40]} 161 | ] 162 | }, 163 | "counttype":"TextCount", 164 | "groups":["author"], 165 | "method":"data", "format":"json" 166 | } 167 | 168 | m = json.loads(SQLAPIcall(query).execute())['data'] 169 | self.assertTrue(len(m)==0) 170 | 171 | def test_adding_metadata_to_bookworm(self): 172 | """ 173 | Build out some dummy metadata: label the difference 174 | between even and odd paragrahs. 175 | """ 176 | 177 | from bookwormDB.manager import BookwormManager 178 | manager = BookwormManager(database="federalist_bookworm") 179 | 180 | # Create a phony derived field to test metadata supplementing 181 | 182 | 183 | def even_even(number): 184 | if number % 2 == 0: 185 | return "even" 186 | return "odd" 187 | 188 | tmp_file = "{}/test_bookworm_metadata.tsv".format(sys.path[0]) 189 | 190 | with open(tmp_file,"w") as newMetadata: 191 | newMetadata.write("paragraphNumber\toddness\n") 192 | for n in range(500): 193 | newMetadata.write("%d\t%s\n" %(n,even_even(n))) 194 | 195 | class Dummy(object): 196 | """ 197 | Just quickly create a namespace to stand in for the command-line args. 198 | """ 199 | key = "paragraphNumber" 200 | format = "tsv" 201 | file = tmp_file 202 | # Test the guessing at field_descriptions while we're at it 203 | field_descriptions = None 204 | 205 | import os 206 | manager.add_metadata(Dummy) 207 | 208 | """ 209 | And then we test if that can be retrieved 210 | """ 211 | 212 | query = { 213 | "database":"federalist_bookworm", 214 | "search_limits":{}, 215 | "counttype":"TextCount", 216 | "groups":["oddness"], 217 | "method":"data", "format":"json" 218 | } 219 | 220 | SQLAPIcall(query) 221 | m = json.loads(SQLAPIcall(query).execute())['data'] 222 | # Even or odd is one of two things. 223 | self.assertTrue(len(m)==2) 224 | 225 | # Since the first paragraph is odd, 226 | # there should be more of those. 
212 | def test_case_sensitivity(self): 213 | query = { 214 | "database":"federalist_bookworm", 215 | "search_limits":{"word":["the"]}, 216 | "counttype":"WordCount", 217 | "groups":[], 218 | "words_collation":"Case_Sensitive", 219 | "method":"data", "format":"json" 220 | } 221 | 222 | SQLAPIcall(query) 223 | val1 = json.loads(SQLAPIcall(query).execute())['data'] 224 | self.assertTrue(val1[0] > 0) 225 | 226 | query["words_collation"] = "Case_Insensitive" 227 | 228 | SQLAPIcall(query) 229 | val2 = json.loads(SQLAPIcall(query).execute())['data'] 230 | # The words ('The','the') together appear more often than ('the') alone. 231 | self.assertTrue(val2[0] > val1[0]) 232 | 233 | 234 | def test_case_insensitive_search_with_mixed_case_term(self): 235 | query = { 236 | "database":"federalist_bookworm", 237 | "search_limits":{"word":["hOwEvEr"]}, 238 | "counttype":"WordCount", 239 | "groups":[], 240 | "words_collation":"Case_Insensitive", 241 | "method":"data", "format":"json" 242 | } 243 | SQLAPIcall(query) 244 | val1 = json.loads(SQLAPIcall(query).execute())['data'] 245 | self.assertTrue(val1[0] > 0) 246 | 247 | def test_unicode_search_term(self): 248 | query = { 249 | "database":"unicode_test_bookworm", 250 | "search_limits":{"word":[u"ᎾᏍᎩ"]}, 251 | "counttype":"WordCount", 252 | "groups":[], 253 | "words_collation":"Case_Insensitive", 254 | "method":"data", "format":"json" 255 | } 256 | SQLAPIcall(query) 257 | val1 = json.loads(SQLAPIcall(query).execute())['data'] 258 | self.assertTrue(val1[0] > 0) 259 | 260 | def test_various_unicode_cases(self): 261 | # There's a 'description_' for each individual item. 262 | catalog_location = sys.path[0] + "/test_bookworm_files_unicode/jsoncatalog.txt" 263 | cases = [json.loads(line)["description_"] for line in open(catalog_location)] 264 | for case in cases: 265 | query = { 266 | "database":"unicode_test_bookworm", 267 | "search_limits":{"description_":case}, 268 | "counttype":"WordCount", 269 | "groups":[], 270 | "words_collation":"Case_Insensitive", 271 | "method":"data", "format":"json" 272 | } 273 | SQLAPIcall(query) 274 | val1 = json.loads(SQLAPIcall(query).execute())['data'] 275 | self.assertTrue(val1[0] > 0) 276 | 277 | def test_asterisks_in_search_limits(self): 278 | """ 279 | The following two queries should, by definition, produce the same result. 280 | """ 281 | query = { 282 | "database":"federalist_bookworm", 283 | "search_limits":{"word":["on"],"author":["HAMILTON"]}, 284 | "compare_limits":{"word":["on"]}, 285 | "counttype":"WordsPerMillion", 286 | "groups":[], 287 | "method":"data", "format":"json" 288 | } 289 | val1 = json.loads(SQLAPIcall(query).execute())['data'] 290 | 291 | query = { 292 | "database":"federalist_bookworm", 293 | "search_limits":{"word":["on"],"*author":["HAMILTON"]}, 294 | "counttype":"WordsPerMillion", 295 | "groups":[], 296 | "method":"data", "format":"json" 297 | } 298 | val2 = json.loads(SQLAPIcall(query).execute())['data'] 299 | self.assertEqual(val1[0], val2[0]) 300 | 301 |
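# Taken together, the tests above pin down the operator vocabulary accepted
# inside search_limits: bare lists for membership, {"$ne": [...]} for
# exclusion, {"$lte": n, "$gte": n} for numeric ranges, top-level "$and"/"$or"
# clauses for combining per-field limits, and a "*" field prefix that the
# asterisk test equates with supplying an explicit compare_limits. A
# hypothetical query combining several operators (illustrative only; nothing
# in this file asserts on it):
#
#     {
#         "database": "federalist_bookworm",
#         "search_limits": {
#             "author": {"$ne": ["DISPUTED"]},
#             "fedNumber": {"$gte": 1, "$lte": 30}
#         },
#         "counttype": "TextCount",
#         "groups": ["author"],
#         "method": "data", "format": "json"
#     }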
302 | """ 303 | class SQLConnections(unittest.TestCase): 304 | 305 | 306 | 307 | def test_dunning(self): 308 | query = { 309 | "database":"federalist", 310 | "search_limits":{"author":"Hamilton"}, 311 | "compare_limits":{"author":"Madison"}, 312 | "counttype":"Dunning", 313 | "groups":["unigram"], 314 | "method":"data", "format":"json" 315 | } 316 | 317 | 318 | try: 319 | #dbbindings.main(query) 320 | worked = True 321 | except: 322 | worked = False 323 | 324 | self.assertTrue(worked) 325 | """ 326 | 327 | 328 | if __name__=="__main__": 329 | # Run the setup quietly first; if anything fails, turn on debug 330 | # logging and try it once more. 331 | logging.basicConfig(level=logging.ERROR) 332 | try: 333 | setup_bookworm() 334 | setup_bookworm_unicode() 335 | except Exception: 336 | # basicConfig is a no-op once the root logger is configured, 337 | # so raise the verbosity on the root logger directly. 338 | logging.getLogger().setLevel(logging.DEBUG) 339 | setup_bookworm() 340 | setup_bookworm_unicode() 341 | logging.getLogger().setLevel(logging.DEBUG) 342 | unittest.main() 343 | -------------------------------------------------------------------------------- /tests/test_bookworm_files/field_descriptions.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"datatype": "searchstring", "field": "searchstring", "unique": true, "type": "text"}, 3 | {"datatype": "categorical", "field": "title", "unique": true, "type": "text"}, 4 | {"datatype": "categorical", "field": "author", "unique": true, "type": "text"}, 5 | {"datatype": "categorical", "field": "fedNumber", "unique": true, "type": "text"}, 6 | {"datatype": "categorical", "field": "paragraphNumber", "unique": true, "type": "text"}, 7 | {"datatype": "time", "field": "date", "unique": true, "type": "text", "derived":[{"resolution":"year"},{"resolution":"month"},{"resolution":"day"},{"resolution":"week","aggregate":"year"}]} 8 | ] 9 | -------------------------------------------------------------------------------- /tests/test_bookworm_files/test_bookworm_metadata.tsv: -------------------------------------------------------------------------------- 1 | paragraphNumber oddness 2 | 0 even 3 | 1 odd 4 | 2 even 5 | 3 odd 6 | 4 even 7 | 5 odd 8 | 6 even 9 | 7 odd 10 | 8 even 11 | 9 odd 12 | 10 even 13 | 11 odd 14 | 12 even 15 | 13 odd 16 | 14 even 17 | 15 odd 18 | 16 even 19 | 17 odd 20 | 18 even 21 | 19 odd 22 | 20 even 23 | 21 odd 24 | 22 even 25 | 23 odd 26 | 24 even 27 | 25 odd 28 | 26 even 29 | 27 odd 30 | 28 even 31 | 29 odd 32 | 30 even 33 | 31 odd 34 | 32 even 35 | 33 odd 36 | 34 even 37 | 35 odd 38 | 36 even 39 | 37 odd 40 | 38 even 41 | 39 odd 42 | 40 even 43 | 41 odd 44 | 42 even 45 | 43 odd 46 | 44 even 47 | 45 odd 48 | 46 even 49 | 47 odd 50 | 48 even 51 | 49 odd 52 | 50 even 53 | 51 odd 54 | 52 even 55 | 53 odd 56 | 54 even 57 | 55 odd 58 | 56 even 59 | 57 odd 60 | 58 even 61 | 59 odd 62 | 60 even 63 | 61 odd 64 | 62 even 65 | 63 odd 66 | 64 even 67 | 65 odd 68 | 66 even 69 | 67 odd 70 | 68 even 71 | 69 odd 72 | 70
even 73 | 71 odd 74 | 72 even 75 | 73 odd 76 | 74 even 77 | 75 odd 78 | 76 even 79 | 77 odd 80 | 78 even 81 | 79 odd 82 | 80 even 83 | 81 odd 84 | 82 even 85 | 83 odd 86 | 84 even 87 | 85 odd 88 | 86 even 89 | 87 odd 90 | 88 even 91 | 89 odd 92 | 90 even 93 | 91 odd 94 | 92 even 95 | 93 odd 96 | 94 even 97 | 95 odd 98 | 96 even 99 | 97 odd 100 | 98 even 101 | 99 odd 102 | 100 even 103 | 101 odd 104 | 102 even 105 | 103 odd 106 | 104 even 107 | 105 odd 108 | 106 even 109 | 107 odd 110 | 108 even 111 | 109 odd 112 | 110 even 113 | 111 odd 114 | 112 even 115 | 113 odd 116 | 114 even 117 | 115 odd 118 | 116 even 119 | 117 odd 120 | 118 even 121 | 119 odd 122 | 120 even 123 | 121 odd 124 | 122 even 125 | 123 odd 126 | 124 even 127 | 125 odd 128 | 126 even 129 | 127 odd 130 | 128 even 131 | 129 odd 132 | 130 even 133 | 131 odd 134 | 132 even 135 | 133 odd 136 | 134 even 137 | 135 odd 138 | 136 even 139 | 137 odd 140 | 138 even 141 | 139 odd 142 | 140 even 143 | 141 odd 144 | 142 even 145 | 143 odd 146 | 144 even 147 | 145 odd 148 | 146 even 149 | 147 odd 150 | 148 even 151 | 149 odd 152 | 150 even 153 | 151 odd 154 | 152 even 155 | 153 odd 156 | 154 even 157 | 155 odd 158 | 156 even 159 | 157 odd 160 | 158 even 161 | 159 odd 162 | 160 even 163 | 161 odd 164 | 162 even 165 | 163 odd 166 | 164 even 167 | 165 odd 168 | 166 even 169 | 167 odd 170 | 168 even 171 | 169 odd 172 | 170 even 173 | 171 odd 174 | 172 even 175 | 173 odd 176 | 174 even 177 | 175 odd 178 | 176 even 179 | 177 odd 180 | 178 even 181 | 179 odd 182 | 180 even 183 | 181 odd 184 | 182 even 185 | 183 odd 186 | 184 even 187 | 185 odd 188 | 186 even 189 | 187 odd 190 | 188 even 191 | 189 odd 192 | 190 even 193 | 191 odd 194 | 192 even 195 | 193 odd 196 | 194 even 197 | 195 odd 198 | 196 even 199 | 197 odd 200 | 198 even 201 | 199 odd 202 | 200 even 203 | 201 odd 204 | 202 even 205 | 203 odd 206 | 204 even 207 | 205 odd 208 | 206 even 209 | 207 odd 210 | 208 even 211 | 209 odd 212 | 210 even 213 | 211 odd 214 | 212 even 215 | 213 odd 216 | 214 even 217 | 215 odd 218 | 216 even 219 | 217 odd 220 | 218 even 221 | 219 odd 222 | 220 even 223 | 221 odd 224 | 222 even 225 | 223 odd 226 | 224 even 227 | 225 odd 228 | 226 even 229 | 227 odd 230 | 228 even 231 | 229 odd 232 | 230 even 233 | 231 odd 234 | 232 even 235 | 233 odd 236 | 234 even 237 | 235 odd 238 | 236 even 239 | 237 odd 240 | 238 even 241 | 239 odd 242 | 240 even 243 | 241 odd 244 | 242 even 245 | 243 odd 246 | 244 even 247 | 245 odd 248 | 246 even 249 | 247 odd 250 | 248 even 251 | 249 odd 252 | 250 even 253 | 251 odd 254 | 252 even 255 | 253 odd 256 | 254 even 257 | 255 odd 258 | 256 even 259 | 257 odd 260 | 258 even 261 | 259 odd 262 | 260 even 263 | 261 odd 264 | 262 even 265 | 263 odd 266 | 264 even 267 | 265 odd 268 | 266 even 269 | 267 odd 270 | 268 even 271 | 269 odd 272 | 270 even 273 | 271 odd 274 | 272 even 275 | 273 odd 276 | 274 even 277 | 275 odd 278 | 276 even 279 | 277 odd 280 | 278 even 281 | 279 odd 282 | 280 even 283 | 281 odd 284 | 282 even 285 | 283 odd 286 | 284 even 287 | 285 odd 288 | 286 even 289 | 287 odd 290 | 288 even 291 | 289 odd 292 | 290 even 293 | 291 odd 294 | 292 even 295 | 293 odd 296 | 294 even 297 | 295 odd 298 | 296 even 299 | 297 odd 300 | 298 even 301 | 299 odd 302 | 300 even 303 | 301 odd 304 | 302 even 305 | 303 odd 306 | 304 even 307 | 305 odd 308 | 306 even 309 | 307 odd 310 | 308 even 311 | 309 odd 312 | 310 even 313 | 311 odd 314 | 312 even 315 | 313 odd 316 | 314 even 317 | 315 odd 318 | 316 even 319 | 317 odd 320 | 318 even 321 | 319 
odd 322 | 320 even 323 | 321 odd 324 | 322 even 325 | 323 odd 326 | 324 even 327 | 325 odd 328 | 326 even 329 | 327 odd 330 | 328 even 331 | 329 odd 332 | 330 even 333 | 331 odd 334 | 332 even 335 | 333 odd 336 | 334 even 337 | 335 odd 338 | 336 even 339 | 337 odd 340 | 338 even 341 | 339 odd 342 | 340 even 343 | 341 odd 344 | 342 even 345 | 343 odd 346 | 344 even 347 | 345 odd 348 | 346 even 349 | 347 odd 350 | 348 even 351 | 349 odd 352 | 350 even 353 | 351 odd 354 | 352 even 355 | 353 odd 356 | 354 even 357 | 355 odd 358 | 356 even 359 | 357 odd 360 | 358 even 361 | 359 odd 362 | 360 even 363 | 361 odd 364 | 362 even 365 | 363 odd 366 | 364 even 367 | 365 odd 368 | 366 even 369 | 367 odd 370 | 368 even 371 | 369 odd 372 | 370 even 373 | 371 odd 374 | 372 even 375 | 373 odd 376 | 374 even 377 | 375 odd 378 | 376 even 379 | 377 odd 380 | 378 even 381 | 379 odd 382 | 380 even 383 | 381 odd 384 | 382 even 385 | 383 odd 386 | 384 even 387 | 385 odd 388 | 386 even 389 | 387 odd 390 | 388 even 391 | 389 odd 392 | 390 even 393 | 391 odd 394 | 392 even 395 | 393 odd 396 | 394 even 397 | 395 odd 398 | 396 even 399 | 397 odd 400 | 398 even 401 | 399 odd 402 | 400 even 403 | 401 odd 404 | 402 even 405 | 403 odd 406 | 404 even 407 | 405 odd 408 | 406 even 409 | 407 odd 410 | 408 even 411 | 409 odd 412 | 410 even 413 | 411 odd 414 | 412 even 415 | 413 odd 416 | 414 even 417 | 415 odd 418 | 416 even 419 | 417 odd 420 | 418 even 421 | 419 odd 422 | 420 even 423 | 421 odd 424 | 422 even 425 | 423 odd 426 | 424 even 427 | 425 odd 428 | 426 even 429 | 427 odd 430 | 428 even 431 | 429 odd 432 | 430 even 433 | 431 odd 434 | 432 even 435 | 433 odd 436 | 434 even 437 | 435 odd 438 | 436 even 439 | 437 odd 440 | 438 even 441 | 439 odd 442 | 440 even 443 | 441 odd 444 | 442 even 445 | 443 odd 446 | 444 even 447 | 445 odd 448 | 446 even 449 | 447 odd 450 | 448 even 451 | 449 odd 452 | 450 even 453 | 451 odd 454 | 452 even 455 | 453 odd 456 | 454 even 457 | 455 odd 458 | 456 even 459 | 457 odd 460 | 458 even 461 | 459 odd 462 | 460 even 463 | 461 odd 464 | 462 even 465 | 463 odd 466 | 464 even 467 | 465 odd 468 | 466 even 469 | 467 odd 470 | 468 even 471 | 469 odd 472 | 470 even 473 | 471 odd 474 | 472 even 475 | 473 odd 476 | 474 even 477 | 475 odd 478 | 476 even 479 | 477 odd 480 | 478 even 481 | 479 odd 482 | 480 even 483 | 481 odd 484 | 482 even 485 | 483 odd 486 | 484 even 487 | 485 odd 488 | 486 even 489 | 487 odd 490 | 488 even 491 | 489 odd 492 | 490 even 493 | 491 odd 494 | 492 even 495 | 493 odd 496 | 494 even 497 | 495 odd 498 | 496 even 499 | 497 odd 500 | 498 even 501 | 499 odd 502 | -------------------------------------------------------------------------------- /tests/test_bookworm_files_unicode/field_descriptions.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"datatype": "searchstring", "field": "searchstring", "unique": true, "type": "text"}, 3 | {"datatype": "categorical", "field": "language", "unique": true, "type": "text"}, 4 | {"datatype": "categorical", "field": "description_", "unique": true, "type": "text"} 5 | ] 6 | -------------------------------------------------------------------------------- /tests/test_bookworm_files_unicode/input.txt: -------------------------------------------------------------------------------- 1 | john_1 ᏗᏓᎴᏂᏍᎬ ᎧᏃᎮᏛ ᎡᎮᎢ, ᎠᎴ ᎾᏍᎩ ᎧᏃᎮᏛ ᎤᏁᎳᏅᎯ ᎢᏧᎳᎭ ᎠᏁᎮᎢ, ᎠᎴ ᎾᏍᎩ ᎧᏃᎮᏛ ᎤᏁᎳᏅᎯ ᎨᏎᎢ. 
2 | quran_2 "بِسْمِ اللَّـهِ الرَّحْمَـٰنِ الرَّحِيمِ" 3 | quran_1 بِسْمِ اللَّـهِ الرَّحْمَـٰنِ الرَّحِيمِ 4 | سورة الفاتحة بِسْمِ اللَّـهِ الرَّحْمَـٰنِ الرَّحِيمِ 5 | سورة In the name of Allah, the Entirely Merciful, the Especially Merciful 6 | -------------------------------------------------------------------------------- /tests/test_bookworm_files_unicode/jsoncatalog.txt: -------------------------------------------------------------------------------- 1 | {"filename":"john_1", "language":"cherokee","searchstring":"Cherokee bible verse","description_":"Cherokee Bible Verse"} 2 | {"filename":"quran_1", "language":"arabic","searchstring":"Quran verse 1","description_": "Arabic text"} 3 | {"filename":"quran_2", "language":"arabic","searchstring":"Quran verse 1 in quotes","description_": "Arabic text in ASCII quotes"} 4 | {"filename":"سورة الفاتحة", "language":"arabic","searchstring":"Quran verse 1 with arabic filename","description_":"Arabic Filename with Arabic text"} 5 | {"filename":"سورة", "language":"english","searchstring":"Quran verse 1 in English with arabic filename","description_":"Arabic Filename with English text"} 6 | --------------------------------------------------------------------------------
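Together, these unicode fixtures document the input format that the `bookworm ... build all` call in tests/setup.py consumes: input.txt holds one document per line, an identifier followed by the full text (presumably tab-separated in the original file; this flattened listing collapses the whitespace), while jsoncatalog.txt holds one JSON object per line whose "filename" value matches an identifier in input.txt and whose remaining keys supply the metadata fields declared in field_descriptions.json. A minimal hypothetical pair, with an invented identifier and values, for illustration only:

    input.txt:        doc_1<TAB>Some text to index.
    jsoncatalog.txt:  {"filename": "doc_1", "language": "english", "searchstring": "an example", "description_": "An invented example"}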
/tests/test_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from bookwormDB.manager import BookwormManager 4 | import unittest 5 | 6 | class Bookworm_Configuration(unittest.TestCase): 7 | 8 | def test_config(self): 9 | # Constructing the manager is the whole test: it exercises the 10 | # configuration-parsing path and raises if the config cannot be read. 11 | bookworm = BookwormManager(None, "federalist_bookworm") 12 | self.assertIsNotNone(bookworm) 13 | 14 | 15 | if __name__=="__main__": 16 | unittest.main() 17 | -------------------------------------------------------------------------------- /tests/test_mysql.py: -------------------------------------------------------------------------------- 1 | from builtins import hex 2 | import unittest 3 | import bookwormDB 4 | from bookwormDB.configuration import Configfile 5 | import bookwormDB.CreateDatabase 6 | import logging 7 | import MySQLdb 8 | import random 9 | 10 | logging.basicConfig(level=logging.DEBUG) 11 | 12 | 13 | """ 14 | Tests of the MySQL configuration. 15 | """ 16 | 17 | class Bookworm_MySQL_Configuration(unittest.TestCase): 18 | def test_server_connection(self): 19 | """ 20 | Connect to MySQL and run a simple query. 21 | """ 22 | logging.info("\n\nTESTING SERVER CONNECTION\n\n") 23 | import bookwormDB.CreateDatabase 24 | db = bookwormDB.CreateDatabase.DB(dbname="mysql") 25 | sampleQuery = db.query("SELECT 1+1").fetchall() 26 | self.assertEqual(sampleQuery[0][0], 2) 27 | 28 | """ 29 | To properly test things, we actually build some bookworms. This assumes 30 | that the directory '/tmp' is writeable, though that isn't strictly 31 | necessary for a bookworm to be built. 32 | """ 33 | 34 | def test_config_files(self): 35 | logging.info("\n\nTESTING CONFIG FILE ACCESS\n\n") 36 | def read_credentials(conf): 37 | user = conf.config.get("client","user") 38 | pw = conf.config.get("client","password") 39 | return (user,pw) 40 | 41 | global_configuration_file = Configfile("read_only") 42 | admin_configuration_file = Configfile("admin") 43 | 44 | # The "read_only" configuration holds the client credentials; the 45 | # "admin" configuration holds the administrative ones. 46 | (client_user,client_pw) = read_credentials(global_configuration_file) 47 | (admin_user,admin_pw) = read_credentials(admin_configuration_file) 48 | # Log only the usernames; the passwords shouldn't end up in CI logs. 49 | logging.info("admin user is {}".format(admin_user)) 50 | logging.info("client user is {}".format(client_user)) 51 | logging.info("Checking that admin and client users are distinct") 52 | self.assertNotEqual(admin_user, client_user) 53 | 54 | def test_createDB_permission(self): 55 | logging.info("\nTESTING ABILITY TO CREATE DATABASES\n\n") 56 | import bookwormDB.configuration 57 | # Under Python 3, hex() adds no trailing 'L', so keep everything 58 | # after the '0x' prefix. 59 | dbname = "A" + hex(random.getrandbits(128))[2:] 60 | import bookwormDB.CreateDatabase 61 | db = bookwormDB.CreateDatabase.DB(dbname="mysql") 62 | cursor = db.query("CREATE DATABASE {}".format(dbname)) 63 | cursor.execute("DROP DATABASE {}".format(dbname)) 64 | cursor.close() 65 | 66 | 67 | if __name__=="__main__": 68 | unittest.main() 69 | --------------------------------------------------------------------------------