├── .gitignore ├── .travis.yml ├── LICENSE.md ├── Makefile ├── README.md ├── README.rst ├── bookwormDB ├── CreateDatabase.py ├── MetaParser.py ├── SQLAPI.py ├── __init__.py ├── benchmark.md ├── bin │ ├── dbbindings-flask.py │ ├── dbbindings.py │ └── logParser.py ├── bwExceptions.py ├── configuration.py ├── convertTSVtoJSONarray.py ├── countManager.py ├── general_API.py ├── json_schema.py ├── manager.py ├── mariaDB.py ├── multiprocessingHelp.py ├── schema_primitives.py ├── scripts │ ├── fast_featurecounter.sh │ └── mergecounted.awk ├── search_limits.py ├── sqliteKV.py ├── tokenizer.py ├── variableSet.py └── wsgi.py ├── demos ├── .ipynb_checkpoints │ ├── Reading Binary data-checkpoint.ipynb │ └── Untitled-checkpoint.ipynb └── Untitled.ipynb ├── setup.cfg ├── setup.py └── tests ├── setup.py ├── test_API.py ├── test_bookworm_files ├── field_descriptions.json ├── input.txt ├── jsoncatalog.txt └── test_bookworm_metadata.tsv ├── test_bookworm_files_unicode ├── field_descriptions.json ├── input.txt └── jsoncatalog.txt ├── test_config.py └── test_mysql.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.out 2 | *.log 3 | *.png 4 | *~ 5 | *# 6 | *.tmp 7 | kyoto* 8 | .DS_Store 9 | *.pyc 10 | *.RData 11 | HistoryDiss 12 | HistoryDiss/* 13 | *.tar.gz 14 | tmp.cron 15 | setup.sh 16 | startup.sh 17 | etc/numpy-1.6.2 18 | oldfiles 19 | files 20 | onefile.txt 21 | downloads 22 | newspapers.rdf 23 | extensions 24 | bookworm.cnf 25 | .# 26 | tmp.cron 27 | tmp.tsv 28 | tmp.txt 29 | tmp.txt 30 | .#* 31 | build 32 | old/* 33 | *~ 34 | APIkeys 35 | #* 36 | .#* 37 | .DS_Store 38 | *.cgi 39 | genderizer* 40 | *.pyc 41 | ~/.pypirc 42 | .pypirc 43 | bookwormDB.egg-info 44 | MANIFEST 45 | dist 46 | tests/test_bookworm_files/.bookworm/ 47 | tests/test_bookworm_files/http_server/ 48 | tests/test_bookworm_files_unicode/.bookworm 49 | *.bak 50 | tests/test_bookworm_metadata.tsv 51 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.6" 5 | 6 | dist: trusty 7 | 8 | services: 9 | - mysql 10 | 11 | addons: 12 | apt: 13 | packages: 14 | - mysql-server-5.6 15 | - mysql-client-core-5.6 16 | - mysql-client-5.6 17 | 18 | install: 19 | - pip install . 20 | 21 | script: 22 | - cd tests && python test_mysql.py && python test_API.py 23 | 24 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Benjamin Schmidt, Matt Nicklay, Billy Janitsch, and Erez Lieberman Aiden 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | 
-------------------------------------------------------------------------------- /Makefile: --------------------------------------------------------------------------------
1 | # This makefile handles some python package management activities.
2 | 
3 | 
4 | all: README.rst
5 | 
6 | README.rst: README.md
7 | 	pandoc -o $@ $<
8 | 
9 | clean:
10 | 	rm -rf dist
11 | 
12 | dist:
13 | 	python setup.py sdist
14 | 	python setup.py bdist_wheel
15 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | [![Travis Build Status](https://travis-ci.org/Bookworm-project/BookwormDB.svg?branch=master)](https://travis-ci.org/Bookworm-project/BookwormDB)
2 | 
3 | [BookwormDB](https://github.com/bookworm-project/BookwormDB "BookwormDB") is the main code repository for the Bookworm project. Given simply formatted files and metadata, it creates an efficient and easily queryable MySQL database that can make full use of all the metadata and lexical data in the original source. It also includes a powerful API for asking a variety of unigrammatic queries about that data.
4 | 
5 | A quick walkthrough is included below; other documentation is at [bookworm.culturomics.org](http://bookworm.culturomics.org) and in a [Bookworm Manual](http://bookworm-project.github.io/Docs) on this repository (editable at the repo [here](https://github.com/Bookworm-project/Docs)).
6 | 
7 | # Installation
8 | 
9 | Installation is tested on Ubuntu and OS X. It may work on other Unixes, but will probably not work on Windows.
10 | 
11 | 1. Install some dependencies: mysql or mariadb for the database.
12 | 2. Download the latest release, either by cloning this git repo or downloading a zip.
13 | 3. Navigate to the folder in the terminal, and type `pip install .`.
14 | 4. Type `bookworm --help` to confirm the executable has worked. If this doesn't work, file
15 | a bug report.
16 | 5. (No longer?) Type `bookworm config mysql` for some interactive prompts to allow Bookworm to edit MySQL databases on your server. (Note that this makes some other changes to your mysql configuration files; you may want to copy them first if you're using it for other things.)
17 | 
18 | ## Releases
19 | 
20 | The `master` branch is regularly tested on Travis; you are generally best off installing the latest version.
21 | 
22 | ## Related projects
23 | 
24 | This builds a database and implements the Bookworm API on a particular set of texts.
25 | 
26 | Some basic, widely appealing visualizations of the data are possible with the Bookworm [web app](https://github.com/bookworm-project/BookwormGUI "Bookworm web app"), which runs on top of the API.
27 | 
28 | A more wide-ranging set of visualizations, built on top of D3, is available in the [Bookworm D3 package](http://github.com/bmschmidt/BookwormD3).
29 | If you're looking to develop on top of Bookworm, that presents a much more flexible set of tools.
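
Developing against the API directly is also an option: any HTTP client can pull data from a running Bookworm server. A minimal sketch, assuming the walkthrough's `txtlab450` bookworm (built below) is live on port 10012 via `bookworm serve`, and that the third-party `requests` library is installed:

```python
import json
import requests

# The same query the walkthrough below issues from a web browser.
query = {
    "database": "txtlab450",
    "method": "data",
    "format": "json",
    "groups": ["date", "language"],
    "counttype": ["TextCount", "WordCount"],
}
# `bookworm serve` reads the query JSON from the q parameter.
resp = requests.get("http://localhost:10012/", params={"q": json.dumps(query)})
print(resp.text)
```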
30 | 
31 | ## Bookworms ##
32 | Here are a few Bookworms built using [BookwormDB](https://github.com/bookworm-project/BookwormDB "Bookworm"):
33 | 
34 | 1. [Open Library](http://bookworm.culturomics.org/OL/ "Open Library")
35 | 2. [ArXiv](http://bookworm.culturomics.org/arxiv/ "ArXiv")
36 | 3. [Chronicling America](http://arxiv.culturomics.org/ChronAm/ "Chronicling America")
37 | 4. [SSRN](http://bookworm.culturomics.org/ssrn/ "SSRN: Social Science Research Network")
38 | 5. [US Congress](http://bookworm.culturomics.org/congress/ "Bills in US Congress")
39 | 6. [Rate My Professor Gendered Language](http://benschmidt.org/profGender)
40 | 
41 | 
42 | ## Getting Started ##
43 | 
44 | ### Docker
45 | 
46 | We're working on docker containerization. Help appreciated. Contact `bs 145 at nyu dot edu`,
47 | no spaces involved.
48 | 
49 | ### Required MySQL Database ###
50 | 
51 | You must have a MySQL database set up that you can log into with admin access,
52 | probably with a `my.cnf` file at ~/.my.cnf. Depending on your platform, this
53 | can be a little tricky to set up.
54 | 
55 | Bookworm will automatically create a select-only user that handles web queries,
56 | preventing any malicious actions through the API.
57 | 
58 | There is a command `bookworm config mysql` that will interactively update
59 | certain settings in your global my.cnf. It may need to be run with admin privileges.
60 | 
61 | 
62 | Bookworm by default tries to log on with admin privileges with the following preferences:
63 | 
64 | ```
65 | [client]
66 | host = 127.0.0.1
67 | user = root
68 | password = ''
69 | 
70 | ```
71 | 
72 | But it also looks in several locations--`~/.my.cnf`, `~/my.cnf`, and `/etc/bookworm/admin.cnf`--for other passwords
73 | (I don't have an empty root password on my local MySQL server!).
74 | It updates the host, user, and password with values from each of those files,
75 | in that order, if they exist.
76 | 
77 | The command `bookworm config mysql-info` shows you what password and host it's
78 | trying to use.
79 | 
80 | In addition to the username and password, the host matters as well.
81 | Depending on setup, 'localhost' and '127.0.0.1' mean different things to mysql
82 | (the former is a socket, the latter a port). Depending on exactly how you're
83 | invoking mysql, you may need to use one or the other to communicate.
84 | For instance, your root account might not have login privileges through
85 | 127.0.0.1, just at localhost--it depends on exactly how the server is invoked.
86 | 
87 | To debug mysql permissions issues, type `mysql -u $USER -h 127.0.0.1 -p` at the prompt
88 | and enter your password. Once you have confirmed that this brings up a mysql prompt that
89 | can grant privileges, copy those credentials into a file at `~/.my.cnf` (or, if
90 | you're able, `/etc/bookworm/admin.cnf`)
91 | in the format given by `bookworm config mysql-info` (or the above block).
92 | 
93 | 
94 | 
95 | ## The query API
96 | 
97 | This distribution also includes two files, general_API.py and SQLAPI.py,
98 | which together constitute an implementation of the API for Bookworm, written in Python.
99 | It primarily implements the API on a MySQL database now,
100 | but includes classes for more easily implementing it on top of other platforms (such as Solr).
101 | 
102 | It is used with the [Bookworm GUI](https://github.com/Bookworm-project/BookwormGUI)
103 | and can also be used as a standalone tool to query data from your database.
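
For a concrete sense of what a standalone query looks like, here is a minimal sketch (it assumes the `txtlab450` bookworm built in the walkthrough below, and shells out to the CLI invocation described next):

```python
import json
import subprocess

# Count texts and words in the txtlab450 bookworm, grouped by language.
query = {
    "database": "txtlab450",
    "method": "data",
    "format": "json",
    "groups": ["language"],
    "counttype": ["TextCount", "WordCount"],
}
result = subprocess.run(["bookworm", "query", json.dumps(query)],
                        capture_output=True, text=True)
print(result.stdout)
```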
To run the API in its most basic form, type `bookworm query $string`,
105 | where $string is a json-formatted query. In general, query performance will be
106 | faster through bookworm's API server process, which you can start by typing `bookworm serve`
107 | and querying over port 10012.
108 | 
109 | While the point of the command-line tool `bookworm` is generally to *create* a Bookworm, the point of the query API is to retrieve results from it.
110 | 
111 | For a more interactive explanation of how the GUI works, see the [Vega-Bookworm project sandbox].
112 | 
113 | Walkthrough
114 | ===========
115 | 
116 | These are some instructions on how to build a bookworm.
117 | 
118 | We'll use a collection of 450 novels in 3 languages:
119 | 
120 | Piper, Andrew (2016): txtlab Multilingual Novels. figshare.
121 | 
122 | ### Download and unzip the files.
123 | 
124 | ```
125 | wget https://ndownloader.figshare.com/files/3686805
126 | wget https://ndownloader.figshare.com/files/3686778
127 | unzip 3686778
128 | 
129 | ```
130 | ### Create catalog and text files.
131 | 
132 | For this set, a simple python script suffices to build the
133 | two needed files, using the txtlab's files. Paste this into `parse.py`.
134 | 
135 | ```python
136 | import pandas as pd
137 | import json
138 | output = open("input.txt", "w")
139 | catalog = open("jsoncatalog.txt", "w")
140 | for book in pd.read_csv("3686805").to_dict(orient="records"):
141 |     try:
142 |         fulltext_lines = open(f"2_txtalb_Novel450/{book['filename']}").readlines()
143 |         # Bookworm reserves newline and tab characters, so they are stripped before writing.
144 |         fulltext = "\f".join(fulltext_lines)
145 |         fulltext = fulltext.replace("\r", " ").replace("\n", " ").replace("\t", " ")
146 |         book['filename'] = str(book['id'])
147 |         output.write(f"{book['filename']}\t{fulltext}\n")
148 |         book['searchstring'] = book['title'] + ' ' + book['author']
149 |         catalog.write(json.dumps(book) + "\n")
150 |     except FileNotFoundError:
151 |         # This dataset has errors!
152 |         continue
153 | ```
154 | 
155 | ```sh
156 | python parse.py
157 | ```
158 | 
159 | Create a bookworm.cnf file in the folder. (This isn't always necessary; usually
160 | it can just infer the database name from your current directory.)
161 | ```
162 | printf "[client]\ndatabase=txtlab450\n" > bookworm.cnf
163 | ```
164 | 
165 | ### Initialize the Bookworm
166 | 
167 | ```
168 | bookworm init
169 | bookworm build all
170 | ```
171 | 
172 | ### Required files
173 | 
174 | #### Required files 1: full text of each file with an identifier.
175 | 
176 | * `input.txt`
177 | 
178 | In this format, each line consists of the file's unique identifier, followed by a tab, followed by the **full text** of that file. Note that you'll have to strip out all newlines and returns from original documents. In the event that an identifier is used twice, behavior is undefined.
179 | 
180 | By changing the makefile, you can also do some more complex substitutions. (See the metadata parsers for an example of a Bookworm that directly reads hierarchical, bzipped directories without decompressing first.)
181 | 
182 | #### Required files 2: Metadata about each file.
183 | 
184 | * `jsoncatalog.txt` with one JSON object per line ("newline-delimited json" format).
185 | The keys represent shared metadata for each file; the values represent the entry for that particular document. There should be no newline or tab characters in this file.
186 | 
187 | In addition to the metadata you choose, two fields are required:
188 | 
189 | 1. 
A `searchstring` field that contains valid HTML which will be served to the user to identify the text.
190 |    * This can be a link, or simply a description of the field. If you have a URL where the text can be read, it's best to include it inside an `<a>` tag; otherwise, you can just put in any text field you want in the process of creating the jsoncatalog.txt file: something like author and title is good.
191 | 
192 | 2. A `filename` field that includes a unique identifier for the document (linked to the filename or the identifier, depending on your input format).
193 | 
194 | **Note that the python script above does both of these at once.**
195 | 
196 | #### Required Files 3: Metadata about the metadata.
197 | 
198 | Now create a file named `field_descriptions.json`, which is used to define the type of variable for each variable in `jsoncatalog.txt`.
199 | 
200 | Currently, you **do** have to include a `searchstring` definition in this, but **should not** include a filename definition.
201 | 
202 | ## Running ##
203 | 
204 | For a first run, you just want to use `bookworm init` to create the entire database (if you want to rebuild parts of a large bookworm--the metadata, for example--that is also possible).
205 | 
206 | ```
207 | bookworm init
208 | ```
209 | 
210 | This will walk you through the process of choosing a name for your database.
211 | 
212 | Then to build the bookworm, type
213 | 
214 | ```
215 | bookworm build all
216 | ```
217 | 
218 | Depending on the total number and average size of your texts,
219 | this could take a while. Sit back and relax.
220 | 
221 | Finally, you want to implement the API and see some results.
222 | 
223 | Type
224 | 
225 | ```
226 | bookworm serve
227 | ```
228 | 
229 | to start a process on port 10012 that responds to queries.
230 | This daemon must run continuously.
231 | 
232 | Then you can access query results over HTTP. Try visiting this page in a web browser:
233 | 
234 | `http://localhost:10012/?q={%22database%22:%22txtlab450%22,%22method%22:%22data%22,%22format%22:%22csv%22,%22groups%22:[%22date%22,%20%22language%22],%22counttype%22:[%22TextCount%22,%22WordCount%22]}`
235 | 
236 | 
237 | Once this works, you can use various libraries to query the endpoint,
238 | or create an HTML page that builds off the endpoint. See
239 | the (currently underdeveloped) Bookworm-Vega repository for some examples.
240 | 
241 | 
242 | ## Production servers
243 | 
244 | Serving from localhost:10012 won't work especially well in production contexts.
245 | Heavy-duty web servers do rate limiting and other things that the gunicorn process
246 | bookworm uses doesn't handle.
247 | 
248 | One strategy is to serve the web site (using bookworm-vega or something else)
249 | over port 80, while passing all CGI requests through to port 10012, where the
250 | bookworm server handles them. (Note that this may disable *other* CGI services
251 | on that particular server.)
252 | 
253 | This means it's possible to run the bookworm server anywhere, and then just
254 | forward the connection to your web host using ssh tunnels. (Note that doing so
255 | may be inefficient, because it adds an extra layer of packet encoding. I'm open
256 | to better solutions here.)
257 | 
258 | ### Apache
259 | 
260 | The steps for Apache are:
261 | 
262 | 1. Serve the Bookworm API over port 10012 (`bookworm serve`).
263 | 2. Install an Apache host on port 80.
264 | 3. Enable proxy servers and turn off any existing CGI.
**If you were previously using the CGI bookworm.**
266 | `sudo a2dismod cgi`
267 | `sudo a2enmod proxy proxy_ajp proxy_http rewrite deflate headers proxy_balancer proxy_connect proxy_html`
268 | 4. Add the following to your '/etc/apache2/sites-available/000-default.conf'
269 | (or whatever site from which you run your apache) to pass cgi-bin queries
270 | to the bookworm server.
271 | ```
272 | <Proxy *>
273 | Order deny,allow
274 | Allow from all
275 | </Proxy>
276 | ProxyPreserveHost On
277 | <Location "/cgi-bin">
278 | ProxyPass "http://127.0.0.1:10012/"
279 | ProxyPassReverse "http://127.0.0.1:10012/"
280 | </Location>
281 | ```
282 | 
-------------------------------------------------------------------------------- /README.rst: --------------------------------------------------------------------------------
1 | |Travis Build Status|
2 | 
3 | `BookwormDB <https://github.com/Bookworm-project/BookwormDB>`__ is the
4 | main code repository for the Bookworm project. Given simply formatted
5 | files and metadata, it creates an efficient and easily queryable MySQL
6 | database that can make full use of all the metadata and lexical data in
7 | the original source. It also includes a powerful API for asking a
8 | variety of unigrammatic queries about that data.
9 | 
10 | A quick walkthrough is included below; other documentation is at
11 | `bookworm.culturomics.org <http://bookworm.culturomics.org>`__ and in a `Bookworm
12 | Manual <http://bookworm-project.github.io/Docs>`__ on this repository
13 | (editable at the repo
14 | `here <https://github.com/Bookworm-project/Docs>`__).
15 | 
16 | Installation
17 | ============
18 | 
19 | Installation is tested on Ubuntu and OS X. It may work on other Unixes,
20 | but will not work on Windows.
21 | 
22 | 1. Install some dependencies; mysql or mariadb for databases, and GNU
23 |    parallel for parallel processing.
24 | 2. Download the latest release, either by cloning this git repo or
25 |    downloading a zip.
26 | 3. Navigate to the folder in the terminal, and type ``pip install .``.
27 | 
28 |    - If ``/usr/lib/cgi-bin`` is not writeable by your account, you may
29 |      need to type ``sudo pip install .``
30 | 
31 | 4. Type ``bookworm --help`` to confirm the executable has worked. If
32 |    this doesn't work, file a bug report.
33 | 5. Type ``bookworm config mysql`` for some interactive prompts to allow
34 |    Bookworm to edit MySQL databases on your server. (Note that this
35 |    makes some other changes to your mysql configuration files; you may
36 |    want to copy them first if you're using it for other things.)
37 | 
38 | Releases
39 | --------
40 | 
41 | The ``master`` branch is regularly tested on Travis; you are generally
42 | best off installing the latest version.
43 | 
44 | Related projects
45 | ----------------
46 | 
47 | This builds a database and implements the Bookworm API on a particular set
48 | of texts.
49 | 
50 | Some basic, widely appealing visualizations of the data are possible
51 | with the Bookworm `web
52 | app <https://github.com/bookworm-project/BookwormGUI>`__, which runs on
53 | top of the API.
54 | 
55 | A more wide-ranging set of visualizations is available, built on top of
56 | D3, in the `Bookworm D3
57 | package <http://github.com/bmschmidt/BookwormD3>`__. If you're looking
58 | to develop on top of Bookworm, that presents a much more flexible set of
59 | tools.
60 | 
61 | Bookworms
62 | ---------
63 | 
64 | Here are a few Bookworms built using
65 | `BookwormDB <https://github.com/Bookworm-project/BookwormDB>`__:
66 | 
67 | 1. `Open Library <http://bookworm.culturomics.org/OL/>`__
68 | 2. `ArXiv <http://bookworm.culturomics.org/arxiv/>`__
69 | 3. `Chronicling America <http://arxiv.culturomics.org/ChronAm/>`__
70 | 4. `SSRN <http://bookworm.culturomics.org/ssrn/>`__
71 | 5. `US Congress <http://bookworm.culturomics.org/congress/>`__
72 | 6. 
`Rate My Professor Gendered
73 |    Language <http://benschmidt.org/profGender>`__
74 | 
75 | Getting Started
76 | ---------------
77 | 
78 | Required MySQL Database
79 | ~~~~~~~~~~~~~~~~~~~~~~~
80 | 
81 | The hardest part about setting up Bookworm is properly configuring the
82 | MySQL installation. The easiest way to test out Bookworm on your home
83 | computer may be to use a VM running Ubuntu; installation is relatively
84 | easy using OS X, as well.
85 | 
86 | At the very least, there must be a MySQL user with permissions to insert
87 | + select data from all databases. The easiest way to handle this is to
88 | have a user with root access defined in your system-wide MySQL
89 | configuration files.
90 | 
91 | This creates a bit of a security risk, though, so we recommend 2 MySQL
92 | users: an admin user with the ability to create new databases (i.e.
93 | GRANT ALL) and a second user that is only able to select data from
94 | databases (i.e. GRANT SELECT). This is for security: your data is safer
95 | if the web user can't modify it at all.
96 | 
97 | Setting up databases automatically
98 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
99 | 
100 | Running ``bookworm config mysql`` will take care of most of these tasks
101 | through an interactive prompt; ``bookworm config --force mysql`` will
102 | come up with automatic answers for all questions, and is suitable in a
103 | scripting situation (like setting up a variety of VMs). The
104 | configuration script will ask for a variety of different passwords, and
105 | may request an administrative password from the machine.
106 | 
107 | Restart your MySQL server after the configuration script has run.
108 | 
109 | If you encounter problems in the config script, please feel free to post
110 | issues to the project github repo.
111 | 
112 | Setting up databases manually
113 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
114 | 
115 | If you have an existing MySQL configuration you do not want to risk
116 | hurting, you may want to proceed by hand.
117 | 
118 | First, create an admin user:
119 | 
120 | For example, create a user ``foobar`` with password ``mysecret`` and
121 | full access to all databases from ``localhost``:
122 | 
123 | .. code:: mysql
124 | 
125 |     CREATE USER 'foobar'@'localhost' IDENTIFIED BY 'mysecret';
126 |     GRANT ALL PRIVILEGES ON *.* TO 'foobar'@'localhost' WITH GRANT OPTION;
127 |     FLUSH PRIVILEGES;
128 | 
129 | Then put the credentials for that user in a file that will be
130 | automatically read when you start up MySQL. The best place for this is
131 | at ``~/.my.cnf``. If multiple users on your machine will be
132 | administering the bookworm, another recommended location is
133 | ``/etc/bookworm/admin.cnf``. In this example, that file would look like
134 | this:
135 | 
136 | ::
137 | 
138 |     [client]
139 |     user = 'foobar'
140 |     password = 'mysecret'
141 | 
142 | The second user would be the user that the API uses to get data for
143 | queries over the web. Note that this user has only "select" rights.
144 | 
145 | .. code:: mysql
146 | 
147 |     GRANT SELECT ON *.* TO 'bookworm_client'@'localhost' IDENTIFIED BY 'otherpassword';
148 |     FLUSH PRIVILEGES;
149 | 
150 | Then add a section to your **systemwide** ``my.cnf`` file (usually at
151 | ``/etc/mysql/my.cnf``, ``/etc/my.cnf``, or a similar location).
152 | 
153 | ::
154 | 
155 |     [client]
156 |     user = 'bookworm_client'
157 |     password = 'otherpassword'
158 | 
159 | With these settings in place, you're ready to begin building a Bookworm.
160 | See `the walkthrough <#walkthrough>`__ for a fuller example.
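
If you want to confirm that Bookworm will find working credentials, a
minimal sketch in Python (this assumes the MySQLdb module that Bookworm
itself depends on, and an admin file at one of the locations above;
adjust the path to wherever yours lives):

.. code:: python

    import MySQLdb

    # read_default_file points MySQLdb at the same [client] section
    # (user/password/host) that the bookworm command reads.
    conn = MySQLdb.connect(read_default_file="/etc/bookworm/admin.cnf")
    cur = conn.cursor()
    cur.execute("SELECT CURRENT_USER()")
    print(cur.fetchone())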
161 | 
162 | The query API
163 | -------------
164 | 
165 | This distribution also includes two files, general\_API.py and
166 | SQLAPI.py, which together constitute an implementation of the API for
167 | Bookworm, written in Python. It primarily implements the API on a MySQL
168 | database now, but includes classes for more easily implementing it on
169 | top of other platforms (such as Solr).
170 | 
171 | It is used with the `Bookworm
172 | GUI <https://github.com/Bookworm-project/BookwormGUI>`__ and can also be
173 | used as a standalone tool to query data from your database. To run the
174 | API in its most basic form, type ``bookworm query $string``, where
175 | $string is a json-formatted query.
176 | 
177 | An executable is bundled in the distro at
178 | ``bookwormDB/bin/dbbindings.py`` that, when placed in your cgi-bin
179 | folder, will serve the API to and from the web; when you install
180 | bookworm, it attempts to move this into a web directory for you.
181 | 
182 | While the point of the command-line tool ``bookworm`` is generally to
183 | *create* a Bookworm, the point of the query API is to retrieve results
184 | from it.
185 | 
186 | For a more interactive explanation of how the GUI works, see the `D3
187 | bookworm browser `__ (Sorry, this
188 | is broken for the moment).
189 | 
190 | Installing the API.
191 | ~~~~~~~~~~~~~~~~~~~
192 | 
193 | On most systems, ``pip install .`` in the ``bookwormDB`` dir should
194 | deposit a copy in an appropriate location on your system (such as
195 | ``/usr/lib/cgi-bin``).
196 | 
197 | If that doesn't work, just run
198 | ``cp bookwormDB/bin/dbbindings.py /usr/lib/cgi-bin`` (exact locations
199 | may vary) to put it in the correct location.
200 | 
201 | If using homebrew on OS X, the shebang at the beginning of
202 | ``dbbindings.py`` may be incorrect. (It will not load your installed
203 | python modules). Change it from ``#!/usr/bin/env python`` to
204 | ``#!/usr/local/bin/python``, and it should work. (Or you can fix the
205 | PYTHONPATH that apache uses as `described
206 | here `__, but
207 | that is considerably harder than just changing the bookworm code.)
208 | 
209 | Walkthrough
210 | ===========
211 | 
212 | These are some instructions on how to build a bookworm.
213 | 
214 | Indented bits tell you how to build one specific bookworm using `text
215 | from the summaries of
216 | bills `__ introduced
217 | in the US Congress from 1973 to the present day. The goal is to
218 | provide everything needed to build a Bookworm using publicly
219 | available data.
220 | 
221 | Get the Data
222 | ------------
223 | 
224 | First off, you need a collection of texts to analyze. Ideally this
225 | should be more than 1000 individual texts, with some year (or other
226 | time) description.
227 | 
228 | To download the congress data, Matt Nicklay has put together a
229 | script in another repo that will download everything you'll need.
230 | Clone that repo and run ``get_and_unzip_data.py`` to fetch and unzip
231 | the data:
232 | 
233 | ::
234 | 
235 |     git clone git://github.com/bmschmidt/congress_api
236 |     cd congress_api
237 |     python get_and_unzip_data.py
238 | 
239 | This will take a few minutes depending on your Internet connection
240 | and the speed of your computer. The ``get_and_unzip_data.py`` script
241 | simply downloads and unzips all the files in parallel using
242 | `multiprocessing `__.
243 | NOTE: Once fully unzipped, the files will take up just under 3GB of
244 | disk space.
245 | 
246 | Prep to Build Bookworm
247 | ----------------------
248 | 
249 | If you haven't already, install this repo on your system.
250 | 
251 | ::
252 | 
253 |     git clone git://github.com/Bookworm-project/BookwormDB
254 |     cd BookwormDB
255 |     python setup.py install
256 | 
257 | Required Files
258 | ~~~~~~~~~~~~~~
259 | 
260 | To build a bookworm, you need to create three files in the directory you
261 | plan to use. You can have whatever other files you want in the root
262 | directory. But these three names are reserved for bookworm use.
263 | 
264 | ::
265 | 
266 |     congress/
267 |     |   input.txt
268 |     |   jsoncatalog.txt
269 |     |   field_descriptions.json
270 | 
271 | Required files 1: input.txt:
272 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
273 | 
274 | The first is slightly more complicated than it appears. It contains the
275 | various files you'll be reading in as unicode text. These can be input
276 | in one of three ways.
277 | 
278 | The first, which will be faster in most cases, is as a *single file*.
279 | 
280 | - ``input.txt``
281 | 
282 | In this format, each line consists of the file's unique identifier,
283 | followed by a tab, followed by the **full text** of that file. Note that
284 | you'll have to strip out all newlines and returns from original
285 | documents. In the event that an identifier is used twice, behavior is
286 | undefined.
287 | 
288 | By changing the makefile, you can also do some more complex
289 | substitutions. (See the metadata parsers for an example of a Bookworm
290 | that directly reads hierarchical, bzipped directories without
291 | decompressing first).
292 | 
293 | **Format 2** is as a directory of files:
294 | 
295 | - ``input/``
296 | 
297 | This folder should contain a uniquely named .txt file for every item in
298 | your collection of texts that you want to build a bookworm around. The
299 | files may be stored in subdirectories: if so, their identifier key
300 | should include the full path to the file (but not the trailing '.txt').
301 | (NOTE: this is currently unimplemented)
302 | 
303 | **Format 3** is as a shell script named
304 | 
305 | - ``input_script``
306 | 
307 | That script, when executed, should output a stream formatted the same as
308 | input.txt. In some cases, this will allow you to save a lot of disk space
309 | and/or time. It must be executable and have a shebang on the first line
310 | designating the interpreter. (NOTE: currently unimplemented).
311 | 
312 | To build the congress API, we must create an ``input.txt`` file with
313 | raw text from summaries of bills introduced into Congress. Each line
314 | contains a unique ID and the text from the summary of a single bill.
315 | Then, we will create the ``jsoncatalog.txt`` file which will hold
316 | metadata for each bill, including a field that links each JSON
317 | object to a line in input.txt. Included in the
318 | `congress\_api <https://github.com/bmschmidt/congress_api>`__ repo is
319 | a script ``congress_parser.py`` which we'll run to create
320 | ``jsoncatalog.txt`` and the ``input.txt`` file.
321 | 
322 | ::
323 | 
324 |     cd congress_api
325 |     python congress_parser.py
326 | 
327 | Required files 2: Metadata about each file.
328 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
329 | 
330 | - ``jsoncatalog.txt`` with one JSON object per line. The keys represent
331 |   shared metadata for each file: the values represent the entry for
332 |   that particular document. There should be no new line or tab
333 |   characters in this file.
334 | 
335 | In addition to the metadata you choose, two fields are required:
336 | 
337 | 1. 
A ``searchstring`` field that contains valid HTML which will be 338 | served to the user to identify the text. 339 | 340 | - This can be a link, or simply a description of the field. If you have 341 | a URL where the text can be read, it's best to include it inside an 342 | tag: otherwise, you can just put in any text field you want in the 343 | process of creating the jsoncatalog.txt file: something like author 344 | and title is good. 345 | 346 | 2. A ``filename`` field that includes a unique identifier for the 347 | document (linked to the filename or the identifier, depending on your 348 | input format). 349 | 350 | Congress users have already created this file in the previous step. 351 | 352 | Required Files 3: Metadata about the metadata. 353 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 354 | 355 | Now create a file in the ``field_descriptions.json`` which is used to 356 | define the type of variable for each variable in ``jsoncatalog.txt``. 357 | 358 | Currently, you **do** have to include a ``searchstring`` definition in 359 | this, but **should not** include a filename definition. 360 | 361 | For the Congress demo, copy the following JSON object into 362 | ``field_descriptions.json``: 363 | 364 | .. code:: json 365 | 366 | [ 367 | {"field":"date","datatype":"time","type":"numeric","unique":true,"derived":[{"resolution":"month"}]}, 368 | {"field":"searchstring","datatype":"searchstring","type":"text","unique":true}, 369 | {"field":"enacted","datatype":"categorical","type":"text","unique":false}, 370 | {"field":"sponsor_state","datatype":"categorical","type":"text","unique":false}, 371 | {"field":"cosponsors_state","datatype":"categorical","type":"text","unique":false}, 372 | {"field":"chamber","datatype":"categorical","type":"text","unique":false} 373 | ] 374 | 375 | Everything should now be in place and we are ready to build the 376 | database. 377 | 378 | Running 379 | ------- 380 | 381 | For a first run, you just want to use ``bookworm init`` to create the 382 | entire database (if you want to rebuild parts of a large bookworm--the 383 | metadata, for example--that is also possible.) 384 | 385 | :: 386 | 387 | bookworm init 388 | 389 | This will walk you through the process of choosing a name for your 390 | database. 391 | 392 | Then to build the bookworm, type 393 | 394 | :: 395 | 396 | bookworm build all 397 | 398 | Depending on the total number and average size of your texts, this could 399 | take a while. Sit back and relax. 400 | 401 | Finally, you may want to set up a GUI. 402 | 403 | To test a local one over a python webserver, type 404 | 405 | :: 406 | 407 | bookworm serve 408 | 409 | Otherwise, you can type 410 | 411 | :: 412 | 413 | bookworm build linechartGUI 414 | 415 | General Workflow 416 | ~~~~~~~~~~~~~~~~ 417 | 418 | For reference, the general workflow of the Makefile is the following: 419 | 420 | 5. Build the directory structure in ``files/texts/``. 421 | 6. Derive ``.bookworm/metadata/field_descriptions_derived.json`` from 422 | ``.bookworm/metadata/field_descriptions.txt``. 423 | 7. Derive ``.bookworm/metadata/jsoncatalog_derived.txt`` from 424 | ``.bookworm/metadata/jsoncatalog.json``, respectively. 425 | 8. Create metadata catalog files in ``.bookworm/metadata/``. 426 | 9. Create a table with all words from the text files, and save the 427 | million most common for regular use. 428 | 10. Encode unigrams and bigrams from the texts into 429 | ``.bookworm/encoded`` 430 | 11. Load data into MySQL database. 431 | 12. 
Create temporary MySQL table and .json file that will be used by the
432 |     web app.
433 | 13. Create API settings.
434 | 
435 | Dependencies
436 | ============
437 | 
438 | - python 3.6+ (with modules):
439 | - nltk (recommended, to be required)
440 | - numpy
441 | - regex (to handle complicated Unicode regular expressions for
442 |   tokenization: ``easy_install regex``)
443 | - pandas (used by the API, not this precise set of scripts)
444 | - parallel (GNU parallel, in versions available from apt-get or
445 |   homebrew)
446 | - MySQL v. 5.6 (will work with 5.5, but future versions may require 5.6
447 |   for some functionality; MariaDB 10.0+ is also actively supported.
448 |   Some people have reported that it largely works with MySQL 5.1)
449 | - Apache or other webserver (for front end, if you don't just want to
450 |   run the simple version through ``bookworm serve`` that uses an
451 |   obscure port.)
452 | 
453 | .. |Travis Build Status| image:: https://travis-ci.org/Bookworm-project/BookwormDB.svg?branch=master
454 |    :target: https://travis-ci.org/Bookworm-project/BookwormDB
455 | 
-------------------------------------------------------------------------------- /bookwormDB/MetaParser.py: --------------------------------------------------------------------------------
1 | from __future__ import division
2 | from datetime import date
3 | import datetime
4 | import dateutil.parser
5 | import json
6 | import sys
7 | import os
8 | import logging
9 | from multiprocessing import Queue, Process
10 | from queue import Empty
11 | from .multiprocessingHelp import mp_stats, running_processes
12 | import time
13 | 
14 | 
15 | defaultDate = datetime.datetime(datetime.MINYEAR, 1, 1)
16 | 
17 | def DaysSinceZero(dateobj):
18 |     # Zero isn't a date, which Python knows but MySQL and JavaScript don't.
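    # Worked example: (date(2007, 10, 7) - date(1, 1, 1)).days == 732955; adding
    # 366 (MySQL treats the nonexistent year 0 as having 366 days) gives 733321,
    # which matches MySQL's TO_DAYS('2007-10-07').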
19 | return (dateobj - date(1,1,1)).days + 366 20 | 21 | def ParseFieldDescs(write = False): 22 | f = open('field_descriptions.json', 'r') 23 | try: 24 | fields = json.loads(f.read()) 25 | except ValueError: 26 | raise ValueError("Error parsing JSON: Check to make sure that your field_descriptions.json file is valid?") 27 | 28 | 29 | if write: 30 | derivedFile = open('.bookworm/metadata/field_descriptions_derived.json', 'w') 31 | 32 | output = [] 33 | 34 | fields_to_derive = [] 35 | 36 | for field in fields: 37 | if field["datatype"] == "time": 38 | if "derived" in field: 39 | fields_to_derive.append(field) 40 | else: 41 | output.append(field) 42 | else: 43 | output.append(field) 44 | 45 | for field in fields_to_derive: 46 | for derive in field["derived"]: 47 | if "aggregate" in derive: 48 | tmp = dict(datatype="time", type="integer", unique=True) 49 | tmp["field"] = '_'.join([field["field"], derive["resolution"], 50 | derive["aggregate"]]) 51 | output.append(tmp) 52 | else: 53 | tmp = dict(datatype="time", type="integer", unique=True) 54 | tmp["field"] = '_'.join([field["field"], derive["resolution"]]) 55 | output.append(tmp) 56 | if write: 57 | derivedFile.write(json.dumps(output)) 58 | derivedFile.close() 59 | 60 | return (fields_to_derive, fields) 61 | 62 | def parse_json_catalog(line_queue, processes, modulo): 63 | fields_to_derive, fields = ParseFieldDescs(write = False) 64 | 65 | if os.path.exists("jsoncatalog.txt"): 66 | mode = "json" 67 | fin = open("jsoncatalog.txt") 68 | 69 | if os.path.exists("catalog.csv"): 70 | mode = "csv" 71 | import csv 72 | fin = csv.DictReader("catalog.csv") 73 | 74 | for i, line in enumerate(fin): 75 | if i % processes != modulo: 76 | continue 77 | 78 | for char in ['\t', '\n']: 79 | line = line.replace(char, '') 80 | 81 | if mode == "json": 82 | try: 83 | line = json.loads(line) 84 | except: 85 | logging.warn("Couldn't parse catalog line {}".format(line)) 86 | continue 87 | 88 | for field in fields: 89 | # Smash together misidentified lists 90 | try: 91 | if field['unique'] and isinstance(line[field["field"]],list): 92 | line[field["field"]] = "--".join(line[field["field"]]) 93 | except KeyError: 94 | pass 95 | 96 | for field in fields_to_derive: 97 | 98 | """ 99 | Using fields_to_derive as a shorthand for dates--this may break 100 | if we get more ambitious about derived fields, 101 | but this whole metadata-parsing code needs to be refactored anyway. 102 | 103 | Note: this code is inefficient--it parses the same date multiple times. 104 | We should be parsing the date once and pulling 105 | derived fields out of that one parsing. 
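            For example, a field declared in field_descriptions.json as
            {"field": "date", "datatype": "time", "derived":
            [{"resolution": "month"}]} gets a derived "date_month" key
            added to each catalog line by the loop below.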
106 |             """
107 | 
108 |             try:
109 |                 if line[field["field"]]=="":
110 |                     # Use blankness as a proxy for unknown
111 |                     continue
112 | 
113 |                 time = dateutil.parser.parse(line[field["field"]],default = defaultDate)
114 |                 intent = [time.year,time.month,time.day]
115 |                 content = [str(item) for item in intent]
116 | 
117 |                 pass
118 |             except:
119 |                 """
120 |                 Fall back to parsing as strings
121 |                 """
122 |                 try:
123 |                     datem = line[field["field"]].split("T")[0]
124 |                     content = datem.split('-')
125 |                     intent = [int(item) for item in content]
126 |                 except KeyError:
127 |                     #It's OK not to have an entry for a time field
128 |                     continue
129 |                 except ValueError:
130 |                     # Thrown if fields are empty on taking the int value: treat as junk
131 |                     continue
132 |                 except AttributeError:
133 |                     """
134 |                     Happens if it's an integer, which is a forgiveable way
135 |                     to enter a year:
136 |                     """
137 |                     content = [str(line[field['field']])]
138 |                     intent = [line[field['field']]]
139 |             else:
140 |                 for derive in field["derived"]:
141 |                     try:
142 |                         if "aggregate" in derive:
143 |                             if derive["resolution"] == 'day' and \
144 |                                derive["aggregate"] == "year":
145 |                                 k = "%s_day_year" % field["field"]
146 |                                 dt = date(intent[0], intent[1], intent[2])
147 |                                 line[k] = dt.timetuple().tm_yday
148 |                             elif derive["resolution"] == 'day' and \
149 |                                  derive["aggregate"] == "month":
150 |                                 k = "%s_day_month" % field["field"]
151 |                                 line[k] = intent[2]
152 |                             elif derive["resolution"] == 'day' and \
153 |                                  derive["aggregate"] == "week":
154 |                                 k = "%s_day_week" % field["field"]
155 |                                 dt = date(intent[0], intent[1], intent[2])
156 |                                 # Python and javascript handle weekdays differently:
157 |                                 # Like JS, we want to begin on Sunday with zero
158 |                                 line[k] = dt.weekday() + 1
159 |                                 if (line[k]) == 7:
160 |                                     line[k] = 0
161 |                             elif derive["resolution"] == 'month' and \
162 |                                  derive["aggregate"] == "year":
163 |                                 k = "%s_month_year" % field["field"]
164 |                                 dt = date(1,intent[1],1)
165 |                                 line[k] = dt.timetuple().tm_yday
166 |                             elif derive["resolution"] == 'week' and \
167 |                                  derive["aggregate"] == "year":
168 |                                 dt = date(intent[0], intent[1], intent[2])
169 |                                 k = "%s_week_year" % field["field"]
170 |                                 line[k] = int(dt.timetuple().tm_yday/7)*7
171 |                             elif derive["resolution"] == 'hour' and \
172 |                                  derive["aggregate"] == "day":
173 |                                 k = "%s_hour_day" % field["field"]
174 |                                 line[k] = time.hour
175 |                             elif derive["resolution"] == 'minute' and \
176 |                                  derive["aggregate"] == "day":
177 |                                 k = "%s_minute_day" % field["field"]
178 |                                 line[k] = time.hour*60 + time.minute
179 |                             else:
180 |                                 logging.warning('Problem with aggregate resolution.')
181 |                                 continue
182 |                         else:
183 |                             if derive["resolution"] == 'year':
184 |                                 line["%s_year" % field["field"]] = intent[0]
185 |                             elif derive["resolution"] == 'month':
186 |                                 try:
187 |                                     k = "%s_month" % field["field"]
188 |                                     dt = date(intent[0], intent[1], 1)
189 |                                     line[k] = DaysSinceZero(dt)
190 |                                 except:
191 |                                     logging.warning("Problem with date fields\n")
192 |                                     pass
193 |                             elif derive['resolution'] == 'week':
194 |                                 k = "%s_week" % field['field']
195 |                                 dt = date(intent[0], intent[1], intent[2])
196 |                                 inttime = DaysSinceZero(dt)
197 |                                 time = int(inttime/7)*7
198 |                                 #Not starting on Sunday or anything funky like that. Actually, I don't know what we're starting on. Adding an integer here would fix that.
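                                # (DaysSinceZero values divisible by 7 fall on Saturdays --
                                # e.g. 733320 is Saturday 2007-10-06 -- so as written these
                                # week bins start on Saturday.)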
199 | line[k] = time 200 | elif derive['resolution'] == 'day': 201 | k = "%s_day" % field['field'] 202 | dt = date(intent[0], intent[1], intent[2]) 203 | inttime = DaysSinceZero(dt) 204 | line[k] = inttime 205 | else: 206 | logging.warning('Resolution %s currently not supported.' % (derive['resolution'])) 207 | continue 208 | except ValueError: 209 | # One of out a million Times articles threw this with 210 | # a year of like 111,203. It's not clear how best to 211 | # handle this. 212 | logging.warning("ERROR: %s " % line[field["field"]] + 213 | "did not convert to proper date. Moving on...") 214 | # raise 215 | pass 216 | except Exception as e: 217 | logging.warning('*'*50) 218 | logging.warning('ERROR: %s\nINFO: %s\n' % (str(e), e.__doc__)) 219 | logging.warning('*'*50) 220 | line.pop(field["field"]) 221 | try: 222 | el = json.dumps(line) 223 | line_queue.put((line["filename"], el)) 224 | except KeyError: 225 | logging.warning("No filename key in {}".format(line)) 226 | except: 227 | logging.warning("Error on {}".format(line)) 228 | raise 229 | logging.debug("Metadata thread done after {} lines".format(i)) 230 | 231 | 232 | def parse_catalog_multicore(): 233 | from .sqliteKV import KV 234 | cpus, _ = mp_stats() 235 | encoded_queue = Queue(10000) 236 | workers = [] 237 | 238 | for i in range(cpus): 239 | p = Process(target = parse_json_catalog, args = (encoded_queue, cpus, i)) 240 | p.start() 241 | workers.append(p) 242 | output = open(".bookworm/metadata/jsoncatalog_derived.txt", "w") 243 | 244 | bookids = KV(".bookworm/metadata/textids.sqlite") 245 | import sqlite3 246 | 247 | while True: 248 | try: 249 | filename, n = encoded_queue.get_nowait() 250 | output.write(n + "\n") 251 | ids = set() 252 | try: 253 | bookids.register(filename) 254 | except sqlite3.IntegrityError: 255 | if filename in ids: 256 | logging.warning("Duplicate key insertion {}".format(filename)) 257 | ids.add(filename) 258 | 259 | except Empty: 260 | if running_processes(workers): 261 | # Give it a sec to fill back up to avoid this thread taking up 262 | # a full processor. 263 | time.sleep(0.01) 264 | else: 265 | # We're done! 266 | break 267 | 268 | bookids.close() 269 | output.close() 270 | -------------------------------------------------------------------------------- /bookwormDB/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bookworm-project/BookwormDB/a7eb8482879143ffc6a0fb55a891f765d2aae383/bookwormDB/__init__.py -------------------------------------------------------------------------------- /bookwormDB/benchmark.md: -------------------------------------------------------------------------------- 1 | at 3000 files per batch, 100 seconds to load in the streets from the raw file: 2 | 3 | -------------------------------------------------------------------------------- /bookwormDB/bin/dbbindings-flask.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # So we load in the terms that allow the API implementation to happen for now. 
4 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall 5 | from flask import Flask, request, Response, jsonify 6 | import json 7 | import os 8 | 9 | app = Flask(__name__) 10 | 11 | 12 | @app.route('/') 13 | def index(): 14 | JSONinput = request.args.get('queryTerms') or request.args.get('query') 15 | if not JSONinput: 16 | return "Need query or queryTerms argument" 17 | return main(JSONinput) 18 | 19 | @app.route('/debug') 20 | def debug_api(): 21 | import logging 22 | logging.basicConfig(level=logging.INFO) 23 | JSONinput = request.args.get('queryTerms') or request.args.get('query') 24 | if not JSONinput: 25 | return "Need query or queryTerms argument" 26 | return main(JSONinput) 27 | 28 | @app.route('/debug/query') 29 | def debug_query(): 30 | JSONinput = request.args.get('queryTerms') or request.args.get('query') 31 | return JSONinput 32 | 33 | 34 | def main(JSONinput): 35 | 36 | query = json.loads(JSONinput) 37 | 38 | p = SQLAPIcall(query) 39 | result = p.execute() 40 | 41 | if (query['method'] == 'data' and 'format' in query and 42 | query['format'] == 'json'): 43 | # New format for response 44 | jresp = json.loads(result) 45 | resp = jsonify(jresp) 46 | if jresp['status'] == 'error': 47 | resp.status_code = jresp['code'] if 'code' in jresp else 500 48 | else: 49 | resp = Response(result) 50 | 51 | if query['method'] == "return_tsv": 52 | resp.headers['Content-Type'] = "text; charset=utf-8" 53 | resp.headers["Content-Disposition"] = "filename=Bookworm-data.txt" 54 | resp.headers["Pragma"] = "no-cache" 55 | resp.headers["Expires"] = 0 56 | elif query['method'] in ['return_json', 'return_pickle']: 57 | resp.headers['Content-Type'] = "text/html" 58 | 59 | resp.headers['Access-Control-Allow-Origin'] = '*' 60 | resp.headers['Access-Control-Allow-Methods'] = 'GET, POST, PUT, OPTIONS' 61 | resp.headers['Access-Control-Allow-Headers'] = 'Origin, Accept, '\ 62 | 'Content-Type, X-Requested-With, X-CSRF-Token' 63 | 64 | return resp 65 | 66 | if __name__ == '__main__': 67 | port = int(os.environ.get('PORT', 8080)) 68 | app.run(host='0.0.0.0', port=port, debug=True) 69 | -------------------------------------------------------------------------------- /bookwormDB/bin/dbbindings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # So we load in the terms that allow the API implementation to happen for now. 4 | from __future__ import print_function 5 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall 6 | import cgi 7 | import cgitb 8 | import json 9 | 10 | cgitb.enable() 11 | 12 | 13 | def headers(method, errorcode=False): 14 | 15 | print('Access-Control-Allow-Origin: *') 16 | print('Access-Control-Allow-Methods: GET, POST, PUT, OPTIONS') 17 | print('Access-Control-Allow-Headers: Origin, Accept, Content-Type, ' \ 18 | 'X-Requested-With, X-CSRF-Token') 19 | 20 | if errorcode: 21 | print("Status: %d" % errorcode) 22 | 23 | if method != "return_tsv": 24 | print("Content-type: text/html\n") 25 | 26 | elif method == "return_tsv": 27 | print("Content-type: text; charset=utf-8") 28 | print("Content-Disposition: filename=Bookworm-data.txt") 29 | print("Pragma: no-cache") 30 | print("Expires: 0\n") 31 | 32 | 33 | def debug(string): 34 | """ 35 | Makes it easier to debug through a web browser by handling the headers 36 | No calls should be permanently left in the code ever, or they will break 37 | things badly. 38 | """ 39 | print(headers('1')) 40 | print("
") 41 | print(string) 42 | print("
") 43 | 44 | 45 | def main(JSONinput): 46 | 47 | query = json.loads(JSONinput) 48 | # Set up the query. 49 | p = SQLAPIcall(query) 50 | 51 | # run the query. 52 | resp = p.execute() 53 | 54 | if query['method'] == 'data' and 'format' in query and query['format'] == 'json': 55 | try: 56 | resp = json.loads(resp) 57 | except: 58 | resp = dict(status="error", code=500, 59 | message="Internal error: server did not return json") 60 | 61 | # Print appropriate HTML headers 62 | if 'status' in resp and resp['status'] == 'error': 63 | code = resp['code'] if 'code' in resp else 500 64 | headers(query['method'], errorcode=code) 65 | else: 66 | headers(query['method']) 67 | print(json.dumps(resp)) 68 | else: 69 | headers(query['method']) 70 | print(resp) 71 | 72 | return True 73 | 74 | 75 | if __name__ == "__main__": 76 | form = cgi.FieldStorage() 77 | 78 | # Still supporting two names for the passed parameter. 79 | try: 80 | JSONinput = form["queryTerms"].value 81 | except KeyError: 82 | JSONinput = form["query"].value 83 | 84 | main(JSONinput) 85 | -------------------------------------------------------------------------------- /bookwormDB/bin/logParser.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from future import standard_library 3 | standard_library.install_aliases() 4 | import urllib.request, urllib.parse, urllib.error 5 | import os 6 | import re 7 | import gzip 8 | import json 9 | import sys 10 | 11 | files = os.listdir("/var/log/apache2") 12 | 13 | words = [] 14 | 15 | for file in files: 16 | reading = None 17 | if re.search("^access.log..*.gz", file): 18 | reading = gzip.open("/var/log/apache2/" + file) 19 | elif re.search("^access.log.*", file): 20 | reading = open("/var/log/apache2/" + file) 21 | else: 22 | continue 23 | sys.stderr.write(file + "\n") 24 | 25 | for line in reading: 26 | matches = re.findall(r"([0-9\.]+).*\[(.*)].*cgi-bin/dbbindings.py/?.query=([^ ]+)", line) 27 | for fullmatch in matches: 28 | t = dict() 29 | t['ip'] = fullmatch[0] 30 | match = fullmatch[2] 31 | try: 32 | data = json.loads(urllib.parse.unquote(match).decode('utf8')) 33 | except ValueError: 34 | continue 35 | try: 36 | if isinstance(data['search_limits'], dict): 37 | data['search_limits'] = [data['search_limits']] 38 | for setting in ['words_collation', 'database']: 39 | try: 40 | t[setting] = data[setting] 41 | except KeyError: 42 | t[setting] = "" 43 | for limit in data['search_limits']: 44 | p = dict() 45 | for constraint in ["word", "TV_show", "director"]: 46 | try: 47 | p[constraint] = p[constraint] + "," +\ 48 | (",".join(limit[constraint])) 49 | except KeyError: 50 | try: 51 | p[constraint] = (",".join(limit[constraint])) 52 | except KeyError: 53 | p[constraint] = "" 54 | for key in list(p.keys()): 55 | t[key] = p[key] 56 | vals = [t[key] for key in ('ip', 'database', 57 | 'words_collation', 'word', 58 | 'TV_show', 'director')] 59 | print("\t".join(vals).encode("utf-8")) 60 | 61 | except KeyError: 62 | raise 63 | 64 | print(len(words)) 65 | -------------------------------------------------------------------------------- /bookwormDB/bwExceptions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This is a stub exception to identify explicitly defined Bookworm Exception. 3 | 4 | The intended usage is to raise the exception with a dict that has an error 5 | message, and optionally a code that matches HTTP status codes. e.g. 
6 | 
7 | raise BookwormException({"message": "I'm a teapot", "code": 418})
8 | 
9 | or, more tidily, for longer messages:
10 | err = dict(message="I'm a teapot", code=418)
11 | raise BookwormException(err)
12 | 
13 | Code should be an int, not a string.
14 | '''
15 | 
16 | 
17 | class BookwormException(Exception):
18 |     pass
19 | 
-------------------------------------------------------------------------------- /bookwormDB/configuration.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | 
3 | from __future__ import print_function
4 | import configparser
5 | import os
6 | import sys
7 | import re
8 | import MySQLdb
9 | import argparse
10 | import getpass
11 | import subprocess
12 | import logging
13 | import uuid
14 | 
15 | def update():
16 |     ## Assemble list of all bookworms on the system.
17 | 
18 |     bookworms = [] ### ...
19 | 
20 |     ## Create on-disk versions of memory tables if 'fastcat_' does not exist.
21 | 
22 |     pass
23 | 
24 |     ## Allow "'bookworm'@'localhost' IDENTIFIED BY ''" to have select access on each bookworm.
25 | 
26 |     pass
27 | 
28 |     ## Print a message about enabling access.
29 | 
30 |     pass
31 | 
32 | 
33 | def create(ask_about_defaults=True, database=None):
34 |     """
35 |     Through interactive prompts at the command line, builds up a file at
36 |     bookworm.cnf that can be used to set preferences for the installation.
37 |     """
38 | 
39 |     if ask_about_defaults:
40 |         print("""
41 |         Welcome to Bookworm.
42 |         ~~~~~~~~~~~~~~~~~~~~
43 |         First off, let's build a configuration file. This will live
44 |         at bookworm.cnf in the current directory: if you mistype anything,
45 |         or want to change settings, edit it directly in that location.
46 | 
47 |         For each of the following entries, type the value you want, or hit
48 |         enter to accept the default:
49 | 
50 |         """)
51 |     else:
52 |         logging.info("Auto-generating config file.")
53 | 
54 |     """
55 |     First, we go to great efforts to find some sensible defaults.
56 |     Usually the user can just hit enter.
57 |     """
58 | 
59 |     systemConfigFile = configparser.ConfigParser(allow_no_value=True)
60 | 
61 |     defaults = dict()
62 |     # The default bookwormname is just the current location
63 | 
64 |     if database is None:
65 |         defaults['database'] = os.path.relpath(".", "..")
66 |     else:
67 |         defaults['database'] = database
68 | 
69 |     defaults["user"] = "bookworm"
70 |     defaults["password"] = ""
71 | 
72 |     config = configparser.ConfigParser()
73 | 
74 |     for section in ["client"]:
75 |         config.add_section(section)
76 | 
77 |     if ask_about_defaults:
78 |         database = input("What is the name of the bookworm [" + defaults['database'] + "]: ") or defaults['database']
79 |     else:
80 |         database = defaults['database']
81 | 
82 |     config.set("client", "database", re.sub(" ","_",database))
83 |     config.write(open("bookworm.cnf", "w"))
84 | 
85 | class Configfile(object):
86 |     def __init__(self, usertype, possible_locations=None, default=None, ask_about_defaults=True):
87 |         """
88 |         Initialize with the type of the user. The last encountered file on
89 |         the list is the one that will be used.
90 |         If default is set, a file will be created at that location if none
91 |         of the files in possible_locations exist.
92 | 
93 |         If ask_about_defaults is false, it will do a force installation.
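
        A hedged usage sketch (see default_locations_from_type below for
        the files each usertype reads):

            admin = Configfile("admin")
            print(admin.config.get("client", "user"))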
94 | """ 95 | 96 | if not usertype in ['read_only', 'admin']: 97 | raise NotImplementedError("Only read_only and admin supported") 98 | 99 | self.ask_about_defaults = ask_about_defaults 100 | 101 | logging.info("Creating configuration as " + usertype) 102 | 103 | self.usertype = usertype 104 | 105 | if possible_locations is None: 106 | possible_locations = self.default_locations_from_type(usertype) 107 | 108 | self.location = None 109 | 110 | self.config = configparser.ConfigParser(allow_no_value=True) 111 | 112 | if usertype=="admin": 113 | 114 | self.ensure_section("client") 115 | self.ensure_section("mysqld") 116 | 117 | self.config.set("client", "host", "localhost") 118 | self.config.set("client", "user", "root") 119 | self.config.set("client", "password", "") 120 | 121 | else: 122 | self.ensure_section("client") 123 | self.config.set("client", "host", "localhost") 124 | self.config.set("client", "user", "bookworm") 125 | self.config.set("client", "password", "") 126 | 127 | self.read_config_files(possible_locations) 128 | 129 | for string in possible_locations: 130 | if os.path.exists(string): 131 | self.location = string 132 | 133 | 134 | def read_config_files(self, used_files): 135 | 136 | try: 137 | self.config.read(used_files) 138 | except configparser.MissingSectionHeaderError: 139 | """ 140 | Some files throw this error if you have an empty 141 | my.cnf. This throws those out of the list, and tries again. 142 | """ 143 | for file in used_files: 144 | try: 145 | self.config.read(file) 146 | except configparser.MissingSectionHeaderError: 147 | used_files.remove(file) 148 | successes = self.config.read(used_files) 149 | 150 | 151 | 152 | def default_locations_from_type(self,usertype): 153 | """ 154 | The default locations for each usertype. 155 | Note that these are in ascending order of importance: 156 | so the preferred location for admin and read_only configuration 157 | is in /etc/bookworm/admin.cnf 158 | and /etc/bookworm/client.cnf 159 | """ 160 | 161 | if usertype=="admin": 162 | return [os.path.abspath(os.path.expanduser("~/.my.cnf")), 163 | os.path.abspath(os.path.expanduser("~/my.cnf")), 164 | "/etc/bookworm/admin.cnf"] 165 | if usertype == "read_only": 166 | return ["~/.bookworm-sql.cnf", "/etc/bookworm/client.cnf"] 167 | else: 168 | return [] 169 | 170 | def ensure_section(self,section): 171 | if not self.config.has_section(section): 172 | self.config.add_section(section) 173 | 174 | def set_bookworm_options(self): 175 | """ 176 | A number of specific MySQL changes to ensure fast queries on Bookworm. 177 | """ 178 | self.ensure_section("mysqld") 179 | 180 | mysqldoptions = {"### = =": "THIS FILE SHOULD GENERALLY BE PLACED AT /etc/mysql/my.cnf = = = ###", "max_allowed_packet":"512M","sort_buffer_size":"8M","read_buffer_size":"8M","read_rnd_buffer_size":"8M","bulk_insert_buffer_size":"512M","myisam_sort_buffer_size":"5512M","myisam_max_sort_file_size":"5500G","key_buffer_size":"2500M","query_cache_size":"32M","tmp_table_size":"1024M","max_heap_table_size":"2048M","character_set_server":"utf8","query_cache_type":"1","query_cache_limit":"8M"} 181 | 182 | for option in list(mysqldoptions.keys()): 183 | if not self.config.has_option("mysqld",option): 184 | self.config.set("mysqld", option, mysqldoptions[option]) 185 | else: 186 | if mysqldoptions[option] != self.config.get("mysqld",option): 187 | choice = input("Do you want to change the value for " + option + " from " + self.config.get("mysqld",option) + " to the bookworm-recommended " + mysqldoptions[option] + "? 
(y/N): ") 188 | if choice=="y": 189 | self.config.set("mysqld",option,mysqldoptions[option]) 190 | 191 | self.write_out() 192 | 193 | def write_out(self): 194 | """ 195 | Write out a new version of the configfile to stdout. 196 | The user is responsible for putting this somewhere it will 197 | affect the MySQL preferences 198 | """ 199 | self.config.write(sys.stdout) 200 | 201 | def recommend_my_cnf(known_loc = None): 202 | if known_loc is None: 203 | for loc in ["/usr/etc/my.cnf","/etc/mysql/my.cnf","/etc/my.cnf"]: 204 | if os.path.exists(loc): 205 | known_loc = loc 206 | if known_loc is None: 207 | raise FileNotFoundError("Could not find MySQL folder: pass one.") 208 | cnf = Configfile(usertype = 'admin', possible_locations = [known_loc]) 209 | cnf.set_bookworm_options() 210 | cnf.write_out() 211 | 212 | 213 | 214 | def apache(self = None): 215 | print(""" 216 | Instructions for Apache: 217 | 218 | 219 | First: Serve the Bookworm API over port 10012. (`bookworm serve`). 220 | 221 | Then: Install an Apache host on port 80. 222 | 223 | Then: enable proxy servers and turn off any existing cgi. 224 | 225 | # If you were previously using the CGI bookworm. 226 | `sudo a2dismod cgi` 227 | 228 | `sudo a2enmod proxy proxy_ajp proxy_http rewrite deflate headers proxy_balancer proxy_connect proxy_html` 229 | 230 | Then: Add the following to your '/etc/apache2/sites-available/000-default.conf' 231 | (or whatever site from which you run your apache. 232 | 233 | ~~~~~~~~~~~~~~~~ 234 | 235 | 236 | Order deny,allow 237 | Allow from all 238 | 239 | ProxyPreserveHost On 240 | 241 | ProxyPass "http://127.0.0.1:10012/" 242 | ProxyPassReverse "http://127.0.0.1:10012/" 243 | 244 | 245 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 246 | 247 | 248 | """) 249 | -------------------------------------------------------------------------------- /bookwormDB/convertTSVtoJSONarray.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def convertToJSON(filename, location): 4 | """ 5 | given a filename of a tsv, converts that into an ndjson 6 | file for Bookworm. 7 | """ 8 | input = open(filename) 9 | output = open(location, "w") 10 | headers = input.readline() 11 | headers = headers.rstrip("\n") 12 | headers = headers.rstrip("\r") 13 | headers = headers.rstrip("\n") 14 | headers = headers.rstrip("\r") 15 | headers = headers.split("\t") 16 | for line in input: 17 | line = line.rstrip("\n") 18 | line = line.rstrip("\r") 19 | line = line.rstrip("\n") 20 | line = line.rstrip("\r") 21 | values = line.split("\t") 22 | myobject = dict(list(zip(headers,values))) 23 | output.write(json.dumps(myobject) + "\n") 24 | output.close() 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /bookwormDB/countManager.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import bounter 4 | from collections import Counter 5 | from .tokenizer import Tokenizer, tokenBatches, PreTokenized 6 | from multiprocessing import Process, Queue, Pool 7 | from .multiprocessingHelp import mp_stats, running_processes 8 | import multiprocessing as mp 9 | import psutil 10 | import queue 11 | import logging 12 | import fileinput 13 | import time 14 | import csv 15 | 16 | cpus, memory = mp_stats() 17 | 18 | 19 | # Allocate half of available memory for the bounter, in megabytes. 
20 | memory = int(memory/1024/1024/2) 21 | 22 | # Use another third of the memory for storing worker counts; divided 23 | # by number of CPUS. 24 | # Assume 200 bytes per entry in python dict. 25 | 26 | QUEUE_POST_THRESH = int(memory / 3 * 1024 * 1024 / 200 / cpus) 27 | logging.debug("Ideal queue size is {}".format(QUEUE_POST_THRESH)) 28 | QUEUE_POST_THRESH = max([100000, QUEUE_POST_THRESH]) 29 | 30 | logging.info("Filling dicts to size {}".format(QUEUE_POST_THRESH)) 31 | 32 | import random 33 | import gzip 34 | 35 | def flush_counter(counter, qout): 36 | for k in ['', '\x00']: 37 | try: 38 | del counter[k] 39 | except KeyError: 40 | continue 41 | qout.put(counter) 42 | 43 | def counter(qout, i, fin, mode = "count"): 44 | """ 45 | # Counts words exactly in a separate process. 46 | # It runs in place. 47 | If mode is 'encode', this is called for a side-effect of writing 48 | files to disk. 49 | """ 50 | 51 | totals = 0 52 | errors = 0 53 | 54 | if mode == "count": 55 | counter = Counter() 56 | encoder = tokenBatches(['words']) 57 | 58 | if mode == "encode": 59 | encoder = tokenBatches(['unigrams', 'bigrams']) 60 | 61 | datatype = "raw" 62 | 63 | count_signals = [".unigrams", ".bigrams", ".trigrams", ".quadgrams"] 64 | for signal in count_signals: 65 | if signal in fin: 66 | datatype = signal.strip(".") 67 | if mode == "encode": 68 | encoder = tokenBatches([datatype]) 69 | 70 | if (fin.endswith(".gz")): 71 | fin = gzip.open(fin, 'rt') 72 | else: 73 | fin = open(fin) 74 | 75 | 76 | for ii, row in enumerate(fin): 77 | if ii % cpus != i: 78 | # Don't do anything on most lines. 79 | continue 80 | totals += 1 81 | try: 82 | (filename, text) = row.rstrip().split("\t",1) 83 | except ValueError: 84 | errors += 1 85 | continue 86 | 87 | if datatype == "raw": 88 | tokenizer = Tokenizer(text) 89 | else: 90 | tokenizer = PreTokenized(text, encoder.levels[0]) 91 | 92 | # When encoding 93 | if mode == "encode": 94 | encoder.encodeRow(filename, tokenizer, write_completed = True) 95 | continue 96 | 97 | # When building counts 98 | counter.update(tokenizer.counts("words")) 99 | 100 | # When the counter is long, post it to the master and clear it. 101 | if len(counter) > QUEUE_POST_THRESH: 102 | flush_counter(counter=counter, qout = qout) 103 | counter = Counter() 104 | 105 | # Cleanup. 
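    # (In "count" mode a partially-filled Counter still has to reach the
    # parent via the queue; in "encode" mode the encoder flushes its own buffers.)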
106 | if mode == "count": 107 | logging.debug("Flushing leftover counts from thread {}".format(i)) 108 | flush_counter(counter=counter, qout = qout) 109 | if totals > 0 and errors/totals > 0.01: 110 | logging.warning("Skipped {} rows without tabs".format(errors)) 111 | if mode == "encode": 112 | encoder.close() 113 | 114 | def create_counts(input): 115 | qout = Queue(cpus * 2) 116 | workers = [] 117 | logging.info("Spawning {} count processes on {}".format(cpus, input)) 118 | for i in range(cpus): 119 | p = Process(target = counter, args = (qout, i, input, "count")) 120 | p.start() 121 | workers.append(p) 122 | 123 | wordcounter = bounter.bounter(memory) 124 | 125 | while True: 126 | 127 | try: 128 | input_dict = qout.get_nowait() 129 | logging.debug("inputting queue of length {} from worker".format(len(input_dict))) 130 | wordcounter.update(input_dict) 131 | 132 | except queue.Empty: 133 | if running_processes(workers): 134 | time.sleep(1/100) 135 | else: 136 | break 137 | except ValueError: 138 | for k, v in input_dict.items(): 139 | print("'{}'\t'{}'".format(k, v)) 140 | wordcounter.update({k: v}) 141 | raise 142 | except TypeError: 143 | for k, v in input_dict.items(): 144 | print("'{}'\t'{}'".format(k, v)) 145 | wordcounter.update({k: v}) 146 | raise 147 | 148 | return wordcounter 149 | 150 | def create_wordlist(n, input, output): 151 | 152 | counter = create_counts(input) 153 | counter = sorted(list(counter.iteritems()), key = lambda x: -1 * x[1]) 154 | output = open(output, "w") 155 | for i, (k, v) in enumerate(counter): 156 | output.write("{}\t{}\t{}\n".format(i, k, v)) 157 | if i >= n: 158 | break 159 | output.close() 160 | def encode_words(wordlist, input = "input.txt"): 161 | qout = Queue(cpus * 2) 162 | workers = [] 163 | 164 | for i in range(cpus): 165 | p = Process(target = counter, args = (qout, i, input, "encode")) 166 | p.start() 167 | workers.append(p) 168 | 169 | while running_processes(workers): 170 | time.sleep(1/30) 171 | -------------------------------------------------------------------------------- /bookwormDB/general_API.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from pandas import merge 4 | from pandas import Series 5 | from pandas.io.sql import read_sql 6 | from pandas import concat 7 | from pandas import set_option 8 | from copy import deepcopy 9 | from collections import defaultdict 10 | from .SQLAPI import DbConnect 11 | from .SQLAPI import userquery 12 | from .mariaDB import Query 13 | from .bwExceptions import BookwormException 14 | import re 15 | import json 16 | import logging 17 | import numpy as np 18 | import csv 19 | import io 20 | 21 | 22 | """ 23 | The general API provides some functions for working with pandas to calculate 24 | bag-of-words summary statistics according to the API description. 25 | 26 | It is not bound to any particular backend: instead, a subset of 27 | methods in the API must be supported by subclassing APICall(). 28 | 29 | The only existing example of this is "SQLAPICall." 30 | """ 31 | 32 | # Some settings can be overridden here, if nowhere else. 33 | 34 | prefs = dict() 35 | 36 | def PMI(df, location, groups): 37 | """ 38 | A simple PMI calculation. Arguments: 39 | 40 | 'location': The field to calculate expected values for. 41 | 'groups': The metadata to sum up over.
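    As a sketch: with location "WordCount_x" and groups ["year"], each
    row's expected value is the grand total multiplied by P(year), and
    the return value is log(observed / expected), so zero means a group
    holds exactly its expected share.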
42 | 43 | """ 44 | copy = df.copy() 45 | total = df[[location]].sum() 46 | copy['expected'] = total[0] 47 | for i in range(len(groups)): 48 | new_name = groups[i] + "__r" 49 | renamer = dict() 50 | renamer[location] = new_name 51 | etc = (df[[groups[i], location]].groupby(groups[i]).sum()/total).rename(renamer, axis="columns") 52 | copy = merge(copy, etc, left_on = groups[i], right_index = True) 53 | copy["expected"] = copy["expected"] * copy[new_name] 54 | return np.log(copy[location]/copy["expected"]) 55 | 56 | def rle(input): 57 | """ 58 | Format a list as run-length encoding JSON. 59 | """ 60 | output = [input[0]] 61 | for item in input[1:]: 62 | if isinstance(output[-1], list) and output[-1][1] == item: 63 | output[-1][0] += 1 64 | elif output[-1] == item: 65 | output[-1] = [2, item] 66 | else: 67 | output.append(item) 68 | return output 69 | 70 | def DunningLog(df, a, b): 71 | from numpy import log as log 72 | destination = "Dunning" 73 | df[a] = df[a].replace(0, 0.5) 74 | df[b] = df[b].replace(0, 0.5) 75 | if a == "WordCount_x": 76 | # Dunning comparisons should be to the sums if counting: 77 | c = sum(df[a]) 78 | d = sum(df[b]) 79 | elif a == "TextCount_x": 80 | # The max count isn't necessarily the total number of books, 81 | # but it's a decent proxy. 82 | c = max(df[a]) 83 | d = max(df[b]) 84 | expectedRate = (df[a] + df[b]).divide(c+d) 85 | E1 = c*expectedRate 86 | E2 = d*expectedRate 87 | diff1 = log(df[a].divide(E1)) 88 | diff2 = log(df[b].divide(E2)) 89 | df[destination] = 2*(df[a].multiply(diff1) + df[b].multiply(diff2)) 90 | # A hack, but a useful one: encode the direction of the significance 91 | # in the sign, so negative values mark rows where the first count is underrepresented. 92 | difference = diff1 < diff2 93 | df.loc[difference, destination] = -1*df.loc[difference, destination] 94 | return df[destination] 95 | 96 | class Aggregator(object): 97 | """ 98 | We only collect "WordCount" and "TextCount" for each query, 99 | but there are a multitude of things you can do with those: 100 | basic things like frequency, all the way up to TF-IDF. 101 | 102 | """ 103 | def __init__(self, df, groups = None): 104 | self.df = df 105 | self.groups = groups 106 | 107 | def _aggregate(self, parameters): 108 | "Run the aggregation. Prefixed with an underscore so it doesn't show up in the dict."
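        # E.g. parameters = ["WordsPerMillion"] dispatches to self.WordsPerMillion(),
        # which adds a "WordsPerMillion" column derived from the merged _x/_y count columns.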
109 | 110 | parameters = set(map(str, parameters)) 111 | for parameter in parameters: 112 | getattr(self, parameter)() 113 | return self.df 114 | 115 | def WordCount(self): 116 | self.df["WordCount"] = self.df["WordCount_x"] 117 | 118 | def TextCount(self): 119 | self.df["TextCount"] = self.df["TextCount_x"] 120 | 121 | def WordsPerMillion(self): 122 | self.df["WordsPerMillion"] = (self.df["WordCount_x"].multiply(1000000)/ 123 | self.df["WordCount_y"]) 124 | def TotalWords(self): 125 | self.df["TotalWords"] = self.df["WordCount_y"] 126 | 127 | def SumWords(self): 128 | self.df["SumWords"] = self.df["WordCount_y"] + self.df["WordCount_x"] 129 | 130 | def WordsRatio(self): 131 | self.df["WordsRatio"] = self.df["WordCount_x"]/self.df["WordCount_y"] 132 | 133 | def TextPercent(self): 134 | self.df["TextPercent"] = 100*self.df["TextCount_x"].divide(self.df["TextCount_y"]) 135 | 136 | def TextRatio(self): 137 | self.df["TextRatio"] = self.df["TextCount_x"]/self.df["TextCount_y"] 138 | 139 | def TotalTexts(self): 140 | self.df["TotalTexts"] = self.df["TextCount_y"] 141 | 142 | def SumTexts(self): 143 | self.df["SumTexts"] = self.df["TextCount_y"] + self.df["TextCount_x"] 144 | 145 | def HitsPerText(self): 146 | self.df["HitsPerText"] = self.df["WordCount_x"]/self.df["TextCount_x"] 147 | 148 | def TextLength(self): 149 | self.df["TextLength"] = self.df["WordCount_y"]/self.df["TextCount_y"] 150 | 151 | def PMI_words(self): 152 | self.df["PMI_words"] = PMI(self.df, "WordCount_x", self.groups) 153 | 154 | def PMI_texts(self): 155 | self.df["PMI_texts"] = PMI(self.df, "TextCount_x", self.groups) 156 | 157 | def TFIDF(self): 158 | 159 | self.df["TF"] = self.df["WordCount_x"]/self.df["WordCount_y"] 160 | self.df["TFIDF"] = self.df["TF"] * np.log(self.df["TextCount_y"]/self.df['TextCount_x']) 161 | 162 | def Dunning(self): 163 | self.df["Dunning"] = DunningLog(self.df, "WordCount_x", "WordCount_y") 164 | 165 | 166 | def DunningTexts(self): 167 | self.df["DunningTexts"] = DunningLog(self.df, "TextCount_x", "TextCount_y") 168 | 169 | def rename(df, newkey): 170 | 171 | # Add "_x"- and "_y"-style suffixes to the count columns even when not explicitly needed. 172 | 173 | renamer = {} 174 | for k in ["WordCount", "TextCount"]: 175 | renamer[k] = k + "_" + newkey 176 | df.rename(index=str, columns=renamer, inplace = True) 177 | 178 | 179 | def intersectingNames(p1, p2, full=False): 180 | """ 181 | The list of intersecting column names between two DataFrame objects. 182 | 183 | 'full' lets you specify that you want to include the count values: 184 | Otherwise, they're kept separate for convenience in merges. 185 | """ 186 | exclude = set(['WordCount', 'TextCount']) 187 | names1 = set([column for column in p1.columns if column not in exclude]) 188 | names2 = [column for column in p2.columns if column not in exclude] 189 | if full: 190 | return list(names1.union(names2)) 191 | return list(names1.intersection(names2)) 192 | 193 | 194 | def need_comparison_query(count_types): 195 | """ 196 | Do we need a comparison query, i.e., any count types beyond the raw counts? 197 | """ 198 | needing_fields = [c for c in count_types if c not in ["WordCount","TextCount"]] 199 | return len(needing_fields) != 0 200 | 201 | def base_count_types(list_of_final_count_types): 202 | """ 203 | the final count types are calculated from some base types across both 204 | the local query and the superquery. 205 | 206 | These are not well optimized--I should go through and cut out unneeded ones for the more obscure count types.
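    For example, ["WordsPerMillion"] requires the raw "WordCount" from both
    the subquery and the superquery, so this function returns
    [["WordCount"], ["WordCount"]].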
207 | 208 | """ 209 | 210 | subq = set() 211 | superq = set() 212 | 213 | for count_name in list_of_final_count_types: 214 | if count_name in ["WordCount", "WordsPerMillion", "WordsRatio", 215 | "TotalWords", "SumWords", "Dunning", "PMI_words", "TextLength", "HitsPerText", "TFIDF"]: 216 | subq.add("WordCount") 217 | superq.add("WordCount") 218 | if count_name in ["TextCount", "TextPercent", "TextRatio", 219 | "TotalTexts", "SumTexts", "DunningTexts", "PMI_texts", 220 | "TextLength", "HitsPerText", "TFIDF"]: 221 | subq.add("TextCount") 222 | superq.add("TextCount") 223 | 224 | return [list(subq), list(superq)] 225 | 226 | 227 | def is_a_wordcount_field(string): 228 | if string in ["unigram", "bigram", "word"]: 229 | return True 230 | return False 231 | 232 | 233 | class APIcall(object): 234 | """ 235 | This is the base class from which more specific classes for actual 236 | methods can be dispatched. 237 | 238 | Without a "generate_pandas_frame" method, it won't run. 239 | """ 240 | def __init__(self, APIcall): 241 | 242 | """ 243 | Initialized with a dictionary un-JSONed from the API definition. 244 | """ 245 | 246 | self.query = APIcall 247 | self.idiot_proof_arrays() 248 | self.set_defaults() 249 | 250 | def set_defaults(self): 251 | query = self.query 252 | if "search_limits" not in query: 253 | self.query["search_limits"] = dict() 254 | if "unigram" in query["search_limits"]: 255 | # Hack: change somehow. You can't group on "word", just on 256 | # "unigram" 257 | query["search_limits"]["word"] = query["search_limits"]["unigram"] 258 | del query["search_limits"]["unigram"] 259 | 260 | def idiot_proof_arrays(self): 261 | for element in ['counttype', 'groups']: 262 | try: 263 | if not isinstance(self.query[element], list): 264 | self.query[element] = [self.query[element]] 265 | except KeyError: 266 | # It's OK if it's not there. 267 | pass 268 | 269 | def get_compare_limits(self): 270 | """ 271 | The compare limits are chosen in this order of preference: 272 | first, the "compare_limits" field, if explicitly specified; 273 | if not that, then the search limits with every asterisked term dropped; 274 | if not that, then the search limits without the word terms; 275 | if not that, then exactly the same as the search limits. 276 | """ 277 | 278 | if "compare_limits" in self.query: 279 | return self.query['compare_limits'] 280 | 281 | search_limits = self.query['search_limits'] 282 | compare_limits = deepcopy(search_limits) 283 | 284 | asterisked = False 285 | for limit in list(search_limits.keys()): 286 | if re.search(r'^\*', limit): 287 | search_limits[limit.replace('*', '')] = search_limits[limit] 288 | del search_limits[limit] 289 | del compare_limits[limit] 290 | asterisked = True 291 | 292 | if asterisked: 293 | return compare_limits 294 | 295 | # Next, try deleting the word term. 296 | 297 | for word_term in list(search_limits.keys()): 298 | if word_term in ['word', 'unigram', 'bigram']: 299 | del compare_limits[word_term] 300 | 301 | # Finally, whether it's deleted a word term or not, return it all.
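        # E.g. search_limits of {"word": ["test"], "year": [1900]} yields
        # compare_limits of {"year": [1900]}: the word constraint is dropped.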
302 | return compare_limits 303 | 304 | def data(self): 305 | if hasattr(self, "pandas_frame"): 306 | return self.pandas_frame 307 | else: 308 | self.pandas_frame = self.get_data_from_source() 309 | return self.pandas_frame 310 | 311 | def validate_query(self): 312 | self.ensure_query_has_required_fields() 313 | 314 | def ensure_query_has_required_fields(self): 315 | 316 | required_fields = ['counttype', 'groups', 'database'] 317 | if self.query['method'] in ['schema', 'search']: 318 | required_fields = ['database'] 319 | 320 | for field in required_fields: 321 | if field not in self.query: 322 | logging.error("Missing field: %s" % field) 323 | err = dict(message="Bad query. Missing \"%s\" field" % field, 324 | code=400) 325 | raise BookwormException(err) 326 | 327 | def prepare_search_and_compare_queries(self): 328 | 329 | 330 | 331 | call1 = deepcopy(self.query) 332 | call2 = deepcopy(call1) 333 | call2['search_limits'] = self.get_compare_limits() 334 | 335 | # The individual calls need only the base counts: not "Percentage of 336 | # Words," but just "WordCount" twice, and so forth 337 | 338 | call1['counttype'], call2['counttype'] = base_count_types(self.query['counttype']) 339 | 340 | # Drop out asterisks for that syntactic sugar. 341 | for limit in list(call1['search_limits'].keys()): 342 | if re.search(r'^\*', limit): 343 | call1['search_limits'][limit.replace('*', '')] = \ 344 | call1['search_limits'][limit] 345 | del call1['search_limits'][limit] 346 | 347 | for n, group in enumerate(self.query['groups']): 348 | if re.search(r'^\*', group): 349 | replacement = group.replace("*", "") 350 | call1['groups'][n] = replacement 351 | self.query['groups'][n] = replacement 352 | call2['groups'].remove(group) 353 | 354 | self.call1 = call1 355 | self.call2 = call2 356 | 357 | 358 | def get_data_from_source(self): 359 | """ 360 | Retrieves data from the backend, and calculates totals. 361 | 362 | Note that this method could be easily adapted to run on top of a Solr 363 | instance or something else, just by changing the bits in the middle 364 | where it handles storage_format. 365 | """ 366 | 367 | self.validate_query() 368 | 369 | if self.query['method'] in ['schema', 'search']: 370 | return self.generate_pandas_frame() 371 | 372 | self.prepare_search_and_compare_queries() 373 | 374 | """ 375 | This could use any method other than pandas_SQL: 376 | You'd just need to redefine "generate_pandas_frame" 377 | """ 378 | 379 | if not need_comparison_query(self.query['counttype']): 380 | df1 = self.generate_pandas_frame(self.call1) 381 | # rename(df1, "x") 382 | return df1[self.query['groups'] + self.query['counttype']] 383 | 384 | try: 385 | df1 = self.generate_pandas_frame(self.call1) 386 | rename(df1, "x") 387 | logging.debug(self.call2) 388 | df2 = self.generate_pandas_frame(self.call2) 389 | rename(df2, "y") 390 | 391 | except Exception as error: 392 | logging.exception("Database error") 393 | # One common error is putting in an inappropriate column 394 | try: 395 | column_search = re.search("Unknown column '(.+)' in 'field list'",str(error)).groups() 396 | if len(column_search) > 0: 397 | return Series({"status": "error", "message": "No field in database entry matching desired key `{}`".format(column_search[0])}) 398 | else: 399 | return Series({"status": "error", "message": "Database error. " 400 | "Try checking field names.","code":str(error)}) 401 | 402 | except: 403 | return Series({"status": "error", "message": "Unknown error. 
", 404 | "code":str(error)}) 405 | 406 | intersections = intersectingNames(df1, df2) 407 | 408 | """ 409 | Would this merge be faster with indexes? 410 | """ 411 | 412 | if len(intersections) > 0: 413 | merged = merge(df1, df2, on=intersections, how='outer') 414 | else: 415 | merged = df1.join(df2, lsuffix='_x', rsuffix='_y') 416 | 417 | merged = merged.fillna(int(0)) 418 | 419 | calculations = self.query['counttype'] 420 | gator = Aggregator(merged, self.query['groups']) 421 | calcced = gator._aggregate(calculations) 422 | # calcced = calculateAggregates(merged, calculations, self.query['groups']) 423 | 424 | calcced = calcced.fillna(int(0)) 425 | 426 | final_DataFrame = (calcced[self.query['groups'] + 427 | self.query['counttype']]) 428 | 429 | return final_DataFrame 430 | 431 | def execute(self): 432 | 433 | method = self.query['method'] 434 | logging.debug("Preparing to execute with method '{}'".format(method)) 435 | fmt = self.query['format'] if 'format' in self.query else False 436 | 437 | if method == 'data' or method == 'schema' or method == 'search': 438 | version = 2 439 | if fmt in ['json_c', 'search', 'html', 'csv', 'tsv']: 440 | version = 3 441 | else: 442 | version = 1 443 | 444 | if version == 1: 445 | # What to do with multiple search_limits 446 | if isinstance(self.query['search_limits'], list): 447 | if method in ["json", "return_json"]: 448 | self.query['method'] = 'data' 449 | self.query['format'] = 'json' 450 | return self.multi_execute(version=version) 451 | else: 452 | # Only return first search limit if not return in json 453 | self.query['search_limits'] = self.query['search_limits'][0] 454 | 455 | form = method[7:] if method[:6] == 'return' else method 456 | 457 | logging.warning("method == \"%s\" is deprecated. Use method=\"data\" " 458 | "with format=\"%s\" instead." 
% (method, form)) 459 | 460 | if method == "return_json" or method == "json": 461 | self.query['method'] = 'data' 462 | self.query['format'] = 'json' 463 | return self.return_json(version=1) 464 | 465 | elif method == "return_csv" or method == "csv": 466 | self.query['method'] = 'data' 467 | self.query['format'] = 'json' 468 | frame = self.data() 469 | return frame.to_csv(path_or_buf = None, sep="\t", encoding="utf8", index=False, 470 | quoting=csv.QUOTE_NONE, escapechar="\\") 471 | elif version >= 2: 472 | try: 473 | # What to do with multiple search_limits 474 | 475 | if isinstance(self.query['search_limits'], list): 476 | if fmt == "json" or version >= 3: 477 | frame = self.multi_execute(version = version) 478 | else: 479 | # Only return first search limit if not return in json 480 | self.query['search_limits'] = self.query['search_limits'][0] 481 | else: 482 | frame = self.data() 483 | 484 | if fmt == "json": 485 | return self.return_json(version=2) 486 | 487 | if fmt == "csv": 488 | return frame.to_csv(encoding="utf8", index=False) 489 | 490 | if fmt == "tsv": 491 | return frame.to_csv(sep="\t", encoding="utf8", index=False) 492 | 493 | if fmt == "feather": 494 | fout = io.BytesIO(b'') 495 | try: 496 | frame.to_feather(fout) 497 | except: 498 | logging.warning("You need the pyarrow package installed to export as feather.") 499 | raise 500 | fout.seek(0) 501 | return fout.read() 502 | 503 | if fmt == 'json_c': 504 | return self.return_rle_json(frame) 505 | 506 | if fmt == 'html': 507 | return self.html(frame) 508 | 509 | else: 510 | err = dict(status="error", code=200, 511 | message="Only formats in ['csv', 'tsv', 'json', 'feather', 'json_c', 'html']" 512 | " currently supported") 513 | return json.dumps(err) 514 | except BookwormException as e: 515 | # Error status codes are HTTP codes 516 | # http://www.restapitutorial.com/httpstatuscodes.html 517 | err = e.args[0] 518 | err['status'] = "error" 519 | return json.dumps(err) 520 | except Exception as ex: 521 | # General Uncaught error. 522 | logging.exception("{}".format(ex)) 523 | logging.exception("Database error") 524 | return json.dumps({"status": "error", "message": "Database error. " 525 | "Try checking field names."}) 526 | 527 | # Temporary catch-all pushes to the old methods: 528 | if method in ["returnPossibleFields", "search_results", 529 | "return_books", "schema"]: 530 | try: 531 | query = userquery(self.query) 532 | if method == "return_books": 533 | return query.execute() 534 | return json.dumps(query.execute()) 535 | except Exception as e: 536 | if len(e.args) > 1 and str(e.args[1]).startswith("Unknown database"): 537 | return "No such bookworm {}".format(str(e.args[1]).replace("Unknown database","")) 538 | except: 539 | return "General error" 540 | 541 | def multi_execute(self, version=1): 542 | 543 | """ 544 | Queries may define several search limits in an array 545 | if they use the return_json method.
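        A sketch of such a query (the year values are illustrative):

            {"search_limits": [{"year": 1900}, {"year": 1910}], ...}

        Under version 3, the per-limit result frames are stacked into one
        frame, and a "Search" column records the index (0, 1, ...) of the
        limit that produced each row.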
546 | """ 547 | 548 | if version <= 2: 549 | returnable = [] 550 | for limits in self.query['search_limits']: 551 | child = deepcopy(self.query) 552 | child['search_limits'] = limits 553 | q = self.__class__(child).return_json(raw_python_object=True, 554 | version=version) 555 | returnable.append(q) 556 | return self._prepare_response(returnable, version) 557 | 558 | if version == 3: 559 | for i, limits in enumerate(self.query['search_limits']): 560 | child = deepcopy(self.query) 561 | child['search_limits'] = limits 562 | f = self.__class__(child).data() 563 | f['Search'] = i 564 | if i == 0: 565 | frame = f 566 | else: 567 | frame = frame.append(f, ignore_index = True) 568 | return frame 569 | 570 | 571 | def html(self, data): 572 | """ 573 | Return data in column-oriented format with run-length encoding 574 | on duplicate values. 575 | """ 576 | 577 | if isinstance(data, Series) and 'status' in data: 578 | # If data has a status, Bookworm is trying to send us an error 579 | return data.to_json() 580 | 581 | set_option('display.max_colwidth', -1) 582 | return data.to_html(escape = False, index = False) 583 | 584 | 585 | def return_rle_json(self, data): 586 | """ 587 | Return data in column-oriented format with run-length encoding 588 | on duplicate values. 589 | """ 590 | 591 | if isinstance(data, Series) and 'status' in data: 592 | # If data has a status, Bookworm is trying to send us an error 593 | return data.to_json() 594 | 595 | output = {'status':'success', 'data':{}} 596 | 597 | for k in data: 598 | series = data[k] 599 | output['data'][k] = rle(data[k].tolist()) 600 | 601 | return json.dumps(output) 602 | 603 | 604 | def return_json(self, raw_python_object=False, version=1): 605 | ''' 606 | Get JSON data for a single search_limit. 607 | 608 | version: 1 returns just the data, using method = return_json. 609 | 2 formats the response according to the JSend spec. 610 | ''' 611 | query = self.query 612 | data = self.data() 613 | 614 | if isinstance(data, Series) and 'status' in data: 615 | # If data has a status, Bookworm is trying to send us an error 616 | return data.to_json() 617 | 618 | def fixNumpyType(input): 619 | # This is, weirdly, an occasional problem but not a constant one. 620 | if type(input) is np.int64: 621 | return int(input) 622 | else: 623 | return input 624 | 625 | # Define a recursive structure to hold the stuff. 626 | def tree(): 627 | return defaultdict(tree) 628 | returnt = tree() 629 | 630 | for row in data.itertuples(index=False): 631 | row = list(row) 632 | destination = returnt 633 | if len(row) == len(query['counttype']): 634 | returnt = [fixNumpyType(num) for num in row] 635 | while len(row) > len(query['counttype']): 636 | key = row.pop(0) 637 | if len(row) == len(query['counttype']): 638 | # Assign the elements. 639 | try: 640 | row = [ 641 | r if np.isfinite(row) 642 | else None 643 | for r in row 644 | ] 645 | except: 646 | logging.warning(row) 647 | pass 648 | destination[key] = row 649 | break 650 | # This bit of the loop is where we descend the recursive 651 | # dictionary. 
652 | destination = destination[key] 653 | if raw_python_object: 654 | return returnt 655 | else: 656 | return self._prepare_response(returnt, version) 657 | 658 | def _prepare_response(self, data, version=1): 659 | if version == 1: 660 | resp = data 661 | elif version == 2: 662 | resp = dict(status="success", data=data) 663 | else: 664 | resp = dict(status="error", 665 | data="Internal error: unknown response version") 666 | 667 | 668 | return json.dumps(resp) 669 | 670 | 671 | 672 | 673 | class oldSQLAPIcall(APIcall): 674 | """ 675 | To make a new backend for the API, you just need to extend the base API 676 | call class like this. 677 | 678 | This one is comically short because all the real work is done in the 679 | userquery object. 680 | 681 | But the point is, you need to define a function "generate_pandas_frame" 682 | that accepts an API call and returns a pandas frame. 683 | 684 | But that API call is more limited than the general API; you only need to 685 | support "WordCount" and "TextCount" methods. 686 | """ 687 | 688 | def generate_pandas_frame(self, call = None): 689 | """ 690 | 691 | This is a good example of the query that actually fetches the results. 692 | It creates some SQL, runs it, and returns it as a pandas DataFrame. 693 | 694 | The actual SQL production is handled by the userquery class, which uses 695 | more legacy code. 696 | 697 | """ 698 | 699 | if call is None: 700 | call = self.query 701 | 702 | con = DbConnect(prefs, self.query['database']) 703 | q = userquery(call).query() 704 | df = read_sql(q, con.db) 705 | return df 706 | 707 | class SQLAPIcall(APIcall): 708 | """ 709 | To make a new backend for the API, you just need to extend the base API 710 | call class like this. 711 | 712 | This one is comically short because all the real work is done in the 713 | userquery object. 714 | 715 | But the point is, you need to define a function "generate_pandas_frame" 716 | that accepts an API call and returns a pandas frame. 717 | 718 | But that API call is more limited than the general API; you only need to 719 | support "WordCount" and "TextCount" methods. 720 | """ 721 | 722 | def generate_pandas_frame(self, call = None): 723 | """ 724 | 725 | This is a good example of the query that actually fetches the results. 726 | It creates some SQL, runs it, and returns it as a pandas DataFrame. 727 | 728 | The actual SQL production is handled by the userquery class, which uses 729 | more legacy code.
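        A hedged usage sketch (the database name is only an example
        borrowed from the schema documentation):

            call = {"database": "federalist_bookworm", "method": "data",
                    "search_limits": {}, "groups": [], "counttype": ["WordCount"]}
            df = SQLAPIcall(call).generate_pandas_frame()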
730 | 731 | """ 732 | 733 | if call is None: 734 | call = self.query 735 | con = DbConnect(prefs, self.query['database']) 736 | q = Query(call).query() 737 | logging.debug("Preparing to execute {}".format(q)) 738 | df = read_sql(q, con.db) 739 | logging.debug("Query retrieved") 740 | return df 741 | 742 | -------------------------------------------------------------------------------- /bookwormDB/json_schema.py: -------------------------------------------------------------------------------- 1 | 2 | from .schema_primitives import * 3 | 4 | base_schema = { 5 | "definitions": { 6 | 7 | }, 8 | "type": "object", 9 | "title": "Bookworm Query Schema", 10 | "required": [ 11 | "database", 12 | "method", 13 | "format", 14 | "search_limits", 15 | "groups", 16 | "counttype" 17 | ], 18 | "properties": { 19 | "method": method_schema, 20 | "format": format_schema, 21 | "database": { 22 | "type": "string", 23 | "title": "The Database Schema", 24 | "description": "The name of the database to search in.", 25 | "examples": [ 26 | "federalist_bookworm", 27 | "hathipd" 28 | ], 29 | "pattern": "^([^ ]+)$" 30 | }, 31 | "search_limits": { 32 | "$id": "#/properties/search_limits", 33 | "type": "object", 34 | "description": "A set of constraints to create a corpus. If an array, each will be treated as a grouping field for results and a new key, 'Search,' will be returned." 35 | }, 36 | "compare_limits": { 37 | "$id": "#/properties/compare_limits", 38 | "type": "object", 39 | "description": "The definition of a full corpus against which to run comparisons. In general, this will be automatically inferred from the search_limits field by dropping the 'word' limit.", 40 | }, 41 | "groups": { 42 | "$id": "#/properties/groups", 43 | "type": "array", 44 | "items": { 45 | "$id": "#/properties/groups/items", 46 | "type": "string", 47 | "default": "", 48 | "examples": [ 49 | "author", 50 | "date_day_year" 51 | ], 52 | "pattern": "^(.*)$" 53 | } 54 | }, 55 | "counttype": counts_schema 56 | } 57 | } 58 | 59 | class DataQuerySchema(dict): 60 | """ 61 | A JSON schema for valid queries. 62 | """ 63 | def __init__(self, con): 64 | dict.__init__(self, base_schema) 65 | self.set_base_elements() 66 | 67 | def set_base_elements(self): 68 | pass 69 | 70 | def validate(self, query): 71 | pass 72 | -------------------------------------------------------------------------------- /bookwormDB/manager.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import re 3 | from subprocess import call 4 | from subprocess import Popen 5 | import logging 6 | import sys 7 | import os 8 | import bookwormDB 9 | import argparse 10 | 11 | """ 12 | This is the code that actually gets run from the command-line executable. 13 | 14 | The BookwormManager class defines some methods for controlling bookworm SQL instances 15 | and running upkeep operations; 16 | the run_arguments function pulls commands from the command line. Any useful new bookworm methods 17 | should be passed through run_arguments to work. 18 | 19 | 20 | Some modules, especially bookworm-specific ones, 21 | are imported inline in the code here--that substantially 22 | (as in, 1 second to 0.2 seconds) reduces startup time 23 | for the command-line executable, 24 | even though it's not best practice otherwise. 
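A typical session, then, chains the subcommands defined below, e.g.:

    bookworm --log-level info init
    bookworm build all

(Both subcommands are wired up in run_arguments at the bottom of this file.)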
25 | """ 26 | 27 | class BookwormManager(object): 28 | """ 29 | This class is passed some options that tell it the name of the bookworm it's working on; 30 | some of the methods here are the directly callable as the command line arguments. 31 | section'client' 32 | This is what calls the various other bookworm scripts, whether Python or not. 33 | """ 34 | 35 | def __init__(self, cnf_file=None, database=None): 36 | 37 | # This will likely be changed if it isn't None. 38 | import configparser 39 | 40 | self.basedir = None 41 | self.dbname = None 42 | for i in range(10): 43 | basedir = "../"*i 44 | if os.path.exists(basedir + ".bookworm"): 45 | self.basedir = basedir 46 | break 47 | if self.basedir==None: 48 | logging.debug("No bookworm directory found; hopefully this isn't a build call.") 49 | 50 | if cnf_file is not None: 51 | config = configparser.ConfigParser(allow_no_value=True) 52 | config.read([cnf_file]) 53 | if config.has_section("client"): 54 | """ 55 | Silently go along if the config doesn't exist. 56 | """ 57 | try: 58 | self.dbname = config.get("client", "database") 59 | except configParser.NoOptionError: 60 | pass 61 | 62 | # More specific options override the config file 63 | if database is not None: 64 | # Passed in dbname takes precedence over config file. 65 | self.dbname = database 66 | 67 | def config(self,args): 68 | """ 69 | Performs useful configuration tasks, such as setting up a MySQL installation. 70 | """ 71 | if args.target=="mysql": 72 | import bookwormDB.configuration 73 | bookwormDB.configuration.recommend_my_cnf() 74 | if args.target=="mysql-info": 75 | from bookwormDB.configuration import Configfile 76 | config = Configfile("admin") 77 | print("The admin configuration login currently being used should be the following.\n") 78 | config.write_out() 79 | if args.target=="apache": 80 | from bookwormDB.configuration import apache 81 | apache() 82 | 83 | def ftokenize(self, args): 84 | 85 | import bookwormDB.tokenizer 86 | 87 | """ 88 | Handle functions related to tokenization and encoding. 89 | 90 | Should eventually be able to accept arguments like "token-regex" 91 | and already-tokenized documents. 92 | """ 93 | 94 | if args.process == "encode": 95 | self.encoded(args) 96 | 97 | if args.process == "text_stream" or args.process == "token_stream": 98 | raise NotImplementedError("This feature has been removed") 99 | 100 | if args.process == "word_db": 101 | self.wordlist(args) 102 | 103 | def init(self, args): 104 | """ 105 | Initialize the current directory as a bookworm directory. 106 | """ 107 | # Create a configuration file 108 | if not args.force: 109 | if os.path.exists(".bookworm"): 110 | logging.error(""" 111 | You already have a folder named '.bookworm'. 112 | Probably you've already initialized a Bookworm here. 
113 | """) 114 | return 115 | if not os.path.exists("bookworm.cnf"): 116 | fout = open("bookworm.cnf", "w") 117 | if self.dbname: 118 | loc = self.dbname 119 | else: 120 | loc = os.path.relpath(".", "..") 121 | print("Configuring Bookworm named '{}'".format(loc)) 122 | print("Change the file at bookworm.cnf if this is undesirable".format(loc)) 123 | fout.write("[client]\ndatabase = {}\n".format(loc)) 124 | else: 125 | fout = open("bookworm.cnf", "w") 126 | loc = os.path.relpath(".", "..") 127 | print("Configuring Bookworm named '{}'".format(loc)) 128 | print("Change the file at bookworm.cnf if this is undesirable".format(loc)) 129 | fout.write("[client]\ndatabase = {}\n".format(loc)) 130 | 131 | def query(self, args): 132 | """ 133 | Run a query against the API from the command line. 134 | """ 135 | 136 | from bookwormDB.general_API import SQLAPIcall 137 | import json 138 | 139 | query = json.loads(args.APIcall) 140 | caller = SQLAPIcall(query) 141 | print(caller.execute()) 142 | 143 | def serve(self,args): 144 | 145 | """ 146 | Serve the api. 147 | """ 148 | 149 | from bookwormDB.wsgi import run 150 | run(args.bind, args.workers) 151 | 152 | import http.server 153 | from http.server import HTTPServer 154 | import shutil 155 | 156 | base_dir = args.dir 157 | base_cgi_dir = os.path.normpath(base_dir + "/" + "cgi-bin") 158 | d3_dir = os.path.normpath(base_dir + "/" + "D3") 159 | for dir in [base_dir,base_cgi_dir]: 160 | if not os.path.exists(dir): 161 | os.makedirs(dir) 162 | 163 | API = os.path.normpath(os.path.dirname(bookwormDB.__file__) + "/bin/dbbindings.py") 164 | if not os.path.exists(base_cgi_dir + "/" + API): 165 | shutil.copy(API, base_cgi_dir) 166 | 167 | if not os.path.exists(d3_dir): 168 | call(["git","clone","http://github.com/bmschmidt/BookwormD3",d3_dir]) 169 | 170 | # Use the Makefile to build the linechartGUI. This is a little Rube Goldberg-y. 171 | args.target="linechartGUI" 172 | 173 | raise TypeError("The line below this is nonsense") 174 | self.prep(args) 175 | 176 | os.chdir(base_dir) 177 | # Actually serve it. 178 | PORT = args.port 179 | 180 | httpd = HTTPServer(("", PORT), http.server.CGIHTTPRequestHandler) 181 | 182 | print("\n\n" + "****"*20) 183 | print("A local bookworm server is now running") 184 | print("You can now view some charts in a web-browser at http://localhost:%d/D3" % PORT) 185 | print("If you have a time variable, linecharts are at http://localhost:%d/%s" % (PORT,self.dbname)) 186 | print("Please note that this is not a very secure way: if you plan to put your bookworm") 187 | print("on the open web, consider using apache.") 188 | httpd.serve_forever() 189 | 190 | 191 | def extension(self,args): 192 | """ 193 | Creates (or updates) an extension 194 | """ 195 | 196 | if not os.path.exists(self.basedir + ".bookworm/extensions"): 197 | os.makedirs(self.basedir + ".bookworm/extensions") 198 | 199 | my_extension = Extension(args,basedir = self.basedir) 200 | my_extension.clone_or_pull() 201 | my_extension.make() 202 | 203 | def build(self, args): 204 | self.prep(args) 205 | 206 | def prep(self, args): 207 | """ 208 | This is a wrapper to all the functions define here: the purpose 209 | is to continue to allow access to internal methods in, for instance, 210 | the Makefile, without documenting all of them in separate functions. 211 | 212 | That's a little groaty, I know. 213 | """ 214 | logging.debug(args) 215 | 216 | getattr(self, args.goal)(args) 217 | 218 | def wordlist(self, args): 219 | """ 220 | Create a wordlist of the top 1.5 million words. 
221 | """ 222 | from .countManager import create_wordlist 223 | if os.path.exists(".bookworm/texts/wordlist/wordlist.txt"): 224 | return 225 | try: 226 | os.makedirs(".bookworm/texts/wordlist") 227 | except FileExistsError: 228 | pass 229 | 230 | input = "input.txt" 231 | if args.feature_counts: 232 | logging.info(args.feature_counts) 233 | input = [a for a in args.feature_counts if 'unigrams' in a][0] 234 | create_wordlist(n = 1.5e06, 235 | input = input, 236 | output = ".bookworm/texts/wordlist/wordlist.txt") 237 | 238 | def pristine(self, args): 239 | 240 | import bookwormDB.CreateDatabase 241 | bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname, variableFile=None) 242 | if self.dbname == "mysql": 243 | raise NameError("Don't try to delete the mysql database") 244 | bookworm.db.query("DROP DATABASE IF EXISTS {}".format(self.dbname)) 245 | 246 | def encoded(self, args): 247 | """ 248 | Using the wordlist and catalog, create encoded files. 249 | """ 250 | self.wordlist(args) 251 | self.derived_catalog(args) 252 | 253 | for k in ['unigrams', 'bigrams', 'trigrams', 'quadgrams', 'completed']: 254 | try: 255 | os.makedirs(".bookworm/texts/encoded/{}".format(k)) 256 | except FileExistsError: 257 | pass 258 | from .countManager import encode_words 259 | 260 | if args.feature_counts: 261 | for feature in args.feature_counts: 262 | encode_words(".bookworm/texts/wordlist/wordlist.txt", feature) 263 | else: 264 | encode_words(".bookworm/texts/wordlist/wordlist.txt", "input.txt") 265 | 266 | def all(self, args): 267 | self.preDatabaseMetadata(args) 268 | self.encoded(args) 269 | self.database_wordcounts(args) 270 | self.database_metadata(args) 271 | 272 | def preDatabaseMetadata(self, args=None, **kwargs): 273 | import os 274 | if not os.path.exists("field_descriptions.json"): 275 | self.guessAtFieldDescriptions() 276 | self.derived_catalog(args) 277 | import bookwormDB.CreateDatabase 278 | # Doesn't need a created database yet, just needs access 279 | # to some pieces. 280 | Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase() 281 | logging.info("Writing metadata to new catalog file...") 282 | Bookworm.variableSet.writeMetadata() 283 | 284 | # This creates helper files in the /metadata/ folder. 285 | 286 | def derived_catalog(self, args): 287 | 288 | if not os.path.exists(".bookworm/metadata"): 289 | os.makedirs(".bookworm/metadata") 290 | if os.path.exists(".bookworm/metadata/jsoncatalog_derived.txt"): 291 | return 292 | 293 | from bookwormDB.MetaParser import parse_catalog_multicore, ParseFieldDescs 294 | 295 | logging.debug("Preparing to write field descriptions") 296 | ParseFieldDescs(write = True) 297 | logging.debug("Preparing to write catalog") 298 | parse_catalog_multicore() 299 | 300 | def guessAtFieldDescriptions(self, args = None, **kwargs): 301 | 302 | """ 303 | Use a number of rules of thumb to automatically generate a field_descriptions.json file. 304 | This may bin some categories incorrectly (depending on names, for example it may treat dates 305 | as either categorical or time variables). 
306 | """ 307 | 308 | import bookwormDB.CreateDatabase 309 | import json 310 | Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname, variableFile=None) 311 | Bookworm.setVariables("jsoncatalog.txt", jsonDefinition=None) 312 | import os 313 | if not os.path.exists("field_descriptions.json"): 314 | output = open("field_descriptions.json","w") 315 | guess = json.dumps(Bookworm.variableSet.guessAtFieldDescriptions(), indent = 2) 316 | logging.warning("Creating guess for field descriptions at: {}".format(guess)) 317 | output.write(guess) 318 | else: 319 | logging.error(""" 320 | You already have a file at field_descriptions.json 321 | Dying rather than overwrite it. 322 | """) 323 | sys.exit() 324 | 325 | def reload_memory(self,args): 326 | import bookwormDB.CreateDatabase 327 | dbnames = [self.dbname] 328 | if args.all==True: 329 | dbnames = [] 330 | datahandler = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname,variableFile=None) 331 | cursor = datahandler.db.query("SELECT TABLE_SCHEMA FROM information_schema.tables WHERE TABLE_NAME='masterTableTable'") 332 | for row in cursor.fetchall(): 333 | dbnames.append(row[0]) 334 | logging.info("The following databases are bookworms to be reloaded:") 335 | for name in dbnames: 336 | logging.info("\t" + name) 337 | 338 | for database in dbnames: 339 | logging.info("Reloading memory tables for %s" %database) 340 | Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(database,variableFile=None) 341 | Bookworm.reloadMemoryTables(force=args.force) 342 | 343 | def database_metadata(self, args): 344 | import bookwormDB.CreateDatabase 345 | logging.debug("creating metadata db") 346 | Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname) 347 | Bookworm.variableSet.loadMetadata() 348 | 349 | logging.debug("creating metadata variable tables") 350 | 351 | # This creates a table in the database that makes the results of 352 | # field_descriptions accessible through the API, and updates the 353 | 354 | Bookworm.loadVariableDescriptionsIntoDatabase() 355 | 356 | 357 | Bookworm.create_fastcat_and_wordsheap_disk_tables() 358 | 359 | # The temporary memory tables are no longer automatically created on a build. 360 | # To create them, use `bookworm reload_memory`. 361 | # Bookworm.reloadMemoryTables() 362 | 363 | #print "adding cron job to automatically reload memory tables on launch" 364 | #print "(this assumes this machine is the MySQL server, which need not be the case)" 365 | #call(["sh","scripts/scheduleCronJob.sh"]) 366 | Bookworm.jsonify_data() # Create the self.dbname.json file in the root directory. 367 | Bookworm.create_API_settings() 368 | 369 | Bookworm.grantPrivileges() 370 | 371 | def add_metadata(self, args): 372 | import bookwormDB.CreateDatabase 373 | import bookwormDB.convertTSVtoJSONarray 374 | bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname,None) 375 | anchorField = args.key 376 | if args.format == "tsv": 377 | # TSV is just converted into JSON in a file at tmp.txt, and slurped in that way. 378 | if args.key is None: 379 | args.key = open(args.file).readline().split("\t")[0] 380 | f = "tmp.txt" 381 | bookwormDB.convertTSVtoJSONarray.convertToJSON(args.file, f) 382 | args.file = f 383 | 384 | bookworm.importNewFile(args.file, 385 | anchorField=args.key, 386 | jsonDefinition=args.field_descriptions) 387 | 388 | 389 | def database_wordcounts(self, args = None, **kwargs): 390 | """ 391 | Builds the wordcount components of the database. 
This will die 392 | if you can't connect to the database server. 393 | """ 394 | cmd_args = args 395 | import bookwormDB.CreateDatabase 396 | 397 | index = True 398 | reverse_index = True 399 | ingest = True 400 | newtable = True 401 | 402 | if cmd_args and hasattr(cmd_args, "index_only"): 403 | if cmd_args.index_only: 404 | ingest = False 405 | newtable = False 406 | else: 407 | index = not cmd_args.no_index 408 | newtable = not cmd_args.no_delete 409 | reverse_index = not cmd_args.no_reverse_index 410 | if not (newtable and ingest and index): 411 | logging.warning("database_wordcounts args not supported for bigrams yet.") 412 | 413 | Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname) 414 | Bookworm.load_word_list() 415 | Bookworm.create_unigram_book_counts(newtable=newtable, ingest=ingest, index=index, reverse_index=reverse_index) 416 | Bookworm.create_bigram_book_counts() 417 | 418 | class Extension(object): 419 | 420 | """ 421 | A bookworm extension. Initialized with an args object, 422 | which has the element url, the location of a clonable git repo. 423 | 424 | Because I don't want people to have to write extensions in python, 425 | they are built using `make`. 426 | """ 427 | 428 | def __init__(self,args,basedir="./"): 429 | self.args = args 430 | self.dir = basedir + ".bookworm/extensions/" + re.sub(".*/","",self.args.url) 431 | 432 | def clone_or_pull(self): 433 | if not os.path.exists(self.dir): 434 | logging.info("cloning git repo from " + self.args.url) 435 | call(["git","clone",self.args.url,self.dir]) 436 | else: 437 | logging.info("updating pre-existing git repo at " + self.dir) 438 | Popen(["git","pull"],cwd=self.dir) 439 | 440 | def make(self): 441 | logging.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 442 | logging.debug("Running make in " + self.dir) 443 | Popen(["make"], cwd=self.dir) 444 | 445 | # Initiate MySQL connection. 446 | 447 | 448 | # Pull a method from command line input. 449 | 450 | def run_arguments(): 451 | """ 452 | Parse the command line arguments and run them. 453 | 454 | The actual running is handled by an instance of the class `BookwormManager`, 455 | which calls all bookworm-related arguments; that, in turn, calls some specific 456 | methods to make things happen (the most important of which is the `BookwormDB` 457 | class, which is in charge of MySQL calls). 458 | 459 | I apologize for how ugly and linear this code is: it's not clear to me 460 | how to write pretty modular code with the argparse module. 461 | You just end up with a bunch of individual add argument lines that are full of random text. 462 | Refactoring pull requests welcome. 463 | """ 464 | 465 | parser = argparse.ArgumentParser(description='Build and maintain a Bookworm database.',prog="bookworm") 466 | parser.add_argument("--configuration","-c",help="The name of the configuration file to read options from: by default, 'bookworm.cnf' in the current directory.", default="bookworm.cnf") 467 | 468 | parser.add_argument("--database","-d",help="The name of the bookworm database in MySQL to connect to: by default, read from the active configuration file.", default=None) 469 | 470 | parser.add_argument("--log-level","-l", help="The logging detail to use for errors.
Default is 'warning', only significant problems; 'info' gives a fuller record, and 'debug' dumps many MySQL queries, etc.",choices=["warning","info","debug"],type=str.lower,default="warning") 471 | 472 | 473 | parser.add_argument("--feature-counts", action='append', 474 | help="Use pre-calculated feature counts rather than tokenizing complete text on the fly. Supply any number of single files per count level like 'input.unigrams', 'input.bigrams', etc.") 475 | 476 | parser.add_argument("--ngrams",nargs="+",default=["unigrams","bigrams"],help="What levels to parse with. Multiple arguments should be unquoted, separated by spaces. This option currently does nothing.") 477 | 478 | 479 | # Use subparsers to have an action syntax, like git. 480 | subparsers = parser.add_subparsers(title="action", help='The commands to run with Bookworm', dest="action") 481 | 482 | 483 | 484 | ############# build ################# 485 | build_parser = subparsers.add_parser("build",description = "Create files",help="""Build up the component parts of a Bookworm.\ 486 | 487 | If you specify something far along the line (for instance, the linechart GUI), it will\ 488 | build all prior files as well.""") 489 | 490 | build_parser.add_argument("target", help="The make target that you want to build. To build a full bookworm, type 'build all'.") 491 | 492 | # Grep out all possible targets from the Makefile 493 | 494 | ############# supplement ################# 495 | supplement_parser = subparsers.add_parser("add_metadata",help="""Supplement the\ 496 | metadata for an already-created Bookworm with new items. They can be keyed to any field already in the database.""") 497 | supplement_parser.add_argument("-f","--file",help="""The location of a file with additional metadata to incorporate into your bookworm.""",required=True) 498 | 499 | supplement_parser.add_argument( 500 | "--format", 501 | help="""The file format of the new metadata.\ 502 | Must be "json" or "tsv". For JSON, the format is the same as the default\ 503 | jsoncatalog.txt (a text file of json lines, each corresponding to a metadata field);\ 504 | for TSV, a tsv the first line of which is column names,\ 505 | and the first column of which is a shared key (like filename). The TSV format,\ 506 | particularly without field descriptions, is much easier to use, but doesn't\ 507 | permit multiple values for the same key.""", 508 | default="json",type=str.lower,choices=["tsv","json"]) 509 | 510 | supplement_parser.add_argument("--key",help="""The name of the key. If not specified and input type is TSV, the first column is used.""",default=None) 511 | supplement_parser.add_argument("--field_descriptions","-d",help="""A description of the new metadata in the format of "field_descriptions.json"; if empty, we'll just guess at some suitable values.""",default=None) 512 | 513 | ######### Reload Memory ############# 514 | memory_tables_parser = subparsers.add_parser("reload_memory",help="Reload the memory\ 515 | tables for the designated Bookworm; this must be done after every MySQL restart") 516 | memory_tables_parser.add_argument("--force-reload",dest="force",action="store_true", 517 | help="Force reload on all memory tables. Use\ 518 | '--skip-reload' for faster execution. Off by default\ 519 | .") 520 | memory_tables_parser.add_argument("--skip-reload",dest="force",action="store_false", 521 | help="Don't reload memory tables which have at least\ 522 | one entry in them. Significantly faster, but may produce\ 523 | bad results if the underlying tables have been\ 524 | changed.
Good for maintenance, bad for actively updated\ 525 | installations.") 526 | memory_tables_parser.set_defaults(force=False) 527 | memory_tables_parser.add_argument("--all",action="store_true",default=False, 528 | help="Search for all bookworm installations on\ 529 | the server, and reload memory tables for each of them.") 530 | 531 | 532 | ########## Clone and run extensions 533 | extensions_parser = subparsers.add_parser("extension", help="Install extensions to the current directory") 534 | extensions_parser.add_argument("url",help="A cloneable url for the extension you want to pull: passed as an argument to 'git clone', so it may use either the https protocol or the git protocol") 535 | 536 | 537 | ########## Run a query 538 | extensions_parser = subparsers.add_parser("query", help="Run a query using the Bookworm API") 539 | extensions_parser.add_argument("APIcall",help="The json-formatted query to be run.") 540 | 541 | 542 | ########## Build components 543 | extensions_parser = subparsers.add_parser("prep", help="Build individual components.", aliases = ['build']) 544 | extensions_subparsers = extensions_parser.add_subparsers(title="goal", help="The name of the target.", dest="goal") 545 | 546 | # Bookworm prep targets that allow additional args 547 | catalog_prep_parser = extensions_subparsers.add_parser("preDatabaseMetadata", 548 | help=getattr(BookwormManager, "preDatabaseMetadata").__doc__) 549 | 550 | word_ingest_parser = extensions_subparsers.add_parser("database_wordcounts", 551 | help=getattr(BookwormManager, "database_wordcounts").__doc__) 552 | word_ingest_parser.add_argument("--no-delete", action="store_true", help="Do not delete and rebuild the token tables. Useful for a partially finished ingest.") 553 | 554 | word_ingest_parser.add_argument("--no-reverse-index", action="store_true", help="When creating the table, choose not to index bookid/wordid/counts. This is useful for really large builds. Because this is specified at table creation time, it does nothing with --no-delete or --index-only.") 555 | 556 | word_ingest_parser.add_argument("--no-index", action="store_true", help="Do not re-enable keys after ingesting tokens. Only do this if you intend to manually enable keys or will run this command again.") 557 | 558 | word_ingest_parser.add_argument("--index-only", action="store_true", help="Only re-enable keys. Supersedes other flags.") 559 | 560 | # Bookworm prep targets that don't allow additional args 561 | for prep_arg in BookwormManager.__dict__.keys(): 562 | extensions_subparsers.add_parser(prep_arg, help=getattr(BookwormManager, prep_arg).__doc__) 563 | 564 | """ 565 | Some special functions 566 | """ 567 | 568 | init_parser = subparsers.add_parser("init",help="Initialize the current directory as a bookworm directory") 569 | init_parser.add_argument("--force","-f",help="Overwrite some existing files.",default=False,action="store_true") 570 | init_parser.add_argument("--yes","-y",help="Automatically use default values with no prompts",default=False,action="store_true") 571 | 572 | 573 | # Serve the current bookworm 574 | 575 | serve_parser = subparsers.add_parser("serve", 576 | help="Serve the bookworm. By default this is an API endpoint, " 577 | "served over gunicorn, or (not yet supported) a full installation. You might want to wrap " 578 | "the gunicorn endpoint behind a more powerful webserver like apache or nginx.") 579 | 580 | serve_parser.add_argument("--full-site", action = "store_true", help="Serve a webpage as well as a query endpoint?
Not active.") 581 | 582 | serve_parser.add_argument("--bind", "-b", default="10012", help="The port over which to serve the bookworm",type=int) 583 | 584 | serve_parser.add_argument("--workers", "-w", default="0", help="How many gunicorn worker threads to launch for the API. Reduce if you're seeing memory issues.",type=int) 585 | 586 | serve_parser.add_argument("--dir","-d",default="http_server",help="A filepath for a directory to serve from. Will be created if it does not exist.") 587 | 588 | 589 | 590 | # Configure the global server. 591 | configure_parser = subparsers.add_parser("config",help="Some helpers to configure a running bookworm, or to manage your server-wide configuration.") 592 | configure_parser.add_argument("target",help="The thing you want help configuring.",choices=["mysql", "mysql-info", "apache"]) 593 | configure_parser.add_argument("--users",nargs="+",choices=["admin","global","root"],help="The user levels you want to act on.",default=["admin","global"]) 594 | configure_parser.add_argument("--force","-f",help="Overwrite existing configurations in potentially bad ways.",action="store_true",default=False) 595 | 596 | # Call the function 597 | args = parser.parse_args() 598 | # Set the logging level based on the input. 599 | numeric_level = getattr(logging, args.log_level.upper(), None) 600 | if not isinstance(numeric_level, int): 601 | raise ValueError('Invalid log level: %s' % args.log_level) 602 | # While we're at it, log with line numbers 603 | FORMAT = "[%(filename)s:%(lineno)s-%(funcName)s() %(asctime)s.%(msecs)03d] %(message)s" 604 | logging.basicConfig(format=FORMAT, level=numeric_level, datefmt="%I:%M:%S") 605 | logging.info("Info logging enabled.") 606 | logging.info("Debug logging enabled.") 607 | 608 | # Create the bookworm 609 | my_bookworm = BookwormManager(args.configuration, args.database) 610 | 611 | # Call the current action with the arguments passed in. 612 | getattr(my_bookworm,args.action)(args) 613 | -------------------------------------------------------------------------------- /bookwormDB/multiprocessingHelp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import psutil 3 | import logging 4 | 5 | def mp_stats(): 6 | try: 7 | cpus = len(os.sched_getaffinity(0)) 8 | except AttributeError: 9 | # Should be better OS X support than this. 
10 |         cpus = 6
11 | 
12 |     # Use the amount of free system memory (psutil's 'free' field) for the bounter.
13 |     memory = int(psutil.virtual_memory()[4])
14 | 
15 |     if memory < 1024:
16 |         logging.warning("Not much memory to work with--vocab may be inexact")
17 | 
18 |     return (cpus, memory)
19 | 
20 | def running_processes(workerlist):
21 |     running = False
22 |     for worker in workerlist:
23 |         if worker.is_alive():
24 |             running = True
25 |         else:
26 |             code = worker.exitcode
27 |             if code > 0:
28 |                 raise RuntimeError("Process died with code {}".format(code))
29 |     return running
30 | 
--------------------------------------------------------------------------------
/bookwormDB/schema_primitives.py:
--------------------------------------------------------------------------------
1 | from .general_API import Aggregator
2 | 
3 | agg_keys = list(Aggregator.__dict__.keys())
4 | agg_keys = [k for k in agg_keys if not k.startswith("_")]
5 | counts_schema = {
6 |     "$id": "#/properties/counttype",
7 |     "type": "array",
8 |     "items": {
9 |         "$id": "#/properties/counttype/items",
10 |         "type": "string",
11 |         "default": "WordCount",
12 |         "enum": agg_keys,
13 |         "pattern": "^(.*)$"
14 |     }
15 | }
16 | 
17 | method_schema = {
18 |     "type": "string",
19 |     "title": "Return Method",
20 |     "default": "data",
21 |     "enum": [
22 |         "data",
23 |         "schema",
24 |         "search"
25 |     ],
26 |     "pattern": "^(.*)$"
27 | }
28 | 
29 | format_schema = {
30 |     "description": "The return format requested from the API.",
31 |     "type": "string",
32 |     "title": "The Format Schema",
33 |     "default": "json_c",
34 |     "enum": [
35 |         "json_c",
36 |         "csv",
37 |         "tsv",
38 |         "feather",
39 |         "json",
40 |         "html"
41 |     ]
42 | }
--------------------------------------------------------------------------------
/bookwormDB/scripts/fast_featurecounter.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Usage:
3 | # fast_featurecounter.sh [infile] [tmpdir] [blocksize] [outfile]
4 | 
5 | # Important: need to set the locale in order to sort properly
6 | export LC_ALL=C
7 | infile=$1
8 | # Explicitly set tmp directory to better manage disk needs
9 | tmpdir=$2
10 | blocksize=$3
11 | outfile=$4
12 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
13 | tmpfile=tmp1-$RANDOM.txt
14 | 
15 | mkdir -p $tmpdir
16 | cat $infile |\
17 |   parallel --block $blocksize -j95% --pipe --files --tempdir $tmpdir \
18 |   awk '{print\ \$2\,\ \$3}' "|" sort "|" awk -f $DIR/mergecounted.awk >$tmpfile
19 | 
20 | echo $tmpfile
21 | 
22 | # We've processed the files in one big batch, but in all likelihood there are still too many
23 | # of them to glob all together and sort. So merge in batches of 30 and dedupe again.
24 | cat $tmpfile | parallel --files --tempdir $tmpdir -Xn30 -j95% \
25 |   sort -m {} "|" awk -f $DIR/mergecounted.awk ";" rm {} |\
26 |   parallel -Xj1 sort -m {} "|" awk -f $DIR/mergecounted.awk ";" rm {} |\
27 |   sort -n -r -k2 | awk 'BEGIN {i=0}{i+=1;print i " " $1 " " $2}' >$outfile # Format for bookworm
28 | 
29 | rm $tmpfile
--------------------------------------------------------------------------------
/bookwormDB/scripts/mergecounted.awk:
--------------------------------------------------------------------------------
1 | #!/usr/bin/awk -f
2 | # Awk script to merge sorted "word count" lines (space-separated).
3 | # Awk is used here for speed.
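    | # Illustrative example (not taken from the pipeline): given the sorted input
    | #   apple 2
    | #   apple 3
    | #   banana 1
    | # this emits the merged counts
    | #   apple 5
    | #   banana 1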
4 | BEGIN {start = 1;} { word = $1;
5 |     if (last == word) { sum += $2; }
6 |     else {
7 |         if (!start) print last " " sum
8 |         else start = 0; last=word; sum = $2;
9 |     }
10 | } END { print last " " sum }
--------------------------------------------------------------------------------
/bookwormDB/search_limits.py:
--------------------------------------------------------------------------------
1 | import MySQLdb
2 | 
3 | def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_joiner = " OR "):
4 |     whereterm = []
5 |     # The general idea here is that we try to break everything in search_limits down to a list, and then create a whereterm on that joined by whatever the 'joiner' is ("AND" or "OR"), with the comparison as whatever comp is ("=", ">=", etc.).
6 |     # For more complicated bits, it gets recursive until the bits are all in terms of lists.
7 |     if joiner is None:
8 |         joiner = " AND "
9 |     for key in list(myhash.keys()):
10 |         values = myhash[key]
11 |         if isinstance(values, (str, bytes)) or isinstance(values, int) or isinstance(values, float):
12 |             # This is just a convenience for humans: you can pass a single value instead of a list,
13 |             # and it will be converted to a list for you.
14 |             values = [values]
15 |         # "$or" queries are special, since the default joiner is "AND"; this toggles it to "OR" for a sub-clause.
16 | 
17 |         if key == "$or" or key == "$OR":
18 |             local_set = []
19 |             for comparison in values:
20 |                 local_set.append(where_from_hash(comparison, comp=comp))
21 |             whereterm.append(" ( " + " OR ".join(local_set) + " )")
22 |         elif key == '$and' or key == "$AND":
23 |             for comparison in values:
24 |                 whereterm.append(where_from_hash(comparison, joiner=" AND ", comp=comp))
25 |         elif isinstance(values, dict):
26 |             if joiner is None:
27 |                 joiner = " AND "
28 |             # Certain function operators can use MySQL terms.
29 |             # These are the only cases where a dict can be passed as a limitation.
30 |             operations = {"$gt":">", "$ne":"!=", "$lt":"<",
31 |                           "$grep":" REGEXP ", "$gte":">=",
32 |                           "$lte":"<=", "$eq":"="}
33 | 
34 |             for operation in list(values.keys()):
35 |                 if operation == "$ne":
36 |                     # If you pass a lot of $ne values, they must *all* be false.
37 |                     subjoiner = " AND "
38 |                 else:
39 |                     subjoiner = " OR "
40 |                 whereterm.append(where_from_hash({key:values[operation]}, comp=operations[operation], list_joiner=subjoiner))
41 |         elif isinstance(values, list):
42 |             # And this is where the magic actually happens:
43 |             # the cases where the key is a string, and the target is a list.
44 |             if isinstance(values[0], dict):
45 |                 # If it's a list of dicts, then there's one thing that happens.
46 |                 # Currently all types are assumed to be the same:
47 |                 # you couldn't pass in, say, {"year":[{"$gte":1900}, 1898]} to
48 |                 # catch post-1898 years except for 1899. Not that you
49 |                 # should need to.
50 |                 for entry in values:
51 |                     whereterm.append(where_from_hash(entry))
52 |             else:
53 |                 # Note that about a third of the code is spent on escaping strings.
54 |                 if escapeStrings:
55 |                     if isinstance(values[0], (str, bytes)):
56 |                         quotesep = "'"
57 |                     else:
58 |                         quotesep = ""
59 | 
60 |                     def escape(value):
61 |                         # NOTE: stringifying the escape from MySQL; hopefully doesn't break too much.
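    |                         # MySQLdb.escape_string returns bytes under Python 3, hence the
    |                         # decode back to str below; it does not add the surrounding
    |                         # quotes (quotesep handles those). The original code called an
    |                         # undefined `to_unicode` helper here; plain str() is used instead.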
62 |                         return str(MySQLdb.escape_string(str(value)), 'utf-8')
63 |                 else:
64 |                     def escape(value):
65 |                         return str(value)
66 |                     quotesep = ""
67 | 
68 |                 joined = list_joiner.join([" ({}{}{}{}{}) ".format(key, comp, quotesep, escape(value), quotesep) for value in values])
69 |                 whereterm.append(" ( {} ) ".format(joined))
70 | 
71 |     if len(whereterm) > 1:
72 |         return "(" + joiner.join(whereterm) + ")"
73 |     else:
74 |         return whereterm[0]
75 | # This works pretty well, except that it requires very specific sorts of terms going in, I think.
76 | 
77 | 
78 | class Search_limits(dict):
79 |     def to_sql(self):
80 |         return where_from_hash(self)
81 |     def rkeys(self):
82 |         # Recursively return the SQL keys so we know what fields to work with.
83 |         keys = []
84 |         for k, v in self.items():
85 |             if not k.startswith("$"):
86 |                 keys.append(k)
87 |             elif isinstance(v, dict):
88 |                 for k in Search_limits(v).rkeys():
89 |                     keys.append(k)
90 |         return keys
91 |     def validate(self):
92 |         # Some tests to see if a query is valid
93 |         for k in self.keys():
94 |             pass
95 | 
--------------------------------------------------------------------------------
/bookwormDB/sqliteKV.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2018 Sylvain PULICANI
2 | # Super heavily changed by Ben Schmidt; the old version was a true
3 | # kv store, this one just autoincrements a lookup table.
4 | 
5 | # This should generally be thread safe for reads, but not for writes:
6 | # if multiple processes try to write at once, expect errors.
7 | 
8 | # This work is free. You can redistribute it and/or modify it under the
9 | # terms of the Do What The Fuck You Want To Public License, Version 2,
10 | # as published by Sam Hocevar. See the COPYING file for more details.
11 | 
12 | # sqlite_kv.py
13 | #
14 | # Python implementation of the SQLiteKV store.
15 | 
16 | import sqlite3
17 | 
18 | 
19 | class KV:
20 |     """
21 |     Python implementation of the SQLiteKV store, with additional methods
22 |     to make it more pythonic.
23 |     .. warning::
24 |         * The `close` method has to be called after use.
25 |         * The `delete` method is not yet implemented.
26 |     """
27 |     def __init__(self, dbfile):
28 |         """
29 |         Open a connection to the SQLite file. If it doesn't exist, create it
30 |         and add the needed tables.
31 |         """
32 |         self.conn = None
33 |         self.conn = sqlite3.connect(dbfile, detect_types=sqlite3.PARSE_DECLTYPES)
34 |         self.conn.row_factory = sqlite3.Row
35 | 
36 |         tables = [dict(r)['name'] for r in self.conn.execute(
37 |             "SELECT name FROM sqlite_master WHERE type='table'")]
38 | 
39 |         if 'keys' not in tables:
40 |             self.conn.execute("""CREATE TABLE keys(
41 |                 ID INTEGER PRIMARY KEY ASC,
42 |                 key TEXT UNIQUE NOT NULL)""")
43 | 
44 |             self.conn.execute("CREATE UNIQUE INDEX idx_keys ON keys(key)")
45 | 
46 | 
47 |     def close(self):
48 |         """
49 |         Properly close the database.
50 |         """
51 |         self.conn.commit()
52 |         self.conn.close()
53 | 
54 |     def __getitem__(self, key):
55 |         rows = self.conn.execute("""SELECT ID FROM keys
56 |                                     WHERE keys.key=(?)""", (key, ))
57 |         row = rows.fetchone()
58 |         if row is None:
59 |             raise KeyError(key)
60 |         return row['ID']
61 | 
62 |     def register(self, key):
63 |         self.conn.execute("INSERT INTO keys(key) VALUES (?)",
64 |                           (key, ))
65 | 
--------------------------------------------------------------------------------
/bookwormDB/tokenizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from __future__ import print_function
4 | import random
5 | import sys
6 | import os
7 | from .sqliteKV import KV
8 | import time
9 | import logging
10 | import numpy as np
11 | from pandas import read_csv
12 | from io import StringIO
13 | try: import regex as re  # The \p{Z} class in wordRegex needs the third-party `regex` module.
14 | except ImportError: import re
15 | """
16 | This section does a lot of work on tokenizing and aggregating wordcounts.
17 | """
18 | 
19 | # A module-level flag recording whether we've already warned about a unicode encoding error.
20 | haveWarnedUnicode = False
21 | # And the default regex is generated by a function on demand.
22 | bigregex = None
23 | 
24 | 
25 | def wordRegex():
26 |     """
27 |     I'm including the code to create the regex here, which makes it more readable.
28 |     Note that this operates on *unicode*: among other things, that means that it needs to be passed
29 |     a unicode-decoded string, and that we have to use the "regex" module instead of the "re" module.
30 |     """
31 |     global re
32 |     MasterExpression = r"\w+"
33 |     possessive = MasterExpression + r"'s"
34 |     numbers = r"(?:[\$])?\d+"
35 |     decimals = numbers + r"\.\d+"
36 |     abbreviation = r"(?:mr|ms|mrs|dr|prof|rev|rep|sen|st|sr|jr|ft|gen|adm|lt|col|etc)\."
37 |     sharps = r"[a-gjxA-GJX]#"
38 |     punctuators = r"[^\w\p{Z}]"
39 |     """
40 |     Note: this compiles looking for the most complicated words first, and as it goes on finds simpler and simpler forms.
41 |     """
42 |     bigregex = re.compile("|".join([decimals,possessive,numbers,abbreviation,sharps,punctuators,MasterExpression]),re.UNICODE|re.IGNORECASE)
43 |     return bigregex
44 | 
45 | 
46 | def readDictionaryFile(prefix=""):
47 |     look = dict()
48 |     for line in open(prefix + ".bookworm/texts/wordlist/wordlist.txt"):
49 |         line = line.rstrip("\n")
50 |         v, k, _ = line.split("\t")
51 |         look[k] = v
52 |     return look
53 | 
54 | def readIDfile(prefix=""):
55 |     if not os.path.exists(prefix + ".bookworm/metadata/textids.sqlite"):
56 |         raise FileNotFoundError("No textids DB: run `bookworm build textids`")
57 |     return KV(prefix + ".bookworm/metadata/textids.sqlite")
58 | 
59 | class tokenBatches(object):
60 |     """
61 |     A tokenBatches is a manager for tokenizers. Each one corresponds to
62 |     a reasonable number of texts to read into memory on a single processor:
63 |     during the initial loads, there will probably be one per core.
64 |     It doesn't store the original text, just the unigram and bigram tokenizations in its attached self.counts arrays.
65 | 
66 |     It writes out its data to a single file:
67 |     in this way, a batch of up to several hundred thousand individual files is grouped into a single file.
68 | 
69 |     It also has a method that encodes and writes its wordcounts into a tsv file appropriate for reading with mysql,
70 |     with 3-byte integer encoding for wordid and bookid.
71 |     """
72 | 
73 |     def __init__(self, levels=["unigrams","bigrams"]):
74 |         """
75 |         levels: the ngram levels (e.g. "unigrams", "bigrams") to write out.
76 |         """
77 |         self.id = '%030x' % random.randrange(16**30)
78 |         self.levels=levels
79 | 
80 | 
81 |         # placeholder to alert that createOutputFiles must be run.
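    |         # (encodeRow checks this and lazily calls createOutputFiles and
    |         # attachDictionaryAndID on its first invocation.)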
82 |         self.completedFile = None
83 | 
84 |     def createOutputFiles(self):
85 |         self.completedFile = open(".bookworm/texts/encoded/completed/" + self.id,"w")
86 |         self.outputFiles = dict()
87 |         for level in self.levels:
88 |             self.outputFiles[level] = open(".bookworm/texts/encoded/{}/{}.txt".format(level, self.id),"w")
89 | 
90 |     def attachDictionaryAndID(self):
91 |         self.dictionary = readDictionaryFile()
92 |         self.IDfile = readIDfile()
93 | 
94 | 
95 |     def close(self):
96 |         """
97 |         This test allows the creation of bookworms with fewer documents than requested
98 |         threads, which happens to be the case in the tests.
99 |         """
100 |         if self.completedFile is not None:
101 |             self.completedFile.close()
102 |             for v in self.outputFiles.values():
103 |                 v.close()
104 | 
105 |     def encodeRow(self,
106 |                   filename,
107 |                   tokenizer,
108 |                   write_completed=True
109 |                   ):
110 |         """
111 |         'filename': the document identifier (a key in the textids database)
112 |         'tokenizer': a Tokenizer or PreTokenized object
113 | 
114 |         """
115 |         if self.completedFile is None:
116 |             self.createOutputFiles()
117 |             self.attachDictionaryAndID()
118 | 
119 |         # The dictionary and ID lookup tables should be pre-attached.
120 |         dictionary = self.dictionary
121 |         IDfile = self.IDfile
122 | 
123 |         levels = None
124 |         """
125 |         if source=="raw_text":
126 |             parts = row.split("\t", 1)
127 |             filename = parts[0]
128 |             try:
129 |                 tokens = tokenizer(parts[1])
130 |             except IndexError:
131 |                 logging.warning("\nFound no tab in the input for '" + filename + "'...skipping row\n")
132 |             levels = self.levels
133 | 
134 |         if source == "countfile":
135 |             try:
136 |                 (filename, token, count) = row.split("\t")
137 |             except:
138 |                 logging.error("Can't find tab\n***************")
139 |                 logging.error(row)
140 |                 raise
141 |             tokens = preTokenized(token, count, self.levels[0])
142 |         """
143 | 
144 |         try:
145 |             textid = IDfile[filename]
146 |         except KeyError:
147 |             logging.warning("Warning: file " + filename + " not found in jsoncatalog.txt, not encoding")
148 |             return
149 | 
150 |         for level in self.levels:
151 |             outputFile = self.outputFiles[level]
152 |             output = []
153 | 
154 |             counts = tokenizer.counts(level)
155 | 
156 |             for wordset, count in counts.items():
157 |                 skip = False
158 |                 wordList = []
159 |                 for word in wordset:
160 |                     try:
161 |                         wordList.append(dictionary[word])
162 |                     except KeyError:
163 |                         """
164 |                         if any of the words to be included is not in the dictionary,
165 |                         we don't include the whole n-gram in the counts.
166 |                         """
167 |                         skip = True
168 |                 if not skip:
169 |                     wordids = "\t".join(wordList)
170 |                     output.append("{}\t{}\t{}".format(int(textid), wordids, count))
171 | 
172 |             try:
173 |                 if len(output) > 0:
174 |                     # The test is necessary because otherwise this prints a blank line.
175 |                     outputFile.write("\n".join(output) + "\n")
176 | 
177 |             except IOError as e:
178 |                 logging.exception(e)
179 | 
180 |         if write_completed:
181 |             self.completedFile.write(filename + "\n")
182 | 
183 | class Tokenizer(object):
184 |     """
185 |     A tokenizer is initialized with a single text string.
186 | 
187 |     It assumes that you have in namespace an object called "bigregex" which
188 |     identifies words.
189 | 
190 |     (I'd define it here, but it's a performance optimization to avoid compiling the large regex millions of times.)
191 | 
192 |     The general way to call it is to initialize it, and then for each desired set of counts call "tokenizer.counts("bigrams")" (or whatever).
193 | 
194 |     That returns a dictionary, whose keys are tuples of length 1 for unigrams, 2 for bigrams, etc., and whose values are counts for that ngram. The tuple form should allow faster parsing down the road.
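    | 
    |     For example (an illustrative sketch): Tokenizer("the cat sat").counts("bigrams")
    |     would return {("the", "cat"): 1, ("cat", "sat"): 1}.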
195 | 
196 |     """
197 | 
198 |     def __init__(self, string, tokenization_regex=None):
199 |         global haveWarnedUnicode
200 |         self.string = string
201 |         self.tokenization_regex = tokenization_regex
202 |         self.tokens = None
203 |     def tokenize(self):
204 |         """
205 |         This tries to return the pre-made tokenization:
206 |         if that doesn't exist, it creates it.
207 |         """
208 |         if self.tokens is not None:
209 |             return self.tokens
210 |         """
211 |         For speed, don't import until here.
212 |         """
213 |         tokenization_regex=self.tokenization_regex
214 |         global re
215 |         if re is None:
216 |             import regex as re
217 |         if tokenization_regex is None:
218 |             # by default, use the big regex.
219 |             global bigregex
220 |             if bigregex is None:
221 |                 bigregex = wordRegex()
222 |             tokenization_regex = bigregex
223 |         self.tokens = re.findall(tokenization_regex, self.string)
224 |         return self.tokens
225 | 
226 |     def ngrams(self, n, collapse = False):
227 |         """
228 |         All the ngrams in the text can be created as a tuple by zipping an arbitrary number of
229 |         copies of the text to itself.
230 |         """
231 | 
232 |         self.tokenize()
233 |         l = list(zip(*[self.tokens[i:] for i in range(n)]))
234 |         if collapse:
235 |             l = [" ".join(tupled) for tupled in l]
236 |         return l
237 | 
238 |     def unigrams(self):
239 |         return self.ngrams(1)
240 | 
241 |     def bigrams(self):
242 |         return self.ngrams(2)
243 | 
244 |     def trigrams(self):
245 |         return self.ngrams(3)
246 | 
247 |     def allgrams(self, max = 6):
248 |         output = []
249 |         for i in range(1, max + 1):
250 |             output.extend(self.ngrams(i, collapse = True))
251 |         return output
252 | 
253 |     def words(self):
254 |         """
255 |         1-grams have tuple keys, but words have index keys.
256 |         """
257 |         self.tokenize()
258 |         return self.tokens
259 | 
260 |     def counts(self, whichType):
261 | 
262 |         count = dict()
263 |         for gram in getattr(self,whichType)():
264 |             try:
265 |                 count[gram] += 1
266 |             except KeyError:
267 |                 count[gram] = 1
268 |         return count
269 | 
270 | 
271 | class PreTokenized(object):
272 |     """
273 |     This class is a little goofy: it mimics the behavior of a tokenizer
274 |     on data that's already been tokenized by something like
275 |     Google Ngrams or JStor Data for Research.
276 |     """
277 | 
278 |     def __init__(self, csv_string, level):
279 |         f = read_csv(StringIO(csv_string),
280 |                      lineterminator = "\f",
281 |                      # Ugh--we want 'NA' to be a word.
282 |                      dtype = {'word': str, 'counts': np.int64},
283 |                      keep_default_na=False,
284 |                      names = ["word", "counts"])
285 |         self.level = level
286 |         if level == 'words':
287 |             self.output = dict(zip(f.word, f.counts))
288 |         else:
289 |             self.output = dict(zip([tuple(w.split(" ")) for w in f.word], f.counts))
290 | 
291 |     def counts(self,level):
292 |         if level != self.level:
293 |             raise ValueError("PreTokenized was built for level '{}', not '{}'".format(self.level, level))
294 |         return self.output
295 | 
296 | 
297 | def getAlreadySeenList(folder):
298 |     # Load in a list of what's already been translated for that level.
299 |     # Returns a set.
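    |     # Each file in the completed/ directory lists one already-encoded
    |     # filename per line; union them into a single set for fast
    |     # membership tests.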
300 |     files = os.listdir(folder)
301 |     seen = set([])
302 |     for file in files:
303 |         for line in open(folder + "/" + file):
304 |             seen.add(line.rstrip("\n"))
305 |     return seen
306 | 
307 | def encode_text_stream():
308 |     seen = getAlreadySeenList(".bookworm/texts/encoded/completed")
309 |     tokenBatch = tokenBatches()
310 |     tokenBatch.attachDictionaryAndID()
311 |     for line in sys.stdin:
312 |         line = line.rstrip("\n")
313 |         filename, text = line.split("\t", 1)
314 |         if filename not in seen:
315 |             tokenBatch.encodeRow(filename, Tokenizer(text))  # encodeRow needs (filename, tokenizer)
316 | 
317 | # And printout again at the end
318 | 
319 | if __name__=="__main__":
320 |     encode_text_stream()
321 | 
--------------------------------------------------------------------------------
/bookwormDB/wsgi.py:
--------------------------------------------------------------------------------
1 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
2 | import json
3 | from urllib.parse import unquote
4 | import logging
5 | import multiprocessing
6 | import gunicorn.app.base
7 | from datetime import datetime
8 | 
9 | def content_type(query):
10 |     try:
11 |         format = query['format']
12 |     except (KeyError, TypeError):
13 |         return 'text/plain'
14 | 
15 |     if format == "json":
16 |         return "application/json"
17 | 
18 |     if format == "feather":
19 |         return "application/octet-stream"
20 | 
21 |     if format == "html":
22 |         return "text/html"
23 | 
24 |     return 'text/plain'
25 | 
26 | def application(environ, start_response, logfile = "bookworm_queries.log"):
27 |     # Starting with code from http://wsgi.tutorial.codepoint.net/parsing-the-request-post
28 |     try:
29 |         request_body_size = int(environ.get('CONTENT_LENGTH') or 0)
30 |     except (ValueError, TypeError):
31 |         request_body_size = 0
32 | 
33 |     # When the method is POST the variables will be sent
34 |     # in the HTTP request body, which is passed by the WSGI server
35 |     # in the file-like wsgi.input environment variable.
36 | 
37 |     q = environ.get('QUERY_STRING')
38 |     # environ.get never raises, so no try/except is needed here.
39 |     ip = environ.get('HTTP_X_FORWARDED_FOR')
40 |     # logging.debug("Request from {}".format(ip))
41 |     if ip is None:
42 |         ip = environ.get('REMOTE_ADDR')
43 | 
44 | 
45 |     query = unquote(q)
46 | 
47 |     headers = {
48 |         'Access-Control-Allow-Origin': '*',
49 |         'Access-Control-Allow-Methods': 'GET, POST, PUT, OPTIONS',
50 |         'Access-Control-Allow-Headers':
51 |         'Origin, Accept, Content-Type, X-Requested-With, X-CSRF-Token',
52 |         'charset': 'utf-8'
53 |     }
54 | 
55 | 
56 | 
57 |     logging.debug("Received query {}".format(query))
58 |     start = datetime.now()
59 | 
60 |     # Backward-compatibility: we used to force query to be a named
61 |     # argument. str.strip removes *characters*, not a prefix, so trim explicitly.
62 |     for prefix in ("query=", "queryTerms="):
63 |         if query.startswith(prefix): query = query[len(prefix):]
64 | 
65 |     try:
66 |         query = json.loads(query)
67 |         query['ip'] = ip
68 |     except ValueError:
69 |         # Invalid JSON gets a 400; WSGI status strings need a reason phrase.
70 |         status = '400 Bad Request'
71 |         start_response(status, list(headers.items()))
72 |         return [b'{"status":"error", "message": "You have passed invalid JSON to the Bookworm API"}']
73 | 
74 |     process = SQLAPIcall(query)
75 |     response_body = process.execute()
76 | 
77 |     # It might be binary already.
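    |     # content_type() maps the requested 'format' field to a MIME type;
    |     # 'feather' responses are already raw bytes, so only the other
    |     # formats get encoded to UTF-8 below.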
78 | headers['Content-type'] = content_type(query) 79 | 80 | if headers['Content-type'] != 'application/octet-stream': 81 | response_body = bytes(response_body, 'utf-8') 82 | 83 | headers['Content-Length'] = str(len(response_body)) 84 | status = '200 OK' 85 | start_response(status, list(headers.items())) 86 | 87 | query['time'] = start.timestamp() 88 | query['duration'] = datetime.now().timestamp() - start.timestamp() 89 | # This writing isn't thread-safe; but generally we're not getting more than a couple queries a second. 90 | with open(logfile, 'a') as fout: 91 | json.dump(query, fout) 92 | fout.write("\n") 93 | logging.debug("Writing to log: \n{}\n".format(json.dumps(query))) 94 | return [response_body] 95 | 96 | # Copied from the gunicorn docs. 97 | 98 | 99 | def number_of_workers(): 100 | return (multiprocessing.cpu_count() * 2) + 1 101 | 102 | class StandaloneApplication(gunicorn.app.base.BaseApplication): 103 | """ 104 | Superclassed to allow bookworm to do the running. 105 | """ 106 | def __init__(self, app, options=None): 107 | self.options = options or {} 108 | self.application = app 109 | super(StandaloneApplication, self).__init__() 110 | 111 | def load_config(self): 112 | config = dict([(key, value) for key, value in self.options.items() 113 | if key in self.cfg.settings and value is not None]) 114 | for key, value in config.items(): 115 | self.cfg.set(key.lower(), value) 116 | 117 | def load(self): 118 | return self.application 119 | 120 | def run(port = 10012, workers = number_of_workers()): 121 | if workers==0: 122 | workers = number_of_workers() 123 | 124 | options = { 125 | 'bind': '{}:{}'.format('127.0.0.1', port), 126 | 'workers': workers, 127 | } 128 | 129 | StandaloneApplication(application, options).run() 130 | 131 | -------------------------------------------------------------------------------- /demos/.ipynb_checkpoints/Reading Binary data-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook shows how to read the wordcount data directly from the mysql binary file. This is likely to be the fastest possible way to iterate over the whole thing. It will only work on Bookworms created under certain, undefined processor architectures. Probably anything you're likely to build, though, will work. We're not talking about Mac vs. Linux type differences, but things like default endianness in the processor." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "First we define the source. This is already a sign things are pretty out of hand." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 218, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "source = \"/drobo/mysql/hathipd\"" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 621, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import numpy as np\n", 37 | "import sys\n", 38 | "class BinaryBookworm():\n", 39 | " def __init__(self,source_dir):\n", 40 | " #self.file = open(source + \"/master_bookcounts.MYD\",\"rb\")\n", 41 | " self.memmap = np.memmap(source + \"/master_bookcounts.MYD\",\" 1000000:\n", 160 | " break\n", 161 | "print len(foo)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 562, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "[array([295779, 4986, 6], dtype=int32),\n", 175 | " array([295779, 187140, 1], dtype=int32),\n", 176 | " array([295779, 294054, 2], dtype=int32)]" 177 | ] 178 | }, 179 | "execution_count": 562, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "foo[:3]" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 184, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "ename": "StopIteration", 197 | "evalue": "", 198 | "output_type": "error", 199 | "traceback": [ 200 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 201 | "\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)", 202 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mparse_row\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreadable\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 203 | "\u001b[0;32m\u001b[0m in \u001b[0;36mparse_row\u001b[0;34m(readable)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreadable\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m9\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m\"\\x00\"\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m\"\\xff\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 204 | "\u001b[0;31mStopIteration\u001b[0m: " 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "while True:\n", 210 | " parse_row(readable)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 98, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 
| "read" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 100, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "a = np.ndarray(len(buf), np.dtype('>i1'), buf)\n", 233 | "e = np.zeros(len(buf) / 6, np.dtype('>i4'))\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 104, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [ 243 | { 244 | "ename": "ValueError", 245 | "evalue": "new type not compatible with array.", 246 | "output_type": "error", 247 | "traceback": [ 248 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 249 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 250 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'>i4'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'>i4'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 251 | "\u001b[0;31mValueError\u001b[0m: new type not compatible with array." 
252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "for i in range(3):\n", 257 | " e.view(dtype='>i4')[i + 1::4] = \\\n", 258 | " a.view(dtype='>i4')[i::3]\n" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 113, 264 | "metadata": { 265 | "collapsed": false 266 | }, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "'\\x01\\x00\\x00'" 272 | ] 273 | }, 274 | "execution_count": 113, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "foo[1:4]" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 112, 286 | "metadata": { 287 | "collapsed": false 288 | }, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | "" 294 | ] 295 | }, 296 | "execution_count": 112, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "a.data" 303 | ] 304 | } 305 | ], 306 | "metadata": { 307 | "kernelspec": { 308 | "display_name": "Python 2", 309 | "language": "python", 310 | "name": "python2" 311 | }, 312 | "language_info": { 313 | "codemirror_mode": { 314 | "name": "ipython", 315 | "version": 2 316 | }, 317 | "file_extension": ".py", 318 | "mimetype": "text/x-python", 319 | "name": "python", 320 | "nbconvert_exporter": "python", 321 | "pygments_lexer": "ipython2", 322 | "version": "2.7.12" 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 0 327 | } 328 | -------------------------------------------------------------------------------- /demos/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "% load_ext autoreload\n", 10 | "% autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 6, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from bookwormDB.mariaDB import Query" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 7, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "ename": "BookwormException", 29 | "evalue": "{'code': 400, 'message': 'You must specify a value for database'}", 30 | "output_type": "error", 31 | "traceback": [ 32 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 33 | "\u001b[0;31mBookwormException\u001b[0m Traceback (most recent call last)", 34 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m{\u001b[0m\u001b[0;34m\"plottype\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"pointchart\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"smoothingSpan\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"host\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"http://localhost:10012/\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"words_collation\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"Case_Sensitive\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"database\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"RMP\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"aesthetic\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"y\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"department\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"x\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"WordsPerMillion\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"color\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"gender\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"search_limits\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"word\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"brilliant\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"vega\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"title\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"The most STEM-happy senators\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"transform\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"filter\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"datum.WordsPerMillion > 130\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"groups\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"department\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"gender\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"counttype\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"WordsPerMillion\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"method\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"data\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"format\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"json_c\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mQuery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 35 | "\u001b[0;32m~/bookwormDB/bookwormDB/mariaDB.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, query_object, db, databaseScheme)\u001b[0m\n\u001b[1;32m 122\u001b[0m \u001b[0;31m# Certain constructions require a DB connection already available, so we just start it here, or use the one passed to it.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 124\u001b[0;31m \u001b[0mcheck_query\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery_object\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 125\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprefs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m'database'\u001b[0m\u001b[0;34m:\u001b[0m 
\u001b[0mquery_object\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'database'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 36 | "\u001b[0;32m~/bookwormDB/bookwormDB/mariaDB.py\u001b[0m in \u001b[0;36mcheck_query\u001b[0;34m(query)\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'database'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mquery\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 104\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mBookwormException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"code\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;36m400\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"message\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"You must specify a value for {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 105\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 37 | "\u001b[0;31mBookwormException\u001b[0m: {'code': 400, 'message': 'You must specify a value for database'}" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "test = {\n", 43 | " \"host\":\"http://localhost:10012/\",\"words_collation\":\"Case_Sensitive\",\n", 44 | " \"database\":\"RMP\",\"aesthetic\":{\"y\":\"department\",\"x\":\"WordsPerMillion\",\"color\":\"gender\"},\"search_limits\":{\"word\":[\"brilliant\"]},\"vega\":{\"title\":\"The most STEM-happy senators\",\"transform\":[{\"filter\":\"datum.WordsPerMillion > 130\"}]},\"groups\":[\"department\",\"gender\"],\"counttype\":[\"WordsPerMillion\"],\"method\":\"data\",\"format\":\"json_c\"}\n", 45 | "Query(test)" 46 | ] 47 | } 48 | ], 49 | "metadata": { 50 | "kernelspec": { 51 | "display_name": "Python 3", 52 | "language": "python", 53 | "name": "python3" 54 | }, 55 | "language_info": { 56 | "codemirror_mode": { 57 | "name": "ipython", 58 | "version": 3 59 | }, 60 | "file_extension": ".py", 61 | "mimetype": "text/x-python", 62 | "name": "python", 63 | "nbconvert_exporter": "python", 64 | "pygments_lexer": "ipython3", 65 | "version": "3.7.1" 66 | } 67 | }, 68 | "nbformat": 4, 69 | "nbformat_minor": 2 70 | } 71 | -------------------------------------------------------------------------------- /demos/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "% load_ext autoreload\n", 10 | "% autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 6, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from bookwormDB.mariaDB import Query" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 18, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stderr", 29 | "output_type": "stream", 30 | "text": [ 31 | "WARNING:root:'TRUE'\n" 32 | ] 33 | }, 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "\n", 39 | " SELECT sum(nwords) as WordCount, department, 
gender\n", 40 | " FROM departmentLookup_ NATURAL JOIN fastcat_ NATURAL JOIN genderLookup_ NATURAL JOIN ID_genderheap_\n", 41 | " WHERE\n", 42 | " TRUE \n", 43 | " AND \n", 44 | " TRUE \n", 45 | " AND TRUE \n", 46 | " GROUP BY department__id, gender__id\n", 47 | " \n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "test = {\n", 53 | " \"host\":\"http://localhost:10012/\",\"words_collation\":\"Case_Sensitive\",\n", 54 | " \"database\":\"RMP\", \"search_limits\":{},\n", 55 | " \"groups\":[\"department\",\"gender\"],\"counttype\":[\"WordCount\"],\"method\":\"data\",\"format\":\"json_c\"}\n", 56 | "print(Query(test).query())" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 14, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "ename": "NameError", 66 | "evalue": "name 'SqlFilter' is not defined", 67 | "output_type": "error", 68 | "traceback": [ 69 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 70 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 71 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mlexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlexers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMySqlLexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mlexer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_filter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSqlFilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhighlight\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformatters\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTerminalFormatter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 72 | "\u001b[0;31mNameError\u001b[0m: name 'SqlFilter' is not defined" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "!cd /drobo/" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 15, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "from bookwormDB import CreateDatabase\n", 87 | "z = CreateDatabase.BookwormSQLDatabase(\"RMP\")" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 16, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "ename": "FileNotFoundError", 97 | "evalue": "[Errno 2] No such file or directory: '.bookworm/metadata/field_descriptions_derived.json'", 98 | "output_type": "error", 99 | "traceback": [ 100 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 101 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 102 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mCreateDatabase\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mBookwormSQLDatabase\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"RMP\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 103 | "\u001b[0;32m~/bookwormDB/bookwormDB/CreateDatabase.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, dbname, variableFile)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
141\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mvariableFile\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 142\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetVariables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moriginFile\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvariableFile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mgrantPrivileges\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 104 | "\u001b[0;32m~/bookwormDB/bookwormDB/CreateDatabase.py\u001b[0m in \u001b[0;36msetVariables\u001b[0;34m(self, originFile, anchorField, jsonDefinition)\u001b[0m\n\u001b[1;32m 160\u001b[0m def setVariables(self, originFile, anchorField=\"bookid\",\n\u001b[1;32m 161\u001b[0m jsonDefinition=\".bookworm/metadata/field_descriptions_derived.json\"):\n\u001b[0;32m--> 162\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvariableSet\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvariableSet\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moriginFile\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moriginFile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0manchorField\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0manchorField\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjsonDefinition\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjsonDefinition\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdb\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 163\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mimportNewFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0moriginFile\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0manchorField\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mjsonDefinition\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 105 | "\u001b[0;32m~/bookwormDB/bookwormDB/variableSet.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, originFile, anchorField, jsonDefinition, db)\u001b[0m\n\u001b[1;32m 500\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjsonDefinition\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mguessAtFieldDescriptions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 501\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 502\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjsonDefinition\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfin\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 503\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjsonDefinition\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfin\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 106 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '.bookworm/metadata/field_descriptions_derived.json'" 107 | ] 108 | } 109 | ], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 3", 116 | "language": "python", 117 | "name": "python3" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 3 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython3", 129 | "version": "3.7.1" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 2 134 | } 135 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # This flag says that the code is written to work on both Python 2 and Python 3 | # 3. If at all possible, it is good practice to do this. If you cannot, you 4 | # will need to generate wheels for each Python version that you support. 5 | universal=0 6 | 7 | [flake8] 8 | ignore= E231, E501 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | setup( 6 | name='bookwormDB', 7 | packages=["bookwormDB"], 8 | version='1.0', 9 | entry_points={ 10 | 'console_scripts': [ 11 | 'bookworm = bookwormDB.manager:run_arguments' 12 | ], 13 | }, 14 | description="Create, deploy, and serve a Bookworm instance.", 15 | long_description="\n".join(open("README.rst").readlines()), 16 | package_data={'bookwormDB':['etc/*','bin/*']}, 17 | url="http://github.com/Bookworm-Project", 18 | author="Benjamin Schmidt", 19 | author_email="bmschmidt@gmail.com", 20 | license="MIT", 21 | classifiers=[ 22 | 'Development Status :: 4 - Beta', 23 | 'Intended Audience :: Developers', 24 | 'Intended Audience :: Education', 25 | "Natural Language :: English", 26 | # Pick your license as you wish (should match "license" above) 27 | 'License :: OSI Approved :: MIT License', 28 | "Operating System :: Unix", 29 | # Specify the Python versions you support here. In particular, ensure 30 | # that you indicate whether you support Python 2, Python 3 or both. 
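        # (In practice this package targets Python 3 only: setup.cfg sets
        # universal=0 and CI runs the tests under Python 3.6.)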
31 |         'Programming Language :: Python :: 3.6',
32 |         'Programming Language :: Python :: 3.7',
33 |         "Topic :: Sociology :: History",
34 |         "Topic :: Text Processing :: Indexing",
35 |         "Topic :: Text Processing :: Linguistic"
36 |     ],
37 |     install_requires=["numpy","pandas","mysqlclient",
38 |                       "python-dateutil", "psutil", "bounter",
39 |                       "gunicorn"
40 |                       ]
41 | )
42 | 
--------------------------------------------------------------------------------
/tests/setup.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import bookwormDB
3 | import bookwormDB.CreateDatabase
4 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
5 | import logging
6 | import os
7 | from subprocess import call as call
8 | import sys
9 | import json
10 | from shutil import rmtree
11 | 
12 | def setup_bookworm():
13 |     """
14 |     Creates a test bookworm. Removes any existing database called "federalist_bookworm".
15 |     """
16 |     logging.info("\n\nTESTING BOOKWORM CREATION\n\n")
17 |     import MySQLdb
18 |     from warnings import filterwarnings
19 |     filterwarnings('ignore', category = MySQLdb.Warning)
20 | 
21 |     import bookwormDB.configuration
22 |     os.chdir(sys.path[0] + "/test_bookworm_files")
23 |     rmtree(".bookworm", ignore_errors = True)
24 | 
25 |     bookwormDB.configuration.create(ask_about_defaults=False, database="federalist_bookworm")
26 | 
27 |     db = bookwormDB.CreateDatabase.DB(dbname="mysql")
28 | 
29 |     try:
30 |         db.query("DROP DATABASE IF EXISTS federalist_bookworm")
31 |     except MySQLdb.OperationalError as e:
32 |         if e.args[0]==1008:  # 1008: the database doesn't exist.
33 |             pass
34 |         else:
35 |             print(e)
36 |             raise
37 |     except Exception as e:
38 |         """
39 |         This is some weird MariaDB exception. It sucks that I'm compensating for it here.
40 |         """
41 |         if e.args[0]=="Cannot load from mysql.proc. The table is probably corrupted":
42 |             pass
43 |         else:
44 |             print(e)
45 |             logging.warning("Some mysterious error in attempting to drop previous iterations: just try running it again?")
46 | 
47 |     call(["bookworm --log-level warning build all"],shell=True,cwd=sys.path[0] + "/test_bookworm_files")
48 | 
49 | 
50 | def setup_bookworm_unicode():
51 |     """
52 |     Creates a test bookworm. Removes any existing database called "unicode_test_bookworm".
53 |     """
54 |     logging.info("\n\nTESTING BOOKWORM CREATION\n\n")
55 |     import MySQLdb
56 |     from warnings import filterwarnings
57 |     filterwarnings('ignore', category = MySQLdb.Warning)
58 | 
59 |     import bookwormDB.configuration
60 |     os.chdir(sys.path[0] + "/test_bookworm_files_unicode")
61 |     rmtree(".bookworm", ignore_errors = True)
62 | 
63 |     bookwormDB.configuration.create(ask_about_defaults=False,database="unicode_test_bookworm")
64 | 
65 |     db = bookwormDB.CreateDatabase.DB(dbname="mysql")
66 | 
67 |     try:
68 |         db.query("DROP DATABASE IF EXISTS unicode_test_bookworm")
69 |     except MySQLdb.OperationalError as e:
70 |         if e.args[0]==1008:  # 1008: the database doesn't exist.
71 |             pass
72 |         else:
73 |             print(e)
74 |             raise
75 |     except Exception as e:
76 |         """
77 |         This is some weird MariaDB exception. It sucks that I'm compensating for it here.
78 |         """
79 |         if e.args[0]=="Cannot load from mysql.proc.
The table is probably corrupted":
80 |             pass
81 |         else:
82 |             logging.warning("Some mysterious error in attempting to drop previous iterations: just try running it again?")
83 | 
84 |     call(["bookworm --log-level warning build all"],
85 |          shell=True,
86 |          cwd=sys.path[0] + "/test_bookworm_files_unicode")
87 | 
88 | 
89 | if __name__=="__main__":
90 |     setup_bookworm()
91 |     setup_bookworm_unicode()
92 | 
--------------------------------------------------------------------------------
/tests/test_API.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from builtins import range
4 | from builtins import object
5 | import unittest
6 | import bookwormDB
7 | import bookwormDB.CreateDatabase
8 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
9 | import logging
10 | import os
11 | from subprocess import call as call
12 | import sys
13 | import json
14 | from setup import setup_bookworm, setup_bookworm_unicode
15 | 
16 | class Bookworm_SQL_Creation(unittest.TestCase):
17 | 
18 |     def test_bookworm_files_exist(self):
19 |         bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase("federalist_bookworm")
20 |         db = bookworm.db
21 |         db.query("USE federalist_bookworm")
22 |         wordCount = db.query("SELECT SUM(nwords) FROM fastcat_").fetchall()[0][0]
23 |         # This should be 212,081, but I don't want the tests to start failing when
24 |         # we change the tokenization rules or miscellaneous things about encoding.
25 |         self.assertTrue(wordCount>100000)
26 |         """
27 |         Then we test whether the API can make queries on that bookworm.
28 |         """
29 | 
30 |     def test_API(self):
31 |         from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
32 |         import json
33 | 
34 |         query = {
35 |                 "database":"federalist_bookworm",
36 |                 "search_limits":{},
37 |                 "counttype":"TextPercent",
38 |                 "groups":["author"],
39 |                 "method":"data", "format":"json"
40 |         }
41 | 
42 |         m = json.loads(SQLAPIcall(query).execute())['data']
43 |         self.assertEqual(len(m),5)
44 | 
45 | 
46 |     def test_multiword_search(self):
47 |         from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
48 |         import json
49 | 
50 |         query = {
51 |                 "database":"federalist_bookworm",
52 |                 "search_limits":{"word":["on","upon"]},
53 |                 "counttype":"TextPercent",
54 |                 "method":"data", "format":"json",
55 |                 "groups": []
56 |         }
57 | 
58 |         m = json.loads(SQLAPIcall(query).execute())['data']
59 |         self.assertTrue(m[0] > 33)
60 | 
61 |     def test_ne_with_one_entry(self):
62 |         from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
63 |         import json
64 | 
65 |         query = {
66 |                 "database":"federalist_bookworm",
67 |                 "search_limits":{
68 |                     "author": {"$ne": ["HAMILTON"]}
69 |                 },
70 |                 "counttype":"TextPercent",
71 |                 "groups":["author"],
72 |                 "method":"data", "format":"json"
73 |         }
74 | 
75 |         m = json.loads(SQLAPIcall(query).execute())['data']
76 |         self.assertTrue(len(m)==4)
77 | 
78 |     def test_ne_with_two_entries(self):
79 |         from bookwormDB.general_API import SQLAPIcall as SQLAPIcall
80 |         import json
81 | 
82 |         query = {
83 |                 "database":"federalist_bookworm",
84 |                 "search_limits":{
85 |                     "author": {"$ne": ["HAMILTON","DISPUTED"]}
86 |                 },
87 |                 "counttype":"TextPercent",
88 |                 "groups":["author"],
89 |                 "method":"data", "format":"json"
90 |         }
91 | 
92 |         m = json.loads(SQLAPIcall(query).execute())['data']
93 |         self.assertTrue(len(m)==3)
94 | 
95 | 
"search_limits":{ 103 | "author": {"$ne": ["HAMILTON","DISPUTED"]} 104 | }, 105 | "counttype":"TextPercent", 106 | "groups":["author"], 107 | "method":"data", "format":"json" 108 | } 109 | 110 | m = json.loads(SQLAPIcall(query).execute())['data'] 111 | self.assertTrue(len(m)==3) 112 | 113 | 114 | def test_or_with_two_entries(self): 115 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall 116 | import json 117 | 118 | query = { 119 | "database":"federalist_bookworm", 120 | "search_limits":{ 121 | "$or": [ 122 | {"author": ["HAMILTON"]}, 123 | {"author": ["DISPUTED"]} 124 | ] 125 | }, 126 | "counttype":"TextCount", 127 | "groups":["author"], 128 | "method":"data", "format":"json" 129 | } 130 | 131 | m = json.loads(SQLAPIcall(query).execute())['data'] 132 | self.assertEqual(len(m),2) 133 | 134 | def test_lte_and_gte(self): 135 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall 136 | import json 137 | 138 | query = { 139 | "database":"federalist_bookworm", 140 | "search_limits":{ 141 | "fedNumber":{"$lte":10,"$gte":5} 142 | }, 143 | "counttype":"TextCount", 144 | "groups":["fedNumber"], 145 | "method":"data", "format":"json" 146 | } 147 | 148 | m = json.loads(SQLAPIcall(query).execute())['data'] 149 | self.assertTrue(len(m)==6) 150 | 151 | def test_and_with_two_entries(self): 152 | from bookwormDB.general_API import SQLAPIcall as SQLAPIcall 153 | import json 154 | 155 | query = { 156 | "database":"federalist_bookworm", 157 | "search_limits":{ 158 | "$and": [ 159 | {"author": ["HAMILTON"]}, 160 | {"fedNumber":[40]} 161 | ] 162 | }, 163 | "counttype":"TextCount", 164 | "groups":["author"], 165 | "method":"data", "format":"json" 166 | } 167 | 168 | m = json.loads(SQLAPIcall(query).execute())['data'] 169 | self.assertTrue(len(m)==0) 170 | 171 | def test_adding_metadata_to_bookworm(self): 172 | """ 173 | Build out some dummy metadata: label the difference 174 | between even and odd paragrahs. 175 | """ 176 | 177 | from bookwormDB.manager import BookwormManager 178 | manager = BookwormManager(database="federalist_bookworm") 179 | 180 | # Create a phony derived field to test metadata supplementing 181 | 182 | 183 | def even_even(number): 184 | if number % 2 == 0: 185 | return "even" 186 | return "odd" 187 | 188 | tmp_file = "{}/test_bookworm_metadata.tsv".format(sys.path[0]) 189 | 190 | with open(tmp_file,"w") as newMetadata: 191 | newMetadata.write("paragraphNumber\toddness\n") 192 | for n in range(500): 193 | newMetadata.write("%d\t%s\n" %(n,even_even(n))) 194 | 195 | class Dummy(object): 196 | """ 197 | Just quickly create a namespace to stand in for the command-line args. 198 | """ 199 | key = "paragraphNumber" 200 | format = "tsv" 201 | file = tmp_file 202 | # Test the guessing at field_descriptions while we're at it 203 | field_descriptions = None 204 | 205 | import os 206 | manager.add_metadata(Dummy) 207 | 208 | """ 209 | And then we test if that can be retrieved 210 | """ 211 | 212 | query = { 213 | "database":"federalist_bookworm", 214 | "search_limits":{}, 215 | "counttype":"TextCount", 216 | "groups":["oddness"], 217 | "method":"data", "format":"json" 218 | } 219 | 220 | SQLAPIcall(query) 221 | m = json.loads(SQLAPIcall(query).execute())['data'] 222 | # Even or odd is one of two things. 223 | self.assertTrue(len(m)==2) 224 | 225 | # Since the first paragraph is odd, 226 | # there should be more of those. 
212 | def test_case_sensitivity(self): 213 | query = { 214 | "database":"federalist_bookworm", 215 | "search_limits":{"word":["the"]}, 216 | "counttype":"WordCount", 217 | "groups":[], 218 | "words_collation":"Case_Sensitive", 219 | "method":"data", "format":"json" 220 | } 221 | 222 | SQLAPIcall(query) 223 | val1 = json.loads(SQLAPIcall(query).execute())['data'] 224 | self.assertTrue(val1[0] > 0) 225 | 226 | query["words_collation"] = "Case_Insensitive" 227 | 228 | SQLAPIcall(query) 229 | val2 = json.loads(SQLAPIcall(query).execute())['data'] 230 | # The words ('The','the') together appear more often than ('the') alone. 231 | self.assertTrue(val2[0] > val1[0]) 232 | 233 | 234 | def test_case_insensitive_search_with_mixed_case_term(self): 235 | query = { 236 | "database":"federalist_bookworm", 237 | "search_limits":{"word":["hOwEvEr"]}, 238 | "counttype":"WordCount", 239 | "groups":[], 240 | "words_collation":"Case_Insensitive", 241 | "method":"data", "format":"json" 242 | } 243 | SQLAPIcall(query) 244 | val1 = json.loads(SQLAPIcall(query).execute())['data'] 245 | self.assertTrue(val1[0] > 0) 246 | 247 | def test_unicode_search_term(self): 248 | query = { 249 | "database":"unicode_test_bookworm", 250 | "search_limits":{"word":[u"ᎾᏍᎩ"]}, 251 | "counttype":"WordCount", 252 | "groups":[], 253 | "words_collation":"Case_Insensitive", 254 | "method":"data", "format":"json" 255 | } 256 | SQLAPIcall(query) 257 | val1 = json.loads(SQLAPIcall(query).execute())['data'] 258 | self.assertTrue(val1[0] > 0) 259 | 260 | def test_various_unicode_cases(self): 261 | # There's a 'description_' for each individual item. 262 | catalog_location = sys.path[0] + "/test_bookworm_files_unicode/jsoncatalog.txt" 263 | cases = [json.loads(line)["description_"] for line in open(catalog_location)] 264 | for case in cases: 265 | query = { 266 | "database":"unicode_test_bookworm", 267 | "search_limits":{"description_":case}, 268 | "counttype":"WordCount", 269 | "groups":[], 270 | "words_collation":"Case_Insensitive", 271 | "method":"data", "format":"json" 272 | } 273 | SQLAPIcall(query) 274 | val1 = json.loads(SQLAPIcall(query).execute())['data'] 275 | self.assertTrue(val1[0] > 0) 276 | 277 | def test_asterisks_in_search_limits(self): 278 | """ 279 | The following two queries should, by definition, produce the same result. 280 | """ 281 | query = { 282 | "database":"federalist_bookworm", 283 | "search_limits":{"word":["on"],"author":["HAMILTON"]}, 284 | "compare_limits":{"word":["on"]}, 285 | "counttype":"WordsPerMillion", 286 | "groups":[], 287 | "method":"data", "format":"json" 288 | } 289 | val1 = json.loads(SQLAPIcall(query).execute())['data'] 290 | 291 | query = { 292 | "database":"federalist_bookworm", 293 | "search_limits":{"word":["on"],"*author":["HAMILTON"]}, 294 | "counttype":"WordsPerMillion", 295 | "groups":[], 296 | "method":"data", "format":"json" 297 | } 298 | val2 = json.loads(SQLAPIcall(query).execute())['data'] 299 | self.assertEqual(val1[0], val2[0]) 300 | 301 |
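# Taken together, the tests above pin down the operator vocabulary accepted
# inside search_limits: bare lists for membership, {"$ne": [...]} for
# exclusion, {"$lte": n, "$gte": n} for numeric ranges, top-level "$and"/"$or"
# clauses for combining per-field limits, and a "*" field prefix that the
# asterisk test equates with supplying an explicit compare_limits. A
# hypothetical query combining several operators (illustrative only; nothing
# in this file asserts on it):
#
#     {
#         "database": "federalist_bookworm",
#         "search_limits": {
#             "author": {"$ne": ["DISPUTED"]},
#             "fedNumber": {"$gte": 1, "$lte": 30}
#         },
#         "counttype": "TextCount",
#         "groups": ["author"],
#         "method": "data", "format": "json"
#     }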
302 | """ 303 | class SQLConnections(unittest.TestCase): 304 | 305 | 306 | 307 | def test_dunning(self): 308 | query = { 309 | "database":"federalist", 310 | "search_limits":{"author":"Hamilton"}, 311 | "compare_limits":{"author":"Madison"}, 312 | "counttype":"Dunning", 313 | "groups":["unigram"], 314 | "method":"data", "format":"json" 315 | } 316 | 317 | 318 | try: 319 | #dbbindings.main(query) 320 | worked = True 321 | except: 322 | worked = False 323 | 324 | self.assertTrue(worked) 325 | """ 326 | 327 | 328 | if __name__=="__main__": 329 | # Run the setup quietly first; if anything fails, turn on debug 330 | # logging and try it once more. 331 | logging.basicConfig(level=logging.ERROR) 332 | try: 333 | setup_bookworm() 334 | setup_bookworm_unicode() 335 | except Exception: 336 | # basicConfig is a no-op once the root logger is configured, 337 | # so raise the verbosity on the root logger directly. 338 | logging.getLogger().setLevel(logging.DEBUG) 339 | setup_bookworm() 340 | setup_bookworm_unicode() 341 | logging.getLogger().setLevel(logging.DEBUG) 342 | unittest.main() 343 | -------------------------------------------------------------------------------- /tests/test_bookworm_files/field_descriptions.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"datatype": "searchstring", "field": "searchstring", "unique": true, "type": "text"}, 3 | {"datatype": "categorical", "field": "title", "unique": true, "type": "text"}, 4 | {"datatype": "categorical", "field": "author", "unique": true, "type": "text"}, 5 | {"datatype": "categorical", "field": "fedNumber", "unique": true, "type": "text"}, 6 | {"datatype": "categorical", "field": "paragraphNumber", "unique": true, "type": "text"}, 7 | {"datatype": "time", "field": "date", "unique": true, "type": "text", "derived":[{"resolution":"year"},{"resolution":"month"},{"resolution":"day"},{"resolution":"week","aggregate":"year"}]} 8 | ] 9 | -------------------------------------------------------------------------------- /tests/test_bookworm_files/test_bookworm_metadata.tsv: -------------------------------------------------------------------------------- 1 | paragraphNumber oddness 2 | 0 even 3 | 1 odd 4 | 2 even 5 | 3 odd 6 | 4 even 7 | 5 odd 8 | 6 even 9 | 7 odd 10 | 8 even 11 | 9 odd 12 | 10 even 13 | 11 odd 14 | 12 even 15 | 13 odd 16 | 14 even 17 | 15 odd 18 | 16 even 19 | 17 odd 20 | 18 even 21 | 19 odd 22 | 20 even 23 | 21 odd 24 | 22 even 25 | 23 odd 26 | 24 even 27 | 25 odd 28 | 26 even 29 | 27 odd 30 | 28 even 31 | 29 odd 32 | 30 even 33 | 31 odd 34 | 32 even 35 | 33 odd 36 | 34 even 37 | 35 odd 38 | 36 even 39 | 37 odd 40 | 38 even 41 | 39 odd 42 | 40 even 43 | 41 odd 44 | 42 even 45 | 43 odd 46 | 44 even 47 | 45 odd 48 | 46 even 49 | 47 odd 50 | 48 even 51 | 49 odd 52 | 50 even 53 | 51 odd 54 | 52 even 55 | 53 odd 56 | 54 even 57 | 55 odd 58 | 56 even 59 | 57 odd 60 | 58 even 61 | 59 odd 62 | 60 even 63 | 61 odd 64 | 62 even 65 | 63 odd 66 | 64 even 67 | 65 odd 68 | 66 even 69 | 67 odd 70 | 68 even 71 | 69 odd 72 | 70
even 73 | 71 odd 74 | 72 even 75 | 73 odd 76 | 74 even 77 | 75 odd 78 | 76 even 79 | 77 odd 80 | 78 even 81 | 79 odd 82 | 80 even 83 | 81 odd 84 | 82 even 85 | 83 odd 86 | 84 even 87 | 85 odd 88 | 86 even 89 | 87 odd 90 | 88 even 91 | 89 odd 92 | 90 even 93 | 91 odd 94 | 92 even 95 | 93 odd 96 | 94 even 97 | 95 odd 98 | 96 even 99 | 97 odd 100 | 98 even 101 | 99 odd 102 | 100 even 103 | 101 odd 104 | 102 even 105 | 103 odd 106 | 104 even 107 | 105 odd 108 | 106 even 109 | 107 odd 110 | 108 even 111 | 109 odd 112 | 110 even 113 | 111 odd 114 | 112 even 115 | 113 odd 116 | 114 even 117 | 115 odd 118 | 116 even 119 | 117 odd 120 | 118 even 121 | 119 odd 122 | 120 even 123 | 121 odd 124 | 122 even 125 | 123 odd 126 | 124 even 127 | 125 odd 128 | 126 even 129 | 127 odd 130 | 128 even 131 | 129 odd 132 | 130 even 133 | 131 odd 134 | 132 even 135 | 133 odd 136 | 134 even 137 | 135 odd 138 | 136 even 139 | 137 odd 140 | 138 even 141 | 139 odd 142 | 140 even 143 | 141 odd 144 | 142 even 145 | 143 odd 146 | 144 even 147 | 145 odd 148 | 146 even 149 | 147 odd 150 | 148 even 151 | 149 odd 152 | 150 even 153 | 151 odd 154 | 152 even 155 | 153 odd 156 | 154 even 157 | 155 odd 158 | 156 even 159 | 157 odd 160 | 158 even 161 | 159 odd 162 | 160 even 163 | 161 odd 164 | 162 even 165 | 163 odd 166 | 164 even 167 | 165 odd 168 | 166 even 169 | 167 odd 170 | 168 even 171 | 169 odd 172 | 170 even 173 | 171 odd 174 | 172 even 175 | 173 odd 176 | 174 even 177 | 175 odd 178 | 176 even 179 | 177 odd 180 | 178 even 181 | 179 odd 182 | 180 even 183 | 181 odd 184 | 182 even 185 | 183 odd 186 | 184 even 187 | 185 odd 188 | 186 even 189 | 187 odd 190 | 188 even 191 | 189 odd 192 | 190 even 193 | 191 odd 194 | 192 even 195 | 193 odd 196 | 194 even 197 | 195 odd 198 | 196 even 199 | 197 odd 200 | 198 even 201 | 199 odd 202 | 200 even 203 | 201 odd 204 | 202 even 205 | 203 odd 206 | 204 even 207 | 205 odd 208 | 206 even 209 | 207 odd 210 | 208 even 211 | 209 odd 212 | 210 even 213 | 211 odd 214 | 212 even 215 | 213 odd 216 | 214 even 217 | 215 odd 218 | 216 even 219 | 217 odd 220 | 218 even 221 | 219 odd 222 | 220 even 223 | 221 odd 224 | 222 even 225 | 223 odd 226 | 224 even 227 | 225 odd 228 | 226 even 229 | 227 odd 230 | 228 even 231 | 229 odd 232 | 230 even 233 | 231 odd 234 | 232 even 235 | 233 odd 236 | 234 even 237 | 235 odd 238 | 236 even 239 | 237 odd 240 | 238 even 241 | 239 odd 242 | 240 even 243 | 241 odd 244 | 242 even 245 | 243 odd 246 | 244 even 247 | 245 odd 248 | 246 even 249 | 247 odd 250 | 248 even 251 | 249 odd 252 | 250 even 253 | 251 odd 254 | 252 even 255 | 253 odd 256 | 254 even 257 | 255 odd 258 | 256 even 259 | 257 odd 260 | 258 even 261 | 259 odd 262 | 260 even 263 | 261 odd 264 | 262 even 265 | 263 odd 266 | 264 even 267 | 265 odd 268 | 266 even 269 | 267 odd 270 | 268 even 271 | 269 odd 272 | 270 even 273 | 271 odd 274 | 272 even 275 | 273 odd 276 | 274 even 277 | 275 odd 278 | 276 even 279 | 277 odd 280 | 278 even 281 | 279 odd 282 | 280 even 283 | 281 odd 284 | 282 even 285 | 283 odd 286 | 284 even 287 | 285 odd 288 | 286 even 289 | 287 odd 290 | 288 even 291 | 289 odd 292 | 290 even 293 | 291 odd 294 | 292 even 295 | 293 odd 296 | 294 even 297 | 295 odd 298 | 296 even 299 | 297 odd 300 | 298 even 301 | 299 odd 302 | 300 even 303 | 301 odd 304 | 302 even 305 | 303 odd 306 | 304 even 307 | 305 odd 308 | 306 even 309 | 307 odd 310 | 308 even 311 | 309 odd 312 | 310 even 313 | 311 odd 314 | 312 even 315 | 313 odd 316 | 314 even 317 | 315 odd 318 | 316 even 319 | 317 odd 320 | 318 even 321 | 319 
odd 322 | 320 even 323 | 321 odd 324 | 322 even 325 | 323 odd 326 | 324 even 327 | 325 odd 328 | 326 even 329 | 327 odd 330 | 328 even 331 | 329 odd 332 | 330 even 333 | 331 odd 334 | 332 even 335 | 333 odd 336 | 334 even 337 | 335 odd 338 | 336 even 339 | 337 odd 340 | 338 even 341 | 339 odd 342 | 340 even 343 | 341 odd 344 | 342 even 345 | 343 odd 346 | 344 even 347 | 345 odd 348 | 346 even 349 | 347 odd 350 | 348 even 351 | 349 odd 352 | 350 even 353 | 351 odd 354 | 352 even 355 | 353 odd 356 | 354 even 357 | 355 odd 358 | 356 even 359 | 357 odd 360 | 358 even 361 | 359 odd 362 | 360 even 363 | 361 odd 364 | 362 even 365 | 363 odd 366 | 364 even 367 | 365 odd 368 | 366 even 369 | 367 odd 370 | 368 even 371 | 369 odd 372 | 370 even 373 | 371 odd 374 | 372 even 375 | 373 odd 376 | 374 even 377 | 375 odd 378 | 376 even 379 | 377 odd 380 | 378 even 381 | 379 odd 382 | 380 even 383 | 381 odd 384 | 382 even 385 | 383 odd 386 | 384 even 387 | 385 odd 388 | 386 even 389 | 387 odd 390 | 388 even 391 | 389 odd 392 | 390 even 393 | 391 odd 394 | 392 even 395 | 393 odd 396 | 394 even 397 | 395 odd 398 | 396 even 399 | 397 odd 400 | 398 even 401 | 399 odd 402 | 400 even 403 | 401 odd 404 | 402 even 405 | 403 odd 406 | 404 even 407 | 405 odd 408 | 406 even 409 | 407 odd 410 | 408 even 411 | 409 odd 412 | 410 even 413 | 411 odd 414 | 412 even 415 | 413 odd 416 | 414 even 417 | 415 odd 418 | 416 even 419 | 417 odd 420 | 418 even 421 | 419 odd 422 | 420 even 423 | 421 odd 424 | 422 even 425 | 423 odd 426 | 424 even 427 | 425 odd 428 | 426 even 429 | 427 odd 430 | 428 even 431 | 429 odd 432 | 430 even 433 | 431 odd 434 | 432 even 435 | 433 odd 436 | 434 even 437 | 435 odd 438 | 436 even 439 | 437 odd 440 | 438 even 441 | 439 odd 442 | 440 even 443 | 441 odd 444 | 442 even 445 | 443 odd 446 | 444 even 447 | 445 odd 448 | 446 even 449 | 447 odd 450 | 448 even 451 | 449 odd 452 | 450 even 453 | 451 odd 454 | 452 even 455 | 453 odd 456 | 454 even 457 | 455 odd 458 | 456 even 459 | 457 odd 460 | 458 even 461 | 459 odd 462 | 460 even 463 | 461 odd 464 | 462 even 465 | 463 odd 466 | 464 even 467 | 465 odd 468 | 466 even 469 | 467 odd 470 | 468 even 471 | 469 odd 472 | 470 even 473 | 471 odd 474 | 472 even 475 | 473 odd 476 | 474 even 477 | 475 odd 478 | 476 even 479 | 477 odd 480 | 478 even 481 | 479 odd 482 | 480 even 483 | 481 odd 484 | 482 even 485 | 483 odd 486 | 484 even 487 | 485 odd 488 | 486 even 489 | 487 odd 490 | 488 even 491 | 489 odd 492 | 490 even 493 | 491 odd 494 | 492 even 495 | 493 odd 496 | 494 even 497 | 495 odd 498 | 496 even 499 | 497 odd 500 | 498 even 501 | 499 odd 502 | -------------------------------------------------------------------------------- /tests/test_bookworm_files_unicode/field_descriptions.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"datatype": "searchstring", "field": "searchstring", "unique": true, "type": "text"}, 3 | {"datatype": "categorical", "field": "language", "unique": true, "type": "text"}, 4 | {"datatype": "categorical", "field": "description_", "unique": true, "type": "text"} 5 | ] 6 | -------------------------------------------------------------------------------- /tests/test_bookworm_files_unicode/input.txt: -------------------------------------------------------------------------------- 1 | john_1 ᏗᏓᎴᏂᏍᎬ ᎧᏃᎮᏛ ᎡᎮᎢ, ᎠᎴ ᎾᏍᎩ ᎧᏃᎮᏛ ᎤᏁᎳᏅᎯ ᎢᏧᎳᎭ ᎠᏁᎮᎢ, ᎠᎴ ᎾᏍᎩ ᎧᏃᎮᏛ ᎤᏁᎳᏅᎯ ᎨᏎᎢ. 
2 | quran_2 "بِسْمِ اللَّـهِ الرَّحْمَـٰنِ الرَّحِيمِ" 3 | quran_1 بِسْمِ اللَّـهِ الرَّحْمَـٰنِ الرَّحِيمِ 4 | سورة الفاتحة بِسْمِ اللَّـهِ الرَّحْمَـٰنِ الرَّحِيمِ 5 | سورة In the name of Allah, the Entirely Merciful, the Especially Merciful 6 | -------------------------------------------------------------------------------- /tests/test_bookworm_files_unicode/jsoncatalog.txt: -------------------------------------------------------------------------------- 1 | {"filename":"john_1", "language":"cherokee","searchstring":"Cherokee bible verse","description_":"Cherokee Bible Verse"} 2 | {"filename":"quran_1", "language":"arabic","searchstring":"Quran verse 1","description_": "Arabic text"} 3 | {"filename":"quran_2", "language":"arabic","searchstring":"Quran verse 1 in quotes","description_": "Arabic text in ASCII quotes"} 4 | {"filename":"سورة الفاتحة", "language":"arabic","searchstring":"Quran verse 1 with arabic filename","description_":"Arabic Filename with Arabic text"} 5 | {"filename":"سورة", "language":"english","searchstring":"Quran verse 1 in English with arabic filename","description_":"Arabic Filename with English text"} 6 | --------------------------------------------------------------------------------
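Together, these unicode fixtures document the input format that the `bookworm ... build all` call in tests/setup.py consumes: input.txt holds one document per line, an identifier followed by the full text (presumably tab-separated in the original file; this flattened listing collapses the whitespace), while jsoncatalog.txt holds one JSON object per line whose "filename" value matches an identifier in input.txt and whose remaining keys supply the metadata fields declared in field_descriptions.json. A minimal hypothetical pair, with an invented identifier and values, for illustration only:

    input.txt:        doc_1<TAB>Some text to index.
    jsoncatalog.txt:  {"filename": "doc_1", "language": "english", "searchstring": "an example", "description_": "An invented example"}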
/tests/test_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from bookwormDB.manager import BookwormManager 4 | import unittest 5 | 6 | class Bookworm_Configuration(unittest.TestCase): 7 | 8 | def test_config(self): 9 | # Constructing the manager is the whole test: it exercises the 10 | # configuration-parsing path and raises if the config cannot be read. 11 | bookworm = BookwormManager(None, "federalist_bookworm") 12 | self.assertIsNotNone(bookworm) 13 | 14 | 15 | if __name__=="__main__": 16 | unittest.main() 17 | -------------------------------------------------------------------------------- /tests/test_mysql.py: -------------------------------------------------------------------------------- 1 | from builtins import hex 2 | import unittest 3 | import bookwormDB 4 | from bookwormDB.configuration import Configfile 5 | import bookwormDB.CreateDatabase 6 | import logging 7 | import MySQLdb 8 | import random 9 | 10 | logging.basicConfig(level=logging.DEBUG) 11 | 12 | 13 | """ 14 | Tests of the MySQL configuration. 15 | """ 16 | 17 | class Bookworm_MySQL_Configuration(unittest.TestCase): 18 | def test_server_connection(self): 19 | """ 20 | Connect to MySQL and run a simple query. 21 | """ 22 | logging.info("\n\nTESTING SERVER CONNECTION\n\n") 23 | import bookwormDB.CreateDatabase 24 | db = bookwormDB.CreateDatabase.DB(dbname="mysql") 25 | sampleQuery = db.query("SELECT 1+1").fetchall() 26 | self.assertEqual(sampleQuery[0][0], 2) 27 | 28 | """ 29 | To properly test things, we actually build some bookworms. This assumes 30 | that the directory '/tmp' is writeable, though that isn't strictly 31 | necessary for a bookworm to be built. 32 | """ 33 | 34 | def test_config_files(self): 35 | logging.info("\n\nTESTING CONFIG FILE ACCESS\n\n") 36 | def read_credentials(conf): 37 | user = conf.config.get("client","user") 38 | pw = conf.config.get("client","password") 39 | return (user,pw) 40 | 41 | global_configuration_file = Configfile("read_only") 42 | admin_configuration_file = Configfile("admin") 43 | 44 | # The "read_only" configuration holds the client credentials; the 45 | # "admin" configuration holds the administrative ones. 46 | (client_user,client_pw) = read_credentials(global_configuration_file) 47 | (admin_user,admin_pw) = read_credentials(admin_configuration_file) 48 | # Log only the usernames; the passwords shouldn't end up in CI logs. 49 | logging.info("admin user is {}".format(admin_user)) 50 | logging.info("client user is {}".format(client_user)) 51 | logging.info("Checking that admin and client users are distinct") 52 | self.assertNotEqual(admin_user, client_user) 53 | 54 | def test_createDB_permission(self): 55 | logging.info("\nTESTING ABILITY TO CREATE DATABASES\n\n") 56 | import bookwormDB.configuration 57 | # Under Python 3, hex() adds no trailing 'L', so keep everything 58 | # after the '0x' prefix. 59 | dbname = "A" + hex(random.getrandbits(128))[2:] 60 | import bookwormDB.CreateDatabase 61 | db = bookwormDB.CreateDatabase.DB(dbname="mysql") 62 | cursor = db.query("CREATE DATABASE {}".format(dbname)) 63 | cursor.execute("DROP DATABASE {}".format(dbname)) 64 | cursor.close() 65 | 66 | 67 | if __name__=="__main__": 68 | unittest.main() 69 | --------------------------------------------------------------------------------