├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── README.md
├── conf.py
├── examples
    ├── datashader_example.py
    ├── extract_sample.py
    ├── parse_wikipedia_history.py
    ├── top_items.py
    ├── tutorial_simple_traildb.py
    └── tutorial_wikipedia_sessions.py
├── index.rst
├── requirements.txt
├── runtests.sh
├── setup.py
├── test
    └── test.py
├── traildb
    ├── __init__.py
    ├── __main__.py
    └── traildb.py
└── travisdeps.sh


/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | /build/
3 | *.pyc
4 | _build/
5 | test/*.tdb
6 | 
7 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: required
 2 | 
 3 | language: python
 4 | 
 5 | python:
 6 |   - 2.7
 7 |   - 3.3
 8 |   - 3.4
 9 |   - 3.5
10 |   - 3.6
11 |   - pypy
12 |   - pypy3.5
13 | 
14 | before_install:
15 |   - ./travisdeps.sh
16 | 
17 | script:
18 |   - ./runtests.sh


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM c3h3/traildb-base:latest
 2 | 
 3 | # pyenv image
 4 | 
 5 | ENV HOME /root
 6 | ENV PYENVPATH $HOME/.pyenv
 7 | ENV PATH $PYENVPATH/shims:$PYENVPATH/bin:$PATH
 8 | 
 9 | RUN curl -L https://raw.githubusercontent.com/yyuu/pyenv-installer/master/bin/pyenv-installer | bash
10 | RUN echo 'eval "$(pyenv init -)"' >  /root/.bashrc
11 | 
12 | 
13 | EXPOSE 8888
14 | 
15 | RUN pyenv update && pyenv install anaconda-2.3.0 && pyenv global anaconda-2.3.0 && ipython profile create
16 | 
17 | RUN (echo "require(['base/js/namespace'], function (IPython) {" && \
18 |  echo "  IPython._target = '_self';" && \
19 |  echo "});") > /root/.ipython/profile_default/static/custom/custom.js
20 | 
21 | 
22 | RUN (echo "c = get_config()" && \
23 |      echo "headers = {'Content-Security-Policy': 'frame-ancestors *'}" && \
24 |      echo "c.NotebookApp.allow_origin = '*'" && \
25 |      echo "c.NotebookApp.allow_credentials = True" && \
26 |      echo "c.NotebookApp.tornado_settings = {'headers': headers}" && \
27 |      echo "c.NotebookApp.ip = '0.0.0.0'" && \
28 |      echo "c.NotebookApp.open_browser = False" && \
29 |      echo "from IPython.lib import passwd" && \
30 |      echo "import os" && \
31 |      echo "c.NotebookApp.password = passwd(os.environ.get('PASSWORD', 'jupyter'))") \
32 |     > /root/.ipython/profile_default/ipython_notebook_config.py
33 | 
34 | 
35 | RUN cd /tmp && git clone https://github.com/traildb/traildb-python && cd traildb-python && python setup.py install
36 | 
37 | RUN mkdir /ipynbs
38 | WORKDIR /ipynbs
39 | 
40 | CMD ipython notebook --no-browser --ip=0.0.0.0 --port 8888


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | Copyright (c) 2016 AdRoll, Inc.
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining
 5 | a copy of this software and associated documentation files (the
 6 | "Software"), to deal in the Software without restriction, including
 7 | without limitation the rights to use, copy, modify, merge, publish,
 8 | distribute, sublicense, and/or sell copies of the Software, and to
 9 | permit persons to whom the Software is furnished to do so, subject to
10 | the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included
13 | in all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Python bindings for TrailDB
 2 | 
 3 | ### Quick start
 4 | 
 5 | First install the [TrailDB library](https://github.com/traildb/traildb). Then
 6 | 
 7 |     $ python setup.py install
 8 | 
 9 | For detailed instructions, see [Getting Started guide](http://traildb.io/docs/getting_started/).
10 | 
11 | ### Example
12 | 
13 | See [TrailDB tutorial](http://traildb.io/docs/tutorial) for more information.
14 | 
15 | ```python
16 | 
17 | >>> from traildb import TrailDB, TrailDBConstructor
18 | 
19 | >>> cookie = '12345678123456781234567812345678'
20 | >>> cons = TrailDBConstructor('test.tdb', ['field1', 'field2'])
21 | >>> cons.add(cookie, 123, ['a'])
22 | >>> cons.add(cookie, 124, ['b', 'c'])
23 | >>> tdb = cons.finalize()
24 | 
25 | >>> for cookie, trail in tdb.trails():
26 | ...     for event in trail:
27 | ...         print cookie, event
28 | 
29 | 12345678123456781234567812345678 event(time=123L, field1='a', field2='')
30 | 12345678123456781234567812345678 event(time=124L, field1='b', field2='c')
31 | ```
32 | 
## For Docker users
34 | 
35 | You can pull image from here:
36 | 
37 |     $ docker pull c3h3/traildb-ipynb
38 | 
39 | Or, you can build docker image by yourself (please replace "your/repo-name" with whatever you want):
40 | 
41 |     $ docker build -t your/repo-name .
42 | 
43 | 
You can run the Docker image with the default password (`jupyter`); the Jupyter notebook will then be reachable on port 8080 of the host:
45 | 
46 |     $ docker run -p 8080:8888 -it c3h3/traildb-ipynb
47 | 
Or, you can run the Docker image with your own password (`yourPassword`); the Jupyter notebook will again be reachable on port 8080 of the host:
49 | 
50 |     $ docker run -e PASSWORD=yourPassword -p 8080:8888 -it c3h3/traildb-ipynb
51 | 
Then open [http://localhost:8080](http://localhost:8080) in a browser to access your Jupyter notebook.
53 | 
54 | #### Documentation
55 | 
56 | Sphinx documentation is available.
57 | 
58 | 1. Ensure Sphinx is installed.
59 | 
60 | `pip install sphinx`
61 | 
62 | 2. Generate HTML documentation.
63 | 
64 | `sphinx-build -b html . _build`
65 | 
66 | Open `_build/index.html` in a browser.
67 | 


--------------------------------------------------------------------------------
/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # traildb-python documentation build configuration file, created by
  4 | # sphinx-quickstart on Mon Oct  2 14:22:22 2017.
  5 | #
  6 | # This file is execfile()d with the current directory set to its
  7 | # containing dir.
  8 | #
  9 | # Note that not all possible configuration values are present in this
 10 | # autogenerated file.
 11 | #
 12 | # All configuration values have a default; values that are commented out
 13 | # serve to show the default.
 14 | 
 15 | # If extensions (or modules to document with autodoc) are in another directory,
 16 | # add these directories to sys.path here. If the directory is relative to the
 17 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 18 | #
 19 | import os
 20 | import sys
 21 | sys.path.insert(0, os.path.abspath('.'))
 22 | 
 23 | 
 24 | # -- General configuration ------------------------------------------------
 25 | 
 26 | # If your documentation needs a minimal Sphinx version, state it here.
 27 | #
 28 | # needs_sphinx = '1.0'
 29 | 
 30 | # Add any Sphinx extension module names here, as strings. They can be
 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 32 | # ones.
 33 | extensions = ['sphinx.ext.autodoc']
 34 | 
 35 | # Add any paths that contain templates here, relative to this directory.
 36 | templates_path = ['_templates']
 37 | 
 38 | # The suffix(es) of source filenames.
 39 | # You can specify multiple suffix as a list of string:
 40 | #
 41 | # source_suffix = ['.rst', '.md']
 42 | source_suffix = '.rst'
 43 | 
 44 | # The master toctree document.
 45 | master_doc = 'index'
 46 | 
 47 | # General information about the project.
 48 | project = u'traildb-python'
 49 | copyright = u'2017, AdRoll Inc'
 50 | author = u'AdRoll Inc'
 51 | 
 52 | # The version info for the project you're documenting, acts as replacement for
 53 | # |version| and |release|, also used in various other places throughout the
 54 | # built documents.
 55 | #
 56 | # The short X.Y version.
 57 | version = u'0.1.0'
 58 | # The full version, including alpha/beta/rc tags.
 59 | release = u'0.1.0'
 60 | 
 61 | # The language for content autogenerated by Sphinx. Refer to documentation
 62 | # for a list of supported languages.
 63 | #
 64 | # This is also used if you do content translation via gettext catalogs.
 65 | # Usually you set "language" from the command line for these cases.
 66 | language = None
 67 | 
 68 | # List of patterns, relative to source directory, that match files and
 69 | # directories to ignore when looking for source files.
# These patterns also affect html_static_path and html_extra_path
 71 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 72 | 
 73 | # The name of the Pygments (syntax highlighting) style to use.
 74 | pygments_style = 'sphinx'
 75 | 
 76 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 77 | todo_include_todos = False
 78 | 
 79 | 
 80 | # -- Options for HTML output ----------------------------------------------
 81 | 
 82 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 83 | # a list of builtin themes.
 84 | #
 85 | html_theme = 'alabaster'
 86 | 
 87 | # Theme options are theme-specific and customize the look and feel of a theme
 88 | # further.  For a list of options available for each theme, see the
 89 | # documentation.
 90 | #
 91 | # html_theme_options = {}
 92 | 
 93 | # Add any paths that contain custom static files (such as style sheets) here,
 94 | # relative to this directory. They are copied after the builtin static files,
 95 | # so a file named "default.css" will overwrite the builtin "default.css".
 96 | html_static_path = ['_static']
 97 | 
 98 | # Custom sidebar templates, must be a dictionary that maps document names
 99 | # to template names.
100 | #
101 | # This is required for the alabaster theme
102 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
103 | html_sidebars = {
104 |     '**': [
105 |         'about.html',
106 |         'navigation.html',
107 |         'relations.html',  # needs 'show_related': True theme option to display
108 |         'searchbox.html',
109 |         'donate.html',
110 |     ]
111 | }
112 | 
113 | 
114 | # -- Options for HTMLHelp output ------------------------------------------
115 | 
116 | # Output file base name for HTML help builder.
117 | htmlhelp_basename = 'traildb-pythondoc'
118 | 
119 | 
120 | # -- Options for LaTeX output ---------------------------------------------
121 | 
122 | latex_elements = {
123 |     # The paper size ('letterpaper' or 'a4paper').
124 |     #
125 |     # 'papersize': 'letterpaper',
126 | 
127 |     # The font size ('10pt', '11pt' or '12pt').
128 |     #
129 |     # 'pointsize': '10pt',
130 | 
131 |     # Additional stuff for the LaTeX preamble.
132 |     #
133 |     # 'preamble': '',
134 | 
135 |     # Latex figure (float) alignment
136 |     #
137 |     # 'figure_align': 'htbp',
138 | }
139 | 
140 | # Grouping the document tree into LaTeX files. List of tuples
141 | # (source start file, target name, title,
142 | #  author, documentclass [howto, manual, or own class]).
143 | latex_documents = [
144 |     (master_doc, 'traildb-python.tex', u'traildb-python Documentation',
145 |      u'AdRoll Inc', 'manual'),
146 | ]
147 | 
148 | 
149 | # -- Options for manual page output ---------------------------------------
150 | 
151 | # One entry per manual page. List of tuples
152 | # (source start file, name, description, authors, manual section).
153 | man_pages = [
154 |     (master_doc, 'traildb-python', u'traildb-python Documentation',
155 |      [author], 1)
156 | ]
157 | 
158 | 
159 | # -- Options for Texinfo output -------------------------------------------
160 | 
161 | # Grouping the document tree into Texinfo files. List of tuples
162 | # (source start file, target name, title, author,
163 | #  dir menu entry, description, category)
164 | texinfo_documents = [
165 |     (master_doc, 'traildb-python', u'traildb-python Documentation',
166 |      author, 'traildb-python', 'One line description of project.',
167 |      'Miscellaneous'),
168 | ]
169 | 
170 | 
171 | 
172 | 


--------------------------------------------------------------------------------
/examples/datashader_example.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | from __future__ import unicode_literals
 3 | from __future__ import print_function
 4 | from __future__ import absolute_import
 5 | from builtins import open
 6 | from builtins import int
 7 | from builtins import range
 8 | from past.utils import old_div
 9 | 
10 | import datashader as ds
11 | import datashader.transfer_functions as tf
12 | import pandas as pd
13 | 
14 | from traildb import TrailDB
15 | 
16 | 
def get_events(tdb):
    """Yield ``(first_event_time, events)`` for every trail with matching events.

    Each trail index in ``range(len(tdb))`` is read through ``tdb.trail`` with
    an event filter on the ``title`` field. Trails for which the filter leaves
    no events are skipped.

    :param tdb: object supporting ``len()`` and ``trail(i, event_filter=...)``.
    :returns: generator of ``(events[0].time, events)`` tuples, where
        ``events`` is the list of filtered events of one trail.
    """
    prince_filter = [('title', 'Prince (musician)')]
    # Lazily materialise one trail at a time, in trail-id order.
    filtered_trails = (list(tdb.trail(trail_id, event_filter=prince_filter))
                       for trail_id in range(len(tdb)))
    for matched in filtered_trails:
        if not matched:
            continue
        yield matched[0].time, matched
23 | 
24 | 
def get_dataframe():
    """Build the point table plotted by this example.

    Opens ``pydata-tutorial.tdb`` and, for every event returned by
    ``get_events``, records:

    * ``x`` -- whole days between the event and the database's minimum
      timestamp,
    * ``y`` -- the running index of the trail the event belongs to,
    * ``type`` -- ``'user'`` when ``event.user`` is truthy, else ``'anon'``
      (stored as a pandas categorical column).

    :returns: a ``pandas.DataFrame`` with columns ``x``, ``y`` and ``type``.
    """
    tdb = TrailDB('pydata-tutorial.tdb')
    origin = tdb.min_timestamp()
    seconds_per_day = 24 * 3600
    day_offsets, trail_rows, editor_kinds = [], [], []
    # try this:
    # for y, (first_ts, events) in enumerate(sorted(get_events(tdb), reverse=True)):
    for row, (_first_ts, trail_events) in enumerate(get_events(tdb)):
        for ev in trail_events:
            elapsed = int(ev.time - origin)
            day_offsets.append(old_div(elapsed, seconds_per_day))
            trail_rows.append(row)
            if ev.user:
                editor_kinds.append('user')
            else:
                editor_kinds.append('anon')
    frame = pd.DataFrame({'x': day_offsets, 'y': trail_rows})
    frame['type'] = pd.Series(editor_kinds, dtype='category')
    return frame
41 | 
# Aggregate the points per category on a datashader canvas created with
# Canvas(400, 300) (presumably plot width x height in pixels -- confirm
# against the datashader docs).
cnv = ds.Canvas(400, 300)
agg = cnv.points(get_dataframe(), 'x', 'y', ds.count_cat('type'))
# One colour per value of the 'type' column produced by get_dataframe().
colors = {'anon': 'red', 'user': 'blue'}
img = tf.set_background(tf.colorize(agg, colors, how='eq_hist'), 'white')
# The image is serialised through to_bytesio().getvalue(), i.e. raw bytes,
# so the file has to be opened in binary mode: a text-mode handle ('w')
# rejects bytes with a TypeError.
with open('prince.png', 'wb') as f:
    f.write(img.to_bytesio().getvalue())
48 | 


--------------------------------------------------------------------------------
/examples/extract_sample.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | from __future__ import print_function
 3 | from __future__ import unicode_literals
 4 | from __future__ import absolute_import
 5 | from past.utils import old_div
 6 | from random import random
 7 | import sys
 8 | 
 9 | from traildb import TrailDB, TrailDBConstructor
10 | 
11 | 
def extract(tdb, cons, sample_size):
    """Copy a random sample of trails from ``tdb`` into ``cons``.

    One ``random()`` draw is made per trail; the trail is copied when the draw
    is below ``sample_size``. For a copied trail every event is added with its
    ``time`` and all values after the first element of the event.

    :param tdb: source database exposing ``trails()``.
    :param cons: constructor exposing ``add(uuid, time, values)`` and
        ``finalize()``.
    :param sample_size: threshold compared against ``random()``.
    :returns: whatever ``cons.finalize()`` returns.
    """
    for trail_uuid, events in tdb.trails():
        selected = random() < sample_size
        if not selected:
            continue
        for ev in events:
            when = ev.time
            # Drop the leading element of the event; the rest are the values.
            values = list(ev)[1:]
            cons.add(trail_uuid, when, values)
    return cons.finalize()
18 | 
if __name__ == '__main__':
    # Three positional arguments are required: sys.argv[3] (the sample
    # percentage) is read below, so anything shorter than 4 entries must
    # print the usage text instead of failing with IndexError.
    if len(sys.argv) < 4:
        print('Usage: extract_sample source_tdb destination_tdb sample_percentage')
        sys.exit(1)
    tdb = TrailDB(sys.argv[1])
    # Skip the first entry of tdb.fields; the remaining names become the
    # fields of the destination database.
    cons = TrailDBConstructor(sys.argv[2], tdb.fields[1:])
    # The percentage on the command line is converted to a 0..1 fraction.
    num = extract(tdb, cons, old_div(float(sys.argv[3]), 100.)).num_trails
    print('Extracted %d trails to %s' % (num, sys.argv[2]))
27 | 


--------------------------------------------------------------------------------
/examples/parse_wikipedia_history.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from __future__ import unicode_literals
 3 | from __future__ import division
 4 | from __future__ import absolute_import
 5 | from datetime import datetime
 6 | import sys
 7 | import gzip
 8 | import hashlib
 9 | 
10 | import traildb
11 | 
# Running total of events handed to the constructor (updated by add_event).
num_events = 0

# This script parses Wikipedia revision metadata that you can find here
# https://dumps.wikimedia.org/enwiki/
# You want a file like
# https://dumps.wikimedia.org/enwiki/20160501/enwiki-20160501-stub-meta-history.xml.gz


def add_event(cons, uuid, tstamp, user, ip, title):
    """Add one revision event to ``cons`` and update the global counter.

    :param cons: constructor exposing ``add(uuid, tstamp, values)``.
    :param uuid: trail identifier for the editor.
    :param tstamp: timestamp of the revision.
    :param user: user name, or ``''`` for anonymous edits.
    :param ip: IP address, or ``''`` for edits by named users.
    :param title: page title the revision belongs to.
    """
    global num_events
    cons.add(uuid, tstamp, (user, ip, title))
    num_events += 1
    # Progress report every 1024 events (low ten bits all zero).
    if not num_events & 1023:
        print('%d events added' % num_events)


def _to_text(line):
    """Return ``line`` as text, decoding it as UTF-8 when it is bytes."""
    if isinstance(line, bytes):
        return line.decode('utf-8')
    return line


def parse(cons, fileobj):
    """Scan a Wikipedia stub-meta-history dump and add one event per revision.

    The dump is processed line by line. ``<title>`` and ``<timestamp>`` lines
    update the current page title and revision time; each ``<username>`` or
    ``<ip>`` line then emits an event through ``add_event``. The trail UUID is
    the MD5 hex digest of the user name (or of the IP for anonymous edits).

    Lines may be bytes (as produced by ``gzip.GzipFile``) or text. Bytes are
    decoded as UTF-8 first, because the tag literals below are text and
    cannot be compared with bytes, and ``hashlib.md5`` in turn only accepts
    bytes.

    :param cons: constructor passed through to ``add_event``.
    :param fileobj: iterable of lines of the XML dump.
    """
    for line in fileobj:
        line = _to_text(line).strip()
        if line.startswith('<title>'):
            # Strip '<title>' and '</title>'.
            title = line[7:-8]
        elif line.startswith('<timestamp>'):
            # Strip '<timestamp>' and 'Z</timestamp>'.
            tstamp = datetime.strptime(line[11:-13], '%Y-%m-%dT%H:%M:%S')
        elif line.startswith('<username>'):
            user = line[10:-11]
            ip = ''
            uuid = hashlib.md5(user.encode('utf-8')).hexdigest()
            add_event(cons, uuid, tstamp, user, ip, title)
        elif line.startswith('<ip>'):
            user = ''
            ip = line[4:-5]
            uuid = hashlib.md5(ip.encode('utf-8')).hexdigest()
            add_event(cons, uuid, tstamp, user, ip, title)
45 | 
if __name__ == '__main__':
    # Two positional arguments: the gzipped dump and the output TrailDB name.
    if len(sys.argv) < 3:
        print('Usage: parse_wikipedia_history.py enwiki-20160501-stub-meta-history.xml.gz wikipedia-history.tdb')
        sys.exit(1)

    cons = traildb.TrailDBConstructor(sys.argv[2],
                                      ['user', 'ip', 'title'])
    # Use a context manager so the gzip handle is closed as soon as parsing
    # ends (or fails) instead of being left open until interpreter exit.
    with gzip.GzipFile(sys.argv[1]) as dump:
        parse(cons, dump)
    print('Done adding %d events!' % num_events)
    cons.finalize()
    print('Success!')
57 | 


--------------------------------------------------------------------------------
/examples/top_items.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from __future__ import unicode_literals
 3 | from __future__ import division
 4 | from __future__ import absolute_import
 5 | from collections import Counter
 6 | import timeit
 7 | 
 8 | from traildb import TrailDB
 9 | 
10 | 
def string_top():
    """Return the five most common ``title`` values in ``pydata-tutorial``.

    Every event of every trail is visited and its ``title`` attribute is
    counted.

    :returns: list of ``(title, count)`` pairs from ``Counter.most_common(5)``.
    """
    tdb = TrailDB('pydata-tutorial')
    title_counts = Counter()
    for _uuid, trail in tdb.trails():
        for event in trail:
            title_counts[event.title] += 1
    return title_counts.most_common(5)
15 | 
16 | 
def item_top():
    """Return the five most common titles, counting with ``rawitems=True``.

    Trails are iterated with ``rawitems=True`` (presumably yielding encoded
    items rather than strings -- confirm against the traildb module), the
    ``title`` of each event is counted, and only the five winners are
    translated back through ``tdb.get_item_value``.

    :returns: list of ``(tdb.get_item_value(item), count)`` pairs.
    """
    tdb = TrailDB('pydata-tutorial')
    stats = Counter()
    for _uuid, trail in tdb.trails(rawitems=True):
        for event in trail:
            stats[event.title] += 1
    winners = []
    for item, freq in stats.most_common(5):
        winners.append((tdb.get_item_value(item), freq))
    return winners
22 | 
# Time three runs of each counting strategy and print the totals.
for label, func in (('string_top', string_top), ('item_top', item_top)):
    print(label, timeit.timeit(func, number=3))
25 | 


--------------------------------------------------------------------------------
/examples/tutorial_simple_traildb.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from __future__ import unicode_literals
 3 | from __future__ import division
 4 | from __future__ import absolute_import
 5 | from builtins import range
 6 | from uuid import uuid4
 7 | from datetime import datetime
 8 | 
 9 | from traildb import TrailDBConstructor, TrailDB
10 | 
# Build a tiny database: three users, each with one trail of three actions.
cons = TrailDBConstructor('tiny', ['username', 'action'])

for i in range(3):
    uuid = uuid4().hex
    username = 'user%d' % i
    month = i + 1
    # Actions fall on consecutive days (1st, 2nd, 3rd) of that user's month.
    for day, action in enumerate(['open', 'save', 'close']):
        when = datetime(2016, month, day + 1)
        cons.add(uuid, when, (username, action))

cons.finalize()

# Re-open the finished database and dump every trail.
tiny = TrailDB('tiny')
for uuid, trail in tiny.trails():
    events = list(trail)
    print(uuid, events)
23 | 


--------------------------------------------------------------------------------
/examples/tutorial_wikipedia_sessions.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from __future__ import unicode_literals
 3 | from __future__ import division
 4 | from __future__ import absolute_import
 5 | from builtins import next
 6 | import sys
 7 | 
 8 | from traildb import TrailDB
 9 | 
# Gap between two consecutive events that starts a new session
# (30 * 60; presumably seconds, i.e. 30 minutes -- confirm timestamp units).
SESSION_LIMIT = 30 * 60


def sessions(tdb):
    """Print the number of sessions and events of every trail in ``tdb``.

    Trails are read with ``only_timestamp=True`` and each yielded value is
    treated as an event timestamp. A new session starts whenever the gap
    to the previous event exceeds ``SESSION_LIMIT``.

    A trail that yields no events is reported with zero sessions and zero
    events instead of aborting the whole run with ``StopIteration``.

    :param tdb: database exposing ``trails(only_timestamp=True)`` which
        yields ``(uuid, iterator_of_timestamps)`` pairs.
    """
    for i, (_uuid, trail) in enumerate(tdb.trails(only_timestamp=True)):
        # The default keeps an empty trail from raising StopIteration.
        prev_time = next(trail, None)
        if prev_time is None:
            num_events = 0
            num_sessions = 0
        else:
            num_events = 1
            num_sessions = 1
            for timestamp in trail:
                if timestamp - prev_time > SESSION_LIMIT:
                    num_sessions += 1
                prev_time = timestamp
                num_events += 1
        print('Trail[%d] Number of Sessions: %d Number of Events: %d' %
              (i, num_sessions, num_events))
25 | 
if __name__ == '__main__':
    # Exactly one argument is needed: the path of the TrailDB to analyse.
    have_path = len(sys.argv) >= 2
    if have_path:
        sessions(TrailDB(sys.argv[1]))
    else:
        print('Usage: tutorial_wikipedia_sessions <wikipedia-history.tdb>')
31 | 


--------------------------------------------------------------------------------
/index.rst:
--------------------------------------------------------------------------------
 1 | .. traildb-python documentation master file, created by
 2 |    sphinx-quickstart on Mon Oct  2 14:17:29 2017.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | traildb-python
 7 | ==============
 8 | 
These are Python bindings for TrailDB; the test suite runs on both Python 2 and Python 3. The official TrailDB website is at http://traildb.io/
10 | 
11 | .. toctree::
12 |    :maxdepth: 2
13 |    :caption: Contents:
14 | 
15 | .. autoclass:: traildb.TrailDB
16 |    :members:
17 | 
18 | .. autoclass:: traildb.TrailDBConstructor
19 |    :members:
20 | 
21 | .. autoclass:: traildb.TrailDBCursor
22 |    :members:
23 | 
24 | .. autoclass:: traildb.TrailDBMultiCursor
25 |    :members:
26 | 
27 | .. autoclass:: traildb.TrailDBEventFilter
28 |    :members:
29 | 
30 | .. autoclass:: traildb.TrailDBError
31 |    :members:
32 | 
33 | Indices and tables
34 | ==================
35 | 
36 | * :ref:`genindex`
37 | * :ref:`search`
38 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | appdirs==1.4.3
 2 | configparser==3.5.0
 3 | enum34==1.1.6
 4 | flake8==3.3.0
 5 | future==0.16.0
 6 | mccabe==0.6.1
 7 | packaging==16.8
 8 | py==1.4.32
 9 | pycodestyle==2.3.1
10 | pyflakes==1.5.0
11 | pyparsing==2.2.0
12 | six==1.10.0
13 | 


--------------------------------------------------------------------------------
/runtests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib"
 3 | 
 4 | set -e
 5 | 
 6 | # E999 -- syntax error
 7 | # F821 -- undefined local variable
 8 | flake8 ./traildb/ | grep '[ ]E999[ ]\|[ ]F821[ ]' | awk '{print} END {exit(NR > 0)}'
 9 | 
10 | env PYTHONPATH='.' python test/test.py
11 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# install_requires is a setuptools feature: plain distutils does not
# recognise the option, so the 'future' dependency would never be installed.
# Prefer setuptools and fall back to distutils only when it is unavailable.
try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

setup(name='traildb',
      version='0.0.2',
      description='TrailDB stores and queries cookie trails from raw logs.',
      author='AdRoll.com',
      install_requires=['future>=0.16.0'],
      packages=['traildb'])
9 | 


--------------------------------------------------------------------------------
/test/test.py:
--------------------------------------------------------------------------------
  1 | from __future__ import unicode_literals
  2 | from __future__ import print_function
  3 | from __future__ import division
  4 | from __future__ import absolute_import
  5 | from builtins import next
  6 | from builtins import int
  7 | 
  8 | import os
  9 | import unittest
 10 | import datetime
 11 | 
 12 | from traildb import TrailDB, TrailDBConstructor, tdb_item_field, tdb_item_val
 13 | from traildb import TrailDBError, TrailDBCursor, TrailDBMultiCursor
 14 | 
 15 | 
 16 | class TestAPI(unittest.TestCase):
 17 |     def setUp(self):
 18 |         self.uuid = '12345678123456781234567812345678'
 19 |         cons = TrailDBConstructor('testtrail', ['field1', 'field2'])
 20 |         cons.add(self.uuid, 1, ['a', '1'])
 21 |         cons.add(self.uuid, 2, ['b', '2'])
 22 |         cons.add(self.uuid, 3, ['c', '3'])
 23 |         cons.finalize()
 24 | 
 25 |     def tearDown(self):
 26 |         os.unlink('testtrail.tdb')
 27 | 
 28 |     def test_trails(self):
 29 |         db = TrailDB('testtrail')
 30 |         self.assertEqual(1, db.num_trails)
 31 | 
 32 |         trail = db.trail(0)
 33 |         self.assertIsInstance(trail, TrailDBCursor)
 34 | 
 35 |         events = list(trail)  # Force evaluation of generator
 36 |         self.assertEqual(3, len(events))
 37 |         for event in events:
 38 |             self.assertTrue(hasattr(event, 'time'))
 39 |             self.assertTrue(hasattr(event, 'field1'))
 40 |             self.assertTrue(hasattr(event, 'field2'))
 41 | 
 42 |             with self.assertRaises(AttributeError):
 43 |                 event.missing_field
 44 | 
 45 |     def test_trails_selected_uuids(self):
 46 |         uuids = ["02345678123456781234567812345678",
 47 |                  "12345678123456781234567812345678",
 48 |                  "22345678123456781234567812345678",
 49 |                  "32345678123456781234567812345678",
 50 |                  "42345678123456781234567812345678"]
 51 |         cons = TrailDBConstructor('whitelist_testtrail', ['field1', 'field2'])
 52 |         for uuid in uuids:
 53 |             cons.add(uuid, 1, ['a', '1'])
 54 |             cons.add(uuid, 2, ['b', '2'])
 55 |             cons.add(uuid, 3, ['c', '3'])
 56 |         cons.finalize()
 57 | 
 58 |         tdb = TrailDB('whitelist_testtrail')
 59 |         whitelist = [uuids[0],
 60 |                      uuids[3],
 61 |                      uuids[4]]
 62 | 
 63 |         expected_length = 3
 64 |         for trail_uuid, trail_events in tdb.trails(selected_uuids=whitelist):
 65 |             trail_events = list(trail_events)
 66 |             self.assertEqual(len(trail_events),
 67 |                              expected_length)
 68 | 
 69 |     def test_crumbs(self):
 70 |         db = TrailDB('testtrail.tdb')
 71 | 
 72 |         n = 0
 73 |         for uuid, trail in db.trails():
 74 |             n += 1
 75 |             self.assertEqual(self.uuid, uuid)
 76 |             self.assertIsInstance(trail, TrailDBCursor)
 77 |             self.assertEqual(3, len(list(trail)))
 78 | 
 79 |         self.assertEqual(1, n)
 80 | 
 81 |     def test_silly_open(self):
 82 |         self.assertTrue(os.path.exists('testtrail.tdb'))
 83 |         self.assertFalse(os.path.exists('testtrail'))
 84 | 
 85 |         db1 = TrailDB('testtrail.tdb')
 86 |         db2 = TrailDB('testtrail')
 87 | 
 88 |         with self.assertRaises(TrailDBError):
 89 |             TrailDB('foo.tdb')
 90 | 
 91 |     def test_fields(self):
 92 |         db = TrailDB('testtrail')
 93 |         self.assertEqual(['time', 'field1', 'field2'], db.fields)
 94 | 
 95 |     def test_uuids(self):
 96 |         db = TrailDB('testtrail')
 97 |         self.assertEqual(0, db.get_trail_id(self.uuid))
 98 |         self.assertEqual(self.uuid, db.get_uuid(0))
 99 |         self.assertTrue(self.uuid in db)
100 | 
101 |     def test_lexicons(self):
102 |         db = TrailDB('testtrail')
103 | 
104 |         # First field
105 |         self.assertEqual(4, db.lexicon_size(1))
106 |         self.assertEqual(['a', 'b', 'c'], list(db.lexicon(1)))
107 | 
108 |         # Second field
109 |         self.assertEqual(['1', '2', '3'], list(db.lexicon(2)))
110 | 
111 |         with self.assertRaises(TrailDBError):
112 |             db.lexicon(3)  # Out of bounds
113 | 
114 |     def test_metadata(self):
115 |         db = TrailDB('testtrail.tdb')
116 |         self.assertEqual(1, db.min_timestamp())
117 |         self.assertEqual(3, db.max_timestamp())
118 |         self.assertEqual((1, 3), db.time_range())
119 | 
120 |         self.assertEqual((1, 3), db.time_range(parsetime=False))
121 | 
122 | 
123 |     def test_apply_whitelist(self):
124 |         uuids = ["02345678123456781234567812345678",
125 |                  "12345678123456781234567812345678",
126 |                  "22345678123456781234567812345678",
127 |                  "32345678123456781234567812345678",
128 |                  "42345678123456781234567812345678"]
129 |         cons = TrailDBConstructor('whitelist_testtrail', ['field1', 'field2'])
130 |         for uuid in uuids:
131 |             cons.add(uuid, 1, ['a', '1'])
132 |             cons.add(uuid, 2, ['b', '2'])
133 |             cons.add(uuid, 3, ['c', '3'])
134 |         cons.finalize()
135 | 
136 |         tdb = TrailDB('whitelist_testtrail')
137 |         whitelist = [uuids[0],
138 |                      uuids[3],
139 |                      uuids[4]]
140 |         tdb.apply_whitelist(whitelist)
141 |         found_trails = list(tdb.trails(parsetime=False))
142 | 
143 |         self.assertEqual(len(found_trails), len(uuids))
144 |         for trail_uuid, trail_events in found_trails:
145 |             if trail_uuid in whitelist:
146 |                 expected_length = 3
147 |             else:
148 |                 expected_length = 0
149 | 
150 |             trail_events = list(trail_events)
151 |             self.assertEqual(len(trail_events),
152 |                              expected_length)
153 | 
154 |     def test_apply_blacklist(self):
155 |         uuids = ["02345678123456781234567812345678",
156 |                  "12345678123456781234567812345678",
157 |                  "22345678123456781234567812345678",
158 |                  "32345678123456781234567812345678",
159 |                  "42345678123456781234567812345678"]
160 |         cons = TrailDBConstructor('blacklist_testtrail', ['field1', 'field2'])
161 |         for uuid in uuids:
162 |             cons.add(uuid, 1, ['a', '1'])
163 |             cons.add(uuid, 2, ['b', '2'])
164 |             cons.add(uuid, 3, ['c', '3'])
165 |         cons.finalize()
166 | 
167 |         tdb = TrailDB('blacklist_testtrail')
168 |         blacklist = [uuids[1],
169 |                      uuids[2]]
170 |         tdb.apply_blacklist(blacklist)
171 |         found_trails = list(tdb.trails(parsetime=False))
172 | 
173 |         for trail_uuid, trail_events in found_trails:
174 |             if trail_uuid in blacklist:
175 |                 expected_length = 0
176 |             else:
177 |                 expected_length = 3
178 | 
179 |             trail_events = list(trail_events)
180 |             self.assertEqual(len(trail_events),
181 |                              expected_length)
182 | 
183 | 
class TestFilter(unittest.TestCase):
    """Event-filter tests run against one trail that holds five events."""

    def setUp(self):
        # Timestamps 1..5; field3 is 'x' on every event except t=3 ('y').
        rows = [(1, ['a', '1', 'x']),
                (2, ['b', '2', 'x']),
                (3, ['c', '3', 'y']),
                (4, ['d', '4', 'x']),
                (5, ['e', '5', 'x'])]
        trail_uuid = '12345678123456781234567812345678'
        builder = TrailDBConstructor('testtrail',
                                     ['field1', 'field2', 'field3'])
        for tstamp, values in rows:
            builder.add(trail_uuid, tstamp, values)
        builder.finalize()

    def tearDown(self):
        os.unlink('testtrail.tdb')

    def test_simple_disjunction(self):
        db = TrailDB('testtrail')
        # Shorthand notation: a flat list of terms, not a list of lists.
        terms = [('field1', 'a'), ('field2', '4')]
        matched = list(db.trail(0, event_filter=terms))
        self.assertEqual(len(matched), 2)
        first, second = matched
        self.assertEqual((first.field1, first.field2), ('a', '1'))
        self.assertEqual((second.field1, second.field2), ('d', '4'))

    def test_negation(self):
        db = TrailDB('testtrail')
        # The third tuple element marks the term as negated.
        matched = list(db.trail(0, event_filter=[('field3', 'x', True)]))
        self.assertEqual(len(matched), 1)
        only = matched[0]
        self.assertEqual((only.field1, only.field2, only.field3),
                         ('c', '3', 'y'))

    def test_conjunction(self):
        db = TrailDB('testtrail')
        # Two clauses: (field1 is 'e' or 'c') and (field3 is not 'y').
        clauses = [[('field1', 'e'), ('field1', 'c')],
                   [('field3', 'y', True)]]
        matched = list(db.trail(0, event_filter=clauses))
        self.assertEqual(len(matched), 1)
        only = matched[0]
        self.assertEqual((only.field1, only.field2), ('e', '5'))

    def test_time_range(self):
        db = TrailDB('testtrail')
        # A pair of integers is a time-range term; t=2 and t=3 match (2, 4).
        matched = list(db.trail(0,
                                event_filter=[[(2, 4)]],
                                parsetime=False))
        self.assertEqual(len(matched), 2)
        self.assertEqual(matched[0].time, 2)
        self.assertEqual(matched[1].time, 3)

    def test_filter_object(self):
        db = TrailDB('testtrail')
        flt_obj = db.create_filter([[('field1', 'e'), ('field1', 'c')],
                                    [('field3', 'y', True)]])
        # The same filter object is used for two consecutive cursors.
        for _ in range(2):
            matched = list(db.trail(0, event_filter=flt_obj))
            self.assertEqual(len(matched), 1)
            self.assertEqual((matched[0].field1, matched[0].field2),
                             ('e', '5'))
242 | 
243 | 
244 | class TestCons(unittest.TestCase):
245 |     def test_cursor(self):
246 |         uuid = '12345678123456781234567812345678'
247 |         cons = TrailDBConstructor('testtrail', ['field1', 'field2'])
248 |         cons.add(uuid, 1, ['a', '1'])
249 |         cons.add(uuid, 2, ['b', '2'])
250 |         cons.add(uuid, 3, ['c', '3'])
251 |         cons.add(uuid, 4, ['d', '4'])
252 |         cons.add(uuid, 5, ['e', '5'])
253 |         tdb = cons.finalize()
254 | 
255 |         with self.assertRaises(IndexError):
256 |             tdb.get_trail_id('12345678123456781234567812345679')
257 | 
258 |         trail = tdb.trail(tdb.get_trail_id(uuid))
259 |         with self.assertRaises(TypeError):
260 |             len(trail)
261 | 
262 |         j = 1
263 |         for event in trail:
264 |             self.assertEqual(j, int(event.field2))
265 |             self.assertEqual(j, int(event.time))
266 |             j += 1
267 |         self.assertEqual(6, j)
268 | 
269 |         # Iterator is empty now
270 |         self.assertEqual([], list(trail))
271 | 
272 |         field1_values = [e.field1 for e in tdb.trail(tdb.get_trail_id(uuid))]
273 |         self.assertEqual(['a', 'b', 'c', 'd', 'e'], field1_values)
274 | 
275 |     def test_cursor_parsetime(self):
276 |         uuid = '12345678123456781234567812345678'
277 |         cons = TrailDBConstructor('testtrail', ['field1'])
278 | 
279 |         events = [(datetime.datetime(2016, 1, 1, 1, 1), ['1']),
280 |                   (datetime.datetime(2016, 1, 1, 1, 2), ['2']),
281 |                   (datetime.datetime(2016, 1, 1, 1, 3), ['3'])]
282 |         [cons.add(uuid, time, fields) for time, fields in events]
283 |         tdb = cons.finalize()
284 | 
285 |         timestamps = [e.time for e in tdb.trail(0, parsetime=True)]
286 | 
287 |         self.assertIsInstance(timestamps[0], datetime.datetime)
288 |         self.assertEqual([time for time, _ in events], timestamps)
289 |         self.assertEqual(tdb.time_range(True), (events[0][0], events[-1][0]))
290 | 
291 |     def test_binarydata(self):
292 |         binary = b'\x00\x01\x02\x00\xff\x00\xff'
293 |         uuid = '12345678123456781234567812345678'
294 |         cons = TrailDBConstructor('testtrail', ['field1'])
295 |         cons.add(uuid, 123, [binary])
296 |         tdb = cons.finalize(decode=False)
297 |         self.assertEqual(list(tdb[0])[0].field1, binary)
298 | 
299 |     def test_cons(self):
300 |         uuid = '12345678123456781234567812345678'
301 |         cons = TrailDBConstructor('testtrail', ['field1', 'field2'])
302 |         cons.add(uuid, 123, ['a'])
303 |         cons.add(uuid, 124, ['b', 'c'])
304 |         tdb = cons.finalize()
305 | 
306 |         self.assertEqual(0, tdb.get_trail_id(uuid))
307 |         self.assertEqual(uuid, tdb.get_uuid(0))
308 |         self.assertEqual(1, tdb.num_trails)
309 |         self.assertEqual(2, tdb.num_events)
310 |         self.assertEqual(3, tdb.num_fields)
311 | 
312 |         crumbs = list(tdb.trails())
313 |         self.assertEqual(1, len(crumbs))
314 |         self.assertEqual(uuid, crumbs[0][0])
315 |         self.assertTrue(tdb[uuid])
316 |         self.assertTrue(uuid in tdb)
317 |         self.assertFalse('00000000000000000000000000000000' in tdb)
318 |         with self.assertRaises(IndexError):
319 |             tdb['00000000000000000000000000000000']
320 | 
321 |         trail = list(crumbs[0][1])
322 | 
323 |         self.assertEqual(123, trail[0].time)
324 |         self.assertEqual('a', trail[0].field1)
325 |         self.assertEqual('', trail[0].field2)  # TODO: Should this be None?
326 | 
327 |         self.assertEqual(124, trail[1].time)
328 |         self.assertEqual('b', trail[1].field1)
329 |         self.assertEqual('c', trail[1].field2)
330 | 
331 |     def test_items(self):
332 |         uuid = '12345678123456781234567812345678'
333 |         cons = TrailDBConstructor('testtrail', ['field1', 'field2'])
334 |         cons.add(uuid, 123, ['a', 'x' * 2048])
335 |         cons.add(uuid, 124, ['b', 'y' * 2048])
336 |         tdb = cons.finalize()
337 | 
338 |         cursor = tdb.trail(0, rawitems=True)
339 |         event = next(cursor)
340 |         self.assertEqual(tdb.get_item_value(event.field1), 'a')
341 |         self.assertEqual(tdb.get_item_value(event.field2), 'x' * 2048)
342 |         self.assertEqual(tdb.get_item('field1', 'a'), event.field1)
343 |         self.assertEqual(tdb.get_item('field2', 'x' * 2048), event.field2)
344 |         event = next(cursor)
345 |         self.assertEqual(tdb.get_item_value(event.field1), 'b')
346 |         self.assertEqual(tdb.get_item_value(event.field2), 'y' * 2048)
347 |         self.assertEqual(tdb.get_item('field1', 'b'), event.field1)
348 |         self.assertEqual(tdb.get_item('field2', 'y' * 2048), event.field2)
349 | 
350 |         cursor = tdb.trail(0, rawitems=True)
351 |         event = next(cursor)
352 |         field = tdb_item_field(event.field1)
353 |         val = tdb_item_val(event.field1)
354 |         self.assertEqual(tdb.get_value(field, val), 'a')
355 |         field = tdb_item_field(event.field2)
356 |         val = tdb_item_val(event.field2)
357 |         self.assertEqual(tdb.get_value(field, val), 'x' * 2048)
358 |         event = next(cursor)
359 |         field = tdb_item_field(event.field1)
360 |         val = tdb_item_val(event.field1)
361 |         self.assertEqual(tdb.get_value(field, val), 'b')
362 |         field = tdb_item_field(event.field2)
363 |         val = tdb_item_val(event.field2)
364 |         self.assertEqual(tdb.get_value(field, val), 'y' * 2048)
365 | 
366 |     def test_append(self):
367 |         uuid = '12345678123456781234567812345678'
368 |         cons = TrailDBConstructor('testtrail', ['field1'])
369 |         cons.add(uuid, 123, ['foobarbaz'])
370 |         tdb = cons.finalize()
371 | 
372 |         cons = TrailDBConstructor('testtrail2', ['field1'])
373 |         cons.add(uuid, 124, ['barquuxmoo'])
374 |         cons.append(tdb)
375 |         tdb = cons.finalize()
376 | 
377 |         self.assertEqual(2, tdb.num_events)
378 |         uuid, trail = list(tdb.trails())[0]
379 |         trail = list(trail)
380 |         self.assertEqual([123, 124], [e.time for e in trail])
381 |         self.assertEqual(['foobarbaz', 'barquuxmoo'],
382 |                          [e.field1 for e in trail])
383 | 
384 |     def tearDown(self):
385 |         try:
386 |             os.unlink('testtrail.tdb')
387 |             os.unlink('testtrail2.tdb')
388 |         except:
389 |             pass
390 | 
391 | 
392 | class TestMultiCursor(unittest.TestCase):
393 |     def setUp(self):
394 |         self.uuid1 = '12345678123456781234567812345678'
395 |         self.uuid2 = '12345678123456781234567812345679'
396 | 
397 |         cons = TrailDBConstructor('testtrail1', ['field1', 'field2', 'field3'])
398 |         cons.add(self.uuid1, 1, ['a', '1', 'x'])
399 |         cons.add(self.uuid1, 2, ['b', '2', 'x'])
400 |         cons.add(self.uuid2, 1, ['c', '3', 'y'])
401 |         cons.add(self.uuid2, 2, ['d', '4', 'x'])
402 |         cons.add(self.uuid2, 3, ['e', '5', 'x'])
403 |         self.tdb1 = cons.finalize()
404 | 
405 |         cons = TrailDBConstructor('testtrail2', ['field1', 'field2', 'field3', 'field4'])
406 |         cons.add(self.uuid2, 4, ['a', '1', 'x', 'l'])
407 |         cons.add(self.uuid2, 5, ['b', '2', 'x', 'm'])
408 |         cons.add(self.uuid1, 3, ['c', '3', 'y', 'n'])
409 |         cons.add(self.uuid1, 4, ['d', '4', 'x', 'o'])
410 |         cons.add(self.uuid1, 5, ['e', '5', 'x', 'p'])
411 |         self.tdb2 = cons.finalize()
412 | 
413 |     def test_multicursor(self):
414 |         c1 = self.tdb1.trail(self.tdb1.get_trail_id(self.uuid1))
415 |         c2 = self.tdb2.trail(self.tdb2.get_trail_id(self.uuid1))
416 |         mc = TrailDBMultiCursor(False, False, False)
417 | 
418 |         # not initialized, raise error
419 |         with self.assertRaises(TrailDBError):
420 |             next(mc)
421 |         mc.set_cursors([c1, c2], [self.tdb1, self.tdb2])
422 | 
423 |         # exhaust the iterator
424 |         events = list(mc)
425 | 
426 |         self.assertEqual(len(events), 5)
427 |         self.assertEqual(events[0][0].time, 1)
428 |         self.assertEqual(events[0][0].field1, 'a')
429 |         self.assertEqual(events[0][0].field2, '1')
430 |         self.assertEqual(events[0][0].field3, 'x')
431 |         self.assertEqual(events[1][0].time, 2)
432 |         self.assertEqual(events[1][0].field1, 'b')
433 |         self.assertEqual(events[1][0].field2, '2')
434 |         self.assertEqual(events[1][0].field3, 'x')
435 |         # this one is from the 2nd tdb, has an additional field
436 |         self.assertEqual(events[2][0].time, 3)
437 |         self.assertEqual(events[2][0].field1, 'c')
438 |         self.assertEqual(events[2][0].field2, '3')
439 |         self.assertEqual(events[2][0].field3, 'y')
440 |         self.assertEqual(events[2][0].field4, 'n')
441 | 
442 |     def test_multicursor_reuse(self):
443 |         c1 = self.tdb1.trail(self.tdb1.get_trail_id(self.uuid1))
444 |         c2 = self.tdb2.trail(self.tdb2.get_trail_id(self.uuid1))
445 |         mc = TrailDBMultiCursor(False, False, False)
446 |         mc.set_cursors([c1, c2], [self.tdb1, self.tdb2])
447 |         # exhaust the iterator
448 |         list(mc)
449 | 
450 |         # change the cursors
451 |         c1.get_trail(self.tdb1.get_trail_id(self.uuid2))
452 |         c2.get_trail(self.tdb2.get_trail_id(self.uuid2))
453 | 
454 |         # reset the multicursor
455 |         mc.reset()
456 |         events = list(mc)
457 | 
458 |         self.assertEqual(len(events), 5)
459 |         self.assertEqual(events[0][0].time, 1)
460 |         self.assertEqual(events[0][0].field1, 'c')
461 |         self.assertEqual(events[0][0].field2, '3')
462 |         self.assertEqual(events[0][0].field3, 'y')
463 |         self.assertEqual(events[3][0].time, 4)
464 |         self.assertEqual(events[3][0].field1, 'a')
465 |         self.assertEqual(events[3][0].field2, '1')
466 |         self.assertEqual(events[3][0].field3, 'x')
467 |         self.assertEqual(events[3][0].field4, 'l')
468 | 
469 |     def test_multicursor_raw_items_parsetime(self):
470 |         c1 = self.tdb1.trail(self.tdb1.get_trail_id(self.uuid1))
471 |         c2 = self.tdb2.trail(self.tdb2.get_trail_id(self.uuid1))
472 |         mc = TrailDBMultiCursor(True, True, False)
473 |         mc.set_cursors([c1, c2], [self.tdb1, self.tdb2])
474 |         # exhaust the iterator
475 |         events = list(mc)
476 | 
477 |         # just make sure the length is right and we didn't have any errors
478 |         self.assertEqual(len(events), 5)
479 | 
480 |     def tearDown(self):
481 |         try:
482 |             os.unlink('testtrail1.tdb')
483 |             os.unlink('testtrail2.tdb')
484 |         except:
485 |             pass
486 | 
487 | 
488 | if __name__ == '__main__':
489 |     unittest.main()
490 | 


--------------------------------------------------------------------------------
/traildb/__init__.py:
--------------------------------------------------------------------------------
1 | from .traildb import TrailDBError
2 | from .traildb import TrailDBConstructor
3 | from .traildb import TrailDB
4 | from .traildb import TrailDBCursor
5 | from .traildb import TrailDBMultiCursor
6 | from .traildb import TrailDBEventFilter
7 | from .traildb import tdb_item_field
8 | from .traildb import tdb_item_val
9 | 


--------------------------------------------------------------------------------
/traildb/__main__.py:
--------------------------------------------------------------------------------
# Command-line entry point of the package: prints each (cookie, trail) pair
# of a TrailDB. All command-line arguments are passed positionally to
# traildb.TrailDB; without arguments the path defaults to 'a.tdb'.
from __future__ import unicode_literals
from __future__ import print_function
import sys

import traildb

# NOTE(review): the unit tests in this repository iterate trails with
# TrailDB.trails(); confirm that TrailDB still provides crumbs().
for cookie, trail in traildb.TrailDB(*(sys.argv[1:] or ['a.tdb'])).crumbs():
    print(cookie, trail)
9 | 


--------------------------------------------------------------------------------
/traildb/traildb.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import unicode_literals
  3 | from __future__ import print_function
  4 | from __future__ import division
  5 | from __future__ import absolute_import
  6 | from builtins import int
  7 | from builtins import range
  8 | from past.builtins import basestring
  9 | from builtins import object
 10 | from ctypes import c_char, c_char_p, c_ubyte, c_int, c_void_p
 11 | from ctypes import c_uint, c_uint32, c_uint64
 12 | from ctypes import Structure, Union
 13 | from ctypes import CDLL, POINTER, pointer
 14 | from ctypes import string_at, addressof
 15 | from datetime import datetime
 16 | 
 17 | import os
 18 | import sys
 19 | import time
 20 | import codecs
 21 | 
 22 | CODEC = 'utf8'
 23 | 
 24 | HEX = 'hex'
 25 | 
 26 | try:
 27 |     codecs.decode('A0', 'hex')
 28 | except LookupError:
 29 |     HEX = 'hex_codec'
 30 | 
 31 | if os.name == "posix" and sys.platform == "darwin":
 32 |     try:
 33 |         lib = CDLL('libtraildb.dylib')
 34 |     except:
 35 |         # is there a better way to figure out the path?
 36 |         lib = CDLL('/usr/local/lib/libtraildb.dylib')
 37 | elif os.name == "posix" and "linux" in sys.platform:
 38 |     lib = CDLL('libtraildb.so')
 39 | 
 40 | 
 41 | def api(fun, args, res=None):
 42 |     fun.argtypes = args
 43 |     fun.restype = res
 44 | 
# ctypes aliases used in the libtraildb prototypes declared further down in
# this module. Handles (db, constructor, cursor, filter, multicursor) are
# carried as untyped pointers; fields, values and items are fixed-width
# unsigned integers; error codes are plain C ints.
tdb         = c_void_p
tdb_cons    = c_void_p
tdb_field   = c_uint32
tdb_val     = c_uint64
tdb_item    = c_uint64
tdb_cursor  = c_void_p
tdb_error   = c_int
tdb_event_filter = c_void_p
tdb_multi_cursor = c_void_p
 54 | 
 55 | 
class tdb_event(Structure):
    """ctypes view of the event record that ``lib.tdb_cursor_next`` returns
    a pointer to.

    NOTE: the cursor classes in this module do not dereference ``items``;
    they read ``num_items`` values of type ``tdb_item`` starting at the
    address of the ``items`` field itself.
    """
    _fields_ = [("timestamp", c_uint64),
                ("num_items", c_uint64),
                ("items", POINTER(tdb_item))]
 60 | 
class tdb_multi_event(Structure):
    """ctypes view of the record that ``lib.tdb_multi_cursor_next`` returns a
    pointer to: the handle of the db the event came from, a pointer to the
    event itself, and a cursor index (presumably the position of the source
    cursor in the array given to ``tdb_multi_cursor_new`` -- TODO confirm).
    """
    _fields_ = [("db", tdb),
                ("tdb_event", POINTER(tdb_event)),
                ("cursor_idx", c_uint64)]
 65 | 
class tdb_opt_value(Union):
    """Option value accepted by ``lib.tdb_set_opt`` and
    ``lib.tdb_set_trail_opt``: either a pointer or a 64-bit integer sharing
    the same storage.
    """
    _fields_ = [("ptr", c_void_p),
                ("value", c_uint64)]
 69 | 
 70 | TDB_OPT_EVENT_FILTER = 101
 71 | 
 72 | 
 73 | api(lib.tdb_cons_init, [], tdb_cons)
 74 | api(lib.tdb_cons_open,
 75 |     [tdb_cons, c_char_p, POINTER(c_char_p), c_uint64], tdb_error)
 76 | api(lib.tdb_cons_close, [tdb_cons])
 77 | api(lib.tdb_cons_add,
 78 |     [tdb_cons, POINTER(c_ubyte), c_uint64,
 79 |      POINTER(c_char_p), POINTER(c_uint64)],
 80 |     tdb_error)
 81 | api(lib.tdb_cons_append, [tdb_cons, tdb], tdb_error)
 82 | api(lib.tdb_cons_finalize, [tdb_cons], tdb_error)
 83 | 
 84 | api(lib.tdb_init, [], tdb)
 85 | api(lib.tdb_open, [tdb, c_char_p], tdb_error)
 86 | api(lib.tdb_close, [tdb])
 87 | 
 88 | api(lib.tdb_lexicon_size, [tdb, tdb_field], tdb_error)
 89 | 
 90 | api(lib.tdb_get_field, [tdb, c_char_p], tdb_error)
 91 | api(lib.tdb_get_field_name, [tdb, tdb_field], c_char_p)
 92 | 
 93 | api(lib.tdb_get_item, [tdb, tdb_field, POINTER(c_char), c_uint64], tdb_item)
 94 | api(lib.tdb_get_value,
 95 |     [tdb, tdb_field, tdb_val, POINTER(c_uint64)], POINTER(c_char))
 96 | api(lib.tdb_get_item_value,
 97 |     [tdb, tdb_item, POINTER(c_uint64)], POINTER(c_char))
 98 | 
 99 | api(lib.tdb_get_uuid, [tdb, c_uint64], POINTER(c_ubyte))
100 | api(lib.tdb_get_trail_id,
101 |     [tdb, POINTER(c_ubyte), POINTER(c_uint64)], tdb_error)
102 | 
103 | api(lib.tdb_error_str, [tdb_error], c_char_p)
104 | 
105 | api(lib.tdb_num_trails, [tdb], c_uint64)
106 | api(lib.tdb_num_events, [tdb], c_uint64)
107 | api(lib.tdb_num_fields, [tdb], c_uint64)
108 | api(lib.tdb_min_timestamp, [tdb], c_uint64)
109 | api(lib.tdb_max_timestamp, [tdb], c_uint64)
110 | 
111 | api(lib.tdb_version, [tdb], c_uint64)
112 | 
113 | api(lib.tdb_cursor_new, [tdb], tdb_cursor)
114 | api(lib.tdb_cursor_free, [tdb])
115 | api(lib.tdb_cursor_next, [tdb_cursor], POINTER(tdb_event))
116 | api(lib.tdb_get_trail, [tdb_cursor, c_uint64], tdb_error)
117 | api(lib.tdb_get_trail_length, [tdb_cursor], c_uint64)
118 | api(lib.tdb_cursor_set_event_filter, [tdb_cursor, tdb_event_filter], tdb_error)
119 | 
120 | api(lib.tdb_multi_cursor_new, [POINTER(tdb_cursor), c_uint64], tdb_multi_cursor)
121 | api(lib.tdb_multi_cursor_free, [tdb_multi_cursor])
122 | api(lib.tdb_multi_cursor_reset, [tdb_multi_cursor])
123 | api(lib.tdb_multi_cursor_next, [tdb_multi_cursor], POINTER(tdb_multi_event))
124 | api(lib.tdb_multi_cursor_next_batch, [tdb_multi_cursor, POINTER(tdb_multi_event), c_uint64])
125 | 
126 | api(lib.tdb_event_filter_new, [], tdb_event_filter)
127 | api(lib.tdb_event_filter_add_term, [tdb_event_filter, tdb_item, c_int], tdb_error)
128 | api(lib.tdb_event_filter_add_time_range, [c_uint64, c_uint64], tdb_error)
129 | api(lib.tdb_event_filter_new_clause, [tdb_event_filter], tdb_error)
130 | api(lib.tdb_event_filter_new_match_none, [], tdb_event_filter)
131 | api(lib.tdb_event_filter_new_match_all, [], tdb_event_filter)
132 | api(lib.tdb_event_filter_free, [tdb_event_filter])
133 | 
134 | api(lib.tdb_set_opt, [tdb, c_uint, tdb_opt_value], tdb_error)
135 | api(lib.tdb_set_trail_opt, [tdb, c_uint64, c_uint, tdb_opt_value], tdb_error)
136 | 
137 | 
def uuid_hex(uuid):
    """
    :returns: the hex form of a UUID. A string is returned unchanged;
              anything else is read as 16 raw bytes and hex-encoded.
    """
    if not isinstance(uuid, basestring):
        raw = string_at(uuid, 16)
        return codecs.encode(raw, HEX).decode(CODEC)
    return uuid
145 | 
146 | 
def uuid_raw(uuid):
    """
    :returns: the binary form of a UUID. A (hex) string is decoded into a
              16-element ``c_ubyte`` array; anything else is returned as is.
    """
    if not isinstance(uuid, basestring):
        return uuid
    raw = codecs.decode(uuid, HEX)
    return (c_ubyte * 16).from_buffer_copy(raw)
154 | 
155 | 
def nullterm(strs, size):
    """Join ``strs`` with NUL characters and append ``size - len(strs) + 1``
    further NULs."""
    joined = '\x00'.join(strs)
    padding = '\x00' * (size - len(strs) + 1)
    return joined + padding
158 | 
159 | 
160 | # Port of tdb_item_field and tdb_item_val in tdb_types.h. Cannot use
161 | # them directly as they are inlined functions.
162 | 
def tdb_item_is32(item):
    """Return True when bit 7 (mask 0x80) of ``item`` is clear."""
    wide_flag = item & 0x80
    return not wide_flag
165 | 
166 | 
def tdb_item_field32(item):
    """Return the low seven bits of ``item``."""
    return item & 0x7F
169 | 
170 | 
def tdb_item_val32(item):
    """Return the 32 bits of ``item`` that follow its lowest byte."""
    shifted = item >> 8
    return shifted & 0xFFFFFFFF  # UINT32_MAX
173 | 
174 | 
def tdb_item_field(item):
    """Return field-part of an item."""
    if not tdb_item_is32(item):
        # Wide item: seven bits from the first byte plus seven bits from
        # the second byte.
        low = item & 127
        high = (item >> 8) & 127
        return low | (high << 7)
    return tdb_item_field32(item)
181 | 
182 | 
def tdb_item_val(item):
    """Return value-part of an item."""
    if not tdb_item_is32(item):
        # Wide item: the value is everything above the low 16 bits.
        return item >> 16
    return tdb_item_val32(item)
189 | 
190 | 
class TrailDBError(Exception):
    """Raised when a TrailDB operation fails."""
194 | 
195 | 
class TrailDBConstructor(object):
    """Objects of this class are used to construct new TrailDBs."""

    def __init__(self, path, ofields=()):
        """Initialize a new TrailDB constructor.

        :param path: TrailDB output path (without .tdb).
        :param ofields: List of field (names) in this TrailDB.
        :raises TrailDBError: if ``path`` is empty or the native constructor
                              cannot be opened.

        .. code-block:: python

          import traildb
          cons = traildb.TrailDBConstructor('example', ['type', 'flavor'])
          cons.add('00000000000000000000000000000001', 123, ['click', 'tasty'])
          cons.add('00000000000000000000000000000002', 129, ['flash', 'sour'])
          cons.finalize()   # Don't forget to finalize, otherwise you won't get a full TrailDB.
        """
        if not path:
            raise TrailDBError("Path is required")
        n = len(ofields)

        if isinstance(path, str):
            path = path.encode(CODEC)

        ofield_names = (c_char_p * n)(*[name.encode(CODEC)
                                        for name in ofields])

        self._cons = lib.tdb_cons_init()
        if lib.tdb_cons_open(self._cons, path, ofield_names, n) != 0:
            raise TrailDBError("Cannot open constructor")

        self.path = path
        self.ofields = ofields

    def __del__(self):
        # _cons is missing when __init__ failed before creating the handle.
        if hasattr(self, '_cons'):
            lib.tdb_cons_close(self._cons)

    def add(self, uuid, tstamp, values):
        """Add an event in TrailDB.

        :param uuid: UUID of this event.
        :param tstamp: Timestamp of this event (datetime or integer).
        :param values: value of each field.
        :raises TrailDBError: if the library rejects the event.

        .. code-block:: python

          cons.add('00000000000000000000000000000001', 123, ['click', 'tasty'])
        """
        if isinstance(tstamp, datetime):
            # Converted via time.mktime, i.e. interpreted as local time.
            tstamp = int(time.mktime(tstamp.timetuple()))
        n = len(self.ofields)
        values = [v.encode(CODEC) if not isinstance(v, bytes)
                  else v for v in values]
        value_array = (c_char_p * n)(*values)
        value_lengths = (c_uint64 * n)(*[len(v) for v in values])
        f = lib.tdb_cons_add(self._cons, uuid_raw(uuid), tstamp, value_array,
                             value_lengths)
        if f:
            # tdb_cons_add returns a tdb_error code, not a position in
            # `values`; indexing `values` with it raised IndexError or named
            # an unrelated value. Report the library's own description.
            reason = lib.tdb_error_str(f)
            if isinstance(reason, bytes):
                reason = reason.decode(CODEC)
            raise TrailDBError("Could not add event: %s (error code %d)"
                               % (reason, f))

    def append(self, db):
        """Merge an existing TrailDB in this TrailDB.

        :param db: An instance of :py:class:`~traildb.TrailDB` you want to merge to this one.
        :raises TrailDBError: if the library reports a non-zero result.
        """
        f = lib.tdb_cons_append(self._cons, db._db)
        if f < 0:
            raise TrailDBError("Wrong number of fields: %d" % db.num_fields)
        if f > 0:
            raise TrailDBError("Too many values: %s" % db.num_fields)

    def finalize(self, decode=True):
        """Finalize this TrailDB. You cannot add new events in this TrailDB
        after calling this function.

        You need to finalize :py:class:`~traildb.TrailDBConstructor` or you
        will not have an openable TrailDB later. Finalization is where all the
        compression and preparation happen and is typically the most
        resource-intensive part of TrailDB building.

        :param decode: passed through to :py:class:`~traildb.TrailDB`.
        :returns: Opened :py:class:`~traildb.TrailDB`:
        :raises TrailDBError: if finalization fails.
        """
        r = lib.tdb_cons_finalize(self._cons)
        if r:
            raise TrailDBError("Could not finalize (%d)" % r)
        return TrailDB(self.path, decode)
283 | 
284 | 
class TrailDBCursor(object):
    """Iterator over the events of a single trail.

    Instances are normally obtained from TrailDB.trail() or TrailDB.cursor()
    rather than created directly.

    One cursor can serve several trails in turn: call
    TrailDBCursor.get_trail(trail_id) to point it at another trail.

    """

    def __init__(self,
                 cursor,
                 cls,
                 valuefun,
                 parsetime,
                 only_timestamp,
                 event_filter_obj):
        self.cursor = cursor
        self.cls = cls
        self.valuefun = valuefun
        self.parsetime = parsetime
        self.only_timestamp = only_timestamp
        if not event_filter_obj:
            self.event_filter_obj = None
        else:
            # Remembered so that get_trail() can re-apply it later.
            self.event_filter_obj = event_filter_obj
            self._apply_event_filter()

    def __del__(self):
        if self.cursor:
            lib.tdb_cursor_free(self.cursor)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next event in the trail."""
        event_ptr = lib.tdb_cursor_next(self.cursor)
        if not event_ptr:
            raise StopIteration()

        record = event_ptr.contents
        # The items are read in place, starting at the address of the
        # `items` field of the event record.
        first_item = addressof(record.items)
        items = (tdb_item * record.num_items).from_address(first_item)

        timestamp = record.timestamp
        if self.parsetime:
            timestamp = datetime.fromtimestamp(timestamp)

        if self.only_timestamp:
            return timestamp
        # First constructor argument is True exactly when no valuefun is set.
        return self.cls(not self.valuefun, timestamp, *items)

    def get_trail(self, trail_id):
        """Point this cursor at ``trail_id`` and re-apply the stored event
        filter, if any. Raises TrailDBError when either step fails."""
        if lib.tdb_get_trail(self.cursor, trail_id) != 0:
            raise TrailDBError("Failed to initalize trail in cursor")

        if self.event_filter_obj:
            self._apply_event_filter()

    def _apply_event_filter(self):
        # Install the stored filter on the native cursor.
        if lib.tdb_cursor_set_event_filter(self.cursor,
                                           self.event_filter_obj.flt):
            raise TrailDBError("cursor_set_event_filter failed")
350 | 
class TrailDBMultiCursor(object):
    """
    TrailDBMultiCursor iterates over the events of multiple trails,
    merged together into a single trail with events sorted in the ascending
    time order. The trails can be from different traildbs.

    To use, initialize and then set the cursors using the set_cursors method.
    To reuse a multicursor, set new trails on the underlying cursors and then
    call :py:meth:`~traildb.TrailDBMultiCursor.reset()`. If filtering, apply event filters to the underlying
    cursors individually before setting them on the multicursor, or call reset after doing so
    if already set.
    """

    def __init__(self, parsetime, rawitems, only_timestamp):
        """
        :param parsetime: If True, returns datetime objects instead of integer timestamps.
        :param rawitems: Return raw integer items instead of stringified values. Using raw items is usually a bit more efficient than using string values.
        :param only_timestamp: If True, only return timestamps, not event objects.
        """
        self.parsetime = parsetime
        self.rawitems = rawitems
        self.only_timestamp = only_timestamp
        self.multicursor = None
        self._ready = False

    def __del__(self):
        # getattr: the attribute is absent if __init__ never ran.
        if getattr(self, 'multicursor', None):
            lib.tdb_multi_cursor_free(self.multicursor)

    def __iter__(self):
        return self

    def __next__(self):
        """
        return the next event in the combined trails, in ascending timestamp order

        this will return tuples in the form of `(event, traildb)`, where the traildb
        is the :py:class:`~traildb.TrailDB` the event belongs to. This can be used to
        get the values if rawitems is used. When ``only_timestamp`` is set, the bare
        timestamp is returned instead of a tuple.

        :raises TrailDBError: if set_cursors has not been called yet.
        """
        if not self._ready:
            raise TrailDBError("Multicursor not initialized, call set_cursors")

        multi_event = lib.tdb_multi_cursor_next(self.multicursor)
        if multi_event:
            event = self.to_event(multi_event.contents)
        else:
            raise StopIteration()

        return event

    def to_event(self, multi_event):
        """
        convert a ``tdb_multi_event`` record into the value handed to the caller:
        a timestamp when ``only_timestamp`` is set, otherwise an `(event, traildb)` tuple.

        :raises TrailDBError: if the event belongs to a traildb that was not passed to set_cursors.
        """
        event = multi_event.tdb_event
        tdb_ptr = multi_event.db

        timestamp = event.contents.timestamp
        if self.parsetime:
            timestamp = datetime.fromtimestamp(event.contents.timestamp)

        if self.only_timestamp:
            return timestamp

        try:
            traildb = self._traildbs[tdb_ptr]
        except KeyError:
            raise TrailDBError("TrailDBMultiCursor encountered a traildb that was not included in set_cursors")

        # items are read in place, starting at the address of the `items` field
        address = addressof(event.contents.items)
        items = (tdb_item * event.contents.num_items).from_address(address)

        if self.rawitems:
            return traildb._event_cls(True, timestamp, *items), traildb
        else:
            return traildb._event_cls(False, timestamp, *items), traildb

    def set_cursors(self, cursors, traildbs):
        """
        configure this multicursor to merge the specified cursors. This is required before use.
        Calling it again replaces the previous configuration.

        :param cursors: list of :py:class:`~traildb.TrailDBCursor` instances to merge
        :param traildbs: list of :py:class:`~traildb.TrailDB` instances from which the cursors were created (only needs to be specified once, even if there are multiple cursors from the same TrailDB)
        :raises TrailDBError: if the native multicursor cannot be allocated
        """

        n_cursors = len(cursors)
        cursor_array = (tdb_cursor * n_cursors)(*[c.cursor for c in cursors])

        # allocate first, so that a failure leaves the current state untouched
        multicursor = lib.tdb_multi_cursor_new(cursor_array, n_cursors)
        if multicursor is None:
            raise TrailDBError("Failed to allocate memory for multicursor")

        # release the multicursor of an earlier set_cursors call; simply
        # overwriting the handle would leak it
        if self.multicursor:
            lib.tdb_multi_cursor_free(self.multicursor)

        # maintain references to these in python so they wont get garbage collected
        self._cursor_arr = cursor_array
        self.cursors = cursors

        self.multicursor = multicursor
        self.reset()

        # mapping of the traildb pointer to the TrailDB object
        # we need this to get the configured traildb in python since we get a pointer to the tdb from the multi event
        self._traildbs = {db._db: db for db in traildbs}

        self._ready = True

    def reset(self):
        """
        reset the state of the multicursor to sync with the underlying cursors.
        Used when reusing cursors. Also resets the state of the python object,
        including any batched results.
        """

        if self.multicursor:
            lib.tdb_multi_cursor_reset(self.multicursor)
        self._batch_idx = 0
        self._current_batch_size = 0
463 | 
464 | 
def mk_event_class(fields, valuefun):
    """Build an event class specialised for one list of field names.

    :param fields: ordered field names; the item at position ``i`` of an
      event is exposed as the attribute named ``fields[i]``.
    :param valuefun: callable turning an item into its value. It is applied
      lazily, on first attribute access, for events that are not in raw mode;
      the ``time`` field is always returned as-is.
    :returns: the generated ``TrailDBEvent`` class.
    """
    field_to_index = {f: i for i, f in enumerate(fields)}
    slot_names = ('items', 'rawitems', 'memoized')

    class TrailDBEvent(object):
        __slots__ = slot_names

        def __init__(self, rawitems, *items):
            self.items = tuple(items)
            self.rawitems = rawitems
            # cache of already-converted field values, keyed by field name
            self.memoized = {}

        def __repr__(self):
            return '<TrailDBEvent: {}>'.format(self.to_list())

        def __str__(self):
            return self.__repr__()

        def _fields(self):
            return fields

        def __eq__(self, other):
            # Two events are equal when they expose the same field names in
            # the same order and every field value matches. Anything that
            # does not look like an event compares unequal.
            try:
                for f in fields:
                    if self.__getattr__(f) != other.__getattr__(f):
                        return False
                other_fields = other._fields()
            except AttributeError:
                return False
            return list(other_fields) == list(fields)

        def __ne__(self, other):
            # Python 2 does not derive != from __eq__, so define it explicitly.
            return not self.__eq__(other)

        def __hash__(self):
            return hash(tuple(self.to_list()))

        def to_list(self):
            """:returns: list of ``(field_name, value)`` pairs in field order."""
            return [(f, self.__getattr__(f)) for f in fields]

        def __getattr__(self, name):
            # __getattr__ only runs for a slot name when that slot is unset
            # (e.g. an instance made via __new__). Fail right away; reading
            # self.memoized below would otherwise recurse without bound.
            if name in slot_names:
                raise AttributeError(name)

            if name in self.memoized:
                return self.memoized[name]

            if name not in field_to_index:
                raise AttributeError(name)

            item = self.items[field_to_index[name]]
            if self.rawitems or name == 'time':
                return item

            self.memoized[name] = valuefun(item)
            return self.memoized[name]

    return TrailDBEvent
539 | 
540 | 
class TrailDB(object):
    """Objects of this class represent an opened TrailDB.

    Simply pass the filename to the constructor (with or without extension) as below.

    .. code-block:: python

      import traildb
      tdb = traildb.TrailDB('blah.tdb')
    """

    def __init__(self, path, decode=True):
        """Open a TrailDB at path.

        :param path: Path of the TrailDB; ``str`` paths are encoded with ``CODEC``.
        :param decode: If True, values returned by :py:meth:`get_item_value`
          and :py:meth:`get_value` are decoded with ``CODEC``; otherwise the
          undecoded value is returned.
        :raises TrailDBError: if the TrailDB cannot be opened.
        """
        if isinstance(path, str):
            path = path.encode(CODEC)

        self._db = db = lib.tdb_init()
        res = lib.tdb_open(self._db, path)
        if res != 0:
            raise TrailDBError("Could not open %s, error code %d" %
                               (path, res))

        self.num_trails = lib.tdb_num_trails(db)
        self.num_events = lib.tdb_num_events(db)
        self.num_fields = lib.tdb_num_fields(db)
        self.fields = [lib.tdb_get_field_name(db, i).decode(CODEC)
                       for i in range(self.num_fields)]
        self._event_cls = mk_event_class(self.fields, self.get_item_value)
        # scratch output argument shared by the calls below that report a
        # length or an ID through a uint64 pointer
        self._uint64_ptr = pointer(c_uint64())
        self.decode = decode

    def __del__(self):
        # _db is missing when __init__ failed before the handle was created
        if hasattr(self, '_db'):
            lib.tdb_close(self._db)

    def __contains__(self, uuidish):
        """:returns: True if UUID or Trail ID exists in this TrailDB."""
        try:
            self[uuidish]
            return True
        except (IndexError, TrailDBError):
            # Unknown UUIDs surface as IndexError (get_trail_id), invalid
            # Trail IDs as TrailDBError (trail); both mean "not present".
            return False

    def __getitem__(self, uuidish):
        """:returns: a cursor for the given UUID or Trail ID."""
        if isinstance(uuidish, basestring):
            return self.trail(self.get_trail_id(uuidish))
        return self.trail(uuidish)

    def __len__(self):
        """:returns: The number of trails in the TrailDB."""
        return self.num_trails

    def trails(self, selected_uuids=None, reuse_cursors=False, **kwds):
        """
        Iterate over all trails in this TrailDB.

        :param selected_uuids: If passed, only go through the UUIDs passed in
          this argument. It should be an iterable that yields hex UUIDs.
          UUIDs that are not present in the TrailDB are skipped.

        :param reuse_cursors: If `False`, trails() creates a new cursor
           for every single trail it iterates over. You can change this
           behavior by setting ``reuse_cursors=True``. Now, the same underlying
           cursor object will be reused for all trails yielded from this
           function. This is a major performance improvement but it means
           you cannot save the iterators from trails() and iterate over them
           later; you must consume them immediately before you go to next item
           from trails().

        :returns: Yields ``(uuid, events)`` pairs.

        Any other keyword arguments are passed to :py:meth:`~TrailDB.cursor()`.

        .. code-block:: python

          # Prints all UUIDs in a TrailDB
          import traildb
          tdb = traildb.TrailDB('blah')
          for uuid, events in tdb.trails():
            print(uuid)

        """
        if reuse_cursors:
            cursor = self.cursor(**kwds)

        if selected_uuids is not None:
            for uuid in selected_uuids:
                try:
                    i = self.get_trail_id(uuid)
                except IndexError:
                    continue

                if not reuse_cursors:
                    cursor = self.cursor(**kwds)

                cursor.get_trail(i)
                yield uuid, cursor
        else:
            for i in range(len(self)):
                if not reuse_cursors:
                    cursor = self.cursor(**kwds)

                cursor.get_trail(i)
                yield self.get_uuid(i), cursor

    def trail(self,
              trail_id,
              parsetime=False,
              rawitems=False,
              only_timestamp=False,
              event_filter=None):
        """Return a cursor over a single trail.

        :param trail_id: Trail ID to use.
        :param parsetime: If True, returns datetime objects instead of integer timestamps.
        :param rawitems: Return raw integer items instead of stringified values. Using raw items is usually a bit more efficient than using string values.
        :param only_timestamp: If True, only return timestamps, not event objects.
        :param event_filter: Apply given event filter to the cursor.
        :returns: A :py:class:`~traildb.TrailDBCursor` to given Trail ID.

        This function can throw :py:class:`~traildb.TrailDBError` if Trail ID is not
        present in the TrailDB.
        """
        cursor = lib.tdb_cursor_new(self._db)
        if lib.tdb_get_trail(cursor, trail_id) != 0:
            # No TrailDBCursor owns the native cursor yet, so release it here
            # instead of leaking it.
            lib.tdb_cursor_free(cursor)
            raise TrailDBError("Failed to create cursor")

        if isinstance(event_filter, TrailDBEventFilter):
            event_filter_obj = event_filter
        elif event_filter:
            event_filter_obj = self.create_filter(event_filter)
        else:
            event_filter_obj = None

        valuefun = None if rawitems else self.get_item_value
        return TrailDBCursor(cursor,
                             self._event_cls,
                             valuefun,
                             parsetime,
                             only_timestamp,
                             event_filter_obj)

    def cursor(self, *args, **kwargs):
        """Alias for :py:meth:`~traildb.TrailDB.trail` with ``trail_id=0``"""
        return self.trail(0, *args, **kwargs)

    def field(self, fieldish):
        """:returns: a field ID given a field name.

        Anything that is not a string is assumed to already be a field ID and
        is returned unchanged. An unknown field name raises ``ValueError``.

        .. code-block:: python

          import traildb
          tdb = traildb.TrailDB('blah.tdb')
          print(tdb.field('type'))
        """
        if isinstance(fieldish, basestring):
            return self.fields.index(fieldish)
        return fieldish

    def lexicon(self, fieldish):
        """:returns: an iterator over values of the given field ID or field name."""
        field = self.field(fieldish)
        # iteration starts at value ID 1; ID 0 is not yielded
        return (self.get_value(field, i)
                for i in range(1, self.lexicon_size(field)))

    def lexicon_size(self, fieldish):
        """:returns: The number of distinct values in the given field ID or field name. (i.e. cardinality of a field in the TrailDB)"""
        field = self.field(fieldish)
        value = lib.tdb_lexicon_size(self._db, field)
        if value == 0:
            raise TrailDBError("Invalid field index")
        return value

    def get_item(self, fieldish, value):
        """:returns: The item corresponding to a field ID or a field name and a string value.

        :raises TrailDBError: if the value does not exist in the field.

        .. code-block:: python

          import traildb
          tdb = traildb.TrailDB('blah.tdb')
          print(tdb.get_item('type', 'click'))

        """
        field = self.field(fieldish)
        # The length handed to the library must be the size of the encoded
        # buffer, which differs from len(value) for non-ASCII text.
        encoded = value if isinstance(value, bytes) else value.encode(CODEC)
        item = lib.tdb_get_item(self._db,
                                field,
                                encoded,
                                len(encoded))
        if not item:
            raise TrailDBError("No such value: '%s'" % value)
        return item

    def _slice_value(self, value):
        # Cut the returned buffer to the length the library wrote into
        # _uint64_ptr, decoding it when self.decode is set.
        raw = value[0:self._uint64_ptr.contents.value]
        if self.decode:
            return raw.decode(CODEC)
        return raw

    def get_item_value(self, item):
        """:returns: The string value corresponding to an item.

        .. code-block:: python

          import traildb
          tdb = traildb.TrailDB('blah.tdb')

          # This should print 'click' (if TrailDB contains 'type' field and 'click' values in that field).
          print(tdb.get_item_value(tdb.get_item('type', 'click')))

        """
        value = lib.tdb_get_item_value(self._db, item, self._uint64_ptr)
        if value is None:
            raise TrailDBError("Error reading value, error: %s" %
                               lib.tdb_error(self._db))
        return self._slice_value(value)

    def get_value(self, fieldish, val):
        """:returns: The string value corresponding to a field ID or a field name and a value ID."""
        field = self.field(fieldish)
        value = lib.tdb_get_value(self._db, field, val, self._uint64_ptr)
        if value is None:
            raise TrailDBError("Error reading value, error: %s" %
                               lib.tdb_error(self._db))
        return self._slice_value(value)

    def get_uuid(self, trail_id, raw=False):
        """
        :param trail_id: The Trail ID to give UUID for.
        :param raw: If true, returns 16-byte binary string for UUID instead of hexified UUID.
        :returns: UUID given a Trail ID.
        :raises IndexError: if the Trail ID is out of range.
        """
        uuid = lib.tdb_get_uuid(self._db, trail_id)
        if uuid:
            if raw:
                return string_at(uuid, 16)
            else:
                return uuid_hex(uuid)
        raise IndexError("Trail ID out of range")

    def get_trail_id(self, uuid):
        """:returns: Trail ID given a UUID.

        This is the reverse of :py:meth:`traildb.TrailDB.get_uuid`.

        :raises IndexError: if the UUID is not found.
        """
        ret = lib.tdb_get_trail_id(self._db, uuid_raw(uuid), self._uint64_ptr)
        if ret:
            raise IndexError("UUID '%s' not found" % uuid)
        return self._uint64_ptr.contents.value

    def time_range(self, parsetime=False):
        """:returns: The time range covered by this TrailDB as a ``(min, max)`` pair.

        :param parsetime: If True, return the range as datetime objects
          instead of integer timestamps.
        """
        tmin = self.min_timestamp()
        tmax = self.max_timestamp()
        if parsetime:
            return datetime.fromtimestamp(tmin), datetime.fromtimestamp(tmax)
        return tmin, tmax

    def min_timestamp(self):
        """:returns: The minimum time stamp of this TrailDB."""
        return lib.tdb_min_timestamp(self._db)

    def max_timestamp(self):
        """:returns: The maximum time stamp of this TrailDB."""
        return lib.tdb_max_timestamp(self._db)

    def create_filter(self, event_filter):
        """:returns: :py:class:`~traildb.TrailDBEventFilter` object created from this TrailDB."""
        return TrailDBEventFilter(self, event_filter)

    def _apply_uuid_filter(self, default_filter, listed_filter, uuids):
        # Install default_filter for the whole TrailDB, then override it with
        # listed_filter on every trail whose UUID is found.
        # NOTE(review): the native filters are never freed here; presumably
        # they must stay alive while the options reference them - confirm
        # ownership before adding cleanup.
        lib.tdb_set_opt(self._db,
                        TDB_OPT_EVENT_FILTER,
                        tdb_opt_value(ptr=default_filter))

        value = tdb_opt_value(ptr=listed_filter)
        for uuid in uuids:
            try:
                trail_id = self.get_trail_id(uuid)
            except IndexError:
                # UUIDs that are not in this TrailDB are skipped
                continue
            lib.tdb_set_trail_opt(self._db,
                                  trail_id,
                                  TDB_OPT_EVENT_FILTER,
                                  value)

    def apply_whitelist(self, uuids):
        """
        Applies a whitelist of UUIDs to TrailDB so that further calls to
        :py:meth:`~traildb.TrailDB.trails` do not return any events for UUIDs that
        have not been whitelisted with this call.
        """
        empty_filter = lib.tdb_event_filter_new_match_none()
        all_filter = lib.tdb_event_filter_new_match_all()
        self._apply_uuid_filter(empty_filter, all_filter, uuids)

    def apply_blacklist(self, uuids):
        """
        Applies a blacklist of UUIDs to TrailDB so that further calls to
        :py:meth:`~traildb.TrailDB.trails` do not return any events for the blacklisted UUIDs.
        """
        empty_filter = lib.tdb_event_filter_new_match_none()
        all_filter = lib.tdb_event_filter_new_match_all()
        self._apply_uuid_filter(all_filter, empty_filter, uuids)
864 | 
865 | 
class TrailDBEventFilter(object):
    """
    Converts a query defined in terms of Python collections to a
    `tdb_event_filter` which can be passed to various TrailDB functions.
    Performs some validation when parsing the query.

    Queries are boolean expressions defined from terms and clauses.  A term is
    defined using a tuple:

    .. code-block:: python

      (field_name, "value")        # match records with field_name == "value"
      (field_name, "value", False) # match records with field_name == "value"
      (field_name, "value", True)  # match records with field_name != "value"
      (start_time, end_time)       # match records with start_time <= time < end_time

    Queries are in conjunctive normal form, following the TrailDB C API
    (``tdb_event_filter_add_term`` adds a term to the current OR-clause,
    ``tdb_event_filter_new_clause`` starts a clause that is ANDed with the
    previous ones).

    Clauses are boolean expressions formed from terms, which are connected with OR.
    Clauses are defined with lists of terms:

    .. code-block:: python

      [term]
      [term1, term2]
      [term1, term2, ...]

    Queries are boolean expressions formed from clauses, which are connected with AND.
    Queries are defined with lists of clauses:

    .. code-block:: python

      [clause]
      [clause1, clause2]
      [clause1, clause2, ...]

    A bare list of terms (its first element is a tuple) is accepted as a
    shorthand for a query with a single clause.

    Some complete examples:

    .. code-block:: python

      [[("user", "george_jetson")]] # Match records for the user "george_jetson"
      [[("user", "george_jetson", True)]] # Match records for users other than "george_jetson"
      [[(1501013929, 1501100260)]] # Match records between 2017-07-25 3:18 pm to  2017-07-26 3:18 pm
      [[("job_title", "manager"), ("user", "george_jetson")]] # Match records for the user "george_jetson" OR with job title "manager"
      [[("job_title", "manager")], [("user", "george_jetson")]] # Match records for the user "george_jetson" AND with job title "manager"
      [[("job_title", "manager"), ("user", "george_jetson")], [(1501013929, 1501100260)]] # Match records for the user "george_jetson" OR with job title "manager", and between 2017-07-25 3:18 pm to  2017-07-26 3:18 pm
    """
    def __init__(self, db, query):
        """
        Initializes TrailDBEventFilter. You might want to use :py:meth:`traildb.TrailDB.create_filter` instead that passes ``db`` automatically.

        :param db: object whose ``get_item(field, value)`` resolves terms to items.
        :param query: list of clauses, or a single clause given as a list of term tuples.
        :raises TrailDBError: if the native filter cannot be created or extended.
        """

        self.flt = lib.tdb_event_filter_new()
        if not self.flt:
            # Without a handle every later call would operate on NULL.
            raise TrailDBError("Out of memory in _create_filter")

        if isinstance(query[0], tuple):
            query = [query]
        for i, clause in enumerate(query):
            # The freshly created filter is used for the first clause without
            # calling new_clause; every following clause needs a new one.
            if i > 0:
                err = lib.tdb_event_filter_new_clause(self.flt)
                if err:
                    raise TrailDBError("Out of memory in _create_filter")

            for term in clause:
                err = None
                # time range?
                if len(term) == 2 and isinstance(term[0], int) \
                   and isinstance(term[1], int):
                    start_time, end_time = term
                    err = lib.tdb_event_filter_add_time_range(self.flt,
                                                              start_time,
                                                              end_time)
                else:
                    is_negative = False
                    if len(term) == 3:
                        field, value, is_negative = term
                    else:
                        field, value = term
                    try:
                        item = db.get_item(field, value)
                    except (TrailDBError, ValueError):
                        # unknown field or value: fall back to item 0 rather
                        # than rejecting the whole query
                        item = 0
                    err = lib.tdb_event_filter_add_term(self.flt,
                                                        item,
                                                        1 if is_negative else 0)
                if err:
                    raise TrailDBError("Out of memory in _create_filter")

    def __del__(self):
        # flt is missing or empty when __init__ failed before a native filter
        # existed; there is nothing to free in that case.
        flt = getattr(self, 'flt', None)
        if flt:
            lib.tdb_event_filter_free(flt)
952 | 


--------------------------------------------------------------------------------
/travisdeps.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Get up to date libjudy
 4 | sudo apt-get update
 5 | sudo apt-get install -y libarchive-dev pkg-config build-essential
 6 | wget https://mirrors.kernel.org/ubuntu/pool/universe/j/judy/libjudy-dev_1.0.5-5_amd64.deb \
 7 |      https://mirrors.kernel.org/ubuntu/pool/universe/j/judy/libjudydebian1_1.0.5-5_amd64.deb
 8 | sudo dpkg -i libjudy-dev_1.0.5-5_amd64.deb libjudydebian1_1.0.5-5_amd64.deb
 9 | 
10 | # compile dependency in /opt/traildb/traildb
11 | 
12 | mkdir -p /opt/traildb
13 | cd /opt/traildb
14 | 
15 | # shallow-ish copy of master branch of traildb/traildb
16 | git clone --depth=50 https://github.com/traildb/traildb
17 | 
18 | # build traildb so
19 | cd /opt/traildb/traildb
20 | sudo ./waf configure
21 | # actually needs root permissions to install into /usr/local
22 | sudo ./waf install
23 | 


--------------------------------------------------------------------------------