├── .travis.yml ├── LICENSE ├── README.rst ├── conf └── settings.yml ├── dev ├── reset-trough-dev.sh ├── start-trough-dev.sh ├── status-trough-dev.sh └── stop-trough-dev.sh ├── requirements.txt ├── scripts ├── garbage_collector.py ├── reader.py ├── sync.py ├── udptee.py └── writer.py ├── setup.py ├── tests ├── Dockerfile ├── __init__.py ├── run_tests.sh ├── test.conf ├── test_read.py ├── test_settings.py ├── test_sync.py ├── test_write.py └── wsgi │ ├── __init__.py │ └── test_segment_manager.py └── trough ├── __init__.py ├── client.py ├── db_api.py ├── read.py ├── settings.py ├── shell └── __init__.py ├── sync.py ├── write.py └── wsgi ├── __init__.py └── segment_manager.py /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: xenial 2 | language: python 3 | python: 4 | - 2.7 5 | - 3.6 6 | - 3.5 7 | - 3.4 8 | - 3.7 9 | - 3.8-dev 10 | - nightly 11 | - pypy 12 | - pypy3 13 | matrix: 14 | allow_failures: 15 | - python: 2.7 16 | - python: pypy 17 | - python: pypy3 18 | - python: nightly 19 | - python: 3.4 20 | - python: 3.8-dev 21 | - python: 3.8-dev 22 | 23 | services: 24 | - docker 25 | before_install: 26 | - sudo service docker restart ; sleep 10 # https://github.com/travis-ci/travis-ci/issues/4778 27 | - docker run -d --publish=28015:28015 rethinkdb 28 | - docker run -d --publish=8020:8020 --publish=50070:50070 --publish=50010:50010 --publish=50020:50020 --publish=50075:50075 chalimartines/cdh5-pseudo-distributed 29 | - sudo apt-get -y install libcurl3 libgsasl7 libntlm0 30 | - curl -sSLvO https://github.com/nlevitt/libhdfs3-deb/raw/master/libhdfs3_1-1.deb 31 | - sudo dpkg -i libhdfs3_1-1.deb 32 | 33 | install: 34 | - pip install -e . 
--no-input --upgrade
  - pip install pytest

before_script:
  ### # https://docs.docker.com/docker-for-mac/networking/#use-cases-and-workarounds
  ### # see "I WANT TO CONNECT TO A CONTAINER FROM THE MAC" (you can't)
  ### hadoop_container_ip=$(docker exec -it hadoop ifconfig eth0 | egrep -o 'addr:[^ ]+' | awk -F: '{print $2}')
  ### sudo ifconfig lo0 alias $hadoop_container_ip
  - 'sync.py >>/tmp/trough-sync-local.out 2>&1 &'
  - sleep 5
  - python -c "import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings['RETHINKDB_HOSTS']) ; rr.db('trough_configuration').wait().run()"
  - 'uwsgi --http :6444 --master --processes=2 --harakiri=3200 --http-timeout=3200 --socket-timeout=3200 --max-requests=50000 --vacuum --die-on-term --wsgi-file scripts/reader.py >>/tmp/trough-read.out 2>&1 &'
  - 'uwsgi --http :6222 --master --processes=2 --harakiri=240 --http-timeout=240 --max-requests=50000 --vacuum --die-on-term --wsgi-file scripts/writer.py >>/tmp/trough-write.out 2>&1 &'
  - 'sync.py --server >>/tmp/trough-sync-server.out 2>&1 &'
  # NOTE(review): fixed "--http-timeout==7200" (double "=") on the two commands
  # below so the option is actually parsed, matching dev/start-trough-dev.sh
  - 'uwsgi --http :6112 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:local >>/tmp/trough-segment-manager-local.out 2>&1 &'
  - 'uwsgi --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1 &'

script:
  - py.test --tb=native -v tests

after_script:
  - cat /tmp/trough-sync-local.out
  - cat /tmp/trough-read.out
  - cat /tmp/trough-write.out
  - cat /tmp/trough-segment-manager-local.out
  - cat /tmp/trough-segment-manager-server.out

notifications:
  slack: internetarchive:PLZQTqR7RpyGNr1jb1TgMqhK

-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2016, jkafader 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://travis-ci.org/internetarchive/trough.svg?branch=master 2 | :target: https://travis-ci.org/internetarchive/trough 3 | 4 | ======= 5 | Trough 6 | ======= 7 | 8 | Big data, small databases. 
==========================

Big data is really just lots and lots of little data.

If you split a large dataset into lots of small SQL databases sharded on a well-chosen key,
they can work in concert to create a database system that can query very large datasets.

Worst-case Performance is *important*
=====================================

A key insight when working with large datasets is that with monolithic big data tools, performance
is largely tied to having a full dataset completely loaded and working in a
production-quality cluster.

Trough is designed to have very predictable performance characteristics: simply determine your sharding key,
determine your largest shard, load it into a sqlite database locally, and you already know your worst-case
performance scenario.

Designed to leverage storage, not RAM
=====================================

Rather than having huge CPU and memory requirements to deliver performant queries over large datasets,
Trough relies on flat sqlite files, which are easily distributed to a cluster and queried against.

Reliable parts, reliable whole
==============================

Each piece of technology in the stack was carefully selected and load tested to ensure that your data stays
reliably up and reliably queryable. The code is small enough for one programmer to audit.

Ease of installation
====================

One of the worst parts of setting up a big data system generally is setting sensible defaults and
deploying it to staging and production environments. Trough has been designed to require as little
configuration as possible.

An example ansible deployment specification has been removed from the trough
repo but can be found at https://github.com/internetarchive/trough/tree/cc32d3771a7/ansible.
It is designed for a cluster of Ubuntu 16.04 Xenial nodes.

--------------------------------------------------------------------------------
/conf/settings.yml:
--------------------------------------------------------------------------------
HDFS_PATH: /ait/prod/trough/
HDFS_HOST: localhost
HDFS_PORT: 6000
MINIMUM_ASSIGNMENTS: "lambda segment_id: 2 if segment_id.isnumeric() and int(segment_id) > 200000 else 1"
COLD_STORE_SEGMENT: "lambda segment_id: segment_id.isnumeric() and int(segment_id) < 600000"
RUN_AS_COLD_STORAGE_NODE: True
COLD_STORAGE_PATH: "/var/trough/cold_storage/{prefix}/{segment_id}"
--------------------------------------------------------------------------------
/dev/reset-trough-dev.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [ -z "$VIRTUAL_ENV" ] ; then
    echo '$VIRTUAL_ENV is not set (please activate your trough virtualenv)'
    exit 1
fi

python -c 'import trough'
if [ $? -ne 0 ]; then
    echo "trough module could not be imported. Are you in the right virtualenv?"
11 | exit 1 12 | fi 13 | 14 | script_dir=$(dirname $VIRTUAL_ENV)/dev 15 | $script_dir/stop-trough-dev.sh 16 | 17 | rm -vrf /tmp/trough-*.out 18 | rm -vrf /var/tmp/trough 19 | python -c "import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings['RETHINKDB_HOSTS']) ; print(rr.db_drop('trough_configuration').run())" 20 | python -c "from hdfs3 import HDFileSystem ; from trough.settings import settings ; hdfs = HDFileSystem(host=settings['HDFS_HOST'], port=settings['HDFS_PORT']) ; hdfs.rm(settings['HDFS_PATH'])" 21 | -------------------------------------------------------------------------------- /dev/start-trough-dev.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$VIRTUAL_ENV" ] ; then 4 | echo '$VIRTUAL_ENV is not set (please activate your trough virtualenv)' 5 | exit 1 6 | fi 7 | 8 | python -c 'import trough' 9 | if [ $? -ne 0 ]; then 10 | echo "trough module could not be imported. Are you in the right virtualenv?" 
11 | exit 1 12 | fi 13 | 14 | source $VIRTUAL_ENV/bin/activate 15 | 16 | set -x 17 | 18 | rethinkdb >>/tmp/rethinkdb.log 2>&1 & 19 | docker run --detach --rm --name=hadoop --publish=8020:8020 --publish=50070:50070 --publish=50010:50010 --publish=50020:50020 --publish=50075:50075 chalimartines/cdh5-pseudo-distributed && sleep 30 20 | 21 | # XXX mac-specific hack 22 | # https://docs.docker.com/docker-for-mac/networking/#use-cases-and-workarounds 23 | # see "I WANT TO CONNECT TO A CONTAINER FROM THE MAC" (you can't) 24 | hadoop_container_ip=$(docker exec -it hadoop ifconfig eth0 | egrep -o 'addr:[^ ]+' | awk -F: '{print $2}') 25 | sudo ifconfig lo0 alias $hadoop_container_ip 26 | sudo ifconfig lo0 alias 127.0.0.1 27 | 28 | export TROUGH_LOG_LEVEL=DEBUG 29 | 30 | $VIRTUAL_ENV/bin/sync.py >>/tmp/trough-sync-local.out 2>&1 & 31 | sleep 0.5 32 | python -c " 33 | import doublethink 34 | from trough.settings import settings 35 | from rethinkdb.errors import ReqlOpFailedError 36 | 37 | rr = doublethink.Rethinker(settings['RETHINKDB_HOSTS']) 38 | while True: 39 | try: 40 | rr.db('trough_configuration').wait().run() 41 | rr.db('trough_configuration').table('assignment').wait().run() 42 | rr.db('trough_configuration').table('lock').wait().run() 43 | rr.db('trough_configuration').table('schema').wait().run() 44 | rr.db('trough_configuration').table('services').wait().run() 45 | break 46 | except ReqlOpFailedError as e: 47 | pass 48 | " 49 | 50 | uwsgi --venv=$VIRTUAL_ENV --http :6444 --master --processes=2 --harakiri=3200 --http-timeout=3200 --socket-timeout=3200 --max-requests=50000 --vacuum --die-on-term --wsgi-file $VIRTUAL_ENV/bin/reader.py >>/tmp/trough-read.out 2>&1 & 51 | uwsgi --venv=$VIRTUAL_ENV --http :6222 --master --processes=2 --harakiri=240 --http-timeout=240 --max-requests=50000 --vacuum --die-on-term --wsgi-file $VIRTUAL_ENV/bin/writer.py >>/tmp/trough-write.out 2>&1 & 52 | $VIRTUAL_ENV/bin/sync.py --server >>/tmp/trough-sync-server.out 2>&1 & 53 | uwsgi 
--venv=$VIRTUAL_ENV --http :6112 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:local >>/tmp/trough-segment-manager-local.out 2>&1 & 54 | uwsgi --venv=$VIRTUAL_ENV --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1 & 55 | -------------------------------------------------------------------------------- /dev/status-trough-dev.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$VIRTUAL_ENV" ] ; then 4 | echo '$VIRTUAL_ENV is not set (please activate your trough virtualenv)' 5 | exit 1 6 | fi 7 | 8 | python -c 'import trough' 9 | if [ $? -ne 0 ]; then 10 | echo "trough module could not be imported. Are you in the right virtualenv?" 11 | exit 1 12 | fi 13 | 14 | for svc in $VIRTUAL_ENV/bin/reader.py $VIRTUAL_ENV/bin/writer.py trough.wsgi.segment_manager:local trough.wsgi.segment_manager:server $VIRTUAL_ENV/bin/sync.py ; 15 | do 16 | echo === $svc === 17 | pids=$(pgrep -f $svc) 18 | if [ -n "$pids" ] ; then 19 | ps $pids 20 | else 21 | echo not running 22 | fi 23 | done 24 | 25 | -------------------------------------------------------------------------------- /dev/stop-trough-dev.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$VIRTUAL_ENV" ] ; then 4 | echo '$VIRTUAL_ENV is not set (please activate your trough virtualenv)' 5 | exit 1 6 | fi 7 | 8 | python -c 'import trough' 9 | if [ $? -ne 0 ]; then 10 | echo "trough module could not be imported. Are you in the right virtualenv?" 
11 | exit 1 12 | fi 13 | 14 | pkill -f $VIRTUAL_ENV/bin/reader.py 15 | pkill -f $VIRTUAL_ENV/bin/writer.py 16 | pkill -f $VIRTUAL_ENV/bin/sync.py 17 | pkill -f trough.wsgi.segment_manager:local 18 | pkill -f trough.wsgi.segment_manager:server 19 | 20 | # XXX see start-trough-dev.sh 21 | hadoop_container_ip=$(docker exec -it hadoop ifconfig eth0 | egrep -o 'addr:[^ ]+' | awk -F: '{print $2}') 22 | sudo ifconfig lo0 -alias $hadoop_container_ip 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | protobuf==3.7.1 2 | PyYAML==5.1 3 | requests==2.21.0 4 | six==1.10.0 5 | snakebite-py3==3.0.1 6 | ujson-ia>=2.1.1 7 | sqlparse==0.2.2 8 | uWSGI==2.0.15 9 | git+https://github.com/internetarchive/doublethink.git@master 10 | -------------------------------------------------------------------------------- /scripts/garbage_collector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import trough 3 | from trough.settings import settings 4 | import logging 5 | import time 6 | 7 | if __name__ == '__main__': 8 | controller = trough.sync.get_controller(False) 9 | controller.check_config() 10 | controller.collect_garbage() 11 | -------------------------------------------------------------------------------- /scripts/reader.py: -------------------------------------------------------------------------------- 1 | import trough 2 | from trough.settings import settings, init_worker 3 | 4 | trough.settings.configure_logging() 5 | 6 | init_worker() 7 | 8 | # setup uwsgi endpoint 9 | application = trough.read.ReadServer() 10 | 11 | if __name__ == '__main__': 12 | from wsgiref.simple_server import make_server 13 | server = make_server('', 6444, application) 14 | server.serve_forever() 15 | 16 | -------------------------------------------------------------------------------- /scripts/sync.py: 
# --------------------------------------------------------------------------------
# /scripts/sync.py
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
import trough
from trough.settings import settings
import logging
import time
import datetime
import sys

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description='Run a "server" sync process, which controls other sync processes, '
                    'or a "local" sync process, which loads segments onto the current '
                    'machine and performs health checks.')
    parser.add_argument('--server', dest='server', action='store_true',
        help='run in server mode: control the actions of other local synchronizers.')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()

    # drop any handlers configured elsewhere so all output goes to stdout
    logging.root.handlers = []
    logging.basicConfig(
        stream=sys.stdout,
        level=logging.DEBUG if args.verbose else logging.INFO, format=(
            '%(asctime)s %(levelname)s %(name)s.%(funcName)s'
            '(%(filename)s:%(lineno)d) %(message)s'))
    # quiet chatty third-party loggers
    logging.getLogger('requests.packages.urllib3').setLevel(logging.WARNING)
    logging.getLogger('urllib3').setLevel(logging.WARNING)
    logging.getLogger('asyncio').setLevel(logging.WARNING)
    logging.getLogger('snakebite').setLevel(logging.INFO)

    controller = trough.sync.get_controller(args.server)
    controller.start()
    controller.check_config()
    # main loop: health-check, sync, then sleep out the remainder of the
    # configured SYNC_LOOP_TIMING period (never a negative duration)
    while True:
        controller.check_health()
        started = datetime.datetime.now()
        controller.sync()
        if not args.server:
            # only local synchronizers garbage-collect their own segments
            controller.collect_garbage()
        loop_duration = datetime.datetime.now() - started
        sleep_time = max(
            settings['SYNC_LOOP_TIMING'] - loop_duration.total_seconds(), 0)
        # lazy %-args instead of eager string interpolation
        logging.info('Sleeping for %s seconds', round(sleep_time))
        time.sleep(sleep_time)
# --------------------------------------------------------------------------------
# /scripts/udptee.py:
# --------------------------------------------------------------------------------
# /scripts/udptee.py
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
'''
udptee.py - like `tee` but duplicates output to a udp destination instead of a
file
'''

import argparse
import sys
import os
import socket

def main(argv=None):
    '''
    Copy stdin to stdout, duplicating every line to each "host:port" UDP
    destination named in argv[1:].

    argv defaults to ['udptee.py'] when not supplied; the previous signature
    used a mutable list as the default argument, a python anti-pattern.
    '''
    if argv is None:
        argv = ['udptee.py']
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]), description=(
            'like `tee` but duplicates output to a udp destination '
            'instead of a file'))
    arg_parser.add_argument(
        metavar='ADDRESS', dest='addresses', nargs='+', help=(
            'destination address "host:port"'))
    args = arg_parser.parse_args(args=argv[1:])

    # parse "host:port" strings into (host, int(port)) tuples
    addrs = []
    for address in args.addresses:
        host, port = address.split(':')
        port = int(port)
        addrs.append((host, port))

    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

    # unbuffered binary stdin/stdout so bytes pass through untouched
    stdin = open(0, mode='rb', buffering=0)
    stdout = open(1, mode='wb', buffering=0)

    while True:
        line = stdin.readline()
        if not line:
            break
        stdout.write(line)
        for addr in addrs:
            # 1400 byte chunks to avoid EMSGSIZE
            for chunk in (line[i*1400:(i+1)*1400]
                          for i in range((len(line) - 1) // 1400 + 1)):
                sock.sendto(chunk, addr)

if __name__ == '__main__':
    main(sys.argv)

# --------------------------------------------------------------------------------
# /scripts/writer.py
# --------------------------------------------------------------------------------
import trough
from trough.settings import settings, init_worker

trough.settings.configure_logging()

init_worker()

# setup uwsgi endpoint
application = trough.write.WriteServer()

# --------------------------------------------------------------------------------
# /setup.py (fragment continues in the next chunk of this dump):
#   from setuptools import setup
#   import glob
#   setup(
name='Trough', 6 | version='0.2.2', 7 | packages=[ 8 | 'trough', 9 | 'trough.shell', 10 | 'trough.wsgi', 11 | ], 12 | maintainer='James Kafader', 13 | maintainer_email='jkafader@archive.org', 14 | url='https://github.com/internetarchive/trough', 15 | license='BSD', 16 | long_description=open('README.rst').read(), 17 | classifiers=[ 18 | 'Development Status :: 4 - Beta', 19 | 'Topic :: Database :: Database Engines/Servers', 20 | 'License :: OSI Approved :: BSD License', 21 | ], 22 | install_requires=[ 23 | 'protobuf>=3.7.1,<4', 24 | 'PyYAML>=5.1', 25 | 'requests>=2.21.0', 26 | 'six>=1.10.0', 27 | 'snakebite-py3>=3.0', 28 | 'ujson-ia>=2.1.1', 29 | 'sqlparse>=0.2.2', 30 | 'uWSGI>=2.0.15', 31 | 'doublethink>=0.2.0', 32 | 'uhashring>=0.7,<1.0', 33 | 'flask>=1.0.2,<2', 34 | 'sqlitebck>=1.4', 35 | 'hdfs3>=0.2.0', 36 | 'aiodns>=1.2.0', 37 | 'aiohttp>=2.3.10,<=3.0.0b0', # >3.0.0b0 requires python 3.5.3+ 38 | 'async-timeout<3.0.0', # >=3.0.0 requires python 3.5.3+ 39 | ], 40 | tests_require=['pytest'], 41 | scripts=glob.glob('scripts/*.py'), 42 | entry_points={'console_scripts': ['trough-shell=trough.shell:trough_shell']} 43 | ) 44 | -------------------------------------------------------------------------------- /tests/Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Dockerfile for trough tests 3 | # 4 | # Copyright (C) 2015-2017 Internet Archive 5 | # 6 | # This program is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License 8 | # as published by the Free Software Foundation; either version 2 9 | # of the License, or (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 
15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program; if not, write to the Free Software 18 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 19 | # USA. 20 | # 21 | 22 | FROM phusion/baseimage 23 | 24 | # see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile 25 | # and https://github.com/chali/hadoop-cdh-pseudo-docker/blob/master/Dockerfile 26 | 27 | ENV LANG=C.UTF-8 28 | 29 | RUN apt-get update && apt-get --auto-remove -y dist-upgrade 30 | 31 | # Add the RethinkDB repository and public key 32 | RUN curl -s https://download.rethinkdb.com/apt/pubkey.gpg | apt-key add - \ 33 | && echo "deb http://download.rethinkdb.com/apt xenial main" > /etc/apt/sources.list.d/rethinkdb.list \ 34 | && apt-get update && apt-get -y install rethinkdb 35 | 36 | RUN mkdir -vp /etc/service/rethinkdb \ 37 | && echo "#!/bin/bash\nexec rethinkdb --bind 0.0.0.0 --directory /tmp/rethink-data --runuser rethinkdb --rungroup rethinkdb\n" > /etc/service/rethinkdb/run \ 38 | && chmod a+x /etc/service/rethinkdb/run 39 | 40 | RUN apt-get -y install git 41 | RUN apt-get -y install libpython2.7-dev libpython3-dev libffi-dev libssl-dev \ 42 | python-setuptools python3-setuptools 43 | RUN apt-get -y install gcc 44 | 45 | RUN echo '57ff41e99cb01b6a1c2b0999161589b726f0ec8b /tmp/pip-9.0.1.tar.gz' > /tmp/sha1sums.txt 46 | RUN curl -sSL -o /tmp/pip-9.0.1.tar.gz https://pypi.python.org/packages/11/b6/abcb525026a4be042b486df43905d6893fb04f05aac21c32c638e939e447/pip-9.0.1.tar.gz 47 | RUN sha1sum -c /tmp/sha1sums.txt 48 | RUN tar -C /tmp -xf /tmp/pip-9.0.1.tar.gz 49 | RUN cd /tmp/pip-9.0.1 && python3 setup.py install 50 | 51 | RUN pip install virtualenv 52 | 53 | # hadoop hdfs for trough 54 | RUN curl -s https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/archive.key | apt-key add - \ 55 | && . 
/etc/lsb-release \ 56 | && echo "deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/$DISTRIB_CODENAME/amd64/cdh $DISTRIB_CODENAME-cdh5 contrib" >> /etc/apt/sources.list.d/cloudera.list 57 | 58 | RUN apt-get update 59 | RUN apt-get install -y openjdk-8-jdk hadoop-conf-pseudo 60 | 61 | RUN su hdfs -c 'hdfs namenode -format' 62 | 63 | RUN mv -v /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/core-site.xml.orig \ 64 | && cat /etc/hadoop/conf/core-site.xml.orig | sed 's,localhost:8020,0.0.0.0:8020,' > /etc/hadoop/conf/core-site.xml 65 | 66 | RUN mv -v /etc/hadoop/conf/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml.orig \ 67 | && cat /etc/hadoop/conf/hdfs-site.xml.orig | sed 's,^$, \n dfs.permissions.enabled\n false\n \n,' > /etc/hadoop/conf/hdfs-site.xml 68 | 69 | RUN echo '#!/bin/bash\nservice hadoop-hdfs-namenode start\nservice hadoop-hdfs-datanode start' > /etc/my_init.d/50_start_hdfs.sh \ 70 | && chmod a+x /etc/my_init.d/50_start_hdfs.sh 71 | 72 | RUN apt-get -y install libsqlite3-dev 73 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/trough/0c6243e0ec4731ce5bb61c15aa7993ac57b692fe/tests/__init__.py -------------------------------------------------------------------------------- /tests/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 6 | 7 | docker build -t internetarchive/trough-tests $script_dir 8 | 9 | docker run --rm -it --volume="$script_dir/..:/trough" internetarchive/trough-tests /sbin/my_init -- bash -c \ 10 | $'bash -x -c "cd /tmp && git clone /trough \ 11 | && cd /tmp/trough \ 12 | && (cd /trough && git diff HEAD) | patch -p1 \ 13 | && virtualenv -p python3 /tmp/venv \ 14 | && source /tmp/venv/bin/activate \ 15 | && pip install 
pytest -e /trough --no-input --upgrade" \ 16 | && bash -x -c "source /tmp/venv/bin/activate \ 17 | && sync.py >>/tmp/trough-sync-local.out 2>&1 &" \ 18 | && bash -x -c "source /tmp/venv/bin/activate \ 19 | && sleep 5 \ 20 | && python -c \\"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\'RETHINKDB_HOSTS\']) ; rr.db(\'trough_configuration\').wait().run()\\"" \ 21 | && bash -x -c "source /tmp/venv/bin/activate \ 22 | && sync.py --server >>/tmp/trough-sync-server.out 2>&1 &" \ 23 | && bash -x -c "source /tmp/venv/bin/activate \ 24 | && uwsgi --daemonize2 --venv=/tmp/venv --http :6444 --master --processes=2 --harakiri=3200 --http-timeout=3200 --socket-timeout=3200 --max-requests=50000 --vacuum --die-on-term --wsgi-file /tmp/venv/bin/reader.py >>/tmp/trough-read.out 2>&1 \ 25 | && uwsgi --daemonize2 --venv=/tmp/venv --http :6222 --master --processes=2 --harakiri=240 --http-timeout=240 --max-requests=50000 --vacuum --die-on-term --wsgi-file /tmp/venv/bin/writer.py >>/tmp/trough-write.out 2>&1 \ 26 | && uwsgi --daemonize2 --venv=/tmp/venv --http :6112 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:local >>/tmp/trough-segment-manager-local.out 2>&1 \ 27 | && uwsgi --daemonize2 --venv=/tmp/venv --http :6111 --master --processes=2 --harakiri=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1 \ 28 | && cd /tmp/trough \ 29 | && py.test -v tests"' 30 | -------------------------------------------------------------------------------- /tests/test.conf: -------------------------------------------------------------------------------- 1 | TEST_SETTING: test__setting__value 2 | HOSTNAME: test01 3 | EXTERNAL_IP: 127.0.0.1 4 | MINIMUM_ASSIGNMENTS: "lambda segment_id: 2 if segment_id.isnumeric() and int(segment_id) > 200000 else 1" 5 | HDFS_PATH: /tmp/trough 6 
| ELECTION_CYCLE: 0.01 # Wait 0.01s between running elections. Keeps this test from taking a long time. 7 | HOST_CHECK_WAIT_PERIOD: 0.01 # Wait 0.01s between checking if any hosts have joined the cluster. Keeps this test from taking a long time. 8 | -------------------------------------------------------------------------------- /tests/test_read.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TROUGH_SETTINGS'] = os.path.join(os.path.dirname(__file__), "test.conf") 3 | 4 | import unittest 5 | from unittest import mock 6 | import trough 7 | import json 8 | import sqlite3 9 | from tempfile import NamedTemporaryFile 10 | from trough import sync 11 | from trough.settings import settings 12 | import doublethink 13 | 14 | class TestReadServer(unittest.TestCase): 15 | def setUp(self): 16 | self.server = trough.read.ReadServer() 17 | def test_empty_read(self): 18 | database_file = NamedTemporaryFile() 19 | connection = sqlite3.connect(database_file.name) 20 | cursor = connection.cursor() 21 | cursor.execute('CREATE TABLE test (id INTEGER PRIMARY KEY AUTOINCREMENT, test varchar(4));') 22 | # no inserts! 
23 | connection.commit() 24 | 25 | segment = mock.Mock() 26 | segment.local_path = lambda: database_file.name 27 | 28 | output = b"" 29 | for part in self.server.sql_result_json_iter( 30 | self.server.execute_query(segment, b'SELECT * FROM "test";')): 31 | output += part 32 | output = json.loads(output.decode('utf-8')) 33 | database_file.close() 34 | cursor.close() 35 | connection.close() 36 | self.assertEqual(output, []) 37 | def test_read(self): 38 | database_file = NamedTemporaryFile() 39 | connection = sqlite3.connect(database_file.name) 40 | cursor = connection.cursor() 41 | cursor.execute('CREATE TABLE test (id INTEGER PRIMARY KEY AUTOINCREMENT, test varchar(4));') 42 | cursor.execute('INSERT INTO test (test) VALUES ("test");') 43 | connection.commit() 44 | output = b"" 45 | 46 | segment = mock.Mock() 47 | segment.local_path = lambda: database_file.name 48 | 49 | for part in self.server.sql_result_json_iter( 50 | self.server.execute_query(segment, b'SELECT * FROM "test";')): 51 | output += part 52 | output = json.loads(output.decode('utf-8')) 53 | cursor.close() 54 | connection.close() 55 | database_file.close() 56 | self.assertEqual(output, [{'id': 1, 'test': 'test'}]) 57 | def test_write_failure(self): 58 | database_file = NamedTemporaryFile() 59 | connection = sqlite3.connect(database_file.name) 60 | cursor = connection.cursor() 61 | cursor.execute('CREATE TABLE test (id INTEGER PRIMARY KEY AUTOINCREMENT, test varchar(4));') 62 | cursor.execute('INSERT INTO test (test) VALUES ("test");') 63 | connection.commit() 64 | output = b"" 65 | 66 | segment = mock.Mock() 67 | segment.segment_path = lambda: database_file.name 68 | 69 | with self.assertRaises(Exception): 70 | for item in self.server.read(segment, b'INSERT INTO test (test) VALUES ("test");'): 71 | print("item:", item) 72 | database_file.close() 73 | cursor.close() 74 | connection.close() 75 | @mock.patch("trough.read.requests") 76 | def test_proxy_for_write_segment(self, requests): 77 | def post(*args, 
**kwargs): 78 | response = mock.Mock() 79 | response.headers = {"Content-Type": "application/json"} 80 | response.iter_content = lambda: (b"test", b"output") 81 | response.status_code = 200 82 | response.__enter__ = lambda *args, **kwargs: response 83 | response.__exit__ = lambda *args, **kwargs: None 84 | return response 85 | requests.post = post 86 | consul = mock.Mock() 87 | registry = mock.Mock() 88 | rethinker = doublethink.Rethinker(db="trough_configuration", servers=settings['RETHINKDB_HOSTS']) 89 | services = doublethink.ServiceRegistry(rethinker) 90 | segment = trough.sync.Segment(segment_id="TEST", rethinker=rethinker, services=services, registry=registry, size=0) 91 | output = self.server.proxy_for_write_host('localhost', segment, "SELECT * FROM mock;", start_response=lambda *args, **kwargs: None) 92 | self.assertEqual(list(output), [b"test", b"output"]) 93 | 94 | if __name__ == '__main__': 95 | unittest.main() 96 | -------------------------------------------------------------------------------- /tests/test_settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TROUGH_SETTINGS'] = os.path.join(os.path.dirname(__file__), "test.conf") 3 | 4 | from trough.settings import settings 5 | import unittest 6 | from unittest import mock 7 | 8 | class TestSettings(unittest.TestCase): 9 | def test_read_settings(self): 10 | self.assertEqual(settings['TEST_SETTING'], 'test__setting__value') 11 | 12 | if __name__ == '__main__': 13 | unittest.main() 14 | -------------------------------------------------------------------------------- /tests/test_sync.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TROUGH_SETTINGS'] = os.path.join(os.path.dirname(__file__), "test.conf") 3 | 4 | import unittest 5 | from unittest import mock 6 | from trough import sync 7 | from trough.settings import settings 8 | import time 9 | import doublethink 10 | 
import rethinkdb as r 11 | import random 12 | import string 13 | import tempfile 14 | import logging 15 | from hdfs3 import HDFileSystem 16 | import pytest 17 | 18 | random_db = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) 19 | 20 | class TestSegment(unittest.TestCase): 21 | def setUp(self): 22 | self.rethinker = doublethink.Rethinker(db=random_db, servers=settings['RETHINKDB_HOSTS']) 23 | self.services = doublethink.ServiceRegistry(self.rethinker) 24 | self.registry = sync.HostRegistry(rethinker=self.rethinker, services=self.services) 25 | sync.init(self.rethinker) 26 | self.rethinker.table("services").delete().run() 27 | self.rethinker.table("lock").delete().run() 28 | self.rethinker.table("assignment").delete().run() 29 | def test_host_key(self): 30 | segment = sync.Segment('test-segment', 31 | services=self.services, 32 | rethinker=self.rethinker, 33 | registry=self.registry, 34 | size=100) 35 | key = segment.host_key('test-node') 36 | self.assertEqual(key, 'test-node:test-segment') 37 | def test_all_copies(self): 38 | registry = sync.HostRegistry(rethinker=self.rethinker, services=self.services) 39 | segment = sync.Segment('test-segment', 40 | services=self.services, 41 | rethinker=self.rethinker, 42 | registry=self.registry, 43 | size=100) 44 | registry.assign(hostname='test-pool', segment=segment, remote_path="/fake/path") 45 | registry.commit_assignments() 46 | output = segment.all_copies() 47 | output = [item for item in output] 48 | self.assertEqual(output[0]['id'], 'test-pool:test-segment') 49 | def test_readable_copies(self): 50 | registry = sync.HostRegistry(rethinker=self.rethinker, services=self.services) 51 | segment = sync.Segment('test-segment', 52 | services=self.services, 53 | rethinker=self.rethinker, 54 | registry=self.registry, 55 | size=100) 56 | registry.heartbeat(pool='trough-read', 57 | node=settings['HOSTNAME'], 58 | ttl=0.4, 59 | segment=segment.id) 60 | output = segment.readable_copies() 61 | output 
= list(output) 62 | self.assertEqual(output[0]['node'], settings['HOSTNAME']) 63 | def test_is_assigned_to_host(self): 64 | segment = sync.Segment('test-segment', 65 | services=self.services, 66 | rethinker=self.rethinker, 67 | registry=self.registry, 68 | size=100) 69 | registry = sync.HostRegistry(rethinker=self.rethinker, services=self.services) 70 | registry.assign(hostname='assigned', segment=segment, remote_path="/fake/path") 71 | registry.commit_assignments() 72 | output = segment.is_assigned_to_host('not-assigned') 73 | self.assertFalse(output) 74 | output = segment.is_assigned_to_host('assigned') 75 | self.assertTrue(output) 76 | def test_minimum_assignments(self): 77 | segment = sync.Segment('123456', 78 | services=self.services, 79 | rethinker=self.rethinker, 80 | registry=self.registry, 81 | size=100) 82 | output = segment.minimum_assignments() 83 | self.assertEqual(output, 1) 84 | segment = sync.Segment('228188', 85 | services=self.services, 86 | rethinker=self.rethinker, 87 | registry=self.registry, 88 | size=100) 89 | output = segment.minimum_assignments() 90 | self.assertEqual(output, 2) 91 | def test_new_write_lock(self): 92 | lock = sync.Lock.load(self.rethinker, 'write:lock:123456') 93 | if lock: 94 | lock.release() 95 | segment = sync.Segment('123456', 96 | services=self.services, 97 | rethinker=self.rethinker, 98 | registry=self.registry, 99 | size=100) 100 | lock = segment.new_write_lock() 101 | with self.assertRaises(Exception): 102 | segment.new_write_lock() 103 | lock.release() 104 | def test_retrieve_write_lock(self): 105 | lock = sync.Lock.load(self.rethinker, 'write:lock:123456') 106 | if lock: 107 | lock.release() 108 | segment = sync.Segment('123456', 109 | services=self.services, 110 | rethinker=self.rethinker, 111 | registry=self.registry, 112 | size=100) 113 | output = segment.new_write_lock() 114 | lock = segment.retrieve_write_lock() 115 | self.assertEqual(lock["node"], settings['HOSTNAME']) 116 | self.assertIn("acquired_on", 
lock) 117 | lock.release() 118 | def test_local_path(self): 119 | segment = sync.Segment('123456', 120 | services=self.services, 121 | rethinker=self.rethinker, 122 | registry=self.registry, 123 | size=100) 124 | output = segment.local_path() 125 | self.assertEqual(output, os.path.join(settings['LOCAL_DATA'], '123456.sqlite')) 126 | def test_local_segment_exists(self): 127 | segment = sync.Segment('123456', 128 | services=self.services, 129 | rethinker=self.rethinker, 130 | registry=self.registry, 131 | size=100) 132 | output = segment.local_segment_exists() 133 | self.assertEqual(output, False) 134 | def test_provision_local_segment(self): 135 | segment = sync.Segment('123456-test-database', 136 | services=self.services, 137 | rethinker=self.rethinker, 138 | registry=self.registry, 139 | size=100) 140 | if segment.local_segment_exists(): 141 | os.remove(segment.local_path()) 142 | output = segment.provision_local_segment('') 143 | os.remove(segment.local_path()) 144 | 145 | 146 | class TestHostRegistry(unittest.TestCase): 147 | def setUp(self): 148 | self.rethinker = doublethink.Rethinker(db=random_db, servers=settings['RETHINKDB_HOSTS']) 149 | self.services = doublethink.ServiceRegistry(self.rethinker) 150 | sync.init(self.rethinker) 151 | self.rethinker.table("services").delete().run() 152 | self.rethinker.table("lock").delete().run() 153 | self.rethinker.table("assignment").delete().run() 154 | def test_get_hosts(self): 155 | hostname = 'test.example.com' 156 | registry = sync.HostRegistry(rethinker=self.rethinker, services=self.services) 157 | registry.heartbeat(pool='trough-nodes', service_id='trough:nodes:%s' % hostname, node=hostname, ttl=0.6) 158 | output = registry.get_hosts() 159 | self.assertEqual(output[0]['node'], "test.example.com") 160 | def test_heartbeat(self): 161 | '''This function unusually produces indeterminate output.''' 162 | hostname = 'test.example.com' 163 | registry = sync.HostRegistry(rethinker=self.rethinker, services=self.services) 
164 | registry.heartbeat(pool='trough-nodes', 165 | service_id='trough:nodes:%s' % hostname, 166 | node=hostname, 167 | ttl=0.3, 168 | available_bytes=1024*1024) 169 | hosts = registry.get_hosts() 170 | self.assertEqual(hosts[0]["node"], hostname) 171 | time.sleep(0.4) 172 | hosts = registry.get_hosts() 173 | self.assertEqual(hosts, []) 174 | def test_assign(self): 175 | registry = sync.HostRegistry(rethinker=self.rethinker, services=self.services) 176 | segment = sync.Segment('123456', 177 | services=self.services, 178 | rethinker=self.rethinker, 179 | registry=registry, 180 | size=1024) 181 | registry.assign('localhost', segment, '/fake/path') 182 | self.assertEqual(registry.assignment_queue._queue[0]['id'], 'localhost:123456') 183 | return (segment, registry) 184 | def test_commit_assignments(self): 185 | segment, registry = self.test_assign() 186 | registry.commit_assignments() 187 | output = [seg for seg in segment.all_copies()] 188 | self.assertEqual(output[0]['id'], 'localhost:123456') 189 | def test_unassign(self): 190 | segment, registry = self.test_assign() 191 | assignment = registry.assignment_queue._queue[0] 192 | registry.commit_assignments() 193 | registry.unassign(assignment) 194 | self.assertEqual(registry.unassignment_queue._queue[0]['id'], 'localhost:123456') 195 | return (segment, registry) 196 | def test_commit_unassignments(self): 197 | segment, registry = self.test_unassign() 198 | registry.commit_unassignments() 199 | output = [seg for seg in segment.all_copies()] 200 | self.assertEqual(output, []) 201 | def test_segments_for_host(self): 202 | registry = sync.HostRegistry(rethinker=self.rethinker, services=self.services) 203 | segment = sync.Segment('123456', 204 | services=self.services, 205 | rethinker=self.rethinker, 206 | registry=registry, 207 | size=1024) 208 | asmt = registry.assign('localhost', segment, '/fake/path') 209 | registry.commit_assignments() 210 | output = registry.segments_for_host('localhost') 211 | 
self.assertEqual(output[0].id, '123456') 212 | registry.unassign(asmt) 213 | registry.commit_unassignments() 214 | output = registry.segments_for_host('localhost') 215 | self.assertEqual(output, []) 216 | 217 | class TestMasterSyncController(unittest.TestCase): 218 | def setUp(self): 219 | self.rethinker = doublethink.Rethinker(db=random_db, servers=settings['RETHINKDB_HOSTS']) 220 | self.services = doublethink.ServiceRegistry(self.rethinker) 221 | self.registry = sync.HostRegistry(rethinker=self.rethinker, services=self.services) 222 | sync.init(self.rethinker) 223 | self.snakebite_client = mock.Mock() 224 | self.rethinker.table("services").delete().run() 225 | self.rethinker.table("assignment").delete().run() 226 | def get_foreign_controller(self): 227 | controller = sync.MasterSyncController(rethinker=self.rethinker, 228 | services=self.services, 229 | registry=self.registry) 230 | controller.hostname = 'read02' 231 | controller.election_cycle = 0.1 232 | controller.sync_loop_timing = 0.01 233 | return controller 234 | def get_local_controller(self): 235 | # TODO: tight timings here point to the fact that 'doublethink.service.unique_service' should 236 | # be altered to return the object that is returned from the delta rather than doing a 237 | # separate query. setting the sync loop timing to the same as the election cycle will 238 | # yield a situation in which the first query may succeed, and not update the row, but 239 | # the second query will not find it when it runs because it's passed its TTL. 
240 | controller = sync.MasterSyncController(rethinker=self.rethinker, 241 | services=self.services, 242 | registry=self.registry) 243 | controller.election_cycle = 0.1 244 | controller.sync_loop_timing = 0.01 245 | return controller 246 | def test_hold_election(self): 247 | foreign_controller = self.get_foreign_controller() 248 | controller = self.get_local_controller() 249 | output = controller.hold_election() 250 | self.assertEqual(output, True) 251 | output = foreign_controller.hold_election() 252 | self.assertEqual(output, False) 253 | time.sleep(0.4) 254 | output = controller.hold_election() 255 | self.assertEqual(output, True) 256 | output = controller.hold_election() 257 | self.assertEqual(output, True) 258 | def test_get_segment_file_list(self): 259 | controller = sync.MasterSyncController( 260 | rethinker=self.rethinker, 261 | services=self.services, 262 | registry=self.registry) 263 | # populate some dirs/files 264 | hdfs = HDFileSystem(host=controller.hdfs_host, port=controller.hdfs_port) 265 | hdfs.rm(controller.hdfs_path, recursive=True) 266 | hdfs.mkdir(controller.hdfs_path) 267 | hdfs.touch(os.path.join(controller.hdfs_path, '0.txt')) 268 | hdfs.touch(os.path.join(controller.hdfs_path, '1.sqlite')) 269 | hdfs.mkdir(os.path.join(controller.hdfs_path, '2.dir')) 270 | hdfs.touch(os.path.join(controller.hdfs_path, '3.txt')) 271 | with hdfs.open(os.path.join(controller.hdfs_path, '4.sqlite'), 'wb', replication=1) as f: 272 | f.write(b'some bytes') 273 | hdfs.touch('/tmp/5.sqlite') 274 | listing = controller.get_segment_file_list() 275 | entry = next(listing) 276 | assert entry['name'] == os.path.join(controller.hdfs_path, '1.sqlite') 277 | assert entry['kind'] == 'file' 278 | assert entry['size'] == 0 279 | entry = next(listing) 280 | assert entry['name'] == os.path.join(controller.hdfs_path, '4.sqlite') 281 | assert entry['kind'] == 'file' 282 | assert entry['size'] == 10 283 | with pytest.raises(StopIteration): 284 | next(listing) 285 | # clean up 
after successful test 286 | hdfs.rm(controller.hdfs_path, recursive=True) 287 | hdfs.mkdir(controller.hdfs_path) 288 | def test_assign_segments(self): 289 | controller = self.get_local_controller() 290 | hdfs = HDFileSystem(host=controller.hdfs_host, port=controller.hdfs_port) 291 | hdfs.rm(controller.hdfs_path, recursive=True) 292 | hdfs.mkdir(controller.hdfs_path) 293 | with hdfs.open(os.path.join(controller.hdfs_path, '1.sqlite'), 'wb', replication=1) as f: 294 | f.write(b'x' * 1024) 295 | hostname = 'test.example.com' 296 | self.registry.heartbeat(pool='trough-nodes', 297 | service_id='trough:nodes:%s' % hostname, 298 | node=hostname, 299 | ttl=0.3, 300 | available_bytes=1024*1024) 301 | controller = self.get_local_controller() 302 | controller.assign_segments() 303 | assignments = [asmt for asmt in self.rethinker.table('assignment').filter(r.row['id'] != 'ring-assignments').run()] 304 | self.assertEqual(len(assignments), 1) 305 | self.assertEqual(assignments[0]['bytes'], 1024) 306 | self.assertEqual(assignments[0]['hash_ring'], 0) 307 | # clean up after successful test 308 | hdfs.rm(controller.hdfs_path, recursive=True) 309 | hdfs.mkdir(controller.hdfs_path) 310 | @mock.patch("trough.sync.requests") 311 | def test_provision_writable_segment(self, requests): 312 | u = [] 313 | d = [] 314 | class Response(object): 315 | def __init__(self, code, text): 316 | self.status_code = code 317 | self.text = text or "Test" 318 | def p(url, data=None, json=None): 319 | u.append(url) 320 | d.append(data or json) 321 | host = url.split("/")[2].split(":")[0] 322 | if url == 'http://example4:6112/provision': 323 | return Response(500, "Test") 324 | else: 325 | return Response(200, """{ "url": "http://%s:6222/?segment=testsegment" }""" % host) 326 | requests.post = p 327 | self.rethinker.table('services').insert({ 328 | 'role': "trough-nodes", 329 | 'node': "example3", 330 | 'segment': "testsegment", 331 | 'ttl': 999, 332 | 'last_heartbeat': r.now(), 333 | }).run() 334 | 
self.rethinker.table('services').insert({ 335 | 'id': "trough-read:example2:testsegment", 336 | 'role': "trough-read", 337 | 'node': "example2", 338 | 'segment': "testsegment", 339 | 'ttl': 999, 340 | 'last_heartbeat': r.now(), 341 | }).run() 342 | self.rethinker.table('lock').insert({ 343 | 'id': 'write:lock:testsegment', 344 | 'node':'example', 345 | 'segment': 'testsegment' }).run() 346 | controller = self.get_local_controller() 347 | # check behavior when lock exists 348 | output = controller.provision_writable_segment('testsegment') 349 | self.assertEqual(output['url'], 'http://example:6222/?segment=testsegment') 350 | # check behavior when readable copy exists 351 | self.rethinker.table('lock').get('write:lock:testsegment').delete().run() 352 | output = controller.provision_writable_segment('testsegment') 353 | self.assertEqual(u[1], 'http://example2:6112/provision') 354 | self.assertEqual(d[1]['segment'], 'testsegment') 355 | self.assertEqual(output['url'], 'http://example2:6222/?segment=testsegment') 356 | # check behavior when only pool of nodes exists 357 | self.rethinker.table('services').get( "trough-read:example2:testsegment").delete().run() 358 | output = controller.provision_writable_segment('testsegment') 359 | self.assertEqual(u[2], 'http://example3:6112/provision') 360 | self.assertEqual(d[2]['segment'], 'testsegment') 361 | self.assertEqual(output['url'], 'http://example3:6222/?segment=testsegment') 362 | # check behavior when we get a downstream error 363 | self.rethinker.table('services').delete().run() 364 | self.rethinker.table('services').insert({ 365 | 'role': "trough-nodes", 366 | 'node': "example4", 367 | 'ttl': 999, 368 | 'last_heartbeat': r.now(), 369 | }).run() 370 | with self.assertRaisesRegex(Exception, 'Received a 500 response while'): 371 | output = controller.provision_writable_segment('testsegment') 372 | self.assertEqual(u[3], 'http://example4:6112/provision') 373 | self.assertEqual(d[3]['segment'], 'testsegment') 374 | # check 
behavior when node expires 375 | self.rethinker.table('services').delete().run() 376 | self.rethinker.table('services').insert({ 377 | 'role': "trough-nodes", 378 | 'node': "example6", 379 | 'load': 30, 380 | 'ttl': 1.5, 381 | 'last_heartbeat': r.now(), 382 | }).run() 383 | self.rethinker.table('services').insert({ 384 | 'role': "trough-nodes", 385 | 'node': "example5", 386 | 'load': 0.01, 387 | 'ttl': 0.2, 388 | 'last_heartbeat': r.now(), 389 | }).run() 390 | # example 5 hasn't expired yet 391 | output = controller.provision_writable_segment('testsegment') 392 | self.assertEqual(u[4], 'http://example5:6112/provision') 393 | self.assertEqual(d[4], {'segment': 'testsegment', 'schema': 'default'}) 394 | time.sleep(1) 395 | # example 5 has expired 396 | output = controller.provision_writable_segment('testsegment') 397 | self.assertEqual(u[5], 'http://example6:6112/provision') 398 | self.assertEqual(d[5], {'segment': 'testsegment', 'schema': 'default'}) 399 | time.sleep(1) 400 | # example 5 and 6 have expired 401 | with self.assertRaises(Exception): 402 | output = controller.provision_writable_segment('testsegment') 403 | 404 | def test_sync(self): 405 | pass 406 | 407 | class TestLocalSyncController(unittest.TestCase): 408 | def setUp(self): 409 | self.rethinker = doublethink.Rethinker(db=random_db, servers=settings['RETHINKDB_HOSTS']) 410 | self.services = doublethink.ServiceRegistry(self.rethinker) 411 | self.registry = sync.HostRegistry(rethinker=self.rethinker, services=self.services) 412 | self.snakebite_client = mock.Mock() 413 | self.rethinker.table("services").delete().run() 414 | def make_fresh_controller(self): 415 | return sync.LocalSyncController(rethinker=self.rethinker, 416 | services=self.services, 417 | registry=self.registry) 418 | # v don't log out the error message on error test below. 
419 | @mock.patch("trough.sync.client") 420 | @mock.patch("trough.sync.logging.error") 421 | def test_copy_segment_from_hdfs(self, error, snakebite): 422 | results = [{'error': 'test error'}] 423 | class C: 424 | def __init__(*args, **kwargs): 425 | pass 426 | def copyToLocal(self, paths, dst, *args, **kwargs): 427 | for result in results: 428 | if not result.get('error'): 429 | # create empty dest file 430 | with open(dst, 'wb') as f: pass 431 | yield result 432 | snakebite.Client = C 433 | controller = self.make_fresh_controller() 434 | segment = sync.Segment('test-segment', 435 | services=self.services, 436 | rethinker=self.rethinker, 437 | registry=self.registry, 438 | size=100, 439 | remote_path='/fake/remote/path') 440 | with self.assertRaises(Exception): 441 | output = controller.copy_segment_from_hdfs(segment) 442 | results = [{}] 443 | output = controller.copy_segment_from_hdfs(segment) 444 | self.assertEqual(output, True) 445 | def test_heartbeat(self): 446 | controller = self.make_fresh_controller() 447 | controller.heartbeat() 448 | output = [svc for svc in self.rethinker.table('services').run()] 449 | self.assertEqual(output[0]['node'], 'test01') 450 | self.assertEqual(output[0]['first_heartbeat'], output[0]['last_heartbeat']) 451 | 452 | @mock.patch("trough.sync.client") 453 | def test_sync_discard_uninteresting_segments(self, snakebite): 454 | with tempfile.TemporaryDirectory() as tmp_dir: 455 | controller = self.make_fresh_controller() 456 | controller.local_data = tmp_dir 457 | sync.init(self.rethinker) 458 | assert controller.healthy_service_ids == set() 459 | controller.sync() 460 | assert controller.healthy_service_ids == set() 461 | controller.healthy_service_ids.add('trough-read:test01:1') 462 | controller.healthy_service_ids.add('trough-read:test01:2') 463 | controller.healthy_service_ids.add('trough-write:test01:2') 464 | controller.sync() 465 | assert controller.healthy_service_ids == set() 466 | 467 | # make segment 3 a segment of interest 
468 | with open(os.path.join(tmp_dir, '3.sqlite'), 'wb'): 469 | pass 470 | controller.healthy_service_ids.add('trough-read:test01:1') 471 | controller.healthy_service_ids.add('trough-read:test01:3') 472 | controller.healthy_service_ids.add('trough-write:test01:3') 473 | controller.sync() 474 | assert controller.healthy_service_ids == {'trough-read:test01:3', 'trough-write:test01:3'} 475 | 476 | def test_sync_segment_freshness(self): 477 | sync.init(self.rethinker) 478 | with tempfile.TemporaryDirectory() as tmp_dir: 479 | self.rethinker.table('lock').delete().run() 480 | self.rethinker.table('assignment').delete().run() 481 | self.rethinker.table('services').delete().run() 482 | controller = self.make_fresh_controller() 483 | controller.local_data = tmp_dir 484 | assert controller.healthy_service_ids == set() 485 | # make segment 4 a segment of interest 486 | with open(os.path.join(tmp_dir, '4.sqlite'), 'wb'): 487 | pass 488 | controller.sync() 489 | assert controller.healthy_service_ids == {'trough-read:test01:4'} 490 | 491 | # create a write lock 492 | lock = sync.Lock.acquire(self.rethinker, 'trough-write:test01:4', {'segment':'4'}) 493 | controller.sync() 494 | assert controller.healthy_service_ids == {'trough-read:test01:4', 'trough-write:test01:4'} 495 | locks = list(self.rethinker.table('lock').run()) 496 | 497 | assert len(locks) == 1 498 | assert locks[0]['id'] == 'trough-write:test01:4' 499 | 500 | self.rethinker.table('lock').delete().run() 501 | self.rethinker.table('assignment').delete().run() 502 | 503 | # clean slate 504 | with tempfile.TemporaryDirectory() as tmp_dir: 505 | hdfs = HDFileSystem(host=controller.hdfs_host, port=controller.hdfs_port) 506 | hdfs.rm(controller.hdfs_path, recursive=True) 507 | hdfs.mkdir(controller.hdfs_path) 508 | with hdfs.open(os.path.join(controller.hdfs_path, '5.sqlite'), 'wb', replication=1) as f: 509 | f.write('y' * 1024) 510 | self.rethinker.table('lock').delete().run() 511 | 
self.rethinker.table('assignment').delete().run() 512 | self.rethinker.table('services').delete().run() 513 | controller = self.make_fresh_controller() 514 | controller.local_data = tmp_dir 515 | # create an assignment without a local segment 516 | assignment = sync.Assignment(self.rethinker, d={ 517 | 'hash_ring': 'a', 'node': 'test01', 'segment': '5', 518 | 'assigned_on': r.now(), 'bytes': 0, 519 | 'remote_path': os.path.join(controller.hdfs_path, '5.sqlite')}) 520 | assignment.save() 521 | lock = sync.Lock.acquire(self.rethinker, 'write:lock:5', {'segment':'5'}) 522 | assert len(list(self.rethinker.table('lock').run())) == 1 523 | controller.healthy_service_ids.add('trough-write:test01:5') 524 | controller.healthy_service_ids.add('trough-read:test01:5') 525 | controller.sync() 526 | assert controller.healthy_service_ids == {'trough-read:test01:5'} 527 | assert list(self.rethinker.table('lock').run()) == [] 528 | # clean up 529 | hdfs.rm(controller.hdfs_path, recursive=True) 530 | hdfs.mkdir(controller.hdfs_path) 531 | 532 | # third case: not assigned, local file exists, is older than hdfs 533 | # this corresponds to the situation where we have an out-of-date 534 | # segment on disk that was probably a write segment before it was 535 | # reassigned when it was pushed upstream 536 | with tempfile.TemporaryDirectory() as tmp_dir: 537 | # create a local segment without an assignment 538 | with open(os.path.join(tmp_dir, '6.sqlite'), 'wb'): 539 | pass 540 | time.sleep(2) 541 | # create file in hdfs with newer timestamp 542 | hdfs = HDFileSystem(host=controller.hdfs_host, port=controller.hdfs_port) 543 | hdfs.rm(controller.hdfs_path, recursive=True) 544 | hdfs.mkdir(controller.hdfs_path) 545 | with hdfs.open(os.path.join(controller.hdfs_path, '6.sqlite'), 'wb', replication=1) as f: 546 | f.write('z' * 1024) 547 | self.rethinker.table('lock').delete().run() 548 | self.rethinker.table('assignment').delete().run() 549 | self.rethinker.table('services').delete().run() 550 
| controller = self.make_fresh_controller() 551 | controller.local_data = tmp_dir 552 | controller.healthy_service_ids.add('trough-write:test01:6') 553 | controller.healthy_service_ids.add('trough-read:test01:6') 554 | controller.sync() 555 | assert controller.healthy_service_ids == set() 556 | # clean up 557 | hdfs.rm(controller.hdfs_path, recursive=True) 558 | hdfs.mkdir(controller.hdfs_path) 559 | 560 | @mock.patch("trough.sync.client") 561 | def test_hdfs_resiliency(self, snakebite): 562 | sync.init(self.rethinker) 563 | self.rethinker.table('lock').delete().run() 564 | self.rethinker.table('assignment').delete().run() 565 | self.rethinker.table('services').delete().run() 566 | assignment = sync.Assignment(self.rethinker, d={ 567 | 'hash_ring': 'a', 'node': 'test01', 'segment': '1', 568 | 'assigned_on': r.now(), 'remote_path': '/1.sqlite', 'bytes': 9}) 569 | assignment.save() 570 | class C: 571 | def __init__(*args, **kwargs): 572 | pass 573 | def ls(*args, **kwargs): 574 | yield {'length': 1024 * 1000, 'path': '/1.sqlite', 'modification_time': (time.time() + 1000000) * 1000} 575 | def copyToLocal(*args, **kwargs): 576 | return [{'error':'There was a problem...'}] 577 | snakebite.Client = C 578 | controller = self.make_fresh_controller() 579 | controller.sync() 580 | class C: 581 | def __init__(*args, **kwargs): 582 | pass 583 | def ls(*args, **kwargs): 584 | yield {'length': 1024 * 1000, 'path': '/1.sqlite', 'modification_time': (time.time() + 1000000) * 1000} 585 | def copyToLocal(*args, **kwargs): 586 | def g(): 587 | raise Exception("HDFS IS DOWN") 588 | yield 0 589 | return g() 590 | snakebite.Client = C 591 | controller = self.make_fresh_controller() 592 | controller.sync() 593 | class C: 594 | def __init__(*args, **kwargs): 595 | pass 596 | def ls(*args, **kwargs): 597 | def g(): 598 | raise Exception("HDFS IS DOWN") 599 | yield 0 600 | return g() 601 | def copyToLocal(*args, **kwargs): 602 | def g(): 603 | raise Exception("HDFS IS DOWN") 604 | yield 0 
605 | return g() 606 | snakebite.Client = C 607 | controller = self.make_fresh_controller() 608 | controller.sync() 609 | self.rethinker.table('lock').delete().run() 610 | self.rethinker.table('assignment').delete().run() 611 | self.rethinker.table('services').delete().run() 612 | 613 | def test_periodic_heartbeat(self): 614 | controller = self.make_fresh_controller() 615 | controller.sync_loop_timing = 1 616 | controller.healthy_service_ids = {'trough-read:test01:id0', 'trough-read:test01:id1'} 617 | assert set(self.rethinker.table('services')['id'].run()) == set() 618 | 619 | # first time it inserts individual services 620 | heartbeats_after = doublethink.utcnow() 621 | healthy_service_ids = controller.periodic_heartbeat() 622 | assert set(healthy_service_ids) == {'trough-read:test01:id0', 'trough-read:test01:id1'} 623 | assert set(self.rethinker.table('services')['id'].run()) == {'trough-nodes:test01:None', 'trough-read:test01:id0', 'trough-read:test01:id1'} 624 | for svc in self.rethinker.table('services').run(): 625 | assert svc['last_heartbeat'] > heartbeats_after 626 | 627 | # subsequently updates existing services in one bulk query 628 | heartbeats_after = doublethink.utcnow() 629 | healthy_service_ids = controller.periodic_heartbeat() 630 | assert set(healthy_service_ids) == {'trough-read:test01:id0', 'trough-read:test01:id1'} 631 | assert set(self.rethinker.table('services')['id'].run()) == {'trough-nodes:test01:None', 'trough-read:test01:id0', 'trough-read:test01:id1'} 632 | for svc in self.rethinker.table('services').run(): 633 | assert svc['last_heartbeat'] > heartbeats_after 634 | 635 | def test_provision_writable_segment(self): 636 | test_segment = sync.Segment('test', 637 | services=self.services, 638 | rethinker=self.rethinker, 639 | registry=self.registry, 640 | size=0) 641 | test_path = test_segment.local_path() 642 | if os.path.isfile(test_path): 643 | os.remove(test_path) 644 | called = [] 645 | controller = self.make_fresh_controller() 646 | 
controller.provision_writable_segment('test') 647 | self.assertEqual(os.path.isfile(test_path), True) 648 | os.remove(test_path) 649 | 650 | def test_collect_garbage(self): 651 | # for each segment file on local disk 652 | # - segment assigned to me should not be gc'd 653 | # - segment not assigned to me with healthy service count <= minimum 654 | # should not be gc'd 655 | # - segment not assigned to me with healthy service count == minimum 656 | # and no local healthy service entry should be gc'd 657 | # - segment not assigned to me with healthy service count > minimum 658 | # and has local healthy service entry should be gc'd 659 | with tempfile.TemporaryDirectory() as tmp_dir: 660 | # create segment file 661 | segment_id = 'test_collect_garbage' 662 | filename = '%s.sqlite' % segment_id 663 | path = os.path.join(tmp_dir, filename) 664 | with open(path, 'wb'): 665 | pass 666 | assert os.path.exists(path) 667 | 668 | # create controller 669 | controller = self.make_fresh_controller() 670 | controller.local_data = tmp_dir 671 | 672 | # assign to me 673 | assignment = sync.Assignment(self.rethinker, d={ 674 | 'hash_ring': 'a', 'node': 'test01', 'segment': segment_id, 675 | 'assigned_on': r.now(), 'bytes': 9, 676 | 'remote_path': '/%s.sqlite' % segment_id}) 677 | assignment.save() 678 | 679 | # - segment assigned to me should not be gc'd 680 | controller.collect_garbage() 681 | assert os.path.exists(path) 682 | 683 | # - segment not assigned to me with healthy service count <= minimum 684 | # should not be gc'd 685 | controller.registry.unassign(assignment) 686 | controller.registry.commit_unassignments() 687 | # 0 healthy service ids 688 | controller.collect_garbage() 689 | assert os.path.exists(path) 690 | # 1 healthy service id 691 | controller.registry.heartbeat(pool='trough-read', node='test01', ttl=600, segment=segment_id) 692 | controller.collect_garbage() 693 | assert os.path.exists(path) 694 | 695 | # - segment not assigned to me with healthy service count 
== minimum 696 | # and no local healthy service entry should be gc'd 697 | # delete service entry 698 | self.rethinker.table('services').get('trough-read:test01:%s' % segment_id).delete().run() 699 | controller.registry.heartbeat(pool='trough-read', node='test02', ttl=600, segment=segment_id) 700 | controller.collect_garbage() 701 | assert not os.path.exists(path) 702 | 703 | # recreate file 704 | with open(path, 'wb'): 705 | pass 706 | assert os.path.exists(path) 707 | 708 | # - segment not assigned to me with healthy service count > minimum 709 | # and has local healthy service entry should be gc'd 710 | controller.registry.heartbeat(pool='trough-read', node='test01', ttl=600, segment=segment_id) 711 | controller.registry.heartbeat(pool='trough-read', node='test02', ttl=600, segment=segment_id) 712 | controller.collect_garbage() 713 | assert not os.path.exists(path) 714 | assert not self.rethinker.table('services').get('trough-read:test01:%s' % segment_id).run() 715 | assert self.rethinker.table('services').get('trough-read:test02:%s' % segment_id).run() 716 | 717 | 718 | if __name__ == '__main__': 719 | unittest.main() 720 | -------------------------------------------------------------------------------- /tests/test_write.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TROUGH_SETTINGS'] = os.path.join(os.path.dirname(__file__), "test.conf") 3 | 4 | import unittest 5 | from unittest import mock 6 | from trough import write 7 | import json 8 | import sqlite3 9 | from tempfile import NamedTemporaryFile 10 | 11 | class TestWriteServer(unittest.TestCase): 12 | def setUp(self): 13 | self.server = write.WriteServer() 14 | def test_empty_write(self): 15 | database_file = NamedTemporaryFile() 16 | segment = mock.Mock() 17 | segment.segment_path = lambda: database_file.name 18 | # no inserts! 
19 | output = b"" 20 | with self.assertRaises(Exception): 21 | output = self.server.write(segment, b'') 22 | database_file.close() 23 | self.assertEqual(output, b'') 24 | def test_read_failure(self): 25 | database_file = NamedTemporaryFile() 26 | segment = mock.Mock() 27 | segment.segment_path = lambda: database_file.name 28 | connection = sqlite3.connect(database_file.name) 29 | cursor = connection.cursor() 30 | cursor.execute('CREATE TABLE test (id INTEGER PRIMARY KEY AUTOINCREMENT, test varchar(4));') 31 | cursor.execute('INSERT INTO test (test) VALUES ("test");') 32 | connection.commit() 33 | output = b"" 34 | with self.assertRaises(Exception): 35 | output = self.server.write(segment, b'SELECT * FROM "test";') 36 | database_file.close() 37 | def test_write(self): 38 | database_file = NamedTemporaryFile() 39 | segment = mock.Mock() 40 | segment.local_path = lambda: database_file.name 41 | output = self.server.write(segment, b'CREATE TABLE test (id INTEGER PRIMARY KEY AUTOINCREMENT, test varchar(4));') 42 | output = self.server.write(segment, b'INSERT INTO test (test) VALUES ("test");') 43 | connection = sqlite3.connect(database_file.name) 44 | cursor = connection.cursor() 45 | output = cursor.execute('SELECT * FROM test;') 46 | for row in output: 47 | output = dict((cursor.description[i][0], value) for i, value in enumerate(row)) 48 | database_file.close() 49 | self.assertEqual(output, {'id': 1, 'test': 'test'}) 50 | def test_write_failure_to_read_only_segment(self): 51 | database_file = NamedTemporaryFile() 52 | segment = mock.Mock() 53 | segment.segment_path = lambda: database_file.name 54 | connection = sqlite3.connect(database_file.name) 55 | cursor = connection.cursor() 56 | cursor.execute('CREATE TABLE test (id INTEGER PRIMARY KEY AUTOINCREMENT, test varchar(4));') 57 | # set up an environment for uwsgi mock 58 | env = {} 59 | env['HTTP_HOST'] = "TEST.host" 60 | env['wsgi.input'] = mock.Mock() 61 | env['wsgi.input'].read = lambda: b'INSERT INTO test (test) 
VALUES ("test")' 62 | start = mock.Mock() 63 | output = self.server(env, start) 64 | self.assertEqual(output, [b"500 Server Error: This node (settings['HOSTNAME']='test01') cannot write to segment 'TEST'. There is no write lock set, or the write lock authorizes another node. Write lock: None\n"]) 65 | database_file.close() 66 | 67 | if __name__ == '__main__': 68 | unittest.main() 69 | -------------------------------------------------------------------------------- /tests/wsgi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/trough/0c6243e0ec4731ce5bb61c15aa7993ac57b692fe/tests/wsgi/__init__.py -------------------------------------------------------------------------------- /tests/wsgi/test_segment_manager.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from trough.wsgi.segment_manager import server 3 | import ujson 4 | import trough 5 | from trough.settings import settings 6 | import doublethink 7 | import rethinkdb as r 8 | import requests # :-\ urllib3? 
def test_simple_provision(segment_manager_server):
    # GET is not a valid way to provision a segment
    result = segment_manager_server.get('/')
    assert result.status == '405 METHOD NOT ALLOWED'

    # the first POST provisions the segment, the second finds it already
    # provisioned; both succeed and return the write url as plain text
    for _ in range(2):
        result = segment_manager_server.post('/', data='test_simple_provision_segment')
        assert result.status_code == 200
        assert result.mimetype == 'text/plain'
        assert b''.join(result.response).endswith(b':6222/?segment=test_simple_provision_segment')
😻 52 | assert result_dict['write_url'].endswith(':6222/?segment=test_provision_segment') 53 | 54 | # now it has already been provisioned 55 | result = segment_manager_server.post( 56 | '/provision', content_type='application/json', 57 | data=ujson.dumps({'segment':'test_provision_segment'})) 58 | assert result.status_code == 200 59 | assert result.mimetype == 'application/json' 60 | result_bytes = b''.join(result.response) 61 | result_dict = ujson.loads(result_bytes) 62 | assert result_dict['write_url'].endswith(':6222/?segment=test_provision_segment') 63 | 64 | def test_provision_with_schema(segment_manager_server): 65 | schema = '''CREATE TABLE test (id INTEGER PRIMARY KEY AUTOINCREMENT, test varchar(4)); 66 | INSERT INTO test (test) VALUES ("test");''' 67 | # create a schema by submitting sql 68 | result = segment_manager_server.put( 69 | '/schema/test1/sql', content_type='applicaton/sql', data=schema) 70 | assert result.status_code == 201 71 | 72 | # provision a segment with that schema 73 | result = segment_manager_server.post( 74 | '/provision', content_type='application/json', 75 | data=ujson.dumps({'segment':'test_provision_with_schema_1', 'schema':'test1'})) 76 | assert result.status_code == 200 77 | assert result.mimetype == 'application/json' 78 | result_bytes = b''.join(result.response) 79 | result_dict = ujson.loads(result_bytes) # ujson accepts bytes! 
def test_schemas(segment_manager_server):
    # exercises the /schema REST api: listing, fetching as json and as sql,
    # validation errors, creation and overwrite via both representations

    def body(result):
        return b''.join(result.response)

    def body_json(result):
        return ujson.loads(body(result))

    # initial list of schemas
    result = segment_manager_server.get('/schema')
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    assert set(body_json(result)) == {'default'}

    # existent schema as json
    result = segment_manager_server.get('/schema/default')
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    assert body_json(result) == {'id': 'default', 'sql': ''}

    # existent schema as sql
    result = segment_manager_server.get('/schema/default/sql')
    assert result.status_code == 200
    assert result.mimetype == 'application/sql'
    assert body(result) == b''

    # schema doesn't exist yet, in either representation
    assert segment_manager_server.get('/schema/schema1').status_code == 404
    assert segment_manager_server.get('/schema/schema1/sql').status_code == 404

    # bad request: POST not accepted (must be PUT)
    assert segment_manager_server.post('/schema/schema1', data='{}').status_code == 405
    assert segment_manager_server.post('/schema/schema1/sql', data='').status_code == 405

    # bad request: invalid json
    result = segment_manager_server.put(
        '/schema/schema1', data=']]}what the not valid json')
    assert result.status_code == 400
    assert body(result) == b'input could not be parsed as json'

    # bad request: id in json does not match id in url
    result = segment_manager_server.put(
        '/schema/schema1', data=ujson.dumps({'id': 'schema2', 'sql': ''}))
    assert result.status_code == 400
    assert body(result) == b"id in json 'schema2' does not match id in url 'schema1'"

    # bad request: missing sql
    result = segment_manager_server.put(
        '/schema/schema1', data=ujson.dumps({'id': 'schema1'}))
    assert result.status_code == 400
    assert body(result) == b"input json has keys {'id'} (should be {'id', 'sql'})"

    # bad request: missing id
    result = segment_manager_server.put(
        '/schema/schema1', data=ujson.dumps({'sql': ''}))
    assert result.status_code == 400
    assert body(result) == b"input json has keys {'sql'} (should be {'id', 'sql'})"

    # bad request: sql that does not validate
    result = segment_manager_server.put(
        '/schema/schema1', data=ujson.dumps({'id': 'schema1', 'sql': 'create create table table blah blooofdjaio'}))
    assert result.status_code == 400
    assert body(result) == b'schema sql failed validation: near "create": syntax error'

    # create a new schema by submitting sql
    result = segment_manager_server.put(
        '/schema/schema1/sql', content_type='applicaton/sql',
        data='create table foo (bar varchar(100));')
    assert result.status_code == 201

    # fetch the new schema back as json
    result = segment_manager_server.get('/schema/schema1')
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    assert body_json(result) == {'id': 'schema1', 'sql': 'create table foo (bar varchar(100));'}

    # and as sql
    result = segment_manager_server.get('/schema/schema1/sql')
    assert result.status_code == 200
    assert result.mimetype == 'application/sql'
    assert body(result) == b'create table foo (bar varchar(100));'

    # create a new schema by submitting json
    result = segment_manager_server.put(
        '/schema/schema2', content_type='applicaton/sql',
        data=ujson.dumps({'id': 'schema2', 'sql': 'create table schema2_table (foo varchar(100));'}))
    assert result.status_code == 201

    # fetch it back as json
    result = segment_manager_server.get('/schema/schema2')
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    assert body_json(result) == {'id': 'schema2', 'sql': 'create table schema2_table (foo varchar(100));'}

    # and as sql
    result = segment_manager_server.get('/schema/schema2/sql')
    assert result.status_code == 200
    assert result.mimetype == 'application/sql'
    assert body(result) == b'create table schema2_table (foo varchar(100));'

    # updated list of schemas
    result = segment_manager_server.get('/schema')
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    assert set(body_json(result)) == {'default', 'schema1', 'schema2'}

    # overwrite schema1 with the json api
    result = segment_manager_server.put(
        '/schema/schema1', content_type='applicaton/json',
        data=ujson.dumps({'id': 'schema1', 'sql': 'create table blah (toot varchar(100));'}))
    assert result.status_code == 204

    # fetch the modified schema as sql
    result = segment_manager_server.get('/schema/schema1/sql')
    assert result.status_code == 200
    assert result.mimetype == 'application/sql'
    assert body(result) == b'create table blah (toot varchar(100));'

    # overwrite schema1 with the sql api
    result = segment_manager_server.put(
        '/schema/schema1/sql', content_type='applicaton/sql',
        data='create table haha (hehehe varchar(100));')
    assert result.status_code == 204

    # fetch the modified schema as json
    result = segment_manager_server.get('/schema/schema1')
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    assert body_json(result) == {'id': 'schema1', 'sql': 'create table haha (hehehe varchar(100));'}

    # the list of schemas is unchanged by the overwrites
    result = segment_manager_server.get('/schema')
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    assert set(body_json(result)) == {'default', 'schema1', 'schema2'}

    # XXX DELETE?
def test_promotion(segment_manager_server):
    hdfs = hdfs3.HDFileSystem(settings['HDFS_HOST'], settings['HDFS_PORT'])

    # start from an empty hdfs directory
    hdfs.rm(settings['HDFS_PATH'])
    hdfs.mkdir(settings['HDFS_PATH'])

    # GET is not allowed on /promote
    assert segment_manager_server.get('/promote').status == '405 METHOD NOT ALLOWED'

    # provision a test segment for write
    result = segment_manager_server.post(
        '/provision', content_type='application/json',
        data=ujson.dumps({'segment': 'test_promotion'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    provisioned = ujson.loads(b''.join(result.response))
    assert provisioned['write_url'].endswith(':6222/?segment=test_promotion')
    write_url = provisioned['write_url']

    # write something into the db
    sql = ('create table foo (bar varchar(100));\n'
           'insert into foo (bar) values ("testing segment promotion");\n')
    assert requests.post(write_url, sql).status_code == 200

    # nothing should be in hdfs yet...
    expected_remote_path = os.path.join(
        settings['HDFS_PATH'], 'test_promot', 'test_promotion.sqlite')
    with pytest.raises(FileNotFoundError):
        hdfs.ls(expected_remote_path, detail=True)

    # promote the segment to hdfs
    before = time.time()
    time.sleep(1.5)
    result = segment_manager_server.post(
        '/promote', content_type='application/json',
        data=ujson.dumps({'segment': 'test_promotion'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    assert ujson.loads(b''.join(result.response)) == {
            'remote_path': expected_remote_path}

    # make sure it doesn't think the segment is under promotion
    rethinker = doublethink.Rethinker(
        servers=settings['RETHINKDB_HOSTS'], db='trough_configuration')
    lock = rethinker.table('lock').get('write:lock:test_promotion').run()
    assert not lock.get('under_promotion')

    # the file should now be in hdfs, modified after `before`
    listing = hdfs.ls(expected_remote_path, detail=True)
    assert len(listing) == 1
    assert listing[0]['last_mod'] > before

    # grab the file from hdfs and check the content
    # n.b. copy created by sqlitebck may have different size, sha1 etc from orig
    size = None
    with tempfile.TemporaryDirectory() as tmpdir:
        local_copy = os.path.join(tmpdir, 'test_promotion.sqlite')
        hdfs.get(expected_remote_path, local_copy)
        conn = sqlite3.connect(local_copy)
        cur = conn.execute('select * from foo')
        assert cur.fetchall() == [('testing segment promotion',)]
        conn.close()
        size = os.path.getsize(local_copy)

    # test promotion when there is an assignment in rethinkdb
    rethinker.table('assignment').insert({
        'assigned_on': doublethink.utcnow(),
        'bytes': size,
        'hash_ring': 0,
        'id': 'localhost:test_promotion',
        'node': 'localhost',
        'remote_path': expected_remote_path,
        'segment': 'test_promotion'}).run()

    # promote again
    before = time.time()
    time.sleep(1.5)
    result = segment_manager_server.post(
        '/promote', content_type='application/json',
        data=ujson.dumps({'segment': 'test_promotion'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    assert ujson.loads(b''.join(result.response)) == {
            'remote_path': expected_remote_path}

    # again, should not be marked under promotion afterwards
    rethinker = doublethink.Rethinker(
        servers=settings['RETHINKDB_HOSTS'], db='trough_configuration')
    lock = rethinker.table('lock').get('write:lock:test_promotion').run()
    assert not lock.get('under_promotion')

    # and again, exactly one freshly modified file in hdfs
    listing = hdfs.ls(expected_remote_path, detail=True)
    assert len(listing) == 1
    assert listing[0]['last_mod'] > before

    # pretend the segment is under promotion; promoting again should fail
    rethinker.table('lock')\
            .get('write:lock:test_promotion')\
            .update({'under_promotion': True}).run()
    assert rethinker.table('lock')\
            .get('write:lock:test_promotion').run()\
            .get('under_promotion')
    with pytest.raises(Exception):
        segment_manager_server.post(
            '/promote', content_type='application/json',
            data=ujson.dumps({'segment': 'test_promotion'}))
def test_delete_segment(segment_manager_server):
    hdfs = hdfs3.HDFileSystem(settings['HDFS_HOST'], settings['HDFS_PORT'])
    rethinker = doublethink.Rethinker(
        servers=settings['RETHINKDB_HOSTS'], db='trough_configuration')
    hostname = socket.gethostname()

    # initially, segment doesn't exist
    assert segment_manager_server.delete('/segment/test_delete_segment').status_code == 404

    # provision segment
    result = segment_manager_server.post(
        '/provision', content_type='application/json',
        data=ujson.dumps({'segment': 'test_delete_segment'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    provisioned = ujson.loads(b''.join(result.response))
    assert provisioned['write_url'].endswith(':6222/?segment=test_delete_segment')
    write_url = provisioned['write_url']

    # write something into the db
    sql = ('create table foo (bar varchar(100));\n'
           'insert into foo (bar) values ("testing segment deletion");\n')
    assert requests.post(write_url, sql).status_code == 200

    # check that local file exists
    local_path = os.path.join(
        settings['LOCAL_DATA'], 'test_delete_segment.sqlite')
    assert os.path.exists(local_path)

    # attempted delete while under write returns 400
    assert segment_manager_server.delete('/segment/test_delete_segment').status_code == 400

    # shouldn't be anything in hdfs yet
    expected_remote_path = os.path.join(
        settings['HDFS_PATH'], 'test_delete_segm',
        'test_delete_segment.sqlite')
    with pytest.raises(FileNotFoundError):
        hdfs.ls(expected_remote_path, detail=True)

    # promote segment to hdfs
    result = segment_manager_server.post(
        '/promote', content_type='application/json',
        data=ujson.dumps({'segment': 'test_delete_segment'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    assert ujson.loads(b''.join(result.response)) == {
            'remote_path': expected_remote_path}

    # now it should be in hdfs
    assert len(hdfs.ls(expected_remote_path, detail=True)) == 1

    # add an assignment (so we can check it is deleted successfully)
    rethinker.table('assignment').insert({
        'assigned_on': doublethink.utcnow(),
        'bytes': os.path.getsize(local_path),
        'hash_ring': 0,
        'id': '%s:test_delete_segment' % hostname,
        'node': hostname,
        'remote_path': expected_remote_path,
        'segment': 'test_delete_segment'}).run()

    # check that service entries and the assignment exist
    assert rethinker.table('services')\
            .get('trough-read:%s:test_delete_segment' % hostname).run()
    assert rethinker.table('services')\
            .get('trough-write:%s:test_delete_segment' % hostname).run()
    assert rethinker.table('assignment')\
            .get('%s:test_delete_segment' % hostname).run()

    # still under write: delete still returns 400
    assert segment_manager_server.delete('/segment/test_delete_segment').status_code == 400

    # delete the write lock
    assert rethinker.table('lock')\
            .get('write:lock:test_delete_segment').delete().run() == {
                'deleted': 1, 'errors': 0, 'inserted': 0,
                'replaced': 0, 'skipped': 0, 'unchanged': 0, }

    # delete the segment
    assert segment_manager_server.delete('/segment/test_delete_segment').status_code == 204

    # check that service entries and assignment are gone
    assert not rethinker.table('services')\
            .get('trough-read:%s:test_delete_segment' % hostname).run()
    assert not rethinker.table('services')\
            .get('trough-write:%s:test_delete_segment' % hostname).run()
    assert not rethinker.table('assignment')\
            .get('%s:test_delete_segment' % hostname).run()

    # check that local file is gone
    assert not os.path.exists(local_path)

    # check that file is gone from hdfs
    with pytest.raises(FileNotFoundError):
        hdfs.ls(expected_remote_path, detail=True)
# monkey-patch log level TRACE (half of DEBUG, i.e. below it)
import logging

TRACE = logging.DEBUG // 2

def _logging_trace(msg, *args, **kwargs):
    # module-level convenience function, mirrors logging.debug()/info()/...
    logging.root.trace(msg, *args, **kwargs)

def _logger_trace(self, msg, *args, **kwargs):
    if self.isEnabledFor(TRACE):
        self._log(TRACE, msg, args, **kwargs)

logging.trace = _logging_trace
logging.Logger.trace = _logger_trace
logging.addLevelName(TRACE, 'TRACE')

# monkey-patch log level NOTICE (halfway between INFO and WARNING)
# (fix: this comment previously said TRACE, copy-pasted from the block above)
NOTICE = (logging.INFO + logging.WARNING) // 2

def _logging_notice(msg, *args, **kwargs):
    logging.root.notice(msg, *args, **kwargs)

def _logger_notice(self, msg, *args, **kwargs):
    if self.isEnabledFor(NOTICE):
        self._log(NOTICE, msg, args, **kwargs)

logging.notice = _logging_notice
logging.Logger.notice = _logger_notice
logging.addLevelName(NOTICE, 'NOTICE')
class TroughException(Exception):
    '''Base exception for trough client errors.

    When set, `payload` carries the request payload that triggered the
    failure and `returned_message` carries the response body.
    '''
    def __init__(self, message, payload=None, returned_message=None):
        super().__init__(message)
        self.payload = payload
        self.returned_message = returned_message

class TroughSegmentNotFound(TroughException):
    '''Raised when a trough segment does not exist (or is not provisioned).'''
    pass
self._dirty_segments_lock: 82 | dirty_segments = list(self._dirty_segments) 83 | self._dirty_segments.clear() 84 | self.logger.info( 85 | 'promoting %s trough segments', len(dirty_segments)) 86 | for segment_id in dirty_segments: 87 | try: 88 | self.promote(segment_id) 89 | except: 90 | self.logger.error( 91 | 'problem promoting segment %s', segment_id, 92 | exc_info=True) 93 | except: 94 | self.logger.error( 95 | 'caught exception doing segment promotion', 96 | exc_info=True) 97 | 98 | def promote(self, segment_id): 99 | url = os.path.join(self.segment_manager_url(), 'promote') 100 | payload_dict = {'segment': segment_id} 101 | self.logger.debug('posting %s to %s', json.dumps(payload_dict), url) 102 | response = requests.post(url, json=payload_dict, timeout=21600) 103 | if response.status_code != 200: 104 | raise TroughException( 105 | 'unexpected response %r %r: %r from POST %r with ' 106 | 'payload %r' % ( 107 | response.status_code, response.reason, response.text, 108 | url, json.dumps(payload_dict))) 109 | 110 | @staticmethod 111 | def sql_value(x): 112 | if x is None: 113 | return 'null' 114 | elif isinstance(x, datetime.datetime): 115 | return 'datetime(%r)' % x.isoformat() 116 | elif isinstance(x, bool): 117 | return int(x) 118 | elif isinstance(x, str) or isinstance(x, bytes): 119 | # the only character that needs escaped in sqlite string literals 120 | # is single-quote, which is escaped as two single-quotes 121 | if isinstance(x, bytes): 122 | s = x.decode('utf-8') 123 | else: 124 | s = x 125 | return "'" + s.replace("'", "''") + "'" 126 | elif isinstance(x, (int, float)): 127 | return x 128 | else: 129 | raise TroughException( 130 | "don't know how to make an sql value from %r (%r)" % ( 131 | x, type(x))) 132 | 133 | def segment_manager_url(self): 134 | master_node = self.svcreg.unique_service('trough-sync-master') 135 | if not master_node: 136 | raise TroughException( 137 | 'no healthy trough-sync-master in service registry') 138 | return 
master_node['url'] 139 | 140 | def write_url_nocache(self, segment_id, schema_id='default'): 141 | url = os.path.join(self.segment_manager_url(), 'provision') 142 | payload_dict = {'segment': segment_id, 'schema': schema_id} 143 | self.logger.debug('posting %s to %s', json.dumps(payload_dict), url) 144 | response = requests.post(url, json=payload_dict, timeout=600) 145 | if response.status_code != 200: 146 | raise TroughException( 147 | 'unexpected response %r %r: %r from POST %r with ' 148 | 'payload %r' % ( 149 | response.status_code, response.reason, response.text, 150 | url, json.dumps(payload_dict))) 151 | result_dict = response.json() 152 | # assert result_dict['schema'] == schema_id # previously provisioned? 153 | return result_dict['write_url'] 154 | 155 | def read_url_nocache(self, segment_id): 156 | reql = self.rr.table('services', read_mode='outdated').get_all( 157 | segment_id, index='segment').filter( 158 | {'role':'trough-read'}).filter( 159 | lambda svc: r.now().sub( 160 | svc['last_heartbeat']).lt(svc['ttl']) 161 | ).order_by('load') 162 | self.logger.debug('querying rethinkdb: %r', reql) 163 | results = reql.run() 164 | try: 165 | return results[0]['url'] 166 | except: 167 | raise TroughSegmentNotFound( 168 | 'no read url for segment %s; usually this means the ' 169 | "segment hasn't been provisioned yet" % segment_id) 170 | 171 | def read_urls_for_regex(self, regex): 172 | ''' 173 | Looks up read urls for segments matching `regex`. 
174 | Populates `self._read_url_cache` and returns dictionary 175 | `{segment: url}` 176 | ''' 177 | d = {} 178 | reql = self.rr.table('services', read_mode='outdated')\ 179 | .filter({'role': 'trough-read'})\ 180 | .filter(r.row.has_fields('segment'))\ 181 | .filter(lambda svc: svc['segment'].coerce_to('string').match(regex))\ 182 | .filter(lambda svc: r.now().sub(svc['last_heartbeat']).lt(svc['ttl'])) 183 | self.logger.debug('querying rethinkdb: %r', reql) 184 | results = reql.run() 185 | for result in results: 186 | d[result['segment']] = result['url'] 187 | self._read_url_cache[result['segment']] = result['url'] 188 | return d 189 | 190 | def schemas(self): 191 | reql = self.rr.table('schema', read_mode='outdated') 192 | for result in reql.run(): 193 | yield collections.OrderedDict([('name', result['id'])]) 194 | 195 | def schema(self, id): 196 | reql = self.rr.table('schema', read_mode='outdated').get(id) 197 | result = reql.run() 198 | if result: 199 | return [collections.OrderedDict([(id, result['sql'])])] 200 | else: 201 | return None 202 | 203 | def readable_segments(self, regex=None): 204 | reql = self.rr.table('services', read_mode='outdated')\ 205 | .filter({'role':'trough-read'})\ 206 | .filter(lambda svc: r.now().sub(svc['last_heartbeat'])\ 207 | .lt(svc['ttl'])) 208 | if regex: 209 | reql = reql.filter( 210 | lambda svc: svc['segment'].coerce_to('string').match(regex)) 211 | self.logger.debug('querying rethinkdb: %r', reql) 212 | results = reql.run() 213 | for result in reql.run(): 214 | yield collections.OrderedDict([ 215 | ('segment', result['segment']), 216 | ('url', result['url']), 217 | ('first_heartbeat', result['first_heartbeat']), 218 | ('last_heartbeat', result['last_heartbeat'])]) 219 | 220 | def write_url(self, segment_id, schema_id='default'): 221 | if not segment_id in self._write_url_cache: 222 | self._write_url_cache[segment_id] = self.write_url_nocache( 223 | segment_id, schema_id) 224 | self.logger.info( 225 | 'segment %r write url 
is %r', segment_id, 226 | self._write_url_cache[segment_id]) 227 | return self._write_url_cache[segment_id] 228 | 229 | def read_url(self, segment_id): 230 | if not self._read_url_cache.get(segment_id): 231 | self._read_url_cache[segment_id] = self.read_url_nocache(segment_id) 232 | self.logger.info( 233 | 'segment %r read url is %r', segment_id, 234 | self._read_url_cache[segment_id]) 235 | return self._read_url_cache[segment_id] 236 | 237 | def write(self, segment_id, sql_tmpl, values=(), schema_id='default'): 238 | write_url = self.write_url(segment_id, schema_id) 239 | sql = sql_tmpl % tuple(self.sql_value(v) for v in values) 240 | sql_bytes = sql.encode('utf-8') 241 | 242 | try: 243 | response = requests.post( 244 | write_url, sql_bytes, timeout=600, 245 | headers={'content-type': 'application/sql;charset=utf-8'}) 246 | if response.status_code != 200: 247 | raise TroughException( 248 | 'unexpected response %r %r: %r from POST %r with ' 249 | 'payload %r' % ( 250 | response.status_code, response.reason, 251 | response.text, write_url, sql_bytes), sql_bytes, response.text) 252 | if segment_id not in self._dirty_segments: 253 | with self._dirty_segments_lock: 254 | self._dirty_segments.add(segment_id) 255 | except Exception as e: 256 | self._write_url_cache.pop(segment_id, None) 257 | raise e 258 | 259 | def read(self, segment_id, sql_tmpl, values=()): 260 | read_url = self.read_url(segment_id) 261 | sql = sql_tmpl % tuple(self.sql_value(v) for v in values) 262 | sql_bytes = sql.encode('utf-8') 263 | try: 264 | response = requests.post( 265 | read_url, sql_bytes, timeout=600, 266 | headers={'content-type': 'application/sql;charset=utf-8'}) 267 | if response.status_code != 200: 268 | raise TroughException( 269 | 'unexpected response %r %r %r from %r to query %r' % ( 270 | response.status_code, response.reason, response.text, 271 | read_url, sql_bytes), sql_bytes, response.text) 272 | self.logger.trace( 273 | 'got %r from posting query %r to %r', response.text, 
sql, 274 | read_url) 275 | results = json.loads(response.text) 276 | return results 277 | except Exception as e: 278 | self._read_url_cache.pop(segment_id, None) 279 | raise e 280 | 281 | async def async_read(self, segment_id, sql_tmpl, values=()): 282 | read_url = self.read_url(segment_id) 283 | sql = sql_tmpl % tuple(self.sql_value(v) for v in values) 284 | sql_bytes = sql.encode('utf-8') 285 | 286 | async with ClientSession() as session: 287 | async with session.post( 288 | read_url, data=sql_bytes, headers={ 289 | 'content-type': 'application/sql;charset=utf-8'}) as res: 290 | if res.status != 200: 291 | self._read_url_cache.pop(segment_id, None) 292 | text = await res.text('utf-8') 293 | raise TroughException( 294 | 'unexpected response %r %r %r from %r to ' 295 | 'query %r' % ( 296 | res.status, res.reason, text, read_url, 297 | sql), sql_bytes, text) 298 | results = list(await res.json()) 299 | return results 300 | 301 | def schema_exists(self, schema_id): 302 | url = os.path.join(self.segment_manager_url(), 'schema', schema_id) 303 | response = requests.get(url, timeout=60) 304 | if response.status_code == 200: 305 | return True 306 | elif response.status_code == 404: 307 | return False 308 | else: 309 | try: 310 | response.raise_for_status() 311 | except Exception as e: 312 | raise TroughException(e) 313 | 314 | def register_schema(self, schema_id, sql): 315 | url = os.path.join( 316 | self.segment_manager_url(), 'schema', schema_id, 'sql') 317 | response = requests.put(url, sql, timeout=600) 318 | if response.status_code not in (201, 204): 319 | raise TroughException( 320 | 'unexpected response %r %r %r from %r to query %r' % ( 321 | response.status_code, response.reason, response.text, 322 | url, sql)) 323 | 324 | def delete_segment(self, segment_id): 325 | url = os.path.join(self.segment_manager_url(), 'segment', segment_id) 326 | self.logger.debug('DELETE %s', url) 327 | response = requests.delete(url, timeout=1200) 328 | if response.status_code == 
def healthy_services_query(rethinker, role):
    '''Build a reql query for services with `role` whose last heartbeat is
    within their ttl.'''
    alive = lambda svc: r.now().sub(svc['last_heartbeat']) < svc['ttl']
    return rethinker.table('services').filter({'role': role}).filter(alive)

class TroughCursor():
    def __init__(self, database=None, rethinkdb=None, proxy=None,
                 proxy_port=9000, proxy_type='SOCKS5'):
        '''
        DB-API-style cursor that talks to trough over HTTP.

        :param database: trough segment id
        :param rethinkdb: rethinkdb server(s) used for service discovery
        :param proxy: optional SOCKS proxy host
        :param proxy_type: 'SOCKS5' (default) or anything else for SOCKS4
        '''
        self.database = database
        self.rethinkdb = rethinkdb
        self.proxy = proxy
        self.proxy_port = proxy_port
        if proxy_type == 'SOCKS5':
            self.proxy_type = socks.PROXY_TYPE_SOCKS5
        else:
            self.proxy_type = socks.PROXY_TYPE_SOCKS4
        # use this flag to save time: don't provision database for each query
        self._writable = False
        self._write_url = None
26 | self._writable = False 27 | #self.rethinker = doublethink.rethinker() 28 | self._write_url = None 29 | 30 | def _do_read(self, query, raw=False): 31 | # send query to server, return JSON 32 | rethinker = doublethink.Rethinker(db="trough_configuration", servers=self.rethinkdb) 33 | healthy_databases = list(rethinker.table('services').get_all(self.database, index='segment').run()) 34 | healthy_databases = [db for db in healthy_databases if db['role'] == 'trough-read' and (rethinker.now().run() - db['last_heartbeat']).seconds < db['ttl']] 35 | try: 36 | assert len(healthy_databases) > 0 37 | except: 38 | raise Exception('No healthy node found for segment %s' % self.database) 39 | url = urlparse(healthy_databases[0].get('url')) 40 | if self.proxy: 41 | conn = HTTPConnection(self.proxy, self.proxy_port) 42 | conn.set_tunnel(url.netloc, url.port) 43 | conn.sock = socks.socksocket() 44 | conn.sock.set_proxy(self.proxy_type, self.proxy, self.proxy_port) 45 | conn.sock.connect((url.netloc.split(":")[0], url.port)) 46 | else: 47 | conn = HTTPConnection(url.netloc) 48 | request_path = "%s?%s" % (url.path, url.query) 49 | conn.request("POST", request_path, query) 50 | response = conn.getresponse() 51 | results = json.loads(response.read()) 52 | self._last_results = results 53 | 54 | def _do_write(self, query): 55 | # send provision query to server if not self._write_url. 56 | # after send provision query, set self._write_url. 
57 | # send query to server, return JSON 58 | rethinker = doublethink.Rethinker(db="trough_configuration", servers=self.rethinkdb) 59 | services = doublethink.ServiceRegistry(rethinker) 60 | master_node = services.unique_service('trough-sync-master') 61 | logging.info('master_node=%r', master_node) 62 | if not master_node: 63 | raise Exception('no healthy trough-sync-master in service registry') 64 | if not self._write_url: 65 | buffer = BytesIO() 66 | c = pycurl.Curl() 67 | c.setopt(c.URL, master_node.get('url')) 68 | c.setopt(c.POSTFIELDS, self.database) 69 | if self.proxy: 70 | c.setopt(pycurl.PROXY, self.proxy) 71 | c.setopt(pycurl.PROXYPORT, int(self.proxy_port)) 72 | c.setopt(pycurl.PROXYTYPE, self.proxy_type) 73 | c.setopt(c.WRITEDATA, buffer) 74 | c.perform() 75 | c.close() 76 | self._write_url = buffer.getvalue() 77 | logging.info('self._write_url=%r', self._write_url) 78 | buffer = BytesIO() 79 | c = pycurl.Curl() 80 | c.setopt(c.URL, self._write_url) 81 | c.setopt(c.POSTFIELDS, query) 82 | if self.proxy: 83 | c.setopt(pycurl.PROXY, self.proxy) 84 | c.setopt(pycurl.PROXYPORT, int(self.proxy_port)) 85 | c.setopt(pycurl.PROXYTYPE, self.proxy_type) 86 | c.setopt(c.WRITEDATA, buffer) 87 | c.perform() 88 | c.close() 89 | response = buffer.getvalue() 90 | if response.strip() != b'OK': 91 | raise Exception('Trough Query Failed: Database: %r Response: %r Query: %.200r' % (self.database, response, query)) 92 | self._last_results = None 93 | def execute(self, sql, params=[], force=None, raw=False): 94 | query = sql % tuple(repr(param) for param in params) 95 | if force=='read' or query.strip()[:6].lower() == 'select': 96 | return self._do_read(query, raw) 97 | return self._do_write(query) 98 | def executemany(self, queries): 99 | query_types = set() 100 | split_queries = sqlparse.split(queries, encoding=None) 101 | for query in split_queries: 102 | query_types = (query.strip()[:6].lower() == 'select') 103 | if len(query_types > 1): 104 | raise Exception('Queries 
class TroughConnection():
    '''DB-API-style connection object for trough.'''

    def __init__(self, *args, database=None, rethinkdb=None, proxy=None,
                 proxy_port=9000, proxy_type='SOCKS5', **kwargs):
        self.database = database
        self.rethinkdb = rethinkdb
        self.proxy = proxy
        self.proxy_port = int(proxy_port)
        self.proxy_type = proxy_type

    def cursor(self):
        '''Return a new TroughCursor bound to this connection's settings.'''
        return TroughCursor(database=self.database,
                            rethinkdb=self.rethinkdb,
                            proxy=self.proxy,
                            proxy_port=self.proxy_port,
                            proxy_type=self.proxy_type)

    def execute(self, query):
        return self.cursor().execute(query)

    def executemany(self, queries):
        # bug fix: was `executemany(query)` -- `query` is undefined here
        return self.cursor().executemany(queries)

    def executescript(self, queries):
        # bug fix: was `executescript(query)` -- `query` is undefined here
        return self.cursor().executescript(queries)

    def close(self):
        '''No persistent connection; nothing to close.'''
        pass

    def commit(self):
        '''Trough writes are not transactional; commit is a no-op.'''
        pass

def connect(*args, **kwargs):
    '''DB-API-style entry point returning a TroughConnection.'''
    return TroughConnection(**kwargs)
servers=settings['RETHINKDB_HOSTS']) 18 | self.services = doublethink.ServiceRegistry(self.rethinker) 19 | self.registry = trough.sync.HostRegistry(rethinker=self.rethinker, services=self.services) 20 | trough.sync.init(self.rethinker) 21 | 22 | def proxy_for_write_host(self, node, segment, query, start_response): 23 | # enforce that we are querying the correct database, send an explicit hostname. 24 | write_url = "http://{node}:{port}/?segment={segment}".format(node=node, segment=segment.id, port=settings['READ_PORT']) 25 | with requests.post(write_url, stream=True, data=query) as r: 26 | status_line = '{status_code} {reason}'.format(status_code=r.status_code, reason=r.reason) 27 | # headers [('Content-Type','application/json')] 28 | headers = [("Content-Type", r.headers['Content-Type'],)] 29 | start_response(status_line, headers) 30 | for chunk in r.iter_content(): 31 | yield chunk 32 | 33 | def sql_result_json_iter(self, cursor): 34 | first = True 35 | yield b"[" 36 | try: 37 | while True: 38 | row = cursor.fetchone() 39 | if not row: 40 | break 41 | if not first: 42 | yield b",\n" 43 | output = dict((cursor.description[i][0], value) for i, value in enumerate(row)) 44 | yield ujson.dumps(output, escape_forward_slashes=False).encode('utf-8') 45 | first = False 46 | yield b"]\n" 47 | except Exception as e: 48 | logging.error('exception in middle of streaming response', exc_info=1) 49 | finally: 50 | # close the cursor 'finally', in case there is an Exception. 51 | cursor.close() 52 | cursor.connection.close() 53 | 54 | def execute_query(self, segment, query): 55 | '''Returns a cursor.''' 56 | logging.info('Servicing request: {query}'.format(query=query)) 57 | # if the user sent more than one query, or the query is not a SELECT, raise an exception. 
58 | if len(sqlparse.split(query)) != 1 or sqlparse.parse(query)[0].get_type() != 'SELECT': 59 | raise Exception('Exactly one SELECT query per request, please.') 60 | assert os.path.isfile(segment.local_path()) 61 | 62 | logging.info("Connecting to sqlite database: {segment}".format(segment=segment.local_path())) 63 | connection = sqlite3.connect(segment.local_path()) 64 | trough.sync.setup_connection(connection) 65 | cursor = connection.cursor() 66 | cursor.execute(query.decode('utf-8')) 67 | return cursor 68 | 69 | # uwsgi endpoint 70 | def __call__(self, env, start_response): 71 | try: 72 | query_dict = urllib.parse.parse_qs(env['QUERY_STRING']) 73 | # use the ?segment= query string variable or the host string to figure out which sqlite database to talk to. 74 | segment_id = query_dict.get('segment', env.get('HTTP_HOST', "").split("."))[0] 75 | logging.info('Connecting to Rethinkdb on: %s' % settings['RETHINKDB_HOSTS']) 76 | segment = trough.sync.Segment(segment_id=segment_id, size=0, rethinker=self.rethinker, services=self.services, registry=self.registry) 77 | content_length = int(env.get('CONTENT_LENGTH', 0)) 78 | query = env.get('wsgi.input').read(content_length) 79 | 80 | write_lock = segment.retrieve_write_lock() 81 | if write_lock and write_lock['node'] != settings['HOSTNAME']: 82 | logging.info('Found write lock for {segment}. Proxying {query} to {host}'.format(segment=segment.id, query=query, host=write_lock['node'])) 83 | return self.proxy_for_write_host(write_lock['node'], segment, query, start_response) 84 | 85 | ## # enforce that we are querying the correct database, send an explicit hostname. 
def configure_logging():
    '''Configure root logging from $TROUGH_LOG_LEVEL and quiet noisy
    third-party loggers.'''
    logging.root.handlers = []
    level = getattr(logging, os.environ.get('TROUGH_LOG_LEVEL', 'INFO'))
    logging.basicConfig(stream=sys.stdout, level=level, format=(
        '%(asctime)s %(levelname)s %(name)s.%(funcName)s'
        '(%(filename)s:%(lineno)d) %(message)s'))
    # third-party loggers get progressively quieter than our own level
    for noisy, bump in (
            ('requests.packages.urllib3', 20), ('urllib3', 20),
            ('snakebite', 10), ('hdfs3', 10)):
        logging.getLogger(noisy).setLevel(level + bump)

    # emit warning (now that logging works) if settings file failed to load
    if file_load_error is not None:
        logging.warning('%s -- using default settings', file_load_error)

def sizeof_fmt(num, suffix='B'):
    '''Render a byte count human-readably, e.g. 1536 -> "1.5KiB".'''
    for prefix in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, prefix, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)
def get_ip():
    '''Discover this host's outbound IP by connecting a UDP socket (no
    traffic is actually sent).'''
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    s.connect(('10.255.255.255', 1)) # ip doesn't need to be reachable
    output = s.getsockname()[0]
    s.close()
    return output

def get_storage_in_bytes():
    '''
    Set a reasonable default for storage quota.

    Look up the settings['LOCAL_DATA'] directory, calculate the bytes on the
    device on which it is mounted, take 80% of total.
    '''
    path = settings['LOCAL_DATA']
    while True:
        try:
            statvfs = os.statvfs(path)
            return int(statvfs.f_frsize * statvfs.f_blocks * 0.8)
        except OSError:
            # bug fix: a bare `except` here could loop forever once `path`
            # reached the filesystem root; walk up one level and give up
            # at the root
            parent = os.path.dirname(path)
            if parent == path:
                raise
            path = parent

settings = {
    'LOCAL_DATA': '/var/tmp/trough',
    'READ_THREADS': '10',
    'WRITE_THREADS': '5',
    'ELECTION_CYCLE': 10, # how frequently should I hold an election for sync master server? In seconds
    # 'ROLE': 'READ', # READ, WRITE, SYNCHRONIZE, CONSUL # commented: might not need this, handle via ansible/docker?
    'HDFS_PATH': '/tmp/trough', # /ait/prod/trough/
    'HDFS_HOST': 'localhost',
    'HDFS_PORT': 8020,
    'READ_PORT': 6444,
    'WRITE_PORT': 6222,
    'SYNC_SERVER_PORT': 6111,
    'SYNC_LOCAL_PORT': 6112,
    'EXTERNAL_IP': None,
    'HOST_CHECK_WAIT_PERIOD': 5, # if the sync master starts before anything else, poll for hosts to assign to every N seconds.
    'STORAGE_IN_BYTES': None, # this will be set later, if it is not set in settings.yml
    'HOSTNAME': socket.gethostname(),
    'READ_NODE_DNS_TTL': 60 * 10, # 10 minute default
    'READ_DATABASE_DNS_TTL': 60 * 10, # 10 minute default
    'SYNC_LOOP_TIMING': 60 * 2, # do a 'sync' loop every N seconds (default: 2m. applies to both local and master sync nodes)
    'RETHINKDB_HOSTS': ["localhost",],
    'MINIMUM_ASSIGNMENTS': 2,
    'MAXIMUM_ASSIGNMENTS': 2,
    'SENTRY_DSN': None,
    'LOG_LEVEL': 'INFO',
    'RUN_AS_COLD_STORAGE_NODE': False,
    'COLD_STORAGE_PATH': "/mount/hdfs/trough-data/{prefix}/{segment_id}.sqlite",
    'COLD_STORE_SEGMENT': False,
    'COPY_THREAD_POOL_SIZE': 2,
}
file_load_error = None

# load overrides from the yaml settings file, if present; remember any
# failure so configure_logging() can warn about it later
try:
    with open(os.environ.get('TROUGH_SETTINGS') or '/etc/trough/settings.yml') as f:
        yaml_settings = yaml.safe_load(f)
        # AttributeError here covers an empty file (safe_load returns None)
        for key in yaml_settings.keys():
            settings[key] = yaml_settings[key]
except (IOError, AttributeError) as e:
    file_load_error = e

# if the user provided a lambda, we have to eval() it, :gulp:
# SECURITY NOTE(review): eval of config-supplied code -- settings.yml must
# be a trusted file
if "lambda" in str(settings['MINIMUM_ASSIGNMENTS']):
    settings['MINIMUM_ASSIGNMENTS'] = eval(settings['MINIMUM_ASSIGNMENTS'])

if "lambda" in str(settings['COLD_STORE_SEGMENT']):
    settings['COLD_STORE_SEGMENT'] = eval(settings['COLD_STORE_SEGMENT'])

if settings['EXTERNAL_IP'] is None:
    settings['EXTERNAL_IP'] = get_ip()

if settings['STORAGE_IN_BYTES'] is None:
    settings['STORAGE_IN_BYTES'] = get_storage_in_bytes()

def init_worker():
    '''
    Some initial setup for worker nodes: make sure LOCAL_DATA exists.
    '''
    if not os.path.isdir(settings['LOCAL_DATA']):
        logging.info(
                "LOCAL_DATA path %s does not exist. Attempting to make "
                "dirs." % settings['LOCAL_DATA'])
        os.makedirs(settings['LOCAL_DATA'])
# Exceptions which, if unhandled, will *not* be sent to sentry as events.
# These exceptions are filtered to reduce excessive event volume from
# burdening sentry infrastructure.
SENTRY_FILTERED_EXCEPTIONS = (
    snakebite.errors.FileNotFoundException,
    sqlite3.DatabaseError,
    sqlite3.OperationalError,
)


def try_init_sentry():
    """Attempts to initialize the sentry sdk, if available."""

    def _before_send(event, hint):
        # drop events for the filtered exception types; see
        # https://docs.sentry.io/platforms/python/configuration/filtering/#event-hints
        if 'exc_info' in hint:
            exc_type, exc_value, tb = hint['exc_info']
            if isinstance(exc_value, SENTRY_FILTERED_EXCEPTIONS):
                return None
        return event

    sentry_dsn = settings.get('SENTRY_DSN')
    if sentry_dsn is not None:
        try:
            # imported lazily so sentry_sdk stays an optional dependency
            import sentry_sdk
            sentry_sdk.init(sentry_dsn, before_send=_before_send)
        except ImportError:
            logging.warning(
                "'SENTRY_DSN' setting is configured but 'sentry_sdk' module "
                "not available. Install to use sentry."
            )

HISTORY_FILE = os.path.expanduser('~/.trough_history')

class BetterArgumentDefaultsHelpFormatter(
        argparse.ArgumentDefaultsHelpFormatter,
        argparse.RawDescriptionHelpFormatter):
    '''
    HelpFormatter with these properties:

    - formats option help like argparse.ArgumentDefaultsHelpFormatter except
      that it omits the default value for arguments with action='store_const'
    - like argparse.RawDescriptionHelpFormatter, does not reformat description
      string
    '''
    def _get_help_string(self, action):
        # store_const args: plain help text, no "(default: ...)" suffix
        if isinstance(action, argparse._StoreConstAction):
            return action.help
        return argparse.ArgumentDefaultsHelpFormatter._get_help_string(
                self, action)
class TroughShell(cmd.Cmd):
    intro = 'Welcome to the trough shell. Type help or ? to list commands.\n'
    logger = logging.getLogger('trough.client.TroughShell')

    def __init__(
            self, trough_client, segments, writable=False,
            schema_id='default'):
        '''
        :param trough_client: TroughClient used for server interaction
        :param segments: initial list of segment ids to operate on
        :param writable: allow writes/DDL when True
        :param schema_id: schema used when provisioning new segments
        '''
        super().__init__()
        self.cli = trough_client
        self.segments = segments
        self.writable = writable
        self.schema_id = schema_id
        self.format = 'table'
        self.pager_pipe = None
        self.update_prompt()

    def onecmd(self, line):
        '''Run one command, reporting trough errors in a friendly way.'''
        try:
            return super().onecmd(line)
        except trough.client.TroughException as e:
            if e.returned_message and e.payload:
                print("An error occured during execution:")
                print(e.returned_message.replace("500 Server Error: ", ""))
                print("(query was: '%s')" % e.payload.decode().strip())
            else:
                self.logger.error('caught exception', exc_info=True)
        except Exception as e:
            self.logger.error('caught exception', exc_info=True)

    def table(self, dictlist):
        '''Format a non-empty list of row-dicts as an ascii grid string.'''
        assert dictlist
        # widest content (or header) per column
        widths = {}
        for row in dictlist:
            for key, val in row.items():
                widths[key] = max(
                        widths.get(key, 0), len(key),
                        len(str(val) if val is not None else ''))

        if not self.column_keys:
            # column order: id first, then narrowest column to widest, with
            # column name alphabetical as tiebreaker; computed once per
            # pager session
            keys = list(dictlist[0].keys())
            keys.sort(key=lambda k: (0, '!') if k == 'id'
                      else (widths[k], k))
            self.column_keys = keys

        # compose a formatter-string
        rowfmt = "| " + " | ".join(
                "{:<%s}" % widths[k] for k in self.column_keys) + " |\n"
        border = "+" + "+".join(
                "-" * (widths[k] + 2) for k in self.column_keys) + "+\n"
        # header, then each row followed by a border line
        out = border + rowfmt.format(*self.column_keys) + border
        for row in dictlist:
            out += rowfmt.format(*[
                str(row[k]) if row[k] is not None else ''
                for k in self.column_keys])
            out += border
        return out
* (max_lengths[k] + 2) for k in self.column_keys]) + "+\n" 89 | s += border 90 | header = lenstr.format(*self.column_keys) 91 | s += header 92 | s += border 93 | # print rows and borders 94 | for row in dictlist: 95 | formatted = lenstr.format(*[ 96 | str(row[k]) if row[k] is not None else '' 97 | for k in self.column_keys]) 98 | s += formatted 99 | s += border 100 | return s 101 | 102 | def display(self, result): 103 | if self.pager_pipe: 104 | out = self.pager_pipe 105 | else: 106 | out = sys.stdout 107 | 108 | try: 109 | if not result: 110 | print('', file=out) 111 | return 0 112 | elif self.format == 'table': 113 | n_rows = 0 114 | result = list(result) 115 | print(self.table(result), end='', file=out) 116 | return len(result) 117 | elif self.format == 'pretty': 118 | print(json.dumps(result, indent=2), file=out) 119 | return len(result) 120 | else: 121 | print(json.dumps(result), file=out) 122 | return len(result) 123 | except BrokenPipeError: 124 | pass # user quit the pager 125 | 126 | def update_prompt(self): 127 | if not self.segments: 128 | self.prompt = 'trough> ' 129 | elif len(self.segments) == 1: 130 | self.prompt = 'trough:%s(%s)> ' % ( 131 | self.segments[0], 'rw' if self.writable else 'ro') 132 | else: 133 | self.prompt = 'trough:[%s segments](%s)> ' % ( 134 | len(self.segments), 'rw' if self.writable else 'ro') 135 | 136 | def do_show(self, argument): 137 | ''' 138 | SHOW command, like MySQL. 
Available subcommands: 139 | - SHOW TABLES 140 | - SHOW CREATE TABLE 141 | - SHOW CONNECTIONS 142 | - SHOW SCHEMA schema-name 143 | - SHOW SCHEMAS 144 | - SHOW SEGMENTS 145 | - SHOW SEGMENTS MATCHING 146 | ''' 147 | with self.pager(): 148 | argument = argument.replace(";", "").lower() 149 | if argument[:6] == 'tables': 150 | self.do_select("name from sqlite_master where type = 'table';") 151 | elif argument[:12] == 'create table': 152 | self.do_select( 153 | "sql from sqlite_master where type = 'table' " 154 | "and name = '%s';" % argument[12:].replace(';', '').strip()) 155 | elif argument[:7] == 'schemas': 156 | result = self.cli.schemas() 157 | self.display(result) 158 | elif argument[:11] == 'connections': 159 | connections = [] 160 | for segment in sorted(self.segments): 161 | conn = {'segment_id': segment} 162 | if self.writable: 163 | try: 164 | conn['write_url'] = self.cli.write_url(segment) 165 | except: 166 | conn['write_url'] = None 167 | try: 168 | conn['read_url'] = self.cli.read_url(segment) 169 | except: 170 | conn['read_url'] = None 171 | connections.append(conn) 172 | self.display(connections) 173 | elif argument[:7] == 'schema ': 174 | name = argument[7:].strip() 175 | result = self.cli.schema(name) 176 | self.display(result) 177 | elif argument[:8] == 'segments': 178 | regex = None 179 | if "matching" in argument: 180 | regex = argument.split("matching")[-1].strip().strip('"').strip("'") 181 | try: 182 | start = datetime.datetime.now() 183 | result = self.cli.readable_segments(regex=regex) 184 | end = datetime.datetime.now() 185 | n_rows = self.display(result) 186 | print("%s results" % n_rows, file=self.pager_pipe) 187 | except Exception as e: 188 | self.logger.error(e, exc_info=True) 189 | else: 190 | self.do_help('show') 191 | 192 | def do_connect(self, argument): 193 | ''' 194 | Connect to one or more trough "segments" (sqlite databases). 195 | Usage: 196 | 197 | - CONNECT segment [segment...] 
198 | - CONNECT MATCHING 199 | 200 | See also SHOW CONNECTIONS 201 | ''' 202 | argument = re.sub(r';+$', '', argument.strip().lower()) 203 | if not argument: 204 | self.do_help('connect') 205 | return 206 | 207 | if argument[:8] == 'matching': 208 | seg_urls = self.cli.read_urls_for_regex(argument[8:].lstrip()) 209 | self.segments = seg_urls.keys() 210 | else: 211 | self.segments = argument.split() 212 | self.update_prompt() 213 | 214 | def do_format(self, raw_arg): 215 | ''' 216 | Set result output display format. Options: 217 | 218 | - FORMAT TABLE - tabular format (the default) 219 | - FORMAT PRETTY - pretty-printed json 220 | - FORMAT RAW - raw json 221 | 222 | With no argument, displays current output format. 223 | ''' 224 | arg = raw_arg.strip().lower() 225 | if not arg: 226 | print('Format is %r' % self.format) 227 | elif arg in ('table', 'pretty', 'raw'): 228 | self.format = arg 229 | print('Format is now %r' % self.format) 230 | else: 231 | self.do_help('format') 232 | 233 | async def async_select(self, segment, query): 234 | result = await self.cli.async_read(segment, query) 235 | try: 236 | print('+++++ results from segment %s +++++' % segment, 237 | file=self.pager_pipe or sys.stdout) 238 | except BrokenPipeError: 239 | pass 240 | return self.display(result) # returns number of rows 241 | 242 | async def async_fanout(self, query): 243 | tasks = [] 244 | for segment in self.segments: 245 | task = asyncio.ensure_future(self.async_select(segment, query)) 246 | tasks.append(task) 247 | results = await asyncio.gather(*tasks, return_exceptions=True) 248 | for i, result in enumerate(results): 249 | if isinstance(result, BaseException): 250 | try: 251 | raise result 252 | except: 253 | if isinstance(result, trough.client.TroughException) and result.returned_message and result.payload: 254 | print("An error occured during execution:") 255 | print(result.returned_message.replace("500 Server Error: ", "")) 256 | print("(query was: '%s')" % 
result.payload.decode().strip()) 257 | else: 258 | self.logger.warning( 259 | 'async_fanout results[%r] is an exception:', 260 | i, exc_info=True) 261 | elif result: 262 | self.n_rows += result 263 | 264 | def do_select(self, line): 265 | '''Send a query to the currently-connected trough segment. 266 | 267 | Syntax: select... 268 | 269 | Example: Send query "select * from host_statistics;" to server 270 | trough> select * from host_statistics; 271 | ''' 272 | if not self.segments: 273 | print('not connected to any segments') 274 | return 275 | 276 | query = 'select ' + line 277 | with self.pager(): 278 | try: 279 | self.n_rows = 0 280 | loop = asyncio.get_event_loop() 281 | future = asyncio.ensure_future(self.async_fanout(query)) 282 | loop.run_until_complete(future) 283 | # XXX not sure how to measure time not including user time 284 | # scrolling around in `less` 285 | print('%s total results' % self.n_rows, file=self.pager_pipe) 286 | except Exception as e: 287 | self.logger.error(e, exc_info=True) 288 | 289 | @contextmanager 290 | def pager(self): 291 | if self.pager_pipe: 292 | # reentrancy! 293 | yield 294 | return 295 | 296 | self.column_keys = None 297 | cmd = os.environ.get('PAGER') or '/usr/bin/less -nFSX' 298 | try: 299 | with subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) as proc: 300 | with io.TextIOWrapper( 301 | proc.stdin, errors='backslashreplace') as self.pager_pipe: 302 | yield 303 | proc.wait() 304 | except BrokenPipeError: 305 | pass # user quit the pager 306 | self.pager_pipe = None 307 | 308 | def emptyline(self): 309 | pass 310 | 311 | def do_promote(self, args): 312 | ''' 313 | Promote connected segments to permanent storage in hdfs. 314 | 315 | Takes no arguments. Only supported in read-write mode. 
316 | ''' 317 | if args.strip(): 318 | self.do_help('promote') 319 | return 320 | if not self.segments: 321 | print('not connected to any segments') 322 | return 323 | if not self.writable: 324 | print('promoting segments not supported in read-only mode') 325 | return 326 | for segment in self.segments: 327 | self.cli.promote(segment) 328 | 329 | def do_infile(self, filename): 330 | ''' 331 | Read and execute SQL commands from a file. 332 | 333 | Usage: 334 | 335 | INFILE filename 336 | ''' 337 | if not filename: 338 | self.do_help('infile') 339 | return 340 | with open(filename.strip(), 'r') as infile: 341 | if self.writable: 342 | if len(self.segments) == 1: 343 | self.cli.write(self.segments[0], infile.read(), schema_id=self.schema_id) 344 | elif not self.segments: 345 | print('not connected to any segments') 346 | elif len(self.segments) > 1: 347 | print('writing to multiple segments not supported') 348 | else: 349 | self.logger.error( 350 | 'invalid command "%s %s", and refusing to execute arbitrary ' 351 | 'sql (in read-only mode)', 'infile', filename) 352 | 353 | 354 | def do_register(self, line): 355 | ''' 356 | Register a new schema. Reads the schema from 'schema_file' argument, registers as "schema_name" 357 | 358 | Usage: 359 | 360 | REGISTER SCHEMA schema_name schema_file 361 | or 362 | REGISTER schema_name schema_file 363 | 364 | See also: SHOW SCHEMA(S) 365 | ''' 366 | args = line.split() 367 | if args[0].lower() == 'schema': 368 | args.pop(0) 369 | if len(args) != 2: 370 | print("please provide exactly two arguments: schema_name schema_file. You provided \"%s\"." % (" ".join(args), line)) 371 | self.do_help('register') 372 | return 373 | with open(args[1], 'r') as infile: 374 | schema = infile.read() 375 | print("registering schema '%s'...\n%s" % (args[0], schema)) 376 | self.cli.register_schema(args[0], schema) 377 | print("Done.") 378 | 379 | 380 | def do_shred(self, argument): 381 | ''' 382 | Delete segments entirely from trough. 
CAUTION: Not reversible! 383 | Usage: 384 | 385 | SHRED SEGMENT segment_id [segment_id...] 386 | ''' 387 | argument = re.sub(r';+$', '', argument.strip()).strip() 388 | if not argument: 389 | self.do_help('shred') 390 | return 391 | 392 | args = argument.split() 393 | if args[0].lower() != 'segment' or len(args) < 2: 394 | self.do_help('shred') 395 | return 396 | 397 | if self.writable: 398 | for arg in args[1:]: 399 | self.cli.delete_segment(arg) 400 | else: 401 | self.logger.error('SHRED disallowed in read-only mode') 402 | return 403 | 404 | def default(self, line): 405 | keyword_args = line.strip().split(maxsplit=1) 406 | 407 | if len(keyword_args) == 1: 408 | keyword, args = keyword_args[0], '' 409 | else: 410 | keyword, args = keyword_args[0], keyword_args[1] 411 | 412 | if getattr(self, 'do_' + keyword.lower(), None): 413 | getattr(self, 'do_' + keyword.lower())(args) 414 | elif self.writable: 415 | if len(self.segments) == 1: 416 | self.cli.write(self.segments[0], line, schema_id=self.schema_id) 417 | elif not self.segments: 418 | print('not connected to any segments') 419 | elif len(self.segments) > 1: 420 | print('writing to multiple segments not supported') 421 | else: 422 | self.logger.error( 423 | 'invalid command %r, and refusing to execute arbitrary ' 424 | 'sql (in read-only mode)', keyword) 425 | 426 | def do_quit(self, args): 427 | '''Exit the trough shell.''' 428 | if not args: 429 | print('bye!') 430 | return True 431 | do_EOF = do_quit 432 | do_exit = do_quit 433 | do_bye = do_quit 434 | 435 | def do_help(self, arg): 436 | super().do_help(arg.lower()) 437 | 438 | def trough_shell(argv=None): 439 | argv = argv or sys.argv 440 | arg_parser = argparse.ArgumentParser( 441 | prog=os.path.basename(argv[0]), 442 | formatter_class=BetterArgumentDefaultsHelpFormatter) 443 | arg_parser.add_argument( 444 | '-u', '--rethinkdb-trough-db-url', 445 | default='rethinkdb://localhost/trough_configuration') 446 | arg_parser.add_argument('-w', '--writable', 
def trough_shell(argv=None):
    '''
    Entry point for the interactive trough shell.

    Parses the command line, configures stdout logging, connects a
    TroughClient, and runs the command loop with readline history persisted
    to HISTORY_FILE.
    '''
    argv = argv or sys.argv
    parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-u', '--rethinkdb-trough-db-url',
        default='rethinkdb://localhost/trough_configuration')
    parser.add_argument('-w', '--writable', action='store_true')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument(
        '-s', '--schema', default='default',
        help='schema id for new segment')
    parser.add_argument('segment', nargs='*')
    args = parser.parse_args(args=argv[1:])

    # drop any handlers installed by imported libraries, then install ours
    logging.root.handlers = []
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        stream=sys.stdout, level=log_level, format=(
            '%(asctime)s %(levelname)s %(name)s.%(funcName)s'
            '(%(filename)s:%(lineno)d) %(message)s'))
    # quiet down chatty third-party loggers
    for noisy in ('requests.packages.urllib3', 'urllib3', 'asyncio'):
        logging.getLogger(noisy).setLevel(logging.WARNING)

    cli = trough.client.TroughClient(args.rethinkdb_trough_db_url)
    shell = TroughShell(cli, args.segment, args.writable, args.schema)

    if os.path.exists(HISTORY_FILE):
        readline.read_history_file(HISTORY_FILE)
    try:
        shell.cmdloop()
    finally:
        readline.write_history_file(HISTORY_FILE)
def healthy_services_query(rethinker, role):
    '''
    Build (but do not run) a rethinkdb query selecting 'services' rows with
    the given role whose last heartbeat is more recent than their ttl.
    '''
    return rethinker.table('services', read_mode='outdated')\
        .filter({"role": role})\
        .filter(
            lambda svc: r.now().sub(svc["last_heartbeat"]).lt(svc["ttl"]))

def setup_connection(conn):
    '''
    Register trough's custom SQL functions (REGEXP, SEEDCRAWLEDSTATUS,
    BUILDREDIRECTARRAY) on the sqlite3 connection `conn`.
    '''
    def regexp(expr, item):
        # REGEXP(pattern, value) -> bool; NULL value never matches
        try:
            if item is None:
                return False
            reg = re.compile(expr)
            return reg.search(item) is not None
        except Exception:
            # bug fix: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt before re-raising
            logging.error('REGEXP(%r, %r)', expr, item, exc_info=True)
            raise

    # TODO these next two functions are stupidly specific to archive-it
    def seed_crawled_status_filter(status_code):
        ''' convert crawler status codes to human-readable text '''
        try:
            status_code = int(status_code)
        except (TypeError, ValueError):
            # bug fix: int() raises ValueError for non-numeric strings, which
            # the original `except TypeError` did not catch (the exception
            # then surfaced as a sqlite OperationalError)
            return 'Not crawled (%s)' % status_code

        if status_code >= 300 and status_code < 400:
            return 'Redirected'
        elif status_code >= 400:
            return 'Crawled (HTTP error %s)' % status_code
        elif status_code > 0:
            return 'Crawled'
        elif status_code in (0, -5003, -5004):
            return 'Not crawled (queued)'
        elif status_code == -9998:
            return 'Not crawled (blocked by robots)'
        else:
            return 'Not crawled (%s)' % status_code

    def build_redirect_array(redirect_url, redirect_status, hop_path, json_list=None):
        # hop_path length doubles as the 1-based hop number; the original
        # left a note wondering how many seeds ever exceed 50 redirects
        hop_no = len(hop_path)
        if json_list:
            json_list = json.loads(json_list)
        else:
            json_list = []
        if hop_no > len(json_list):
            # pad with None so we can assign at index hop_no-1
            json_list.extend(None for i in range(hop_no - len(json_list)))
        redirect = {'seed': redirect_url, 'status': seed_crawled_status_filter(redirect_status) }
        json_list[(hop_no-1)] = redirect
        return json.dumps(json_list)

    conn.create_function('REGEXP', 2, regexp)
    conn.create_function('SEEDCRAWLEDSTATUS', 1, seed_crawled_status_filter)
    conn.create_function('BUILDREDIRECTARRAY', 4, build_redirect_array)

class AssignmentQueue:
    '''Batches 'assignment' table inserts; auto-commits every 1000 items.'''
    def __init__(self, rethinker):
        self._queue = []
        self.rethinker = rethinker
    def enqueue(self, item):
        self._queue.append(item)
        if self.length() >= 1000:
            self.commit()
    def commit(self):
        logging.info("Committing %s assignments", self.length())
        self.rethinker.table('assignment').insert(self._queue).run()
        del self._queue[:]
    def length(self):
        return len(self._queue)

class UnassignmentQueue(AssignmentQueue):
    '''Same batching, but commit() deletes the queued assignments instead.'''
    def commit(self):
        logging.info("Committing %s unassignments", self.length())
        ids = [item.id for item in self._queue]
        self.rethinker.table('assignment').get_all(*ids).delete().run()
        del self._queue[:]
class Assignment(doublethink.Document):
    '''
    rethinkdb-backed record assigning one segment to one worker node;
    primary key is "<node>:<segment>". (Class header straddles in from the
    previous chunk boundary; reproduced complete here.)
    '''
    def populate_defaults(self):
        if not "id" in self:
            self.id = "{node}:{segment}".format(node=self.node, segment=self.segment)
            self._pk = "id"
    @classmethod
    def table_create(cls, rr):
        rr.table_create(cls.table).run()
        rr.table(cls.table).index_create('segment').run()
        rr.table(cls.table).index_wait('segment').run()
    @classmethod
    def host_assignments(cls, rr, node):
        # ids are "<node>:<segment>"; \x01..\x7f brackets every segment id
        # belonging to `node`
        return (Assignment(rr, d=asmt) for asmt in rr.table(cls.table, read_mode='outdated').between('%s:\x01' % node, '%s:\x7f' % node, right_bound="closed").run())
    @classmethod
    def all(cls, rr):
        return (Assignment(rr, d=asmt) for asmt in rr.table(cls.table, read_mode='outdated').run())
    @classmethod
    def segment_assignments(cls, rr, segment):
        return (Assignment(rr, d=asmt) for asmt in rr.table(cls.table, read_mode='outdated').get_all(segment, index="segment").run())

class Lock(doublethink.Document):
    '''Advisory lock rows, e.g. "write:lock:<segment>".'''
    @classmethod
    def table_create(cls, rr):
        rr.table_create(cls.table).run()
        rr.table(cls.table).index_create('node').run()
        rr.table(cls.table).index_wait('node').run()
    @classmethod
    def acquire(cls, rr, pk, document=None):
        '''Acquire a lock. Raises an exception if the lock key exists.'''
        # bug fix: the default was a mutable `document={}` shared across
        # calls; keep mutating a caller-supplied dict for compatibility
        if document is None:
            document = {}
        document["id"] = pk
        document["node"] = settings['HOSTNAME']
        document["acquired_on"] = doublethink.utcnow()
        output = rr.table(cls.table).insert(document).run()
        if output.get('errors'):
            raise Exception('Unable to acquire a lock for id: "%s"' % pk)
        return cls(rr, d=document)
    def release(self):
        return self.rr.table(self.table, read_mode='majority').get(self.id).delete().run()
    @classmethod
    def host_locks(cls, rr, host):
        return (Lock(rr, d=asmt) for asmt in rr.table(cls.table, read_mode='outdated').get_all(host, index="node").run())

class Schema(doublethink.Document):
    '''Named sql schema used to provision new segments.'''
    pass

def init(rethinker):
    '''
    Ensure rethinkdb tables/indexes and the hdfs data dir exist, and insert
    the empty 'default' schema if it is missing.
    '''
    Assignment.table_ensure(rethinker)
    Lock.table_ensure(rethinker)
    Schema.table_ensure(rethinker)
    default_schema = Schema.load(rethinker, 'default')
    if not default_schema:
        default_schema = Schema(rethinker, d={'sql':''})
        default_schema.id = 'default'
        logging.info('saving default schema %r', default_schema)
        default_schema.save()
    else:
        logging.info('default schema already exists %r', default_schema)
    try:
        rethinker.table('services').index_create('segment').run()
        rethinker.table('services').index_create('role').run()
        rethinker.table('services').index_wait('segment').run()
        rethinker.table('services').index_wait('role').run()
    except Exception:
        # best effort: the indexes may already exist
        pass

    snakebite_client = client.Client(settings['HDFS_HOST'], settings['HDFS_PORT'])
    for d in snakebite_client.mkdir([settings['HDFS_PATH']], create_parent=True):
        logging.info('created hdfs dir %r', d)

class Segment(object):
    '''In-memory handle on one trough sqlite segment.'''
    def __init__(self, segment_id, size, rethinker, services, registry, remote_path=None):
        self.id = segment_id
        self.size = int(size)
        self.rethinker = rethinker
        self.services = services
        self.registry = registry
        self.remote_path = remote_path
    def host_key(self, host):
        return "%s:%s" % (host, self.id)
    def all_copies(self):
        ''' returns the 'assigned' segment copies, whether or not they are 'up' '''
        return Assignment.segment_assignments(self.rethinker, self.id)
    def readable_copies_query(self):
        return self.rethinker.table('services', read_mode='outdated').get_all(self.id, index="segment").filter({"role": 'trough-read'}).filter(
            lambda svc: r.now().sub(svc["last_heartbeat"]).lt(svc["ttl"])
        )
    def readable_copies(self):
        '''returns the 'up' copies of this segment to read from, per rethinkdb.'''
        return self.readable_copies_query().run()
    def readable_copies_count(self):
        '''returns the count of 'up' copies of this segment to read from, per rethinkdb.'''
        return self.readable_copies_query().count().run()
    def writable_copies_query(self):
        return healthy_services_query(self.rethinker, role='trough-write').get_all(self.id, index='segment')
    def writable_copy(self):
        '''returns the 'up' copy of this segment to write to, per rethinkdb, or None.'''
        copies = list(self.writable_copies_query().run())
        if copies:
            return copies[0]
        return None
    def is_assigned_to_host(self, host):
        return bool(Assignment.load(self.rethinker, self.host_key(host)))
    def minimum_assignments(self):
        '''This function should return the minimum number of assignments which is acceptable for a given segment.'''
        if hasattr(settings['MINIMUM_ASSIGNMENTS'], "__call__"):
            return settings['MINIMUM_ASSIGNMENTS'](self.id)
        else:
            return settings['MINIMUM_ASSIGNMENTS']
    def cold_store(self):
        if hasattr(settings['COLD_STORE_SEGMENT'], "__call__"):
            return settings['COLD_STORE_SEGMENT'](self.id)
        else:
            return settings['COLD_STORE_SEGMENT']
    def cold_storage_path(self):
        return settings['COLD_STORAGE_PATH'].format(prefix=str(self.id)[0:-3], segment_id=self.id)
    def new_write_lock(self):
        '''Raises exception if lock exists.'''
        return Lock.acquire(self.rethinker, pk='write:lock:%s' % self.id, document={ "segment": self.id })
    def retrieve_write_lock(self):
        '''Returns None or dict. Can be used to evaluate whether a lock exists and, if so, which host holds it.'''
        return Lock.load(self.rethinker, 'write:lock:%s' % self.id)
    def local_host_can_write(self):
        write_lock = self.retrieve_write_lock()
        if write_lock and write_lock['node'] == settings['HOSTNAME']:
            return write_lock
        else:
            return None
    def local_path(self):
        if self.cold_store():
            return self.cold_storage_path()
        return os.path.join(settings['LOCAL_DATA'], "%s.sqlite" % self.id)
    def local_segment_exists(self):
        return os.path.isfile(self.local_path())
    def provision_local_segment(self, schema_sql):
        connection = sqlite3.connect(self.local_path())
        setup_connection(connection)
        cursor = connection.cursor()
        cursor.executescript(schema_sql)
        cursor.close()
        connection.commit()
        connection.close()
        logging.info('provisioned %s', self.local_path())
    def __repr__(self):
        # bug fix: the original format string was empty ('' % (...)) which
        # raises TypeError; the angle-bracket text was apparently lost in
        # markup stripping -- reconstructed repr, TODO confirm exact wording
        return '<Segment id=%r local_path=%r>' % (self.id, self.local_path())
class HostRegistry(object):
    '''Registry of trough worker nodes and their segment assignments.'''
    def __init__(self, rethinker, services):
        self.rethinker = rethinker
        self.services = services
        self.assignment_queue = AssignmentQueue(self.rethinker)
        self.unassignment_queue = UnassignmentQueue(self.rethinker)

    def get_hosts(self, exclude_cold=True):
        '''Healthy trough-nodes services ordered by load, optionally
        excluding cold-storage nodes.'''
        alive = self.rethinker.table('services').between(
            'trough-nodes:!', 'trough-nodes:~').filter(
                lambda svc: r.now().sub(svc["last_heartbeat"]).lt(svc["ttl"])
            ).order_by("load")
        if exclude_cold:
            alive = alive.filter(r.row['cold_storage'].default(False).not_())
        return list(alive.run())

    def get_cold_hosts(self):
        '''Healthy cold-storage trough-nodes services ordered by load.'''
        query = self.rethinker.table('services').between(
            'trough-nodes:!', 'trough-nodes:~').filter(
                lambda svc: r.now().sub(svc["last_heartbeat"]).lt(svc["ttl"])
            ).filter({"cold_storage": True}).order_by("load")
        return list(query.run())

    def total_bytes_for_node(self, node):
        '''Capacity advertised by `node`; raises if the node is unknown.'''
        for svc in self.services.available_services('trough-nodes'):
            if svc['node'] == node:
                return svc.get('available_bytes')
        raise Exception('Could not find node "%s"' % node)

    def heartbeat(self, pool=None, node=None, ttl=None, **doc):
        '''Upsert one service entry with id "<pool>:<node>:<segment>".'''
        if None in [pool, node, ttl]:
            raise Exception('"pool", "node" and "ttl" are required arguments.')
        doc['id'] = "%s:%s:%s" % (pool, node, doc.get('segment'))
        logging.info("Setting Heartbeat ID to [%s]" % doc['id'])
        doc['role'], doc['node'], doc['ttl'] = pool, node, ttl
        doc['load'] = os.getloadavg()[1]  # load average over last 5 mins
        logging.info('Heartbeat: role[%s] node[%s] at IP %s:%s with ttl %s' % (pool, node, node, doc.get('port'), ttl))
        return self.services.heartbeat(doc)

    def bulk_heartbeat(self, ids):
        self.rethinker.table('services').get_all(*ids).update({ 'last_heartbeat': r.now(), 'load': os.getloadavg()[1] }).run()
        # send a non-bulk heartbeat for each id we *didn't* just update
        found = set(self.rethinker.table('services').get_all(*ids).get_field('id').run())
        for missing_id in set(ids) - found:
            pool, node, segment = missing_id.split(":")
            port = settings['WRITE_PORT'] if pool == 'trough-write' else settings['READ_PORT']
            url = 'http://%s:%s/?segment=%s' % (node, port, segment)
            self.heartbeat(pool=pool, node=node, segment=segment, port=port, url=url, ttl=round(settings['SYNC_LOOP_TIMING'] * 4))

    def assign(self, hostname, segment, remote_path):
        '''Queue an assignment of `segment` to `hostname`.'''
        logging.info("Assigning segment: %s to '%s'" % (segment.id, hostname))
        asmt = Assignment(self.rethinker, d={
            'node': hostname,
            'segment': segment.id,
            'assigned_on': doublethink.utcnow(),
            'remote_path': remote_path,
            'bytes': segment.size })
        logging.info('Adding "%s" to rethinkdb.' % (asmt))
        self.assignment_queue.enqueue(asmt)
        return asmt

    def unassign(self, assignment):
        self.unassignment_queue.enqueue(assignment)

    def commit_assignments(self):
        self.assignment_queue.commit()

    def commit_unassignments(self):
        self.unassignment_queue.commit()

    def segments_for_host(self, host):
        '''Segments `host` either holds a write lock for or is assigned.'''
        by_id = {}
        for lock in Lock.host_locks(self.rethinker, host):
            by_id[lock.segment] = Segment(segment_id=lock.segment, size=0, rethinker=self.rethinker, services=self.services, registry=self)
        for asmt in Assignment.host_assignments(self.rethinker, host):
            by_id[asmt.segment] = Segment(segment_id=asmt.segment, size=asmt.bytes, rethinker=self.rethinker, services=self.services, registry=self, remote_path=asmt.remote_path)
        logging.info('Checked for segments assigned to %s: Found %s segment(s)' % (host, len(by_id)))
        return list(by_id.values())

# Base class, not intended for use.
class SyncController(metaclass=abc.ABCMeta):
    '''
    Abstract base for the "server" (master) and "local" sync controllers;
    pulls its configuration from trough.settings.settings.

    bug fix: the original declared `__metaclass__ = abc.ABCMeta` inside the
    class body, which is the python 2 spelling and a no-op on python 3, so
    @abc.abstractmethod was never actually enforced.
    '''

    def __init__(self, rethinker=None, services=None, registry=None, hdfs_path=None):
        self.rethinker = rethinker
        self.services = services
        self.registry = registry
        self.leader = False

        self.hostname = settings['HOSTNAME']
        self.external_ip = settings['EXTERNAL_IP']
        # originally assigned twice (duplicate removed)
        self.rethinkdb_hosts = settings['RETHINKDB_HOSTS']

        self.hdfs_path = settings['HDFS_PATH']
        self.hdfs_host = settings['HDFS_HOST']
        self.hdfs_port = settings['HDFS_PORT']

        self.election_cycle = settings['ELECTION_CYCLE']
        self.sync_server_port = settings['SYNC_SERVER_PORT']
        self.sync_local_port = settings['SYNC_LOCAL_PORT']
        self.read_port = settings['READ_PORT']
        self.write_port = settings['WRITE_PORT']
        self.sync_loop_timing = settings['SYNC_LOOP_TIMING']

        self.host_check_wait_period = settings['HOST_CHECK_WAIT_PERIOD']

        self.local_data = settings['LOCAL_DATA']
        self.storage_in_bytes = settings['STORAGE_IN_BYTES']
    def start(self):
        pass
    def check_config(self):
        # subclasses validate their own required settings
        raise NotImplementedError
    def ls_r(self, hdfs, path):
        '''Depth-first recursive listing of an hdfs directory.'''
        for entry in hdfs.ls(path, detail=True):
            yield entry
            if entry['kind'] == 'directory':
                yield from self.ls_r(hdfs, entry['name'])
    def check_health(self):
        pass
    def get_segment_file_list(self):
        '''Generator of hdfs entries for *.sqlite files under self.hdfs_path.'''
        logging.info('Looking for *.sqlite in hdfs recursively under %s', self.hdfs_path)
        hdfs = HDFileSystem(host=self.hdfs_host, port=self.hdfs_port)
        return (entry for entry in self.ls_r(hdfs, self.hdfs_path)
                if entry['name'].endswith('.sqlite'))
    def list_schemas(self):
        '''All schema ids currently registered in rethinkdb.'''
        return list(self.rethinker.table(Schema.table)['id'].run())
    def get_schema(self, id):
        return Schema.load(self.rethinker, id)
    def set_schema(self, id, sql):
        '''Insert or update schema `id`; returns (schema, created).'''
        validate_schema_sql(sql)
        # create a document, insert/update it, overwriting document with id 'id'.
        created = False
        output = Schema.load(self.rethinker, id)
        if not output:
            output = Schema(self.rethinker, d={})
            created = True
        output.id = id
        output.sql = sql
        output.save()
        return (output, created)

    @abc.abstractmethod
    def delete_segment(self, segment_id):
        raise NotImplementedError
# NOTE(review): these are the methods of MasterSyncController(SyncController);
# the class header and __init__ sit just before this span and check_config
# straddles the boundary -- both are reproduced complete here.

def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.current_master = {}
    self.current_host_nodes = []

def check_config(self):
    '''Validate settings required for server mode; exits the process on failure.'''
    try:
        assert settings['HDFS_PATH'], "HDFS_PATH must be set, otherwise I don't know where to look for sqlite files."
        assert settings['HDFS_HOST'], "HDFS_HOST must be set, or I can't communicate with HDFS."
        assert settings['HDFS_PORT'], "HDFS_PORT must be set, or I can't communicate with HDFS."
        assert settings['ELECTION_CYCLE'] > 0, "ELECTION_CYCLE must be greater than zero. It governs the number of seconds in a sync master election period."
        assert settings['HOSTNAME'], "HOSTNAME must be set, or I can't figure out my own hostname."
        assert settings['EXTERNAL_IP'], "EXTERNAL_IP must be set. We need to know which IP to use."
        assert settings['SYNC_SERVER_PORT'], "SYNC_SERVER_PORT must be set. We need to know the output port."
        assert settings['RETHINKDB_HOSTS'], "RETHINKDB_HOSTS must be set. Where can I contact RethinkDB on port 29015?"
    except AssertionError as e:
        sys.exit("{} Exiting...".format(str(e)))

def hold_election(self):
    '''
    Compete for the unique 'trough-sync-master' service entry; returns True
    if this host is (still) the master, False otherwise, updating
    self.current_master either way.
    '''
    logging.debug(
        'Holding Sync Master Election (current master is %s)...',
        self.current_master.get('url'))
    candidate = {
        "id": "trough-sync-master",
        "node": self.hostname,
        "port": self.sync_server_port,
        "url": "http://%s:%s/" % (self.hostname, self.sync_server_port),
        "ttl": self.election_cycle + self.sync_loop_timing * 4,
    }
    sync_master = self.services.unique_service('trough-sync-master', candidate=candidate)
    if sync_master.get('node') == self.hostname:
        if self.current_master.get('node') != sync_master.get('node'):
            logging.info('I am the new master! url=%r ttl=%r', sync_master.get('url'), sync_master.get('ttl'))
        else:
            logging.debug('I am still the master. url=%r ttl=%r', sync_master.get('url'), sync_master.get('ttl'))
        self.current_master = sync_master
        return True
    else:
        logging.debug('I am not the master. The master is %r', sync_master.get('url'))
        self.current_master = sync_master
        return False

def delete_segment(self, segment_id):
    '''
    Looks up the segment's assignments and services to determine which
    trough worker nodes may hold the segment. Makes an http api call to
    each of these trough workers to have them delete their segments on disk
    and delete their service entries. Then deletes assignments from
    rethinkdb and finally deletes the files from hdfs.

    Raises:
        KeyError: if there are no assignments and no services for
            `segment_id`
        ClientError: if a write lock exists for the segment
    '''
    query = self.rethinker.table('lock').get('write:lock:%s' % segment_id)
    result = query.run()
    if result:
        # bug fix: the original passed `result` as a separate logging-style
        # argument to ClientError, leaving %r uninterpolated in the message
        raise ClientError(
            'cannot delete segment: write lock exists: %r' % (result,))

    # look up assigned worker nodes and service entry nodes; generally
    # these should be the same nodes but do everything to be thorough
    workers = set()

    assignments = list(
        Assignment.segment_assignments(self.rethinker, segment_id))
    for assignment in assignments:
        workers.add(assignment['node'])

    services = self.rethinker.table('services')\
        .get_all(segment_id, index='segment').run()
    for service in services:
        if service.get('role') == 'trough-write':
            # this service is cruft (we already know there is no write lock)
            query = self.rethinker.table('services')\
                .get(service['id']).delete()
            result = query.run()
            # ugh cannot log the query, some kind of bug
            # *** RuntimeError: generator raised StopIteration
            logging.warning(
                'deleted crufty trough-write service %r => %r',
                service['id'], result)
        workers.add(service['node'])

    if not workers:
        raise KeyError(
            'no assignments or services found for segment id '
            '%r' % segment_id)

    # ask workers to do their part (TODO could do these calls in parallel)
    for worker in workers:
        url = 'http://%s:%s/segment/%s' % (
            worker, self.sync_local_port, segment_id)
        response = requests.delete(url, timeout=120)
        if response.status_code >= 500:
            response.raise_for_status()
        logging.info('worker: %s DELETE %s', response.status_code, url)

    # delete assignments
    query = self.rethinker.table('assignment')\
        .get_all(segment_id, index='segment').delete()
    result = query.run()
    logging.info(
        'rethinkdb result of deleting %s assignment: %s',
        segment_id, result)

    # delete files from hdfs
    hdfs_paths = [a['remote_path'] for a in assignments if a.get('remote_path')]
    if hdfs_paths:
        hdfs_cli = client.Client(settings['HDFS_HOST'], settings['HDFS_PORT'])
        result = list(hdfs_cli.delete(hdfs_paths))
        logging.info('%s', result)

def assign_segments(self):
    '''
    Distribute every hdfs segment across the worker pool using consistent
    hash rings (one ring per desired copy), queueing new/changed
    assignments and unassignments in the registry and committing them at
    the end. Returns False if this host loses master status mid-cycle.
    '''
    logging.debug('Assigning and balancing segments...')
    max_copies = settings['MAXIMUM_ASSIGNMENTS']
    if self.hold_election():
        last_heartbeat = datetime.datetime.now()
    else:
        return False

    # get segment list
    # output is like ({ "path": "/a/b/c/segmentA.sqlite" }, { "path": "/a/b/c/segmentB.sqlite" })
    segment_files = self.get_segment_file_list()
    # output is like [Segment("segmentA"), Segment("segmentB")]
    segments = []
    for file in segment_files:
        segment = Segment(
            segment_id=file['name'].split('/')[-1].replace('.sqlite', ''),
            size=file['size'],
            remote_path=file['name'],
            rethinker=self.rethinker,
            services=self.services,
            registry=self.registry)
        segments.append(segment)  # TODO: fix this per comment above.
    logging.info('assigning and balancing %r segments', len(segments))

    # host_ring_mapping will be e.g. { 'host1': { 'ring': 0, 'weight': 188921 }, ... }
    # the keys are node names, the values are array indices for the hash_rings variable (below)
    host_ring_mapping = Assignment.load(self.rethinker, "ring-assignments")
    if not host_ring_mapping:
        host_ring_mapping = Assignment(self.rethinker, {})
        host_ring_mapping.id = "ring-assignments"

    host_weights = {host['node']: self.registry.total_bytes_for_node(host['node'])
                    for host in self.registry.get_hosts()}

    # instantiate N hash rings where N is the lesser of (the maximum number
    # of copies of any segment) and (the number of currently available hosts)
    hash_rings = []
    for i in range(min(max_copies, len(host_weights))):
        ring = HashRing()
        ring.id = i
        hash_rings.append(ring)

    # prune hosts that don't exist anymore
    for host in [key for key in host_ring_mapping.keys() if key not in host_weights and key != 'id']:
        logging.info('pruning worker %r from pool (worker went offline?) [was: hash ring %s]', host, host_ring_mapping[host])
        del(host_ring_mapping[host])

    # assign each host to one hash ring. Save the assignment in rethink so
    # it's reproducible. weight each host assigned to a hash ring with its
    # total assignable bytes quota
    for hostname in [key for key in host_ring_mapping.keys() if key != 'id']:
        host = host_ring_mapping[hostname]
        hash_rings[host['ring']].add_node(hostname, { 'weight': host['weight'] })
        logging.info("Host '%s' assigned to ring %s" % (hostname, host['ring']))

    new_hosts = [host for host in host_weights if host not in host_ring_mapping]
    for host in new_hosts:
        weight = host_weights[host]
        host_ring = sorted(hash_rings, key=lambda ring: len(ring.get_nodes()))[0].id  # TODO: this should be sorted by bytes, not raw # of nodes
        host_ring_mapping[host] = { 'weight': weight, 'ring': host_ring }
        hash_rings[host_ring].add_node(host, { 'weight': weight })
        logging.info("new trough worker %r assigned to ring %r", host, host_ring)

    host_ring_mapping.save()

    # 'ring_assignments' will be like { "0-192811": Assignment(), "1-192811": Assignment()... }
    ring_assignments = {}
    cold_assignments = {}
    for assignment in Assignment.all(self.rethinker):
        if assignment.hash_ring == 'cold':
            dict_key = "%s-%s" % (assignment.node, assignment.segment)
            cold_assignments[dict_key] = assignment
        elif assignment.id != 'ring-assignments':
            dict_key = "%s-%s" % (assignment.hash_ring, assignment.segment)
            ring_assignments[dict_key] = assignment

    changed_assignments = 0
    i = 0
    for segment in segments:
        i += 1
        if i % 10000 == 0:
            logging.info(
                'processed assignments for %s of %s segments so far',
                i, len(segments))
        # if it's been over 80% of an election cycle since the last
        # heartbeat, hold an election so we don't lose master status
        if datetime.datetime.now() - datetime.timedelta(seconds=0.8 * self.election_cycle) > last_heartbeat:
            if self.hold_election():
                last_heartbeat = datetime.datetime.now()
            else:
                return False
        logging.debug("Assigning segment [%s]", segment.id)
        if segment.cold_store():
            # assign segment, so we can advertise the service
            for cold_host in self.registry.get_cold_hosts():
                if not cold_assignments.get("%s-%s" % (cold_host['node'], segment.id)):
                    logging.info("Segment [%s] will be assigned to cold storage tier host [%s]", segment.id, cold_host['node'])
                    changed_assignments += 1
                    self.registry.assignment_queue.enqueue(Assignment(self.rethinker, d={
                        'node': cold_host['node'],
                        'segment': segment.id,
                        'assigned_on': doublethink.utcnow(),
                        'remote_path': segment.remote_path,
                        'bytes': segment.size,
                        'hash_ring': "cold" }))
            for ring in hash_rings:
                warm_dict_key = '%s-%s' % (ring.id, segment.id)
                if warm_dict_key in ring_assignments:
                    logging.info('removing warm assignnment %s because segment %s is cold', ring_assignments[warm_dict_key], segment.id)
                    self.registry.unassign(ring_assignments[warm_dict_key])
            continue
        # find position of segment in N hash rings, where N is the minimum
        # number of assignments for this segment
        random.seed(segment.id)  # (seed random so we always get the same sample of hash rings for this item)
        assigned_rings = random.sample(hash_rings, segment.minimum_assignments())
        logging.debug("Segment [%s] will use rings %s", segment.id, [s.id for s in assigned_rings])
        for ring in assigned_rings:
            # get the node for the key from hash ring, updating or creating
            # assignments from corresponding entry in 'ring_assignments' as necessary
            assigned_node = ring.get_node(segment.id)
            dict_key = "%s-%s" % (ring.id, segment.id)
            assignment = ring_assignments.get(dict_key)
            logging.debug("Current assignment: '%s' New assignment: '%s'", assignment.node if assignment else None, assigned_node)
            if assignment is None or assignment.node != assigned_node:
                changed_assignments += 1
                logging.info("Segment [%s] will be assigned to host '%s' for ring [%s]", segment.id, assigned_node, ring.id)
                if assignment:
                    logging.info("Removing old assignment to node '%s' for segment [%s]: (%s will be deleted)", assignment.node, segment.id, assignment)
                    self.registry.unassign(assignment)
                    del ring_assignments[dict_key]
                ring_assignments[dict_key] = ring_assignments.get(dict_key, Assignment(self.rethinker, d={
                    'hash_ring': ring.id,
                    'node': assigned_node,
                    'segment': segment.id,
                    'assigned_on': doublethink.utcnow(),
                    'remote_path': segment.remote_path,
                    'bytes': segment.size }))
                ring_assignments[dict_key]['node'] = assigned_node
                ring_assignments[dict_key]['id'] = "%s:%s" % (ring_assignments[dict_key]['node'], ring_assignments[dict_key]['segment'])
                self.registry.assignment_queue.enqueue(ring_assignments[dict_key])
    logging.info("%s assignments changed during this sync cycle.", changed_assignments)
    # commit assignments that were created or updated
    self.registry.commit_unassignments()
    self.registry.commit_assignments()

def sync(self):
    '''
    "server" mode:
    - if I am not the leader, poll forever
    - if there are no hosts to assign to, poll forever.
    - for entire list of segments that match pattern in REMOTE_DATA setting:
        - check rethinkdb to make sure each item is assigned to a worker
        - if it is not assigned:
            - assign it using consistent hash rings, based on the available quota on each worker
    '''
    if self.hold_election():
        new_host_nodes = sorted([host.get('node') for host in self.registry.get_hosts(exclude_cold=False)])
        if new_host_nodes != self.current_host_nodes:
            logging.info('pool of trough workers changed size from %r to %r (old=%r new=%r)', len(self.current_host_nodes), len(new_host_nodes), self.current_host_nodes, new_host_nodes)
            self.current_host_nodes = new_host_nodes
        if new_host_nodes:
            self.assign_segments()
        else:
            logging.info('not assigning segments because there are no trough workers!')

def provision_writable_segment(self, segment_id, schema_id='default'):
    '''
    Find (or choose) the node that should hold the writable copy of
    `segment_id` and ask its local sync server to provision it. Returns the
    parsed json response from the worker.

    Raises:
        ClientError: if the segment id falls in the cold storage range
        Exception: if no healthy node exists or the worker responds non-200
    '''
    # the query below implements this algorithm:
    # - look up a write lock for the passed-in segment
    # - if the write lock exists
    #    - return it
    # - else
    #    - look up the set of readable copies of the segment
    #    - if readable copies exist
    #       - return the one with the lowest load
    #    - else (this is a new segment)
    #       - return the node (service entry with role trough-node) with lowest load
    #
    # the result is either:
    # - a 'lock' table entry table in case there is already a write lock
    #   for this segment
    # - a 'services' table entry in with role 'trough-read' in case the
    #   segment exists but is not under write
    # - a 'services' table entry with role 'trough-nodes' in case this is a
    #   new segment, in which case this is the node where we will provision
    #   the new segment
    if Segment(segment_id, -1, None, None, None).cold_store():
        raise ClientError(
            'cannot provision segment %s for writing because that '
            'segment id is in the read-only cold storage '
            'range' % segment_id)

    assignment = self.rethinker.table('lock')\
        .get('write:lock:%s' % segment_id)\
        .default(r.table('services')\
            .get_all(segment_id, index='segment')\
            .filter({'role':'trough-read'})\
            .filter(lambda svc: r.now().sub(svc["last_heartbeat"]).lt(svc["ttl"]))\
            .order_by('load')[0].default(
                r.table('services')\
                    .get_all('trough-nodes', index='role')\
                    .filter(r.row['cold_storage'].default(False).not_())\
                    .filter(lambda svc: r.now().sub(svc["last_heartbeat"]).lt(svc["ttl"]))\
                    .order_by('load')[0].default(None)
            )
        ).run()

    if not assignment:
        raise Exception('No healthy node to assign to')
    post_url = 'http://%s:%s/provision' % (assignment['node'], self.sync_local_port)
    json_data = {'segment': segment_id, 'schema': schema_id}
    try:
        response = requests.post(post_url, json=json_data)
    except Exception as e:
        logging.error("Error while provisioning segment '%s'! This segment may have been provisioned without a schema! Exception was: %s", segment_id, e)
        # bug fix: the original fell through here and raised NameError on
        # the unbound `response`, masking the real error
        raise
    if response.status_code != 200:
        raise Exception('Received a %s response while provisioning segment "%s" on node %s:\n%r\nwhile posting %r to %r' % (response.status_code, segment_id, assignment['node'], response.text, ujson.dumps(json_data), post_url))
    result_dict = ujson.loads(response.text)
    return result_dict

def promote_writable_segment_upstream(self, segment_id):
    '''
    Ask the node holding the write lock for `segment_id` to push the
    segment up to hdfs; returns the parsed json response from the worker.

    Raises:
        Exception: if no write lock exists or the worker responds non-200
    '''
    # this function calls the downstream server that holds the write lock
    # if a lock exists, insert a flag representing the promotion into it, otherwise raise exception

    # forward the request downstream to actually perform the promotion
    write_lock = self.rethinker.table('lock').get('write:lock:%s' % segment_id).run()
    if not write_lock:
        raise Exception("Segment %s is not currently writable" % segment_id)
    post_url = 'http://%s:%s/promote' % (write_lock['node'], self.sync_local_port)
    json_data = {'segment': segment_id}
    logging.info('posting %s to %s', json.dumps(json_data), post_url)
    try:
        response = requests.post(post_url, json=json_data)
    except Exception as e:
        logging.error("Error while promoting segment '%s' to HDFS! Exception was: %s", segment_id, e)
        # bug fix: the original fell through here and raised NameError on
        # the unbound `response`, masking the real error
        raise
    if response.status_code != 200:
        raise Exception('Received a %s response while promoting segment "%s" to HDFS:\n%r\nwhile posting %r to %r' % (response.status_code, segment_id, response.text, ujson.dumps(json_data), post_url))
    response_dict = ujson.loads(response.content)
    if not 'remote_path' in response_dict:
        logging.warning('response json from downstream does not have remote_path?? %r', response_dict)
    return response_dict
# Local mode synchronizer.
class LocalSyncController(SyncController):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hostname = settings['HOSTNAME']
        # service id templates, e.g. ('trough-read:host1:%s' % segment_id)
        self.read_id_tmpl = 'trough-read:%s:%%s' % self.hostname
        self.write_id_tmpl = 'trough-write:%s:%%s' % self.hostname
        # service ids currently believed healthy; heartbeated in bulk
        self.healthy_service_ids = set()
        self.heartbeat_thread = threading.Thread(
                target=self.heartbeat_periodically_forever, daemon=True)

    def start(self):
        init_worker()
        self.heartbeat_thread.start()

    def heartbeat_periodically_forever(self):
        '''
        Daemon thread loop: bulk-heartbeat the healthy services roughly every
        self.sync_loop_timing seconds, logging (never raising) on failure.
        '''
        while True:
            start = time.time()
            try:
                healthy_service_ids = self.periodic_heartbeat()
                elapsed = time.time() - start
                logging.info('heartbeated %s segments in %0.2f sec', len(healthy_service_ids), elapsed)
            except Exception:
                # narrowed from a bare `except:` which would also swallow
                # SystemExit/KeyboardInterrupt; best-effort, so log and retry
                elapsed = time.time() - start
                logging.error('problem sending heartbeat', exc_info=True)
            time.sleep(max((self.sync_loop_timing - elapsed), 0))

    def periodic_heartbeat(self):
        self.heartbeat()
        # make a copy for thread safety
        healthy_service_ids = list(self.healthy_service_ids)
        self.registry.bulk_heartbeat(healthy_service_ids)
        return healthy_service_ids

    def check_config(self):
        '''Exit with a message if any required setting is missing.'''
        # explicit checks rather than `assert`, which is stripped under -O
        required = [
            ('HOSTNAME', "HOSTNAME must be set, or I can't figure out my own hostname."),
            ('EXTERNAL_IP', "EXTERNAL_IP must be set. We need to know which IP to use."),
            ('READ_PORT', "READ_PORT must be set. We need to know the output port."),
            ('RETHINKDB_HOSTS', "RETHINKDB_HOSTS must be set. Where can I contact RethinkDB on port 29015?"),
        ]
        for key, message in required:
            if not settings[key]:
                sys.exit("{} Exiting...".format(message))

    def check_health(self):
        assert self.heartbeat_thread.is_alive()

    def copy_segment_from_hdfs(self, segment):
        '''
        Copies `segment` from its hdfs remote_path into a temp dir, then
        renames it into place at segment.local_path(). Raises on copy error.
        '''
        logging.debug('copying segment %r from HDFS path %r...', segment.id, segment.remote_path)
        assert segment.remote_path
        source = [segment.remote_path]
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_dest = os.path.join(tmpdir, "%s.sqlite" % segment.id)
            logging.debug('running snakebite.Client.copyToLocal(%r, %r)', source, tmp_dest)
            snakebite_client = client.Client(settings['HDFS_HOST'], settings['HDFS_PORT'])
            for f in snakebite_client.copyToLocal(source, tmp_dest):
                if f.get('error'):
                    raise Exception('Copying HDFS file %r to %r produced an error: %r' % (source, tmp_dest, f['error']))
            logging.debug('copying from hdfs succeeded, moving %s to %s', tmp_dest, segment.local_path())
            # clobbers segment.local_path if it already exists, which is what we want
            os.rename(tmp_dest, segment.local_path())
        return True

    def heartbeat(self):
        logging.warning('Updating health check for "%s".', self.hostname)
        # reset the countdown
        self.registry.heartbeat(pool='trough-nodes',
                                node=self.hostname,
                                ttl=round(self.sync_loop_timing * 4),
                                available_bytes=self.storage_in_bytes,
                                cold_storage=settings['RUN_AS_COLD_STORAGE_NODE'],
                                )

    def decommission_writable_segment(self, segment, write_lock):
        logging.info('De-commissioning a writable segment: %s', segment.id)
        write_lock.release()
        writable_copy = segment.writable_copy()
        if writable_copy:
            self.services.unregister(writable_copy.get('id'))

    def delete_segment(self, segment_id):
        '''
        Deletes the service registry entry for the given segment_id on this
        worker node, and deletes the .sqlite file from local disk. Called by
        upstream segment manager server. See
        `MasterSyncController.delete_segment()`

        Raises:
            KeyError: if there is no trough-read service for this node for
                segment_id and the file does not exist locally
            ClientError: if a write lock exists for the segment
        '''
        query = self.rethinker.table('lock').get('write:lock:%s' % segment_id)
        result = query.run()
        if result:
            # %-format the message; the original passed `result` as a bare
            # exception arg so the %r was never interpolated
            raise ClientError(
                    'cannot delete segment: write lock exists: %r' % result)

        svc_id = 'trough-read:%s:%s' % (self.hostname, segment_id)
        query = self.rethinker.table('services').get(svc_id).delete()
        result = query.run()
        # ugh cannot log the query, some kind of bug
        # *** RuntimeError: generator raised StopIteration
        logging.info(
                '%s.delete() %s', self.rethinker.table('services').get(svc_id),
                result)
        deleted_service = bool(result.get('deleted'))

        deleted_file = False
        if not settings['RUN_AS_COLD_STORAGE_NODE']:
            try:
                path = os.path.join(
                        settings['LOCAL_DATA'], '%s.sqlite' % segment_id)
                os.unlink(path)
                deleted_file = True
            except FileNotFoundError:
                deleted_file = False

        if not deleted_file and not deleted_service:
            raise KeyError

    def segment_id_from_path(self, path):
        return path.split("/")[-1].replace('.sqlite', '')

    def discard_warm_stuff(self):
        '''
        Make sure cold storage nodes don't hold on to any warm segment
        assignments or write locks, and are absent from the host ring
        assignment.
        '''
        if not settings['RUN_AS_COLD_STORAGE_NODE']:
            return

        query = self.rethinker.table(Assignment.table)\
                .between('%s:\x01' % self.hostname,
                         '%s:\x7f' % self.hostname,
                         right_bound="closed")\
                .filter(r.row['hash_ring'].default('').ne('cold'))\
                .delete()
        result = query.run()
        logging.info(
                'deleted warm segment assignments: %s returned %s',
                query, result)

        query = self.rethinker.table(Lock.table)\
                .get_all(self.hostname, index="node").delete()
        result = query.run()
        logging.info(
                'deleted warm segment write locks: %s returned %s',
                query, result)

        query = self.rethinker.table(Assignment.table).get('ring-assignments')\
                .replace(r.row.without(self.hostname))
        result = query.run()
        logging.info(
                'deleted %s from ring assignments: %s returned %s',
                self.hostname, query, result)
885 | ''' 886 | if not settings['RUN_AS_COLD_STORAGE_NODE']: 887 | return 888 | 889 | query = self.rethinker.table(Assignment.table)\ 890 | .between('%s:\x01' % self.hostname, 891 | '%s:\x7f' % self.hostname, 892 | right_bound="closed")\ 893 | .filter(r.row['hash_ring'].default('').ne('cold'))\ 894 | .delete() 895 | result = query.run() 896 | logging.info( 897 | 'deleted warm segment assignments: %s returned %s', 898 | query, result) 899 | 900 | query = self.rethinker.table(Lock.table)\ 901 | .get_all(self.hostname, index="node").delete() 902 | result = query.run() 903 | logging.info( 904 | 'deleted warm segment write locks: %s returned %s', 905 | query, result) 906 | 907 | query = self.rethinker.table(Assignment.table).get('ring-assignments')\ 908 | .replace(r.row.without(self.hostname)) 909 | result = query.run() 910 | logging.info( 911 | 'deleted %s from ring assignments: %s returned %s', 912 | self.hostname, query, result) 913 | 914 | def process_stale_segment(self, segment, local_mtime=None, remote_mtime=None): 915 | logging.info('processing stale segment id: %s', segment.id) 916 | if not segment or not segment.remote_path: 917 | # There is a newer copy in hdfs but we are not assigned to 918 | # serve it. Do not copy down the new segment and do not release 919 | # the write lock. One of the assigned nodes will release the 920 | # write lock after copying it down, ensuring there is no period 921 | # of time when no one is serving the segment. 
922 | logging.info('segment %s appears to be assigned to another machine', segment.id) 923 | return 924 | if local_mtime: 925 | logging.info('replacing segment %r local copy (mtime=%s) from hdfs (mtime=%s)', 926 | segment.id, datetime.datetime.fromtimestamp(local_mtime), 927 | datetime.datetime.fromtimestamp(remote_mtime)) 928 | else: 929 | logging.info('copying new segment %r from hdfs', segment.id) 930 | try: 931 | self.copy_segment_from_hdfs(segment) 932 | except Exception as e: 933 | logging.error('Error during HDFS copy of segment %r', segment.id, exc_info=True) 934 | return 935 | self.healthy_service_ids.add(self.read_id_tmpl % segment.id) 936 | write_lock = segment.retrieve_write_lock() 937 | if write_lock: 938 | logging.info("Segment %s has a writable copy. It will be decommissioned in favor of the newer read-only copy from HDFS.", segment.id) 939 | self.decommission_writable_segment(segment, write_lock) 940 | 941 | def sync(self): 942 | ''' 943 | assignments = list of segments assigned to this node 944 | local_segments = list of segments on local disk 945 | remote_segments = list of segments in hdfs 946 | segments_of_interest = set(assignments + local_segments) 947 | write_locks = list of locks assigned to this node 948 | 949 | for segment in self.healthy_service_ids: 950 | if segment not in segments_of_interest: 951 | discard write id from self.healthy_service_ids 952 | discard read id from self.healthy_service_ids 953 | 954 | for segment in segments_of_interest: 955 | if segment exists locally and is newer than hdfs: 956 | add read id to self.healthy_service_ids 957 | if segment in write_locks: 958 | add write id to self.healthy_service_ids 959 | else: # segment does not exist locally or is older than hdfs: 960 | discard write id from self.healthy_service_ids 961 | discard read id from self.healthy_service_ids 962 | add to stale queue 963 | 964 | for segment in stale_queue: 965 | copy down from hdfs 966 | add read id to self.healthy_service_ids 967 | 
delete write lock from rethinkdb 968 | ''' 969 | start = time.time() 970 | logging.info('sync starting') 971 | if settings['RUN_AS_COLD_STORAGE_NODE']: 972 | self.discard_warm_stuff() 973 | 974 | # { segment_id: Segment } 975 | my_segments = { segment.id: segment for segment in self.registry.segments_for_host(self.hostname) } 976 | 977 | if settings['RUN_AS_COLD_STORAGE_NODE']: 978 | for segment_id in my_segments: 979 | self.healthy_service_ids.add(self.read_id_tmpl % segment_id) 980 | return 981 | 982 | remote_mtimes = {} # { segment_id: mtime (long) } 983 | try: 984 | # iterator of dicts that look like this 985 | # {'last_mod': 1509406266, 'replication': 0, 'block_size': 0, 'name': '//tmp', 'group': 'supergroup', 'last_access': 0, 'owner': 'hdfs', 'kind': 'directory', 'permissions': 1023, 'encryption_info': None, 'size': 0} 986 | remote_listing = self.get_segment_file_list() 987 | for file in remote_listing: 988 | segment_id = self.segment_id_from_path(file['name']) 989 | remote_mtimes[segment_id] = file['last_mod'] 990 | hdfs_up = True 991 | except Exception as e: 992 | logging.error('Error while listing files from HDFS', exc_info=True) 993 | logging.warning('PROCEEDING WITHOUT DATA FROM HDFS') 994 | hdfs_up = False 995 | logging.info('found %r segments in hdfs', len(remote_mtimes)) 996 | # list of filenames 997 | local_listing = os.listdir(self.local_data) 998 | # { segment_id: mtime } 999 | local_mtimes = {} 1000 | for path in local_listing: 1001 | try: 1002 | local_mtimes[self.segment_id_from_path(path)] = os.stat(os.path.join(self.local_data, path)).st_mtime 1003 | except: 1004 | logging.warning('%r gone since listing directory', path) 1005 | logging.info('found %r segments on local disk', len(local_mtimes)) 1006 | # { segment_id: Lock } 1007 | write_locks = { lock.segment: lock for lock in Lock.host_locks(self.rethinker, self.hostname) } 1008 | writable_segments_found = len([1 for lock in write_locks if local_mtimes.get(lock)]) 1009 | logging.info('found %r 
writable segments on-disk and %r write locks in RethinkDB for host %r', writable_segments_found, len(write_locks), self.hostname) 1010 | # list of segment id 1011 | stale_queue = [] 1012 | 1013 | segments_of_interest = set() 1014 | segments_of_interest.update(my_segments.keys()) 1015 | segments_of_interest.update(local_mtimes.keys()) 1016 | 1017 | count = 0 1018 | for service_id in list(self.healthy_service_ids): 1019 | segment_id = service_id.split(':')[-1] 1020 | if segment_id not in segments_of_interest: 1021 | self.healthy_service_ids.discard(service_id) 1022 | logging.debug('discarded %r from healthy service ids because segment %r is gone from host %r', service_id, segment_id, self.hostname) 1023 | count += 1 1024 | logging.info('%r healthy service ids discarded on %r since last sync', count, self.hostname) 1025 | 1026 | for segment_id in segments_of_interest: 1027 | if segment_id in local_mtimes and local_mtimes[segment_id] >= remote_mtimes.get(segment_id, 0): 1028 | if (self.read_id_tmpl % segment_id) not in self.healthy_service_ids: 1029 | logging.debug('adding %r to healthy segment list', (self.read_id_tmpl % segment_id)) 1030 | self.healthy_service_ids.add(self.read_id_tmpl % segment_id) 1031 | if segment_id in write_locks: 1032 | if (self.write_id_tmpl % segment_id) not in self.healthy_service_ids: 1033 | logging.debug('adding %r to healthy segment list', (self.write_id_tmpl % segment_id)) 1034 | self.healthy_service_ids.add(self.write_id_tmpl % segment_id) 1035 | else: # segment does not exist locally or is older than hdfs 1036 | self.healthy_service_ids.discard(self.read_id_tmpl % segment_id) 1037 | self.healthy_service_ids.discard(self.write_id_tmpl % segment_id) 1038 | stale_queue.append(segment_id) 1039 | 1040 | if not hdfs_up: 1041 | return 1042 | 1043 | with futures.ThreadPoolExecutor(max_workers=settings['COPY_THREAD_POOL_SIZE']) as pool: 1044 | for segment_id in sorted(stale_queue, reverse=True): 1045 | # essentially does this call with a thread 
pool: 1046 | # process_stale_segment(my_segments.get(segment_id), local_mtimes.get(segment_id)) 1047 | pool.submit(self.process_stale_segment, my_segments.get(segment_id), local_mtimes.get(segment_id), remote_mtimes.get(segment_id)) 1048 | 1049 | def provision_writable_segment(self, segment_id, schema_id='default'): 1050 | if settings['RUN_AS_COLD_STORAGE_NODE']: 1051 | raise ClientError( 1052 | 'cannot provision segment %s for writing because this ' 1053 | 'trough worker %s is designated as cold storage' % ( 1054 | segment_id, self.host)) 1055 | 1056 | # instantiate the segment 1057 | segment = Segment(segment_id=segment_id, 1058 | rethinker=self.rethinker, 1059 | services=self.services, 1060 | registry=self.registry, 1061 | size=0) 1062 | 1063 | if segment.cold_store(): 1064 | raise ClientError( 1065 | 'cannot provision segment %s for writing because that ' 1066 | 'segment id is in the read-only cold storage ' 1067 | 'range' % segment_id) 1068 | 1069 | # get the current write lock if any # TODO: collapse the below into one query 1070 | lock_data = segment.retrieve_write_lock() 1071 | if lock_data: 1072 | logging.info('retrieved existing write lock for segment %r', segment_id) 1073 | else: 1074 | lock_data = segment.new_write_lock() 1075 | logging.info('acquired new write lock for segment %r', segment_id) 1076 | 1077 | # TODO: spawn a thread for these? 
1078 | logging.info('heartbeating write service for segment %r', segment_id) 1079 | trough_write_status = self.registry.heartbeat(pool='trough-write', 1080 | segment=segment_id, 1081 | node=self.hostname, 1082 | port=self.write_port, 1083 | url='http://%s:%s/?segment=%s' % (self.hostname, self.write_port, segment_id), 1084 | ttl=round(self.sync_loop_timing * 4)) 1085 | 1086 | logging.info('heartbeating read service for segment %r', segment_id) 1087 | self.registry.heartbeat(pool='trough-read', 1088 | segment=segment_id, 1089 | node=self.hostname, 1090 | port=self.read_port, 1091 | url='http://%s:%s/?segment=%s' % (self.hostname, self.read_port, segment_id), 1092 | ttl=round(self.sync_loop_timing * 4)) 1093 | 1094 | # ensure that the file exists on the filesystem 1095 | if not segment.local_segment_exists(): 1096 | # execute the provisioning sql file against the sqlite segment 1097 | schema = self.get_schema(schema_id) 1098 | if not schema: 1099 | raise Exception('no such schema id=%r' % schema_id) 1100 | logging.info('provisioning local segment %r', segment_id) 1101 | segment.provision_local_segment(schema.sql) 1102 | 1103 | result_dict = { 1104 | 'write_url': trough_write_status['url'], 1105 | 'result': "success", 1106 | 'size': os.path.getsize(segment.local_path()), 1107 | 'schema': schema_id, 1108 | } 1109 | logging.info('finished provisioning writable segment %r', result_dict) 1110 | return result_dict 1111 | 1112 | def do_segment_promotion(self, segment): 1113 | import sqlitebck 1114 | hdfs = HDFileSystem(host=self.hdfs_host, port=self.hdfs_port) 1115 | with tempfile.NamedTemporaryFile() as temp_file: 1116 | # "online backup" see https://www.sqlite.org/backup.html 1117 | logging.info( 1118 | 'backing up %s to %s', segment.local_path(), 1119 | temp_file.name) 1120 | source = sqlite3.connect(segment.local_path()) 1121 | dest = sqlite3.connect(temp_file.name) 1122 | sqlitebck.copy(source, dest) 1123 | source.close() 1124 | dest.close() 1125 | logging.info( 1126 | 
'uploading %s to hdfs %s', temp_file.name, 1127 | segment.remote_path) 1128 | hdfs.mkdir(os.path.dirname(segment.remote_path)) 1129 | # java hdfs convention, upload to foo._COPYING_ 1130 | tmp_name = '%s._COPYING_' % segment.remote_path 1131 | hdfs.put(temp_file.name, tmp_name) 1132 | 1133 | # update mtime of local segment so that sync local doesn't think the 1134 | # segment we just pushed to hdfs is newer (if it did, it would pull it 1135 | # down and decommission its writable copy) 1136 | # see https://webarchive.jira.com/browse/ARI-5713?focusedCommentId=110920#comment-110920 1137 | os.utime(segment.local_path(), times=(time.time(), time.time())) 1138 | 1139 | # move existing out of the way if necessary (else mv fails) 1140 | if hdfs.exists(segment.remote_path): 1141 | hdfs.rm(segment.remote_path) 1142 | 1143 | # now move into place (does not update mtime) 1144 | # returns False (does not raise exception) on failure 1145 | result = hdfs.mv(tmp_name, segment.remote_path) 1146 | assert result is True 1147 | 1148 | logging.info('Promoted writable segment %s upstream to %s', segment.id, segment.remote_path) 1149 | 1150 | def promote_writable_segment_upstream(self, segment_id): 1151 | # load write lock, check segment is writable and not under promotion 1152 | # update write lock to mark segment as being under promotion 1153 | # get hdfs path from rethinkdb, use default if not set 1154 | # push segment to hdfs 1155 | # unset under_promotion flag 1156 | # return response with hdfs path 1157 | query = self.rethinker.table('lock')\ 1158 | .get('write:lock:%s' % segment_id)\ 1159 | .update({'under_promotion': True}, return_changes=True) 1160 | result = query.run() 1161 | try: 1162 | write_lock = result['changes'][0]['new_val'] 1163 | assert write_lock['node'] == self.hostname 1164 | except: 1165 | if result['unchanged'] > 0: 1166 | raise Exception("Segment %s is currently being copied upstream (write lock flag 'under_promotion' is set)" % segment_id) 1167 | if 
result['skipped'] > 0: 1168 | raise Exception("Segment %s is not currently writable" % segment_id) 1169 | raise Exception("Unexpected result %r from rethinkdb query %r" % (result, query)) 1170 | 1171 | try: 1172 | try: 1173 | assignment = self.rethinker.table('assignment').get_all(segment_id, index='segment')[0].run() 1174 | remote_path = assignment['remote_path'] 1175 | except r.errors.ReqlNonExistenceError: 1176 | remote_path = os.path.join(self.hdfs_path, segment_id[:-3], '%s.sqlite' % segment_id) 1177 | 1178 | segment = Segment( 1179 | segment_id, size=-1, rethinker=self.rethinker, 1180 | services=self.services, registry=self.registry, 1181 | remote_path=remote_path) 1182 | 1183 | self.do_segment_promotion(segment) 1184 | finally: 1185 | self.rethinker.table('lock')\ 1186 | .get('write:lock:%s' % segment_id)\ 1187 | .update({'under_promotion': False}).run() 1188 | return {'remote_path': remote_path} 1189 | 1190 | def collect_garbage(self): 1191 | # for each segment file on local disk 1192 | # - segment assigned to me should not be gc'd 1193 | # - segment not assigned to me with healthy service count <= minimum 1194 | # should not be gc'd 1195 | # - segment not assigned to me with healthy service count == minimum 1196 | # and no local healthy service entry should be gc'd 1197 | # - segment not assigned to me with healthy service count > minimum 1198 | # and has local healthy service entry should be gc'd 1199 | if settings['RUN_AS_COLD_STORAGE_NODE']: 1200 | return 1201 | 1202 | assignments = set(item.id for item in self.registry.segments_for_host(self.hostname)) 1203 | for filename in os.listdir(self.local_data): 1204 | if not filename.endswith('.sqlite'): 1205 | continue 1206 | segment_id = filename[:-7] 1207 | local_service_id = 'trough-read:%s:%s' % (self.hostname, segment_id) 1208 | if segment_id not in assignments: 1209 | segment = Segment(segment_id, 0, self.rethinker, self.services, self.registry) 1210 | healthy_service_ids = {service['id'] for service in 
segment.readable_copies()} 1211 | if local_service_id in healthy_service_ids: 1212 | healthy_service_ids.remove(local_service_id) 1213 | # re-check that the lock is not held by this machine before removing service 1214 | rechecked_lock = self.rethinker.table('lock').get(segment.id).run() 1215 | if len(healthy_service_ids) >= segment.minimum_assignments() \ 1216 | and (rechecked_lock is None or rechecked_lock['node'] != self.hostname): 1217 | logging.info( 1218 | 'segment %s has %s readable copies (minimum is %s) ' 1219 | 'and is not assigned to %s, removing %s from the ' 1220 | 'service registry', 1221 | segment_id, len(healthy_service_ids), 1222 | segment.minimum_assignments(), self.hostname, 1223 | local_service_id) 1224 | self.rethinker.table('services').get(local_service_id).delete().run() 1225 | # re-check that the lock is not held by this machine before removing segment file 1226 | rechecked_lock = self.rethinker.table('lock').get(segment.id) 1227 | if len(healthy_service_ids) >= segment.minimum_assignments() \ 1228 | and (rechecked_lock is None or rechecked_lock['node'] != self.hostname): 1229 | path = os.path.join(self.local_data, filename) 1230 | logging.info( 1231 | 'segment %s now has %s readable copies (minimum ' 1232 | 'is %s) and is not assigned to %s, deleting %s', 1233 | segment_id, len(healthy_service_ids), 1234 | segment.minimum_assignments(), self.hostname, 1235 | path) 1236 | os.remove(path) 1237 | 1238 | def get_controller(server_mode): 1239 | logging.info('Connecting to Rethinkdb on: %s' % settings['RETHINKDB_HOSTS']) 1240 | rethinker = doublethink.Rethinker(db="trough_configuration", servers=settings['RETHINKDB_HOSTS']) 1241 | services = doublethink.ServiceRegistry(rethinker) 1242 | registry = HostRegistry(rethinker=rethinker, services=services) 1243 | init(rethinker) 1244 | logging.info('Connecting to HDFS on: %s:%s' % (settings['HDFS_HOST'], settings['HDFS_PORT'])) 1245 | 1246 | if server_mode: 1247 | controller = MasterSyncController( 1248 
#!/usr/bin/env python3
import trough
from trough.settings import settings, try_init_sentry
import sqlite3
import ujson
import os
import sqlparse
import logging
import urllib.parse  # explicit: bare `import urllib` does not import .parse
import doublethink


try_init_sentry()


class WriteServer:
    '''
    WSGI app that executes write queries against the local writable copy of
    a segment, provided this node holds the segment's write lock.
    '''
    def __init__(self):
        self.rethinker = doublethink.Rethinker(db="trough_configuration", servers=settings['RETHINKDB_HOSTS'])
        self.services = doublethink.ServiceRegistry(self.rethinker)
        self.registry = trough.sync.HostRegistry(rethinker=self.rethinker, services=self.services)
        trough.sync.init(self.rethinker)

    def write(self, segment, query):
        '''
        Executes `query` (bytes) against the sqlite file for `segment`,
        wrapped in a transaction. Raises on empty query or sql error.
        '''
        logging.info('Servicing request: segment=%r query=%r', segment, query)
        # if one or more of the query(s) are not a write query, raise an exception.
        if not query:
            raise Exception("No query provided.")
        # no sql parsing, if our chmod has write permission, allow all queries.
        assert os.path.isfile(segment.local_path())
        connection = sqlite3.connect(segment.local_path())
        trough.sync.setup_connection(connection)
        try:
            query = query.rstrip()
            # bug fix: `query[-1] == b';'` compared an int (bytes indexing)
            # to a bytes object and was always False
            if not query.endswith(b';'):
                query = query + b';'
            # executescript does not seem to respect isolation_level, so for
            # performance, we wrap the sql in a transaction manually
            # see http://bugs.python.org/issue30593
            query = b"BEGIN TRANSACTION;\n" + query + b"\nCOMMIT;\n"
            connection.executescript(query.decode('utf-8'))
        finally:
            connection.commit()
            connection.close()
        return b"OK\n"

    # uwsgi endpoint
    def __call__(self, env, start_response):
        # pre-bind so the error handler below can log them even when the
        # exception happens before they are assigned
        segment = None
        query = None
        try:
            query_dict = urllib.parse.parse_qs(env.get('QUERY_STRING'))
            # use the ?segment= query string variable or the host string to figure out which sqlite database to talk to.
            segment_id = query_dict.get('segment', env.get('HTTP_HOST', "").split("."))[0]
            logging.info('Connecting to Rethinkdb on: %s', settings['RETHINKDB_HOSTS'])
            segment = trough.sync.Segment(segment_id=segment_id, size=0, rethinker=self.rethinker, services=self.services, registry=self.registry)
            query = env.get('wsgi.input').read()
            write_lock = segment.retrieve_write_lock()
            if not write_lock or write_lock['node'] != settings['HOSTNAME']:
                raise Exception("This node (settings['HOSTNAME']={!r}) cannot write to segment {!r}. There is no write lock set, or the write lock authorizes another node. Write lock: {!r}".format(settings['HOSTNAME'], segment.id, write_lock))

            output = self.write(segment, query)
            start_response('200 OK', [('Content-Type', 'text/plain')])
            # PEP 3333: the body must be an iterable of bytestrings, not a
            # bare bytes object (which iterates as ints)
            return [output]
        except Exception as e:
            logging.error('500 Server Error due to exception (segment=%r query=%r)', segment, query, exc_info=True)
            start_response('500 Server Error', [('Content-Type', 'text/plain')])
            return [('500 Server Error: %s\n' % str(e)).encode('utf-8')]
import logging
import sqlite3
import trough
import flask
import ujson
import trough.settings

def make_app(controller):
    '''Builds the flask app exposing the segment manager HTTP api backed by
    `controller` (a local or master sync controller).'''
    controller.check_config()
    app = flask.Flask(__name__)

    @app.route('/', methods=['POST'])
    def simple_provision_writable_segment():
        ''' deprecated api '''
        segment_id = flask.request.get_data(as_text=True)
        logging.info('provisioning writable segment %r', segment_id)
        result_dict = controller.provision_writable_segment(segment_id)
        return flask.Response(result_dict.get('write_url'), mimetype='text/plain')

    @app.route('/provision', methods=['POST'])
    def provision_writable_segment():
        '''Provisions Writes. Will respond with a JSON object which describes segment metadata, including:
        - write url
        - segment size on disk
        - schema ID used to provision segment
        or respond with a 500 including error description.'''
        segment_id = flask.request.json['segment']
        schema_id = flask.request.json.get('schema', 'default')
        logging.info('provisioning writable segment %r (schema_id=%r)', segment_id, schema_id)
        # {'write_url': write_url, 'size': None, 'schema': schema}
        try:
            result_dict = controller.provision_writable_segment(segment_id, schema_id=schema_id)
            result_json = ujson.dumps(result_dict)
            return flask.Response(result_json, mimetype='application/json')
        except trough.sync.ClientError as e:
            response = flask.jsonify({'error': e.args[0]})
            response.status_code = 400
            return response

    @app.route('/promote', methods=['POST'])
    def promote_writable_segment():
        '''Promotes segments to HDFS, will respond with a JSON object which describes:
        - hdfs path
        - segment size on disk
        - whether or not an upstream segment will be overwritten

        This endpoint will toggle a value on the write lock record, which will be consulted so that a segment cannot be promoted while a promotion is in progress. The current journal will be committed, and after promotion completes, this URL will return its JSON document.'''
        post_json = ujson.loads(flask.request.get_data())
        segment_id = post_json['segment']
        result_dict = controller.promote_writable_segment_upstream(segment_id)
        result_json = ujson.dumps(result_dict)
        return flask.Response(result_json, mimetype='application/json')

    @app.route('/schema', methods=['GET'])
    def list_schemas():
        '''Schema API Endpoint, lists schema names'''
        result_json = ujson.dumps(controller.list_schemas())
        return flask.Response(result_json, mimetype='application/json')

    # routes below use flask's '<id>' variable converter, which the view
    # functions' `id` parameter requires
    @app.route('/schema/<id>', methods=['GET'])
    def get_schema(id):
        '''Schema API Endpoint, returns schema json'''
        schema = controller.get_schema(id=id)
        if not schema:
            flask.abort(404)
        return flask.Response(ujson.dumps(schema), mimetype='application/json')

    @app.route('/schema/<id>/sql', methods=['GET'])
    def get_schema_sql(id):
        '''Schema API Endpoint, returns schema sql'''
        schema = controller.get_schema(id=id)
        if not schema:
            flask.abort(404)
        return flask.Response(schema.sql, mimetype='application/sql')

    @app.route('/schema/<id>', methods=['PUT'])
    def put_schema(id):
        '''Schema API Endpoint, creates or updates schema from json input'''
        try:
            schema_dict = ujson.loads(flask.request.get_data(as_text=True))
        except Exception:
            return flask.Response(
                    status=400, mimetype='text/plain',
                    response='input could not be parsed as json')
        if set(schema_dict.keys()) != {'id','sql'}:
            return flask.Response(status=400, mimetype='text/plain', response=(
                "input json has keys %r (should be {'id', 'sql'})" % set(schema_dict.keys())))
        if schema_dict.get('id') != id:
            return flask.Response(
                    status=400, mimetype='text/plain',
                    response='id in json %r does not match id in url %r' % (
                        schema_dict.get('id'), id))

        try:
            schema, created = controller.set_schema(id=id, sql=schema_dict['sql'])
        except sqlite3.OperationalError as e:
            return flask.Response(
                    status=400, mimetype='text/plain',
                    response='schema sql failed validation: %s' % e)

        return flask.Response(status=201 if created else 204)

    @app.route('/schema/<id>/sql', methods=['PUT'])
    def put_schema_sql(id):
        '''Schema API Endpoint, creates or updates schema from sql input'''
        sql = flask.request.get_data(as_text=True)
        try:
            schema, created = controller.set_schema(id=id, sql=sql)
        except sqlite3.OperationalError as e:
            return flask.Response(
                    status=400, mimetype='text/plain',
                    response='schema sql failed validation: %s' % e)

        return flask.Response(status=201 if created else 204)

    # responds with 204 on successful delete, 404 if segment does not exist
    @app.route('/segment/<id>', methods=['DELETE'])
    def delete_segment(id):
        logging.info('serving request DELETE /segment/%s', id)
        try:
            controller.delete_segment(id)
            return flask.Response(status=204)
        except KeyError:
            logging.warning('DELETE /segment/%s', id, exc_info=True)
            flask.abort(404)
        except trough.sync.ClientError:
            logging.warning('DELETE /segment/%s', id, exc_info=True)
            flask.abort(400)

    return app

trough.settings.configure_logging()
local = make_app(trough.sync.get_controller(server_mode=False))
server = make_app(trough.sync.get_controller(server_mode=True))