├── caspanda
    ├── __init__.py
    ├── exceptions.py
    ├── spots.py
    ├── utils.py
    ├── tests
    │   ├── test_input.py
    │   └── test_describe.py
    ├── bear.py
    ├── metabear.py
    └── bamboo.py
├── .gitignore
├── .travis.yml
├── setup.py
├── bin
    ├── cleanup
    └── example.py
├── LICENSE
└── README.md


/caspanda/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | caspanda.egg-info
3 | build/*
4 | dist/*
5 | 


--------------------------------------------------------------------------------
/caspanda/exceptions.py:
--------------------------------------------------------------------------------
 1 | ###################################################
 2 | #################[ Module: Exceptions ]############
 3 | ###################################################
 4 | """
 5 | Custom exceptions for Caspanda.
 6 | """
 7 | 
 8 | 
 9 | class InputError(Exception):
10 |     pass


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "2.6"
 4 |   - "2.7"
 5 |   - "3.2"
 6 |   - "3.3"
 7 |   - "3.4"
 8 | 
 9 | before_install:
10 |   - pip install codecov
11 |   - pip install coverage
12 |   - pip install pytest-cov
13 | 
14 | services:
15 |   - cassandra
16 | 
17 | install:
18 |   - pip install -e .
19 | 
20 | # command to run tests
21 | script:
22 |   - if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]] ; then coverage run --include='caspanda/*' $(which py.test) ; coverage report; fi
23 | 
24 | 
# Upload the coverage report, only from the job that produced one (2.7).
# The shell `if` needs its `then` keyword; without it bash rejects the line.
after_success:
  - if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then codecov; fi
27 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | from setuptools import find_packages
 3 | 
 4 | 
 5 | setup(
 6 |     name='caspanda',
 7 |     version='0.0.0.4',
 8 |     packages=find_packages(),
 9 |     install_requires=[
10 |         'cassandra-driver',
11 |         'numpy',
12 |         'pandas',
13 |         'nose',
14 |         'blist',
15 |         'future'
16 |     ],
17 |     url='',
18 |     license='MIT',
19 |     author='Aaron Benz',
20 |     author_email='aaron.benz@accenture.com',
21 |     description='Cassandra Wrapper for Easy Panda DataFrame Access',
22 |     include_package_data=True,
23 | )
24 | 


--------------------------------------------------------------------------------
/bin/cleanup:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Delete .pyc files and empty directories from root of project
 4 | cd ./$(git rev-parse --show-cdup)
 5 | 
 6 | # Clean-up
 7 | find . -name ".DS_Store" -delete
 8 | 
 9 | NUM_PYC_FILES=$( find . -name "*.pyc" | wc -l | tr -d ' ' )
10 | if [ $NUM_PYC_FILES -gt 0 ]; then
11 |     find . -name "*.pyc" -delete
12 |     printf "\e[00;31mDeleted $NUM_PYC_FILES .pyc files\e[00m\n"
13 | fi
14 | 
15 | NUM_EMPTY_DIRS=$( find . -type d -empty | wc -l | tr -d ' ' )
16 | if [ $NUM_EMPTY_DIRS -gt 0 ]; then
17 |     find . -type d -empty -delete
18 |     printf "\e[00;31mDeleted $NUM_EMPTY_DIRS empty directories\e[00m\n"
19 | fi
20 | 


--------------------------------------------------------------------------------
/caspanda/spots.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A Panda has spots, and so does data. This is meant to define ways to more easily view caspanda data
 3 | """
 4 | 
 5 | from cassandra.metadata import TableMetadata, KeyspaceMetadata
 6 | try:
 7 |     from exceptions import SyntaxError
 8 | except ImportError:
 9 |     pass
10 | 
def describe(x):
    """
    Placeholder for describing keyspaces and tables; not implemented yet.

    Intended contract: given a KeyspaceMetaData (or a list of them), describe every table in
    those keyspaces; given a TableMetaData (or a list of them), describe each of those tables.

    At the moment the function does no work and unconditionally raises SyntaxError.

    :param x: A list or a single keyspace/table
    :return: A json description of the tables (once implemented)
    """
    raise SyntaxError
21 | 
22 | def _describe_table(x):
23 |     """
24 |     Describes a single TableMetaData table
25 |     :param x: TableMetaData
26 |     :return: dict describing x layout
27 |     """
28 | 
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Aaron Benz
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/caspanda/utils.py:
--------------------------------------------------------------------------------
 1 | ###################################################
 2 | #################[ Module: Utils ]#################
 3 | ###################################################
 4 | """
 5 | Miscellaneous utilities for caspanda.
 6 | """
 7 | 
 8 | def paste(x, sep=", "):
 9 |     """
10 |     Custom string formatting function to format (???) output.
11 |     """
12 |     out = ""
13 |     for i in x:
14 |         out += i + sep
15 |     return out.strip(sep)
16 | 
def print_ls(ls, ident = '', braces=1):
    """
    Render a nested list as text, one leaf per line.

    Every item that is a ``list`` is rendered recursively with one more tab of
    indentation; any other item is formatted with ``%s`` behind the current
    indentation and terminated by a newline.

    :param ls: iterable whose items are leaves or (nested) lists
    :param ident: indentation prefix for the current nesting level
    :param braces: nesting depth counter; incremented on recursion but otherwise unused
    :return: the rendered text
    """
    pieces = []
    for item in ls:
        if not isinstance(item, list):
            pieces.append(ident + '%s' % item + '\n')
            continue
        pieces.append(print_ls(item, ident + '\t', braces + 1))
    return "".join(pieces)
28 | 
def is_instance_multiple(x, obj_class):
    """
    Unfinished helper meant to run isinstance checks over several objects at once.

    Only the argument validation exists so far: both arguments must be lists.
    No isinstance check of the objects against the classes is performed yet.

    :param x: objects (must be a list)
    :param obj_class: class of object (must be a list)
    :return: None
    """
    for argument in (x, obj_class):
        assert isinstance(argument, list)
    return None
38 | 


--------------------------------------------------------------------------------
/caspanda/tests/test_input.py:
--------------------------------------------------------------------------------
 1 | ###################################################
 2 | #################[ Module: Test Input ]############
 3 | ###################################################
 4 | """
 5 | Test inserting CassandraFrame into Cassandra and ensure database get returns expected output.
 6 | """
 7 | import unittest
 8 | 
 9 | from caspanda.bear import CasPanda
10 | from caspanda.bamboo import CassandraFrame
11 | 
12 | 
class BaseTestInput(unittest.TestCase):
    """
    Shared fixture for the input tests.

    Opens a CasPanda session, makes sure the ``tests`` keyspace and its
    ``tester`` table exist, and records the column names used by the tests.
    """
    def setUp(self):
        create_keyspace = """CREATE KEYSPACE IF NOT EXISTS tests WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 };"""
        create_table = """CREATE TABLE IF NOT EXISTS tester(
                                    id text PRIMARY KEY,
                                    car text,
                                    color text,
                                    owner text,
                                    passengers set<text>,
                                    data blob
                                );"""

        # Connect first; every statement below goes through this session.
        self.cl = CasPanda()
        self.session = self.cl.connect()

        self.session.execute(create_keyspace)
        self.session.set_keyspace("tests")
        self.session.execute(create_table)

        # Only the text columns are populated by the tests.
        self.cols = ["id","car","color","owner"]

        super(BaseTestInput, self).setUp()
36 | 
class TestQuery(BaseTestInput):
    """Insert a three-row CassandraFrame and read it back from the ``tester`` table."""

    def setUp(self):
        super(TestQuery, self).setUp()

        rows = [["VIN1", "ford", "black", "frank"],
                ["VIN2", "cyrsler", "blue", "chris"],
                ["VIN3", "honda", "red", "harry"]]
        self.frame = CassandraFrame(rows, columns=self.cols, session=self.session, table="tester")

        # Prepare the INSERT statement the test will bind rows to.
        self.frame.create_cql_insert()

    def test_all_attributes(self):
        self.frame.insert_async()

        # The rows produced by the session's row factory are read from the result set.
        self.cf = self.session.execute("SELECT * FROM tester")._current_rows

        self.assertEqual(len(self.cf), 3)
        self.assertIsInstance(self.cf, CassandraFrame)
        self.assertEqual(self.frame.session, self.cf.session)
58 | 
59 | 
60 | 
61 | 
62 | 
63 | 
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/bin/example.py:
--------------------------------------------------------------------------------
 1 | ###################################################
 2 | #################[ Script: Example ]################
 3 | ###################################################
 4 | """
 5 | Start a cassandra cluster and demonstrate inserting a CassandraFrame.
 6 | """
 7 | from caspanda.bear import CasPanda
 8 | from caspanda.bamboo import CassandraFrame
 9 | 
# Connect and make sure the demo keyspace and tables exist.
cl = CasPanda()
session = cl.connect()
session.execute("""CREATE KEYSPACE IF NOT EXISTS tests WITH REPLICATION = { 'class' : 'SimpleStrategy',
                    'replication_factor' : 1 };""")
session.set_keyspace("tests")
session.execute("""CREATE TABLE IF NOT EXISTS sold_cars (
    make text,
    state text,
    day timestamp,
    event_time timestamp,
    dealership text,
    salesman text,
    year int,
    account_lead text static,
    distributor_lead text static,
    PRIMARY KEY ((make, state), day, event_time));""")
session.execute("""CREATE TABLE IF NOT EXISTS albums(
                       id text PRIMARY KEY,
                       car text,
                       color text,
                       owner text,
                       passengers set<text>,
                       data blob
                    );""")

cols = ["id","car","color","owner"]

session = cl.connect("tests")

#df = pd.DataFrame(range(1,5), columns=["a"])
#tmp = CassandraFrame(np.random.randn(10, 2), columns=["id",""], session = session, table="albums")

# Build a small frame bound to the albums table and insert it.
tmp = CassandraFrame([["VIN1", "ford", "black", "frank"], ["VIN2", "cyrsler", "blue", "chris"], ["VIN3", "honda", "red", "harry"]],
                                  columns = cols, session=session, table="albums")
tmp.create_cql_insert()
tmp.insert_async()

# print(...) with a single argument behaves the same on Python 2 and Python 3,
# so the script no longer fails to parse on Python 3.
print("Now see that the data was inserted")
session.execute("""SELECT id, car, color, owner FROM tests.albums""")

# Compare caspanda's table description with the driver's own schema export.
print("The description of tests.albumns:")
print(cl.keyspaces["tests"].tables["albums"])
print("As opposed to this:")
print(cl.metadata.keyspaces["tests"].tables["albums"].export_as_string())

print("Another comparison")
print(cl.keyspaces["tests"].tables["sold_cars"])
print("As opposed to this:")
print(cl.metadata.keyspaces["tests"].tables["sold_cars"].export_as_string())


cl.shutdown()
62 | 
63 | 
64 | 
65 | 
66 | #session.execute("DROP TABLE albums;")
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/caspanda/bear.py:
--------------------------------------------------------------------------------
  1 | ###################################################
  2 | #################[ Module: Base ]##################
  3 | ###################################################
  4 | """
  5 | Class CasPanda, which subclasses cassandra.cluster.Cluster
  6 | and provides an interface between pandas and Cassandra.
  7 | """
  8 | from cassandra.cluster import Cluster
  9 | from caspanda.metabear import ColumnMeta, KeyspaceMeta, TableMeta
 10 | from cassandra.query import dict_factory
 11 | 
 12 | from caspanda.bamboo import CassandraFrame
 13 | 
 14 | from future.utils import iteritems
 15 | 
 16 | #TODO: Add describe function to name any keyspace or keyspace + table(s) to utilize MetaTable.describe function
 17 | class CasPanda(Cluster):
 18 |     """
 19 |     Interface for pandas and Cassandra.
 20 |     """
 21 |     keyspaces = None # contains all of the MetaKeyspaces info
 22 | 
 23 | 
 24 |     def __init__(self, *args, **kwargs):
 25 | 
 26 |         super(CasPanda, self).__init__(*args, **kwargs)
 27 |     def connect(self, kp=None):
 28 |         """
 29 |         Create `cassandra.cluster.Cluster` session, 
 30 |         and patch `session.row_factory` with `self.panda_factory`.
 31 | 
 32 |         :return: Session object
 33 |         """
 34 | 
 35 |         self.session = super(CasPanda, self).connect(kp)
 36 |         self.session.row_factory = self.panda_factory
 37 |         if self.keyspaces is None:
 38 |             self._sync_metadata(kp)
 39 | 
 40 |         return self.session
 41 | 
 42 |     def panda_factory(self, colnames, rows):
 43 |         """
 44 |         Returns Rows in a Panda DataFrame
 45 |         :param rows: values selected in Select statement
 46 |         :param colnames: column names selected
 47 |         :return: Panda DataFrame
 48 |         """
 49 |         if len(rows) == 0:
 50 |             return CassandraFrame(session=self.session)
 51 |         return CassandraFrame(rows, columns=colnames, session=self.session)
 52 | 
 53 |     def describe(self, kp=None, tb=None):
 54 | 
 55 |         pass
 56 | 
 57 |     def _sync_metadata(self, kp):
 58 |         """
 59 |         Syncs all of the metadata keyspaces and their underlying tables and columns. Sets keyspace to be a dict
 60 |         of all MetaKeyspace in the connection by name:MetaKeyspace
 61 |         :return:
 62 |         """
 63 | 
 64 |         self.keyspaces = {}
 65 |         #TODO: Turn off warnings when this occurs
 66 |         self.session.row_factory = dict_factory
 67 | 
 68 |         #gets all of the column data for all tables/keyspaces
 69 |         result = self.session.execute("""SELECT keyspace_name, columnfamily_name, column_name, component_index, index_name,
 70 |                              index_options, index_type, type as cql_type, validator FROM system.schema_columns""")
 71 | 
 72 | 
 73 |         cols = [ColumnMeta(**row) for row in result]
 74 |         for i in cols:
 75 |             #create keyspace if not already exists
 76 |             if self.keyspaces.get(i.keyspace) is None:
 77 |                 self.keyspaces.update({i.keyspace:KeyspaceMeta(i.keyspace)})
 78 | 
 79 |             #add table if not already exists
 80 |             kp = self.keyspaces.get(i.keyspace)
 81 |             if kp.tables.get(i.table) is None:
 82 |                 kp.tables.update({i.table:TableMeta(i.keyspace, i.table)})
 83 | 
 84 |             #finally add/overwrite column into table
 85 |             tb = kp.tables.get(i.table)
 86 |             tb.columns[i.name] = i
 87 |         for kp_nm, kp in iteritems(self.keyspaces):
 88 |             for tbl_nm, tbl in iteritems(kp.tables):
 89 |                 tbl.categorize_columns()
 90 | 
 91 |         self.session.row_factory = self.panda_factory
 92 | 
 93 | 
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | 


--------------------------------------------------------------------------------
/caspanda/metabear.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module holds metadata classes that describe Cassandra keyspaces, tables and columns.
  3 | """
  4 | from caspanda.utils import paste, print_ls
  5 | from cassandra.cqltypes import lookup_casstype
  6 | 
  7 | from future.utils import itervalues
  8 | class ColumnMeta(object):
  9 |     keyspace = None
 10 |     """
 11 |     The keypace the column belongs to
 12 |     """
 13 | 
 14 |     table = None
 15 |     """
 16 |     The table the column belongs to
 17 |     """
 18 | 
 19 |     name = None
 20 |     """
 21 |     The name of the column
 22 |     """
 23 |     # TODO: add the rest of arguments
 24 | 
 25 |     def __init__(self, keyspace_name, columnfamily_name, column_name, component_index=None, index_name=None, index_options=None, index_type=None, cql_type=None, validator=None):
 26 |         self.keyspace = keyspace_name
 27 |         self.table = columnfamily_name
 28 |         self.name = column_name
 29 |         self.component_index = component_index
 30 |         self.index_name = index_name
 31 |         self.index_options = index_options
 32 |         self.index_type = index_type
 33 |         self.cql_type = cql_type
 34 |         self.validator = validator
 35 | 
 36 |     def __repr__(self):
 37 |         return "{0} {1} {2}".format(self.name, lookup_casstype(self.validator).typename, self.cql_type if self.cql_type!="regular" else "")
 38 | 
 39 | class TableMeta(object):
 40 |     keyspace = None
 41 |     name = None
 42 |     columns = {}
 43 | 
 44 | 
 45 |     def __init__(self, keyspace_name, name, columns=None):
 46 |         self.keyspace = keyspace_name
 47 |         self.name = name
 48 |         self.columns = {} if columns is None else columns
 49 | 
 50 |     def add_column(self, x):
 51 |         self.columns.append(x)
 52 | 
 53 |     def __repr__(self, *args, **kwargs):
 54 |         """ Recursively prints nested lists."""
 55 |         return print_ls(self.categorize_columns())
 56 | 
 57 |     def sort_columns(self, x, reverse = False):
 58 |         seq = []
 59 |         for i in x:
 60 |             seq.append((i.component_index, i))
 61 |         seq.sort(reverse=reverse)
 62 |         return [x[1] for x in seq]
 63 | 
 64 |     def categorize_columns(self):
 65 |         self.partition_cols = []
 66 |         self.clustering_cols = []
 67 |         self.regular_cols = []
 68 |         self.static_cols = []
 69 | 
 70 |         for i in itervalues(self.columns):
 71 |             if i.cql_type == "partition_key":
 72 |                 self.partition_cols.append(i)
 73 |                 next
 74 |             if i.cql_type == "clustering_key":
 75 |                 self.clustering_cols.append(i)
 76 |                 next
 77 |             if i.cql_type == "regular":
 78 |                 self.regular_cols.append(i)
 79 |                 next
 80 |             if i.cql_type == "static":
 81 |                 self.static_cols.append(i)
 82 |                 next
 83 | 
 84 |         self.partition_cols = self.sort_columns(self.partition_cols)
 85 |         self.clustering_cols = self.sort_columns(self.clustering_cols, reverse=True)
 86 |         cluster_str = self.regular_cols
 87 |         for i in self.clustering_cols:
 88 |             cluster_str = [i, cluster_str]
 89 | 
 90 |         #partition_cols = paste([i.name for i in partition_cols])
 91 | 
 92 |         return self.partition_cols,[cluster_str, self.static_cols]
 93 | 
 94 | #TODO utilize TableMeta.describe to implement the same thing for keyspaces
 95 | class KeyspaceMeta(object):
 96 |     name = None
 97 |     tables = {}
 98 |     # TODO: fill in the rest of the arguments for keyspace
 99 | 
100 |     def __init__(self, name, tables=None):
101 |         self.name = name
102 |         self.tables = {} if tables is None else tables
103 | 
104 |     def add_table(self, x):
105 |         self.tables.append(x)
106 | 
107 | 
108 | 


--------------------------------------------------------------------------------
/caspanda/tests/test_describe.py:
--------------------------------------------------------------------------------
 1 | #TODO delete keyspace after tests
 2 | #TODO Add tests for all possibilities: table, [table], keyspace, [keyspace]
 3 | """
 4 | Testing the describe functions and those in spots.py
 5 | """
 6 | import unittest
 7 | 
 8 | from caspanda.bear import CasPanda
 9 | from caspanda.metabear import KeyspaceMeta, TableMeta, ColumnMeta
10 | from caspanda.spots import describe
11 | 
class BaseTestInput(unittest.TestCase):
    """
    Base class for the describe tests.

    Makes sure the ``tests`` keyspace and its ``sold_cars`` table exist, then connects
    ``self.cl`` so that its metadata already contains them.
    """
    def setUp(self):
        super(BaseTestInput, self).setUp()

        # Create the schema through a throw-away cluster first. CasPanda reads its
        # metadata once, on the first connect, so the schema has to exist before
        # self.cl connects - otherwise "tests" is missing on a fresh database.
        bootstrap = CasPanda()
        try:
            session = bootstrap.connect()
            session.execute("""CREATE KEYSPACE IF NOT EXISTS tests WITH REPLICATION = { 'class' : 'SimpleStrategy',
                            'replication_factor' : 1 };""")
            session.set_keyspace("tests")
            session.execute("""CREATE TABLE IF NOT EXISTS sold_cars (
            make text,
            state text,
            day timestamp,
            event_time timestamp,
            dealership text,
            salesman text,
            year int,
            account_lead text static,
            distributor_lead text static,
            PRIMARY KEY ((make, state), day, event_time));""")
        finally:
            # Do not leave the helper cluster's connections open.
            bootstrap.shutdown()

        self.cl = CasPanda()
        self.cl.connect()

    def tearDown(self):
        # Release the connections opened for this test.
        self.cl.shutdown()
        super(BaseTestInput, self).tearDown()
38 | 
39 | #class TestDescribe(BaseTestInput):
40 | 
41 |     # def setUp(self):
42 |     #     super(TestDescribe, self).setUp()
43 |     #
44 |     # def test_single_table(self):
45 |     #     out = "make, state\n\t\tday\n\t\t\tevent_time\n\t\t\t\tdealership\n\t\t\t\tyear\n\t\t\t\tsalesman\n\t\tdistributor_lead\n\t\taccount_lead\n"
46 |     #     self.assertEqual(out, self.cl.keyspaces["tests"].tables["sold_cars"])
47 | 
class TestColumnStructure(BaseTestInput):
    """Checks the keyspace/table/column metadata that CasPanda builds for ``tests.sold_cars``."""

    def setUp(self):
        super(TestColumnStructure, self).setUp()

    def _check_column(self, table, name, cql_type, component_index):
        """Assert the role, component index and identity of one column of ``table``."""
        column = table.columns[name]
        self.assertEqual(column.cql_type, cql_type)
        self.assertEqual(column.component_index, component_index)
        self.assertEqual(column.keyspace, "tests")
        self.assertEqual(column.name, name)
        self.assertEqual(column.table, "sold_cars")

    def test_single_table(self):
        keyspaces = self.cl.keyspaces

        self.assertIsInstance(keyspaces, dict)
        self.assertIsInstance(keyspaces["tests"], KeyspaceMeta)
        self.assertIsInstance(keyspaces["tests"].tables, dict)

        sold_cars = keyspaces["tests"].tables["sold_cars"]

        self.assertIsInstance(sold_cars, TableMeta)
        self.assertEqual(len(sold_cars.columns), 9)
        self.assertIsInstance(sold_cars.columns, dict)
        self.assertIsInstance(sold_cars.columns["account_lead"], ColumnMeta)

        self.assertEqual(sold_cars.columns["account_lead"].cql_type, "static")

    def test_columns(self):
        sold_cars = self.cl.keyspaces["tests"].tables["sold_cars"]

        # (column name, expected cql_type, expected component_index), checked in this order
        expectations = (
            ("day", "clustering_key", 0),
            ("state", "partition_key", 1),
            ("salesman", "regular", 2),
            ("account_lead", "static", 2),
        )
        for name, cql_type, component_index in expectations:
            self._check_column(sold_cars, name, cql_type, component_index)
98 | 
99 | 


--------------------------------------------------------------------------------
/caspanda/bamboo.py:
--------------------------------------------------------------------------------
  1 | ###################################################
  2 | #################[ Module: Bamboo ]################
  3 | ###################################################
  4 | """
  5 | This module contains the CassandraFrame class, which exposes the main interface between pandas and Cassandra.
  6 | 
  7 | Bamboo, like what PANDAS eat. Get it??
  8 | 
  9 | CassandraFrame implements synchronous and asynchronous insertion operations, and MultiIndexes output from Cassandra
 10 | in order to support pivot- and melt-like operations.
 11 | """
 12 | import logging
 13 | import pandas as pd
 14 | try:
 15 |     import Queue as queue
 16 | except ImportError:
 17 |     # Python 3
 18 |     import queue
 19 | from cassandra.cluster import Session
 20 | 
 21 | from caspanda.utils import paste
 22 | 
 23 | 
 24 | class CassandraFrame(pd.DataFrame):
 25 |     """
 26 |     Wrapper for pandas.DataFrame.
 27 | 
 28 |     Implements convenience methods for get and put operations to Cassandra,
 29 |     and handles MultiIndexing of CQL output.
 30 | 
 31 |     Keeps track of column name hierarchy in the self._prepared_columns and self._cql_columns.
 32 |     """
 33 |     statement_input      = None
 34 |     _prepared_columns    = None
 35 |     _insert_index        = None
 36 | 
 37 | 
 38 |     def __init__(self, data=None, index=None, columns=None, cql=None, session=None, table=None, dtype=None,
 39 |                  copy=False, cql_columns=None, *args, **kwargs):
 40 | 
 41 |         super(CassandraFrame, self).__init__(data, index=index, columns=columns, dtype=dtype, copy=copy, *args, **kwargs)
 42 | 
 43 |         self.set_session(session)
 44 | 
 45 |         self.table         = table
 46 |         self.cql           = kwargs.get('cql', None)
 47 |         self.insert_queue  = queue.Queue()
 48 |  
 49 |         self.set_cql_columns(cql_columns)
 50 | 
 51 | 
 52 |     def put(self, table=None):
 53 |         """
 54 |         TODO: (???)
 55 |         """
 56 |         if table is not None:
 57 |             self.table = table
 58 |         pass
 59 | 
 60 | 
 61 |     def create_cql_insert(self):
 62 |         """
 63 |         Given a table, prepares a statement to allow the dataframe to be inserted row by row into cassandra.
 64 | 
 65 |         Sets statement_input to be the prepared statement.
 66 | 
 67 |         :return: 0
 68 |         """
 69 |         assert isinstance(self.session, Session)
 70 |         assert self.table is not None
 71 | 
 72 |         statement = "INSERT INTO " + self.table + "(" + paste(self._cql_columns) + ") VALUES (" + paste(["?"] * len(self.columns)) + ");"
 73 | 
 74 | 
 75 | 
 76 |         self.statement_input   = self.session.prepare(statement)
 77 | 
 78 |         self._prepared_columns = self._cql_columns
 79 | 
 80 |         return
 81 | 
 82 | 
 83 |     def insert_sync(self):
 84 |         """
 85 |         Insert rows synchronously into Cassandra.
 86 | 
 87 |         Cassandra doesn't get a performance improvement from batch insertion as it is a peer-to-peer architecture;
 88 |         so the insertion strategy is to iterate over the CassandraFrame's rows and bind them one by one.
 89 |         """
 90 |         assert self._cql_columns == self._prepared_columns
 91 |         assert self.statement_input is not None, 'Statement_input not defined. Use create_cql_insert().'
 92 | 
 93 |         for index, row in self.loc[:,self._prepared_columns].iterrows():
 94 |             self.session.execute(self.statement_input.bind(row))
 95 | 
 96 |         return
 97 | 
 98 | 
 99 |     def insert_async(self):
100 |         """
101 |         Insert rows asynchronously into Cassandra.
102 | 
103 |         TODO: distinguish from the chained callback approach in insert_callback() and clean up.
104 |         """
105 |         assert self._cql_columns == self._prepared_columns
106 |         assert self.statement_input is not None, 'Statement_input not defined. Use create_cql_insert().'
107 | 
108 |         def handle_success(rows):
109 |             pass
110 | 
111 |         def handle_error(exception):
112 |             logging.error("Failed to send data info: %s", exception)
113 |             return
114 | 
115 |         def put(i):
116 |             future = self.session.execute_async(self.statement_input.bind(self.loc[i, self._prepared_columns]))
117 |             future.add_callbacks(handle_success, handle_error)
118 |             return future
119 | 
120 |         map(put, range(self.__len__()))
121 | 
122 |         return
123 | 
124 | 
125 |     def insert_callback(self):
126 |         """
127 |         TODO: code the upper limit on concurrent futures, clean up (and deprecate insert_async??)
128 | 
129 |         Put row indices into a queue; 
130 |         while the queue is not empty and the upper threshold on number of concurrent waiting processes is not reached,
131 |         insert a new row into Cassandra.
132 |         """
133 |         assert self._cql_columns == self._prepared_columns
134 |         assert self.statement_input is not None, 'Statement_input not defined. Use create_cql_insert().'
135 | 
136 |         map(self.insert_queue.put_nowait, range(self.__len__()))
137 | 
138 |         def handle_success(rows):
139 |             """
140 |             Queue raises an Empty exception when it hits the bottom of the queue (after blocking for `timeout` seconds).
141 | 
142 |             Try getting until Queue is exhausted, then return.
143 |             """
144 |             try:
145 |                 i = self.insert_queue.get()
146 |             except queue.Empty:
147 |                 return
148 | 
149 |             print("Inserting "+self.iloc[i].name+" ...")
150 |             print("-----------------------------------------------")
151 | 
152 |             future = self.session.execute_async(self.statement_input.bind(self.loc[i, self._prepared_columns]))
153 | 
154 |             future.add_callbacks(handle_success, handle_error)  # intentional tail recursion!
155 |                                                                 # need hard upper limit on number of concurrent futures
156 |                                                                 # something like: for i in range(min(120, self.__len__())):
157 |             return future
158 | 
159 |         def handle_error(exception):
160 |             """
161 |             Log error and recurse.
162 |             """
163 |             logging.error("Failed to send data info: %s", exception)
164 |             future = handle_success(None)
165 |             return future
166 | 
167 |         future = handle_success(None)
168 | 
169 |         return future.result()
170 | 
171 | 
172 |     def get_cql_columns(self):
173 |         return self._cql_columns
174 | 
175 | #TODO Redo these operations to run off of the meta data just built from metabear.py
176 |     def set_cql_columns(self, x=None):
177 |         if x is None:
178 |             self._cql_columns = self.columns.tolist()
179 |         else:
180 |             assert isinstance(x, list)
181 |             self._cql_columns = x
182 | 
183 |         return
184 | 
185 | 
186 |     def set_session(self, session):
187 |         """
188 |         Setter method for self.session.
189 | 
190 |         Pass a session object or None.
191 | 
192 |         :return: None
193 |         """
194 |         if session is None:
195 |             self.session = None
196 | 
197 |         else:
198 |             assert isinstance(session, Session), "Got non-session, type: {}".format(type(session))
199 |             self.session = session
200 | 
201 |         return
202 | 
203 | 
204 | 
205 | 
206 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Build Status](https://travis-ci.org/aaronbenz/caspanda.svg?branch=master)](https://travis-ci.org/aaronbenz/caspanda)
  2 | [![codecov.io](http://codecov.io/github/aaronbenz/caspanda/coverage.svg?branch=master)](http://codecov.io/github/aaronbenz/caspanda?branch=master)
  3 | ```
  4 |                                      _       
  5 |   ___ __ _ ___ _ __   __ _ _ __   __| | __ _ 
  6 |  / __/ _` / __| '_ \ / _` | '_ \ / _` |/ _` |
  7 | | (_| (_| \__ \ |_) | (_| | | | | (_| | (_| |
  8 |  \___\__,_|___/ .__/ \__,_|_| |_|\__,_|\__,_|
  9 |               |_|                            
 10 | 
 11 | Aaron Benz
 12 | Charlie Hack
 13 | Spring 2015
 14 | ```
 15 | 
 16 | caspanda
 17 | ========
 18 | Pandas interface for Cassandra.
 19 | 
 20 | ##What is it?
**caspanda** is a Python module that combines **Apache Cassandra** with **Python's Pandas** module... aka **caspanda**. Its
overall goal is to give the user the ability to separate Cassandra's NoSQL backend from the user's front end experience.
 23 | Ultimately, it hopes to provide Data Scientists who use Pandas the ability to easily use Cassandra.
 24 | 
It is still very early in its development, but it plans on using the multi-indexing/pivot ability and the time series
functionality available in Pandas to automatically sort and organize data coming from Cassandra according to its schema.
 27 | Additionally, it hopes to allow the user to easily insert data back into cassandra without ever having to speak CQL.
 28 | 
 29 | Main Features
 30 | ----
 31 | Here are a few of the things caspanda currently does:
 32 | 
 33 |     - Puts queried data into a Pandas Dataframe
 34 |     - Stores data into Cassandra using CassandraFrames (uses sync and async methods)
 35 |     - Describes the structure of Cassandra Tables in a hierarchical way
 36 | 
 37 | Usage
 38 | ----
One of the main objectives of **caspanda** is being able to easily understand and use Cassandra. Unfortunately,
 many can be misled or lack the understanding of how Cassandra actually stores its data. The attempt below is meant to 
 41 |  give you a conceptual understanding of the hierarchy that the data is really stored in.  
 42 |  
 43 |  The example table `sold_cars` demonstrates a data model that might exist if you wanted to store the information about
 44 |  sold cars. It stores information about a sale according to the *make* and *state* of the car, and then 
 45 |  stores the information by day and time. So, the query pattern would specify the *make* and *state*, and then give you 
 46 |  the ability to choose a date range. 
 47 |  
 Conceptually this might make sense, but the way in which it is written down in CQL is often difficult to grasp for anyone
 not seasoned in Cassandra. So, we have tried to make this much simpler. First, connect to Cassandra and create the
 50 |  table `sold_cars`
 51 | ```python
 52 | from caspanda.bear import CasPanda
 53 | 
 54 | cl = CasPanda()
 55 | session = cl.connect()
 56 | session.execute("""CREATE KEYSPACE IF NOT EXISTS tests WITH REPLICATION = { 'class' : 'SimpleStrategy',
 57 |                     'replication_factor' : 1 };""")
 58 | session.set_keyspace("tests")
 59 | session.execute("""CREATE TABLE IF NOT EXISTS sold_cars (
 60 |     make text,
 61 |     state text,
 62 |     day timestamp,
 63 |     event_time timestamp,
 64 |     dealership text,
 65 |     salesman text,
 66 |     year int,
 67 |     account_lead text static,
 68 |     distributor_lead text static,
 69 |     PRIMARY KEY ((make, state), day, event_time));""")
 70 | ```
 71 | 
 72 | Now that the table has been created, let's visualize it. This breaks down the names of the columns in a hierarchical
 73 | fashion that demonstrates how it is actually stored. So for example, The *make* and *state* columns define a group of data.
 74 | That group is ordered and stored by *day*, and then by *event_time*. Then, for each *event_time*,
there are fields for a *dealership*, *year*, and *salesman*. Additionally, there are single-value (static) columns stored on the
same level as *day*, which are *distributor_lead* and *account_lead*. 
 77 | 
 78 | Said differently, for every *make* and *state*, there is one *distributor_lead* and one *account_lead*. Also, for every
 79 | *make* and *state*, there can be a combination of *dealership*, *year*, and *salesman* defined by (indexed by) a *day*
 80 | and then by an *event_time*
 81 | 
 82 | ```python
 83 | 
print cl.keyspaces["tests"].tables["sold_cars"]
 85 | 
 86 | #	make text partition_key
 87 | #	state text partition_key
 88 | #		day timestamp clustering_key
 89 | #			event_time timestamp clustering_key
 90 | #				dealership text 
 91 | #				year int 
 92 | #				salesman text 
 93 | #		distributor_lead text static
 94 | #		account_lead text static
 95 | ```
 96 | 
 97 | The traditional method for viewing this in CQL is this:
 98 | 
 99 | ```python
100 | 
101 | print cl.metadata.keyspaces["tests"].tables["sold_cars"].export_as_string()
102 | 
103 | #CREATE TABLE tests.sold_cars (
104 | #    make text,
105 | #    state text,
106 | #    day timestamp,
107 | #    event_time timestamp,
108 | #    account_lead text static,
109 | #    dealership text,
110 | #    distributor_lead text static,
111 | #    salesman text,
112 | #    year int,
113 | #    PRIMARY KEY ((make, state), day, event_time)
114 | ```
115 | 
116 | With that being said, please feel free to reach out to us for comments/suggestions/questions. 
117 | 
118 | There are also some more examples for calling data from Cassandra and inserting it back using only a Pandas Dataframe (which
119 | we called a CassandraFrame), in `bin/example.py`
120 | 
121 | Example of using Caspanda for selecting data
122 | ----
123 | Running a select from a Cassandra table will automatically return a Pandas Dataframe, even for simple selects.
124 | Let's say you have a keyspace called `tr_data` and you create one table `tr_minute` with the following columns:
125 | 
126 | ```
127 | cqlsh:tr_data> create table tr_minute (
128 |  ccypair text,
129 |  gmt_timestamp timestamp,
130 |  mid_rate double,
131 |  ric text static,
132 |  PRIMARY KEY (ccypair, gmt_timestamp) );
133 | ```
134 | Connect to the Cassandra database as usual, then switch to the `tr_data` keyspace. Any keywords controlling the connection such as the `port` or using `compression` are added as arguments to the initial CasPanda() call.
135 | ```python
136 | from caspanda.bear import CasPanda
137 | cl = CasPanda(contact_points=['105.150.100.25',], port=9042, compression=True)
138 | cpsession = cl.connect()
139 | cpsession.set_keyspace('tr_data')
140 | select_ccys_distinct = """select distinct ccypair from tr_minute"""
141 | ccys = cpsession.execute(select_ccys_distinct)
142 | ccys.head()
143 | ```
144 | <table width="30%" border="0" style=" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px;" cellspacing="1" cellpadding="0"><thead>
145 | <tr><td></td><td>ccypair</td></tr></thead>
146 | <tr><td>0</td><td>USDKRW</td></tr>
147 | <tr><td>1</td><td>USDRUB</td></tr>
148 | <tr><td>2</td><td>AEDUSD</td></tr>
149 | <tr><td>3</td><td>USDTWD</td></tr>
150 | <tr><td>4</td><td>USDMYR</td></tr></table>
151 | 
152 | Now select some time-series data from the table:
153 | 
154 | ```python
155 | select_minute_wlimit = """select ccypair,gmt_timestamp,ric,mid_rate from tr_minute
156 | where ccypair = 'EURUSD' and gmt_timestamp >= '2015-05-01 00:00:00+0000'
157 | and gmt_timestamp < '2015-06-01 00:00:00+0000' LIMIT 5"""
158 | ccyA = cpsession.execute(select_minute_wlimit)
159 | ccyA.head()
160 | ```
161 | <table width="30%" border="0" style=" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px;" cellspacing="2" cellpadding="0"><thead>
162 | <tr>
163 | <td></td>
164 | <td>ccypair</td><td>gmt_timestamp</td><td>ric</td><td>mid_rate</td></tr></thead>
165 | <tr><td>0</td>
166 | <td>EURUSD</td>
167 | <td>2015-05-01 00:00:00.001000</td>
168 | <td>EUR=</td>
169 | <td>1.121370</td></tr>
170 | <tr>
171 | <td>1</td>
172 | <td>EURUSD</td>
173 | <td>2015-05-01 00:01:00.001000</td>
174 | <td>EUR=</td>
175 | <td>1.120950</td></tr>
176 | <tr>
177 | <td>2</td>
178 | <td>EURUSD</td>
179 | <td>2015-05-01 00:02:00.001000</td>
180 | <td>EUR=</td>
181 | <td>1.121032</td></tr>
182 | <tr>
183 | <td>3</td>
184 | <td>EURUSD</td>
185 | <td>2015-05-01 00:03:00.001000</td>
186 | <td>EUR=</td>
187 | <td>1.121001</td></tr>
188 | <tr>
189 | <td>4</td>
190 | <td>EURUSD</td>
191 | <td>2015-05-01 00:04:00.001000</td>
192 | <td>EUR=</td>
193 | <td>1.120950</td></tr></table>
194 | 
195 | The dataframe returned is exactly the same layout as the table, though the pandas index is just the row number. If you want the index to be the timestamp, this has to be done explicitly:
196 | 
197 | ```python
198 | ccyA.set_index('gmt_timestamp')
199 | ```
200 | 
201 | *Large result sets*
202 | 
203 | By default the underlying python driver will switch to using paged-result sets if the number of returned rows is greater than 5,000 rows. This will not currently work with caspanda, because the results are not automatically returned by cassandra. The db 'waits' until the driver starts to request the results by page. To get around this you can increase the default select size:
204 | 
205 | ```python
206 | cpsession.default_fetch_size = 50000
207 | ```
208 | 
209 | However note that cassandra also has a default _server-side_ read timeout of 5 seconds. If you cannot retrieve all rows within this limit you will be timed out.
210 | 
211 | *Parallel sessions*
212 | 
213 | If you need to select basic data that does not really make sense in a dataframe (for instance a string of values to be re-used in another select), you can create another 'parallel' cassandra session, at the same time:
214 | 
215 | ```python
216 | from cassandra.cluster import Cluster
217 | cconnection = Cluster()
218 | csession = cconnection.connect()
219 | csession.set_keyspace('tr_data')
220 | cccys = csession.execute(select_ccys_distinct)
221 | # This returns a list of cassandra 'row-type'
222 | ccy_string = ''
223 | for row in cccys:
224 |     ccy_string = ccy_string + row.ccypair +','
225 | print ccy_string
226 | 'USDKRW,USDRUB,AEDUSD,USDTWD,USDMYR,USDARS,USDCHF,USDSAR,USDPEN,GBPUSD...'
227 | ```
and the results can be pulled directly from the response. You can use both in the same session, according to the type of results needed.
229 | 
230 | Installation
231 | ----
232 | `$ python setup.py install` or `$ pip install -e .`
233 | You'll also need Cassandra:
234 | 
235 | `$ brew install cassandra`
236 | 
237 | 
238 | 
239 | Tests
240 | -----
241 | There are some unit and integration tests in the `caspanda/tests/` directory.
242 | 
243 | Run from the command line with
244 | 
245 | `$ nosetests`
246 | 
247 | 
248 | TODO
249 | ----  
250 | * `grep -r TODO .`
251 | 
252 | 
253 | 
254 | 
255 | 
256 | 
257 | 
258 | 
259 | 
260 | 


--------------------------------------------------------------------------------