├── caspanda ├── __init__.py ├── exceptions.py ├── spots.py ├── utils.py ├── tests │ ├── test_input.py │ └── test_describe.py ├── bear.py ├── metabear.py └── bamboo.py ├── .gitignore ├── .travis.yml ├── setup.py ├── bin ├── cleanup └── example.py ├── LICENSE └── README.md /caspanda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | caspanda.egg-info 3 | build/* 4 | dist/* 5 | -------------------------------------------------------------------------------- /caspanda/exceptions.py: -------------------------------------------------------------------------------- 1 | ################################################### 2 | #################[ Module: Exceptions ]############ 3 | ################################################### 4 | """ 5 | Custom exceptions for Caspanda. 6 | """ 7 | 8 | 9 | class InputError(Exception): 10 | pass -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | - "3.2" 6 | - "3.3" 7 | - "3.4" 8 | 9 | before_install: 10 | - pip install codecov 11 | - pip install coverage 12 | - pip install pytest-cov 13 | 14 | services: 15 | - cassandra 16 | 17 | install: 18 | - pip install -e . 
19 | 20 | # command to run tests 21 | script: 22 | - if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]] ; then coverage run --include='caspanda/*' $(which py.test) ; coverage report; fi 23 | 24 | 25 | after_success: 26 | - if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; codecov ; fi 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools import find_packages 3 | 4 | 5 | setup( 6 | name='caspanda', 7 | version='0.0.0.4', 8 | packages=find_packages(), 9 | install_requires=[ 10 | 'cassandra-driver', 11 | 'numpy', 12 | 'pandas', 13 | 'nose', 14 | 'blist', 15 | 'future' 16 | ], 17 | url='', 18 | license='MIT', 19 | author='Aaron Benz', 20 | author_email='aaron.benz@accenture.com', 21 | description='Cassandra Wrapper for Easy Panda DataFrame Access', 22 | include_package_data=True, 23 | ) 24 | -------------------------------------------------------------------------------- /bin/cleanup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Delete .pyc files and empty directories from root of project 4 | cd ./$(git rev-parse --show-cdup) 5 | 6 | # Clean-up 7 | find . -name ".DS_Store" -delete 8 | 9 | NUM_PYC_FILES=$( find . -name "*.pyc" | wc -l | tr -d ' ' ) 10 | if [ $NUM_PYC_FILES -gt 0 ]; then 11 | find . -name "*.pyc" -delete 12 | printf "\e[00;31mDeleted $NUM_PYC_FILES .pyc files\e[00m\n" 13 | fi 14 | 15 | NUM_EMPTY_DIRS=$( find . -type d -empty | wc -l | tr -d ' ' ) 16 | if [ $NUM_EMPTY_DIRS -gt 0 ]; then 17 | find . 
-type d -empty -delete 18 | printf "\e[00;31mDeleted $NUM_EMPTY_DIRS empty directories\e[00m\n" 19 | fi 20 | -------------------------------------------------------------------------------- /caspanda/spots.py: -------------------------------------------------------------------------------- 1 | """ 2 | A Panda has spots, and so does data. This is meant to define ways to more easily view caspanda data 3 | """ 4 | 5 | from cassandra.metadata import TableMetadata, KeyspaceMetadata 6 | try: 7 | from exceptions import SyntaxError 8 | except ImportError: 9 | pass 10 | 11 | def describe(x): 12 | """ 13 | Given a TableMetaData or list of KeyspaceMetaData, it will return a description of all of the tables in that Keyspace. Given a 14 | TableMetaData or a list of TableMetaData, it will return the description of all of them 15 | :param x: A list or a single keyspace/table 16 | :return: A json description of the tables 17 | """ 18 | 19 | raise(SyntaxError) 20 | pass 21 | 22 | def _describe_table(x): 23 | """ 24 | Describes a single TableMetaData table 25 | :param x: TableMetaData 26 | :return: dict describing x layout 27 | """ 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Aaron Benz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
def paste(x, sep=", "):
    """
    Join the strings in ``x`` into a single string separated by ``sep``.

    :param x: iterable of strings (e.g. column names or ``?`` placeholders)
    :param sep: separator inserted between consecutive items
    :return: the joined string; ``""`` when ``x`` is empty
    :raises TypeError: if an item of ``x`` is not a string
    """
    # str.join only places the separator *between* items.  The earlier
    # append-then-strip(sep) approach treated ``sep`` as a set of characters
    # and also removed legitimate leading/trailing characters of the items.
    return sep.join(x)


def print_ls(ls, ident='', braces=1):
    """
    Render a nested list as text, one value per line, indenting one tab per nesting level.

    :param ls: iterable whose items are either values or nested lists
    :param ident: indentation prefix used for the values at this level
    :param braces: nesting depth; passed along on recursion but not otherwise used
    :return: the rendered text (each value is followed by a newline)
    """
    pieces = []
    for value in ls:
        if isinstance(value, list):
            pieces.append(print_ls(value, ident + '\t', braces + 1))
        else:
            # wrap in a 1-tuple so tuple values are formatted instead of
            # being unpacked as format arguments
            pieces.append(ident + '%s' % (value,) + '\n')
    return "".join(pieces)


def is_instance_multiple(x, obj_class):
    """
    Validate the arguments for a multi-object isinstance check.

    NOTE(review): only the argument validation exists so far; no per-object
    isinstance check is performed yet.

    :param x: objects (must be a list)
    :param obj_class: class of object (must be a list)
    :return: None
    :raises AssertionError: if either argument is not a list
    """
    assert isinstance(x, list)
    assert isinstance(obj_class, list)
result_set._current_rows 54 | 55 | self.assertEqual(len(self.cf), 3) 56 | self.assertIsInstance(self.cf, CassandraFrame) 57 | self.assertEqual(self.frame.session, self.cf.session) 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /bin/example.py: -------------------------------------------------------------------------------- 1 | ################################################### 2 | #################[ Script: Example ]################ 3 | ################################################### 4 | """ 5 | Start a cassandra cluster and demonstrate inserting a CassandraFrame. 6 | """ 7 | from caspanda.bear import CasPanda 8 | from caspanda.bamboo import CassandraFrame 9 | 10 | cl = CasPanda() 11 | session = cl.connect() 12 | session.execute("""CREATE KEYSPACE IF NOT EXISTS tests WITH REPLICATION = { 'class' : 'SimpleStrategy', 13 | 'replication_factor' : 1 };""") 14 | session.set_keyspace("tests") 15 | session.execute("""CREATE TABLE IF NOT EXISTS sold_cars ( 16 | make text, 17 | state text, 18 | day timestamp, 19 | event_time timestamp, 20 | dealership text, 21 | salesman text, 22 | year int, 23 | account_lead text static, 24 | distributor_lead text static, 25 | PRIMARY KEY ((make, state), day, event_time));""") 26 | session.execute("""CREATE TABLE IF NOT EXISTS albums( 27 | id text PRIMARY KEY, 28 | car text, 29 | color text, 30 | owner text, 31 | passengers set, 32 | data blob 33 | );""") 34 | 35 | cols = ["id","car","color","owner"] 36 | 37 | session = cl.connect("tests") 38 | 39 | #df = pd.DataFrame(range(1,5), columns=["a"]) 40 | #tmp = CassandraFrame(np.random.randn(10, 2), columns=["id",""], session = session, table="albums") 41 | 42 | tmp = CassandraFrame([["VIN1", "ford", "black", "frank"], ["VIN2", "cyrsler", "blue", "chris"], ["VIN3", "honda", "red", "harry"]], 43 | columns = cols, session=session, table="albums") 44 | tmp.create_cql_insert() 45 | tmp.insert_async() 46 | 47 | print 
"Now see that the data was inserted" 48 | session.execute("""SELECT id, car, color, owner FROM tests.albums""") 49 | 50 | print "The description of tests.albumns:" 51 | print cl.keyspaces["tests"].tables["albums"] 52 | print "As opposed to this:" 53 | print cl.metadata.keyspaces["tests"].tables["albums"].export_as_string() 54 | 55 | print "Another comparison" 56 | print cl.keyspaces["tests"].tables["sold_cars"] 57 | print "As opposed to this:" 58 | print cl.metadata.keyspaces["tests"].tables["sold_cars"].export_as_string() 59 | 60 | 61 | cl.shutdown() 62 | 63 | 64 | 65 | 66 | #session.execute("DROP TABLE albums;") 67 | 68 | 69 | -------------------------------------------------------------------------------- /caspanda/bear.py: -------------------------------------------------------------------------------- 1 | ################################################### 2 | #################[ Module: Base ]################## 3 | ################################################### 4 | """ 5 | Class CasPanda, which subclasses cassandra.cluster.Cluster 6 | and provides an interface between pandas and Cassandra. 7 | """ 8 | from cassandra.cluster import Cluster 9 | from caspanda.metabear import ColumnMeta, KeyspaceMeta, TableMeta 10 | from cassandra.query import dict_factory 11 | 12 | from caspanda.bamboo import CassandraFrame 13 | 14 | from future.utils import iteritems 15 | 16 | #TODO: Add describe function to name any keyspace or keyspace + table(s) to utilize MetaTable.describe function 17 | class CasPanda(Cluster): 18 | """ 19 | Interface for pandas and Cassandra. 20 | """ 21 | keyspaces = None # contains all of the MetaKeyspaces info 22 | 23 | 24 | def __init__(self, *args, **kwargs): 25 | 26 | super(CasPanda, self).__init__(*args, **kwargs) 27 | def connect(self, kp=None): 28 | """ 29 | Create `cassandra.cluster.Cluster` session, 30 | and patch `session.row_factory` with `self.panda_factory`. 
31 | 32 | :return: Session object 33 | """ 34 | 35 | self.session = super(CasPanda, self).connect(kp) 36 | self.session.row_factory = self.panda_factory 37 | if self.keyspaces is None: 38 | self._sync_metadata(kp) 39 | 40 | return self.session 41 | 42 | def panda_factory(self, colnames, rows): 43 | """ 44 | Returns Rows in a Panda DataFrame 45 | :param rows: values selected in Select statement 46 | :param colnames: column names selected 47 | :return: Panda DataFrame 48 | """ 49 | if len(rows) == 0: 50 | return CassandraFrame(session=self.session) 51 | return CassandraFrame(rows, columns=colnames, session=self.session) 52 | 53 | def describe(self, kp=None, tb=None): 54 | 55 | pass 56 | 57 | def _sync_metadata(self, kp): 58 | """ 59 | Syncs all of the metadata keyspaces and their underlying tables and columns. Sets keyspace to be a dict 60 | of all MetaKeyspace in the connection by name:MetaKeyspace 61 | :return: 62 | """ 63 | 64 | self.keyspaces = {} 65 | #TODO: Turn off warnings when this occurs 66 | self.session.row_factory = dict_factory 67 | 68 | #gets all of the column data for all tables/keyspaces 69 | result = self.session.execute("""SELECT keyspace_name, columnfamily_name, column_name, component_index, index_name, 70 | index_options, index_type, type as cql_type, validator FROM system.schema_columns""") 71 | 72 | 73 | cols = [ColumnMeta(**row) for row in result] 74 | for i in cols: 75 | #create keyspace if not already exists 76 | if self.keyspaces.get(i.keyspace) is None: 77 | self.keyspaces.update({i.keyspace:KeyspaceMeta(i.keyspace)}) 78 | 79 | #add table if not already exists 80 | kp = self.keyspaces.get(i.keyspace) 81 | if kp.tables.get(i.table) is None: 82 | kp.tables.update({i.table:TableMeta(i.keyspace, i.table)}) 83 | 84 | #finally add/overwrite column into table 85 | tb = kp.tables.get(i.table) 86 | tb.columns[i.name] = i 87 | for kp_nm, kp in iteritems(self.keyspaces): 88 | for tbl_nm, tbl in iteritems(kp.tables): 89 | tbl.categorize_columns() 90 | 
def _component_order(column):
    """Sort key for columns: by component_index, with a missing (None) index first."""
    index = column.component_index
    return (index is not None, 0 if index is None else index)


class ColumnMeta(object):
    """
    Metadata describing one Cassandra column.

    The constructor parameters are named after the fields selected from the
    schema query so that a result row can be passed as keyword arguments.
    """
    # The keyspace the column belongs to
    keyspace = None

    # The table the column belongs to
    table = None

    # The name of the column
    name = None
    # TODO: add the rest of arguments

    def __init__(self, keyspace_name, columnfamily_name, column_name, component_index=None, index_name=None,
                 index_options=None, index_type=None, cql_type=None, validator=None):
        self.keyspace = keyspace_name
        self.table = columnfamily_name
        self.name = column_name
        self.component_index = component_index
        self.index_name = index_name
        self.index_options = index_options
        self.index_type = index_type
        self.cql_type = cql_type
        self.validator = validator

    def __repr__(self):
        """Return "<name> <cql typename> <kind>"; the kind is left blank for regular columns."""
        return "{0} {1} {2}".format(self.name, lookup_casstype(self.validator).typename,
                                    self.cql_type if self.cql_type != "regular" else "")


class TableMeta(object):
    """
    Metadata for one table: its keyspace, its name and a ``{column name: ColumnMeta}`` dict.
    """
    keyspace = None
    name = None
    columns = {}

    def __init__(self, keyspace_name, name, columns=None):
        """
        :param keyspace_name: name of the keyspace the table lives in
        :param name: table name
        :param columns: optional ``{column name: ColumnMeta}`` dict; a fresh dict is used when omitted
        """
        self.keyspace = keyspace_name
        self.name = name
        self.columns = {} if columns is None else columns

    def add_column(self, x):
        """
        Add (or overwrite) a column.

        :param x: column object; it is stored under ``x.name``
        """
        # columns is a dict keyed by column name, so store by key (dicts have no append)
        self.columns[x.name] = x

    def __repr__(self, *args, **kwargs):
        """Return the hierarchical text layout of the table's columns."""
        return print_ls(self.categorize_columns())

    def sort_columns(self, x, reverse=False):
        """
        Return the columns of ``x`` ordered by ``component_index``.

        :param x: iterable of column objects
        :param reverse: sort in descending order when True
        :return: new list of the same column objects
        """
        # Sorting on a key (rather than on (index, column) tuples) never compares
        # the column objects themselves, so equal or missing indexes cannot fail.
        return sorted(x, key=_component_order, reverse=reverse)

    def categorize_columns(self):
        """
        Split the columns by ``cql_type`` and build the nested layout used for display.

        Sets ``partition_cols``, ``clustering_cols`` (deepest level first),
        ``regular_cols`` and ``static_cols`` on the instance.

        :return: tuple ``(partition_cols, [clustering_layout, static_cols])`` where
                 ``clustering_layout`` nests each clustering column around the next
                 one, with the regular columns innermost
        """
        buckets = {"partition_key": [], "clustering_key": [], "regular": [], "static": []}
        for column in self.columns.values():
            bucket = buckets.get(column.cql_type)
            # columns with any other cql_type are not categorised
            if bucket is not None:
                bucket.append(column)

        self.partition_cols = self.sort_columns(buckets["partition_key"])
        self.clustering_cols = self.sort_columns(buckets["clustering_key"], reverse=True)
        self.regular_cols = buckets["regular"]
        self.static_cols = buckets["static"]

        # wrap from the innermost clustering column outwards
        cluster_str = self.regular_cols
        for column in self.clustering_cols:
            cluster_str = [column, cluster_str]

        return self.partition_cols, [cluster_str, self.static_cols]


#TODO utilize TableMeta.describe to implement the same thing for keyspaces
class KeyspaceMeta(object):
    """
    Metadata for one keyspace: its name and a ``{table name: TableMeta}`` dict.
    """
    name = None
    tables = {}
    # TODO: fill in the rest of the arguments for keyspace

    def __init__(self, name, tables=None):
        """
        :param name: keyspace name
        :param tables: optional ``{table name: TableMeta}`` dict; a fresh dict is used when omitted
        """
        self.name = name
        self.tables = {} if tables is None else tables

    def add_table(self, x):
        """
        Add (or overwrite) a table.

        :param x: table object; it is stored under ``x.name``
        """
        # tables is a dict keyed by table name, so store by key (dicts have no append)
        self.tables[x.name] = x
class BaseTestInput(unittest.TestCase):
    """
    Base class for the describe tests.

    Makes sure the ``tests`` keyspace and the ``sold_cars`` table exist, then
    connects ``self.cl`` so that its metadata includes them.
    """
    def setUp(self):
        super(BaseTestInput, self).setUp()

        # Create the schema with a throw-away cluster first.  CasPanda.connect()
        # syncs its keyspace metadata only on the first connect, so self.cl must
        # not connect until the keyspace and table are in place.
        bootstrap = CasPanda()
        try:
            session = bootstrap.connect()
            session.execute("""CREATE KEYSPACE IF NOT EXISTS tests WITH REPLICATION = { 'class' : 'SimpleStrategy',
                            'replication_factor' : 1 };""")
            session.set_keyspace("tests")
            session.execute("""CREATE TABLE IF NOT EXISTS sold_cars (
                make text,
                state text,
                day timestamp,
                event_time timestamp,
                dealership text,
                salesman text,
                year int,
                account_lead text static,
                distributor_lead text static,
                PRIMARY KEY ((make, state), day, event_time));""")
        finally:
            # release the bootstrap connections whether or not schema creation worked
            bootstrap.shutdown()

        self.cl = CasPanda()
        self.cl.connect()

    def tearDown(self):
        # close the connections opened in setUp so they are not leaked between tests
        self.cl.shutdown()
        super(BaseTestInput, self).tearDown()

#class TestDescribe(BaseTestInput):

#    def setUp(self):
#        super(TestDescribe, self).setUp()
#
#    def test_single_table(self):
#        out = "make, state\n\t\tday\n\t\t\tevent_time\n\t\t\t\tdealership\n\t\t\t\tyear\n\t\t\t\tsalesman\n\t\tdistributor_lead\n\t\taccount_lead\n"
#        self.assertEqual(out, self.cl.keyspaces["tests"].tables["sold_cars"])

class TestColumnStructure(BaseTestInput):
    """Checks the keyspace/table/column metadata built for ``tests.sold_cars``."""

    def test_single_table(self):
        """The metadata is a dict of KeyspaceMeta holding dicts of TableMeta holding ColumnMeta."""
        self.assertIsInstance(self.cl.keyspaces, dict)
        self.assertIsInstance(self.cl.keyspaces["tests"], KeyspaceMeta)
        self.assertIsInstance(self.cl.keyspaces["tests"].tables, dict)

        tb = self.cl.keyspaces["tests"].tables["sold_cars"]

        self.assertIsInstance(tb, TableMeta)
        self.assertEqual(len(tb.columns), 9)
        self.assertIsInstance(tb.columns, dict)
        self.assertIsInstance(tb.columns["account_lead"], ColumnMeta)

        self.assertEqual(tb.columns["account_lead"].cql_type, "static")

    def test_columns(self):
        """One column of each kind carries the expected type, index, keyspace, name and table."""
        tb = self.cl.keyspaces["tests"].tables["sold_cars"]
        col_day = tb.columns["day"]
        self.assertEqual(col_day.cql_type, "clustering_key")
        self.assertEqual(col_day.component_index, 0)
        self.assertEqual(col_day.keyspace, "tests")
        self.assertEqual(col_day.name, "day")
        self.assertEqual(col_day.table, "sold_cars")

        col_state = tb.columns["state"]
        self.assertEqual(col_state.cql_type, "partition_key")
        self.assertEqual(col_state.component_index, 1)
        self.assertEqual(col_state.keyspace, "tests")
        self.assertEqual(col_state.name, "state")
        self.assertEqual(col_state.table, "sold_cars")

        col_state = tb.columns["salesman"]
        self.assertEqual(col_state.cql_type, "regular")
        self.assertEqual(col_state.component_index, 2)
        self.assertEqual(col_state.keyspace, "tests")
        self.assertEqual(col_state.name, "salesman")
        self.assertEqual(col_state.table, "sold_cars")


        col_state = tb.columns["account_lead"]
        self.assertEqual(col_state.cql_type, "static")
        self.assertEqual(col_state.component_index, 2)
        self.assertEqual(col_state.keyspace, "tests")
        self.assertEqual(col_state.name, "account_lead")
        self.assertEqual(col_state.table, "sold_cars")
class CassandraFrame(pd.DataFrame):
    """
    Wrapper for pandas.DataFrame.

    Implements convenience methods for get and put operations to Cassandra,
    and handles MultiIndexing of CQL output.

    Keeps track of column name hierarchy in the self._prepared_columns and self._cql_columns.
    """
    statement_input = None
    _prepared_columns = None
    _insert_index = None
    # Declared on the class so that assigning the column list in __init__ updates an
    # existing attribute.  NOTE(review): recent pandas versions appear to warn when a
    # new list-valued attribute is first set on a DataFrame - confirm.
    _cql_columns = None


    def __init__(self, data=None, index=None, columns=None, cql=None, session=None, table=None, dtype=None,
                 copy=False, cql_columns=None, *args, **kwargs):
        """
        :param data, index, columns, dtype, copy: forwarded to pandas.DataFrame
        :param cql: optional CQL text associated with the frame; stored as ``self.cql``
        :param session: a cassandra ``Session`` or None
        :param table: name of the table the frame is inserted into
        :param cql_columns: list of CQL column names; defaults to the frame's own columns
        """
        super(CassandraFrame, self).__init__(data, index=index, columns=columns, dtype=dtype, copy=copy, *args, **kwargs)

        self.set_session(session)

        self.table = table
        # ``cql`` is a named parameter, so it can never show up in **kwargs
        self.cql = cql
        self.insert_queue = queue.Queue()

        self.set_cql_columns(cql_columns)


    def put(self, table=None):
        """
        TODO: (???)

        For now this only records ``table`` as the target table when one is given.
        """
        if table is not None:
            self.table = table


    def create_cql_insert(self):
        """
        Given a table, prepares a statement to allow the dataframe to be inserted row by row into cassandra.

        Sets statement_input to be the prepared statement and records the columns it was prepared for.

        :return: None
        :raises AssertionError: if no session or no table has been set
        """
        assert isinstance(self.session, Session)
        assert self.table is not None

        # one placeholder per *CQL* column, so names and placeholders always line up
        statement = ("INSERT INTO " + self.table + "(" + paste(self._cql_columns) + ") VALUES (" +
                     paste(["?"] * len(self._cql_columns)) + ");")

        self.statement_input = self.session.prepare(statement)

        self._prepared_columns = self._cql_columns

        return


    def insert_sync(self):
        """
        Insert rows synchronously into Cassandra.

        The rows are bound to the prepared statement and executed one by one.

        :raises AssertionError: if create_cql_insert() has not been called for the current columns
        """
        assert self._cql_columns == self._prepared_columns
        assert self.statement_input is not None, 'Statement_input not defined. Use create_cql_insert().'

        for index, row in self.loc[:, self._prepared_columns].iterrows():
            self.session.execute(self.statement_input.bind(row))

        return


    def insert_async(self):
        """
        Insert rows asynchronously into Cassandra.

        Every row is submitted with ``execute_async``; failures are logged by a callback.

        TODO: distinguish from the chained callback approach in insert_callback() and clean up.

        :raises AssertionError: if create_cql_insert() has not been called for the current columns
        """
        assert self._cql_columns == self._prepared_columns
        assert self.statement_input is not None, 'Statement_input not defined. Use create_cql_insert().'

        def handle_success(rows):
            pass

        def handle_error(exception):
            logging.error("Failed to send data info: %s", exception)

        # An explicit loop: map() is lazy on Python 3, so using it for its side
        # effects would never submit anything.  Iterating the rows (as insert_sync
        # does) also works for frames whose index is not 0..n-1.
        for index, row in self.loc[:, self._prepared_columns].iterrows():
            future = self.session.execute_async(self.statement_input.bind(row))
            future.add_callbacks(handle_success, handle_error)

        return


    def insert_callback(self):
        """
        TODO: code the upper limit on concurrent futures, clean up (and deprecate insert_async??)

        Put row positions into a queue; each completed insert (successful or not)
        takes the next position from the queue and submits that row, until the
        queue is exhausted.

        :return: the result of the first insert, or None when the frame has no rows
        :raises AssertionError: if create_cql_insert() has not been called for the current columns
        """
        assert self._cql_columns == self._prepared_columns
        assert self.statement_input is not None, 'Statement_input not defined. Use create_cql_insert().'

        # explicit loop: a lazy map() would leave the queue empty on Python 3
        for position in range(len(self)):
            self.insert_queue.put_nowait(position)

        def handle_success(rows):
            """
            Submit the next queued row, or stop when the queue is exhausted.
            """
            try:
                # get_nowait() raises Empty on an exhausted queue; a plain get()
                # would block forever instead
                i = self.insert_queue.get_nowait()
            except queue.Empty:
                return None

            # queue entries are positions, so address the row positionally
            row = self.iloc[i]

            # format() rather than "+" so non-string index labels can be shown
            print("Inserting {0} ...".format(row.name))
            print("-----------------------------------------------")

            future = self.session.execute_async(self.statement_input.bind(row[self._prepared_columns]))

            future.add_callbacks(handle_success, handle_error)  # intentional tail recursion!
            # need hard upper limit on number of concurrent futures
            # something like: for i in range(min(120, self.__len__())):
            return future

        def handle_error(exception):
            """
            Log error and recurse.
            """
            logging.error("Failed to send data info: %s", exception)
            return handle_success(None)

        future = handle_success(None)
        if future is None:
            # nothing was queued (empty frame), so there is no result to wait for
            return None

        return future.result()


    def get_cql_columns(self):
        """Return the list of CQL column names used for inserts."""
        return self._cql_columns

    #TODO Redo these operations to run off of the meta data just built from metabear.py
    def set_cql_columns(self, x=None):
        """
        Set the CQL column names used for inserts.

        :param x: list of column names, or None to use the frame's own columns
        :raises AssertionError: if ``x`` is given but is not a list
        """
        if x is None:
            self._cql_columns = self.columns.tolist()
        else:
            assert isinstance(x, list)
            self._cql_columns = x

        return


    def set_session(self, session):
        """
        Setter method for self.session.

        Pass a session object or None.

        :return: None
        :raises AssertionError: if ``session`` is neither None nor a cassandra Session
        """
        if session is None:
            self.session = None

        else:
            assert isinstance(session, Session), "Got non-session, type: {}".format(type(session))
            self.session = session

        return
28 | 29 | Main Features 30 | ---- 31 | Here are a few of the things caspanda currently does: 32 | 33 | - Puts queried data into a Pandas DataFrame 34 | - Stores data into Cassandra using CassandraFrames (uses sync and async methods) 35 | - Describes the structure of Cassandra Tables in a hierarchical way 36 | 37 | Usage 38 | ---- 39 | One of the main objectives of **caspanda** is to make Cassandra easy to understand and use. Unfortunately, 40 | many people are misled about, or lack an understanding of, how Cassandra actually stores its data. The example below is meant to 41 | give you a conceptual understanding of the hierarchy that the data is really stored in. 42 | 43 | The example table `sold_cars` demonstrates a data model that might exist if you wanted to store the information about 44 | sold cars. It stores information about a sale according to the *make* and *state* of the car, and then 45 | stores the information by day and time. So, the query pattern would specify the *make* and *state*, and then give you 46 | the ability to choose a date range. 47 | 48 | Conceptually this might make sense, but the way in which it is written down in CQL is often difficult to grasp for anyone 49 | not seasoned in Cassandra. So, we have tried to make this much simpler. 
First, connect to Cassandra and create the 50 | table `sold_cars`: 51 | ```python 52 | from caspanda.bear import CasPanda 53 | 54 | cl = CasPanda() 55 | session = cl.connect() 56 | session.execute("""CREATE KEYSPACE IF NOT EXISTS tests WITH REPLICATION = { 'class' : 'SimpleStrategy', 57 | 'replication_factor' : 1 };""") 58 | session.set_keyspace("tests") 59 | session.execute("""CREATE TABLE IF NOT EXISTS sold_cars ( 60 | make text, 61 | state text, 62 | day timestamp, 63 | event_time timestamp, 64 | dealership text, 65 | salesman text, 66 | year int, 67 | account_lead text static, 68 | distributor_lead text static, 69 | PRIMARY KEY ((make, state), day, event_time));""") 70 | ``` 71 | 72 | Now that the table has been created, let's visualize it. This breaks down the names of the columns in a hierarchical 73 | fashion that demonstrates how the data is actually stored. So, for example, the *make* and *state* columns define a group of data. 74 | That group is ordered and stored by *day*, and then by *event_time*. Then, for each *event_time*, 75 | there are fields for a *dealership*, *year*, and *salesman*. Additionally, there are two single-value (static) columns stored on the 76 | same level as *day*: *distributor_lead* and *account_lead*. 77 | 78 | Said differently, for every *make* and *state*, there is one *distributor_lead* and one *account_lead*. 
Also, for every 79 | *make* and *state*, there can be many combinations of *dealership*, *year*, and *salesman*, each identified (indexed) by a *day* 80 | and then by an *event_time*. 81 | 82 | ```python 83 | 84 | print cl.keyspaces["tests"].tables["sold_cars"] 85 | 86 | # make text partition_key 87 | # state text partition_key 88 | # day timestamp clustering_key 89 | # event_time timestamp clustering_key 90 | # dealership text 91 | # year int 92 | # salesman text 93 | # distributor_lead text static 94 | # account_lead text static 95 | ``` 96 | 97 | The traditional method for viewing this in CQL is this: 98 | 99 | ```python 100 | 101 | print cl.metadata.keyspaces["tests"].tables["sold_cars"].export_as_string() 102 | 103 | #CREATE TABLE tests.sold_cars ( 104 | # make text, 105 | # state text, 106 | # day timestamp, 107 | # event_time timestamp, 108 | # account_lead text static, 109 | # dealership text, 110 | # distributor_lead text static, 111 | # salesman text, 112 | # year int, 113 | # PRIMARY KEY ((make, state), day, event_time) 114 | ``` 115 | 116 | With that being said, please feel free to reach out to us with comments, suggestions, or questions. 117 | 118 | There are also some more examples for calling data from Cassandra and inserting it back using only a Pandas Dataframe (which 119 | we call a CassandraFrame) in `bin/example.py`. 120 | 121 | Example of using Caspanda for selecting data 122 | ---- 123 | Running a select from a Cassandra table will automatically return a Pandas Dataframe, even for simple selects. 124 | Let's say you have a keyspace called `tr_data` and you create one table `tr_minute` with the following columns: 125 | 126 | ``` 127 | cqlsh:tr_data> create table tr_minute ( 128 | ccypair text, 129 | gmt_timestamp timestamp, 130 | mid_rate double, 131 | ric text static, 132 | PRIMARY KEY (ccypair, gmt_timestamp) ); 133 | ``` 134 | Connect to the Cassandra database as usual, then switch to the `tr_data` keyspace. 
Any keyword arguments controlling the connection, such as `port` or `compression`, are passed to the initial `CasPanda()` call. 135 | ```python 136 | from caspanda.bear import CasPanda 137 | cl = CasPanda(contact_points=['105.150.100.25',], port=9042, compression=True) 138 | cpsession = cl.connect() 139 | cpsession.set_keyspace('tr_data') 140 | select_ccys_distinct = """select distinct ccypair from tr_minute""" 141 | ccys = cpsession.execute(select_ccys_distinct) 142 | ccys.head() 143 | ``` 144 | 145 | 146 | 147 | 148 | 149 | 150 |
|   | ccypair |
|---|---------|
| 0 | USDKRW |
| 1 | USDRUB |
| 2 | AEDUSD |
| 3 | USDTWD |
| 4 | USDMYR |
151 | 152 | Now select some time-series data from the table: 153 | 154 | ```python 155 | select_minute_wlimit = """select ccypair,gmt_timestamp,ric,mid_rate from tr_minute 156 | where ccypair = 'EURUSD' and gmt_timestamp >= '2015-05-01 00:00:00+0000' 157 | and gmt_timestamp < '2015-06-01 00:00:00+0000' LIMIT 5""" 158 | ccyA = cpsession.execute(select_minute_wlimit) 159 | ccyA.head() 160 | ``` 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 |
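|   | ccypair | gmt_timestamp | ric | mid_rate |
|---|---------|---------------|-----|----------|
| 0 | EURUSD | 2015-05-01 00:00:00.001000 | EUR= | 1.121370 |
| 1 | EURUSD | 2015-05-01 00:01:00.001000 | EUR= | 1.120950 |
| 2 | EURUSD | 2015-05-01 00:02:00.001000 | EUR= | 1.121032 |
| 3 | EURUSD | 2015-05-01 00:03:00.001000 | EUR= | 1.121001 |
| 4 | EURUSD | 2015-05-01 00:04:00.001000 | EUR= | 1.120950 |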
194 | 195 | The dataframe returned has exactly the same layout as the table, though the pandas index is just the row number. If you want the index to be the timestamp, this has to be done explicitly (note that `set_index` returns a new dataframe, so assign the result if you want to keep it): 196 | 197 | ```python 198 | ccyA.set_index('gmt_timestamp') 199 | ``` 200 | 201 | *Large result sets* 202 | 203 | By default the underlying python driver will switch to using paged result sets if the number of returned rows exceeds 5,000. This will not currently work with caspanda, because the results are not automatically returned by cassandra. The db 'waits' until the driver starts to request the results by page. To get around this you can increase the default select size: 204 | 205 | ```python 206 | cpsession.default_fetch_size = 50000 207 | ``` 208 | 209 | However, note that cassandra also has a default _server-side_ read timeout of 5 seconds. If you cannot retrieve all rows within this limit you will be timed out. 210 | 211 | *Parallel sessions* 212 | 213 | If you need to select basic data that does not really make sense in a dataframe (for instance a string of values to be re-used in another select), you can create another 'parallel' cassandra session, at the same time: 214 | 215 | ```python 216 | from cassandra.cluster import Cluster 217 | cconnection = Cluster() 218 | csession = cconnection.connect() 219 | csession.set_keyspace('tr_data') 220 | cccys = csession.execute(select_ccys_distinct) 221 | # This returns a list of cassandra 'row-type' 222 | ccy_string = '' 223 | for row in cccys: 224 | ccy_string = ccy_string + row.ccypair +',' 225 | print ccy_string 226 | 'USDKRW,USDRUB,AEDUSD,USDTWD,USDMYR,USDARS,USDCHF,USDSAR,USDPEN,GBPUSD...' 227 | ``` 228 | The results can then be pulled directly from the response. You can use both kinds of session side by side, according to the type of results needed. 
229 | 230 | Installation 231 | ---- 232 | `$ python setup.py install` or `$ pip install -e .` 233 | You'll also need Cassandra: 234 | 235 | `$ brew install cassandra` 236 | 237 | 238 | 239 | Tests 240 | ----- 241 | There are some unit and integration tests in the `caspanda/tests/` directory. 242 | 243 | Run from the command line with 244 | 245 | `$ nosetests` 246 | 247 | 248 | TODO 249 | ---- 250 | * `grep -r TODO .` 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | --------------------------------------------------------------------------------