├── lib
│   └── cassandra-composite-type-0.0.1.jar
├── .gitignore
├── src
│   ├── test
│   │   ├── resources
│   │   │   ├── log4j.properties
│   │   │   └── cassandra.yaml
│   │   └── java
│   │       └── indexedcollections
│   │           └── IndexTest.java
│   └── main
│       └── java
│           └── indexedcollections
│               └── IndexedCollections.java
├── README
└── pom.xml

--------------------------------------------------------------------------------
/lib/cassandra-composite-type-0.0.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edanuff/CassandraIndexedCollections/HEAD/lib/cassandra-composite-type-0.0.1.jar

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.DS_Store
3 | hector.iml
4 | releases
5 | target
6 | tmp
7 | bin
8 | .classpath
9 | .project
10 | .settings
11 | out
12 | *.svn
13 | *.ipr
14 | *.iws
15 | DS_Store
16 | 
17 | /.DS_Store

--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | # for production, you should probably set the root to INFO
18 | # and the pattern to %c instead of %l. (%l is slower.)
19 | 
20 | # output messages to stdout
21 | log4j.rootLogger=INFO,stdout
22 | 
23 | # stdout
24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
25 | #log4j.appender.stdout.layout=org.apache.log4j.SimpleLayout
26 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
27 | log4j.appender.stdout.layout.ConversionPattern=%d %p (%t) [%c] - %m%n
28 | 
29 | log4j.category.org.apache=ERROR, stdout
30 | 

--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | Indexed Collections for Cassandra
2 | 
3 | NOTE: This has been updated to use the new built-in composite types in Cassandra 0.8.1
4 | 
5 | You will need to use Hector 0.8.0-2-SNAPSHOT or later and Cassandra 0.8.1 or later.
6 | 
7 | This is an implementation of the indexing technique described here:
8 | 
9 | http://www.anuff.com/2010/07/secondary-indexes-in-cassandra.html
10 | 
11 | The original article describes the use of a custom composite column comparator. A
12 | version of this comparator has recently been added to the latest version of Cassandra,
13 | meaning that it's no longer necessary to install anything on the Cassandra instance
14 | to handle composite types.
15 | 
16 | This is a simplified version of a more complex indexing scheme that's been in use
17 | for some time now as part of a large project. However, this simplified implementation was
18 | largely created from scratch and hasn't been extensively tested.
19 | 
20 | This indexing, as opposed to Cassandra's built-in secondary indexes, is completely
21 | dynamic. It's possible to create new indexes at any time, and the index value types don't
22 | have to be predefined, which makes it easier to use with JSON data.
23 | 
24 | One advantage of this indexing technique is that it combines relationships with mini-indexes,
25 | so that you can have a user who is friends with 5 other users and then be able to search the
26 | user's friend list. In this example, the user is what is referred to in the
27 | code as the "container" and the other users are "items".
28 | 
29 | For examples of use, look at the class IndexTest.
30 | 
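To make the container/item vocabulary concrete before the code, here is a minimal, hypothetical sketch of the friend-list example above, written against the API in IndexedCollections (see src/main below). It assumes an open Hector Keyspace named ko and the four default column families created the way IndexTest.setupColumnFamilies does; the identifiers user, friend, and ko are illustrative, not part of the repo.

    // A user (the container) owns a "friends" collection of other users (items).
    UUID user = IndexedCollections.newTimeUUID();
    UUID friend = IndexedCollections.newTimeUUID();
    ContainerCollection<UUID> friends = new ContainerCollection<UUID>(user, "friends");

    IndexedCollections.addItemToCollection(ko, friends, friend,
            IndexedCollections.defaultCFSet, IndexedCollections.ue);

    // Writing a column through setItemColumn also indexes it in every
    // collection that contains the item -- no index needs to be predefined.
    Set<ContainerCollection<UUID>> in = Collections.singleton(friends);
    IndexedCollections.setItemColumn(ko, friend, "name", "alice", in,
            IndexedCollections.defaultCFSet, IndexedCollections.ue,
            IndexedCollections.se, IndexedCollections.se, IndexedCollections.ue);

    // The friend list is now searchable by any indexed column.
    List<UUID> matches = IndexedCollections.searchContainer(ko, friends,
            "name", "alice", null, 100, false, IndexedCollections.defaultCFSet,
            IndexedCollections.ue, IndexedCollections.ue, IndexedCollections.se);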
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | 
3 | 4.0.0
4 | cassandra-indexed-collections
5 | cassandra-indexed-collections
6 | 0.0.1
7 | CassandraIndexedCollections
8 | 
9 | 
10 | 
11 | org.apache.maven.plugins
12 | maven-surefire-plugin
13 | 2.6
14 | 
15 | 
16 | 
17 | ${basedir}/src/test/conf
18 | 
19 | always
20 | -Xmx512M -Xms512M
21 | 
22 | 
23 | 
24 | org.apache.maven.plugins
25 | maven-compiler-plugin
26 | 2.3.2
27 | 
28 | 1.6
29 | 1.6
30 | true
31 | true
32 | true
33 | true
34 | 
35 | 
36 | 
37 | org.apache.maven.plugins
38 | maven-install-plugin
39 | 2.3.1
40 | 
41 | 
42 | install cassandra-composite-types
43 | initialize
44 | 
45 | install-file
46 | 
47 | 
48 | jar
49 | cassandra-composite-type
50 | cassandra-composite-type
51 | 0.0.1
52 | ${basedir}/lib/cassandra-composite-type-0.0.1.jar
53 | 
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 
60 | 
61 | org.apache.cassandra
62 | cassandra-all
63 | 0.8.1
64 | jar
65 | 
66 | 
67 | org.apache.cassandra
68 | cassandra-javautils
69 | 0.7.0
70 | test
71 | 
72 | 
73 | me.prettyprint
74 | hector-core
75 | 0.8.0-2-SNAPSHOT
76 | 
77 | 
78 | com.github.stephenc.eaio-uuid
79 | uuid
80 | 3.2.0
81 | 
82 | 
83 | junit
84 | junit
85 | 4.8.1
86 | test
87 | 
88 | 
89 | log4j
90 | log4j
91 | 1.2.14
92 | 
93 | 
94 | org.slf4j
95 | slf4j-api
96 | 1.6.1
97 | 
98 | 
99 | org.slf4j
100 | slf4j-log4j12
101 | 1.6.1
102 | 
103 | 
104 | org.perf4j
105 | perf4j
106 | 0.9.12
107 | 
108 | 
109 | 
110 | 
111 | codehaus
112 | codehaus
113 | http://repository.codehaus.org/
114 | 
115 | 
116 | nexus-snapshots
117 | Sonatype Nexus Snapshots
118 | http://oss.sonatype.org/content/repositories/snapshots
119 | 
120 | true
121 | 
122 | 
123 | 
124 | 

--------------------------------------------------------------------------------
/src/test/java/indexedcollections/IndexTest.java:
--------------------------------------------------------------------------------
1 | package indexedcollections;
2 | 
3 | import static me.prettyprint.hector.api.beans.DynamicComposite.DEFAULT_DYNAMIC_COMPOSITE_ALIASES;
4 | import static me.prettyprint.hector.api.ddl.ComparatorType.DYNAMICCOMPOSITETYPE;
5 | import static me.prettyprint.hector.api.factory.HFactory.createColumn;
6 | import static me.prettyprint.hector.api.factory.HFactory.createKeyspace;
7 | import static me.prettyprint.hector.api.factory.HFactory.createMutator;
8 | import static me.prettyprint.hector.api.factory.HFactory.getOrCreateCluster;
9 | import static org.junit.Assert.assertEquals;
10 | import static org.junit.Assert.assertTrue;
11 | import indexedcollections.IndexedCollections.ContainerCollection;
12 | 
13 | import java.io.IOException;
14 | import java.util.ArrayList;
15 | import java.util.LinkedHashSet;
16 | import java.util.List;
17 | import java.util.Set;
18 | import java.util.UUID;
19 | 
20 | import me.prettyprint.cassandra.serializers.ByteBufferSerializer;
21 | import me.prettyprint.cassandra.serializers.BytesArraySerializer;
22 | import me.prettyprint.cassandra.serializers.DynamicCompositeSerializer;
23 | import me.prettyprint.cassandra.serializers.LongSerializer;
24 | import me.prettyprint.cassandra.serializers.StringSerializer;
25 | import me.prettyprint.cassandra.serializers.UUIDSerializer;
26 | import me.prettyprint.cassandra.service.ThriftCfDef;
27 | import me.prettyprint.cassandra.service.ThriftKsDef;
28 | import me.prettyprint.cassandra.testutils.EmbeddedServerHelper;
29 | import me.prettyprint.hector.api.Cluster;
30 | import me.prettyprint.hector.api.Keyspace;
31 | import me.prettyprint.hector.api.Serializer;
32 | 
33 | import org.apache.cassandra.config.ConfigurationException;
34 | import org.apache.cassandra.db.marshal.BytesType;
35 | import org.apache.cassandra.db.marshal.TimeUUIDType;
36 | import org.apache.cassandra.thrift.CfDef;
37 | import org.apache.cassandra.thrift.KsDef;
38 | import org.apache.log4j.Logger;
39 | import org.apache.thrift.transport.TTransportException;
40 | import org.junit.AfterClass;
41 | import org.junit.BeforeClass;
42 | import org.junit.Test;
43 | 
44 | /**
45 |  * Example class showing usage of IndexedCollections.
46 |  */
47 | public class IndexTest {
48 | 
49 |     private static final Logger logger = Logger.getLogger(IndexTest.class
50 |             .getName());
51 | 
52 |     public static final String KEYSPACE = "Keyspace";
53 | 
54 |     public static final StringSerializer se = new StringSerializer();
55 |     public static final ByteBufferSerializer be = new ByteBufferSerializer();
56 |     public static final DynamicCompositeSerializer ce = new DynamicCompositeSerializer();
57 |     public static final UUIDSerializer ue = new UUIDSerializer();
58 |     public static final LongSerializer le = new LongSerializer();
59 |     public static final BytesArraySerializer bae = new BytesArraySerializer();
60 | 
61 |     static EmbeddedServerHelper embedded;
62 | 
63 |     static Cluster cluster;
64 |     static Keyspace ko;
65 | 
66 |     @Test
67 |     public void testIndexes() throws IOException, TTransportException,
68 |             InterruptedException, ConfigurationException {
69 | 
70 |         // Create a container entity
71 | 
72 |         UUID g1 = createEntity("company");
73 | 
74 |         ContainerCollection<UUID> container = new ContainerCollection<UUID>(g1,
75 |                 "employees");
76 |         Set<ContainerCollection<UUID>> containers = new LinkedHashSet<ContainerCollection<UUID>>();
77 |         containers.add(container);
78 | 
79 |         // Create a set of items to add to the container
80 | 
81 |         UUID e1 = createEntity("employee");
82 |         UUID e2 = createEntity("employee");
83 |         UUID e3 = createEntity("employee");
84 | 
85 |         // Create container/item relationship
86 | 
87 |         addEntityToCollection(container, e1);
88 |         addEntityToCollection(container, e2);
89 |         addEntityToCollection(container, e3);
90 | 
91 |         // Check the entities in the container
92 | 
93 |         List<UUID> entities = getEntitiesInCollection(container);
94 |         assertEquals(3, entities.size());
95 | 
96 |         // Set name column values
97 | 
98 |         setEntityColumn(e1, "name", "bob", containers, se);
99 | 
100 |         setEntityColumn(e2, "name", "fred", containers, se);
101 | 
102 |         setEntityColumn(e3, "name", "bill", containers, se);
103 | 
104 |         // Do an exact match search for name column
105 | 
106 |         logger.info("SELECT WHERE name = 'fred'");
107 | 
108 |         List<UUID> results = searchContainer(container, "name", "fred");
109 | 
110 |         logger.info(results.size() + " results found");
111 | 
112 |         assertEquals(1, results.size());
113 |         assertTrue(results.get(0).equals(e2));
114 | 
115 |         logger.info("Result found is " + results.get(0));
116 | 
117 |         // Change the value of a name column and make sure the old value is no
118 |         // longer in the index
119 | 
120 |         setEntityColumn(e2, "name", "steve", containers, se);
121 | 
122 |         logger.info("SELECT WHERE name = 'fred'");
123 | 
124 |         results = searchContainer(container, "name", "fred");
125 | 
126 |         logger.info(results.size() + " results found");
127 | 
128 |         assertEquals(0, results.size());
129 | 
130 |         // Do a range search
131 | 
132 |         logger.info("SELECT WHERE name >= 'bill' AND name < 'c'");
133 | 
134 |         results = searchContainer(container, "name", "bill", "c", false);
135 | 
136 |         logger.info(results.size() + " results found");
137 | 
138 |         assertEquals(2, results.size());
139 | 
140 |         // Set column values for height
141 | 
142 |         setEntityColumn(e1, "height", (long) 5, containers, le);
143 | 
144 |         setEntityColumn(e2, "height", (long) 6, containers, le);
145 | 
146 |         setEntityColumn(e3, "height", (long) 7, containers, le);
147 | 
148 |         // Do a numeric exact match search for height
149 | 
150 |         logger.info("SELECT WHERE height = 6");
151 | 
152 |         results = searchContainer(container, "height", 6);
153 | 
154 |         logger.info(results.size() + " results found");
155 | 
156 |         assertEquals(1, results.size());
157 | 
158 |         // Do a numeric range search for height
159 | 
160 |         logger.info("SELECT WHERE height >= 6 AND height < 10");
161 | 
162 |         results = searchContainer(container, "height", 6, 10, false);
163 | 
164 |         logger.info(results.size() + " results found");
165 | 
166 |         assertEquals(2, results.size());
167 | 
168 |         // Change a numeric column value and make sure it's no longer in the
169 |         // index
170 | 
171 |         setEntityColumn(e3, "height", (long) 5, containers, le);
172 | 
173 |         results = searchContainer(container, "height", 6, 10, false);
174 | 
175 |         logger.info(results.size() + " results found");
176 | 
177 |         assertEquals(1, results.size());
178 | 
179 |         // Set byte values in columns
180 | 
181 |         setEntityColumn(e1, "bytes", new byte[] { 1, 2, 3 }, containers, bae);
182 | 
183 |         setEntityColumn(e2, "bytes", new byte[] { 1, 2, 4 }, containers, bae);
184 | 
185 |         setEntityColumn(e3, "bytes", new byte[] { 1, 2, 5 }, containers, bae);
186 | 
187 |         // Do a byte array exact match search
188 | 
189 |         results = searchContainer(container, "bytes", new byte[] { 1, 2, 4 });
190 | 
191 |         logger.info(results.size() + " results found");
192 | 
193 |         assertEquals(1, results.size());
194 | 
195 |         // Do a byte array range search
196 | 
197 |         results = searchContainer(container, "bytes", new byte[] { 1, 2, 4 },
198 |                 new byte[] { 10 }, false);
199 | 
200 |         logger.info(results.size() + " results found");
201 | 
202 |         assertEquals(2, results.size());
203 | 
204 |         // Store some text columns
205 | 
206 |         setEntityColumn(e1, "location", "san francisco", containers, se);
207 | 
208 |         setEntityColumn(e2, "location", "san diego", containers, se);
209 | 
210 |         setEntityColumn(e3, "location", "santa clara", containers, se);
211 | 
212 |         // Do a range search exclusive on the same value for start and end and
213 |         // make sure we get 0 results
214 | 
215 |         results = searchContainer(container, "location", "san francisco",
216 |                 "san francisco", false);
217 | 
218 |         logger.info(results.size() + " results found");
219 | 
220 |         assertEquals(0, results.size());
221 | 
222 |         // Do a range search inclusive on the same value for start and end and
223 |         // make sure we get 1 result
224 | 
"location", "san francisco", 226 | "san francisco", true); 227 | 228 | logger.info(results.size() + " results found"); 229 | 230 | assertEquals(1, results.size()); 231 | 232 | } 233 | 234 | @BeforeClass 235 | public static void setup() throws TTransportException, IOException, 236 | InterruptedException, ConfigurationException { 237 | embedded = new EmbeddedServerHelper(); 238 | embedded.setup(); 239 | 240 | cluster = getOrCreateCluster("MyCluster", "127.0.0.1:9170"); 241 | ko = createKeyspace(KEYSPACE, cluster); 242 | 243 | ArrayList cfDefList = new ArrayList(2); 244 | 245 | setupColumnFamilies(cfDefList); 246 | 247 | makeKeyspace(cluster, KEYSPACE, 248 | "org.apache.cassandra.locator.SimpleStrategy", 1, cfDefList); 249 | 250 | } 251 | 252 | @AfterClass 253 | public static void teardown() throws IOException { 254 | EmbeddedServerHelper.teardown(); 255 | embedded = null; 256 | } 257 | 258 | /** 259 | * Create the four required column families for values and indexes. 260 | * 261 | * @param cfDefList 262 | */ 263 | public static void setupColumnFamilies(List cfDefList) { 264 | 265 | createCF(IndexedCollections.DEFAULT_ITEM_CF, 266 | BytesType.class.getSimpleName(), cfDefList); 267 | 268 | createCF(IndexedCollections.DEFAULT_COLLECTION_CF, 269 | TimeUUIDType.class.getSimpleName(), cfDefList); 270 | 271 | createCF(IndexedCollections.DEFAULT_COLLECTION_INDEX_CF, 272 | DYNAMICCOMPOSITETYPE.getTypeName() 273 | + DEFAULT_DYNAMIC_COMPOSITE_ALIASES, cfDefList); 274 | 275 | createCF(IndexedCollections.DEFAULT_ITEM_INDEX_ENTRIES, 276 | DYNAMICCOMPOSITETYPE.getTypeName() 277 | + DEFAULT_DYNAMIC_COMPOSITE_ALIASES, cfDefList); 278 | 279 | } 280 | 281 | public static void createCF(String name, String comparator_type, 282 | List cfDefList) { 283 | cfDefList.add(new CfDef(KEYSPACE, name) 284 | .setComparator_type(comparator_type).setKey_cache_size(0) 285 | .setRow_cache_size(0).setGc_grace_seconds(86400)); 286 | } 287 | 288 | public static void makeKeyspace(Cluster cluster, String name, 289 | String strategy, int replicationFactor, List cfDefList) { 290 | 291 | if (cfDefList == null) { 292 | cfDefList = new ArrayList(); 293 | } 294 | 295 | try { 296 | KsDef ksDef = new KsDef(name, strategy, cfDefList); 297 | cluster.addKeyspace(new ThriftKsDef(ksDef)); 298 | return; 299 | } catch (Throwable e) { 300 | logger.error("Exception while creating keyspace, " + name 301 | + " - probably already exists", e); 302 | } 303 | 304 | for (CfDef cfDef : cfDefList) { 305 | try { 306 | cluster.addColumnFamily(new ThriftCfDef(cfDef)); 307 | } catch (Throwable e) { 308 | logger.error("Exception while creating CF, " + cfDef.getName() 309 | + " - probably already exists", e); 310 | } 311 | } 312 | } 313 | 314 | public static java.util.UUID newTimeUUID() { 315 | com.eaio.uuid.UUID eaioUUID = new com.eaio.uuid.UUID(); 316 | return new UUID(eaioUUID.time, eaioUUID.clockSeqAndNode); 317 | } 318 | 319 | /* 320 | * Convenience methods for wrapping IndexedCollections methods 321 | */ 322 | 323 | public UUID createEntity(String type) { 324 | UUID id = newTimeUUID(); 325 | createMutator(ko, ue).insert(id, IndexedCollections.DEFAULT_ITEM_CF, 326 | createColumn("type", type, se, se)); 327 | return id; 328 | } 329 | 330 | public void addEntityToCollection(ContainerCollection container, 331 | UUID itemEntity) { 332 | IndexedCollections.addItemToCollection(ko, container, itemEntity, 333 | IndexedCollections.defaultCFSet, ue); 334 | } 335 | 336 | public List getEntitiesInCollection( 337 | ContainerCollection container) { 338 | return 
--------------------------------------------------------------------------------
/src/test/resources/cassandra.yaml:
--------------------------------------------------------------------------------
1 | # Cassandra storage config YAML
2 | 
3 | # NOTE:
4 | #   See http://wiki.apache.org/cassandra/StorageConfiguration for
5 | #   full explanations of configuration directives
6 | # /NOTE
7 | 
8 | # The name of the cluster. This is mainly used to prevent machines in
9 | # one logical cluster from joining another.
10 | cluster_name: 'Test Cluster'
11 | 
12 | # You should always specify InitialToken when setting up a production
13 | # cluster for the first time, and often when adding capacity later.
14 | # The principle is that each node should be given an equal slice of
15 | # the token ring; see http://wiki.apache.org/cassandra/Operations
16 | # for more details.
17 | #
18 | # If blank, Cassandra will request a token bisecting the range of
19 | # the heaviest-loaded existing node. If there is no load information
20 | # available, such as is the case with a new cluster, it will pick
21 | # a random token, which will lead to hot spots.
22 | initial_token:
23 | 
24 | # Set to true to make new [non-seed] nodes automatically migrate data
25 | # to themselves from the pre-existing nodes in the cluster. Defaults
26 | # to false because you can only bootstrap N machines at a time from
27 | # an existing cluster of N, so if you are bringing up a cluster of
28 | # 10 machines with 3 seeds you would have to do it in stages. Leaving
29 | # this off for the initial start simplifies that.
30 | auto_bootstrap: false
31 | 
32 | # See http://wiki.apache.org/cassandra/HintedHandoff
33 | hinted_handoff_enabled: true
34 | # this defines the maximum amount of time a dead host will have hints
35 | # generated. After it has been dead this long, hints will be dropped.
36 | max_hint_window_in_ms: 3600000 # one hour 37 | # Sleep this long after delivering each row or row fragment 38 | hinted_handoff_throttle_delay_in_ms: 50 39 | 40 | # authentication backend, implementing IAuthenticator; used to identify users 41 | authenticator: org.apache.cassandra.auth.AllowAllAuthenticator 42 | 43 | # authorization backend, implementing IAuthority; used to limit access/provide permissions 44 | authority: org.apache.cassandra.auth.AllowAllAuthority 45 | 46 | # The partitioner is responsible for distributing rows (by key) across 47 | # nodes in the cluster. Any IPartitioner may be used, including your 48 | # own as long as it is on the classpath. Out of the box, Cassandra 49 | # provides org.apache.cassandra.dht.RandomPartitioner 50 | # org.apache.cassandra.dht.ByteOrderedPartitioner, 51 | # org.apache.cassandra.dht.OrderPreservingPartitioner (deprecated), 52 | # and org.apache.cassandra.dht.CollatingOrderPreservingPartitioner 53 | # (deprecated). 54 | # 55 | # - RandomPartitioner distributes rows across the cluster evenly by md5. 56 | # When in doubt, this is the best option. 57 | # - ByteOrderedPartitioner orders rows lexically by key bytes. BOP allows 58 | # scanning rows in key order, but the ordering can generate hot spots 59 | # for sequential insertion workloads. 60 | # - OrderPreservingPartitioner is an obsolete form of BOP, that stores 61 | # - keys in a less-efficient format and only works with keys that are 62 | # UTF8-encoded Strings. 63 | # - CollatingOPP colates according to EN,US rules rather than lexical byte 64 | # ordering. Use this as an example if you need custom collation. 65 | # 66 | # See http://wiki.apache.org/cassandra/Operations for more on 67 | # partitioners and token selection. 68 | partitioner: org.apache.cassandra.dht.RandomPartitioner 69 | 70 | # directories where Cassandra should store data on disk. 71 | data_file_directories: 72 | - ./tmp/data 73 | 74 | # commit log 75 | commitlog_directory: ./tmp/commitlog 76 | 77 | # saved caches 78 | saved_caches_directory: ./tmp/saved_caches 79 | 80 | # Size to allow commitlog to grow to before creating a new segment 81 | commitlog_rotation_threshold_in_mb: 128 82 | 83 | # commitlog_sync may be either "periodic" or "batch." 84 | # When in batch mode, Cassandra won't ack writes until the commit log 85 | # has been fsynced to disk. It will wait up to 86 | # CommitLogSyncBatchWindowInMS milliseconds for other writes, before 87 | # performing the sync. 88 | commitlog_sync: periodic 89 | 90 | # the other option is "periodic" where writes may be acked immediately 91 | # and the CommitLog is simply synced every commitlog_sync_period_in_ms 92 | # milliseconds. 93 | commitlog_sync_period_in_ms: 10000 94 | 95 | # any class that implements the SeedProvider interface and has a constructor that takes a Map of 96 | # parameters will do. 97 | seed_provider: 98 | # Addresses of hosts that are deemed contact points. 99 | # Cassandra nodes use this list of hosts to find each other and learn 100 | # the topology of the ring. You must change this if you are running 101 | # multiple nodes! 102 | - class_name: org.apache.cassandra.locator.SimpleSeedProvider 103 | parameters: 104 | # seeds is actually a comma-delimited list of addresses. 105 | - seeds: "127.0.0.1" 106 | 107 | # emergency pressure valve: each time heap usage after a full (CMS) 108 | # garbage collection is above this fraction of the max, Cassandra will 109 | # flush the largest memtables. 110 | # 111 | # Set to 1.0 to disable. 
Setting this lower than 112 | # CMSInitiatingOccupancyFraction is not likely to be useful. 113 | # 114 | # RELYING ON THIS AS YOUR PRIMARY TUNING MECHANISM WILL WORK POORLY: 115 | # it is most effective under light to moderate load, or read-heavy 116 | # workloads; under truly massive write load, it will often be too 117 | # little, too late. 118 | flush_largest_memtables_at: 0.75 119 | 120 | # emergency pressure valve #2: the first time heap usage after a full 121 | # (CMS) garbage collection is above this fraction of the max, 122 | # Cassandra will reduce cache maximum _capacity_ to the given fraction 123 | # of the current _size_. Should usually be set substantially above 124 | # flush_largest_memtables_at, since that will have less long-term 125 | # impact on the system. 126 | # 127 | # Set to 1.0 to disable. Setting this lower than 128 | # CMSInitiatingOccupancyFraction is not likely to be useful. 129 | reduce_cache_sizes_at: 0.85 130 | reduce_cache_capacity_to: 0.6 131 | 132 | # For workloads with more data than can fit in memory, Cassandra's 133 | # bottleneck will be reads that need to fetch data from 134 | # disk. "concurrent_reads" should be set to (16 * number_of_drives) in 135 | # order to allow the operations to enqueue low enough in the stack 136 | # that the OS and drives can reorder them. 137 | # 138 | # On the other hand, since writes are almost never IO bound, the ideal 139 | # number of "concurrent_writes" is dependent on the number of cores in 140 | # your system; (8 * number_of_cores) is a good rule of thumb. 141 | concurrent_reads: 32 142 | concurrent_writes: 32 143 | 144 | # Total memory to use for memtables. Cassandra will flush the largest 145 | # memtable when this much memory is used. Prefer using this to 146 | # the older, per-ColumnFamily memtable flush thresholds. 147 | # If omitted, Cassandra will set it to 1/3 of the heap. 148 | # If set to 0, only the old flush thresholds are used. 149 | # memtable_total_space_in_mb: 2048 150 | 151 | # This sets the amount of memtable flush writer threads. These will 152 | # be blocked by disk io, and each one will hold a memtable in memory 153 | # while blocked. If you have a large heap and many data directories, 154 | # you can increase this value for better flush performance. 155 | # By default this will be set to the amount of data directories defined. 156 | #memtable_flush_writers: 1 157 | 158 | # the number of full memtables to allow pending flush, that is, 159 | # waiting for a writer thread. At a minimum, this should be set to 160 | # the maximum number of secondary indexes created on a single CF. 161 | memtable_flush_queue_size: 4 162 | 163 | # Buffer size to use when performing contiguous column slices. 164 | # Increase this to the size of the column slices you typically perform 165 | sliced_buffer_size_in_kb: 64 166 | 167 | # TCP port, for commands and data 168 | storage_port: 7000 169 | 170 | # Address to bind to and tell other Cassandra nodes to connect to. You 171 | # _must_ change this if you want multiple nodes to be able to 172 | # communicate! 173 | # 174 | # Leaving it blank leaves it up to InetAddress.getLocalHost(). This 175 | # will always do the Right Thing *if* the node is properly configured 176 | # (hostname, name resolution, etc), and the Right Thing is to use the 177 | # address associated with the hostname (it might not be). 178 | # 179 | # Setting this to 0.0.0.0 is always wrong. 
180 | listen_address: localhost 181 | 182 | # The address to bind the Thrift RPC service to -- clients connect 183 | # here. Unlike ListenAddress above, you *can* specify 0.0.0.0 here if 184 | # you want Thrift to listen on all interfaces. 185 | # 186 | # Leaving this blank has the same effect it does for ListenAddress, 187 | # (i.e. it will be based on the configured hostname of the node). 188 | rpc_address: localhost 189 | # port for Thrift to listen for clients on 190 | rpc_port: 9170 191 | 192 | # enable or disable keepalive on rpc connections 193 | rpc_keepalive: true 194 | 195 | # Cassandra uses thread-per-client for client RPC. This can 196 | # be expensive in memory used for thread stack for a large 197 | # enough number of clients. (Hence, connection pooling is 198 | # very, very strongly recommended.) 199 | # 200 | # Uncomment rpc_min|max|thread to set request pool size. 201 | # You would primarily set max as a safeguard against misbehaved 202 | # clients; if you do hit the max, Cassandra will block until 203 | # one disconnects before accepting more. The defaults are 204 | # min of 16 and max unlimited. 205 | # 206 | # rpc_min_threads: 16 207 | # rpc_max_threads: 2048 208 | 209 | # uncomment to set socket buffer sizes on rpc connections 210 | # rpc_send_buff_size_in_bytes: 211 | # rpc_recv_buff_size_in_bytes: 212 | 213 | # Frame size for thrift (maximum field length). 214 | # 0 disables TFramedTransport in favor of TSocket. This option 215 | # is deprecated; we strongly recommend using Framed mode. 216 | thrift_framed_transport_size_in_mb: 15 217 | 218 | # The max length of a thrift message, including all fields and 219 | # internal thrift overhead. 220 | thrift_max_message_length_in_mb: 16 221 | 222 | # Set to true to have Cassandra create a hard link to each sstable 223 | # flushed or streamed locally in a backups/ subdirectory of the 224 | # Keyspace data. Removing these links is the operator's 225 | # responsibility. 226 | incremental_backups: false 227 | 228 | # Whether or not to take a snapshot before each compaction. Be 229 | # careful using this option, since Cassandra won't clean up the 230 | # snapshots for you. Mostly useful if you're paranoid when there 231 | # is a data format change. 232 | snapshot_before_compaction: false 233 | 234 | # change this to increase the compaction thread's priority. In java, 1 is the 235 | # lowest priority and that is our default. 236 | # compaction_thread_priority: 1 237 | 238 | # Add column indexes to a row after its contents reach this size. 239 | # Increase if your column values are large, or if you have a very large 240 | # number of columns. The competing causes are, Cassandra has to 241 | # deserialize this much of the row to read a single column, so you want 242 | # it to be small - at least if you do many partial-row reads - but all 243 | # the index data is read for each access, so you don't want to generate 244 | # that wastefully either. 245 | column_index_size_in_kb: 64 246 | 247 | # Size limit for rows being compacted in memory. Larger rows will spill 248 | # over to disk and use a slower two-pass compaction process. A message 249 | # will be logged specifying the row key. 250 | in_memory_compaction_limit_in_mb: 64 251 | 252 | # Number of compaction threads. This default to the number of processors, 253 | # enabling multiple compactions to execute at once. 
Using more than one 254 | # thread is highly recommended to preserve read performance in a mixed 255 | # read/write workload as this avoids sstables from accumulating during long 256 | # running compactions. The default is usually fine and if you experience 257 | # problems with compaction running too slowly or too fast, you should look at 258 | # compaction_throughput_mb_per_sec first. 259 | # Uncomment to make compaction mono-threaded. 260 | #concurrent_compactors: 1 261 | 262 | # Throttles compaction to the given total throughput across the entire 263 | # system. The faster you insert data, the faster you need to compact in 264 | # order to keep the sstable count down, but in general, setting this to 265 | # 16 to 32 times the rate you are inserting data is more than sufficient. 266 | # Setting this to 0 disables throttling. 267 | compaction_throughput_mb_per_sec: 16 268 | 269 | # Track cached row keys during compaction, and re-cache their new 270 | # positions in the compacted sstable. Disable if you use really large 271 | # key caches. 272 | compaction_preheat_key_cache: true 273 | 274 | # Time to wait for a reply from other nodes before failing the command 275 | rpc_timeout_in_ms: 10000 276 | 277 | # phi value that must be reached for a host to be marked down. 278 | # most users should never need to adjust this. 279 | # phi_convict_threshold: 8 280 | 281 | # endpoint_snitch -- Set this to a class that implements 282 | # IEndpointSnitch, which will let Cassandra know enough 283 | # about your network topology to route requests efficiently. 284 | # Out of the box, Cassandra provides 285 | # - org.apache.cassandra.locator.SimpleSnitch: 286 | # Treats Strategy order as proximity. This improves cache locality 287 | # when disabling read repair, which can further improve throughput. 288 | # - org.apache.cassandra.locator.RackInferringSnitch: 289 | # Proximity is determined by rack and data center, which are 290 | # assumed to correspond to the 3rd and 2nd octet of each node's 291 | # IP address, respectively 292 | # org.apache.cassandra.locator.PropertyFileSnitch: 293 | # - Proximity is determined by rack and data center, which are 294 | # explicitly configured in cassandra-topology.properties. 295 | endpoint_snitch: org.apache.cassandra.locator.SimpleSnitch 296 | 297 | # dynamic_snitch -- This boolean controls whether the above snitch is 298 | # wrapped with a dynamic snitch, which will monitor read latencies 299 | # and avoid reading from hosts that have slowed (due to compaction, 300 | # for instance) 301 | dynamic_snitch: true 302 | # controls how often to perform the more expensive part of host score 303 | # calculation 304 | dynamic_snitch_update_interval_in_ms: 100 305 | # controls how often to reset all host scores, allowing a bad host to 306 | # possibly recover 307 | dynamic_snitch_reset_interval_in_ms: 600000 308 | # if set greater than zero and read_repair_chance is < 1.0, this will allow 309 | # 'pinning' of replicas to hosts in order to increase cache capacity. 310 | # The badness threshold will control how much worse the pinned host has to be 311 | # before the dynamic snitch will prefer other replicas over it. This is 312 | # expressed as a double which represents a percentage. Thus, a value of 313 | # 0.2 means Cassandra would continue to prefer the static snitch values 314 | # until the pinned host was 20% worse than the fastest. 
315 | dynamic_snitch_badness_threshold: 0.0 316 | 317 | # request_scheduler -- Set this to a class that implements 318 | # RequestScheduler, which will schedule incoming client requests 319 | # according to the specific policy. This is useful for multi-tenancy 320 | # with a single Cassandra cluster. 321 | # NOTE: This is specifically for requests from the client and does 322 | # not affect inter node communication. 323 | # org.apache.cassandra.scheduler.NoScheduler - No scheduling takes place 324 | # org.apache.cassandra.scheduler.RoundRobinScheduler - Round robin of 325 | # client requests to a node with a separate queue for each 326 | # request_scheduler_id. The scheduler is further customized by 327 | # request_scheduler_options as described below. 328 | request_scheduler: org.apache.cassandra.scheduler.NoScheduler 329 | 330 | # Scheduler Options vary based on the type of scheduler 331 | # NoScheduler - Has no options 332 | # RoundRobin 333 | # - throttle_limit -- The throttle_limit is the number of in-flight 334 | # requests per client. Requests beyond 335 | # that limit are queued up until 336 | # running requests can complete. 337 | # The value of 80 here is twice the number of 338 | # concurrent_reads + concurrent_writes. 339 | # - default_weight -- default_weight is optional and allows for 340 | # overriding the default which is 1. 341 | # - weights -- Weights are optional and will default to 1 or the 342 | # overridden default_weight. The weight translates into how 343 | # many requests are handled during each turn of the 344 | # RoundRobin, based on the scheduler id. 345 | # 346 | # request_scheduler_options: 347 | # throttle_limit: 80 348 | # default_weight: 5 349 | # weights: 350 | # Keyspace1: 1 351 | # Keyspace2: 5 352 | 353 | # request_scheduler_id -- An identifer based on which to perform 354 | # the request scheduling. Currently the only valid option is keyspace. 355 | # request_scheduler_id: keyspace 356 | 357 | # The Index Interval determines how large the sampling of row keys 358 | # is for a given SSTable. The larger the sampling, the more effective 359 | # the index is at the cost of space. 360 | index_interval: 128 361 | 362 | # Enable or disable inter-node encryption 363 | # Default settings are TLS v1, RSA 1024-bit keys (it is imperative that 364 | # users generate their own keys) TLS_RSA_WITH_AES_128_CBC_SHA as the cipher 365 | # suite for authentication, key exchange and encryption of the actual data transfers. 366 | # NOTE: No custom encryption options are enabled at the moment 367 | # The available internode options are : all, none 368 | # 369 | # The passwords used in these options must match the passwords used when generating 370 | # the keystore and truststore. For instructions on generating these files, see: 371 | # http://download.oracle.com/javase/6/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore 372 | encryption_options: 373 | internode_encryption: none 374 | keystore: conf/.keystore 375 | keystore_password: cassandra 376 | truststore: conf/.truststore 377 | truststore_password: cassandra 378 | -------------------------------------------------------------------------------- /src/main/java/indexedcollections/IndexedCollections.java: -------------------------------------------------------------------------------- 1 | package indexedcollections; 2 | 3 | /* 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one 6 | * or more contributor license agreements. 
See the NOTICE file
7 |  * distributed with this work for additional information
8 |  * regarding copyright ownership. The ASF licenses this file
9 |  * to you under the Apache License, Version 2.0 (the
10 |  * "License"); you may not use this file except in compliance
11 |  * with the License. You may obtain a copy of the License at
12 |  *
13 |  *   http://www.apache.org/licenses/LICENSE-2.0
14 |  *
15 |  * Unless required by applicable law or agreed to in writing,
16 |  * software distributed under the License is distributed on an
17 |  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 |  * KIND, either express or implied. See the License for the
19 |  * specific language governing permissions and limitations
20 |  * under the License.
21 |  *
22 |  */
23 | 
24 | import static me.prettyprint.hector.api.factory.HFactory.createColumn;
25 | import static me.prettyprint.hector.api.factory.HFactory.createMutator;
26 | import static me.prettyprint.hector.api.factory.HFactory.createSliceQuery;
27 | 
28 | import java.math.BigInteger;
29 | import java.nio.ByteBuffer;
30 | import java.util.ArrayList;
31 | import java.util.List;
32 | import java.util.Set;
33 | import java.util.UUID;
34 | 
35 | import me.prettyprint.cassandra.serializers.ByteBufferSerializer;
36 | import me.prettyprint.cassandra.serializers.BytesArraySerializer;
37 | import me.prettyprint.cassandra.serializers.DynamicCompositeSerializer;
38 | import me.prettyprint.cassandra.serializers.LongSerializer;
39 | import me.prettyprint.cassandra.serializers.SerializerTypeInferer;
40 | import me.prettyprint.cassandra.serializers.StringSerializer;
41 | import me.prettyprint.cassandra.serializers.TypeInferringSerializer;
42 | import me.prettyprint.cassandra.serializers.UUIDSerializer;
43 | import me.prettyprint.hector.api.Keyspace;
44 | import me.prettyprint.hector.api.Serializer;
45 | import me.prettyprint.hector.api.beans.AbstractComposite;
46 | import me.prettyprint.hector.api.beans.AbstractComposite.Component;
47 | import me.prettyprint.hector.api.beans.ColumnSlice;
48 | import me.prettyprint.hector.api.beans.DynamicComposite;
49 | import me.prettyprint.hector.api.beans.HColumn;
50 | import me.prettyprint.hector.api.factory.HFactory;
51 | import me.prettyprint.hector.api.mutation.Mutator;
52 | import me.prettyprint.hector.api.query.QueryResult;
53 | import me.prettyprint.hector.api.query.SliceQuery;
54 | 
55 | import org.apache.log4j.Logger;
56 | 
57 | /**
58 |  * Simple indexing library using composite types
59 |  * (https://github.com/edanuff/CassandraCompositeType) to implement indexed
60 |  * collections in Cassandra.
61 |  *
62 |  * See http://www.anuff.com/2010/07/secondary-indexes-in-cassandra.html for a
63 |  * detailed discussion of the technique used here.
64 |  *
65 |  * @author Ed Anuff
66 |  * @see <a href=
67 |  *      "http://www.anuff.com/2010/07/secondary-indexes-in-cassandra.html">Secondary
68 |  *      indexes in Cassandra</a>
69 |  * @see "org.apache.cassandra.db.marshal.CompositeType"
70 |  *
71 |  */
72 | public class IndexedCollections {
73 | 
74 |     private static final Logger logger = Logger
75 |             .getLogger(IndexedCollections.class.getName());
76 | 
77 |     public static final String DEFAULT_ITEM_CF = "Item";
78 |     public static final String DEFAULT_COLLECTION_CF = "Collection";
79 |     public static final String DEFAULT_ITEM_INDEX_ENTRIES = "Item_Index_Entries";
80 |     public static final String DEFAULT_COLLECTION_INDEX_CF = "Collection_Index";
81 | 
82 |     public static final byte VALUE_CODE_BYTES = 0;
83 |     public static final byte VALUE_CODE_UTF8 = 1;
84 |     public static final byte VALUE_CODE_UUID = 2;
85 |     public static final byte VALUE_CODE_INT = 3;
86 |     public static final byte VALUE_CODE_MAX = 127;
87 | 
88 |     public static final int DEFAULT_COUNT = 100;
89 |     public static final int ALL_COUNT = 100000;
90 | 
91 |     public static final CollectionCFSet defaultCFSet = new CollectionCFSet();
92 | 
93 |     public static final StringSerializer se = new StringSerializer();
94 |     public static final ByteBufferSerializer be = new ByteBufferSerializer();
95 |     public static final BytesArraySerializer bae = new BytesArraySerializer();
96 |     public static final DynamicCompositeSerializer ce = new DynamicCompositeSerializer();
97 |     public static final LongSerializer le = new LongSerializer();
98 |     public static final UUIDSerializer ue = new UUIDSerializer();
99 | 
100 |     public static UUID newTimeUUID() {
101 |         com.eaio.uuid.UUID eaioUUID = new com.eaio.uuid.UUID();
102 |         return new UUID(eaioUUID.time, eaioUUID.clockSeqAndNode);
103 |     }
104 | 
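// A sketch of how the four column families above are laid out, as read from
// the code in this file and from IndexTest.setupColumnFamilies (descriptive
// summary only, not additional API):
//
//   Item ("Item", BytesType comparator):
//       itemKey -> { columnName : columnValue }            // the entity's own columns
//
//   Collection ("Collection", TimeUUIDType comparator):
//       "ownerKey:collection" -> { itemKey : clock }       // collection membership
//
//   Collection_Index ("Collection_Index", DynamicCompositeType):
//       "ownerKey:collection:columnName" ->
//           { (valueCode, value, itemKey, ts_uuid) : "" }  // scanned by searchContainer
//
//   Item_Index_Entries ("Item_Index_Entries", DynamicCompositeType):
//       itemKey -> { (columnName, ts_uuid) : value }       // lets setItemColumn find
//                                                          // stale entries to delete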
105 |     /**
106 |      * Convert values to be indexed into types that can be compared by
107 |      * Cassandra: UTF8Type, UUIDType, IntegerType, and BytesType
108 |      *
109 |      * @param value
110 |      * @return value transformed into String, UUID, BigInteger, or ByteBuffer
111 |      */
112 |     public static Object getIndexableValue(Object value) {
113 | 
114 |         if (value == null) {
115 |             return null;
116 |         }
117 | 
118 |         // Strings, UUIDs, and BigIntegers map to Cassandra
119 |         // UTF8Type, UUIDType, and IntegerType
120 |         if ((value instanceof String) || (value instanceof UUID)
121 |                 || (value instanceof BigInteger)) {
122 |             return value;
123 |         }
124 | 
125 |         // For any numeric values, turn them into a long
126 |         // and make them BigIntegers for IntegerType
127 |         if (value instanceof Number) {
128 |             return BigInteger.valueOf(((Number) value).longValue());
129 |         }
130 | 
131 |         // Anything else, we're going to have to use BytesType
132 |         return TypeInferringSerializer.get().toByteBuffer(value);
133 |     }
134 | 
135 |     /**
136 |      * The Cassandra DynamicCompositeType will complain if component values of
137 |      * two different types are attempted to be compared. The way to prevent this
138 |      * and still allow for indexes to store different dynamic values is to have
139 |      * a value code component that precedes the actual indexed value component
140 |      * in the composite. The DynamicCompositeType will first compare the two
141 |      * components holding the value codes, and if they don't match, then won't
142 |      * compare the next pair of components, avoiding the DynamicCompositeType
143 |      * throwing an error.
144 |      *
145 |      * @param value
146 |      * @return value code
147 |      */
148 |     public static int getIndexableValueCode(Object value) {
149 |         if (value instanceof String) {
150 |             return VALUE_CODE_UTF8;
151 |         } else if (value instanceof UUID) {
152 |             return VALUE_CODE_UUID;
153 |         } else if (value instanceof Number) {
154 |             return VALUE_CODE_INT;
155 |         } else {
156 |             return VALUE_CODE_BYTES;
157 |         }
158 |     }
159 | 
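// Why the value code matters -- an illustrative aside, not library code.
// Two index entries of different types, e.g.
//
//     new DynamicComposite(VALUE_CODE_INT, BigInteger.valueOf(42), itemA, tsA)
//     new DynamicComposite(VALUE_CODE_UTF8, "fred", itemB, tsB)
//
// first compare on the leading code component (3 vs. 1 here), so
// DynamicCompositeType never has to compare BigInteger.valueOf(42) against
// "fred", which would otherwise throw. A side effect is that entries cluster
// into type bands (bytes, then UTF8, then UUID, then integer values), and a
// range built by searchContainer stays inside the band of its search value
// because start and finish carry the same value code.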
160 |     private static <IK> void addIndexInsertion(Mutator<ByteBuffer> batch,
161 |             CollectionCFSet cf, String columnIndexKey, IK itemKey,
162 |             Object columnValue, UUID ts_uuid, long timestamp) {
163 | 
164 |         logger.info("UPDATE " + cf.getIndex() + " SET composite("
165 |                 + getIndexableValueCode(columnValue) + ", "
166 |                 + getIndexableValue(columnValue) + ", " + itemKey + ", "
167 |                 + ts_uuid + ") = null WHERE KEY = " + columnIndexKey);
168 | 
169 |         DynamicComposite indexComposite = new DynamicComposite(
170 |                 getIndexableValueCode(columnValue),
171 |                 getIndexableValue(columnValue), itemKey, ts_uuid);
172 | 
173 |         batch.addInsertion(se.toByteBuffer(columnIndexKey), cf.getIndex(),
174 |                 HFactory.createColumn(indexComposite, new byte[0], timestamp,
175 |                         ce, bae));
176 | 
177 |     }
178 | 
179 |     private static <IK> void addIndexDeletion(Mutator<ByteBuffer> batch,
180 |             CollectionCFSet cf, String columnIndexKey, IK itemKey,
181 |             Object columnValue, UUID prev_timestamp, long timestamp) {
182 | 
183 |         logger.info("DELETE composite(" + getIndexableValueCode(columnValue)
184 |                 + ", " + getIndexableValue(columnValue) + ", " + itemKey + ", "
185 |                 + prev_timestamp + ") FROM " + cf.getIndex() + " WHERE KEY = "
186 |                 + columnIndexKey);
187 | 
188 |         DynamicComposite indexComposite = new DynamicComposite(
189 |                 getIndexableValueCode(columnValue),
190 |                 getIndexableValue(columnValue), itemKey, prev_timestamp);
191 | 
192 |         batch.addDeletion(se.toByteBuffer(columnIndexKey), cf.getIndex(),
193 |                 indexComposite, ce, timestamp);
194 |     }
195 | 
196 |     private static <IK> void addEntriesInsertion(Mutator<ByteBuffer> batch,
197 |             CollectionCFSet cf, IK itemKey, Object columnName,
198 |             Object columnValue, UUID ts_uuid, Serializer<IK> itemKeySerializer,
199 |             long timestamp) {
200 | 
201 |         logger.info("UPDATE " + cf.getEntries() + " SET composite("
202 |                 + columnName + ", " + ts_uuid + ") = composite(" + columnValue
203 |                 + ") WHERE KEY = " + itemKey);
204 | 
205 |         batch.addInsertion(itemKeySerializer.toByteBuffer(itemKey), cf
206 |                 .getEntries(), HFactory.createColumn(new DynamicComposite(
207 |                 columnName, ts_uuid), new DynamicComposite(columnValue),
208 |                 timestamp, ce, ce));
209 |     }
210 | 
211 |     private static <IK> void addEntriesDeletion(Mutator<ByteBuffer> batch,
212 |             CollectionCFSet cf, IK itemKey, DynamicComposite columnName,
213 |             Object columnValue, UUID prev_timestamp,
214 |             Serializer<IK> itemKeySerializer, long timestamp) {
215 | 
216 |         logger.info("DELETE composite(" + columnName + ", " + prev_timestamp
217 |                 + ") FROM " + cf.getEntries() + " WHERE KEY = " + itemKey);
218 | 
219 |         batch.addDeletion(itemKeySerializer.toByteBuffer(itemKey),
220 |                 cf.getEntries(), columnName, ce, timestamp);
221 | 
222 |     }
223 | 
224 |     /**
225 |      * Sets the item column value for an item contained in a set of collections.
226 |      *
227 |      * @param <CK>
228 |      *            the container's key type
229 |      * @param <IK>
230 |      *            the item's key type
231 |      * @param <N>
232 |      *            the item's column name type
233 |      * @param <V>
234 |      *            the item's column value type
235 |      * @param ko
236 |      *            the keyspace operator
237 |      * @param itemKey
238 |      *            the item row key
239 |      * @param columnName
240 |      *            the name of the column to set
241 |      * @param columnValue
242 |      *            the value to set the column to
243 |      * @param containers
244 |      *            the set of containers the item is in
245 |      * @param cf
246 |      *            the column families to use
247 |      * @param itemKeySerializer
248 |      *            the item key serializer
249 |      * @param nameSerializer
250 |      *            the column name serializer
251 |      * @param valueSerializer
252 |      *            the column value serializer
253 |      * @param containerKeySerializer
254 |      *            the container key serializer
255 |      */
256 |     public static <CK, IK, N, V> void setItemColumn(Keyspace ko, IK itemKey,
257 |             N columnName, V columnValue,
258 |             Set<ContainerCollection<CK>> containers, CollectionCFSet cf,
259 |             Serializer<IK> itemKeySerializer, Serializer<N> nameSerializer,
260 |             Serializer<V> valueSerializer, Serializer<CK> containerKeySerializer) {
261 | 
262 |         logger.info("SET " + columnName + " = '" + columnValue + "' FOR ITEM "
263 |                 + itemKey);
264 | 
265 |         long timestamp = HFactory.createClock();
266 |         Mutator<ByteBuffer> batch = createMutator(ko, be);
267 |         UUID ts_uuid = newTimeUUID();
268 | 
269 |         // Get all known previous index entries for this item's
270 |         // indexed column from the item's index entry list
271 | 
272 |         SliceQuery<IK, DynamicComposite, DynamicComposite> q = createSliceQuery(
273 |                 ko, itemKeySerializer, ce, ce);
274 |         q.setColumnFamily(cf.getEntries());
275 |         q.setKey(itemKey);
276 |         q.setRange(new DynamicComposite(columnName, new UUID(0, 0)),
277 |                 new DynamicComposite(columnName, new UUID(Long.MAX_VALUE
278 |                         | Long.MIN_VALUE, Long.MAX_VALUE | Long.MIN_VALUE)),
279 |                 false, ALL_COUNT); // (0,0)..(all bits set) spans every timeuuid
280 |         QueryResult<ColumnSlice<DynamicComposite, DynamicComposite>> r = q
281 |                 .execute();
282 |         ColumnSlice<DynamicComposite, DynamicComposite> slice = r.get();
283 |         List<HColumn<DynamicComposite, DynamicComposite>> entries = slice
284 |                 .getColumns();
285 | 
286 |         logger.info(entries.size() + " previous values for " + columnName
287 |                 + " found in index for removal");
288 | 
289 |         // Delete all previous index entries from the item's index entry list
290 | 
291 |         for (HColumn<DynamicComposite, DynamicComposite> entry : entries) {
292 |             UUID prev_timestamp = entry.getName().get(1, ue);
293 |             Object prev_value = entry.getValue().get(0);
294 | 
295 |             addEntriesDeletion(batch, cf, itemKey, entry.getName(), prev_value,
296 |                     prev_timestamp, itemKeySerializer, timestamp);
297 |         }
298 | 
299 |         // Add the new index entry to the item's index entry list
300 | 
301 |         if (columnValue != null) {
302 |             addEntriesInsertion(batch, cf, itemKey, columnName, columnValue,
303 |                     ts_uuid, itemKeySerializer, timestamp);
304 |         }
305 | 
306 |         for (ContainerCollection<CK> container : containers) {
307 | 
308 |             String columnIndexKey = container.getKey() + ":"
309 |                     + columnName.toString();
310 | 
311 |             // Delete all previous index entries from the container's
312 |             // index
313 | 
314 |             for (HColumn<DynamicComposite, DynamicComposite> entry : entries) {
315 |                 UUID prev_timestamp = entry.getName().get(1, ue);
316 |                 Object prev_value = entry.getValue().get(0);
317 | 
318 |                 addIndexDeletion(batch, cf, columnIndexKey, itemKey,
319 |                         prev_value, prev_timestamp, timestamp);
320 | 
321 |             }
322 | 
323 |             // Add the new index entry into the container's index
324 | 
325 |             if (columnValue != null) {
326 |                 addIndexInsertion(batch, cf, columnIndexKey, itemKey,
327 |                         columnValue, ts_uuid, timestamp);
328 |             }
329 | 
330 |         }
331 | 
332 |         // Store the new column value into the item
333 |         // If new value is null, delete the value instead
334 | 
335 |         if (columnValue != null) {
336 | 
337 |             logger.info("UPDATE " + cf.getItem() + " SET " + columnName + " = "
338 |                     + columnValue + " WHERE KEY = " + itemKey);
339 |             batch.addInsertion(itemKeySerializer.toByteBuffer(itemKey), cf
340 |                     .getItem(), HFactory.createColumn(columnName, columnValue,
341 |                     timestamp, nameSerializer, valueSerializer));
342 |         } else {
343 |             logger.info("DELETE " + columnName + " FROM " + cf.getItem()
344 |                     + " WHERE KEY = " + itemKey);
345 |             batch.addDeletion(itemKeySerializer.toByteBuffer(itemKey),
346 |                     cf.getItem(), columnName, nameSerializer, timestamp);
347 |         }
348 | 
349 |         batch.execute();
350 | 
351 |     }
352 | 
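// Example call (a sketch mirroring IndexTest.setEntityColumn): index the
// "name" column of an item in every collection that contains it, using the
// class's own static serializers.
//
//     IndexedCollections.setItemColumn(ko, itemId, "name", "fred",
//             containers, IndexedCollections.defaultCFSet, ue, se, se, ue);
//
// Passing null as columnValue deletes the column and removes its index
// entries; there is no separate un-index call.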
353 |     /**
354 |      * Search container.
355 |      *
356 |      * @param <IK>
357 |      *            the item's key type
358 |      * @param <CK>
359 |      *            the container's key type
360 |      * @param <N>
361 |      *            the item's column name type
362 |      * @param ko
363 |      *            the keyspace operator
364 |      * @param container
365 |      *            the ContainerCollection (container key and collection name)
366 |      * @param columnName
367 |      *            the item's column name
368 |      * @param searchValue
369 |      *            the exact value for the specified column
370 |      * @param startResult
371 |      *            the start result row key
372 |      * @param count
373 |      *            the number of row keys to return
374 |      * @param reversed
375 |      *            search in reverse order
376 |      * @param cf
377 |      *            the column family set
378 |      * @param containerKeySerializer
379 |      *            the container key serializer
380 |      * @param itemKeySerializer
381 |      *            the item key serializer
382 |      * @param nameSerializer
383 |      *            the column name serializer
384 |      * @return the list of row keys for items whose column value matches
385 |      */
386 |     public static <CK, IK, N> List<IK> searchContainer(Keyspace ko,
387 |             ContainerCollection<CK> container, N columnName,
388 |             Object searchValue, IK startResult, int count, boolean reversed,
389 |             CollectionCFSet cf, Serializer<CK> containerKeySerializer,
390 |             Serializer<IK> itemKeySerializer, Serializer<N> nameSerializer) {
391 | 
392 |         return searchContainer(ko, container, columnName, searchValue,
393 |                 searchValue, true, startResult, count, reversed, cf,
394 |                 containerKeySerializer, itemKeySerializer, nameSerializer);
395 |     }
396 | 
397 |     /**
398 |      * Search container.
399 |      *
400 |      * @param <IK>
401 |      *            the item's key type
402 |      * @param <CK>
403 |      *            the container's key type
404 |      * @param <N>
405 |      *            the item's column name type
406 |      * @param ko
407 |      *            the keyspace operator
408 |      * @param container
409 |      *            the ContainerCollection (container key and collection name)
410 |      * @param columnName
411 |      *            the item's column name
412 |      * @param startValue
413 |      *            the start value for the specified column (inclusive)
414 |      * @param endValue
415 |      *            the end value for the specified column
416 |      * @param inclusive
417 |      *            whether end value for the specified column is inclusive
418 |      * @param startResult
419 |      *            the start result row key
420 |      * @param count
421 |      *            the number of row keys to return
422 |      * @param reversed
423 |      *            search in reverse order
424 |      * @param cf
425 |      *            the column family set
426 |      * @param containerKeySerializer
427 |      *            the container key serializer
428 |      * @param itemKeySerializer
429 |      *            the item key serializer
430 |      * @param nameSerializer
431 |      *            the column name serializer
432 |      * @return the list of row keys for items whose column value matches
433 |      */
434 |     @SuppressWarnings("unchecked")
435 |     public static <CK, IK, N> List<IK> searchContainer(Keyspace ko,
436 |             ContainerCollection<CK> container, N columnName, Object startValue,
437 |             Object endValue, boolean inclusive, IK startResult, int count,
438 |             boolean reversed, CollectionCFSet cf,
439 |             Serializer<CK> containerKeySerializer,
440 |             Serializer<IK> itemKeySerializer, Serializer<N> nameSerializer) {
441 |         List<IK> items = new ArrayList<IK>();
442 | 
443 |         String columnIndexKey = container.getKey() + ":"
444 |                 + columnName.toString();
445 | 
446 |         if (count == 0) {
447 |             count = DEFAULT_COUNT;
448 |         }
449 | 
450 |         SliceQuery<ByteBuffer, DynamicComposite, ByteBuffer> q = createSliceQuery(
451 |                 ko, be, ce, be);
452 |         q.setColumnFamily(cf.getIndex());
453 |         q.setKey(se.toByteBuffer(columnIndexKey));
454 | 
455 |         DynamicComposite start = null;
456 | 
457 |         if (startValue == null) {
458 |             if (startResult != null) {
459 |                 start = new DynamicComposite(VALUE_CODE_BYTES, new byte[0],
460 |                         startResult);
461 |             } else {
462 |                 start = new DynamicComposite(VALUE_CODE_BYTES, new byte[0]);
463 |             }
464 |         } else if (startResult != null) {
465 |             start = new DynamicComposite(getIndexableValueCode(startValue),
466 |                     getIndexableValue(startValue), startResult);
467 |         } else {
468 |             start = new DynamicComposite(getIndexableValueCode(startValue),
469 |                     getIndexableValue(startValue));
470 |         }
471 | 
472 |         DynamicComposite finish = null;
473 | 
474 |         if (endValue != null) {
475 |             finish = new DynamicComposite(getIndexableValueCode(endValue),
476 |                     getIndexableValue(endValue));
477 |             if (inclusive) {
478 |                 @SuppressWarnings("rawtypes")
479 |                 Component c = finish.getComponent(1);
480 |                 finish.setComponent(1, c.getValue(), c.getSerializer(),
481 |                         c.getComparator(),
482 |                         AbstractComposite.ComponentEquality.GREATER_THAN_EQUAL);
483 |             }
484 |         }
485 | 
486 |         q.setRange(start, finish, reversed, count);
487 |         QueryResult<ColumnSlice<DynamicComposite, ByteBuffer>> r = q.execute();
488 |         ColumnSlice<DynamicComposite, ByteBuffer> slice = r.get();
489 |         List<HColumn<DynamicComposite, ByteBuffer>> results = slice
490 |                 .getColumns();
491 | 
492 |         if (results != null) {
493 |             for (HColumn<DynamicComposite, ByteBuffer> result : results) {
494 |                 Object value = result.getName().get(1);
495 |                 logger.info("Value found: " + value);
496 | 
497 |                 IK key = result.getName().get(2, itemKeySerializer);
498 |                 if (key != null) {
499 |                     items.add(key);
500 |                 }
501 |             }
502 |         }
503 | 
504 |         return items;
505 |     }
506 | 
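// Note on the inclusive end range above (describing existing behavior, not
// new API): a plain finish composite excludes columns equal to endValue, so
// the code rebuilds the value component with
// ComponentEquality.GREATER_THAN_EQUAL, which makes the finish sort just
// past every composite whose value component equals endValue. That is what
// lets IndexTest's query from "san francisco" to "san francisco" return one
// row with inclusive=true and none with inclusive=false.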
507 |     /**
508 |      * Adds the item to collection.
509 |      *
510 |      * @param <CK>
511 |      *            the container's key type
512 |      * @param <IK>
513 |      *            the item's key type
514 |      * @param ko
515 |      *            the keyspace operator
516 |      * @param container
517 |      *            the ContainerCollection (container key and collection name)
518 |      * @param itemKey
519 |      *            the item's row key
520 |      * @param cf
521 |      *            the column families to use
522 |      * @param containerKeySerializer
523 |      *            the container key serializer
524 |      * @param itemKeySerializer
525 |      *            the item key serializer
526 |      */
527 |     public static <CK, IK> void addItemToCollection(Keyspace ko,
528 |             ContainerCollection<CK> container, IK itemKey, CollectionCFSet cf,
529 |             Serializer<IK> itemKeySerializer) {
530 | 
531 |         createMutator(ko, se).insert(
532 |                 container.getKey(),
533 |                 cf.getItems(),
534 |                 createColumn(itemKey, HFactory.createClock(),
535 |                         itemKeySerializer, le));
536 | 
537 |     }
538 | 
539 |     public static <CK, IK> List<IK> getItemsInCollection(Keyspace ko,
540 |             ContainerCollection<CK> container, CollectionCFSet cf,
541 |             Serializer<IK> itemKeySerializer) {
542 |         List<IK> keys = new ArrayList<IK>();
543 |         SliceQuery<String, IK, ByteBuffer> q = createSliceQuery(ko, se,
544 |                 itemKeySerializer, be);
545 |         q.setColumnFamily(cf.getItems());
546 |         q.setKey(container.getKey());
547 |         q.setRange(null, null, false, ALL_COUNT);
548 |         QueryResult<ColumnSlice<IK, ByteBuffer>> r = q.execute();
549 |         ColumnSlice<IK, ByteBuffer> slice = r.get();
550 |         List<HColumn<IK, ByteBuffer>> results = slice.getColumns();
551 |         for (HColumn<IK, ByteBuffer> column : results) {
552 |             keys.add(column.getName());
553 |         }
554 |         return keys;
555 |     }
556 | 
557 |     @SuppressWarnings("unchecked")
558 |     public static <T, K> T getAsType(K obj, Serializer<T> st) {
559 |         Serializer<K> so = SerializerTypeInferer.getSerializer(obj);
560 |         if (so == null) {
561 |             return null;
562 |         }
563 |         if (so.getClass().equals(st.getClass())) {
564 |             return (T) obj;
565 |         }
566 |         return st.fromByteBuffer(so.toByteBuffer(obj));
567 |     }
568 | 
569 |     /**
570 |      * CollectionCFSet contains the names of the four column families needed to
571 |      * implement indexed collections. Default CF names are provided, but can be
572 |      * anything that makes sense for the application.
573 |      */
574 |     public static class CollectionCFSet {
575 | 
576 |         private String item = DEFAULT_ITEM_CF;
577 |         private String items = DEFAULT_COLLECTION_CF;
578 |         private String index = DEFAULT_COLLECTION_INDEX_CF;
579 |         private String entries = DEFAULT_ITEM_INDEX_ENTRIES;
580 | 
581 |         public CollectionCFSet() {
582 |         }
583 | 
584 |         public CollectionCFSet(String item, String items, String index,
585 |                 String entries) {
586 |             this.item = item;
587 |             this.items = items;
588 |             this.index = index;
589 |             this.entries = entries;
590 |         }
591 | 
592 |         public String getItem() {
593 |             return item;
594 |         }
595 | 
596 |         public void setItem(String item) {
597 |             this.item = item;
598 |         }
599 | 
600 |         public String getItems() {
601 |             return items;
602 |         }
603 | 
604 |         public void setItems(String items) {
605 |             this.items = items;
606 |         }
607 | 
608 |         public String getIndex() {
609 |             return index;
610 |         }
611 | 
612 |         public void setIndex(String index) {
613 |             this.index = index;
614 |         }
615 | 
616 |         public String getEntries() {
617 |             return entries;
618 |         }
619 | 
620 |         public void setEntries(String entries) {
621 |             this.entries = entries;
622 |         }
623 |     }
624 | 
625 |     /**
626 |      * ContainerCollection represents the containing entity's key and collection
627 |      * name. The assumption is that an entity can have multiple collections,
628 |      * each with its own name.
629 |      *
630 |      * @param <CK>
631 |      *            the container's row key type
632 |      */
633 |     public static class ContainerCollection<CK> {
634 |         private CK ownerKey;
635 |         private String collectionName;
636 | 
637 |         public ContainerCollection(CK ownerKey, String collectionName) {
638 |             this.ownerKey = ownerKey;
639 |             this.collectionName = collectionName;
640 |         }
641 | 
642 |         public CK getOwnerKey() {
643 |             return ownerKey;
644 |         }
645 | 
646 |         public void setOwnerKey(CK ownerKey) {
647 |             this.ownerKey = ownerKey;
648 |         }
649 | 
650 |         public String getCollectionName() {
651 |             return collectionName;
652 |         }
653 | 
654 |         public void setCollectionName(String collectionName) {
655 |             this.collectionName = collectionName;
656 |         }
657 | 
658 |         public String getKey() {
659 |             return ownerKey + ":" + collectionName;
660 |         }
661 | 
662 |         @Override
663 |         public int hashCode() {
664 |             final int prime = 31;
665 |             int result = 1;
666 |             result = prime
667 |                     * result
668 |                     + ((collectionName == null) ? 0 : collectionName.hashCode());
669 |             result = prime * result
670 |                     + ((ownerKey == null) ? 0 : ownerKey.hashCode());
671 |             return result;
672 |         }
673 | 
674 |         @Override
675 |         public boolean equals(Object obj) {
676 |             if (this == obj) {
677 |                 return true;
678 |             }
679 |             if (obj == null) {
680 |                 return false;
681 |             }
682 |             if (getClass() != obj.getClass()) {
683 |                 return false;
684 |             }
685 |             @SuppressWarnings("rawtypes")
686 |             ContainerCollection other = (ContainerCollection) obj;
687 |             if (collectionName == null) {
688 |                 if (other.collectionName != null) {
689 |                     return false;
690 |                 }
691 |             } else if (!collectionName.equals(other.collectionName)) {
692 |                 return false;
693 |             }
694 |             if (ownerKey == null) {
695 |                 if (other.ownerKey != null) {
696 |                     return false;
697 |                 }
698 |             } else if (!ownerKey.equals(other.ownerKey)) {
699 |                 return false;
700 |             }
701 |             return true;
702 |         }
703 |     }
704 | }
705 | 
--------------------------------------------------------------------------------
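The defaultCFSet used throughout is simply a CollectionCFSet holding the four default column family names. If an application needs different names, the four-argument constructor covers it; a small sketch (the names here are illustrative, and the column families must still be created with the comparators shown in IndexTest.setupColumnFamilies):

    CollectionCFSet cfs = new CollectionCFSet(
            "Users",             // item CF (BytesType)
            "UserCollections",   // collection CF (TimeUUIDType)
            "UserIndexes",       // collection index CF (DynamicCompositeType)
            "UserIndexEntries"); // item index entries CF (DynamicCompositeType)

    IndexedCollections.addItemToCollection(ko, container, itemKey, cfs,
            IndexedCollections.ue);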