├── .gitignore
├── .travis.yml
├── src
├── main
│ └── java
│ │ └── com
│ │ └── basho
│ │ └── riak
│ │ └── hadoop
│ │ ├── config
│ │ ├── RiakTransport.java
│ │ ├── NoRiakLocationsException.java
│ │ ├── RiakPBLocation.java
│ │ ├── RiakHTTPLocation.java
│ │ ├── RiakLocation.java
│ │ ├── ClientFactory.java
│ │ └── RiakConfig.java
│ │ ├── keylisters
│ │ ├── KeyLister.java
│ │ ├── BucketKeyLister.java
│ │ ├── RiakSearchKeyLister.java
│ │ ├── KeysKeyLister.java
│ │ └── SecondaryIndexesKeyLister.java
│ │ ├── RiakOutputCommitter.java
│ │ ├── RiakRecordWriter.java
│ │ ├── RiakOutputFormat.java
│ │ ├── RiakMapper.java
│ │ ├── RiakRecordReader.java
│ │ ├── BucketKey.java
│ │ ├── RiakInputSplit.java
│ │ └── RiakInputFormat.java
└── test
│ └── java
│ └── com
│ └── basho
│ └── riak
│ └── hadoop
│ ├── config
│ ├── ClientFactoryTest.java
│ └── RiakConfigTest.java
│ ├── BucketKeyTest.java
│ ├── keylisters
│ ├── KeysKeyListerTest.java
│ ├── RiakSearchKeyListerTest.java
│ ├── BucketKeyListerTest.java
│ └── SecondaryIndexesKeyListerTest.java
│ └── RiakInputFormatTest.java
├── README.org
└── pom.xml
/.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | .settings/ 4 | bin/ 5 | lib/ 6 | target/ 7 |
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | notifications: 3 | email: clients@basho.com 4 |
-------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/config/RiakTransport.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop.config; 15 | 16 | /** 17 | * Simple enum of available Riak transports 18 | * 19 | * @author russell 20 | * 21 | */ 22 | public enum RiakTransport { 23 | HTTP, PB; 24 | } 25 |
-------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/config/NoRiakLocationsException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License.
13 | */ 14 | package com.basho.riak.hadoop.config; 15 | 16 | import java.io.IOException; 17 | 18 | import com.basho.riak.hadoop.RiakInputFormat; 19 | 20 | /** 21 | * Tag exception for hadoop config where no {@link RiakLocation}s have been 22 | * provided to the {@link RiakInputFormat} 23 | * 24 | * @author russell 25 | * 26 | */ 27 | public class NoRiakLocationsException extends IOException { 28 | 29 | /** 30 | * Eclipse generated 31 | */ 32 | private static final long serialVersionUID = -4095183778220854984L; 33 | 34 | } 35 |
-------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/config/RiakPBLocation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop.config; 15 | 16 | /** 17 | * Models the PB interface endpoint's location 18 | * 19 | * @author russell 20 | * 21 | */ 22 | public class RiakPBLocation extends RiakLocation { 23 | 24 | /** 25 | * @param host 26 | * @param port 27 | */ 28 | public RiakPBLocation(String host, int port) { 29 | super(RiakTransport.PB, host, port); 30 | } 31 | 32 | /* 33 | * (non-Javadoc) 34 | * 35 | * @see com.basho.riak.hadoop.RiakLocation#asString() 36 | */ 37 | @Override public String asString() { 38 | return new StringBuilder(getHost()).append(":").append(getPort()).toString(); 39 | } 40 | 41 | } 42 |
-------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | * Riak-Hadoop 2 | 3 | [[http://travis-ci.org/basho/riak-hadoop][Travis-CI]] :: [[https://secure.travis-ci.org/basho/riak-hadoop.png]] 4 | 5 | Riak-Hadoop is a library for using [[http://basho.com/products/riak-overview/][Riak]] as an input/output to [[http://hadoop.apache.org/mapreduce/][Hadoop 6 | Map/Reduce]]. 7 | 8 | *NOTE* This library is not yet officially supported by Basho and is 9 | strictly experimental. 10 | 11 | ** How it works 12 | The library extends =InputFormat=, =InputSplit=, =RecordReader=, 13 | =OutputFormat= and =RecordWriter=, so you can declare any valid Riak 14 | M/R input (2i query, riak search query, list of keys, bucket) as input 15 | to a Hadoop M/R job. The library will split the keys into partitions 16 | and Hadoop will use the =RiakRecordReader= to load Key/Value pairs 17 | from Riak for the =Mapper= tasks. The =Reducer= output is written back 18 | to a configured bucket in Riak. It uses the [[https://github.com/basho/riak-java-client/][Riak-Java-Client]] to talk 19 | to Riak. You just write a normal Hadoop Map/Reduce job, but declare 20 | =RiakInputFormat= and =RiakOutputFormat= as sources/sinks for your 21 | data/results. 22 | 23 | ** Example? 24 | Have a look at the 25 | [[https://github.com/russelldb/riak-hadoop-wordcount][Riak Word Count]] example project to get started with Riak-Hadoop.
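
For orientation, the sketch below shows roughly what a job driver looks like
when Riak is used as both source and sink. It is illustrative only:
=MyMapper=, =MyReducer= and the =RiakConfig.setOutputBucket= call are assumed
names, not part of this repository; the real, working code lives in the word
count project linked above.

#+BEGIN_SRC java
// Illustrative driver sketch only -- MyMapper, MyReducer and
// RiakConfig.setOutputBucket(...) are assumed names, not part of this repo.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

import com.basho.riak.hadoop.RiakInputFormat;
import com.basho.riak.hadoop.RiakOutputFormat;
import com.basho.riak.hadoop.config.RiakConfig;
import com.basho.riak.hadoop.config.RiakPBLocation;
import com.basho.riak.hadoop.keylisters.BucketKeyLister;

public class ExampleJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Tell the job where Riak lives (add one location per node, PB or HTTP)
        conf = RiakConfig.addLocation(conf, new RiakPBLocation("127.0.0.1", 8087));

        // Choose the input keys: a whole bucket here, but any KeyLister works
        // (KeysKeyLister, RiakSearchKeyLister, SecondaryIndexesKeyLister)
        conf = RiakConfig.setKeyLister(conf, new BucketKeyLister("input-bucket"));

        // Bucket the reducer output is written to (setter name assumed)
        conf = RiakConfig.setOutputBucket(conf, "output-bucket");

        Job job = new Job(conf, "riak-hadoop-example");
        job.setJarByClass(ExampleJob.class);
        job.setMapperClass(MyMapper.class);   // e.g. extends RiakMapper<MyType, Text, LongWritable>
        job.setReducerClass(MyReducer.class);
        job.setInputFormatClass(RiakInputFormat.class);
        job.setOutputFormatClass(RiakOutputFormat.class);
        job.setOutputKeyClass(Text.class);    // RiakRecordWriter stores values under Text keys
        job.setOutputValueClass(LongWritable.class);
        job.waitForCompletion(true);
    }
}
#+END_SRC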
26 | 27 | ** Future 28 | I plan to have the library load bulk data from Riak to HDFS to better 29 | leverage Hadoop's integration with that file system. There will be 30 | docs. 31 | 32 | ** Feedback 33 | Raise issues, pull requests, comments etc. as ever through GitHub, or 34 | [[http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com][the Riak mailing list]]. 35 |
-------------------------------------------------------------------------------- /src/test/java/com/basho/riak/hadoop/config/ClientFactoryTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop.config; 15 | 16 | import org.apache.hadoop.conf.Configuration; 17 | import org.junit.Before; 18 | import org.junit.Test; 19 | 20 | import com.basho.riak.client.RiakException; 21 | 22 | /** 23 | * @author russell 24 | * 25 | */ 26 | public class ClientFactoryTest { 27 | 28 | /** 29 | * @throws java.lang.Exception 30 | */ 31 | @Before public void setUp() throws Exception {} 32 | 33 | @Test(expected = IllegalArgumentException.class) public void getClusterClient_die() throws RiakException { 34 | Configuration conf = new Configuration(); 35 | 36 | conf = RiakConfig.addLocation(conf, new RiakPBLocation("33.33.33.12", 8087)); 37 | conf = RiakConfig.addLocation(conf, new RiakPBLocation("33.33.33.13", 8087)); 38 | conf = RiakConfig.addLocation(conf, new RiakHTTPLocation("33.33.33.10", 8098, "riak")); 39 | conf = RiakConfig.addLocation(conf, new RiakHTTPLocation("33.33.33.11", 8098, "riak")); 40 | 41 | ClientFactory.clusterClient(RiakConfig.getRiakLocatons(conf)); 42 | } 43 | } 44 |
-------------------------------------------------------------------------------- /pom.xml: --------------------------------------------------------------------------------
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.basho.riak.hadoop</groupId>
  <artifactId>riak-hadoop</artifactId>
  <version>0.2-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>riak-hadoop</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <repositories>
    <repository>
      <id>com.basho.riak</id>
      <url>https://oss.sonatype.org/content/repositories/combashoriak-168</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>com.basho.riak</groupId>
      <artifactId>riak-client</artifactId>
      <version>1.0.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-core</artifactId>
      <version>0.20.203.0</version>
    </dependency>
    <dependency>
      <groupId>org.mockito</groupId>
      <artifactId>mockito-all</artifactId>
      <version>1.8.0</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.4</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.5</source>
          <target>1.5</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
-------------------------------------------------------------------------------- /src/test/java/com/basho/riak/hadoop/BucketKeyTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop; 15 | 16 | import static org.junit.Assert.*; 17 | 18 | import java.util.Collection; 19 | 20 | import org.codehaus.jackson.map.ObjectMapper; 21 | import org.codehaus.jackson.map.type.TypeFactory; 22 | import org.junit.Before; 23 | import org.junit.Test; 24 | 25 | /** 26 | * Tests that the object mapper can turn [["b", "k"], ["b", "k1"]] into a 27 | * Collection of {@link BucketKey} 28 | * 29 | * @author russell 30 | * 31 | */ 32 | public class BucketKeyTest { 33 | 34 | /** 35 | * @throws java.lang.Exception 36 | */ 37 | @Before public void setUp() throws Exception {} 38 | 39 | /** 40 | * Test method for 41 | * {@link com.basho.riak.hadoop.BucketKey#BucketKey(java.lang.String[])}. 42 | */ 43 | @Test public void bucketKeyFromReduceIdentity() throws Exception { 44 | final String mrOut = "[[\"indexed\",\"qbert\"],[\"indexed\",\"bert\"]]"; 45 | 46 | Collection bks = new ObjectMapper().readValue(mrOut, 47 | TypeFactory.collectionType(Collection.class,BucketKey.class)); 48 | 49 | assertEquals(2, bks.size()); 50 | 51 | assertTrue(bks.contains(new BucketKey("indexed", "qbert"))); 52 | assertTrue(bks.contains(new BucketKey("indexed", "bert"))); 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/config/RiakHTTPLocation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | package com.basho.riak.hadoop.config; 15 | 16 | /** 17 | * Holder for a Riak HTTP interface location 18 | * 19 | * @author russell 20 | * 21 | */ 22 | public class RiakHTTPLocation extends RiakLocation { 23 | 24 | private final String riakPath; 25 | 26 | /** 27 | * Create an HTTP location 28 | * 29 | * @param host 30 | * the host 31 | * @param port 32 | * the HTTP port 33 | * @param riakPath 34 | * the path to the 'riak' resource 35 | */ 36 | public RiakHTTPLocation(String host, int port, String riakPath) { 37 | super(RiakTransport.HTTP, host, port); 38 | this.riakPath = riakPath; 39 | } 40 | 41 | /** 42 | * @return the path to the 'riak' resource 43 | */ 44 | public String getRiakPath() { 45 | return riakPath; 46 | } 47 | 48 | /* 49 | * (non-Javadoc) 50 | * 51 | * @see com.basho.riak.hadoop.RiakLocation#asString() 52 | */ 53 | @Override public String asString() { 54 | StringBuilder sb = new StringBuilder("http://"); 55 | sb.append(getHost()).append(":").append(getPort()); 56 | 57 | if (!riakPath.startsWith("/")) { 58 | sb.append("/"); 59 | } 60 | 61 | sb.append(riakPath); 62 | return sb.toString(); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/keylisters/KeyLister.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop.keylisters; 15 | 16 | import java.io.IOException; 17 | import java.util.Collection; 18 | 19 | import com.basho.riak.client.IRiakClient; 20 | import com.basho.riak.client.RiakException; 21 | import com.basho.riak.hadoop.BucketKey; 22 | 23 | /** 24 | * Strategy for obtaining list of keys for splits, {@link KeyLister}s must a 25 | * zero arg constructor. 
26 | * 27 | * @author russell 28 | * 29 | */ 30 | public interface KeyLister { 31 | /** 32 | * Thanks to hadoop's configuration framework a key lister has to 33 | * deserialize and serialize itself this method and init(String) below are a 34 | * light weight way of doing that 35 | * 36 | * @return a String that can be used by the implementations init method to 37 | * reconsitute the state of the lister 38 | * @throws IOException 39 | */ 40 | String getInitString() throws IOException; 41 | 42 | /** 43 | * A string (from a prior call to getInitString) that this instance will use 44 | * to set itself up to list keys 45 | * 46 | * @param initString 47 | * @throws IOException 48 | */ 49 | void init(String initString) throws IOException; 50 | 51 | /** 52 | * Get keys with the given client 53 | * 54 | * @param client 55 | * @return 56 | * @throws RiakException 57 | * @throws {@link IllegalStateException} is init was not called and the 58 | * lister is not set up to get keys 59 | */ 60 | Collection getKeys(IRiakClient client) throws RiakException; 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/RiakOutputCommitter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | package com.basho.riak.hadoop; 15 | 16 | import java.io.IOException; 17 | 18 | import org.apache.hadoop.mapreduce.JobContext; 19 | import org.apache.hadoop.mapreduce.OutputCommitter; 20 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 21 | 22 | /** 23 | * A NO-OP output committer 24 | * 25 | * @author russell 26 | * 27 | */ 28 | public class RiakOutputCommitter extends OutputCommitter { 29 | 30 | /* (non-Javadoc) 31 | * @see org.apache.hadoop.mapreduce.OutputCommitter#abortTask(org.apache.hadoop.mapreduce.TaskAttemptContext) 32 | */ 33 | @Override public void abortTask(TaskAttemptContext tac) throws IOException {} 34 | 35 | /* (non-Javadoc) 36 | * @see org.apache.hadoop.mapreduce.OutputCommitter#commitTask(org.apache.hadoop.mapreduce.TaskAttemptContext) 37 | */ 38 | @Override public void commitTask(TaskAttemptContext tac) throws IOException {} 39 | 40 | /* (non-Javadoc) 41 | * @see org.apache.hadoop.mapreduce.OutputCommitter#needsTaskCommit(org.apache.hadoop.mapreduce.TaskAttemptContext) 42 | */ 43 | @Override public boolean needsTaskCommit(TaskAttemptContext tac) throws IOException { 44 | return false; 45 | } 46 | 47 | /* (non-Javadoc) 48 | * @see org.apache.hadoop.mapreduce.OutputCommitter#setupJob(org.apache.hadoop.mapreduce.JobContext) 49 | */ 50 | @Override public void setupJob(JobContext jc) throws IOException {} 51 | 52 | /* (non-Javadoc) 53 | * @see org.apache.hadoop.mapreduce.OutputCommitter#setupTask(org.apache.hadoop.mapreduce.TaskAttemptContext) 54 | */ 55 | @Override public void setupTask(TaskAttemptContext tac) throws IOException {} 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/RiakRecordWriter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | package com.basho.riak.hadoop; 15 | 16 | import java.io.IOException; 17 | 18 | import org.apache.hadoop.conf.Configuration; 19 | import org.apache.hadoop.io.Text; 20 | import org.apache.hadoop.mapreduce.RecordWriter; 21 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 22 | 23 | import com.basho.riak.client.IRiakClient; 24 | import com.basho.riak.client.RiakException; 25 | import com.basho.riak.client.bucket.Bucket; 26 | import com.basho.riak.hadoop.config.ClientFactory; 27 | import com.basho.riak.hadoop.config.RiakConfig; 28 | 29 | /** 30 | * Writes reducer results to Riak 31 | * 32 | * @author russell 33 | * @param 34 | * 35 | */ 36 | public class RiakRecordWriter extends RecordWriter { 37 | 38 | private final Bucket bucket; 39 | 40 | RiakRecordWriter(TaskAttemptContext tac) throws RiakException { 41 | Configuration conf = tac.getConfiguration(); 42 | IRiakClient client = ClientFactory.clusterClient(RiakConfig.getRiakLocatons(conf)); 43 | bucket = client.fetchBucket(RiakConfig.getOutputBucket(conf)).execute(); 44 | } 45 | 46 | /* 47 | * (non-Javadoc) 48 | * 49 | * @see 50 | * org.apache.hadoop.mapreduce.RecordWriter#close(org.apache.hadoop.mapreduce 51 | * .TaskAttemptContext) 52 | */ 53 | @Override public void close(TaskAttemptContext tac) throws IOException, InterruptedException { 54 | // NO-OP 55 | } 56 | 57 | /* 58 | * (non-Javadoc) 59 | * 60 | * @see org.apache.hadoop.mapreduce.RecordWriter#write(java.lang.Object, 61 | * java.lang.Object) 62 | */ 63 | @Override public void write(Text key, V value) throws IOException, InterruptedException { 64 | try { 65 | bucket.store(key.toString(), value).execute(); 66 | } catch (RiakException e) { 67 | throw new IOException(e); 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/RiakOutputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop; 15 | 16 | import java.io.IOException; 17 | 18 | import org.apache.hadoop.io.Text; 19 | import org.apache.hadoop.mapreduce.JobContext; 20 | import org.apache.hadoop.mapreduce.OutputCommitter; 21 | import org.apache.hadoop.mapreduce.OutputFormat; 22 | import org.apache.hadoop.mapreduce.RecordWriter; 23 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 24 | 25 | import com.basho.riak.client.RiakException; 26 | 27 | /** 28 | * Riak specific {@link OutputFormat}, just creates a {@link RiakRecordWriter} 29 | * 30 | * @author russell 31 | * 32 | */ 33 | public class RiakOutputFormat extends OutputFormat { 34 | 35 | /* 36 | * (non-Javadoc) 37 | * 38 | * @see 39 | * org.apache.hadoop.mapreduce.OutputFormat#checkOutputSpecs(org.apache. 
40 | * hadoop.mapreduce.JobContext) 41 | */ 42 | @Override public void checkOutputSpecs(JobContext ctx) throws IOException, InterruptedException {} 43 | 44 | /* 45 | * (non-Javadoc) 46 | * 47 | * @see 48 | * org.apache.hadoop.mapreduce.OutputFormat#getOutputCommitter(org.apache 49 | * .hadoop.mapreduce.TaskAttemptContext) 50 | */ 51 | @Override public OutputCommitter getOutputCommitter(TaskAttemptContext tac) throws IOException, 52 | InterruptedException { 53 | return new RiakOutputCommitter(); 54 | } 55 | 56 | /* 57 | * (non-Javadoc) 58 | * 59 | * @see 60 | * org.apache.hadoop.mapreduce.OutputFormat#getRecordWriter(org.apache.hadoop 61 | * .mapreduce.TaskAttemptContext) 62 | */ 63 | @Override public RecordWriter getRecordWriter(TaskAttemptContext tac) throws IOException, 64 | InterruptedException { 65 | try { 66 | return new RiakRecordWriter(tac); 67 | } catch (RiakException e) { 68 | throw new IOException(e); 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/test/java/com/basho/riak/hadoop/keylisters/KeysKeyListerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop.keylisters; 15 | 16 | import static org.junit.Assert.assertEquals; 17 | import static org.junit.Assert.fail; 18 | 19 | import java.util.Arrays; 20 | import java.util.HashSet; 21 | import java.util.List; 22 | import java.util.Set; 23 | 24 | import org.junit.Test; 25 | 26 | import com.basho.riak.hadoop.BucketKey; 27 | import com.basho.riak.hadoop.keylisters.KeysKeyLister; 28 | 29 | /** 30 | * @author russell 31 | * 32 | */ 33 | public class KeysKeyListerTest { 34 | 35 | private static final String BUCKET_NAME = "bucket"; 36 | 37 | private KeysKeyLister lister; 38 | 39 | /** 40 | * Test method for 41 | * {@link com.basho.riak.hadoop.keylisters.KeysKeyLister#KeysKeyLister(java.util.List)} 42 | * . 43 | */ 44 | @Test public void createWithKeys() throws Exception { 45 | Set keys = new HashSet(Arrays.asList(new BucketKey(BUCKET_NAME, "k1"), new BucketKey(BUCKET_NAME, 46 | "k2s"))); 47 | lister = new KeysKeyLister(keys); 48 | assertEquals(keys, lister.getKeys(null)); 49 | } 50 | 51 | /** 52 | * Test method for 53 | * {@link com.basho.riak.hadoop.keylisters.KeysKeyLister#KeysKeyLister(java.util.List, java.lang.String)} 54 | * . 55 | */ 56 | @Test public void createWithKeysAndCommonBucket() throws Exception { 57 | Set keys = new HashSet(Arrays.asList("k1", "k2", "k3", "k4")); 58 | lister = new KeysKeyLister(keys, BUCKET_NAME); 59 | 60 | Set expected = new HashSet(); 61 | for (String k : keys) { 62 | expected.add(new BucketKey(BUCKET_NAME, k)); 63 | } 64 | 65 | assertEquals(expected, lister.getKeys(null)); 66 | } 67 | 68 | /** 69 | * Test method for 70 | * {@link com.basho.riak.hadoop.keylisters.KeysKeyLister#KeysKeyLister()}. 
71 | */ 72 | @Test public void noArgConstructorAndNoInitMeansIllegalState() throws Exception { 73 | lister = new KeysKeyLister(); 74 | 75 | try { 76 | lister.getKeys(null); 77 | fail("Expected IllegalStateException"); 78 | } catch (IllegalStateException e) { 79 | // NO-OP 80 | } 81 | 82 | } 83 | 84 | /** 85 | * Test method for 86 | * {@link com.basho.riak.hadoop.keylisters.KeysKeyLister#getInitString()}. 87 | */ 88 | @Test public void initFromString() throws Exception { 89 | List keys = Arrays.asList(new BucketKey(BUCKET_NAME, "k1"), new BucketKey(BUCKET_NAME, "k2s")); 90 | lister = new KeysKeyLister(keys); 91 | 92 | KeysKeyLister lister2 = new KeysKeyLister(); 93 | lister2.init(lister.getInitString()); 94 | 95 | assertEquals(lister.getKeys(null), lister2.getKeys(null)); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/RiakMapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop; 15 | 16 | import java.io.IOException; 17 | import java.util.ArrayList; 18 | import java.util.Collection; 19 | 20 | import org.apache.hadoop.mapreduce.Mapper; 21 | 22 | import com.basho.riak.client.IRiakObject; 23 | import com.basho.riak.client.cap.ConflictResolver; 24 | import com.basho.riak.client.convert.Converter; 25 | import com.basho.riak.client.raw.RiakResponse; 26 | 27 | /** 28 | * A Riak specific extension of {@link Mapper} that can be used if you wish to 29 | * work with domain specific types and handle sibling values in your 30 | * {@link Mapper#map} method 31 | * 32 | * @author russell 33 | * @param 34 | * the type for the input value 35 | * @param 36 | * the type for the out key 37 | * @param 38 | * the type for the out value 39 | * 40 | */ 41 | public abstract class RiakMapper extends Mapper { 42 | 43 | private final Converter converter; 44 | private final ConflictResolver resolver; 45 | 46 | /** 47 | * Create a {@link Mapper} that will use the provided {@link Converter} and 48 | * {@link ConflictResolver} on the raw {@link RiakResponse} returned by the 49 | * {@link RiakRecordReader} 50 | * 51 | * @param converter 52 | * a {@link Converter} 53 | * @param resolver 54 | * a {@link ConflictResolver} 55 | */ 56 | public RiakMapper(Converter converter, ConflictResolver resolver) { 57 | this.converter = converter; 58 | this.resolver = resolver; 59 | } 60 | 61 | /* 62 | * (non-Javadoc) 63 | * 64 | * @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object, 65 | * java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context) 66 | */ 67 | @Override public void map(BucketKey key, RiakResponse value, Context context) throws IOException, 68 | InterruptedException { 69 | 70 | // convert, conflict resolve 71 | final Collection siblings = new ArrayList(value.numberOfValues()); 72 | 73 | for (IRiakObject o : value) { 74 | 
siblings.add(converter.toDomain(o)); 75 | } 76 | 77 | map(key, resolver.resolve(siblings), context); 78 | } 79 | 80 | /** 81 | * Override this method in your {@link Mapper}. It is called by the default 82 | * {@link Mapper#map} method, after applying the {@link Converter} and 83 | * {@link ConflictResolver}. Put your mapping code here. 84 | * 85 | * @param k 86 | * the {@link BucketKey} 87 | * @param value 88 | * the converted value 89 | * @param context 90 | * the hadoop job Context 91 | * @throws IOException 92 | * @throws InterruptedException 93 | */ 94 | public abstract void map(BucketKey k, T value, Context context) throws IOException, InterruptedException; 95 | 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/RiakRecordReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop; 15 | 16 | import static com.basho.riak.hadoop.config.ClientFactory.getRawClient; 17 | 18 | import java.io.IOException; 19 | import java.util.concurrent.ConcurrentLinkedQueue; 20 | 21 | import org.apache.hadoop.mapreduce.InputSplit; 22 | import org.apache.hadoop.mapreduce.RecordReader; 23 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 24 | 25 | import com.basho.riak.client.raw.RawClient; 26 | import com.basho.riak.client.raw.RiakResponse; 27 | 28 | /** 29 | * Wrapper around a {@link RawClient} for reading values from Riak 30 | * 31 | * @author russell 32 | * 33 | */ 34 | public class RiakRecordReader extends RecordReader { 35 | 36 | private RawClient client; 37 | private ConcurrentLinkedQueue keys; 38 | private long initialSize; 39 | 40 | /* 41 | * (non-Javadoc) 42 | * 43 | * @see org.apache.hadoop.mapreduce.RecordReader#close() 44 | */ 45 | @Override public void close() throws IOException {} 46 | 47 | /* 48 | * (non-Javadoc) 49 | * 50 | * @see org.apache.hadoop.mapreduce.RecordReader#getCurrentKey() 51 | */ 52 | @Override public BucketKey getCurrentKey() throws IOException, InterruptedException { 53 | return keys.peek(); 54 | } 55 | 56 | /* 57 | * (non-Javadoc) 58 | * 59 | * @see org.apache.hadoop.mapreduce.RecordReader#getCurrentValue() 60 | */ 61 | @Override public RiakResponse getCurrentValue() throws IOException, InterruptedException { 62 | BucketKey key = keys.poll(); 63 | return client.fetch(key.getBucket(), key.getKey()); 64 | } 65 | 66 | /* 67 | * (non-Javadoc) 68 | * 69 | * @see org.apache.hadoop.mapreduce.RecordReader#getProgress() 70 | */ 71 | @Override public float getProgress() throws IOException, InterruptedException { 72 | int size = keys.size(); 73 | if (size == 0) { 74 | return 0; 75 | } else { 76 | return size / initialSize; 77 | } 78 | } 79 | 80 | /* 81 | * (non-Javadoc) 82 | * 83 | * @see 84 | * org.apache.hadoop.mapreduce.RecordReader#initialize(org.apache.hadoop 85 | * .mapreduce.InputSplit, 
org.apache.hadoop.mapreduce.TaskAttemptContext) 86 | */ 87 | @Override public void initialize(InputSplit split, TaskAttemptContext taskAttemptContext) throws IOException, 88 | InterruptedException { 89 | RiakInputSplit inputSplit = (RiakInputSplit) split; 90 | keys = new ConcurrentLinkedQueue(inputSplit.getInputs()); 91 | initialSize = split.getLength(); 92 | client = getRawClient(inputSplit.getLocation()); 93 | } 94 | 95 | /* 96 | * (non-Javadoc) 97 | * 98 | * @see org.apache.hadoop.mapreduce.RecordReader#nextKeyValue() 99 | */ 100 | @Override public boolean nextKeyValue() throws IOException, InterruptedException { 101 | return keys.peek() != null; 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/BucketKey.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop; 15 | 16 | import org.codehaus.jackson.annotate.JsonCreator; 17 | 18 | /** 19 | * Models a bucket/key location in Riak. 20 | * 21 | * @author russell 22 | * 23 | */ 24 | public class BucketKey { 25 | 26 | private final String bucket; 27 | private final String key; 28 | 29 | /** 30 | * Provide a JSON constructor for Jackson. 31 | * 32 | * @param bucketKey 33 | * a String[2] where [0] is the bucket and [1] is the key 34 | */ 35 | @JsonCreator public BucketKey(String[] bucketKey) { 36 | if (bucketKey == null || bucketKey.length != 2) { 37 | throw new IllegalArgumentException("bucketKey must be a String[] of length 2"); 38 | } 39 | 40 | this.bucket = bucketKey[0]; 41 | this.key = bucketKey[1]; 42 | } 43 | 44 | /** 45 | * Default constructor 46 | * 47 | * @param bucket 48 | * the bucket 49 | * @param key 50 | * the key 51 | */ 52 | public BucketKey(String bucket, String key) { 53 | this.bucket = bucket; 54 | this.key = key; 55 | } 56 | 57 | /** 58 | * @return the bucket 59 | */ 60 | public String getBucket() { 61 | return bucket; 62 | } 63 | 64 | /** 65 | * @return the key 66 | */ 67 | public String getKey() { 68 | return key; 69 | } 70 | 71 | /* 72 | * (non-Javadoc) 73 | * 74 | * @see java.lang.Object#hashCode() 75 | */ 76 | @Override public int hashCode() { 77 | final int prime = 31; 78 | int result = 1; 79 | result = prime * result + ((bucket == null) ? 0 : bucket.hashCode()); 80 | result = prime * result + ((key == null) ? 
0 : key.hashCode()); 81 | return result; 82 | } 83 | 84 | /* 85 | * (non-Javadoc) 86 | * 87 | * @see java.lang.Object#equals(java.lang.Object) 88 | */ 89 | @Override public boolean equals(Object obj) { 90 | if (this == obj) { 91 | return true; 92 | } 93 | if (obj == null) { 94 | return false; 95 | } 96 | if (!(obj instanceof BucketKey)) { 97 | return false; 98 | } 99 | BucketKey other = (BucketKey) obj; 100 | if (bucket == null) { 101 | if (other.bucket != null) { 102 | return false; 103 | } 104 | } else if (!bucket.equals(other.bucket)) { 105 | return false; 106 | } 107 | if (key == null) { 108 | if (other.key != null) { 109 | return false; 110 | } 111 | } else if (!key.equals(other.key)) { 112 | return false; 113 | } 114 | return true; 115 | } 116 | 117 | /* 118 | * (non-Javadoc) 119 | * 120 | * @see java.lang.Object#toString() 121 | */ 122 | @Override public String toString() { 123 | return String.format("BucketKey [bucket=%s, key=%s]", bucket, key); 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/keylisters/BucketKeyLister.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop.keylisters; 15 | 16 | import java.util.ArrayList; 17 | import java.util.Collection; 18 | import java.util.List; 19 | 20 | import com.basho.riak.client.IRiakClient; 21 | import com.basho.riak.client.RiakException; 22 | import com.basho.riak.client.bucket.Bucket; 23 | import com.basho.riak.hadoop.BucketKey; 24 | 25 | /** 26 | * A full list buckets key lister. DANGER, not advised for production use. 
27 | * 28 | * @author russell 29 | * 30 | */ 31 | public class BucketKeyLister implements KeyLister { 32 | 33 | private static final String EMPTY = ""; 34 | private String bucket; 35 | 36 | /** 37 | * no arg CTOR for de-serialization 38 | */ 39 | public BucketKeyLister() {} 40 | 41 | /** 42 | * @param bucket 43 | */ 44 | public BucketKeyLister(String bucket) { 45 | this.bucket = bucket; 46 | } 47 | 48 | /* 49 | * (non-Javadoc) 50 | * 51 | * @see com.basho.riak.hadoop.KeyLister#getKeys() 52 | */ 53 | public Collection getKeys(IRiakClient client) throws RiakException { 54 | if (bucket == null || bucket.trim().equals(EMPTY)) { 55 | throw new IllegalStateException("bucket cannot be null or empty"); 56 | } 57 | 58 | List keys = new ArrayList(); 59 | Bucket b = client.fetchBucket(bucket).execute(); 60 | 61 | for (String key : b.keys()) { 62 | keys.add(new BucketKey(bucket, key)); 63 | 64 | } 65 | return keys; 66 | } 67 | 68 | /* 69 | * (non-Javadoc) 70 | * 71 | * @see com.basho.riak.hadoop.KeyLister#init(java.lang.String) 72 | */ 73 | public void init(String bucket) { 74 | this.bucket = bucket; 75 | } 76 | 77 | /* 78 | * (non-Javadoc) 79 | * 80 | * @see com.basho.riak.hadoop.KeyLister#getInitString() 81 | */ 82 | public String getInitString() { 83 | return bucket; 84 | } 85 | 86 | /* 87 | * (non-Javadoc) 88 | * 89 | * @see java.lang.Object#hashCode() 90 | */ 91 | @Override public int hashCode() { 92 | final int prime = 31; 93 | int result = 1; 94 | result = prime * result + ((bucket == null) ? 0 : bucket.hashCode()); 95 | return result; 96 | } 97 | 98 | /* 99 | * (non-Javadoc) 100 | * 101 | * @see java.lang.Object#equals(java.lang.Object) 102 | */ 103 | @Override public boolean equals(Object obj) { 104 | if (this == obj) { 105 | return true; 106 | } 107 | if (obj == null) { 108 | return false; 109 | } 110 | if (!(obj instanceof BucketKeyLister)) { 111 | return false; 112 | } 113 | BucketKeyLister other = (BucketKeyLister) obj; 114 | if (bucket == null) { 115 | if (other.bucket != null) { 116 | return false; 117 | } 118 | } else if (!bucket.equals(other.bucket)) { 119 | return false; 120 | } 121 | return true; 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/test/java/com/basho/riak/hadoop/keylisters/RiakSearchKeyListerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | package com.basho.riak.hadoop.keylisters; 15 | 16 | import static org.mockito.Mockito.*; 17 | import static org.junit.Assert.*; 18 | 19 | import java.util.Arrays; 20 | import java.util.Collection; 21 | 22 | import org.junit.Before; 23 | import org.junit.Test; 24 | import org.mockito.Mock; 25 | import org.mockito.MockitoAnnotations; 26 | 27 | import com.basho.riak.client.IRiakClient; 28 | import com.basho.riak.client.query.MapReduceResult; 29 | import com.basho.riak.client.query.SearchMapReduce; 30 | import com.basho.riak.client.query.functions.Args; 31 | import com.basho.riak.client.query.functions.NamedErlangFunction; 32 | import com.basho.riak.hadoop.BucketKey; 33 | import com.basho.riak.hadoop.keylisters.RiakSearchKeyLister; 34 | 35 | /** 36 | * @author russell 37 | * 38 | */ 39 | public class RiakSearchKeyListerTest { 40 | 41 | private static final String BUCKET = "bucket"; 42 | private static final String QUERY = "foo:zero"; 43 | 44 | @Mock private IRiakClient riakClient; 45 | @Mock private SearchMapReduce searchMapReduce; 46 | @Mock private MapReduceResult mapReduceResult; 47 | 48 | private RiakSearchKeyLister lister; 49 | 50 | /** 51 | * @throws java.lang.Exception 52 | */ 53 | @Before public void setUp() throws Exception { 54 | MockitoAnnotations.initMocks(this); 55 | } 56 | 57 | /** 58 | * Test method for 59 | * {@link com.basho.riak.hadoop.keylisters.RiakSearchKeyLister#RiakSearchKeyLister(java.lang.String, java.lang.String)} 60 | * . 61 | */ 62 | @Test public void createWithBucketAndQuery() throws Exception { 63 | lister = new RiakSearchKeyLister(BUCKET, QUERY); 64 | testLister(lister); 65 | } 66 | 67 | private void testLister(RiakSearchKeyLister lister) throws Exception { 68 | final Collection expected = Arrays.asList(new BucketKey(BUCKET, "k1"), new BucketKey(BUCKET, "k2")); 69 | 70 | when(riakClient.mapReduce(BUCKET, QUERY)).thenReturn(searchMapReduce); 71 | when(searchMapReduce.addReducePhase(NamedErlangFunction.REDUCE_IDENTITY, Args.REDUCE_PHASE_ONLY_1)).thenReturn(searchMapReduce); 72 | when(searchMapReduce.execute()).thenReturn(mapReduceResult); 73 | when(mapReduceResult.getResult(BucketKey.class)).thenReturn(expected); 74 | 75 | final Collection actual = lister.getKeys(riakClient); 76 | assertEquals(expected, actual); 77 | } 78 | 79 | /** 80 | * Test method for 81 | * {@link com.basho.riak.hadoop.keylisters.RiakSearchKeyLister#RiakSearchKeyLister()}. 82 | */ 83 | @Test public void emptyListerIllegalState() throws Exception { 84 | lister = new RiakSearchKeyLister(); 85 | 86 | try { 87 | lister.getKeys(riakClient); 88 | fail("Expected IllegalStateException"); 89 | } catch (IllegalStateException e) { 90 | // NO-OP 91 | } 92 | } 93 | 94 | /** 95 | * Test method for 96 | * {@link com.basho.riak.hadoop.keylisters.RiakSearchKeyLister#getInitString()}. 
97 | */ 98 | @Test public void getInitString() throws Exception { 99 | lister = new RiakSearchKeyLister(BUCKET, QUERY); 100 | 101 | String initString = lister.getInitString(); 102 | 103 | RiakSearchKeyLister listerToo = new RiakSearchKeyLister(); 104 | listerToo.init(initString); 105 | 106 | testLister(listerToo); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/test/java/com/basho/riak/hadoop/RiakInputFormatTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop; 15 | 16 | import static org.junit.Assert.assertEquals; 17 | import static org.junit.Assert.fail; 18 | import static org.mockito.Mockito.when; 19 | 20 | import java.util.LinkedList; 21 | import java.util.List; 22 | 23 | import org.apache.hadoop.conf.Configuration; 24 | import org.apache.hadoop.mapreduce.InputSplit; 25 | import org.apache.hadoop.mapreduce.JobContext; 26 | import org.junit.Before; 27 | import org.junit.Test; 28 | import org.mockito.Mock; 29 | import org.mockito.MockitoAnnotations; 30 | 31 | import com.basho.riak.hadoop.config.NoRiakLocationsException; 32 | import com.basho.riak.hadoop.config.RiakLocation; 33 | import com.basho.riak.hadoop.config.RiakPBLocation; 34 | 35 | /** 36 | * @author russell 37 | * 38 | */ 39 | public class RiakInputFormatTest { 40 | 41 | private static final String BUCKET = "bucket"; 42 | private static final String KEY = "key"; 43 | 44 | @Mock public JobContext jobContext; 45 | 46 | private RiakInputFormat inputFormat; 47 | 48 | /** 49 | * @throws java.lang.Exception 50 | */ 51 | @Before public void setUp() throws Exception { 52 | MockitoAnnotations.initMocks(this); 53 | inputFormat = new RiakInputFormat(); 54 | } 55 | 56 | /** 57 | * Test method for 58 | * {@link com.basho.riak.hadoop.RiakInputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)} 59 | * . 
60 | */ 61 | @Test public void getSplits_noLocations() throws Exception { 62 | Configuration conf = new Configuration(); 63 | when(jobContext.getConfiguration()).thenReturn(conf); 64 | try { 65 | inputFormat.getSplits(jobContext); 66 | fail("Expected IOException"); 67 | } catch (NoRiakLocationsException e) { 68 | // NO-OP 69 | } 70 | } 71 | 72 | @Test public void getSplitSize() { 73 | assertEquals(10, RiakInputFormat.getSplitSize(10, 4)); 74 | assertEquals(20, RiakInputFormat.getSplitSize(800, 4)); 75 | assertEquals(2500, RiakInputFormat.getSplitSize(100000, 4)); 76 | } 77 | 78 | @Test public void getSplits() throws Exception { 79 | final List bks = new LinkedList(); 80 | for (int i = 0; i < 100001; i++) { 81 | bks.add(new BucketKey(BUCKET, KEY + i)); 82 | } 83 | 84 | RiakLocation[] locations = new RiakLocation[] { new RiakPBLocation("host1", 8091), 85 | new RiakPBLocation("host2", 8091), 86 | new RiakPBLocation("host3", 8091), 87 | new RiakPBLocation("host4", 8091) }; 88 | 89 | List splits = RiakInputFormat.getSplits(bks, locations, 999); 90 | 91 | assertEquals("Expected 101 splits", 101, splits.size()); 92 | 93 | int _999SplitCnt = 0; 94 | int _101SplitCnt = 0; 95 | int otherSplitCnt = 0; 96 | 97 | for (InputSplit is : splits) { 98 | long length = is.getLength(); 99 | 100 | if (length == 999) { 101 | _999SplitCnt++; 102 | } else if (length == 101) { 103 | _101SplitCnt++; 104 | } else { 105 | otherSplitCnt++; 106 | } 107 | } 108 | 109 | assertEquals("Should be 100 splits of 999 keys", 100, _999SplitCnt); 110 | assertEquals("Should be 1 split of 101 keys", 1, _101SplitCnt); 111 | assertEquals("Should be 0 splits of with neither 999 or 101 keys", 0, otherSplitCnt); 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/test/java/com/basho/riak/hadoop/config/RiakConfigTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | package com.basho.riak.hadoop.config; 15 | 16 | import static org.junit.Assert.assertEquals; 17 | import static org.junit.Assert.assertTrue; 18 | 19 | import java.util.Arrays; 20 | 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.junit.Test; 23 | 24 | import com.basho.riak.client.query.indexes.BinIndex; 25 | import com.basho.riak.client.raw.query.indexes.BinRangeQuery; 26 | import com.basho.riak.hadoop.config.RiakConfig; 27 | import com.basho.riak.hadoop.config.RiakHTTPLocation; 28 | import com.basho.riak.hadoop.config.RiakLocation; 29 | import com.basho.riak.hadoop.config.RiakPBLocation; 30 | import com.basho.riak.hadoop.keylisters.BucketKeyLister; 31 | import com.basho.riak.hadoop.keylisters.KeyLister; 32 | import com.basho.riak.hadoop.keylisters.KeysKeyLister; 33 | import com.basho.riak.hadoop.keylisters.RiakSearchKeyLister; 34 | import com.basho.riak.hadoop.keylisters.SecondaryIndexesKeyLister; 35 | 36 | /** 37 | * @author russell 38 | * 39 | */ 40 | public class RiakConfigTest { 41 | 42 | private static final String BUCKET = "bucket"; 43 | 44 | /** 45 | * Test method for 46 | * {@link com.basho.riak.hadoop.config.RiakConfig#addLocation(org.apache.hadoop.conf.Configuration, com.basho.riak.hadoop.config.RiakLocation)} 47 | * . 48 | */ 49 | @Test public void testAddRiakLocations() { 50 | final String host = "127.0.0.1"; 51 | final int port = 8097; 52 | Configuration conf = new Configuration(); 53 | conf = RiakConfig.addLocation(conf, new RiakPBLocation(host, port)); 54 | conf = RiakConfig.addLocation(conf, new RiakHTTPLocation(host, port, "riak")); 55 | 56 | assertEquals("127.0.0.1:8097,http://127.0.0.1:8097/riak", conf.get(RiakConfig.LOCATIONS_PROPERTY)); 57 | } 58 | 59 | /** 60 | * Test method for 61 | * {@link com.basho.riak.hadoop.config.RiakConfig#getRiakLocatons(org.apache.hadoop.conf.Configuration)} 62 | * . 
63 | */ 64 | @Test public void testGetRiakLocatons() { 65 | Configuration conf = new Configuration(); 66 | conf.set(RiakConfig.LOCATIONS_PROPERTY, "127.0.0.1:8097,http://127.0.0.1:8097/riak"); 67 | 68 | RiakLocation[] locations = RiakConfig.getRiakLocatons(conf); 69 | 70 | assertEquals(2, locations.length); 71 | assertTrue(locations[0] instanceof RiakPBLocation); 72 | assertTrue(locations[1] instanceof RiakHTTPLocation); 73 | assertEquals("127.0.0.1:8097", locations[0].asString()); 74 | assertEquals("http://127.0.0.1:8097/riak", locations[1].asString()); 75 | } 76 | 77 | @Test public void setAndGetKeyLister() throws Exception { 78 | Configuration conf = new Configuration(); 79 | 80 | BucketKeyLister bkl = new BucketKeyLister(BUCKET); 81 | conf = RiakConfig.setKeyLister(conf, bkl); 82 | KeyLister actual = RiakConfig.getKeyLister(conf); 83 | assertEquals(bkl, actual); 84 | 85 | KeysKeyLister kkl = new KeysKeyLister(Arrays.asList("k1", "k2", "k3", "k4"), BUCKET); 86 | conf = RiakConfig.setKeyLister(conf, kkl); 87 | actual = RiakConfig.getKeyLister(conf); 88 | assertEquals(kkl, actual); 89 | 90 | RiakSearchKeyLister rskl = new RiakSearchKeyLister(BUCKET, "foo:zero"); 91 | conf = RiakConfig.setKeyLister(conf, rskl); 92 | actual = RiakConfig.getKeyLister(conf); 93 | assertEquals(rskl, actual); 94 | 95 | SecondaryIndexesKeyLister sikl = new SecondaryIndexesKeyLister(new BinRangeQuery(BinIndex.named("twitter"), 96 | BUCKET, "from", "to")); 97 | conf = RiakConfig.setKeyLister(conf, sikl); 98 | actual = RiakConfig.getKeyLister(conf); 99 | assertEquals(sikl, actual); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/config/RiakLocation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | package com.basho.riak.hadoop.config; 15 | 16 | import java.net.URI; 17 | 18 | /** 19 | * Models a Riak API end point location 20 | * 21 | * @author russell 22 | * 23 | */ 24 | public abstract class RiakLocation { 25 | 26 | private final RiakTransport transport; 27 | private final String host; 28 | private final int port; 29 | 30 | /** 31 | * Create a location 32 | * 33 | * @param transport 34 | * the {@link RiakTransport} for this location 35 | * @param host 36 | * the host 37 | * @param port 38 | * the port 39 | */ 40 | protected RiakLocation(RiakTransport transport, String host, int port) { 41 | this.transport = transport; 42 | this.host = host; 43 | this.port = port; 44 | } 45 | 46 | /** 47 | * @return the transport 48 | */ 49 | public RiakTransport getTransport() { 50 | return transport; 51 | } 52 | 53 | /** 54 | * @return the host 55 | */ 56 | public String getHost() { 57 | return host; 58 | } 59 | 60 | /** 61 | * @return the port 62 | */ 63 | public int getPort() { 64 | return port; 65 | } 66 | 67 | /** 68 | * Serialize this location to a String 69 | * 70 | * @return a string representation that can be used by fromString(String) 71 | */ 72 | public abstract String asString(); 73 | 74 | /** 75 | * De-serialize the location from a String 76 | * 77 | * @param location 78 | * a String representation from asString() 79 | * @return a {@link RiakLocation} 80 | */ 81 | public static RiakLocation fromString(String location) { 82 | RiakLocation result = null; 83 | if (location.contains("/")) { 84 | result = parseHttpLocation(location); 85 | } else { 86 | String[] pbLoc = location.split(":"); 87 | if (pbLoc.length != 2) { 88 | throw new IllegalArgumentException("Invalid locaton " + location); 89 | } 90 | result = new RiakPBLocation(pbLoc[0], Integer.parseInt(pbLoc[1])); 91 | } 92 | return result; 93 | } 94 | 95 | /** 96 | * @param location 97 | * @return 98 | */ 99 | private static RiakLocation parseHttpLocation(String location) { 100 | final URI uri = URI.create(location); 101 | return new RiakHTTPLocation(uri.getHost(), uri.getPort(), uri.getPath()); 102 | } 103 | 104 | /* 105 | * (non-Javadoc) 106 | * 107 | * @see java.lang.Object#hashCode() 108 | */ 109 | @Override public int hashCode() { 110 | final int prime = 31; 111 | int result = 1; 112 | result = prime * result + ((host == null) ? 0 : host.hashCode()); 113 | result = prime * result + port; 114 | result = prime * result + ((transport == null) ? 
0 : transport.hashCode()); 115 | return result; 116 | } 117 | 118 | /* 119 | * (non-Javadoc) 120 | * 121 | * @see java.lang.Object#equals(java.lang.Object) 122 | */ 123 | @Override public boolean equals(Object obj) { 124 | if (this == obj) { 125 | return true; 126 | } 127 | if (obj == null) { 128 | return false; 129 | } 130 | if (!(obj instanceof RiakLocation)) { 131 | return false; 132 | } 133 | RiakLocation other = (RiakLocation) obj; 134 | if (host == null) { 135 | if (other.host != null) { 136 | return false; 137 | } 138 | } else if (!host.equals(other.host)) { 139 | return false; 140 | } 141 | if (port != other.port) { 142 | return false; 143 | } 144 | if (transport != other.transport) { 145 | return false; 146 | } 147 | return true; 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/keylisters/RiakSearchKeyLister.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop.keylisters; 15 | 16 | import java.io.IOException; 17 | import java.util.Collection; 18 | 19 | import org.codehaus.jackson.map.ObjectMapper; 20 | 21 | import com.basho.riak.client.IRiakClient; 22 | import com.basho.riak.client.RiakException; 23 | import com.basho.riak.client.query.MapReduceResult; 24 | import com.basho.riak.client.query.functions.Args; 25 | import com.basho.riak.client.query.functions.NamedErlangFunction; 26 | import com.basho.riak.hadoop.BucketKey; 27 | 28 | /** 29 | * Uses a Riak Search M/R query to produce a list of {@link BucketKey}s for a 30 | * hadoop M/R job 31 | * 32 | * @author russell 33 | * 34 | */ 35 | public class RiakSearchKeyLister implements KeyLister { 36 | 37 | private static final ObjectMapper OM = new ObjectMapper(); 38 | 39 | private String bucket; 40 | private String searchQuery; 41 | 42 | /** 43 | * Create a key lister that will execute searchQuery for 44 | * bucket to get a list of {@link BucketKey}s 45 | * 46 | * @param bucket 47 | * @param searchQuery 48 | */ 49 | public RiakSearchKeyLister(String bucket, String searchQuery) { 50 | this.bucket = bucket; 51 | this.searchQuery = searchQuery; 52 | } 53 | 54 | public RiakSearchKeyLister() {} 55 | 56 | /* 57 | * (non-Javadoc) 58 | * 59 | * @see com.basho.riak.hadoop.KeyLister#getInitString() 60 | */ 61 | public String getInitString() throws IOException { 62 | return OM.writeValueAsString(new String[] { bucket, searchQuery }); 63 | } 64 | 65 | /* 66 | * (non-Javadoc) 67 | * 68 | * @see com.basho.riak.hadoop.KeyLister#init(java.lang.String) 69 | */ 70 | public void init(String initString) throws IOException { 71 | String[] bq = OM.readValue(initString, String[].class); 72 | bucket = bq[0]; 73 | searchQuery = bq[1]; 74 | } 75 | 76 | /* 77 | * (non-Javadoc) 78 | * 79 | * @see 80 | * com.basho.riak.hadoop.KeyLister#getKeys(com.basho.riak.client.IRiakClient 81 | * ) 82 | */ 83 
| public Collection getKeys(IRiakClient client) throws RiakException { 84 | if (bucket == null || searchQuery == null) { 85 | throw new IllegalStateException("bucket and query cannot be null"); 86 | } 87 | 88 | MapReduceResult result = client.mapReduce(bucket, searchQuery).addReducePhase(NamedErlangFunction.REDUCE_IDENTITY, 89 | Args.REDUCE_PHASE_ONLY_1).execute(); 90 | 91 | return result.getResult(BucketKey.class); 92 | } 93 | 94 | /* 95 | * (non-Javadoc) 96 | * 97 | * @see java.lang.Object#hashCode() 98 | */ 99 | @Override public int hashCode() { 100 | final int prime = 31; 101 | int result = 1; 102 | result = prime * result + ((bucket == null) ? 0 : bucket.hashCode()); 103 | result = prime * result + ((searchQuery == null) ? 0 : searchQuery.hashCode()); 104 | return result; 105 | } 106 | 107 | /* 108 | * (non-Javadoc) 109 | * 110 | * @see java.lang.Object#equals(java.lang.Object) 111 | */ 112 | @Override public boolean equals(Object obj) { 113 | if (this == obj) { 114 | return true; 115 | } 116 | if (obj == null) { 117 | return false; 118 | } 119 | if (!(obj instanceof RiakSearchKeyLister)) { 120 | return false; 121 | } 122 | RiakSearchKeyLister other = (RiakSearchKeyLister) obj; 123 | if (bucket == null) { 124 | if (other.bucket != null) { 125 | return false; 126 | } 127 | } else if (!bucket.equals(other.bucket)) { 128 | return false; 129 | } 130 | if (searchQuery == null) { 131 | if (other.searchQuery != null) { 132 | return false; 133 | } 134 | } else if (!searchQuery.equals(other.searchQuery)) { 135 | return false; 136 | } 137 | return true; 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/RiakInputSplit.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | package com.basho.riak.hadoop; 15 | 16 | import java.io.DataInput; 17 | import java.io.DataOutput; 18 | import java.io.IOException; 19 | import java.util.Arrays; 20 | import java.util.Collection; 21 | import java.util.List; 22 | 23 | import org.apache.hadoop.io.Writable; 24 | import org.apache.hadoop.mapreduce.InputSplit; 25 | 26 | import com.basho.riak.hadoop.config.RiakLocation; 27 | 28 | /** 29 | * Riak specific extension of {@link InputSplit} 30 | * 31 | * @author russell 32 | * 33 | */ 34 | public class RiakInputSplit extends InputSplit implements Writable { 35 | 36 | private BucketKey[] inputs; 37 | private RiakLocation location; 38 | 39 | public RiakInputSplit() {}; 40 | 41 | public RiakInputSplit(List split, RiakLocation location) { 42 | this.inputs = split.toArray(new BucketKey[split.size()]); 43 | this.location = location; 44 | } 45 | 46 | /** 47 | * @return the location for the split (this is where the record reader for 48 | * this split will load data from) 49 | */ 50 | public synchronized RiakLocation getLocation() { 51 | return location; 52 | } 53 | 54 | /** 55 | * @return the inputs the collection of keys whose data will be fetched by 56 | * the record reader 57 | */ 58 | public synchronized Collection getInputs() { 59 | return Arrays.asList(inputs.clone()); 60 | } 61 | 62 | /* 63 | * (non-Javadoc) 64 | * 65 | * @see org.apache.hadoop.mapreduce.InputSplit#getLength() 66 | */ 67 | @Override public long getLength() throws IOException, InterruptedException { 68 | return inputs.length; 69 | } 70 | 71 | /* 72 | * (non-Javadoc) 73 | * 74 | * @see org.apache.hadoop.mapreduce.InputSplit#getLocations() 75 | */ 76 | @Override public String[] getLocations() throws IOException, InterruptedException { 77 | return new String[] { location.asString() }; 78 | } 79 | 80 | /* 81 | * (non-Javadoc) 82 | * 83 | * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) 84 | */ 85 | public void readFields(DataInput din) throws IOException { 86 | location = RiakLocation.fromString(din.readUTF()); 87 | inputs = new BucketKey[din.readInt()]; 88 | 89 | for (int i = 0; i < inputs.length; i++) { 90 | inputs[i] = new BucketKey(din.readUTF(), din.readUTF()); 91 | } 92 | } 93 | 94 | /* 95 | * (non-Javadoc) 96 | * 97 | * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) 98 | */ 99 | public void write(DataOutput dout) throws IOException { 100 | dout.writeUTF(location.asString()); 101 | dout.writeInt(inputs.length); 102 | 103 | for (BucketKey bk : inputs) { 104 | dout.writeUTF(bk.getBucket()); 105 | dout.writeUTF(bk.getKey()); 106 | } 107 | } 108 | 109 | /* 110 | * (non-Javadoc) 111 | * 112 | * @see java.lang.Object#hashCode() 113 | */ 114 | @Override public int hashCode() { 115 | final int prime = 31; 116 | int result = 1; 117 | result = prime * result + Arrays.hashCode(inputs); 118 | result = prime * result + ((location == null) ? 
0 : location.hashCode()); 119 | return result; 120 | } 121 | 122 | /* 123 | * (non-Javadoc) 124 | * 125 | * @see java.lang.Object#equals(java.lang.Object) 126 | */ 127 | @Override public boolean equals(Object obj) { 128 | if (this == obj) { 129 | return true; 130 | } 131 | if (obj == null) { 132 | return false; 133 | } 134 | if (!(obj instanceof RiakInputSplit)) { 135 | return false; 136 | } 137 | RiakInputSplit other = (RiakInputSplit) obj; 138 | if (!Arrays.equals(inputs, other.inputs)) { 139 | return false; 140 | } 141 | if (location == null) { 142 | if (other.location != null) { 143 | return false; 144 | } 145 | } else if (!location.equals(other.location)) { 146 | return false; 147 | } 148 | return true; 149 | } 150 | 151 | } 152 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/keylisters/KeysKeyLister.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop.keylisters; 15 | 16 | import java.util.Collection; 17 | import java.util.HashSet; 18 | import java.util.Set; 19 | 20 | import com.basho.riak.client.IRiakClient; 21 | import com.basho.riak.client.RiakException; 22 | import com.basho.riak.hadoop.BucketKey; 23 | 24 | /** 25 | * Key lister that simply returns the list of keys it is configured with. 26 | * 27 | * If you get your key list from outside Riak, or for testing a subset of data. 
28 | * 29 | * @author russell 30 | * 31 | */ 32 | public class KeysKeyLister implements KeyLister { 33 | 34 | private static final String BK_SEPARATOR = ":"; 35 | private static final String ENTRY_SEPARATOR = ","; 36 | 37 | private Set keys = null; 38 | 39 | /** 40 | * Provide the keys directly (don't look up in Riak) 41 | * 42 | * @param keys 43 | * the keys to M/R over 44 | */ 45 | public KeysKeyLister(Collection keys) { 46 | this.keys = new HashSet(keys); 47 | } 48 | 49 | /** 50 | * Provide the keys directly (don't look up in Riak) 51 | * 52 | * @param keys 53 | * the keys to M/R over 54 | * @param bucket 55 | * a common bucket the keys share 56 | */ 57 | public KeysKeyLister(Collection keys, String bucket) { 58 | this.keys = new HashSet(); 59 | for (String k : keys) { 60 | this.keys.add(new BucketKey(bucket, k)); 61 | } 62 | } 63 | 64 | public KeysKeyLister() {}; 65 | 66 | /* 67 | * (non-Javadoc) 68 | * 69 | * @see com.basho.riak.hadoop.KeyLister#getInitString() 70 | */ 71 | public String getInitString() { 72 | StringBuilder sb = new StringBuilder(); 73 | String sep = ""; 74 | for (BucketKey bk : keys) { 75 | sb.append(sep).append(bk.getBucket()).append(BK_SEPARATOR).append(bk.getKey()); 76 | sep = ENTRY_SEPARATOR; 77 | } 78 | 79 | return sb.toString(); 80 | } 81 | 82 | /* 83 | * (non-Javadoc) 84 | * 85 | * @see com.basho.riak.hadoop.KeyLister#init(java.lang.String) 86 | */ 87 | public void init(String initString) { 88 | if (initString == null) { 89 | throw new IllegalArgumentException("initString cannot be null"); 90 | } 91 | this.keys = new HashSet(); 92 | String[] bks = initString.split(ENTRY_SEPARATOR); 93 | 94 | for (String bk : bks) { 95 | String[] bucketKey = bk.split(BK_SEPARATOR); 96 | keys.add(new BucketKey(bucketKey[0], bucketKey[1])); 97 | } 98 | } 99 | 100 | /* 101 | * (non-Javadoc) 102 | * 103 | * @see 104 | * com.basho.riak.hadoop.KeyLister#getKeys(com.basho.riak.client.IRiakClient 105 | * ) 106 | */ 107 | public Collection getKeys(IRiakClient client) throws RiakException { 108 | if (keys == null) { 109 | throw new IllegalStateException("lister not initialised"); 110 | } 111 | return new HashSet(keys); 112 | } 113 | 114 | /* 115 | * (non-Javadoc) 116 | * 117 | * @see java.lang.Object#hashCode() 118 | */ 119 | @Override public int hashCode() { 120 | final int prime = 31; 121 | int result = 1; 122 | result = prime * result + ((keys == null) ? 0 : keys.hashCode()); 123 | return result; 124 | } 125 | 126 | /* 127 | * (non-Javadoc) 128 | * 129 | * @see java.lang.Object#equals(java.lang.Object) 130 | */ 131 | @Override public boolean equals(Object obj) { 132 | if (this == obj) { 133 | return true; 134 | } 135 | if (obj == null) { 136 | return false; 137 | } 138 | if (!(obj instanceof KeysKeyLister)) { 139 | return false; 140 | } 141 | KeysKeyLister other = (KeysKeyLister) obj; 142 | if (keys == null) { 143 | if (other.keys != null) { 144 | return false; 145 | } 146 | } else if (!keys.equals(other.keys)) { 147 | return false; 148 | } 149 | return true; 150 | } 151 | 152 | } 153 | -------------------------------------------------------------------------------- /src/test/java/com/basho/riak/hadoop/keylisters/BucketKeyListerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 
4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop.keylisters; 15 | 16 | import static org.junit.Assert.assertEquals; 17 | import static org.junit.Assert.assertTrue; 18 | import static org.junit.Assert.fail; 19 | import static org.mockito.Mockito.when; 20 | 21 | import java.util.ArrayList; 22 | import java.util.Arrays; 23 | import java.util.Collection; 24 | import java.util.List; 25 | 26 | import org.junit.Before; 27 | import org.junit.Test; 28 | import org.mockito.Mock; 29 | import org.mockito.MockitoAnnotations; 30 | 31 | import com.basho.riak.client.IRiakClient; 32 | import com.basho.riak.client.RiakException; 33 | import com.basho.riak.client.bucket.Bucket; 34 | import com.basho.riak.client.bucket.FetchBucket; 35 | import com.basho.riak.hadoop.BucketKey; 36 | import com.basho.riak.hadoop.keylisters.BucketKeyLister; 37 | 38 | /** 39 | * @author russell 40 | * 41 | */ 42 | public class BucketKeyListerTest { 43 | 44 | private static final String BUCKET_NAME = "bucket"; 45 | 46 | @Mock private IRiakClient riakClient; 47 | @Mock private Bucket bucket; 48 | @Mock private FetchBucket fetchBucket; 49 | 50 | private BucketKeyLister lister; 51 | 52 | /** 53 | * Create {@link BucketKeyLister}, mocks, wire together, stub mocks 54 | */ 55 | @Before public void setUp() throws Exception { 56 | MockitoAnnotations.initMocks(this); 57 | // stub default calls to IRiakClient and FetchBucket 58 | when(riakClient.fetchBucket(BUCKET_NAME)).thenReturn(fetchBucket); 59 | when(fetchBucket.execute()).thenReturn(bucket); 60 | } 61 | 62 | /** 63 | * Test method for 64 | * {@link com.basho.riak.hadoop.keylisters.BucketKeyLister#BucketKeyLister()}. 65 | */ 66 | @Test public void illegalState() throws Exception { 67 | lister = new BucketKeyLister(); 68 | try { 69 | testLister(lister); 70 | fail("expected IllegalStateException"); 71 | } catch (IllegalStateException e) { 72 | // NO-OP 73 | } 74 | } 75 | 76 | /** 77 | * Test method for 78 | * {@link com.basho.riak.hadoop.keylisters.BucketKeyLister#BucketKeyLister(java.lang.String)} 79 | * . 80 | */ 81 | @Test public void createWithBucket() throws Exception { 82 | lister = new BucketKeyLister(BUCKET_NAME); 83 | testLister(lister); 84 | } 85 | 86 | /** 87 | * Test method for 88 | * {@link com.basho.riak.hadoop.keylisters.BucketKeyLister#init(java.lang.String)}. 89 | */ 90 | @Test public void initWithBucket() throws Exception { 91 | lister = new BucketKeyLister(); 92 | lister.init(BUCKET_NAME); 93 | testLister(lister); 94 | } 95 | 96 | /** 97 | * Test method for 98 | * {@link com.basho.riak.hadoop.keylisters.BucketKeyLister#getInitString()}. 
99 | */ 100 | @Test public void testGetInitString() throws Exception { 101 | String initString = new BucketKeyLister(BUCKET_NAME).getInitString(); 102 | assertEquals(BUCKET_NAME, initString); 103 | testLister(new BucketKeyLister(initString)); 104 | } 105 | 106 | @Test public void exceptionsBubbleUp() throws Exception { 107 | final RiakException re = new RiakException(); 108 | lister = new BucketKeyLister(BUCKET_NAME); 109 | 110 | when(bucket.keys()).thenThrow(re); 111 | 112 | try { 113 | lister.getKeys(riakClient); 114 | fail("Expected RiakException"); 115 | } catch (RiakException e) { 116 | assertEquals(e, re); 117 | } 118 | } 119 | 120 | @Test public void zeroKeys() throws Exception { 121 | lister = new BucketKeyLister(BUCKET_NAME); 122 | testLister(lister, new ArrayList()); 123 | } 124 | 125 | private void testLister(BucketKeyLister lister) throws Exception { 126 | testLister(lister, Arrays.asList("k1", "k2", "k3", "k4")); 127 | } 128 | 129 | private void testLister(BucketKeyLister lister, List expectedKeys) throws Exception { 130 | when(bucket.keys()).thenReturn(expectedKeys); 131 | Collection keys = lister.getKeys(riakClient); 132 | assertEquals("Expected keys to be same length as stubbed mock value", expectedKeys.size(), keys.size()); 133 | 134 | for (String k : expectedKeys) { 135 | assertTrue("Expected keys to contain " + k, keys.contains(new BucketKey(BUCKET_NAME, k))); 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/config/ClientFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | package com.basho.riak.hadoop.config; 15 | 16 | import java.io.IOException; 17 | 18 | import com.basho.riak.client.IRiakClient; 19 | import com.basho.riak.client.RiakException; 20 | import com.basho.riak.client.RiakFactory; 21 | import com.basho.riak.client.raw.RawClient; 22 | import com.basho.riak.client.raw.config.Configuration; 23 | import com.basho.riak.client.raw.http.HTTPClientAdapter; 24 | import com.basho.riak.client.raw.http.HTTPClientConfig; 25 | import com.basho.riak.client.raw.http.HTTPClusterConfig; 26 | import com.basho.riak.client.raw.pbc.PBClientAdapter; 27 | import com.basho.riak.client.raw.pbc.PBClientConfig; 28 | import com.basho.riak.client.raw.pbc.PBClusterConfig; 29 | 30 | /** 31 | * Used for generating clients for input/output 32 | * 33 | * Replace with existing RJC factory when {@link RiakLocation}s is swapped for 34 | * {@link Configuration} 35 | * 36 | * @author russell 37 | * 38 | */ 39 | public final class ClientFactory { 40 | 41 | private ClientFactory() {} 42 | 43 | public static IRiakClient getClient(RiakLocation location) throws RiakException { 44 | // TODO this should use getRawClient, but DefaultRiakClient's 45 | // constructor is wrong visibility 46 | // Either change the visibility or add a method to the factory to accept 47 | // a delegate (the latter!) 48 | IRiakClient client = null; 49 | switch (location.getTransport()) { 50 | case PB: 51 | client = RiakFactory.pbcClient(location.getHost(), location.getPort()); 52 | break; 53 | case HTTP: 54 | client = RiakFactory.httpClient(location.asString()); 55 | break; 56 | default: 57 | throw new RiakException("Unknown Transport"); 58 | } 59 | return client; 60 | } 61 | 62 | public static RawClient getRawClient(RiakLocation location) throws IOException { 63 | RawClient client = null; 64 | switch (location.getTransport()) { 65 | case PB: 66 | client = new PBClientAdapter(location.getHost(), location.getPort()); 67 | break; 68 | case HTTP: 69 | client = new HTTPClientAdapter(location.asString()); 70 | break; 71 | default: 72 | throw new IOException("Unknown Transport"); 73 | } 74 | return client; 75 | } 76 | 77 | /** 78 | * Generate a cluster client from an array of {@link RiakLocation}s 79 | * 80 | * @param riakLocatons 81 | * @return 82 | * @throws IllegalArgumentException 83 | * if locations are not all of same {@link RiakTransport} 84 | */ 85 | public static IRiakClient clusterClient(RiakLocation[] riakLocatons) throws RiakException { 86 | IRiakClient client = null; 87 | RiakTransport transport = null; 88 | 89 | if (riakLocatons != null && riakLocatons.length > 0) { 90 | transport = riakLocatons[0].getTransport(); 91 | } 92 | 93 | if (RiakTransport.PB.equals(transport)) { 94 | client = pbClusterClient(riakLocatons); 95 | } else if (RiakTransport.HTTP.equals(transport)) { 96 | client = httpClusterClient(riakLocatons); 97 | } 98 | 99 | return client; 100 | } 101 | 102 | /** 103 | * @param riakLocatons 104 | * @return a cluster client of HTTP clients 105 | */ 106 | private static IRiakClient httpClusterClient(RiakLocation[] riakLocatons) throws RiakException { 107 | HTTPClusterConfig conf = new HTTPClusterConfig(500); // TODO make this config 108 | 109 | for (RiakLocation loc : riakLocatons) { 110 | if(!RiakTransport.HTTP.equals(loc.getTransport())) { 111 | throw new IllegalArgumentException("Cluster clients must be homogenous"); 112 | } 113 | 114 | RiakHTTPLocation httpLoc = (RiakHTTPLocation)loc; 115 | conf.addClient(new HTTPClientConfig.Builder() 116 | .withHost(httpLoc.getHost()) 117 | 
.withPort(httpLoc.getPort()) 118 | .withRiakPath(httpLoc.getRiakPath()) 119 | .build()); 120 | } 121 | return RiakFactory.newClient(conf); 122 | } 123 | 124 | /** 125 | * @param riakLocatons 126 | * @return a cluster client of PB clients 127 | */ 128 | private static IRiakClient pbClusterClient(RiakLocation[] riakLocatons) throws RiakException { 129 | PBClusterConfig conf = new PBClusterConfig(500); // TODO make this config 130 | 131 | for (RiakLocation loc : riakLocatons) { 132 | if(!RiakTransport.PB.equals(loc.getTransport())) { 133 | throw new IllegalArgumentException("Cluster clients must be homogenous"); 134 | } 135 | conf.addClient(new PBClientConfig.Builder() 136 | .withHost(loc.getHost()) 137 | .withPort(loc.getPort()) 138 | .build()); 139 | } 140 | return RiakFactory.newClient(conf); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/test/java/com/basho/riak/hadoop/keylisters/SecondaryIndexesKeyListerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.basho.riak.hadoop.keylisters; 15 | 16 | import static org.junit.Assert.assertEquals; 17 | import static org.junit.Assert.fail; 18 | import static org.mockito.Mockito.when; 19 | 20 | import java.util.Arrays; 21 | import java.util.Collection; 22 | 23 | import org.junit.Before; 24 | import org.junit.Test; 25 | import org.mockito.Mock; 26 | import org.mockito.MockitoAnnotations; 27 | 28 | import com.basho.riak.client.IRiakClient; 29 | import com.basho.riak.client.query.IndexMapReduce; 30 | import com.basho.riak.client.query.MapReduceResult; 31 | import com.basho.riak.client.query.functions.Args; 32 | import com.basho.riak.client.query.functions.NamedErlangFunction; 33 | import com.basho.riak.client.query.indexes.BinIndex; 34 | import com.basho.riak.client.query.indexes.IntIndex; 35 | import com.basho.riak.client.raw.query.indexes.BinRangeQuery; 36 | import com.basho.riak.client.raw.query.indexes.BinValueQuery; 37 | import com.basho.riak.client.raw.query.indexes.IndexQuery; 38 | import com.basho.riak.client.raw.query.indexes.IntRangeQuery; 39 | import com.basho.riak.client.raw.query.indexes.IntValueQuery; 40 | import com.basho.riak.hadoop.BucketKey; 41 | import com.basho.riak.hadoop.keylisters.SecondaryIndexesKeyLister; 42 | 43 | /** 44 | * @author russell 45 | * 46 | */ 47 | public class SecondaryIndexesKeyListerTest { 48 | 49 | private static final String INDEX = "index"; 50 | private static final String BUCKET = "bucket"; 51 | private static final String VALUE = "value"; 52 | private static final String FROM = "from"; 53 | private static final String TO = "to"; 54 | 55 | @Mock private IRiakClient riakClient; 56 | @Mock private IndexMapReduce indexMapReduce; 57 | @Mock private MapReduceResult result; 58 | 59 | private SecondaryIndexesKeyLister lister; 60 | 61 | /** 62 | * @throws java.lang.Exception 63 | */ 64 | 
@Before public void setUp() throws Exception { 65 | MockitoAnnotations.initMocks(this); 66 | } 67 | 68 | /** 69 | * Test method for 70 | * {@link com.basho.riak.hadoop.keylisters.SecondaryIndexesKeyLister#SecondaryIndexesKeyLister(com.basho.riak.client.raw.query.indexes.IndexQuery)} 71 | * . 72 | */ 73 | @Test public void constructWithQuery() throws Exception { 74 | 75 | IndexQuery query = new BinRangeQuery(BinIndex.named(INDEX), BUCKET, FROM, TO); 76 | lister = new SecondaryIndexesKeyLister(query); 77 | 78 | testLister(lister, query); 79 | } 80 | 81 | /** 82 | * Test method for 83 | * {@link com.basho.riak.hadoop.keylisters.SecondaryIndexesKeyLister#SecondaryIndexesKeyLister()} 84 | * . 85 | */ 86 | @Test public void illegalState() throws Exception { 87 | lister = new SecondaryIndexesKeyLister(); 88 | 89 | try { 90 | lister.getKeys(riakClient); 91 | fail("Expected IllegalStateException"); 92 | } catch (IllegalStateException e) { 93 | // NO-OP 94 | } 95 | } 96 | 97 | /** 98 | * Test method for 99 | * {@link com.basho.riak.hadoop.keylisters.SecondaryIndexesKeyLister#getInitString()}. 100 | */ 101 | @Test public void getInitString_binRange() throws Exception { 102 | IndexQuery query = new BinRangeQuery(BinIndex.named(INDEX), BUCKET, FROM, TO); 103 | lister = new SecondaryIndexesKeyLister(query); 104 | 105 | String initString = lister.getInitString(); 106 | 107 | SecondaryIndexesKeyLister listerToo = new SecondaryIndexesKeyLister(); 108 | listerToo.init(initString); 109 | 110 | testLister(listerToo, query); 111 | } 112 | 113 | @Test public void getInitString_binValue() throws Exception { 114 | IndexQuery query = new BinValueQuery(BinIndex.named(INDEX), BUCKET, VALUE); 115 | lister = new SecondaryIndexesKeyLister(query); 116 | 117 | String initString = lister.getInitString(); 118 | 119 | SecondaryIndexesKeyLister listerToo = new SecondaryIndexesKeyLister(); 120 | listerToo.init(initString); 121 | 122 | testLister(listerToo, query); 123 | } 124 | 125 | @Test public void getInitString_intnRange() throws Exception { 126 | IndexQuery query = new IntRangeQuery(IntIndex.named(INDEX), BUCKET, 1, 100); 127 | lister = new SecondaryIndexesKeyLister(query); 128 | 129 | String initString = lister.getInitString(); 130 | 131 | SecondaryIndexesKeyLister listerToo = new SecondaryIndexesKeyLister(); 132 | listerToo.init(initString); 133 | 134 | testLister(listerToo, query); 135 | } 136 | 137 | @Test public void getInitString_intValue() throws Exception { 138 | IndexQuery query = new IntValueQuery(IntIndex.named(INDEX), BUCKET, 10); 139 | lister = new SecondaryIndexesKeyLister(query); 140 | 141 | String initString = lister.getInitString(); 142 | 143 | SecondaryIndexesKeyLister listerToo = new SecondaryIndexesKeyLister(); 144 | listerToo.init(initString); 145 | 146 | testLister(listerToo, query); 147 | } 148 | 149 | private void testLister(SecondaryIndexesKeyLister lister, IndexQuery query) throws Exception { 150 | final Collection expected = Arrays.asList(new BucketKey(BUCKET, "k1"), new BucketKey(BUCKET, "k2")); 151 | when(riakClient.mapReduce(query)).thenReturn(indexMapReduce); 152 | when(indexMapReduce.addReducePhase(NamedErlangFunction.REDUCE_IDENTITY, Args.REDUCE_PHASE_ONLY_1)).thenReturn(indexMapReduce); 153 | when(indexMapReduce.execute()).thenReturn(result); 154 | when(result.getResult(BucketKey.class)).thenReturn(expected); 155 | 156 | Collection actual = lister.getKeys(riakClient); 157 | 158 | assertEquals(expected, actual); 159 | } 160 | } 161 | 
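
As an aside on how the lister exercised by the tests above would typically reach a job, here is a minimal sketch of the configuration side. It is not part of the repository; the bucket name "users", the index name "age_int" and the value range are purely illustrative, while RiakConfig.setKeyLister, SecondaryIndexesKeyLister and the 2i query classes are the ones defined elsewhere in these sources.

import org.apache.hadoop.conf.Configuration;

import com.basho.riak.client.query.indexes.IntIndex;
import com.basho.riak.client.raw.query.indexes.IndexQuery;
import com.basho.riak.client.raw.query.indexes.IntRangeQuery;
import com.basho.riak.hadoop.config.RiakConfig;
import com.basho.riak.hadoop.keylisters.SecondaryIndexesKeyLister;

public class SecondaryIndexJobConfigSketch {

    public static Configuration configure(Configuration conf) throws Exception {
        // All keys in the (hypothetical) "users" bucket whose age_int index falls in [18, 65]
        IndexQuery query = new IntRangeQuery(IntIndex.named("age_int"), "users", 18, 65);

        // setKeyLister serialises the lister's class name and init string into the
        // Configuration; RiakInputFormat re-creates the lister from that state on the cluster.
        return RiakConfig.setKeyLister(conf, new SecondaryIndexesKeyLister(query));
    }
}

Note that SecondaryIndexesKeyLister.init() only recognises index names ending in "_int" or "_bin", so the index name used when building the query needs the appropriate suffix for the init-string round trip to work.
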
-------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/RiakInputFormat.java: --------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop;
15 |
16 | import static com.basho.riak.hadoop.config.ClientFactory.getClient;
17 |
18 | import java.io.IOException;
19 | import java.util.ArrayList;
20 | import java.util.List;
21 |
22 | import org.apache.hadoop.conf.Configuration;
23 | import org.apache.hadoop.mapreduce.InputFormat;
24 | import org.apache.hadoop.mapreduce.InputSplit;
25 | import org.apache.hadoop.mapreduce.JobContext;
26 | import org.apache.hadoop.mapreduce.RecordReader;
27 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
28 |
29 | import com.basho.riak.client.IRiakClient;
30 | import com.basho.riak.client.RiakException;
31 | import com.basho.riak.client.raw.RiakResponse;
32 | import com.basho.riak.hadoop.config.NoRiakLocationsException;
33 | import com.basho.riak.hadoop.config.RiakConfig;
34 | import com.basho.riak.hadoop.config.RiakLocation;
35 | import com.basho.riak.hadoop.keylisters.KeyLister;
36 |
37 | /**
38 | * Riak specific {@link InputFormat} for Hadoop Map/Reduce
39 | *
40 | * @author russell
41 | *
42 | */
43 | public class RiakInputFormat extends InputFormat {
44 |
45 |     /**
46 |      * TODO: add this to the configuration.
47 |      */
48 |     private static final int MINIMUM_SPLIT = 10;
49 |
50 |     /* (non-Javadoc)
51 |      * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext)
52 |      */
53 |     @Override public RecordReader createRecordReader(InputSplit split,
54 |                                                      TaskAttemptContext context)
55 |             throws IOException, InterruptedException {
56 |         return new RiakRecordReader();
57 |     }
58 |
59 |     /* (non-Javadoc)
60 |      * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
61 |      */
62 |     @Override public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
63 |         Configuration conf = context.getConfiguration();
64 |         RiakLocation[] locations = RiakConfig.getRiakLocatons(conf);
65 |
66 |         if (locations.length == 0) {
67 |             throw new NoRiakLocationsException();
68 |         }
69 |
70 |         final KeyLister keyLister = RiakConfig.getKeyLister(conf);
71 |
72 |         try {
73 |             List<BucketKey> keys = getKeys(locations, keyLister, 0);
74 |             List<InputSplit> splits = getSplits(keys, locations,
75 |                                                 getSplitSize(keys.size(), RiakConfig.getHadoopClusterSize(conf, 3)));
76 |             return splits;
77 |         } catch (RiakException e) {
78 |             throw new IOException(e);
79 |         }
80 |     }
81 |
82 |     /**
83 |      * Get the list of input keys for the task. If the first location fails, try
84 |      * the next, and so on, until we have a success or definitive failure.
85 |      *
86 |      * @return the list of bucket/keys (may be empty, never null)
87 |      * @throws RiakException
88 |      */
89 |     public static List<BucketKey> getKeys(RiakLocation[] locations, KeyLister keyLister, int attemptNumber)
90 |             throws RiakException {
91 |         final List<BucketKey> keys = new ArrayList<BucketKey>();
92 |         try {
93 |             IRiakClient attemptClient = getClient(locations[attemptNumber]);
94 |             keys.addAll(keyLister.getKeys(attemptClient));
95 |         } catch (RiakException e) {
96 |             if (attemptNumber >= (locations.length - 1)) {
97 |                 throw e;
98 |             } else {
99 |                 keys.addAll(getKeys(locations, keyLister, attemptNumber + 1));
100 |             }
101 |         }
102 |         return keys;
103 |     }
104 |
105 |     /**
106 |      * Calculates the split size. Uses a *rough* heuristic based on the info
107 |      * here http://wiki.apache.org/hadoop/HowManyMapsAndReduces to generate ~10
108 |      * splits per hadoop node. Falls back to some lower number if the inputs are
109 |      * smaller, and lower still when there are fewer inputs than hadoop nodes
110 |      *
111 |      * @param numberOfKeys
112 |      *            the total input size
113 |      * @param hadoopClusterSize
114 |      *            rough number of nodes in the hadoop m/r cluster
115 |      * @return the size for each split
116 |      */
117 |     public static int getSplitSize(int numberOfKeys, int hadoopClusterSize) {
118 |         int splitSize = numberOfKeys / (hadoopClusterSize * 10);
119 |         if (splitSize < MINIMUM_SPLIT) {
120 |             // too few? then use a smaller divider
121 |             splitSize = numberOfKeys / hadoopClusterSize;
122 |             if (splitSize < MINIMUM_SPLIT) {
123 |                 // still too few? just split into splits of MINIMUM_SPLIT
124 |                 splitSize = MINIMUM_SPLIT;
125 |             }
126 |         }
127 |         return splitSize;
128 |     }
129 |
130 |     /**
131 |      * Generate the splits, each split (except maybe the last) will be
132 |      * splitSize and will have a {@link RiakLocation} assigned to
133 |      * it. The {@link RiakLocation} is chosen by modulus so it should be a
134 |      * reasonably fair distribution.
135 |      *
136 |      * @param keys
137 |      *            the list of inputs
138 |      * @param locations
139 |      *            all the riak locations
140 |      * @param splitSize
141 |      *            The target size for each split
142 |      * @return the input splits
143 |      */
144 |     public static List<InputSplit> getSplits(final List<BucketKey> keys, final RiakLocation[] locations, int splitSize) {
145 |         final List<InputSplit> splits = new ArrayList<InputSplit>();
146 |         int splitCnt = 0;
147 |         int startIndex = 0;
148 |         int numberOfKeys = keys.size();
149 |         while (startIndex < numberOfKeys) {
150 |             int endIndex = Math.min(numberOfKeys, splitSize + startIndex);
151 |             final List<BucketKey> split = keys.subList(startIndex, endIndex);
152 |             splits.add(new RiakInputSplit(split, locations[splitCnt % locations.length]));
153 |             splitCnt++;
154 |             startIndex = endIndex;
155 |         }
156 |
157 |         return splits;
158 |     }
159 | }
-------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/config/RiakConfig.java: --------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */ 14 | package com.basho.riak.hadoop.config; 15 | 16 | import java.io.IOException; 17 | import java.util.ArrayList; 18 | import java.util.List; 19 | import java.util.StringTokenizer; 20 | 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.mapreduce.InputSplit; 23 | 24 | import com.basho.riak.hadoop.keylisters.BucketKeyLister; 25 | import com.basho.riak.hadoop.keylisters.KeyLister; 26 | 27 | /** 28 | * Helper class to make dealing with the hadoop {@link Configuration} object 29 | * easier when setting up a Riak Map/Reduce job on Hadoop 30 | * 31 | * @author russell 32 | * 33 | */ 34 | public final class RiakConfig { 35 | 36 | public static final String LOCATIONS_PROPERTY = "com.basho.riak.hadoop.mr.riak.locations"; 37 | private static final String COMMA = ","; 38 | public static final String CLUSTER_SIZE_PROPERTY = "com.basho.riak.hadoop.mr.cluster.size"; 39 | private static final String KEY_LISTER_CLASS_PROPERTY = "com.basho.riak.hadoop.mr.keylister.class"; 40 | private static final String KEY_LISTER_INIT_STRING_PROPERTY = "com.basho.riak.hadoop.mr.keylister.init_string"; 41 | private static final String OUTPUT_BUCKET_PROPERTY = "com.basho.riak.hadoop.mr.output.bucket"; 42 | 43 | private RiakConfig() {} 44 | 45 | /** 46 | * Add a riak location to the {@link Configuration} passed. 47 | * 48 | * @param conf 49 | * the {@link Configuration} to add a location too 50 | * @param location 51 | * the {@link RiakLocation} to add 52 | * @return the {@link Configuration} with location added to the 53 | * location property 54 | */ 55 | public static Configuration addLocation(Configuration conf, RiakLocation location) { 56 | StringBuilder sb = new StringBuilder(); 57 | String currentLocations = conf.get(LOCATIONS_PROPERTY); 58 | 59 | if (currentLocations != null) { 60 | sb.append(currentLocations); 61 | } 62 | 63 | if (sb.length() > 0) { 64 | sb.append(COMMA); 65 | } 66 | 67 | sb.append(location.asString()); 68 | 69 | conf.set(LOCATIONS_PROPERTY, sb.toString()); 70 | return conf; 71 | } 72 | 73 | /** 74 | * Get all the riak locations from the passed {@link Configuration} 75 | * 76 | * @param conf 77 | * the {@link Configuration} 78 | * @return an array of {@link RiakLocation} (may be empty, never null) 79 | */ 80 | public static RiakLocation[] getRiakLocatons(Configuration conf) { 81 | String locations = conf.get(LOCATIONS_PROPERTY, ""); 82 | StringTokenizer st = new StringTokenizer(locations, COMMA); 83 | List result = new ArrayList(); 84 | 85 | while (st.hasMoreTokens()) { 86 | result.add(RiakLocation.fromString(st.nextToken())); 87 | } 88 | 89 | return result.toArray(new RiakLocation[result.size()]); 90 | } 91 | 92 | /** 93 | * Set the size of the hadoop cluster, this is used by the 94 | * {@link RiakInputFormat} to try and optimize the number of 95 | * {@link InputSplit}s to create 96 | * 97 | * @param conf 98 | * the {@link Configuration} to store the hadoop cluster size in 99 | * @param hadoopClusterSize 100 | * the size of the hadoop cluster 101 | * @return the {@link Configuration} updated with the passed 102 | * hadoopClusterSize 103 | */ 104 | public static Configuration setHadoopClusterSize(Configuration conf, int hadoopClusterSize) { 105 | conf.setInt(CLUSTER_SIZE_PROPERTY, hadoopClusterSize); 106 | return conf; 107 | 108 | } 109 | 110 | /** 111 | * Get the hadoop cluster size property, provide a default in case it hasn't 112 | * been set 113 | * 114 | * @param conf 115 | * the {@link Configuration} to get the property value from 116 | * @param 
defaultValue 117 | * the default size to use if it hasn't been set 118 | * @return the hadoop cluster size or defaultValue 119 | */ 120 | public static int getHadoopClusterSize(Configuration conf, int defaultValue) { 121 | return conf.getInt(CLUSTER_SIZE_PROPERTY, defaultValue); 122 | } 123 | 124 | /** 125 | * @param conf 126 | * the {@link Configuration} to query 127 | * @return the {@link KeyLister} the job was configured with 128 | * @throws RuntimeException 129 | * if a {@link IllegalAccessException} or 130 | * {@link InstantiationException} is thrown creating a 131 | * {@link KeyLister} 132 | */ 133 | public static KeyLister getKeyLister(Configuration conf) throws IOException { 134 | Class clazz = conf.getClass(KEY_LISTER_CLASS_PROPERTY, BucketKeyLister.class, 135 | KeyLister.class); 136 | try { 137 | KeyLister lister = clazz.newInstance(); 138 | lister.init(conf.get(KEY_LISTER_INIT_STRING_PROPERTY)); 139 | return lister; 140 | } catch (IllegalAccessException e) { 141 | throw new RuntimeException(e); 142 | } catch (InstantiationException e) { 143 | throw new RuntimeException(e); 144 | } 145 | } 146 | 147 | /** 148 | * Set the {@link KeyLister} implementation to use. 149 | * 150 | * @param conf 151 | * the {@link Configuration} to update 152 | * @param lister 153 | * the {@link KeyLister} to use 154 | * @return the configuration updated with a serialized version of the lister 155 | * provided 156 | */ 157 | public static Configuration setKeyLister(Configuration conf, T lister) throws IOException { 158 | conf.setClass(KEY_LISTER_CLASS_PROPERTY, lister.getClass(), KeyLister.class); 159 | conf.setStrings(KEY_LISTER_INIT_STRING_PROPERTY, lister.getInitString()); 160 | return conf; 161 | } 162 | 163 | /** 164 | * Get the configured output bucket for the job's results 165 | * 166 | * @param conf 167 | * the {@link Configuration} to query 168 | * @return the bucket name 169 | */ 170 | public static String getOutputBucket(Configuration conf) { 171 | return conf.get(OUTPUT_BUCKET_PROPERTY); 172 | } 173 | 174 | /** 175 | * Add the output bucket for the results to the config. 176 | * 177 | * @param conf 178 | * the {@link Configuration} to update 179 | * @param bucket 180 | * the bucket to add 181 | * @return the updated {@link Configuration} 182 | */ 183 | public static Configuration setOutputBucket(Configuration conf, String bucket) { 184 | conf.set(OUTPUT_BUCKET_PROPERTY, bucket); 185 | return conf; 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /src/main/java/com/basho/riak/hadoop/keylisters/SecondaryIndexesKeyLister.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is provided to you under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | package com.basho.riak.hadoop.keylisters; 15 | 16 | import java.io.ByteArrayOutputStream; 17 | import java.io.IOException; 18 | import java.util.Collection; 19 | import java.util.Map; 20 | 21 | import org.codehaus.jackson.JsonEncoding; 22 | import org.codehaus.jackson.JsonFactory; 23 | import org.codehaus.jackson.JsonGenerator; 24 | import org.codehaus.jackson.map.ObjectMapper; 25 | 26 | import com.basho.riak.client.IRiakClient; 27 | import com.basho.riak.client.RiakException; 28 | import com.basho.riak.client.query.MapReduceResult; 29 | import com.basho.riak.client.query.functions.Args; 30 | import com.basho.riak.client.query.functions.NamedErlangFunction; 31 | import com.basho.riak.client.query.indexes.BinIndex; 32 | import com.basho.riak.client.query.indexes.IntIndex; 33 | import com.basho.riak.client.raw.query.indexes.BinRangeQuery; 34 | import com.basho.riak.client.raw.query.indexes.BinValueQuery; 35 | import com.basho.riak.client.raw.query.indexes.IndexQuery; 36 | import com.basho.riak.client.raw.query.indexes.IndexWriter; 37 | import com.basho.riak.client.raw.query.indexes.IntRangeQuery; 38 | import com.basho.riak.client.raw.query.indexes.IntValueQuery; 39 | import com.basho.riak.hadoop.BucketKey; 40 | 41 | /** 42 | * Uses a 2i query to get keys for hadoop m/r. 43 | * 44 | * @author russell 45 | * 46 | */ 47 | public class SecondaryIndexesKeyLister implements KeyLister { 48 | private static final String BUCKET = "bucket"; 49 | private static final String INDEX = "index"; 50 | private static final String KEY = "key"; 51 | private static final String START = "start"; 52 | private static final String END = "end"; 53 | 54 | private IndexQuery query; 55 | 56 | /** 57 | * @param query 58 | */ 59 | public SecondaryIndexesKeyLister(IndexQuery query) { 60 | this.query = query; 61 | } 62 | 63 | public SecondaryIndexesKeyLister() {} 64 | 65 | /* 66 | * (non-Javadoc) 67 | * 68 | * @see com.basho.riak.hadoop.KeyLister#getInitString() 69 | */ 70 | public String getInitString() throws IOException { 71 | // TODO, this is the same as the code in IndexMapReduce, abstract out to 72 | // common class 73 | ByteArrayOutputStream out = new ByteArrayOutputStream(); 74 | final JsonGenerator jg = new JsonFactory().createJsonGenerator(out, JsonEncoding.UTF8); 75 | 76 | jg.writeStartObject(); 77 | 78 | IndexWriter e = new IndexWriter() { 79 | 80 | private void writeCommon(String bucket, String index) throws IOException { 81 | jg.writeStringField(BUCKET, bucket); 82 | jg.writeStringField(INDEX, index); 83 | } 84 | 85 | public void write(String bucket, String index, int from, int to) throws IOException { 86 | writeCommon(bucket, index); 87 | jg.writeNumberField(START, from); 88 | jg.writeNumberField(END, to); 89 | } 90 | 91 | public void write(String bucket, String index, int value) throws IOException { 92 | writeCommon(bucket, index); 93 | jg.writeNumberField(KEY, value); 94 | } 95 | 96 | public void write(String bucket, String index, String from, String to) throws IOException { 97 | writeCommon(bucket, index); 98 | jg.writeStringField(START, from); 99 | jg.writeStringField(END, to); 100 | } 101 | 102 | public void write(String bucket, String index, String value) throws IOException { 103 | writeCommon(bucket, index); 104 | jg.writeStringField(KEY, value); 105 | } 106 | }; 107 | 108 | query.write(e); 109 | jg.writeEndObject(); 110 | jg.flush(); 111 | jg.close(); 112 | return out.toString("UTF-8"); 113 | } 114 | 115 | /* 116 | * (non-Javadoc) 117 | * 118 | * @see 
com.basho.riak.hadoop.KeyLister#init(java.lang.String) 119 | */ 120 | public void init(String initString) throws IOException { 121 | // just like FetchIndex, again, abstract out to a common class 122 | boolean isRange = false; 123 | // turn the Json into an index query 124 | @SuppressWarnings("rawtypes") Map map = new ObjectMapper().readValue(initString, Map.class); 125 | 126 | String indexName = (String) map.get(INDEX); 127 | String bucket = (String) map.get(BUCKET); 128 | Object value = map.get(KEY); 129 | Object from = map.get(START); 130 | Object to = map.get(END); 131 | 132 | if (indexName == null) { 133 | throw new IllegalArgumentException("no index present"); 134 | } 135 | if (from != null && to != null && value == null) { 136 | isRange = true; 137 | } 138 | 139 | if (indexName != null && indexName.endsWith("_int")) { 140 | if (isRange) { 141 | query = new IntRangeQuery(IntIndex.named(indexName), bucket, (Integer) from, (Integer) to); 142 | } else { 143 | query = new IntValueQuery(IntIndex.named(indexName), bucket, (Integer) value); 144 | } 145 | } 146 | 147 | if (indexName != null && indexName.endsWith("_bin")) { 148 | if (isRange) { 149 | query = new BinRangeQuery(BinIndex.named(indexName), bucket, (String) from, (String) to); 150 | } else { 151 | query = new BinValueQuery(BinIndex.named(indexName), bucket, (String) value); 152 | } 153 | } 154 | 155 | if (query == null) { 156 | throw new IOException("unable to parse query from init string"); 157 | } 158 | } 159 | 160 | /* 161 | * (non-Javadoc) 162 | * 163 | * @see 164 | * com.basho.riak.hadoop.KeyLister#getKeys(com.basho.riak.client.IRiakClient 165 | * ) 166 | */ 167 | public Collection getKeys(IRiakClient client) throws RiakException { 168 | if (query == null) { 169 | throw new IllegalStateException("No index query"); 170 | } 171 | MapReduceResult r = client.mapReduce(query).addReducePhase(NamedErlangFunction.REDUCE_IDENTITY, 172 | Args.REDUCE_PHASE_ONLY_1).execute(); 173 | 174 | return r.getResult(BucketKey.class); 175 | } 176 | 177 | /* 178 | * (non-Javadoc) 179 | * 180 | * @see java.lang.Object#hashCode() 181 | */ 182 | @Override public int hashCode() { 183 | final int prime = 31; 184 | int result = 1; 185 | result = prime * result + ((query == null) ? 0 : query.hashCode()); 186 | return result; 187 | } 188 | 189 | /* 190 | * (non-Javadoc) 191 | * 192 | * @see java.lang.Object#equals(java.lang.Object) 193 | */ 194 | @Override public boolean equals(Object obj) { 195 | if (this == obj) { 196 | return true; 197 | } 198 | if (obj == null) { 199 | return false; 200 | } 201 | if (!(obj instanceof SecondaryIndexesKeyLister)) { 202 | return false; 203 | } 204 | SecondaryIndexesKeyLister other = (SecondaryIndexesKeyLister) obj; 205 | if (query == null) { 206 | if (other.query != null) { 207 | return false; 208 | } 209 | } else if (!query.equals(other.query)) { 210 | return false; 211 | } 212 | return true; 213 | } 214 | } 215 | --------------------------------------------------------------------------------
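
Taken together, the configuration helpers above suggest the shape of a job driver: register one or more Riak locations, choose and serialise a KeyLister, name the output bucket, and hint at the Hadoop cluster size so RiakInputFormat can pick a sensible split size. The sketch below shows only that Riak-specific wiring and is not part of the repository: the bucket names are placeholders, and the RiakLocation arguments are assumed to be pre-built instances of the concrete location classes defined elsewhere in these sources.

import org.apache.hadoop.conf.Configuration;

import com.basho.riak.hadoop.config.RiakConfig;
import com.basho.riak.hadoop.config.RiakLocation;
import com.basho.riak.hadoop.keylisters.BucketKeyLister;

public class RiakJobConfigSketch {

    /**
     * Adds the Riak-specific settings to a job Configuration.
     * The locations are assumed to be already-constructed RiakLocation instances.
     */
    public static Configuration configure(Configuration conf, RiakLocation... locations) throws Exception {
        for (RiakLocation location : locations) {
            // each call appends to the comma-separated locations property
            conf = RiakConfig.addLocation(conf, location);
        }
        // list every key in the (hypothetical) "wordcount" bucket as job input
        conf = RiakConfig.setKeyLister(conf, new BucketKeyLister("wordcount"));
        // reducer output is written back to this (hypothetical) bucket
        conf = RiakConfig.setOutputBucket(conf, "wordcount_out");
        // read by RiakInputFormat, which aims for roughly 10 splits per hadoop node
        conf = RiakConfig.setHadoopClusterSize(conf, 4);
        return conf;
    }
}

As a worked example of the split heuristic with a cluster size of 4: 10,000 input keys give 10,000 / (4 * 10) = 250 keys per split (about 40 splits); 100 keys fall below the minimum on the first divisor, so the fallback 100 / 4 = 25 keys per split is used (4 splits); and 20 keys fall below the minimum both times, so splits are capped at MINIMUM_SPLIT, giving 2 splits of 10 keys each.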