├── .gitignore
├── .travis.yml
├── src
│   ├── main
│   │   └── java
│   │       └── com
│   │           └── basho
│   │               └── riak
│   │                   └── hadoop
│   │                       ├── config
│   │                       │   ├── RiakTransport.java
│   │                       │   ├── NoRiakLocationsException.java
│   │                       │   ├── RiakPBLocation.java
│   │                       │   ├── RiakHTTPLocation.java
│   │                       │   ├── RiakLocation.java
│   │                       │   ├── ClientFactory.java
│   │                       │   └── RiakConfig.java
│   │                       ├── keylisters
│   │                       │   ├── KeyLister.java
│   │                       │   ├── BucketKeyLister.java
│   │                       │   ├── RiakSearchKeyLister.java
│   │                       │   ├── KeysKeyLister.java
│   │                       │   └── SecondaryIndexesKeyLister.java
│   │                       ├── RiakOutputCommitter.java
│   │                       ├── RiakRecordWriter.java
│   │                       ├── RiakOutputFormat.java
│   │                       ├── RiakMapper.java
│   │                       ├── RiakRecordReader.java
│   │                       ├── BucketKey.java
│   │                       ├── RiakInputSplit.java
│   │                       └── RiakInputFormat.java
│   └── test
│       └── java
│           └── com
│               └── basho
│                   └── riak
│                       └── hadoop
│                           ├── config
│                           │   ├── ClientFactoryTest.java
│                           │   └── RiakConfigTest.java
│                           ├── BucketKeyTest.java
│                           ├── keylisters
│                           │   ├── KeysKeyListerTest.java
│                           │   ├── RiakSearchKeyListerTest.java
│                           │   ├── BucketKeyListerTest.java
│                           │   └── SecondaryIndexesKeyListerTest.java
│                           └── RiakInputFormatTest.java
├── README.org
└── pom.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | .classpath
2 | .project
3 | .settings/
4 | bin/
5 | lib/
6 | target/
7 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 | notifications:
3 |   email: clients@basho.com
4 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/config/RiakTransport.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.config;
15 |
16 | /**
17 | * Simple enum of available Riak transports
18 | *
19 | * @author russell
20 | *
21 | */
22 | public enum RiakTransport {
23 | HTTP, PB;
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/config/NoRiakLocationsException.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.config;
15 |
16 | import java.io.IOException;
17 |
18 | import com.basho.riak.hadoop.RiakInputFormat;
19 |
20 | /**
21 | * Tag exception for hadoop config where no {@link RiakLocation}s have been
22 | * provided to the {@link RiakInputFormat}
23 | *
24 | * @author russell
25 | *
26 | */
27 | public class NoRiakLocationsException extends IOException {
28 |
29 | /**
30 | * Eclipse generated
31 | */
32 | private static final long serialVersionUID = -4095183778220854984L;
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/config/RiakPBLocation.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.config;
15 |
16 | /**
17 | * Models the PB interface endpoint's location
18 | *
19 | * @author russell
20 | *
21 | */
22 | public class RiakPBLocation extends RiakLocation {
23 |
24 | /**
25 | * @param host
26 | * @param port
27 | */
28 | public RiakPBLocation(String host, int port) {
29 | super(RiakTransport.PB, host, port);
30 | }
31 |
32 | /*
33 | * (non-Javadoc)
34 | *
35 | * @see com.basho.riak.hadoop.RiakLocation#asString()
36 | */
37 | @Override public String asString() {
38 | return new StringBuilder(getHost()).append(":").append(getPort()).toString();
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
1 | * Riak-Hadoop
2 |
3 | [[http://travis-ci.org/basho/riak-hadoop][Travis-CI]] :: [[https://secure.travis-ci.org/basho/riak-hadoop.png]]
4 |
5 | Riak-Hadoop is a library for using [[http://basho.com/products/riak-overview/][Riak]] as an input source and output sink for [[http://hadoop.apache.org/mapreduce/][Hadoop
6 | Map/Reduce]].
7 |
8 | *NOTE* This library is not yet officially supported by Basho and is
9 | strictly experimental.
10 |
11 | ** How it works
12 | The library extends =InputFormat=, =InputSplit=, =RecordReader=,
13 | =OutputFormat= and =RecordWriter=, so you can declare any valid Riak
14 | M/R input (2i query, riak search query, list of keys, bucket) as input
15 | to a Hadoop M/R job. The library will split the keys into partitions
16 | and Hadoop will use the =RiakRecordReader= to load Key/Value pairs
17 | from Riak for the =Mapper= tasks. The =Reducer= output is written back
18 | to a configured bucket in Riak. It uses the [[https://github.com/basho/riak-java-client/][Riak-Java-Client]] to talk
19 | to Riak. You just write a normal Hadoop Map/Reduce job, but declare
20 | =RiakInputFormat= and =RiakOutputFormat= as sources/sinks for your
21 | data/results (see the /Usage sketch/ at the end of this README).
22 |
23 | ** Example?
24 | Have a look at the
25 | [[https://github.com/russelldb/riak-hadoop-wordcount][Riak Word Count]] example project to get started with Riak-Hadoop.
26 |
27 | ** Future
28 | I plan to have the library load bulk data from Riak to HDFS to better
29 | leverage Hadoop's integration with that file system. There will be
30 | docs.
31 |
32 | ** Feedback
33 | Raise issues, pull requests, comments etc. as ever through GitHub, or
34 | [[http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com][the Riak mailing list]].
35 |
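36 | ** Usage sketch
37 | A minimal, untested sketch of wiring a job up to Riak. The
38 | =RiakConfig.setOutputBucket= helper and the =MyMapper= / =MyReducer=
39 | classes are assumed here for illustration; see =RiakConfig= and the
40 | word count project above for the real configuration API.
41 |
42 | #+BEGIN_SRC java
43 | import org.apache.hadoop.conf.Configuration;
44 | import org.apache.hadoop.mapreduce.Job;
45 |
46 | import com.basho.riak.hadoop.RiakInputFormat;
47 | import com.basho.riak.hadoop.RiakOutputFormat;
48 | import com.basho.riak.hadoop.config.RiakConfig;
49 | import com.basho.riak.hadoop.config.RiakPBLocation;
50 | import com.basho.riak.hadoop.keylisters.BucketKeyLister;
51 |
52 | public class WordCountJob {
53 |     public static void main(String[] args) throws Exception {
54 |         Configuration conf = new Configuration();
55 |         // point the job at a Riak protocol buffers endpoint
56 |         conf = RiakConfig.addLocation(conf, new RiakPBLocation("127.0.0.1", 8087));
57 |         // use every key in the "wordcount" bucket as job input
58 |         conf = RiakConfig.setKeyLister(conf, new BucketKeyLister("wordcount"));
59 |         // assumed helper naming the bucket that reducer output is written to
60 |         conf = RiakConfig.setOutputBucket(conf, "wordcount_out");
61 |
62 |         Job job = new Job(conf, "riak-wordcount");
63 |         job.setInputFormatClass(RiakInputFormat.class);   // keys/values read from Riak
64 |         job.setOutputFormatClass(RiakOutputFormat.class); // reducer output stored back in Riak
65 |         job.setMapperClass(MyMapper.class);               // your Mapper (or RiakMapper) subclass
66 |         job.setReducerClass(MyReducer.class);             // your Reducer
67 |         job.waitForCompletion(true);
68 |     }
69 | }
70 | #+END_SRC
71 |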
--------------------------------------------------------------------------------
/src/test/java/com/basho/riak/hadoop/config/ClientFactoryTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.config;
15 |
16 | import org.apache.hadoop.conf.Configuration;
17 | import org.junit.Before;
18 | import org.junit.Test;
19 |
20 | import com.basho.riak.client.RiakException;
21 |
22 | /**
23 | * @author russell
24 | *
25 | */
26 | public class ClientFactoryTest {
27 |
28 | /**
29 | * @throws java.lang.Exception
30 | */
31 | @Before public void setUp() throws Exception {}
32 |
33 | @Test(expected = IllegalArgumentException.class) public void getClusterClient_die() throws RiakException {
34 | Configuration conf = new Configuration();
35 |
36 | conf = RiakConfig.addLocation(conf, new RiakPBLocation("33.33.33.12", 8087));
37 | conf = RiakConfig.addLocation(conf, new RiakPBLocation("33.33.33.13", 8087));
38 | conf = RiakConfig.addLocation(conf, new RiakHTTPLocation("33.33.33.10", 8098, "riak"));
39 | conf = RiakConfig.addLocation(conf, new RiakHTTPLocation("33.33.33.11", 8098, "riak"));
40 |
41 | ClientFactory.clusterClient(RiakConfig.getRiakLocatons(conf));
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <modelVersion>4.0.0</modelVersion>
6 |
7 |   <groupId>com.basho.riak.hadoop</groupId>
8 |   <artifactId>riak-hadoop</artifactId>
9 |   <version>0.2-SNAPSHOT</version>
10 |   <packaging>jar</packaging>
11 |
12 |   <name>riak-hadoop</name>
13 |   <url>http://maven.apache.org</url>
14 |
15 |   <properties>
16 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
17 |   </properties>
18 |
19 |   <repositories>
20 |     <repository>
21 |       <id>com.basho.riak</id>
22 |       <url>https://oss.sonatype.org/content/repositories/combashoriak-168</url>
23 |     </repository>
24 |   </repositories>
25 |
26 |   <dependencies>
27 |     <dependency>
28 |       <groupId>com.basho.riak</groupId>
29 |       <artifactId>riak-client</artifactId>
30 |       <version>1.0.2</version>
31 |     </dependency>
32 |     <dependency>
33 |       <groupId>org.apache.hadoop</groupId>
34 |       <artifactId>hadoop-core</artifactId>
35 |       <version>0.20.203.0</version>
36 |     </dependency>
37 |     <dependency>
38 |       <groupId>org.mockito</groupId>
39 |       <artifactId>mockito-all</artifactId>
40 |       <version>1.8.0</version>
41 |       <scope>test</scope>
42 |     </dependency>
43 |     <dependency>
44 |       <groupId>junit</groupId>
45 |       <artifactId>junit</artifactId>
46 |       <version>4.4</version>
47 |       <scope>test</scope>
48 |     </dependency>
49 |   </dependencies>
50 |
51 |   <build>
52 |     <plugins>
53 |       <plugin>
54 |         <artifactId>maven-compiler-plugin</artifactId>
55 |         <configuration>
56 |           <source>1.5</source>
57 |           <target>1.5</target>
58 |         </configuration>
59 |       </plugin>
60 |     </plugins>
61 |   </build>
62 | </project>
63 |
--------------------------------------------------------------------------------
/src/test/java/com/basho/riak/hadoop/BucketKeyTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop;
15 |
16 | import static org.junit.Assert.*;
17 |
18 | import java.util.Collection;
19 |
20 | import org.codehaus.jackson.map.ObjectMapper;
21 | import org.codehaus.jackson.map.type.TypeFactory;
22 | import org.junit.Before;
23 | import org.junit.Test;
24 |
25 | /**
26 | * Tests that the object mapper can turn [["b", "k"], ["b", "k1"]] into a
27 | * Collection of {@link BucketKey}
28 | *
29 | * @author russell
30 | *
31 | */
32 | public class BucketKeyTest {
33 |
34 | /**
35 | * @throws java.lang.Exception
36 | */
37 | @Before public void setUp() throws Exception {}
38 |
39 | /**
40 | * Test method for
41 | * {@link com.basho.riak.hadoop.BucketKey#BucketKey(java.lang.String[])}.
42 | */
43 | @Test public void bucketKeyFromReduceIdentity() throws Exception {
44 | final String mrOut = "[[\"indexed\",\"qbert\"],[\"indexed\",\"bert\"]]";
45 |
46 | Collection<BucketKey> bks = new ObjectMapper().readValue(mrOut,
47 | TypeFactory.collectionType(Collection.class, BucketKey.class));
48 |
49 | assertEquals(2, bks.size());
50 |
51 | assertTrue(bks.contains(new BucketKey("indexed", "qbert")));
52 | assertTrue(bks.contains(new BucketKey("indexed", "bert")));
53 | }
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/config/RiakHTTPLocation.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.config;
15 |
16 | /**
17 | * Holder for a Riak HTTP interface location
18 | *
19 | * @author russell
20 | *
21 | */
22 | public class RiakHTTPLocation extends RiakLocation {
23 |
24 | private final String riakPath;
25 |
26 | /**
27 | * Create an HTTP location
28 | *
29 | * @param host
30 | * the host
31 | * @param port
32 | * the HTTP port
33 | * @param riakPath
34 | * the path to the 'riak' resource
35 | */
36 | public RiakHTTPLocation(String host, int port, String riakPath) {
37 | super(RiakTransport.HTTP, host, port);
38 | this.riakPath = riakPath;
39 | }
40 |
41 | /**
42 | * @return the path to the 'riak' resource
43 | */
44 | public String getRiakPath() {
45 | return riakPath;
46 | }
47 |
48 | /*
49 | * (non-Javadoc)
50 | *
51 | * @see com.basho.riak.hadoop.RiakLocation#asString()
52 | */
53 | @Override public String asString() {
54 | StringBuilder sb = new StringBuilder("http://");
55 | sb.append(getHost()).append(":").append(getPort());
56 |
57 | if (!riakPath.startsWith("/")) {
58 | sb.append("/");
59 | }
60 |
61 | sb.append(riakPath);
62 | return sb.toString();
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/keylisters/KeyLister.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.keylisters;
15 |
16 | import java.io.IOException;
17 | import java.util.Collection;
18 |
19 | import com.basho.riak.client.IRiakClient;
20 | import com.basho.riak.client.RiakException;
21 | import com.basho.riak.hadoop.BucketKey;
22 |
23 | /**
24 | * Strategy for obtaining the list of keys for splits. {@link KeyLister}s must have a
25 | * zero-arg constructor.
26 | *
27 | * @author russell
28 | *
29 | */
30 | public interface KeyLister {
31 | /**
32 | * Because of hadoop's configuration framework, a key lister has to
33 | * serialize and deserialize itself. This method and init(String) below are a
34 | * lightweight way of doing that.
35 | *
36 | * @return a String that can be used by the implementation's init method to
37 | * reconstitute the state of the lister
38 | * @throws IOException
39 | */
40 | String getInitString() throws IOException;
41 |
42 | /**
43 | * Initialize this lister from a string (from a prior call to getInitString)
44 | * so that it is set up to list keys
45 | *
46 | * @param initString
47 | * @throws IOException
48 | */
49 | void init(String initString) throws IOException;
50 |
51 | /**
52 | * Get keys with the given client
53 | *
54 | * @param client
55 | * @return a collection of {@link BucketKey}s to use as input
56 | * @throws RiakException
57 | * @throws IllegalStateException if init was not called and the
58 | * lister is not set up to get keys
59 | */
60 | Collection<BucketKey> getKeys(IRiakClient client) throws RiakException;
61 | }
62 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/RiakOutputCommitter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop;
15 |
16 | import java.io.IOException;
17 |
18 | import org.apache.hadoop.mapreduce.JobContext;
19 | import org.apache.hadoop.mapreduce.OutputCommitter;
20 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
21 |
22 | /**
23 | * A NO-OP output committer
24 | *
25 | * @author russell
26 | *
27 | */
28 | public class RiakOutputCommitter extends OutputCommitter {
29 |
30 | /* (non-Javadoc)
31 | * @see org.apache.hadoop.mapreduce.OutputCommitter#abortTask(org.apache.hadoop.mapreduce.TaskAttemptContext)
32 | */
33 | @Override public void abortTask(TaskAttemptContext tac) throws IOException {}
34 |
35 | /* (non-Javadoc)
36 | * @see org.apache.hadoop.mapreduce.OutputCommitter#commitTask(org.apache.hadoop.mapreduce.TaskAttemptContext)
37 | */
38 | @Override public void commitTask(TaskAttemptContext tac) throws IOException {}
39 |
40 | /* (non-Javadoc)
41 | * @see org.apache.hadoop.mapreduce.OutputCommitter#needsTaskCommit(org.apache.hadoop.mapreduce.TaskAttemptContext)
42 | */
43 | @Override public boolean needsTaskCommit(TaskAttemptContext tac) throws IOException {
44 | return false;
45 | }
46 |
47 | /* (non-Javadoc)
48 | * @see org.apache.hadoop.mapreduce.OutputCommitter#setupJob(org.apache.hadoop.mapreduce.JobContext)
49 | */
50 | @Override public void setupJob(JobContext jc) throws IOException {}
51 |
52 | /* (non-Javadoc)
53 | * @see org.apache.hadoop.mapreduce.OutputCommitter#setupTask(org.apache.hadoop.mapreduce.TaskAttemptContext)
54 | */
55 | @Override public void setupTask(TaskAttemptContext tac) throws IOException {}
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/RiakRecordWriter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop;
15 |
16 | import java.io.IOException;
17 |
18 | import org.apache.hadoop.conf.Configuration;
19 | import org.apache.hadoop.io.Text;
20 | import org.apache.hadoop.mapreduce.RecordWriter;
21 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
22 |
23 | import com.basho.riak.client.IRiakClient;
24 | import com.basho.riak.client.RiakException;
25 | import com.basho.riak.client.bucket.Bucket;
26 | import com.basho.riak.hadoop.config.ClientFactory;
27 | import com.basho.riak.hadoop.config.RiakConfig;
28 |
29 | /**
30 | * Writes reducer results to Riak
31 | *
32 | * @author russell
33 | * @param <V>
34 | * the type of the value written to Riak
35 | */
36 | public class RiakRecordWriter<V> extends RecordWriter<Text, V> {
37 |
38 | private final Bucket bucket;
39 |
40 | RiakRecordWriter(TaskAttemptContext tac) throws RiakException {
41 | Configuration conf = tac.getConfiguration();
42 | IRiakClient client = ClientFactory.clusterClient(RiakConfig.getRiakLocatons(conf));
43 | bucket = client.fetchBucket(RiakConfig.getOutputBucket(conf)).execute();
44 | }
45 |
46 | /*
47 | * (non-Javadoc)
48 | *
49 | * @see
50 | * org.apache.hadoop.mapreduce.RecordWriter#close(org.apache.hadoop.mapreduce
51 | * .TaskAttemptContext)
52 | */
53 | @Override public void close(TaskAttemptContext tac) throws IOException, InterruptedException {
54 | // NO-OP
55 | }
56 |
57 | /*
58 | * (non-Javadoc)
59 | *
60 | * @see org.apache.hadoop.mapreduce.RecordWriter#write(java.lang.Object,
61 | * java.lang.Object)
62 | */
63 | @Override public void write(Text key, V value) throws IOException, InterruptedException {
64 | try {
65 | bucket.store(key.toString(), value).execute();
66 | } catch (RiakException e) {
67 | throw new IOException(e);
68 | }
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/RiakOutputFormat.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop;
15 |
16 | import java.io.IOException;
17 |
18 | import org.apache.hadoop.io.Text;
19 | import org.apache.hadoop.mapreduce.JobContext;
20 | import org.apache.hadoop.mapreduce.OutputCommitter;
21 | import org.apache.hadoop.mapreduce.OutputFormat;
22 | import org.apache.hadoop.mapreduce.RecordWriter;
23 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
24 |
25 | import com.basho.riak.client.RiakException;
26 |
27 | /**
28 | * Riak specific {@link OutputFormat}, just creates a {@link RiakRecordWriter}
29 | *
30 | * @author russell
31 | *
32 | */
33 | public class RiakOutputFormat<V> extends OutputFormat<Text, V> {
34 |
35 | /*
36 | * (non-Javadoc)
37 | *
38 | * @see
39 | * org.apache.hadoop.mapreduce.OutputFormat#checkOutputSpecs(org.apache.
40 | * hadoop.mapreduce.JobContext)
41 | */
42 | @Override public void checkOutputSpecs(JobContext ctx) throws IOException, InterruptedException {}
43 |
44 | /*
45 | * (non-Javadoc)
46 | *
47 | * @see
48 | * org.apache.hadoop.mapreduce.OutputFormat#getOutputCommitter(org.apache
49 | * .hadoop.mapreduce.TaskAttemptContext)
50 | */
51 | @Override public OutputCommitter getOutputCommitter(TaskAttemptContext tac) throws IOException,
52 | InterruptedException {
53 | return new RiakOutputCommitter();
54 | }
55 |
56 | /*
57 | * (non-Javadoc)
58 | *
59 | * @see
60 | * org.apache.hadoop.mapreduce.OutputFormat#getRecordWriter(org.apache.hadoop
61 | * .mapreduce.TaskAttemptContext)
62 | */
63 | @Override public RecordWriter<Text, V> getRecordWriter(TaskAttemptContext tac) throws IOException,
64 | InterruptedException {
65 | try {
66 | return new RiakRecordWriter<V>(tac);
67 | } catch (RiakException e) {
68 | throw new IOException(e);
69 | }
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/src/test/java/com/basho/riak/hadoop/keylisters/KeysKeyListerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.keylisters;
15 |
16 | import static org.junit.Assert.assertEquals;
17 | import static org.junit.Assert.fail;
18 |
19 | import java.util.Arrays;
20 | import java.util.HashSet;
21 | import java.util.List;
22 | import java.util.Set;
23 |
24 | import org.junit.Test;
25 |
26 | import com.basho.riak.hadoop.BucketKey;
27 | import com.basho.riak.hadoop.keylisters.KeysKeyLister;
28 |
29 | /**
30 | * @author russell
31 | *
32 | */
33 | public class KeysKeyListerTest {
34 |
35 | private static final String BUCKET_NAME = "bucket";
36 |
37 | private KeysKeyLister lister;
38 |
39 | /**
40 | * Test method for
41 | * {@link com.basho.riak.hadoop.keylisters.KeysKeyLister#KeysKeyLister(java.util.List)}
42 | * .
43 | */
44 | @Test public void createWithKeys() throws Exception {
45 | Set<BucketKey> keys = new HashSet<BucketKey>(Arrays.asList(new BucketKey(BUCKET_NAME, "k1"), new BucketKey(BUCKET_NAME,
46 | "k2s")));
47 | lister = new KeysKeyLister(keys);
48 | assertEquals(keys, lister.getKeys(null));
49 | }
50 |
51 | /**
52 | * Test method for
53 | * {@link com.basho.riak.hadoop.keylisters.KeysKeyLister#KeysKeyLister(java.util.List, java.lang.String)}
54 | * .
55 | */
56 | @Test public void createWithKeysAndCommonBucket() throws Exception {
57 | Set<String> keys = new HashSet<String>(Arrays.asList("k1", "k2", "k3", "k4"));
58 | lister = new KeysKeyLister(keys, BUCKET_NAME);
59 |
60 | Set<BucketKey> expected = new HashSet<BucketKey>();
61 | for (String k : keys) {
62 | expected.add(new BucketKey(BUCKET_NAME, k));
63 | }
64 |
65 | assertEquals(expected, lister.getKeys(null));
66 | }
67 |
68 | /**
69 | * Test method for
70 | * {@link com.basho.riak.hadoop.keylisters.KeysKeyLister#KeysKeyLister()}.
71 | */
72 | @Test public void noArgConstructorAndNoInitMeansIllegalState() throws Exception {
73 | lister = new KeysKeyLister();
74 |
75 | try {
76 | lister.getKeys(null);
77 | fail("Expected IllegalStateException");
78 | } catch (IllegalStateException e) {
79 | // NO-OP
80 | }
81 |
82 | }
83 |
84 | /**
85 | * Test method for
86 | * {@link com.basho.riak.hadoop.keylisters.KeysKeyLister#getInitString()}.
87 | */
88 | @Test public void initFromString() throws Exception {
89 | List<BucketKey> keys = Arrays.asList(new BucketKey(BUCKET_NAME, "k1"), new BucketKey(BUCKET_NAME, "k2s"));
90 | lister = new KeysKeyLister(keys);
91 |
92 | KeysKeyLister lister2 = new KeysKeyLister();
93 | lister2.init(lister.getInitString());
94 |
95 | assertEquals(lister.getKeys(null), lister2.getKeys(null));
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/RiakMapper.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop;
15 |
16 | import java.io.IOException;
17 | import java.util.ArrayList;
18 | import java.util.Collection;
19 |
20 | import org.apache.hadoop.mapreduce.Mapper;
21 |
22 | import com.basho.riak.client.IRiakObject;
23 | import com.basho.riak.client.cap.ConflictResolver;
24 | import com.basho.riak.client.convert.Converter;
25 | import com.basho.riak.client.raw.RiakResponse;
26 |
27 | /**
28 | * A Riak specific extension of {@link Mapper} that can be used if you wish to
29 | * work with domain specific types and handle sibling values in your
30 | * {@link Mapper#map} method
31 | *
32 | * @author russell
33 | * @param <T>
34 | * the type for the input value
35 | * @param <KOUT>
36 | * the type for the out key
37 | * @param <VOUT>
38 | * the type for the out value
39 | *
40 | */
41 | public abstract class RiakMapper<T, KOUT, VOUT> extends Mapper<BucketKey, RiakResponse, KOUT, VOUT> {
42 |
43 | private final Converter<T> converter;
44 | private final ConflictResolver<T> resolver;
45 |
46 | /**
47 | * Create a {@link Mapper} that will use the provided {@link Converter} and
48 | * {@link ConflictResolver} on the raw {@link RiakResponse} returned by the
49 | * {@link RiakRecordReader}
50 | *
51 | * @param converter
52 | * a {@link Converter}
53 | * @param resolver
54 | * a {@link ConflictResolver}
55 | */
56 | public RiakMapper(Converter<T> converter, ConflictResolver<T> resolver) {
57 | this.converter = converter;
58 | this.resolver = resolver;
59 | }
60 |
61 | /*
62 | * (non-Javadoc)
63 | *
64 | * @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object,
65 | * java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
66 | */
67 | @Override public void map(BucketKey key, RiakResponse value, Context context) throws IOException,
68 | InterruptedException {
69 |
70 | // convert, conflict resolve
71 | final Collection<T> siblings = new ArrayList<T>(value.numberOfValues());
72 |
73 | for (IRiakObject o : value) {
74 | siblings.add(converter.toDomain(o));
75 | }
76 |
77 | map(key, resolver.resolve(siblings), context);
78 | }
79 |
80 | /**
81 | * Override this method in your {@link Mapper}. It is called by the default
82 | * {@link Mapper#map} method, after applying the {@link Converter} and
83 | * {@link ConflictResolver}. Put your mapping code here.
84 | *
85 | * @param k
86 | * the {@link BucketKey}
87 | * @param value
88 | * the converted value
89 | * @param context
90 | * the hadoop job Context
91 | * @throws IOException
92 | * @throws InterruptedException
93 | */
94 | public abstract void map(BucketKey k, T value, Context context) throws IOException, InterruptedException;
95 |
96 | }
97 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/RiakRecordReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop;
15 |
16 | import static com.basho.riak.hadoop.config.ClientFactory.getRawClient;
17 |
18 | import java.io.IOException;
19 | import java.util.concurrent.ConcurrentLinkedQueue;
20 |
21 | import org.apache.hadoop.mapreduce.InputSplit;
22 | import org.apache.hadoop.mapreduce.RecordReader;
23 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
24 |
25 | import com.basho.riak.client.raw.RawClient;
26 | import com.basho.riak.client.raw.RiakResponse;
27 |
28 | /**
29 | * Wrapper around a {@link RawClient} for reading values from Riak
30 | *
31 | * @author russell
32 | *
33 | */
34 | public class RiakRecordReader extends RecordReader<BucketKey, RiakResponse> {
35 |
36 | private RawClient client;
37 | private ConcurrentLinkedQueue<BucketKey> keys;
38 | private long initialSize;
39 |
40 | /*
41 | * (non-Javadoc)
42 | *
43 | * @see org.apache.hadoop.mapreduce.RecordReader#close()
44 | */
45 | @Override public void close() throws IOException {}
46 |
47 | /*
48 | * (non-Javadoc)
49 | *
50 | * @see org.apache.hadoop.mapreduce.RecordReader#getCurrentKey()
51 | */
52 | @Override public BucketKey getCurrentKey() throws IOException, InterruptedException {
53 | return keys.peek();
54 | }
55 |
56 | /*
57 | * (non-Javadoc)
58 | *
59 | * @see org.apache.hadoop.mapreduce.RecordReader#getCurrentValue()
60 | */
61 | @Override public RiakResponse getCurrentValue() throws IOException, InterruptedException {
62 | BucketKey key = keys.poll();
63 | return client.fetch(key.getBucket(), key.getKey());
64 | }
65 |
66 | /*
67 | * (non-Javadoc)
68 | *
69 | * @see org.apache.hadoop.mapreduce.RecordReader#getProgress()
70 | */
71 | @Override public float getProgress() throws IOException, InterruptedException {
72 | int size = keys.size();
73 | if (size == 0) {
74 | return 0;
75 | } else {
76 | return size / initialSize;
77 | }
78 | }
79 |
80 | /*
81 | * (non-Javadoc)
82 | *
83 | * @see
84 | * org.apache.hadoop.mapreduce.RecordReader#initialize(org.apache.hadoop
85 | * .mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext)
86 | */
87 | @Override public void initialize(InputSplit split, TaskAttemptContext taskAttemptContext) throws IOException,
88 | InterruptedException {
89 | RiakInputSplit inputSplit = (RiakInputSplit) split;
90 | keys = new ConcurrentLinkedQueue<BucketKey>(inputSplit.getInputs());
91 | initialSize = split.getLength();
92 | client = getRawClient(inputSplit.getLocation());
93 | }
94 |
95 | /*
96 | * (non-Javadoc)
97 | *
98 | * @see org.apache.hadoop.mapreduce.RecordReader#nextKeyValue()
99 | */
100 | @Override public boolean nextKeyValue() throws IOException, InterruptedException {
101 | return keys.peek() != null;
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/BucketKey.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop;
15 |
16 | import org.codehaus.jackson.annotate.JsonCreator;
17 |
18 | /**
19 | * Models a bucket/key location in Riak.
20 | *
21 | * @author russell
22 | *
23 | */
24 | public class BucketKey {
25 |
26 | private final String bucket;
27 | private final String key;
28 |
29 | /**
30 | * Provide a JSON constructor for Jackson.
31 | *
32 | * @param bucketKey
33 | * a String[2] where [0] is the bucket and [1] is the key
34 | */
35 | @JsonCreator public BucketKey(String[] bucketKey) {
36 | if (bucketKey == null || bucketKey.length != 2) {
37 | throw new IllegalArgumentException("bucketKey must be a String[] of length 2");
38 | }
39 |
40 | this.bucket = bucketKey[0];
41 | this.key = bucketKey[1];
42 | }
43 |
44 | /**
45 | * Default constructor
46 | *
47 | * @param bucket
48 | * the bucket
49 | * @param key
50 | * the key
51 | */
52 | public BucketKey(String bucket, String key) {
53 | this.bucket = bucket;
54 | this.key = key;
55 | }
56 |
57 | /**
58 | * @return the bucket
59 | */
60 | public String getBucket() {
61 | return bucket;
62 | }
63 |
64 | /**
65 | * @return the key
66 | */
67 | public String getKey() {
68 | return key;
69 | }
70 |
71 | /*
72 | * (non-Javadoc)
73 | *
74 | * @see java.lang.Object#hashCode()
75 | */
76 | @Override public int hashCode() {
77 | final int prime = 31;
78 | int result = 1;
79 | result = prime * result + ((bucket == null) ? 0 : bucket.hashCode());
80 | result = prime * result + ((key == null) ? 0 : key.hashCode());
81 | return result;
82 | }
83 |
84 | /*
85 | * (non-Javadoc)
86 | *
87 | * @see java.lang.Object#equals(java.lang.Object)
88 | */
89 | @Override public boolean equals(Object obj) {
90 | if (this == obj) {
91 | return true;
92 | }
93 | if (obj == null) {
94 | return false;
95 | }
96 | if (!(obj instanceof BucketKey)) {
97 | return false;
98 | }
99 | BucketKey other = (BucketKey) obj;
100 | if (bucket == null) {
101 | if (other.bucket != null) {
102 | return false;
103 | }
104 | } else if (!bucket.equals(other.bucket)) {
105 | return false;
106 | }
107 | if (key == null) {
108 | if (other.key != null) {
109 | return false;
110 | }
111 | } else if (!key.equals(other.key)) {
112 | return false;
113 | }
114 | return true;
115 | }
116 |
117 | /*
118 | * (non-Javadoc)
119 | *
120 | * @see java.lang.Object#toString()
121 | */
122 | @Override public String toString() {
123 | return String.format("BucketKey [bucket=%s, key=%s]", bucket, key);
124 | }
125 |
126 | }
127 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/keylisters/BucketKeyLister.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.keylisters;
15 |
16 | import java.util.ArrayList;
17 | import java.util.Collection;
18 | import java.util.List;
19 |
20 | import com.basho.riak.client.IRiakClient;
21 | import com.basho.riak.client.RiakException;
22 | import com.basho.riak.client.bucket.Bucket;
23 | import com.basho.riak.hadoop.BucketKey;
24 |
25 | /**
26 | * A key lister that performs a full bucket key listing. DANGER: not advised for production use.
27 | *
28 | * @author russell
29 | *
30 | */
31 | public class BucketKeyLister implements KeyLister {
32 |
33 | private static final String EMPTY = "";
34 | private String bucket;
35 |
36 | /**
37 | * no arg CTOR for de-serialization
38 | */
39 | public BucketKeyLister() {}
40 |
41 | /**
42 | * @param bucket
43 | */
44 | public BucketKeyLister(String bucket) {
45 | this.bucket = bucket;
46 | }
47 |
48 | /*
49 | * (non-Javadoc)
50 | *
51 | * @see com.basho.riak.hadoop.KeyLister#getKeys()
52 | */
53 | public Collection<BucketKey> getKeys(IRiakClient client) throws RiakException {
54 | if (bucket == null || bucket.trim().equals(EMPTY)) {
55 | throw new IllegalStateException("bucket cannot be null or empty");
56 | }
57 |
58 | List<BucketKey> keys = new ArrayList<BucketKey>();
59 | Bucket b = client.fetchBucket(bucket).execute();
60 |
61 | for (String key : b.keys()) {
62 | keys.add(new BucketKey(bucket, key));
63 |
64 | }
65 | return keys;
66 | }
67 |
68 | /*
69 | * (non-Javadoc)
70 | *
71 | * @see com.basho.riak.hadoop.KeyLister#init(java.lang.String)
72 | */
73 | public void init(String bucket) {
74 | this.bucket = bucket;
75 | }
76 |
77 | /*
78 | * (non-Javadoc)
79 | *
80 | * @see com.basho.riak.hadoop.KeyLister#getInitString()
81 | */
82 | public String getInitString() {
83 | return bucket;
84 | }
85 |
86 | /*
87 | * (non-Javadoc)
88 | *
89 | * @see java.lang.Object#hashCode()
90 | */
91 | @Override public int hashCode() {
92 | final int prime = 31;
93 | int result = 1;
94 | result = prime * result + ((bucket == null) ? 0 : bucket.hashCode());
95 | return result;
96 | }
97 |
98 | /*
99 | * (non-Javadoc)
100 | *
101 | * @see java.lang.Object#equals(java.lang.Object)
102 | */
103 | @Override public boolean equals(Object obj) {
104 | if (this == obj) {
105 | return true;
106 | }
107 | if (obj == null) {
108 | return false;
109 | }
110 | if (!(obj instanceof BucketKeyLister)) {
111 | return false;
112 | }
113 | BucketKeyLister other = (BucketKeyLister) obj;
114 | if (bucket == null) {
115 | if (other.bucket != null) {
116 | return false;
117 | }
118 | } else if (!bucket.equals(other.bucket)) {
119 | return false;
120 | }
121 | return true;
122 | }
123 | }
124 |
--------------------------------------------------------------------------------
/src/test/java/com/basho/riak/hadoop/keylisters/RiakSearchKeyListerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.keylisters;
15 |
16 | import static org.mockito.Mockito.*;
17 | import static org.junit.Assert.*;
18 |
19 | import java.util.Arrays;
20 | import java.util.Collection;
21 |
22 | import org.junit.Before;
23 | import org.junit.Test;
24 | import org.mockito.Mock;
25 | import org.mockito.MockitoAnnotations;
26 |
27 | import com.basho.riak.client.IRiakClient;
28 | import com.basho.riak.client.query.MapReduceResult;
29 | import com.basho.riak.client.query.SearchMapReduce;
30 | import com.basho.riak.client.query.functions.Args;
31 | import com.basho.riak.client.query.functions.NamedErlangFunction;
32 | import com.basho.riak.hadoop.BucketKey;
33 | import com.basho.riak.hadoop.keylisters.RiakSearchKeyLister;
34 |
35 | /**
36 | * @author russell
37 | *
38 | */
39 | public class RiakSearchKeyListerTest {
40 |
41 | private static final String BUCKET = "bucket";
42 | private static final String QUERY = "foo:zero";
43 |
44 | @Mock private IRiakClient riakClient;
45 | @Mock private SearchMapReduce searchMapReduce;
46 | @Mock private MapReduceResult mapReduceResult;
47 |
48 | private RiakSearchKeyLister lister;
49 |
50 | /**
51 | * @throws java.lang.Exception
52 | */
53 | @Before public void setUp() throws Exception {
54 | MockitoAnnotations.initMocks(this);
55 | }
56 |
57 | /**
58 | * Test method for
59 | * {@link com.basho.riak.hadoop.keylisters.RiakSearchKeyLister#RiakSearchKeyLister(java.lang.String, java.lang.String)}
60 | * .
61 | */
62 | @Test public void createWithBucketAndQuery() throws Exception {
63 | lister = new RiakSearchKeyLister(BUCKET, QUERY);
64 | testLister(lister);
65 | }
66 |
67 | private void testLister(RiakSearchKeyLister lister) throws Exception {
68 | final Collection<BucketKey> expected = Arrays.asList(new BucketKey(BUCKET, "k1"), new BucketKey(BUCKET, "k2"));
69 |
70 | when(riakClient.mapReduce(BUCKET, QUERY)).thenReturn(searchMapReduce);
71 | when(searchMapReduce.addReducePhase(NamedErlangFunction.REDUCE_IDENTITY, Args.REDUCE_PHASE_ONLY_1)).thenReturn(searchMapReduce);
72 | when(searchMapReduce.execute()).thenReturn(mapReduceResult);
73 | when(mapReduceResult.getResult(BucketKey.class)).thenReturn(expected);
74 |
75 | final Collection<BucketKey> actual = lister.getKeys(riakClient);
76 | assertEquals(expected, actual);
77 | }
78 |
79 | /**
80 | * Test method for
81 | * {@link com.basho.riak.hadoop.keylisters.RiakSearchKeyLister#RiakSearchKeyLister()}.
82 | */
83 | @Test public void emptyListerIllegalState() throws Exception {
84 | lister = new RiakSearchKeyLister();
85 |
86 | try {
87 | lister.getKeys(riakClient);
88 | fail("Expected IllegalStateException");
89 | } catch (IllegalStateException e) {
90 | // NO-OP
91 | }
92 | }
93 |
94 | /**
95 | * Test method for
96 | * {@link com.basho.riak.hadoop.keylisters.RiakSearchKeyLister#getInitString()}.
97 | */
98 | @Test public void getInitString() throws Exception {
99 | lister = new RiakSearchKeyLister(BUCKET, QUERY);
100 |
101 | String initString = lister.getInitString();
102 |
103 | RiakSearchKeyLister listerToo = new RiakSearchKeyLister();
104 | listerToo.init(initString);
105 |
106 | testLister(listerToo);
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/src/test/java/com/basho/riak/hadoop/RiakInputFormatTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop;
15 |
16 | import static org.junit.Assert.assertEquals;
17 | import static org.junit.Assert.fail;
18 | import static org.mockito.Mockito.when;
19 |
20 | import java.util.LinkedList;
21 | import java.util.List;
22 |
23 | import org.apache.hadoop.conf.Configuration;
24 | import org.apache.hadoop.mapreduce.InputSplit;
25 | import org.apache.hadoop.mapreduce.JobContext;
26 | import org.junit.Before;
27 | import org.junit.Test;
28 | import org.mockito.Mock;
29 | import org.mockito.MockitoAnnotations;
30 |
31 | import com.basho.riak.hadoop.config.NoRiakLocationsException;
32 | import com.basho.riak.hadoop.config.RiakLocation;
33 | import com.basho.riak.hadoop.config.RiakPBLocation;
34 |
35 | /**
36 | * @author russell
37 | *
38 | */
39 | public class RiakInputFormatTest {
40 |
41 | private static final String BUCKET = "bucket";
42 | private static final String KEY = "key";
43 |
44 | @Mock public JobContext jobContext;
45 |
46 | private RiakInputFormat inputFormat;
47 |
48 | /**
49 | * @throws java.lang.Exception
50 | */
51 | @Before public void setUp() throws Exception {
52 | MockitoAnnotations.initMocks(this);
53 | inputFormat = new RiakInputFormat();
54 | }
55 |
56 | /**
57 | * Test method for
58 | * {@link com.basho.riak.hadoop.RiakInputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)}
59 | * .
60 | */
61 | @Test public void getSplits_noLocations() throws Exception {
62 | Configuration conf = new Configuration();
63 | when(jobContext.getConfiguration()).thenReturn(conf);
64 | try {
65 | inputFormat.getSplits(jobContext);
66 | fail("Expected IOException");
67 | } catch (NoRiakLocationsException e) {
68 | // NO-OP
69 | }
70 | }
71 |
72 | @Test public void getSplitSize() {
73 | assertEquals(10, RiakInputFormat.getSplitSize(10, 4));
74 | assertEquals(20, RiakInputFormat.getSplitSize(800, 4));
75 | assertEquals(2500, RiakInputFormat.getSplitSize(100000, 4));
76 | }
77 |
78 | @Test public void getSplits() throws Exception {
79 | final List<BucketKey> bks = new LinkedList<BucketKey>();
80 | for (int i = 0; i < 100001; i++) {
81 | bks.add(new BucketKey(BUCKET, KEY + i));
82 | }
83 |
84 | RiakLocation[] locations = new RiakLocation[] { new RiakPBLocation("host1", 8091),
85 | new RiakPBLocation("host2", 8091),
86 | new RiakPBLocation("host3", 8091),
87 | new RiakPBLocation("host4", 8091) };
88 |
89 | List<InputSplit> splits = RiakInputFormat.getSplits(bks, locations, 999);
90 |
91 | assertEquals("Expected 101 splits", 101, splits.size());
92 |
93 | int _999SplitCnt = 0;
94 | int _101SplitCnt = 0;
95 | int otherSplitCnt = 0;
96 |
97 | for (InputSplit is : splits) {
98 | long length = is.getLength();
99 |
100 | if (length == 999) {
101 | _999SplitCnt++;
102 | } else if (length == 101) {
103 | _101SplitCnt++;
104 | } else {
105 | otherSplitCnt++;
106 | }
107 | }
108 |
109 | assertEquals("Should be 100 splits of 999 keys", 100, _999SplitCnt);
110 | assertEquals("Should be 1 split of 101 keys", 1, _101SplitCnt);
111 | assertEquals("Should be 0 splits of with neither 999 or 101 keys", 0, otherSplitCnt);
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/src/test/java/com/basho/riak/hadoop/config/RiakConfigTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.config;
15 |
16 | import static org.junit.Assert.assertEquals;
17 | import static org.junit.Assert.assertTrue;
18 |
19 | import java.util.Arrays;
20 |
21 | import org.apache.hadoop.conf.Configuration;
22 | import org.junit.Test;
23 |
24 | import com.basho.riak.client.query.indexes.BinIndex;
25 | import com.basho.riak.client.raw.query.indexes.BinRangeQuery;
26 | import com.basho.riak.hadoop.config.RiakConfig;
27 | import com.basho.riak.hadoop.config.RiakHTTPLocation;
28 | import com.basho.riak.hadoop.config.RiakLocation;
29 | import com.basho.riak.hadoop.config.RiakPBLocation;
30 | import com.basho.riak.hadoop.keylisters.BucketKeyLister;
31 | import com.basho.riak.hadoop.keylisters.KeyLister;
32 | import com.basho.riak.hadoop.keylisters.KeysKeyLister;
33 | import com.basho.riak.hadoop.keylisters.RiakSearchKeyLister;
34 | import com.basho.riak.hadoop.keylisters.SecondaryIndexesKeyLister;
35 |
36 | /**
37 | * @author russell
38 | *
39 | */
40 | public class RiakConfigTest {
41 |
42 | private static final String BUCKET = "bucket";
43 |
44 | /**
45 | * Test method for
46 | * {@link com.basho.riak.hadoop.config.RiakConfig#addLocation(org.apache.hadoop.conf.Configuration, com.basho.riak.hadoop.config.RiakLocation)}
47 | * .
48 | */
49 | @Test public void testAddRiakLocations() {
50 | final String host = "127.0.0.1";
51 | final int port = 8097;
52 | Configuration conf = new Configuration();
53 | conf = RiakConfig.addLocation(conf, new RiakPBLocation(host, port));
54 | conf = RiakConfig.addLocation(conf, new RiakHTTPLocation(host, port, "riak"));
55 |
56 | assertEquals("127.0.0.1:8097,http://127.0.0.1:8097/riak", conf.get(RiakConfig.LOCATIONS_PROPERTY));
57 | }
58 |
59 | /**
60 | * Test method for
61 | * {@link com.basho.riak.hadoop.config.RiakConfig#getRiakLocatons(org.apache.hadoop.conf.Configuration)}
62 | * .
63 | */
64 | @Test public void testGetRiakLocatons() {
65 | Configuration conf = new Configuration();
66 | conf.set(RiakConfig.LOCATIONS_PROPERTY, "127.0.0.1:8097,http://127.0.0.1:8097/riak");
67 |
68 | RiakLocation[] locations = RiakConfig.getRiakLocatons(conf);
69 |
70 | assertEquals(2, locations.length);
71 | assertTrue(locations[0] instanceof RiakPBLocation);
72 | assertTrue(locations[1] instanceof RiakHTTPLocation);
73 | assertEquals("127.0.0.1:8097", locations[0].asString());
74 | assertEquals("http://127.0.0.1:8097/riak", locations[1].asString());
75 | }
76 |
77 | @Test public void setAndGetKeyLister() throws Exception {
78 | Configuration conf = new Configuration();
79 |
80 | BucketKeyLister bkl = new BucketKeyLister(BUCKET);
81 | conf = RiakConfig.setKeyLister(conf, bkl);
82 | KeyLister actual = RiakConfig.getKeyLister(conf);
83 | assertEquals(bkl, actual);
84 |
85 | KeysKeyLister kkl = new KeysKeyLister(Arrays.asList("k1", "k2", "k3", "k4"), BUCKET);
86 | conf = RiakConfig.setKeyLister(conf, kkl);
87 | actual = RiakConfig.getKeyLister(conf);
88 | assertEquals(kkl, actual);
89 |
90 | RiakSearchKeyLister rskl = new RiakSearchKeyLister(BUCKET, "foo:zero");
91 | conf = RiakConfig.setKeyLister(conf, rskl);
92 | actual = RiakConfig.getKeyLister(conf);
93 | assertEquals(rskl, actual);
94 |
95 | SecondaryIndexesKeyLister sikl = new SecondaryIndexesKeyLister(new BinRangeQuery(BinIndex.named("twitter"),
96 | BUCKET, "from", "to"));
97 | conf = RiakConfig.setKeyLister(conf, sikl);
98 | actual = RiakConfig.getKeyLister(conf);
99 | assertEquals(sikl, actual);
100 | }
101 | }
102 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/config/RiakLocation.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.config;
15 |
16 | import java.net.URI;
17 |
18 | /**
19 | * Models a Riak API end point location
20 | *
21 | * @author russell
22 | *
23 | */
24 | public abstract class RiakLocation {
25 |
26 | private final RiakTransport transport;
27 | private final String host;
28 | private final int port;
29 |
30 | /**
31 | * Create a location
32 | *
33 | * @param transport
34 | * the {@link RiakTransport} for this location
35 | * @param host
36 | * the host
37 | * @param port
38 | * the port
39 | */
40 | protected RiakLocation(RiakTransport transport, String host, int port) {
41 | this.transport = transport;
42 | this.host = host;
43 | this.port = port;
44 | }
45 |
46 | /**
47 | * @return the transport
48 | */
49 | public RiakTransport getTransport() {
50 | return transport;
51 | }
52 |
53 | /**
54 | * @return the host
55 | */
56 | public String getHost() {
57 | return host;
58 | }
59 |
60 | /**
61 | * @return the port
62 | */
63 | public int getPort() {
64 | return port;
65 | }
66 |
67 | /**
68 | * Serialize this location to a String
69 | *
70 | * @return a string representation that can be used by fromString(String)
71 | */
72 | public abstract String asString();
73 |
74 | /**
75 | * De-serialize the location from a String
76 | *
77 | * @param location
78 | * a String representation from asString()
79 | * @return a {@link RiakLocation}
80 | */
81 | public static RiakLocation fromString(String location) {
82 | RiakLocation result = null;
83 | if (location.contains("/")) {
84 | result = parseHttpLocation(location);
85 | } else {
86 | String[] pbLoc = location.split(":");
87 | if (pbLoc.length != 2) {
88 | throw new IllegalArgumentException("Invalid location " + location);
89 | }
90 | result = new RiakPBLocation(pbLoc[0], Integer.parseInt(pbLoc[1]));
91 | }
92 | return result;
93 | }
94 |
95 | /**
96 | * @param location
97 | * @return
98 | */
99 | private static RiakLocation parseHttpLocation(String location) {
100 | final URI uri = URI.create(location);
101 | return new RiakHTTPLocation(uri.getHost(), uri.getPort(), uri.getPath());
102 | }
103 |
104 | /*
105 | * (non-Javadoc)
106 | *
107 | * @see java.lang.Object#hashCode()
108 | */
109 | @Override public int hashCode() {
110 | final int prime = 31;
111 | int result = 1;
112 | result = prime * result + ((host == null) ? 0 : host.hashCode());
113 | result = prime * result + port;
114 | result = prime * result + ((transport == null) ? 0 : transport.hashCode());
115 | return result;
116 | }
117 |
118 | /*
119 | * (non-Javadoc)
120 | *
121 | * @see java.lang.Object#equals(java.lang.Object)
122 | */
123 | @Override public boolean equals(Object obj) {
124 | if (this == obj) {
125 | return true;
126 | }
127 | if (obj == null) {
128 | return false;
129 | }
130 | if (!(obj instanceof RiakLocation)) {
131 | return false;
132 | }
133 | RiakLocation other = (RiakLocation) obj;
134 | if (host == null) {
135 | if (other.host != null) {
136 | return false;
137 | }
138 | } else if (!host.equals(other.host)) {
139 | return false;
140 | }
141 | if (port != other.port) {
142 | return false;
143 | }
144 | if (transport != other.transport) {
145 | return false;
146 | }
147 | return true;
148 | }
149 | }
150 |
--------------------------------------------------------------------------------
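
The fromString(String)/asString() contract above amounts to two wire formats: an HTTP location is any string containing a "/" and is parsed as a URI, while a protocol buffers location is a bare host:port pair. A minimal round-trip sketch, using made-up hostnames and ports:

    import com.basho.riak.hadoop.config.RiakLocation;

    public class LocationFormatSketch {
        public static void main(String[] args) {
            // HTTP form: contains a "/", so it is parsed with java.net.URI
            RiakLocation http = RiakLocation.fromString("http://node1.example.com:8098/riak");
            // PB form: plain host:port, split on ":"
            RiakLocation pb = RiakLocation.fromString("node1.example.com:8087");
            // asString() is documented to produce input fromString(String) accepts again
            System.out.println(RiakLocation.fromString(pb.asString()).equals(pb)); // expected: true
        }
    }
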
/src/main/java/com/basho/riak/hadoop/keylisters/RiakSearchKeyLister.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.keylisters;
15 |
16 | import java.io.IOException;
17 | import java.util.Collection;
18 |
19 | import org.codehaus.jackson.map.ObjectMapper;
20 |
21 | import com.basho.riak.client.IRiakClient;
22 | import com.basho.riak.client.RiakException;
23 | import com.basho.riak.client.query.MapReduceResult;
24 | import com.basho.riak.client.query.functions.Args;
25 | import com.basho.riak.client.query.functions.NamedErlangFunction;
26 | import com.basho.riak.hadoop.BucketKey;
27 |
28 | /**
29 | * Uses a Riak Search M/R query to produce a list of {@link BucketKey}s for a
30 | * hadoop M/R job
31 | *
32 | * @author russell
33 | *
34 | */
35 | public class RiakSearchKeyLister implements KeyLister {
36 |
37 | private static final ObjectMapper OM = new ObjectMapper();
38 |
39 | private String bucket;
40 | private String searchQuery;
41 |
42 | /**
43 |      * Create a key lister that will execute the given searchQuery against
44 |      * the given bucket to get a list of {@link BucketKey}s
45 | *
46 | * @param bucket
47 | * @param searchQuery
48 | */
49 | public RiakSearchKeyLister(String bucket, String searchQuery) {
50 | this.bucket = bucket;
51 | this.searchQuery = searchQuery;
52 | }
53 |
54 | public RiakSearchKeyLister() {}
55 |
56 | /*
57 | * (non-Javadoc)
58 | *
59 | * @see com.basho.riak.hadoop.KeyLister#getInitString()
60 | */
61 | public String getInitString() throws IOException {
62 | return OM.writeValueAsString(new String[] { bucket, searchQuery });
63 | }
64 |
65 | /*
66 | * (non-Javadoc)
67 | *
68 | * @see com.basho.riak.hadoop.KeyLister#init(java.lang.String)
69 | */
70 | public void init(String initString) throws IOException {
71 | String[] bq = OM.readValue(initString, String[].class);
72 | bucket = bq[0];
73 | searchQuery = bq[1];
74 | }
75 |
76 | /*
77 | * (non-Javadoc)
78 | *
79 | * @see
80 | * com.basho.riak.hadoop.KeyLister#getKeys(com.basho.riak.client.IRiakClient
81 | * )
82 | */
83 |     public Collection<BucketKey> getKeys(IRiakClient client) throws RiakException {
84 | if (bucket == null || searchQuery == null) {
85 | throw new IllegalStateException("bucket and query cannot be null");
86 | }
87 |
88 | MapReduceResult result = client.mapReduce(bucket, searchQuery).addReducePhase(NamedErlangFunction.REDUCE_IDENTITY,
89 | Args.REDUCE_PHASE_ONLY_1).execute();
90 |
91 | return result.getResult(BucketKey.class);
92 | }
93 |
94 | /*
95 | * (non-Javadoc)
96 | *
97 | * @see java.lang.Object#hashCode()
98 | */
99 | @Override public int hashCode() {
100 | final int prime = 31;
101 | int result = 1;
102 | result = prime * result + ((bucket == null) ? 0 : bucket.hashCode());
103 | result = prime * result + ((searchQuery == null) ? 0 : searchQuery.hashCode());
104 | return result;
105 | }
106 |
107 | /*
108 | * (non-Javadoc)
109 | *
110 | * @see java.lang.Object#equals(java.lang.Object)
111 | */
112 | @Override public boolean equals(Object obj) {
113 | if (this == obj) {
114 | return true;
115 | }
116 | if (obj == null) {
117 | return false;
118 | }
119 | if (!(obj instanceof RiakSearchKeyLister)) {
120 | return false;
121 | }
122 | RiakSearchKeyLister other = (RiakSearchKeyLister) obj;
123 | if (bucket == null) {
124 | if (other.bucket != null) {
125 | return false;
126 | }
127 | } else if (!bucket.equals(other.bucket)) {
128 | return false;
129 | }
130 | if (searchQuery == null) {
131 | if (other.searchQuery != null) {
132 | return false;
133 | }
134 | } else if (!searchQuery.equals(other.searchQuery)) {
135 | return false;
136 | }
137 | return true;
138 | }
139 | }
140 |
--------------------------------------------------------------------------------
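
The init string used above is just a two-element JSON array of [bucket, searchQuery], which is how the lister survives the trip through the Hadoop Configuration. A short round-trip sketch, with an invented bucket name and search query:

    import com.basho.riak.hadoop.keylisters.RiakSearchKeyLister;

    public class SearchListerSketch {
        public static void main(String[] args) throws Exception {
            RiakSearchKeyLister lister = new RiakSearchKeyLister("logs", "level:error");
            // serialized as something like ["logs","level:error"]
            String initString = lister.getInitString();

            // a task-side instance is rebuilt from the same string
            RiakSearchKeyLister rebuilt = new RiakSearchKeyLister();
            rebuilt.init(initString);
            System.out.println(rebuilt.equals(lister)); // expected: true
        }
    }
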
/src/main/java/com/basho/riak/hadoop/RiakInputSplit.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop;
15 |
16 | import java.io.DataInput;
17 | import java.io.DataOutput;
18 | import java.io.IOException;
19 | import java.util.Arrays;
20 | import java.util.Collection;
21 | import java.util.List;
22 |
23 | import org.apache.hadoop.io.Writable;
24 | import org.apache.hadoop.mapreduce.InputSplit;
25 |
26 | import com.basho.riak.hadoop.config.RiakLocation;
27 |
28 | /**
29 | * Riak specific extension of {@link InputSplit}
30 | *
31 | * @author russell
32 | *
33 | */
34 | public class RiakInputSplit extends InputSplit implements Writable {
35 |
36 | private BucketKey[] inputs;
37 | private RiakLocation location;
38 |
39 | public RiakInputSplit() {};
40 |
41 |     public RiakInputSplit(List<BucketKey> split, RiakLocation location) {
42 | this.inputs = split.toArray(new BucketKey[split.size()]);
43 | this.location = location;
44 | }
45 |
46 | /**
47 | * @return the location for the split (this is where the record reader for
48 | * this split will load data from)
49 | */
50 | public synchronized RiakLocation getLocation() {
51 | return location;
52 | }
53 |
54 | /**
55 |      * @return the inputs: the collection of keys whose data will be fetched by
56 | * the record reader
57 | */
58 |     public synchronized Collection<BucketKey> getInputs() {
59 | return Arrays.asList(inputs.clone());
60 | }
61 |
62 | /*
63 | * (non-Javadoc)
64 | *
65 | * @see org.apache.hadoop.mapreduce.InputSplit#getLength()
66 | */
67 | @Override public long getLength() throws IOException, InterruptedException {
68 | return inputs.length;
69 | }
70 |
71 | /*
72 | * (non-Javadoc)
73 | *
74 | * @see org.apache.hadoop.mapreduce.InputSplit#getLocations()
75 | */
76 | @Override public String[] getLocations() throws IOException, InterruptedException {
77 | return new String[] { location.asString() };
78 | }
79 |
80 | /*
81 | * (non-Javadoc)
82 | *
83 | * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
84 | */
85 | public void readFields(DataInput din) throws IOException {
86 | location = RiakLocation.fromString(din.readUTF());
87 | inputs = new BucketKey[din.readInt()];
88 |
89 | for (int i = 0; i < inputs.length; i++) {
90 | inputs[i] = new BucketKey(din.readUTF(), din.readUTF());
91 | }
92 | }
93 |
94 | /*
95 | * (non-Javadoc)
96 | *
97 | * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
98 | */
99 | public void write(DataOutput dout) throws IOException {
100 | dout.writeUTF(location.asString());
101 | dout.writeInt(inputs.length);
102 |
103 | for (BucketKey bk : inputs) {
104 | dout.writeUTF(bk.getBucket());
105 | dout.writeUTF(bk.getKey());
106 | }
107 | }
108 |
109 | /*
110 | * (non-Javadoc)
111 | *
112 | * @see java.lang.Object#hashCode()
113 | */
114 | @Override public int hashCode() {
115 | final int prime = 31;
116 | int result = 1;
117 | result = prime * result + Arrays.hashCode(inputs);
118 | result = prime * result + ((location == null) ? 0 : location.hashCode());
119 | return result;
120 | }
121 |
122 | /*
123 | * (non-Javadoc)
124 | *
125 | * @see java.lang.Object#equals(java.lang.Object)
126 | */
127 | @Override public boolean equals(Object obj) {
128 | if (this == obj) {
129 | return true;
130 | }
131 | if (obj == null) {
132 | return false;
133 | }
134 | if (!(obj instanceof RiakInputSplit)) {
135 | return false;
136 | }
137 | RiakInputSplit other = (RiakInputSplit) obj;
138 | if (!Arrays.equals(inputs, other.inputs)) {
139 | return false;
140 | }
141 | if (location == null) {
142 | if (other.location != null) {
143 | return false;
144 | }
145 | } else if (!location.equals(other.location)) {
146 | return false;
147 | }
148 | return true;
149 | }
150 |
151 | }
152 |
--------------------------------------------------------------------------------
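
Because the split travels to tasks through the Writable contract above (location as a UTF string, then a count, then bucket/key pairs), it can be exercised directly with the java.io data streams. A minimal round-trip sketch with invented keys and a hypothetical PB location:

    import java.io.*;
    import java.util.Arrays;

    import com.basho.riak.hadoop.BucketKey;
    import com.basho.riak.hadoop.RiakInputSplit;
    import com.basho.riak.hadoop.config.RiakLocation;

    public class SplitRoundTripSketch {
        public static void main(String[] args) throws IOException {
            RiakInputSplit split = new RiakInputSplit(
                    Arrays.asList(new BucketKey("logs", "k1"), new BucketKey("logs", "k2")),
                    RiakLocation.fromString("node1.example.com:8087"));

            // serialize the way Hadoop would when shipping the split to a task
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            split.write(new DataOutputStream(bytes));

            // and rebuild it on the "task" side
            RiakInputSplit copy = new RiakInputSplit();
            copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
            System.out.println(copy.equals(split)); // expected: true
        }
    }
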
/src/main/java/com/basho/riak/hadoop/keylisters/KeysKeyLister.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.keylisters;
15 |
16 | import java.util.Collection;
17 | import java.util.HashSet;
18 | import java.util.Set;
19 |
20 | import com.basho.riak.client.IRiakClient;
21 | import com.basho.riak.client.RiakException;
22 | import com.basho.riak.hadoop.BucketKey;
23 |
24 | /**
25 | * Key lister that simply returns the list of keys it is configured with.
26 | *
27 |  * Useful if you get your key list from outside Riak, or for testing against a subset of data.
28 | *
29 | * @author russell
30 | *
31 | */
32 | public class KeysKeyLister implements KeyLister {
33 |
34 | private static final String BK_SEPARATOR = ":";
35 | private static final String ENTRY_SEPARATOR = ",";
36 |
37 |     private Set<BucketKey> keys = null;
38 |
39 | /**
40 | * Provide the keys directly (don't look up in Riak)
41 | *
42 | * @param keys
43 | * the keys to M/R over
44 | */
45 |     public KeysKeyLister(Collection<BucketKey> keys) {
46 |         this.keys = new HashSet<BucketKey>(keys);
47 | }
48 |
49 | /**
50 | * Provide the keys directly (don't look up in Riak)
51 | *
52 | * @param keys
53 | * the keys to M/R over
54 | * @param bucket
55 | * a common bucket the keys share
56 | */
57 |     public KeysKeyLister(Collection<String> keys, String bucket) {
58 |         this.keys = new HashSet<BucketKey>();
59 | for (String k : keys) {
60 | this.keys.add(new BucketKey(bucket, k));
61 | }
62 | }
63 |
64 | public KeysKeyLister() {};
65 |
66 | /*
67 | * (non-Javadoc)
68 | *
69 | * @see com.basho.riak.hadoop.KeyLister#getInitString()
70 | */
71 | public String getInitString() {
72 | StringBuilder sb = new StringBuilder();
73 | String sep = "";
74 | for (BucketKey bk : keys) {
75 | sb.append(sep).append(bk.getBucket()).append(BK_SEPARATOR).append(bk.getKey());
76 | sep = ENTRY_SEPARATOR;
77 | }
78 |
79 | return sb.toString();
80 | }
81 |
82 | /*
83 | * (non-Javadoc)
84 | *
85 | * @see com.basho.riak.hadoop.KeyLister#init(java.lang.String)
86 | */
87 | public void init(String initString) {
88 | if (initString == null) {
89 | throw new IllegalArgumentException("initString cannot be null");
90 | }
91 |         this.keys = new HashSet<BucketKey>();
92 | String[] bks = initString.split(ENTRY_SEPARATOR);
93 |
94 | for (String bk : bks) {
95 | String[] bucketKey = bk.split(BK_SEPARATOR);
96 | keys.add(new BucketKey(bucketKey[0], bucketKey[1]));
97 | }
98 | }
99 |
100 | /*
101 | * (non-Javadoc)
102 | *
103 | * @see
104 | * com.basho.riak.hadoop.KeyLister#getKeys(com.basho.riak.client.IRiakClient
105 | * )
106 | */
107 |     public Collection<BucketKey> getKeys(IRiakClient client) throws RiakException {
108 | if (keys == null) {
109 | throw new IllegalStateException("lister not initialised");
110 | }
111 |         return new HashSet<BucketKey>(keys);
112 | }
113 |
114 | /*
115 | * (non-Javadoc)
116 | *
117 | * @see java.lang.Object#hashCode()
118 | */
119 | @Override public int hashCode() {
120 | final int prime = 31;
121 | int result = 1;
122 | result = prime * result + ((keys == null) ? 0 : keys.hashCode());
123 | return result;
124 | }
125 |
126 | /*
127 | * (non-Javadoc)
128 | *
129 | * @see java.lang.Object#equals(java.lang.Object)
130 | */
131 | @Override public boolean equals(Object obj) {
132 | if (this == obj) {
133 | return true;
134 | }
135 | if (obj == null) {
136 | return false;
137 | }
138 | if (!(obj instanceof KeysKeyLister)) {
139 | return false;
140 | }
141 | KeysKeyLister other = (KeysKeyLister) obj;
142 | if (keys == null) {
143 | if (other.keys != null) {
144 | return false;
145 | }
146 | } else if (!keys.equals(other.keys)) {
147 | return false;
148 | }
149 | return true;
150 | }
151 |
152 | }
153 |
--------------------------------------------------------------------------------
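
The init string here is the simplest of the listers: bucket:key pairs joined with commas (so buckets and keys containing ":" or "," would not survive the round trip). A brief sketch with invented bucket and key names:

    import java.util.Arrays;

    import com.basho.riak.hadoop.keylisters.KeysKeyLister;

    public class KeysListerSketch {
        public static void main(String[] args) {
            KeysKeyLister lister = new KeysKeyLister(Arrays.asList("k1", "k2", "k3"), "logs");
            // produces something like "logs:k1,logs:k3,logs:k2" (the backing set imposes no order)
            String initString = lister.getInitString();

            KeysKeyLister rebuilt = new KeysKeyLister();
            rebuilt.init(initString);
            System.out.println(rebuilt.equals(lister)); // expected: true
        }
    }
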
/src/test/java/com/basho/riak/hadoop/keylisters/BucketKeyListerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.keylisters;
15 |
16 | import static org.junit.Assert.assertEquals;
17 | import static org.junit.Assert.assertTrue;
18 | import static org.junit.Assert.fail;
19 | import static org.mockito.Mockito.when;
20 |
21 | import java.util.ArrayList;
22 | import java.util.Arrays;
23 | import java.util.Collection;
24 | import java.util.List;
25 |
26 | import org.junit.Before;
27 | import org.junit.Test;
28 | import org.mockito.Mock;
29 | import org.mockito.MockitoAnnotations;
30 |
31 | import com.basho.riak.client.IRiakClient;
32 | import com.basho.riak.client.RiakException;
33 | import com.basho.riak.client.bucket.Bucket;
34 | import com.basho.riak.client.bucket.FetchBucket;
35 | import com.basho.riak.hadoop.BucketKey;
36 | import com.basho.riak.hadoop.keylisters.BucketKeyLister;
37 |
38 | /**
39 | * @author russell
40 | *
41 | */
42 | public class BucketKeyListerTest {
43 |
44 | private static final String BUCKET_NAME = "bucket";
45 |
46 | @Mock private IRiakClient riakClient;
47 | @Mock private Bucket bucket;
48 | @Mock private FetchBucket fetchBucket;
49 |
50 | private BucketKeyLister lister;
51 |
52 | /**
53 | * Create {@link BucketKeyLister}, mocks, wire together, stub mocks
54 | */
55 | @Before public void setUp() throws Exception {
56 | MockitoAnnotations.initMocks(this);
57 | // stub default calls to IRiakClient and FetchBucket
58 | when(riakClient.fetchBucket(BUCKET_NAME)).thenReturn(fetchBucket);
59 | when(fetchBucket.execute()).thenReturn(bucket);
60 | }
61 |
62 | /**
63 | * Test method for
64 | * {@link com.basho.riak.hadoop.keylisters.BucketKeyLister#BucketKeyLister()}.
65 | */
66 | @Test public void illegalState() throws Exception {
67 | lister = new BucketKeyLister();
68 | try {
69 | testLister(lister);
70 | fail("expected IllegalStateException");
71 | } catch (IllegalStateException e) {
72 | // NO-OP
73 | }
74 | }
75 |
76 | /**
77 | * Test method for
78 | * {@link com.basho.riak.hadoop.keylisters.BucketKeyLister#BucketKeyLister(java.lang.String)}
79 | * .
80 | */
81 | @Test public void createWithBucket() throws Exception {
82 | lister = new BucketKeyLister(BUCKET_NAME);
83 | testLister(lister);
84 | }
85 |
86 | /**
87 | * Test method for
88 | * {@link com.basho.riak.hadoop.keylisters.BucketKeyLister#init(java.lang.String)}.
89 | */
90 | @Test public void initWithBucket() throws Exception {
91 | lister = new BucketKeyLister();
92 | lister.init(BUCKET_NAME);
93 | testLister(lister);
94 | }
95 |
96 | /**
97 | * Test method for
98 | * {@link com.basho.riak.hadoop.keylisters.BucketKeyLister#getInitString()}.
99 | */
100 | @Test public void testGetInitString() throws Exception {
101 | String initString = new BucketKeyLister(BUCKET_NAME).getInitString();
102 | assertEquals(BUCKET_NAME, initString);
103 | testLister(new BucketKeyLister(initString));
104 | }
105 |
106 | @Test public void exceptionsBubbleUp() throws Exception {
107 | final RiakException re = new RiakException();
108 | lister = new BucketKeyLister(BUCKET_NAME);
109 |
110 | when(bucket.keys()).thenThrow(re);
111 |
112 | try {
113 | lister.getKeys(riakClient);
114 | fail("Expected RiakException");
115 | } catch (RiakException e) {
116 | assertEquals(e, re);
117 | }
118 | }
119 |
120 | @Test public void zeroKeys() throws Exception {
121 | lister = new BucketKeyLister(BUCKET_NAME);
122 |         testLister(lister, new ArrayList<String>());
123 | }
124 |
125 | private void testLister(BucketKeyLister lister) throws Exception {
126 | testLister(lister, Arrays.asList("k1", "k2", "k3", "k4"));
127 | }
128 |
129 |     private void testLister(BucketKeyLister lister, List<String> expectedKeys) throws Exception {
130 | when(bucket.keys()).thenReturn(expectedKeys);
131 |         Collection<BucketKey> keys = lister.getKeys(riakClient);
132 | assertEquals("Expected keys to be same length as stubbed mock value", expectedKeys.size(), keys.size());
133 |
134 | for (String k : expectedKeys) {
135 | assertTrue("Expected keys to contain " + k, keys.contains(new BucketKey(BUCKET_NAME, k)));
136 | }
137 | }
138 | }
139 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/config/ClientFactory.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.config;
15 |
16 | import java.io.IOException;
17 |
18 | import com.basho.riak.client.IRiakClient;
19 | import com.basho.riak.client.RiakException;
20 | import com.basho.riak.client.RiakFactory;
21 | import com.basho.riak.client.raw.RawClient;
22 | import com.basho.riak.client.raw.config.Configuration;
23 | import com.basho.riak.client.raw.http.HTTPClientAdapter;
24 | import com.basho.riak.client.raw.http.HTTPClientConfig;
25 | import com.basho.riak.client.raw.http.HTTPClusterConfig;
26 | import com.basho.riak.client.raw.pbc.PBClientAdapter;
27 | import com.basho.riak.client.raw.pbc.PBClientConfig;
28 | import com.basho.riak.client.raw.pbc.PBClusterConfig;
29 |
30 | /**
31 | * Used for generating clients for input/output
32 | *
33 |  * Replace with existing RJC factory when {@link RiakLocation}s are swapped for
34 | * {@link Configuration}
35 | *
36 | * @author russell
37 | *
38 | */
39 | public final class ClientFactory {
40 |
41 | private ClientFactory() {}
42 |
43 | public static IRiakClient getClient(RiakLocation location) throws RiakException {
44 | // TODO this should use getRawClient, but DefaultRiakClient's
45 | // constructor is wrong visibility
46 | // Either change the visibility or add a method to the factory to accept
47 | // a delegate (the latter!)
48 | IRiakClient client = null;
49 | switch (location.getTransport()) {
50 | case PB:
51 | client = RiakFactory.pbcClient(location.getHost(), location.getPort());
52 | break;
53 | case HTTP:
54 | client = RiakFactory.httpClient(location.asString());
55 | break;
56 | default:
57 | throw new RiakException("Unknown Transport");
58 | }
59 | return client;
60 | }
61 |
62 | public static RawClient getRawClient(RiakLocation location) throws IOException {
63 | RawClient client = null;
64 | switch (location.getTransport()) {
65 | case PB:
66 | client = new PBClientAdapter(location.getHost(), location.getPort());
67 | break;
68 | case HTTP:
69 | client = new HTTPClientAdapter(location.asString());
70 | break;
71 | default:
72 | throw new IOException("Unknown Transport");
73 | }
74 | return client;
75 | }
76 |
77 | /**
78 | * Generate a cluster client from an array of {@link RiakLocation}s
79 | *
80 | * @param riakLocatons
81 | * @return
82 | * @throws IllegalArgumentException
83 | * if locations are not all of same {@link RiakTransport}
84 | */
85 | public static IRiakClient clusterClient(RiakLocation[] riakLocatons) throws RiakException {
86 | IRiakClient client = null;
87 | RiakTransport transport = null;
88 |
89 | if (riakLocatons != null && riakLocatons.length > 0) {
90 | transport = riakLocatons[0].getTransport();
91 | }
92 |
93 | if (RiakTransport.PB.equals(transport)) {
94 | client = pbClusterClient(riakLocatons);
95 | } else if (RiakTransport.HTTP.equals(transport)) {
96 | client = httpClusterClient(riakLocatons);
97 | }
98 |
99 | return client;
100 | }
101 |
102 | /**
103 | * @param riakLocatons
104 | * @return a cluster client of HTTP clients
105 | */
106 | private static IRiakClient httpClusterClient(RiakLocation[] riakLocatons) throws RiakException {
107 | HTTPClusterConfig conf = new HTTPClusterConfig(500); // TODO make this config
108 |
109 | for (RiakLocation loc : riakLocatons) {
110 | if(!RiakTransport.HTTP.equals(loc.getTransport())) {
111 | throw new IllegalArgumentException("Cluster clients must be homogenous");
112 | }
113 |
114 | RiakHTTPLocation httpLoc = (RiakHTTPLocation)loc;
115 | conf.addClient(new HTTPClientConfig.Builder()
116 | .withHost(httpLoc.getHost())
117 | .withPort(httpLoc.getPort())
118 | .withRiakPath(httpLoc.getRiakPath())
119 | .build());
120 | }
121 | return RiakFactory.newClient(conf);
122 | }
123 |
124 | /**
125 | * @param riakLocatons
126 | * @return a cluster client of PB clients
127 | */
128 | private static IRiakClient pbClusterClient(RiakLocation[] riakLocatons) throws RiakException {
129 | PBClusterConfig conf = new PBClusterConfig(500); // TODO make this config
130 |
131 | for (RiakLocation loc : riakLocatons) {
132 | if(!RiakTransport.PB.equals(loc.getTransport())) {
133 | throw new IllegalArgumentException("Cluster clients must be homogenous");
134 | }
135 | conf.addClient(new PBClientConfig.Builder()
136 | .withHost(loc.getHost())
137 | .withPort(loc.getPort())
138 | .build());
139 | }
140 | return RiakFactory.newClient(conf);
141 | }
142 | }
143 |
--------------------------------------------------------------------------------
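
Cluster clients must be homogeneous: every location passed to clusterClient(RiakLocation[]) has to share one transport, otherwise an IllegalArgumentException is thrown. A brief sketch of building a protocol buffers cluster client against hypothetical node addresses:

    import com.basho.riak.client.IRiakClient;
    import com.basho.riak.hadoop.config.ClientFactory;
    import com.basho.riak.hadoop.config.RiakLocation;
    import com.basho.riak.hadoop.config.RiakPBLocation;

    public class ClusterClientSketch {
        public static void main(String[] args) throws Exception {
            RiakLocation[] locations = new RiakLocation[] {
                    new RiakPBLocation("node1.example.com", 8087),
                    new RiakPBLocation("node2.example.com", 8087),
                    new RiakPBLocation("node3.example.com", 8087) };

            // all locations are PB, so this builds a PBClusterConfig-backed client
            IRiakClient client = ClientFactory.clusterClient(locations);
            client.ping();
        }
    }
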
/src/test/java/com/basho/riak/hadoop/keylisters/SecondaryIndexesKeyListerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.keylisters;
15 |
16 | import static org.junit.Assert.assertEquals;
17 | import static org.junit.Assert.fail;
18 | import static org.mockito.Mockito.when;
19 |
20 | import java.util.Arrays;
21 | import java.util.Collection;
22 |
23 | import org.junit.Before;
24 | import org.junit.Test;
25 | import org.mockito.Mock;
26 | import org.mockito.MockitoAnnotations;
27 |
28 | import com.basho.riak.client.IRiakClient;
29 | import com.basho.riak.client.query.IndexMapReduce;
30 | import com.basho.riak.client.query.MapReduceResult;
31 | import com.basho.riak.client.query.functions.Args;
32 | import com.basho.riak.client.query.functions.NamedErlangFunction;
33 | import com.basho.riak.client.query.indexes.BinIndex;
34 | import com.basho.riak.client.query.indexes.IntIndex;
35 | import com.basho.riak.client.raw.query.indexes.BinRangeQuery;
36 | import com.basho.riak.client.raw.query.indexes.BinValueQuery;
37 | import com.basho.riak.client.raw.query.indexes.IndexQuery;
38 | import com.basho.riak.client.raw.query.indexes.IntRangeQuery;
39 | import com.basho.riak.client.raw.query.indexes.IntValueQuery;
40 | import com.basho.riak.hadoop.BucketKey;
41 | import com.basho.riak.hadoop.keylisters.SecondaryIndexesKeyLister;
42 |
43 | /**
44 | * @author russell
45 | *
46 | */
47 | public class SecondaryIndexesKeyListerTest {
48 |
49 | private static final String INDEX = "index";
50 | private static final String BUCKET = "bucket";
51 | private static final String VALUE = "value";
52 | private static final String FROM = "from";
53 | private static final String TO = "to";
54 |
55 | @Mock private IRiakClient riakClient;
56 | @Mock private IndexMapReduce indexMapReduce;
57 | @Mock private MapReduceResult result;
58 |
59 | private SecondaryIndexesKeyLister lister;
60 |
61 | /**
62 | * @throws java.lang.Exception
63 | */
64 | @Before public void setUp() throws Exception {
65 | MockitoAnnotations.initMocks(this);
66 | }
67 |
68 | /**
69 | * Test method for
70 | * {@link com.basho.riak.hadoop.keylisters.SecondaryIndexesKeyLister#SecondaryIndexesKeyLister(com.basho.riak.client.raw.query.indexes.IndexQuery)}
71 | * .
72 | */
73 | @Test public void constructWithQuery() throws Exception {
74 |
75 | IndexQuery query = new BinRangeQuery(BinIndex.named(INDEX), BUCKET, FROM, TO);
76 | lister = new SecondaryIndexesKeyLister(query);
77 |
78 | testLister(lister, query);
79 | }
80 |
81 | /**
82 | * Test method for
83 | * {@link com.basho.riak.hadoop.keylisters.SecondaryIndexesKeyLister#SecondaryIndexesKeyLister()}
84 | * .
85 | */
86 | @Test public void illegalState() throws Exception {
87 | lister = new SecondaryIndexesKeyLister();
88 |
89 | try {
90 | lister.getKeys(riakClient);
91 | fail("Expected IllegalStateException");
92 | } catch (IllegalStateException e) {
93 | // NO-OP
94 | }
95 | }
96 |
97 | /**
98 | * Test method for
99 | * {@link com.basho.riak.hadoop.keylisters.SecondaryIndexesKeyLister#getInitString()}.
100 | */
101 | @Test public void getInitString_binRange() throws Exception {
102 | IndexQuery query = new BinRangeQuery(BinIndex.named(INDEX), BUCKET, FROM, TO);
103 | lister = new SecondaryIndexesKeyLister(query);
104 |
105 | String initString = lister.getInitString();
106 |
107 | SecondaryIndexesKeyLister listerToo = new SecondaryIndexesKeyLister();
108 | listerToo.init(initString);
109 |
110 | testLister(listerToo, query);
111 | }
112 |
113 | @Test public void getInitString_binValue() throws Exception {
114 | IndexQuery query = new BinValueQuery(BinIndex.named(INDEX), BUCKET, VALUE);
115 | lister = new SecondaryIndexesKeyLister(query);
116 |
117 | String initString = lister.getInitString();
118 |
119 | SecondaryIndexesKeyLister listerToo = new SecondaryIndexesKeyLister();
120 | listerToo.init(initString);
121 |
122 | testLister(listerToo, query);
123 | }
124 |
125 |     @Test public void getInitString_intRange() throws Exception {
126 | IndexQuery query = new IntRangeQuery(IntIndex.named(INDEX), BUCKET, 1, 100);
127 | lister = new SecondaryIndexesKeyLister(query);
128 |
129 | String initString = lister.getInitString();
130 |
131 | SecondaryIndexesKeyLister listerToo = new SecondaryIndexesKeyLister();
132 | listerToo.init(initString);
133 |
134 | testLister(listerToo, query);
135 | }
136 |
137 | @Test public void getInitString_intValue() throws Exception {
138 | IndexQuery query = new IntValueQuery(IntIndex.named(INDEX), BUCKET, 10);
139 | lister = new SecondaryIndexesKeyLister(query);
140 |
141 | String initString = lister.getInitString();
142 |
143 | SecondaryIndexesKeyLister listerToo = new SecondaryIndexesKeyLister();
144 | listerToo.init(initString);
145 |
146 | testLister(listerToo, query);
147 | }
148 |
149 | private void testLister(SecondaryIndexesKeyLister lister, IndexQuery query) throws Exception {
150 |         final Collection<BucketKey> expected = Arrays.asList(new BucketKey(BUCKET, "k1"), new BucketKey(BUCKET, "k2"));
151 | when(riakClient.mapReduce(query)).thenReturn(indexMapReduce);
152 | when(indexMapReduce.addReducePhase(NamedErlangFunction.REDUCE_IDENTITY, Args.REDUCE_PHASE_ONLY_1)).thenReturn(indexMapReduce);
153 | when(indexMapReduce.execute()).thenReturn(result);
154 | when(result.getResult(BucketKey.class)).thenReturn(expected);
155 |
156 |         Collection<BucketKey> actual = lister.getKeys(riakClient);
157 |
158 | assertEquals(expected, actual);
159 | }
160 | }
161 |
--------------------------------------------------------------------------------
/src/main/java/com/basho/riak/hadoop/RiakInputFormat.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop;
15 |
16 | import static com.basho.riak.hadoop.config.ClientFactory.getClient;
17 |
18 | import java.io.IOException;
19 | import java.util.ArrayList;
20 | import java.util.List;
21 |
22 | import org.apache.hadoop.conf.Configuration;
23 | import org.apache.hadoop.mapreduce.InputFormat;
24 | import org.apache.hadoop.mapreduce.InputSplit;
25 | import org.apache.hadoop.mapreduce.JobContext;
26 | import org.apache.hadoop.mapreduce.RecordReader;
27 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
28 |
29 | import com.basho.riak.client.IRiakClient;
30 | import com.basho.riak.client.RiakException;
31 | import com.basho.riak.client.raw.RiakResponse;
32 | import com.basho.riak.hadoop.config.NoRiakLocationsException;
33 | import com.basho.riak.hadoop.config.RiakConfig;
34 | import com.basho.riak.hadoop.config.RiakLocation;
35 | import com.basho.riak.hadoop.keylisters.KeyLister;
36 |
37 | /**
38 | * Riak specific {@link InputFormat} for Hadoop Map/Reduce
39 | *
40 | * @author russell
41 | *
42 | */
43 | public class RiakInputFormat extends InputFormat {
44 |
45 | /**
46 | * TODO: add this to the configuration.
47 | */
48 | private static final int MINIMUM_SPLIT = 10;
49 |
50 | /* (non-Javadoc)
51 | * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext)
52 | */
53 | @Override public RecordReader createRecordReader(InputSplit split,
54 | TaskAttemptContext context)
55 | throws IOException, InterruptedException {
56 | return new RiakRecordReader();
57 | }
58 |
59 | /* (non-Javadoc)
60 | * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
61 | */
62 |     @Override public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
63 | Configuration conf = context.getConfiguration();
64 | RiakLocation[] locations = RiakConfig.getRiakLocatons(conf);
65 |
66 | if (locations.length == 0) {
67 | throw new NoRiakLocationsException();
68 | }
69 |
70 | final KeyLister keyLister = RiakConfig.getKeyLister(conf);
71 |
72 | try {
73 |             List<BucketKey> keys = getKeys(locations, keyLister, 0);
74 |             List<InputSplit> splits = getSplits(keys, locations,
75 | getSplitSize(keys.size(), RiakConfig.getHadoopClusterSize(conf, 3)));
76 | return splits;
77 | } catch (RiakException e) {
78 | throw new IOException(e);
79 | }
80 | }
81 |
82 | /**
83 | * Get the list of input keys for the task. If the first location fails, try
84 | * the next, and so on, until we have a success or definitive failure.
85 | *
86 | * @return the list of bucket/keys (may be empty, never null)
87 | * @throws RiakException
88 | */
89 |     public static List<BucketKey> getKeys(RiakLocation[] locations, KeyLister keyLister, int attemptNumber)
90 | throws RiakException {
91 |         final List<BucketKey> keys = new ArrayList<BucketKey>();
92 | try {
93 | IRiakClient attemptClient = getClient(locations[attemptNumber]);
94 | keys.addAll(keyLister.getKeys(attemptClient));
95 | } catch (RiakException e) {
96 | if (attemptNumber >= (locations.length - 1)) {
97 | throw e;
98 | } else {
99 |                 keys.addAll(getKeys(locations, keyLister, attemptNumber + 1));
100 | }
101 | }
102 | return keys;
103 | }
104 |
105 | /**
106 | * Calculates the split size. Uses a *rough* heuristic based on the info
107 | * here http://wiki.apache.org/hadoop/HowManyMapsAndReduces to generate ~10
108 | * splits per hadoop node. Falls back to some lower number if the inputs are
109 |      * smaller, and lower still when there are fewer inputs than hadoop nodes
110 | *
111 | * @param numberOfKeys
112 | * the total input size
113 | * @param hadoopClusterSize
114 | * rough number of nodes in the hadoop m/r cluster
115 | * @return the size for each split
116 | */
117 | public static int getSplitSize(int numberOfKeys, int hadoopClusterSize) {
118 | int splitSize = numberOfKeys / (hadoopClusterSize * 10);
119 | if (splitSize < MINIMUM_SPLIT) {
120 | // too few? then use a smaller divider
121 | splitSize = numberOfKeys / hadoopClusterSize;
122 | if (splitSize < MINIMUM_SPLIT) {
123 | // still too few? just split into splits of MINIMUM_SPLIT
124 | splitSize = MINIMUM_SPLIT;
125 | }
126 | }
127 | return splitSize;
128 | }
129 |
130 | /**
131 |      * Generate the splits; each split (except maybe the last) will contain
132 |      * splitSize keys and will have a {@link RiakLocation} assigned to
133 | * it. The {@link RiakLocation} is chosen by modulus so it should be a
134 | * reasonably fair distribution.
135 | *
136 | * @param keys
137 | * the list of inputs
138 | * @param locations
139 | * all the riak locations
140 | * @param splitSize
141 | * The target size for each split
142 | * @return the input splits
143 | */
144 |     public static List<InputSplit> getSplits(final List<BucketKey> keys, final RiakLocation[] locations, int splitSize) {
145 |         final List<InputSplit> splits = new ArrayList<InputSplit>();
146 | int splitCnt = 0;
147 | int startIndex = 0;
148 | int numberOfKeys = keys.size();
149 | while (startIndex < numberOfKeys) {
150 | int endIndex = Math.min(numberOfKeys, splitSize + startIndex);
151 |             final List<BucketKey> split = keys.subList(startIndex, endIndex);
152 | splits.add(new RiakInputSplit(split, locations[splitCnt % locations.length]));
153 | splitCnt++;
154 | startIndex = endIndex;
155 | }
156 |
157 | return splits;
158 | }
159 | }
--------------------------------------------------------------------------------
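
To make the sizing heuristic above concrete: with 12,000 keys and a 4-node Hadoop cluster, 12000 / (4 * 10) gives splits of 300 keys (about 40 splits); with 200 keys the first divisor yields 5, below MINIMUM_SPLIT, so it falls back to 200 / 4 = 50; with only 20 keys both divisors fall below the minimum and the split size is clamped to 10. A small sketch exercising the two static helpers, using invented key counts and hypothetical locations:

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.hadoop.mapreduce.InputSplit;

    import com.basho.riak.hadoop.BucketKey;
    import com.basho.riak.hadoop.RiakInputFormat;
    import com.basho.riak.hadoop.config.RiakLocation;
    import com.basho.riak.hadoop.config.RiakPBLocation;

    public class SplitSizingSketch {
        public static void main(String[] args) {
            System.out.println(RiakInputFormat.getSplitSize(12000, 4)); // 300
            System.out.println(RiakInputFormat.getSplitSize(200, 4));   // 50
            System.out.println(RiakInputFormat.getSplitSize(20, 4));    // 10 (MINIMUM_SPLIT)

            // locations are handed out round-robin across the generated splits
            List<BucketKey> keys = new ArrayList<BucketKey>();
            for (int i = 0; i < 25; i++) {
                keys.add(new BucketKey("logs", "k" + i));
            }
            RiakLocation[] locations = new RiakLocation[] {
                    new RiakPBLocation("node1.example.com", 8087),
                    new RiakPBLocation("node2.example.com", 8087) };
            List<InputSplit> splits = RiakInputFormat.getSplits(keys, locations, 10);
            System.out.println(splits.size()); // 3 (splits of 10, 10 and 5 keys)
        }
    }
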
/src/main/java/com/basho/riak/hadoop/config/RiakConfig.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.config;
15 |
16 | import java.io.IOException;
17 | import java.util.ArrayList;
18 | import java.util.List;
19 | import java.util.StringTokenizer;
20 |
21 | import org.apache.hadoop.conf.Configuration;
22 | import org.apache.hadoop.mapreduce.InputSplit;
23 |
24 | import com.basho.riak.hadoop.keylisters.BucketKeyLister;
25 | import com.basho.riak.hadoop.keylisters.KeyLister;
26 |
27 | /**
28 | * Helper class to make dealing with the hadoop {@link Configuration} object
29 | * easier when setting up a Riak Map/Reduce job on Hadoop
30 | *
31 | * @author russell
32 | *
33 | */
34 | public final class RiakConfig {
35 |
36 | public static final String LOCATIONS_PROPERTY = "com.basho.riak.hadoop.mr.riak.locations";
37 | private static final String COMMA = ",";
38 | public static final String CLUSTER_SIZE_PROPERTY = "com.basho.riak.hadoop.mr.cluster.size";
39 | private static final String KEY_LISTER_CLASS_PROPERTY = "com.basho.riak.hadoop.mr.keylister.class";
40 | private static final String KEY_LISTER_INIT_STRING_PROPERTY = "com.basho.riak.hadoop.mr.keylister.init_string";
41 | private static final String OUTPUT_BUCKET_PROPERTY = "com.basho.riak.hadoop.mr.output.bucket";
42 |
43 | private RiakConfig() {}
44 |
45 | /**
46 | * Add a riak location to the {@link Configuration} passed.
47 | *
48 | * @param conf
49 |      *            the {@link Configuration} to add a location to
50 | * @param location
51 | * the {@link RiakLocation} to add
52 | * @return the {@link Configuration} with location added to the
53 | * location property
54 | */
55 | public static Configuration addLocation(Configuration conf, RiakLocation location) {
56 | StringBuilder sb = new StringBuilder();
57 | String currentLocations = conf.get(LOCATIONS_PROPERTY);
58 |
59 | if (currentLocations != null) {
60 | sb.append(currentLocations);
61 | }
62 |
63 | if (sb.length() > 0) {
64 | sb.append(COMMA);
65 | }
66 |
67 | sb.append(location.asString());
68 |
69 | conf.set(LOCATIONS_PROPERTY, sb.toString());
70 | return conf;
71 | }
72 |
73 | /**
74 | * Get all the riak locations from the passed {@link Configuration}
75 | *
76 | * @param conf
77 | * the {@link Configuration}
78 | * @return an array of {@link RiakLocation} (may be empty, never null)
79 | */
80 | public static RiakLocation[] getRiakLocatons(Configuration conf) {
81 | String locations = conf.get(LOCATIONS_PROPERTY, "");
82 | StringTokenizer st = new StringTokenizer(locations, COMMA);
83 |         List<RiakLocation> result = new ArrayList<RiakLocation>();
84 |
85 | while (st.hasMoreTokens()) {
86 | result.add(RiakLocation.fromString(st.nextToken()));
87 | }
88 |
89 | return result.toArray(new RiakLocation[result.size()]);
90 | }
91 |
92 | /**
93 |      * Set the size of the hadoop cluster; this is used by the
94 | * {@link RiakInputFormat} to try and optimize the number of
95 | * {@link InputSplit}s to create
96 | *
97 | * @param conf
98 | * the {@link Configuration} to store the hadoop cluster size in
99 | * @param hadoopClusterSize
100 | * the size of the hadoop cluster
101 | * @return the {@link Configuration} updated with the passed
102 | * hadoopClusterSize
103 | */
104 | public static Configuration setHadoopClusterSize(Configuration conf, int hadoopClusterSize) {
105 | conf.setInt(CLUSTER_SIZE_PROPERTY, hadoopClusterSize);
106 | return conf;
107 |
108 | }
109 |
110 | /**
111 |      * Get the hadoop cluster size property, providing a default in case it hasn't
112 | * been set
113 | *
114 | * @param conf
115 | * the {@link Configuration} to get the property value from
116 | * @param defaultValue
117 | * the default size to use if it hasn't been set
118 | * @return the hadoop cluster size or defaultValue
119 | */
120 | public static int getHadoopClusterSize(Configuration conf, int defaultValue) {
121 | return conf.getInt(CLUSTER_SIZE_PROPERTY, defaultValue);
122 | }
123 |
124 | /**
125 | * @param conf
126 | * the {@link Configuration} to query
127 | * @return the {@link KeyLister} the job was configured with
128 | * @throws RuntimeException
129 |      *             if an {@link IllegalAccessException} or
130 | * {@link InstantiationException} is thrown creating a
131 | * {@link KeyLister}
132 | */
133 | public static KeyLister getKeyLister(Configuration conf) throws IOException {
134 |         Class<? extends KeyLister> clazz = conf.getClass(KEY_LISTER_CLASS_PROPERTY, BucketKeyLister.class,
135 | KeyLister.class);
136 | try {
137 | KeyLister lister = clazz.newInstance();
138 | lister.init(conf.get(KEY_LISTER_INIT_STRING_PROPERTY));
139 | return lister;
140 | } catch (IllegalAccessException e) {
141 | throw new RuntimeException(e);
142 | } catch (InstantiationException e) {
143 | throw new RuntimeException(e);
144 | }
145 | }
146 |
147 | /**
148 | * Set the {@link KeyLister} implementation to use.
149 | *
150 | * @param conf
151 | * the {@link Configuration} to update
152 | * @param lister
153 | * the {@link KeyLister} to use
154 | * @return the configuration updated with a serialized version of the lister
155 | * provided
156 | */
157 |     public static <T extends KeyLister> Configuration setKeyLister(Configuration conf, T lister) throws IOException {
158 | conf.setClass(KEY_LISTER_CLASS_PROPERTY, lister.getClass(), KeyLister.class);
159 | conf.setStrings(KEY_LISTER_INIT_STRING_PROPERTY, lister.getInitString());
160 | return conf;
161 | }
162 |
163 | /**
164 | * Get the configured output bucket for the job's results
165 | *
166 | * @param conf
167 | * the {@link Configuration} to query
168 | * @return the bucket name
169 | */
170 | public static String getOutputBucket(Configuration conf) {
171 | return conf.get(OUTPUT_BUCKET_PROPERTY);
172 | }
173 |
174 | /**
175 | * Add the output bucket for the results to the config.
176 | *
177 | * @param conf
178 | * the {@link Configuration} to update
179 | * @param bucket
180 | * the bucket to add
181 | * @return the updated {@link Configuration}
182 | */
183 | public static Configuration setOutputBucket(Configuration conf, String bucket) {
184 | conf.set(OUTPUT_BUCKET_PROPERTY, bucket);
185 | return conf;
186 | }
187 | }
188 |
--------------------------------------------------------------------------------
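
Putting the helpers together, a driver program would typically build up the job Configuration along these lines before handing it to Hadoop (the node addresses, bucket names and choice of key lister are illustrative):

    import org.apache.hadoop.conf.Configuration;

    import com.basho.riak.hadoop.config.RiakConfig;
    import com.basho.riak.hadoop.config.RiakPBLocation;
    import com.basho.riak.hadoop.keylisters.BucketKeyLister;

    public class JobConfigSketch {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            conf = RiakConfig.addLocation(conf, new RiakPBLocation("node1.example.com", 8087));
            conf = RiakConfig.addLocation(conf, new RiakPBLocation("node2.example.com", 8087));
            conf = RiakConfig.setKeyLister(conf, new BucketKeyLister("logs"));
            conf = RiakConfig.setHadoopClusterSize(conf, 4);
            conf = RiakConfig.setOutputBucket(conf, "logs-results");

            // the locations are stored as a comma separated list under LOCATIONS_PROPERTY
            System.out.println(conf.get(RiakConfig.LOCATIONS_PROPERTY));
        }
    }
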
/src/main/java/com/basho/riak/hadoop/keylisters/SecondaryIndexesKeyLister.java:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is provided to you under the Apache License, Version 2.0 (the
3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11 | * License for the specific language governing permissions and limitations under
12 | * the License.
13 | */
14 | package com.basho.riak.hadoop.keylisters;
15 |
16 | import java.io.ByteArrayOutputStream;
17 | import java.io.IOException;
18 | import java.util.Collection;
19 | import java.util.Map;
20 |
21 | import org.codehaus.jackson.JsonEncoding;
22 | import org.codehaus.jackson.JsonFactory;
23 | import org.codehaus.jackson.JsonGenerator;
24 | import org.codehaus.jackson.map.ObjectMapper;
25 |
26 | import com.basho.riak.client.IRiakClient;
27 | import com.basho.riak.client.RiakException;
28 | import com.basho.riak.client.query.MapReduceResult;
29 | import com.basho.riak.client.query.functions.Args;
30 | import com.basho.riak.client.query.functions.NamedErlangFunction;
31 | import com.basho.riak.client.query.indexes.BinIndex;
32 | import com.basho.riak.client.query.indexes.IntIndex;
33 | import com.basho.riak.client.raw.query.indexes.BinRangeQuery;
34 | import com.basho.riak.client.raw.query.indexes.BinValueQuery;
35 | import com.basho.riak.client.raw.query.indexes.IndexQuery;
36 | import com.basho.riak.client.raw.query.indexes.IndexWriter;
37 | import com.basho.riak.client.raw.query.indexes.IntRangeQuery;
38 | import com.basho.riak.client.raw.query.indexes.IntValueQuery;
39 | import com.basho.riak.hadoop.BucketKey;
40 |
41 | /**
42 | * Uses a 2i query to get keys for hadoop m/r.
43 | *
44 | * @author russell
45 | *
46 | */
47 | public class SecondaryIndexesKeyLister implements KeyLister {
48 | private static final String BUCKET = "bucket";
49 | private static final String INDEX = "index";
50 | private static final String KEY = "key";
51 | private static final String START = "start";
52 | private static final String END = "end";
53 |
54 | private IndexQuery query;
55 |
56 | /**
57 |      * @param query the 2i {@link IndexQuery} to run to produce the key list
58 | */
59 | public SecondaryIndexesKeyLister(IndexQuery query) {
60 | this.query = query;
61 | }
62 |
63 | public SecondaryIndexesKeyLister() {}
64 |
65 | /*
66 | * (non-Javadoc)
67 | *
68 | * @see com.basho.riak.hadoop.KeyLister#getInitString()
69 | */
70 | public String getInitString() throws IOException {
71 | // TODO, this is the same as the code in IndexMapReduce, abstract out to
72 | // common class
73 | ByteArrayOutputStream out = new ByteArrayOutputStream();
74 | final JsonGenerator jg = new JsonFactory().createJsonGenerator(out, JsonEncoding.UTF8);
75 |
76 | jg.writeStartObject();
77 |
78 | IndexWriter e = new IndexWriter() {
79 |
80 | private void writeCommon(String bucket, String index) throws IOException {
81 | jg.writeStringField(BUCKET, bucket);
82 | jg.writeStringField(INDEX, index);
83 | }
84 |
85 | public void write(String bucket, String index, int from, int to) throws IOException {
86 | writeCommon(bucket, index);
87 | jg.writeNumberField(START, from);
88 | jg.writeNumberField(END, to);
89 | }
90 |
91 | public void write(String bucket, String index, int value) throws IOException {
92 | writeCommon(bucket, index);
93 | jg.writeNumberField(KEY, value);
94 | }
95 |
96 | public void write(String bucket, String index, String from, String to) throws IOException {
97 | writeCommon(bucket, index);
98 | jg.writeStringField(START, from);
99 | jg.writeStringField(END, to);
100 | }
101 |
102 | public void write(String bucket, String index, String value) throws IOException {
103 | writeCommon(bucket, index);
104 | jg.writeStringField(KEY, value);
105 | }
106 | };
107 |
108 | query.write(e);
109 | jg.writeEndObject();
110 | jg.flush();
111 | jg.close();
112 | return out.toString("UTF-8");
113 | }
114 |
115 | /*
116 | * (non-Javadoc)
117 | *
118 | * @see com.basho.riak.hadoop.KeyLister#init(java.lang.String)
119 | */
120 | public void init(String initString) throws IOException {
121 | // just like FetchIndex, again, abstract out to a common class
122 | boolean isRange = false;
123 | // turn the Json into an index query
124 | @SuppressWarnings("rawtypes") Map map = new ObjectMapper().readValue(initString, Map.class);
125 |
126 | String indexName = (String) map.get(INDEX);
127 | String bucket = (String) map.get(BUCKET);
128 | Object value = map.get(KEY);
129 | Object from = map.get(START);
130 | Object to = map.get(END);
131 |
132 | if (indexName == null) {
133 | throw new IllegalArgumentException("no index present");
134 | }
135 | if (from != null && to != null && value == null) {
136 | isRange = true;
137 | }
138 |
139 | if (indexName != null && indexName.endsWith("_int")) {
140 | if (isRange) {
141 | query = new IntRangeQuery(IntIndex.named(indexName), bucket, (Integer) from, (Integer) to);
142 | } else {
143 | query = new IntValueQuery(IntIndex.named(indexName), bucket, (Integer) value);
144 | }
145 | }
146 |
147 | if (indexName != null && indexName.endsWith("_bin")) {
148 | if (isRange) {
149 | query = new BinRangeQuery(BinIndex.named(indexName), bucket, (String) from, (String) to);
150 | } else {
151 | query = new BinValueQuery(BinIndex.named(indexName), bucket, (String) value);
152 | }
153 | }
154 |
155 | if (query == null) {
156 | throw new IOException("unable to parse query from init string");
157 | }
158 | }
159 |
160 | /*
161 | * (non-Javadoc)
162 | *
163 | * @see
164 | * com.basho.riak.hadoop.KeyLister#getKeys(com.basho.riak.client.IRiakClient
165 | * )
166 | */
167 |     public Collection<BucketKey> getKeys(IRiakClient client) throws RiakException {
168 | if (query == null) {
169 | throw new IllegalStateException("No index query");
170 | }
171 | MapReduceResult r = client.mapReduce(query).addReducePhase(NamedErlangFunction.REDUCE_IDENTITY,
172 | Args.REDUCE_PHASE_ONLY_1).execute();
173 |
174 | return r.getResult(BucketKey.class);
175 | }
176 |
177 | /*
178 | * (non-Javadoc)
179 | *
180 | * @see java.lang.Object#hashCode()
181 | */
182 | @Override public int hashCode() {
183 | final int prime = 31;
184 | int result = 1;
185 | result = prime * result + ((query == null) ? 0 : query.hashCode());
186 | return result;
187 | }
188 |
189 | /*
190 | * (non-Javadoc)
191 | *
192 | * @see java.lang.Object#equals(java.lang.Object)
193 | */
194 | @Override public boolean equals(Object obj) {
195 | if (this == obj) {
196 | return true;
197 | }
198 | if (obj == null) {
199 | return false;
200 | }
201 | if (!(obj instanceof SecondaryIndexesKeyLister)) {
202 | return false;
203 | }
204 | SecondaryIndexesKeyLister other = (SecondaryIndexesKeyLister) obj;
205 | if (query == null) {
206 | if (other.query != null) {
207 | return false;
208 | }
209 | } else if (!query.equals(other.query)) {
210 | return false;
211 | }
212 | return true;
213 | }
214 | }
215 |
--------------------------------------------------------------------------------
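
For reference, the init string produced above is a small JSON object keyed by bucket, index and either key (value queries) or start/end (range queries); init(String) reverses it by looking at the _int/_bin suffix of the index name. A brief round-trip sketch with an invented bucket, index and range:

    import com.basho.riak.client.query.indexes.IntIndex;
    import com.basho.riak.client.raw.query.indexes.IntRangeQuery;
    import com.basho.riak.hadoop.keylisters.SecondaryIndexesKeyLister;

    public class IndexListerSketch {
        public static void main(String[] args) throws Exception {
            SecondaryIndexesKeyLister lister = new SecondaryIndexesKeyLister(
                    new IntRangeQuery(IntIndex.named("age"), "users", 18, 65));

            // something like {"bucket":"users","index":"age_int","start":18,"end":65}
            String initString = lister.getInitString();

            SecondaryIndexesKeyLister rebuilt = new SecondaryIndexesKeyLister();
            rebuilt.init(initString);
            System.out.println(rebuilt.equals(lister)); // expected: true
        }
    }
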