├── .gitignore
├── src
│   ├── main
│   │   ├── resources
│   │   │   └── log4j.xml
│   │   ├── config
│   │   │   ├── luwak.properties
│   │   │   └── combiner.properties
│   │   ├── java
│   │   │   └── uk
│   │   │       └── co
│   │   │           └── flax
│   │   │               └── samzaluwak
│   │   │                   ├── Interact.java
│   │   │                   ├── MatchRecombinerTask.java
│   │   │                   └── MonitorTask.java
│   │   └── assembly
│   │       └── src.xml
│   └── test
│       └── java
│           └── uk
│               └── co
│                   └── flax
│                       └── samzaluwak
│                           └── TestMonitorTask.java
├── README.md
└── pom.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | deploy
3 | *.iml
4 |
5 | .idea/
6 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!-- NOTE: the original markup of this file was lost in extraction; the standard Samza
     (hello-samza) log4j configuration is reproduced below as the assumed content. -->
<!--
  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements. See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership. The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing,
  software distributed under the License is distributed on an
  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  KIND, either express or implied. See the License for the
  specific language governing permissions and limitations
  under the License.
-->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
  <appender name="RollingAppender" class="org.apache.log4j.DailyRollingFileAppender">
    <param name="File" value="${samza.log.dir}/${samza.container.name}.log" />
    <param name="DatePattern" value="'.'yyyy-MM-dd" />
    <layout class="org.apache.log4j.PatternLayout">
      <param name="ConversionPattern" value="%d{yyyy-MM-dd HH:mm:ss} %c{1} [%p] %m%n" />
    </layout>
  </appender>
  <root>
    <priority value="info" />
    <appender-ref ref="RollingAppender" />
  </root>
</log4j:configuration>
--------------------------------------------------------------------------------
/src/main/config/luwak.properties:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | # Job
19 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory
20 | job.name=luwak
21 |
22 | # YARN
23 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz
24 |
25 | # Task
26 | task.class=uk.co.flax.samzaluwak.MonitorTask
27 | task.inputs=kafka.queries,kafka.documents
28 | task.checkpoint.factory=org.apache.samza.checkpoint.kafka.KafkaCheckpointManagerFactory
29 | task.checkpoint.system=kafka
30 | task.checkpoint.replication.factor=1
31 | task.consumer.batch.size=1
32 |
33 | # Serializers
34 | serializers.registry.json.class=org.apache.samza.serializers.JsonSerdeFactory
35 | serializers.registry.string.class=org.apache.samza.serializers.StringSerdeFactory
36 |
37 | # Kafka System
38 | systems.kafka.samza.factory=org.apache.samza.system.kafka.KafkaSystemFactory
39 | systems.kafka.samza.msg.serde=json
40 | systems.kafka.samza.key.serde=string
41 | systems.kafka.consumer.zookeeper.connect=localhost:2181/
42 | systems.kafka.producer.metadata.broker.list=localhost:9092
43 | systems.kafka.producer.producer.type=sync
44 | # Normally, we'd set this much higher, but we want things to look snappy in the demo.
45 | systems.kafka.producer.batch.num.messages=1
46 |
47 | systems.kafka.streams.queries.samza.bootstrap=true
48 | systems.kafka.streams.queries.samza.reset.offset=true
49 | systems.kafka.streams.queries.samza.offset.default=oldest
50 |
--------------------------------------------------------------------------------
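
Note on the configuration above: with systems.kafka.samza.msg.serde=json and key.serde=string, the task receives each Kafka record as a String key plus a message deserialised to a java.util.Map, and because queries is declared a bootstrap stream with reset.offset and oldest as the default offset, Samza replays the whole queries topic before delivering any documents. The sketch below is a hypothetical class, not part of the repository: naive substring matching stands in for Luwak, and it omits the per-query-partition keying that MatchRecombinerTask appears to expect. It only illustrates the stream wiring implied by luwak.properties.

    package uk.co.flax.samzaluwak;

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    import org.apache.samza.system.IncomingMessageEnvelope;
    import org.apache.samza.system.OutgoingMessageEnvelope;
    import org.apache.samza.system.SystemStream;
    import org.apache.samza.task.MessageCollector;
    import org.apache.samza.task.StreamTask;
    import org.apache.samza.task.TaskCoordinator;

    /**
     * Illustrative sketch only -- NOT the project's MonitorTask.
     * Shows how a task distinguishes the two inputs declared in luwak.properties.
     */
    public class MonitorTaskSketch implements StreamTask {

        // the combiner job reads "matches1" (see task.inputs in combiner.properties)
        private static final SystemStream MATCHES_STREAM = new SystemStream("kafka", "matches1");

        // queries seen so far, keyed by query id; fully populated first because
        // "queries" is a bootstrap stream
        private final Map<String, String> queries = new HashMap<>();

        @Override
        public void process(IncomingMessageEnvelope envelope, MessageCollector collector, TaskCoordinator coordinator) {
            String key = (String) envelope.getKey();           // string key serde
            Map msg = (Map) envelope.getMessage();             // json msg serde -> Map
            String stream = envelope.getSystemStreamPartition().getStream();

            if ("queries".equals(stream)) {
                // field name "query" follows TestMonitorTask and Interact
                queries.put(key, (String) msg.get("query"));
                return;
            }

            // a document: match it against every registered query (placeholder for Luwak)
            String text = (String) msg.get("f");
            List<String> matched = new ArrayList<>();
            for (Map.Entry<String, String> q : queries.entrySet()) {
                if (text != null && text.contains(q.getValue())) {
                    matched.add(q.getKey());
                }
            }
            Map<String, Object> result = new HashMap<>();
            result.put("doc", key);
            result.put("matches", matched);
            collector.send(new OutgoingMessageEnvelope(MATCHES_STREAM, key, result));
        }
    }
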
/src/main/config/combiner.properties:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | # Job
19 | job.factory.class=org.apache.samza.job.yarn.YarnJobFactory
20 | job.name=combiner
21 |
22 | # YARN
23 | yarn.package.path=file://${basedir}/target/${project.artifactId}-${pom.version}-dist.tar.gz
24 |
25 | # Task
26 | task.class=uk.co.flax.samzaluwak.MatchRecombinerTask
27 | task.inputs=kafka.matches1
28 | task.checkpoint.factory=org.apache.samza.checkpoint.kafka.KafkaCheckpointManagerFactory
29 | task.checkpoint.system=kafka
30 | task.checkpoint.replication.factor=1
31 | task.consumer.batch.size=1
32 |
33 | # Serializers
34 | serializers.registry.json.class=org.apache.samza.serializers.JsonSerdeFactory
35 | serializers.registry.string.class=org.apache.samza.serializers.StringSerdeFactory
36 |
37 | # Kafka System
38 | systems.kafka.samza.factory=org.apache.samza.system.kafka.KafkaSystemFactory
39 | systems.kafka.samza.msg.serde=json
40 | systems.kafka.samza.key.serde=string
41 | systems.kafka.consumer.zookeeper.connect=localhost:2181/
42 | systems.kafka.producer.metadata.broker.list=localhost:9092
43 | systems.kafka.producer.producer.type=sync
44 | # Normally, we'd set this much higher, but we want things to look snappy in the demo.
45 | systems.kafka.producer.batch.num.messages=1
46 |
47 | stores.matches.factory=org.apache.samza.storage.kv.LevelDbKeyValueStorageEngineFactory
48 | stores.matches.changelog=kafka.matches-combiner-changelog
49 | stores.matches.key.serde=string
50 | stores.matches.msg.serde=json
--------------------------------------------------------------------------------
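
Note on the store configuration above: stores.matches.* declares a LevelDB-backed key-value store named "matches" with a Kafka changelog, string keys and JSON values. MatchRecombinerTask's init() falls outside this excerpt; the hypothetical sketch below shows the usual way such a store is obtained from the TaskContext in this version of Samza (class and field names are illustrative).

    package uk.co.flax.samzaluwak;

    import java.util.Map;

    import org.apache.samza.config.Config;
    import org.apache.samza.storage.kv.KeyValueStore;
    import org.apache.samza.task.InitableTask;
    import org.apache.samza.task.TaskContext;

    /** Illustrative sketch only -- not the repository's MatchRecombinerTask.init(). */
    public class StoreInitSketch implements InitableTask {

        private KeyValueStore<String, Map> store;

        @Override
        @SuppressWarnings("unchecked")
        public void init(Config config, TaskContext context) throws Exception {
            // the name passed to getStore() must match the stores.matches.* prefix above
            store = (KeyValueStore<String, Map>) context.getStore("matches");
        }
    }
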
/src/test/java/uk/co/flax/samzaluwak/TestMonitorTask.java:
--------------------------------------------------------------------------------
1 | package uk.co.flax.samzaluwak;
2 |
3 | import com.google.common.collect.ImmutableMap;
4 | import org.apache.samza.Partition;
5 | import org.apache.samza.system.IncomingMessageEnvelope;
6 | import org.apache.samza.system.OutgoingMessageEnvelope;
7 | import org.apache.samza.system.SystemStreamPartition;
8 | import org.apache.samza.task.MessageCollector;
9 | import org.junit.Test;
10 |
11 | import static org.mockito.Matchers.any;
12 | import static org.mockito.Mockito.mock;
13 | import static org.mockito.Mockito.times;
14 | import static org.mockito.Mockito.verify;
15 |
16 | /**
17 | * Copyright (c) 2014 Lemur Consulting Ltd.
18 | *
19 | * Licensed under the Apache License, Version 2.0 (the "License");
20 | * you may not use this file except in compliance with the License.
21 | * You may obtain a copy of the License at
22 | *
23 | * http://www.apache.org/licenses/LICENSE-2.0
24 | *
25 | * Unless required by applicable law or agreed to in writing, software
26 | * distributed under the License is distributed on an "AS IS" BASIS,
27 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28 | * See the License for the specific language governing permissions and
29 | * limitations under the License.
30 | */
31 |
32 | public class TestMonitorTask {
33 |
34 | public static final SystemStreamPartition QUERY_PART = new SystemStreamPartition("kafka", MonitorTask.QUERIES_STREAM, new Partition(0));
35 | public static final SystemStreamPartition DOCS_PART = new SystemStreamPartition("kafka", MonitorTask.DOCS_STREAM, new Partition(0));
36 |
37 | @Test
38 | public void testTask() throws Exception {
39 |
40 | MonitorTask task = new MonitorTask();
41 | task.init(null, null);
42 |
43 | MessageCollector collector = mock(MessageCollector.class);
44 |
45 | IncomingMessageEnvelope query = new IncomingMessageEnvelope(QUERY_PART, "", "1", ImmutableMap.of("query", "hello world"));
46 | task.process(query, collector, null);
47 |
48 | IncomingMessageEnvelope doc = new IncomingMessageEnvelope(DOCS_PART, "", "doc1", ImmutableMap.of("f", "hello world"));
49 | task.process(doc, collector, null);
50 |
51 | verify(collector, times(1)).send(any(OutgoingMessageEnvelope.class));
52 |
53 | }
54 |
55 |
56 | }
57 |
--------------------------------------------------------------------------------
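
A possible extension of testTask() above (not part of the original test) is to capture the envelope handed to the mocked collector and assert on its contents, using Mockito's ArgumentCaptor:

    // requires: import org.mockito.ArgumentCaptor;
    ArgumentCaptor<OutgoingMessageEnvelope> sent = ArgumentCaptor.forClass(OutgoingMessageEnvelope.class);
    verify(collector, times(1)).send(sent.capture());
    // the emitted key and destination stream could then be checked, for example:
    // assertEquals("doc1", sent.getValue().getKey());   // expected key is an assumption about MonitorTask's output
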
/src/main/java/uk/co/flax/samzaluwak/Interact.java:
--------------------------------------------------------------------------------
1 | package uk.co.flax.samzaluwak;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.IOException;
5 | import java.io.InputStreamReader;
6 | import java.util.Properties;
7 |
8 | import kafka.javaapi.producer.Producer;
9 | import kafka.producer.KeyedMessage;
10 | import kafka.producer.ProducerConfig;
11 |
12 | /**
13 | * Copyright (c) 2014 Lemur Consulting Ltd.
14 | *
15 | * Licensed under the Apache License, Version 2.0 (the "License");
16 | * you may not use this file except in compliance with the License.
17 | * You may obtain a copy of the License at
18 | *
19 | * http://www.apache.org/licenses/LICENSE-2.0
20 | *
21 | * Unless required by applicable law or agreed to in writing, software
22 | * distributed under the License is distributed on an "AS IS" BASIS,
23 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24 | * See the License for the specific language governing permissions and
25 | * limitations under the License.
26 | */
27 |
28 | public class Interact {
29 |
30 | public static void main(String... args) throws IOException {
31 |
32 | Properties props = new Properties();
33 | props.setProperty("metadata.broker.list", "localhost:9092");
34 | props.setProperty("serializer.class", "kafka.serializer.StringEncoder");
35 | ProducerConfig config = new ProducerConfig(props);
36 |
37 | Producer<String, String> producer = new Producer<>(config);
38 |
39 | BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
40 |
41 | while (true) {
42 | String cmd = reader.readLine();
43 | if ("quit".equals(cmd))
44 | return;
45 | if (cmd.startsWith("q "))
46 | sendQueryUpdate(cmd, producer);
47 | if (cmd.startsWith("d "))
48 | sendDocument(cmd, producer);
49 | }
50 |
51 | }
52 |
53 | private static void sendQueryUpdate(String cmd, Producer<String, String> producer) {
54 |
55 | String[] parts = cmd.split("\\s", 3);
56 | String id = parts[1];
57 | String query = parts[2];
58 |
59 | KeyedMessage<String, String> message
60 | = new KeyedMessage<>(MonitorTask.QUERIES_STREAM, id, "{ \"query\" : \"" + query + "\"}");
61 | producer.send(message);
62 |
63 | }
64 |
65 | private static void sendDocument(String cmd, Producer<String, String> producer) {
66 |
67 | String[] parts = cmd.split("\\s", 3);
68 | String id = parts[1];
69 | String doc = parts[2];
70 |
71 | for (int i = 0; i < MatchRecombinerTask.QUERY_PARTITIONS; i++) {
72 | KeyedMessage<String, String> message =
73 | new KeyedMessage<>(MonitorTask.DOCS_STREAM, id, i, "{ \"f\" : \"" + doc + "\"}");
74 | producer.send(message);
75 | }
76 |
77 | }
78 |
79 | }
80 |
--------------------------------------------------------------------------------
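
For reference, Interact implements a line-oriented console protocol: "q <id> <query text>" publishes a query registration keyed by id, "d <id> <document text>" publishes the document once per query partition, and "quit" exits. Given the field names used above, "q 1 hello world" produces the value { "query" : "hello world" } on the queries topic, and "d doc1 hello world" produces { "f" : "hello world" } on the documents topic (the topic names follow task.inputs in luwak.properties).
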
/src/main/assembly/src.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!-- Element markup reconstructed around the surviving values; the original tags were lost in extraction. -->
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2 http://maven.apache.org/xsd/assembly-1.1.2.xsd">
  <id>dist</id>
  <formats>
    <format>tar.gz</format>
  </formats>
  <includeBaseDirectory>false</includeBaseDirectory>
  <files>
    <file>
      <source>${basedir}/src/main/resources/log4j.xml</source>
      <outputDirectory>lib</outputDirectory>
    </file>
    <file>
      <source>${basedir}/src/main/config/luwak.properties</source>
      <outputDirectory>config</outputDirectory>
      <filtered>true</filtered>
    </file>
    <file>
      <source>${basedir}/src/main/config/combiner.properties</source>
      <outputDirectory>config</outputDirectory>
      <filtered>true</filtered>
    </file>
  </files>
  <dependencySets>
    <dependencySet>
      <outputDirectory>bin</outputDirectory>
      <includes>
        <include>org.apache.samza:samza-shell:tgz:dist:*</include>
      </includes>
      <fileMode>0744</fileMode>
      <unpack>true</unpack>
    </dependencySet>
    <dependencySet>
      <outputDirectory>lib</outputDirectory>
      <includes>
        <include>org.apache.samza:samza-core_2.10</include>
        <include>org.apache.samza:samza-kafka_2.10</include>
        <include>org.apache.samza:samza-serializers_2.10</include>
        <include>org.apache.samza:samza-yarn_2.10</include>
        <include>org.apache.samza:samza-kv-leveldb_2.10</include>
        <include>org.apache.samza:samza-log4j</include>
        <include>org.slf4j:slf4j-log4j12</include>
        <include>org.apache.kafka:kafka_2.10</include>
        <include>org.apache.hadoop:hadoop-hdfs</include>
        <include>uk.co.flax:samza-luwak</include>
        <include>uk.co.flax:luwak</include>
      </includes>
      <useTransitiveFiltering>true</useTransitiveFiltering>
    </dependencySet>
  </dependencySets>
</assembly>
--------------------------------------------------------------------------------
/src/main/java/uk/co/flax/samzaluwak/MatchRecombinerTask.java:
--------------------------------------------------------------------------------
1 | package uk.co.flax.samzaluwak;
2 |
3 | import java.util.HashMap;
4 | import java.util.List;
5 | import java.util.Map;
6 |
7 | import com.google.common.collect.Lists;
8 | import org.apache.samza.config.Config;
9 | import org.apache.samza.storage.kv.Entry;
10 | import org.apache.samza.storage.kv.KeyValueIterator;
11 | import org.apache.samza.storage.kv.KeyValueStore;
12 | import org.apache.samza.system.IncomingMessageEnvelope;
13 | import org.apache.samza.system.OutgoingMessageEnvelope;
14 | import org.apache.samza.system.SystemStream;
15 | import org.apache.samza.task.*;
16 | import org.slf4j.Logger;
17 | import org.slf4j.LoggerFactory;
18 |
19 | /**
20 | * Copyright (c) 2014 Lemur Consulting Ltd.
21 | *
22 | * Licensed under the Apache License, Version 2.0 (the "License");
23 | * you may not use this file except in compliance with the License.
24 | * You may obtain a copy of the License at
25 | *
26 | * http://www.apache.org/licenses/LICENSE-2.0
27 | *
28 | * Unless required by applicable law or agreed to in writing, software
29 | * distributed under the License is distributed on an "AS IS" BASIS,
30 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31 | * See the License for the specific language governing permissions and
32 | * limitations under the License.
33 | */
34 |
35 | public class MatchRecombinerTask implements StreamTask, InitableTask {
36 |
37 | private static final Logger logger = LoggerFactory.getLogger(MatchRecombinerTask.class);
38 |
39 | public static final int QUERY_PARTITIONS = 2;
40 |
41 | private KeyValueStore<String, Map> store;
42 |
43 | public static final SystemStream MATCHES_STREAM = new SystemStream("kafka", "combinedmatches");
44 |
45 | @Override
46 | public void process(IncomingMessageEnvelope message, MessageCollector collector, TaskCoordinator taskCoordinator) throws Exception {
47 | String key = (String) message.getKey();
48 | Map matches = (Map) message.getMessage();
49 | store.put(key, matches);
50 | logger.info("Got partial match for {}", key);
51 |
52 | String originalKey = originalKey(key);
53 |
54 | Map<String, Map> parts = collectMatches(originalKey);
55 | if (parts.size() != QUERY_PARTITIONS)
56 | return;
57 |
58 | logger.info("All partial matches for {} received", originalKey);
59 | List