├── .gitignore
├── img
├── magic_view_example.png
├── lambda_architecture.png
└── simplified_lambda_architecture.png
├── pom.xml
├── src
└── main
│ └── java
│ ├── flickr
│ ├── SimplifiedLambdaDemo
│ │ └── SimplifiedLambdaDemo.java
│ └── SimplifiedLambda
│ │ └── SimplifiedLambda.java
│ └── utility
│ └── MockHTable.java
├── test
└── java
│ └── flickr
│ └── SimplifiedLambda
│ └── SimplifiedLambdaTest.java
├── LICENSE.txt
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.iml
3 | target
4 | flickr-simplified-lambda.iml
--------------------------------------------------------------------------------
/img/magic_view_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YahooArchive/simplified-lambda/HEAD/img/magic_view_example.png
--------------------------------------------------------------------------------
/img/lambda_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YahooArchive/simplified-lambda/HEAD/img/lambda_architecture.png
--------------------------------------------------------------------------------
/img/simplified_lambda_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YahooArchive/simplified-lambda/HEAD/img/simplified_lambda_architecture.png
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | flickr-streamlined-lambda
8 | flickr-streamlined-lambda
9 | 0.1
10 |
11 |
12 |
13 | central
14 | Maven Repository Switchboard
15 | default
16 | http://repo1.maven.org/maven2
17 |
18 | false
19 |
20 |
21 |
22 |
23 |
24 |
25 | org.testng
26 | testng
27 | 6.8.7
28 |
29 |
30 | org.apache.hbase
31 | hbase-common
32 | 0.98.7-hadoop2
33 |
34 |
35 | org.apache.hbase
36 | hbase-it
37 | 0.98.7-hadoop2
38 |
39 |
40 |
41 |
42 | test/java
43 |
44 |
45 | org.apache.maven.plugins
46 | maven-compiler-plugin
47 | 2.3.2
48 |
49 | 1.7
50 | 1.7
51 |
52 |
53 |
54 | org.apache.maven.plugins
55 | maven-surefire-plugin
56 | 2.13
57 |
58 |
59 | firefox
60 |
61 |
62 |
63 |
64 | org.codehaus.mojo
65 | exec-maven-plugin
66 | 1.2.1
67 |
68 | flickr.SimplifiedLambdaDemo.SimplifiedLambdaDemo
69 |
70 |
71 |
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
/src/main/java/flickr/SimplifiedLambdaDemo/SimplifiedLambdaDemo.java:
--------------------------------------------------------------------------------
1 | // Copyright 2015 Yahoo Inc.
2 | // Licensed under the terms of the Apache 2.0 License
3 | // See LICENSE.txt file in the project root folder for License terms.
4 |
5 | package flickr.SimplifiedLambdaDemo;
6 |
7 | import flickr.SimplifiedLambda.SimplifiedLambda;
8 | import utility.MockHTable;
9 | import org.apache.hadoop.hbase.client.HTableInterface;
10 |
11 | /**
12 | * Created by bjoshi on 8/18/15.
13 | */
14 | public class SimplifiedLambdaDemo {
15 | // mvn exec:java -Dexec.mainClass="flickr.SimplifiedLambdaDemo.SimplifiedLambdaDemo"
16 | public static void main(final String[] args) throws Exception {
17 | printHeader();
18 |
19 | MockHTable lambdaTable = new MockHTable("lambdaTable");
20 | lambdaTable.addColumnFamily(new String(SimplifiedLambda.FAMILY));
21 |
22 | SimplifiedLambda simplifiedLambda = new SimplifiedLambda(lambdaTable);
23 |
24 | System.out.println("\nInitial table:");
25 | System.out.print(simplifiedLambda.dumpTable());
26 |
27 | System.out.println("\nPush bulk rows:");
28 | simplifiedLambda.pushBulkEntry("rowA", "bulk");
29 | simplifiedLambda.pushBulkEntry("rowB", "bulk");
30 | simplifiedLambda.pushBulkEntry("rowC", "bulk");
31 | simplifiedLambda.cleaner();
32 | System.out.print(simplifiedLambda.dumpTable());
33 |
34 | System.out.println("\nOverride rows A,B with realtime:");
35 | simplifiedLambda.pushRealtimeEntry("rowA", "rtOvrd");
36 | simplifiedLambda.pushRealtimeEntry("rowB", "rtOvrd");
37 | System.out.print(simplifiedLambda.dumpTable());
38 |
39 | System.out.println("\nClean table:");
40 | simplifiedLambda.cleaner();
41 | System.out.print(simplifiedLambda.dumpTable());
42 |
43 | System.out.println("\nOverride rows B,C with realtime:");
44 | simplifiedLambda.pushRealtimeEntry("rowB", "rtOvrd2");
45 | simplifiedLambda.pushRealtimeEntry("rowC", "rtOvrd2");
46 | System.out.print(simplifiedLambda.dumpTable());
47 |
48 | System.out.println("\nOverride all rows with bulk:");
49 | simplifiedLambda.pushBulkEntry("rowA", "bulkOvr");
50 | simplifiedLambda.pushBulkEntry("rowB", "bulkOvr");
51 | simplifiedLambda.cleaner();
52 | System.out.print(simplifiedLambda.dumpTable());
53 |
54 | }
55 |
56 |
57 | static private void printHeader() {
58 | System.out.println(" _");
59 | System.out.println(" . - ` : ` '.' `` . - '` ` .");
60 | System.out.println(" ' ,gi$@$q pggq pggq . ' pggq");
61 | System.out.println(" + j@@@P*\\7 @@@@ @@@@ _ : @@@@ ! ._ , . _ - .");
62 | System.out.println(" . . @@@K @@@@ ; -` `_,_ ` . @@@@ ;/ ` _,,_ `");
63 | System.out.println(" ; pgg@@@@gggq @@@@ @@@@ .' ,iS@@@@@Si @@@@ .6@@@P' !!!! j!!!!7 ;");
64 | System.out.println(" @@@@@@@@@@@ @@@@ @@@@ ` j@@@P*\"*+Y7 @@@@ .6@@@P !!!!47*\"*+;");
65 | System.out.println(" `_ @@@@ @@@@ @@@@ .@@@7 . ` @@@@.6@@@P ` !!!!; . '");
66 | System.out.println(" . @@@@ ' @@@@ @@@@ :@@@! !: @@@@7@@@K `; !!!! ' ` '");
67 | System.out.println(" @@@@ . @@@@ @@@@ `%@@@. . @@@@`7@@@b . !!!! :");
68 | System.out.println(" ! @@@@ @@@@ @@@@ \\@@@$+,,+4b @@@@ `7@@@b !!!!");
69 | System.out.println(" @@@@ : @@@@ @@@@ `7%S@@hX!P' @@@@ `7@@@b !!!! .");
70 | System.out.println(" : \"\"\"\" \"\"\"\" \"\"\"\" :. `^\"^` \"\"\"\" `\"\"\"\"\" ''''");
71 | System.out.println(" ` - . . _._ ` _._ _ . -");
72 | System.out.println(" , ` ,glllllllllg, `-: ' .~ . . . ~. `");
73 | System.out.println(" ,jlllllllllllllllp, .!' .+. . . . . . .+. `.");
74 | System.out.println(" ` jllllllllllllllllllll ` +. . . . . . . . .+ .");
75 | System.out.println(" . jllllllllllllllllllllll . . . . . . . . . . .");
76 | System.out.println(" .l@@@@@@@lllllllllllllll. j. . . . . . . :::::::l `");
77 | System.out.println(" ; ;@@@@@@@@@@@@@@@@@@@lllll :. . :::::::::::::::::: ;");
78 | System.out.println(" :l@@@@@@@@@@@@@@@@@@@@@l; ::::::::::::::::::::::;");
79 | System.out.println(" ` Y@@@@@@@@@@@@@@@@@@@@@P ::::::::::::::::::::: '");
80 | System.out.println(" - Y@@@@@@@@@@@@@@@@@@@P . ::::::::::::::::::: .");
81 | System.out.println(" `*@@@@@@@@@@@@@@@*` ` ` `:::::::::::::::`");
82 | System.out.println(" `. `*%@@@@@@@%*` . ` `+:::::::::+` '");
83 | System.out.println(" . ``` _ ' - . ``` -");
84 | System.out.println(" ` ' ` ' `");
85 | System.out.println(" You're reading. We're hiring. ");
86 | System.out.println(" https://www.flickr.com/jobs/");
87 | System.out.println("");
88 | System.out.println("Simplified Lambda Example");
89 | System.out.println("=========================");
90 | }
91 |
92 |
93 | }
94 |
--------------------------------------------------------------------------------
/test/java/flickr/SimplifiedLambda/SimplifiedLambdaTest.java:
--------------------------------------------------------------------------------
1 | // Copyright 2015 Yahoo Inc.
2 | // Licensed under the terms of the Apache 2.0 License
3 | // See LICENSE.txt file in the project root folder for License terms.
4 |
5 | package flickr.SimplifiedLambda;
6 |
7 | import utility.MockHTable;
8 |
9 | import org.testng.annotations.Test;
10 |
11 | import java.io.IOException;
12 | import java.util.logging.Logger;
13 |
14 | import static org.testng.Assert.assertEquals;
15 | import static org.testng.Assert.assertNotEquals;
16 | import static org.testng.Assert.assertTrue;
17 |
18 | /**
19 | * Created by bjoshi on 8/17/15.
20 | */
21 | public class SimplifiedLambdaTest {
22 | private static Logger LOG = Logger.getLogger(SimplifiedLambda.class.getName());
23 |
24 | private MockHTable createLambdaTable() {
25 | MockHTable lambdaTable = new MockHTable("lambdaTable");
26 | lambdaTable.addColumnFamily(new String(SimplifiedLambda.FAMILY));
27 | return lambdaTable;
28 | }
29 |
30 | @Test
31 | public void testTableCreate() throws Exception {
32 | MockHTable lambdaTable = createLambdaTable();
33 |
34 | SimplifiedLambda simplifiedLambda = new SimplifiedLambda(lambdaTable);
35 | assertTrue(simplifiedLambda instanceof SimplifiedLambda);
36 | }
37 |
38 | @Test
39 | public void testPushAndGetData() throws Exception {
40 | MockHTable lambdaTable = createLambdaTable();
41 |
42 | SimplifiedLambda simplifiedLambda = new SimplifiedLambda(lambdaTable);
43 | assertTrue(simplifiedLambda instanceof SimplifiedLambda);
44 |
45 | simplifiedLambda.pushBulkEntry("bulk","0");
46 | simplifiedLambda.pushRealtimeEntry("realtime", "1");
47 |
48 | assertEquals(simplifiedLambda.getItemForKey("bulk", SimplifiedLambda.BULK_COLUMN),"0");
49 | assertEquals(simplifiedLambda.getItemForKey("realtime", SimplifiedLambda.REALTIME_COLUMN),"1");
50 |
51 | try {
52 | assertNotEquals(simplifiedLambda.getItemForKey("realtime", SimplifiedLambda.BULK_COLUMN), "0");
53 | } catch (IOException e) {
54 | assertEquals(e.toString(), "java.io.IOException: Cannot retrieve string from hbase");
55 | }
56 |
57 | try {
58 | assertNotEquals(simplifiedLambda.getItemForKey("bulk", SimplifiedLambda.REALTIME_COLUMN), "1");
59 | } catch (IOException e) {
60 | assertEquals(e.toString(), "java.io.IOException: Cannot retrieve string from hbase");
61 | }
62 | }
63 |
64 | @Test
65 | public void testTimestamps() throws Exception {
66 | MockHTable lambdaTable = createLambdaTable();
67 |
68 | SimplifiedLambda simplifiedLambda = new SimplifiedLambda(lambdaTable);
69 | assertTrue(simplifiedLambda instanceof SimplifiedLambda);
70 |
71 | simplifiedLambda.pushBulkEntry("bulk","0");
72 | simplifiedLambda.pushRealtimeEntry("realtime", "1");
73 |
74 | long bulkTimestamp = simplifiedLambda.getTimestampForKey("bulk", SimplifiedLambda.BULK_COLUMN);
75 | long realtimeTimestamp = simplifiedLambda.getTimestampForKey("realtime", SimplifiedLambda.REALTIME_COLUMN);
76 |
77 | assertTrue(realtimeTimestamp>bulkTimestamp);
78 | }
79 |
80 | @Test
81 | public void testBulkOnly() throws Exception {
82 | MockHTable lambdaTable = createLambdaTable();
83 |
84 | SimplifiedLambda simplifiedLambda = new SimplifiedLambda(lambdaTable);
85 | assertTrue(simplifiedLambda instanceof SimplifiedLambda);
86 |
87 | simplifiedLambda.pushBulkEntry("bulk","0");
88 |
89 | assertEquals(simplifiedLambda.combiner("bulk"), "0");
90 |
91 | simplifiedLambda.cleaner();
92 |
93 | assertEquals(simplifiedLambda.combiner("bulk"), "0");
94 | assertEquals(simplifiedLambda.getItemForKey("bulk", SimplifiedLambda.BULK_COLUMN), "0");
95 | try {
96 | assertNotEquals(simplifiedLambda.getItemForKey("bulk", SimplifiedLambda.REALTIME_COLUMN), "0");
97 | } catch (IOException e) {
98 | assertEquals(e.toString(), "java.io.IOException: Cannot retrieve string from hbase");
99 | }
100 | }
101 |
102 | @Test
103 | public void testRealtimeOnly() throws Exception {
104 | MockHTable lambdaTable = createLambdaTable();
105 |
106 | SimplifiedLambda simplifiedLambda = new SimplifiedLambda(lambdaTable);
107 | assertTrue(simplifiedLambda instanceof SimplifiedLambda);
108 |
109 | simplifiedLambda.pushRealtimeEntry("realtime", "0");
110 |
111 | assertEquals(simplifiedLambda.combiner("realtime"), "0");
112 |
113 | simplifiedLambda.cleaner();
114 |
115 | assertEquals(simplifiedLambda.combiner("realtime"), "0");
116 |
117 | // check that realtime is pushed to bulk after cleaner
118 | assertEquals(simplifiedLambda.getItemForKey("realtime", SimplifiedLambda.BULK_COLUMN), "0");
119 |
120 | // assert that realtime column is pushed to bulk and is deleted
121 | try {
122 | assertNotEquals(simplifiedLambda.getItemForKey("realtime", SimplifiedLambda.REALTIME_COLUMN), "0");
123 | } catch (IOException e) {
124 | assertEquals(e.toString(), "java.io.IOException: Cannot retrieve string from hbase");
125 | }
126 | }
127 |
128 | @Test
129 | public void testBulkBeforeRealtime() throws Exception {
130 | MockHTable lambdaTable = createLambdaTable();
131 |
132 | SimplifiedLambda simplifiedLambda = new SimplifiedLambda(lambdaTable);
133 | assertTrue(simplifiedLambda instanceof SimplifiedLambda);
134 |
135 | simplifiedLambda.pushBulkEntry("common","0");
136 | simplifiedLambda.pushRealtimeEntry("common", "1");
137 |
138 | assertEquals(simplifiedLambda.combiner("common"), "1");
139 |
140 | simplifiedLambda.cleaner();
141 |
142 | // check that realtime is pushed to bulk after cleaner
143 | assertEquals(simplifiedLambda.getItemForKey("common", SimplifiedLambda.BULK_COLUMN), "1");
144 |
145 | // assert that realtime column is pushed to bulk and is deleted
146 | try {
147 | assertNotEquals(simplifiedLambda.getItemForKey("common", SimplifiedLambda.REALTIME_COLUMN), "1");
148 | } catch (IOException e) {
149 | assertEquals(e.toString(), "java.io.IOException: Cannot retrieve string from hbase");
150 | }
151 | }
152 |
153 | @Test
154 | public void testRealtimeBeforeBulk() throws Exception {
155 | MockHTable lambdaTable = createLambdaTable();
156 |
157 | SimplifiedLambda simplifiedLambda = new SimplifiedLambda(lambdaTable);
158 | assertTrue(simplifiedLambda instanceof SimplifiedLambda);
159 |
160 | simplifiedLambda.pushRealtimeEntry("common", "1");
161 | simplifiedLambda.pushBulkEntry("common","0");
162 |
163 | try {
164 | assertEquals(simplifiedLambda.combiner("common"), "1");
165 | } catch (IOException e) {
166 | assertEquals(e.toString(), "java.io.IOException: Bulk timestamp newer than realtime: shouldn't happen!");
167 | }
168 | simplifiedLambda.cleaner();
169 |
170 | // bulk timestamp is newer, so check that it's intact
171 | assertEquals(simplifiedLambda.getItemForKey("common", SimplifiedLambda.BULK_COLUMN), "0");
172 |
173 | // assert that realtime column is deleted
174 | try {
175 | assertNotEquals(simplifiedLambda.getItemForKey("common", SimplifiedLambda.REALTIME_COLUMN), "1");
176 | } catch (IOException e) {
177 | assertEquals(e.toString(), "java.io.IOException: Cannot retrieve string from hbase");
178 | }
179 | }
180 | }
181 |
--------------------------------------------------------------------------------
/src/main/java/flickr/SimplifiedLambda/SimplifiedLambda.java:
--------------------------------------------------------------------------------
1 | // Copyright 2015 Yahoo Inc.
2 | // Licensed under the terms of the Apache 2.0 License
3 | // See LICENSE.txt file in the project root folder for License terms.
4 |
5 | package flickr.SimplifiedLambda;
6 |
7 | import org.apache.hadoop.hbase.Cell;
8 | import org.apache.hadoop.hbase.CellUtil;
9 | import org.apache.hadoop.hbase.client.*;
10 | import org.apache.hadoop.hbase.util.Bytes;
11 |
12 | import java.io.IOException;
13 |
14 | /**
15 | * Created by bjoshi on 8/17/15.
16 | */
17 | public class SimplifiedLambda {
18 | public final static byte[] FAMILY = Bytes.toBytes("f");
19 | public final static byte[] REALTIME_COLUMN = Bytes.toBytes("REALTIME");
20 | public final static byte[] BULK_COLUMN = Bytes.toBytes("BULK");
21 |
22 | HTableInterface lambdaTable;
23 | public SimplifiedLambda(HTableInterface _lambdaTable)
24 | {
25 | this.lambdaTable = _lambdaTable;
26 | }
27 |
28 | static private byte[] stringToBytes(String input) { return Bytes.toBytes(input); }
29 | static private byte[] getEntityKey(String entity) { return stringToBytes(entity); }
30 |
31 | public void pushBulkEntry(String key, String value) throws IOException {
32 | byte[] entityKey = this.getEntityKey(key);
33 |
34 | Put insertOp = new Put(entityKey);
35 | insertOp.add(FAMILY, BULK_COLUMN, stringToBytes(value));
36 | this.lambdaTable.put(insertOp);
37 | }
38 |
39 | public void pushRealtimeEntry(String key, String value) throws IOException {
40 | byte[] entityKey = this.getEntityKey(key);
41 |
42 | Put insertOp = new Put(entityKey);
43 | insertOp.add(FAMILY, REALTIME_COLUMN, stringToBytes(value));
44 | this.lambdaTable.put(insertOp);
45 | }
46 |
47 | private Cell getCell(String key, byte[] column) throws IOException {
48 | byte[] rowKey = this.getEntityKey(key);
49 |
50 | Get entityRowGet = new Get(rowKey);
51 | Result result = this.lambdaTable.get(entityRowGet);
52 |
53 | Cell cell = result.getColumnLatestCell(FAMILY, column);
54 | return cell;
55 | }
56 |
57 | private String getValueFromCell(Cell cell) {
58 | return Bytes.toString(CellUtil.cloneValue(cell));
59 | }
60 |
61 | public String getItemForKey(String key, byte[] column) throws IOException {
62 | Cell cell = getCell(key, column);
63 |
64 | if (cell != null) {
65 | return getValueFromCell(cell);
66 | } else {
67 | throw new IOException("Cannot retrieve string from hbase");
68 | }
69 | }
70 |
71 | public long getTimestampForKey(String key, byte[] column) throws IOException {
72 | Cell cell = getCell(key, column);
73 |
74 | if (cell != null) {
75 | return cell.getTimestamp();
76 | } else {
77 | throw new IOException("Cannot retrieve string from hbase");
78 | }
79 | }
80 |
81 | public void cleaner() throws IOException {
82 | // algorithm:
83 | // iterate over all rows
84 | // foreach row:
85 | // if hasRealtime && !hasBulk:
86 | // move realtime to bulk
87 | // else if hasRealtime && hasBulk:
88 | // if bulkTimestamp>realtimeTimestamp:
89 | // delete realtimeTimestamp
90 | // else:
91 | // move realtime to bulk
92 | //
93 |
94 | Scan scan = new Scan();
95 | scan.addFamily(FAMILY);
96 | ResultScanner resultScanner = this.lambdaTable.getScanner(scan);
97 |
98 | for (Result result : resultScanner) {
99 | if (result.isEmpty())
100 | continue;
101 | String key = Bytes.toString(result.getRow());
102 | boolean hasBulk = false;
103 | boolean hasRealtime = false;
104 |
105 | Cell bulkCell = getCell(key, BULK_COLUMN);
106 | if (bulkCell != null)
107 | hasBulk = true;
108 |
109 | Cell realtimeCell = getCell(key, REALTIME_COLUMN);
110 | if (realtimeCell != null)
111 | hasRealtime = true;
112 |
113 | byte[] rowKey = this.getEntityKey(key);
114 |
115 | if (hasRealtime && !hasBulk) {
116 | // move realtime to bulk
117 | byte[] realtimeValue = CellUtil.cloneValue(realtimeCell);
118 | Put insertOp = new Put(rowKey);
119 | insertOp.add(FAMILY, BULK_COLUMN, realtimeValue);
120 | Delete deleteOp = new Delete(rowKey);
121 | deleteOp.deleteColumn(FAMILY, REALTIME_COLUMN);
122 |
123 | this.lambdaTable.put(insertOp);
124 | this.lambdaTable.delete(deleteOp);
125 | } else if (hasRealtime && hasBulk) {
126 | long bulkTimestamp = bulkCell.getTimestamp();
127 | long realtimeTimestamp = realtimeCell.getTimestamp();
128 | if (bulkTimestamp > realtimeTimestamp ) {
129 | //delete realtimeTimestamp
130 | Delete deleteOp = new Delete(rowKey);
131 | deleteOp.deleteColumn(FAMILY, REALTIME_COLUMN);
132 | this.lambdaTable.delete(deleteOp);
133 | } else {
134 | // move realtime to bulk
135 | byte[] realtimeValue = CellUtil.cloneValue(realtimeCell);
136 | Put insertOp = new Put(rowKey);
137 | insertOp.add(FAMILY, BULK_COLUMN, realtimeValue);
138 | Delete deleteOp = new Delete(rowKey);
139 | deleteOp.deleteColumn(FAMILY, REALTIME_COLUMN);
140 | this.lambdaTable.put(insertOp);
141 | this.lambdaTable.delete(deleteOp);
142 | }
143 | }
144 | }
145 | }
146 |
147 | public String combiner(String key) throws IOException {
148 | boolean hasBulk = false;
149 | boolean hasRealtime = false;
150 |
151 | Cell bulkCell = getCell(key, BULK_COLUMN);
152 | if (bulkCell != null)
153 | hasBulk = true;
154 |
155 | Cell realtimeCell = getCell(key, REALTIME_COLUMN);
156 | if (realtimeCell != null)
157 | hasRealtime = true;
158 |
159 | if (!hasBulk && !hasRealtime) {
160 | throw new IOException("Cannot retrieve string from hbase");
161 | } else if (hasBulk && !hasRealtime) {
162 | return getValueFromCell(bulkCell);
163 | } else if (!hasBulk && hasRealtime) {
164 | return getValueFromCell(realtimeCell);
165 | } else {
166 | long bulkTimestamp = bulkCell.getTimestamp();
167 | long realtimeTimestamp = realtimeCell.getTimestamp();
168 | if (realtimeTimestamp > bulkTimestamp) {
169 | return getValueFromCell(realtimeCell);
170 | } else {
171 | throw new IOException("Bulk timestamp newer than realtime: shouldn't happen!");
172 | }
173 | }
174 | }
175 |
176 | public String dumpTable() throws IOException {
177 | String res = "key\tbulk\trealtime\tcombined\n";
178 | res += "........................................\n";
179 |
180 | Scan scan = new Scan();
181 | scan.addFamily(FAMILY);
182 | ResultScanner resultScanner = this.lambdaTable.getScanner(scan);
183 |
184 | for (Result result : resultScanner) {
185 | if (result.isEmpty())
186 | continue;
187 | String key = Bytes.toString(result.getRow());
188 | boolean hasBulk = false;
189 | boolean hasRealtime = false;
190 |
191 | Cell bulkCell = getCell(key, BULK_COLUMN);
192 | if (bulkCell != null)
193 | hasBulk = true;
194 |
195 | Cell realtimeCell = getCell(key, REALTIME_COLUMN);
196 | if (realtimeCell != null)
197 | hasRealtime = true;
198 |
199 | res += key;
200 | res += '\t';
201 | if (hasBulk) {
202 | res += getValueFromCell(bulkCell);
203 | } else {
204 | res += "None";
205 | }
206 | res += '\t';
207 | if (hasRealtime) {
208 | res += getValueFromCell(realtimeCell);
209 | } else {
210 | res += "None";
211 | }
212 | res += "\t\t";
213 |
214 | String combinedResult = combiner(key);
215 | res += combinedResult;
216 | res += '\n';
217 | }
218 | return res;
219 | }
220 | }
221 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Apache License
2 |
3 | Version 2.0, January 2004
4 |
5 | http://www.apache.org/licenses/
6 |
7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
8 |
9 | 1. Definitions.
10 |
11 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
16 |
17 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
18 |
19 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
20 |
21 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
22 |
23 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
24 |
25 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
26 |
27 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
28 |
29 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
30 |
31 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
32 |
33 | 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
34 |
35 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
36 |
37 | You must give any other recipients of the Work or Derivative Works a copy of this License; and
38 | You must cause any modified files to carry prominent notices stating that You changed the files; and
39 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
40 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
41 |
42 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
43 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
44 |
45 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
46 |
47 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
48 |
49 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
50 |
51 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
52 |
53 | END OF TERMS AND CONDITIONS
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Simplified Lambda Example
2 |
3 | This repository is a working example of Flickr's Simplified Lambda Architecture. The architecture is used to fuse bulk and realtime data for Flickr's Magic View at a gigantic scale using HBase.
4 |
5 | ## Introduction
6 | Flickr’s Magic View takes the hassle out of organizing your own photos by applying our cutting-edge, computer-vision technology to automatically categorize photos in your photostream and present them in a seamless view based on the content in the photos. This all happens in real-time - as soon as a photo is uploaded, it is categorized and placed into the Magic View.
7 |
8 | 
9 |
10 | ### The Challenge
11 | Our computational footprint made it easy to create per-user Magic View categories for over 12 billion images on Flickr; however, we also needed to combine this with updating the categories with the tens of millions of tags generated from photos as they are uploaded in real-time. Ideally, the system has to allow us to efficiently but separately manage the bulk and real-time data, computing the final state only when it is requested. We turned to Yahoo’s Hadoop stack to find a way to build this at the massive scale we needed.
12 |
13 | ### Our Solution
14 | Powered by Apache HBase, we developed a new scheme to fuse results from bulk and real-time aggregations. Using a single table in HBase, we are able to independently update and manage the bulk and real-time data in the system while always being able to provide a consistent, correct result.
15 |
16 | We believe that this solution is a novel simplification of what is sometimes known as Lambda Architecture. We improve it by simplifying some of its complicated components, making maintenance and development easier.
17 |
18 | ## Lambda Architecture
19 | ### Existing approach
20 | We’ll start with Nathan Marz’s book, ‘Big Data’, which proposes the database concept of ‘Lambda Architecture’. In his analysis, he states that a database query can be represented as a function - Query - which operates on all the data:
21 |
22 | ```
23 | result = Query(data)
24 | ```
25 | The core of the Lambda architecture allows for separately maintained real-time and bulk databases. Minimizing the number of sacrifices needed to be made but maintaining the goal of operating on all available data, the equation is now expressed as:
26 |
27 | ```
28 | result = Combiner(Query(real-time data) + Query(bulk data))
29 | ```
30 |
31 | 
32 |
33 | This equation is shown graphically in the figure above. The real-time and bulk compute subsystems write to independent databases, which could be totally different systems. When dealing with a high volume of realtime data, the operational advantage here can be significant - there’s no need to have the expense of combining it with bulk data every time an event comes in.
34 |
35 | Concerns around this approach center on the complicated nature of the Combiner function - there is the developer and systems cost from the need to maintain two separate databases, the differing latencies of querying both sides and then the mechanics of merging the result.
36 |
37 | ### Our Approach
38 | We addressed the complications of the Combiner by instead using a single database to store the real-time and bulk data. A Combiner is still required to compute a final result:
39 |
40 | ```
41 | result = Combiner(Query(data))
42 | ```
43 |
44 | 
45 |
46 | How was this achieved? We implement our simplified Lambda architecture in HBase by giving each row two sets of columns - real-time and bulk - which are managed independently by the real-time subsystem (Storm and Java) and the bulk compute subsystem (Pig Latin and Oozie). It’s worth noting that FiloDB takes a similar approach - but since we only require the latest version of the data, our implementation is simpler.
47 |
48 | The combiner stage is abstracted into a single Java process running on its own hardware which computes on the data in HBase and pushes the photostream tag aggregations to a cache for serving.
49 |
50 | ### The Combiner and Cleaner
51 | When reading a single row of data from HBase, we need to combine the data from the real-time and the bulk columns. If only the bulk or real-time data exists, then selecting the data is obvious. If both bulk and realtime data exist, we always pick real-time. This seems reasonable, but causes a subtle problem.
52 |
53 | Let’s say a photo’s computer vision tags are added via real-time compute - there is no bulk data. Later on, we recompute all available photos using a new version of the computer vision tagging, and load this data (including this photo) via a bulk load. Even though the newer data exists in the bulk column, we can’t get to it because the combiner will only read the real-time column. We solve this by running the Cleaner process on all the data in HBase after we do a bulk load.
54 |
55 | The Cleaner simply visits each row and sees if the HBase timestamp for the real-time data is older than the bulk load. If it is, then we delete the real-time data for that row since it’s already captured in the bulk columns. This way the results of the bulk compute aren’t ‘published’ until the cleaner has run.
56 |
57 | ## Acknowledgements
58 | Thanks to the entire Flickr Magic View team for helping out and to Nathan Marz for kindly reviewing this work.
59 |
60 | ## Running the example
61 |
62 | Once you've checked out the repository, you can run the demo using:
63 | ```
64 | mvn package; mvn exec:java -Dexec.mainClass="flickr.SimplifiedLambdaDemo.SimplifiedLambdaDemo"
65 | ```
66 | To run the tests, simply run:
67 | ```
68 | mvn package; mvn test
69 | ```
70 |
71 | ## Worked Example
72 | Let's walk through a specific example that is used in the demo.
73 |
74 | ### Initial table
75 | Let's set up an in-memory simulation of HBase using MockHTable:
76 |
77 | ```
78 | MockHTable lambdaTable = new MockHTable("lambdaTable");
79 | lambdaTable.addColumnFamily(new String(SimplifiedLambda.FAMILY));
80 | ```
81 |
82 | Then we'll pass it to SimplifiedLambda - our central class which implements our Simplified Lambda architecture - to manage.
83 | ```
84 | SimplifiedLambda simplifiedLambda = new SimplifiedLambda(lambdaTable);
85 | ```
86 |
87 | Let's dump the initial state of the table:
88 | ```
89 | System.out.println("\nInitial table:");
90 | System.out.print(simplifiedLambda.dumpTable());
91 | ```
92 |
93 | Output:
94 | ```
95 | key bulk realtime combined
96 | ........................................
97 | ```
98 |
99 | The columns here are:
100 | - key - the row key
101 | - bulk - the value in the bulk column
102 | - realtime - the value in the realtime column
103 | - combined - the final value that SimplifiedLambda will choose between realtime or bulk
104 |
105 | ### Push bulk rows
106 | Let's push some data to the table via the bulk load mechanism. We have to call the cleaner after every time we run the bulk load to keep the system in a consistent state:
107 |
108 | ```
109 | simplifiedLambda.pushBulkEntry("rowA", "bulk");
110 | simplifiedLambda.pushBulkEntry("rowB", "bulk");
111 | simplifiedLambda.pushBulkEntry("rowC", "bulk");
112 | simplifiedLambda.cleaner();
113 | ```
114 |
115 | The table now looks like:
116 | ```
117 | key bulk realtime combined
118 | ........................................
119 | rowA bulk None bulk
120 | rowB bulk None bulk
121 | rowC bulk None bulk
122 | ```
123 |
124 | There's only bulk data available so the combiner only chooses data from the bulk column.
125 |
126 | ### Override rows A,B with realtime
127 |
128 | Now let's take rowA and rowB and push realtime data to them:
129 |
130 | ```
131 | simplifiedLambda.pushRealtimeEntry("rowA", "rtOvrd");
132 | simplifiedLambda.pushRealtimeEntry("rowB", "rtOvrd");
133 | ```
134 |
135 | The table now looks like:
136 |
137 | ```
138 | key bulk realtime combined
139 | ........................................
140 | rowA bulk rtOvrd rtOvrd
141 | rowB bulk rtOvrd rtOvrd
142 | rowC bulk None bulk
143 | ```
144 | Here for rowA and rowB, there is now realtime data available so the combiner will automatically choose it.
145 |
146 | ### Clean table
147 |
148 | For rowA and rowB, we can take the realtime data and 'publish' it to the bulk columns since it overrides the bulk columns:
149 |
150 | ```
151 | simplifiedLambda.cleaner();
152 | ```
153 |
154 | Now the table state is:
155 |
156 | ```
157 | key bulk realtime combined
158 | ........................................
159 | rowA rtOvrd None rtOvrd
160 | rowB rtOvrd None rtOvrd
161 | rowC bulk None bulk
162 | ```
163 | This isn't actually necessary after a realtime update - it's an illustration of how the cleaner works in this scenario.
164 |
165 | ### Override rows B,C with realtime
166 |
167 | Let's do another realtime override, now on rowB and rowC:
168 |
169 | ```
170 | simplifiedLambda.pushRealtimeEntry("rowB", "rtOvrd2");
171 | simplifiedLambda.pushRealtimeEntry("rowC", "rtOvrd2");
172 | ```
173 |
174 | Now the table looks like:
175 |
176 | ```
177 | key bulk realtime combined
178 | ........................................
179 | rowA rtOvrd None rtOvrd
180 | rowB rtOvrd rtOvrd2 rtOvrd2
181 | rowC bulk rtOvrd2 rtOvrd2
182 | ```
183 |
184 | Again: the realtime columns override the bulk data.
185 |
186 | ### Override rows with bulk:
187 |
188 | Lastly, let's push some bulk data after the realtime updates - to rowA and rowB. Since the bulk data is newer than the realtime updates for these rows, the bulk data takes precedence over the realtime data that is there.
189 |
190 | ```
191 | simplifiedLambda.pushBulkEntry("rowA", "bulkOvr");
192 | simplifiedLambda.pushBulkEntry("rowB", "bulkOvr");
193 | simplifiedLambda.cleaner();
194 | ```
195 |
196 | The result:
197 | ```
198 | key bulk realtime combined
199 | ........................................
200 | rowA bulkOvr None bulkOvr
201 | rowB bulkOvr None bulkOvr
202 | rowC rtOvrd2 None rtOvrd2
203 | ```
204 | # Copyright and License
205 | Copyright 2015 Yahoo Inc.
206 | Licensed under the terms of the Apache 2.0 License
207 | See LICENSE.txt file in the project root folder for License terms.
208 |
--------------------------------------------------------------------------------
/src/main/java/utility/MockHTable.java:
--------------------------------------------------------------------------------
1 | package utility;
2 |
3 | /**
4 | * This file is licensed to you under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance with the
6 | * License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | */
14 |
15 | import com.google.protobuf.Descriptors;
16 | import com.google.protobuf.Message;
17 | import com.google.protobuf.Service;
18 | import com.google.protobuf.ServiceException;
19 | import org.apache.hadoop.conf.Configuration;
20 | import org.apache.hadoop.hbase.HColumnDescriptor;
21 | import org.apache.hadoop.hbase.HTableDescriptor;
22 | import org.apache.hadoop.hbase.KeyValue;
23 | import org.apache.hadoop.hbase.TableName;
24 | import org.apache.hadoop.hbase.client.*;
25 | import org.apache.hadoop.hbase.client.coprocessor.Batch;
26 | import org.apache.hadoop.hbase.filter.Filter;
27 | import org.apache.hadoop.hbase.ipc.CoprocessorRpcChannel;
28 | import org.apache.hadoop.hbase.util.Bytes;
29 | import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
30 |
31 | import java.io.IOException;
32 | import java.util.*;
33 | import java.util.logging.Logger;
34 |
35 | /**
36 | * utility.MockHTable.
37 | *
38 | * original utility.MockHTable (by agaoglu) : https://gist.github.com/agaoglu/613217#file_mock_h_table.java
39 | * upgraded by bhautikj 20150327
40 | * upgraded by bhautikj 20150901
41 | *
42 | * Modifications
43 | *
44 | *