├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── .gitignore ├── mapreduce ├── build.gradle ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── aerospike │ └── hadoop │ └── mapreduce │ ├── AerospikeConfigEnum.java │ ├── AerospikeClientSingleton.java │ ├── AerospikeLogger.java │ ├── AerospikeRecordWriter.java │ ├── AerospikeRecord.java │ ├── AerospikeKey.java │ ├── AerospikeSplit.java │ ├── AerospikeOutputFormat.java │ ├── AerospikeInputFormat.java │ ├── AerospikeConfigUtil.java │ └── AerospikeRecordReader.java ├── settings.gradle ├── examples ├── external_join │ ├── build.gradle │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ ├── log4j.properties │ │ │ └── commons-logging.properties │ │ │ └── java │ │ │ └── com │ │ │ └── aerospike │ │ │ └── hadoop │ │ │ └── examples │ │ │ └── externaljoin │ │ │ └── ExternalJoin.java │ └── pom.xml ├── session_rollup │ ├── build.gradle │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ ├── log4j.properties │ │ │ └── commons-logging.properties │ │ │ └── java │ │ │ └── com │ │ │ └── aerospike │ │ │ └── hadoop │ │ │ └── examples │ │ │ └── sessionrollup │ │ │ └── SessionRollup.java │ └── pom.xml ├── word_count_input │ ├── build.gradle │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ ├── log4j.properties │ │ │ └── commons-logging.properties │ │ │ └── java │ │ │ └── com │ │ │ └── aerospike │ │ │ └── hadoop │ │ │ └── examples │ │ │ └── wordcountinput │ │ │ └── WordCountInput.java │ └── pom.xml ├── aggregate_int_input │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ ├── log4j.properties │ │ │ └── commons-logging.properties │ │ │ └── java │ │ │ └── com │ │ │ └── aerospike │ │ │ └── hadoop │ │ │ └── examples │ │ │ └── aggregateintinput │ │ │ └── AggregateIntInput.java │ ├── build.gradle │ └── pom.xml ├── generate_profiles │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ ├── log4j.properties │ │ │ └── commons-logging.properties │ │ │ └── java │ │ │ └── com │ │ │ └── aerospike │ │ │ └── hadoop │ │ │ └── 
examples │ │ │ └── generateprofiles │ │ │ └── GenerateProfiles.java │ ├── build.gradle │ └── pom.xml ├── word_count_output │ ├── build.gradle │ ├── src │ │ └── main │ │ │ ├── resources │ │ │ ├── log4j.properties │ │ │ └── commons-logging.properties │ │ │ └── java │ │ │ └── com │ │ │ └── aerospike │ │ │ └── hadoop │ │ │ └── examples │ │ │ └── wordcountoutput │ │ │ └── WordCountOutput.java │ └── pom.xml ├── build.gradle ├── spark_session_rollup │ ├── build.gradle │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── aerospike │ │ └── spark │ │ └── examples │ │ └── SparkSessionRollup.java └── pom.xml ├── sampledata ├── src │ └── main │ │ ├── resources │ │ ├── log4j.properties │ │ └── commons-logging.properties │ │ └── java │ │ └── com │ │ └── aerospike │ │ └── hadoop │ │ └── sampledata │ │ └── SampleData.java ├── build.gradle └── pom.xml ├── TODO.md ├── pom.xml ├── gradlew.bat ├── gradlew ├── WORLDCUP_FILELIST ├── README.md └── LICENSE /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aerospike-community/aerospike-hadoop/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .gradle 2 | bin 3 | build 4 | target 5 | .settings 6 | .classpath 7 | .project 8 | *.iml 9 | *.ipr 10 | *.iws 11 | *.log 12 | metastore_db 13 | .idea 14 | 15 | # Ignore Gradle GUI config 16 | gradle-app.setting 17 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Wed Feb 12 07:28:02 CST 2014 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | 
distributionUrl=http\://services.gradle.org/distributions/gradle-1.11-bin.zip 7 | -------------------------------------------------------------------------------- /mapreduce/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | apply plugin: 'application' 3 | 4 | dependencies { 5 | compile "com.aerospike:aerospike-client:3.3.0" 6 | compile "org.apache.hadoop:hadoop-common:2.7.2" 7 | compile "org.apache.hadoop:hadoop-mapreduce-client-jobclient:2.7.2" 8 | } 9 | 10 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | 2 | include ':mapreduce' 3 | include ':sampledata' 4 | include ':examples:word_count_input' 5 | include ':examples:aggregate_int_input' 6 | include ':examples:word_count_output' 7 | include ':examples:session_rollup' 8 | include ':examples:generate_profiles' 9 | include ':examples:external_join' 10 | include ':examples:spark_session_rollup' 11 | -------------------------------------------------------------------------------- /examples/external_join/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | apply plugin: 'application' 3 | 4 | mainClassName = 'com.aerospike.hadoop.examples.externaljoin.ExternalJoin' 5 | 6 | jar { 7 | manifest { 8 | attributes 'Main-Class': 'com.aerospike.hadoop.examples.externaljoin.ExternalJoin' 9 | } 10 | from configurations.compile.collect { it.isDirectory() ? 
it : zipTree(it) } 11 | } 12 | -------------------------------------------------------------------------------- /sampledata/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | General 2 | ---------------------------------------------------------------- 3 | 4 | * Switch all build to maven? 5 | 6 | * Simple torture testing. 7 | 8 | * Sunil's object all the way through example. 9 | 10 | * Add docs to www.aerospike.com website. 11 | 12 | * Benchmark vs HDFS. 13 | 14 | ---------------- 15 | 16 | * LDT support. 17 | 18 | * Hive example. 19 | 20 | * Another example. 21 | -------------------------------------------------------------------------------- /examples/session_rollup/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | apply plugin: 'application' 3 | 4 | mainClassName = 'com.aerospike.hadoop.examples.sessionrollup.SessionRollup' 5 | 6 | jar { 7 | manifest { 8 | attributes 'Main-Class': 'com.aerospike.hadoop.examples.sessionrollup.SessionRollup' 9 | } 10 | from configurations.compile.collect { it.isDirectory() ? 
it : zipTree(it) } 11 | } 12 | -------------------------------------------------------------------------------- /examples/external_join/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /examples/session_rollup/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /examples/word_count_input/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | apply plugin: 'application' 3 | 4 | mainClassName = 'com.aerospike.hadoop.examples.wordcountinput.WordCountInput' 5 | 6 | jar { 7 | manifest { 8 | attributes 'Main-Class': 'com.aerospike.hadoop.examples.wordcountinput.WordCountInput' 9 | } 10 | from configurations.compile.collect { it.isDirectory() ? 
it : zipTree(it) } 11 | } 12 | -------------------------------------------------------------------------------- /examples/aggregate_int_input/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /examples/generate_profiles/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /examples/word_count_input/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /examples/word_count_output/build.gradle: 
-------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | apply plugin: 'application' 3 | 4 | mainClassName = 'com.aerospike.hadoop.examples.wordcountoutput.WordCountOutput' 5 | 6 | jar { 7 | manifest { 8 | attributes 'Main-Class': 'com.aerospike.hadoop.examples.wordcountoutput.WordCountOutput' 9 | } 10 | from configurations.compile.collect { it.isDirectory() ? it : zipTree(it) } 11 | } 12 | -------------------------------------------------------------------------------- /examples/word_count_output/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n 9 | -------------------------------------------------------------------------------- /examples/generate_profiles/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | apply plugin: 'application' 3 | 4 | mainClassName = 'com.aerospike.hadoop.examples.generateprofiles.GenerateProfiles' 5 | 6 | jar { 7 | manifest { 8 | attributes 'Main-Class': 'com.aerospike.hadoop.examples.generateprofiles.GenerateProfiles' 9 | } 10 | from configurations.compile.collect { it.isDirectory() ? 
it : zipTree(it) } 11 | } 12 | -------------------------------------------------------------------------------- /examples/aggregate_int_input/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | apply plugin: 'application' 3 | 4 | mainClassName = 'com.aerospike.hadoop.examples.aggregateintinput.AggregateIntInput' 5 | 6 | jar { 7 | manifest { 8 | attributes 'Main-Class': 'com.aerospike.hadoop.examples.aggregateintinput.AggregateIntInput' 9 | } 10 | from configurations.compile.collect { it.isDirectory() ? it : zipTree(it) } 11 | } 12 | -------------------------------------------------------------------------------- /examples/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin:'java' 2 | 3 | subprojects{ 4 | apply plugin:'java' 5 | 6 | dependencies { 7 | compile project(':mapreduce') 8 | compile "com.aerospike:aerospike-client:3.3.0" 9 | compile "org.apache.hadoop:hadoop-common:2.7.2" 10 | compile "org.apache.hadoop:hadoop-mapreduce-client-jobclient:2.7.2" 11 | compile "joda-time:joda-time:2.5" 12 | compile "org.json:json:20140107" 13 | } 14 | } 15 | 16 | -------------------------------------------------------------------------------- /examples/spark_session_rollup/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | apply plugin: 'application' 3 | 4 | mainClassName = 'com.aerospike.spark.examples.SparkSessionRollup' 5 | 6 | repositories { 7 | mavenCentral() 8 | } 9 | 10 | dependencies { 11 | compile "org.apache.spark:spark-core_2.10:1.1.0" 12 | } 13 | 14 | jar { 15 | manifest { 16 | attributes 'Main-Class': 'com.aerospike.spark.examples.SparkSessionRollup' 17 | } 18 | from configurations.compile.collect { it.isDirectory() ? 
it : zipTree(it) } 19 | exclude 'META-INF/*.RSA', 'META-INF/*.SF','META-INF/*.DSA' 20 | } 21 | -------------------------------------------------------------------------------- /sampledata/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | apply plugin: 'application' 3 | 4 | mainClassName = 'com.aerospike.hadoop.sampledata.SampleData' 5 | 6 | repositories { 7 | mavenCentral() 8 | } 9 | 10 | dependencies { 11 | compile "com.aerospike:aerospike-client:3.3.0" 12 | compile "org.apache.hadoop:hadoop-common:2.7.2" 13 | } 14 | 15 | run { 16 | if ( project.hasProperty("appArgs") ) { 17 | args Eval.me(appArgs) 18 | } 19 | } 20 | 21 | jar { 22 | manifest { 23 | attributes 'Main-Class': 'com.aerospike.hadoop.sampledata.SampleData' 24 | } 25 | from configurations.compile.collect { it.isDirectory() ? it : zipTree(it) } 26 | } -------------------------------------------------------------------------------- /mapreduce/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | 4.0.0 7 | aerospike-mapreduce 8 | 9 | 10 | com.aerospike 11 | aerospike-hadoop-parent 12 | 1.1.0-SNAPSHOT 13 | 14 | 15 | 16 | 17 | org.apache.hadoop 18 | hadoop-mapreduce-client-jobclient 19 | 2.7.2 20 | compile 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /sampledata/src/main/resources/commons-logging.properties: -------------------------------------------------------------------------------- 1 | # commons-logging.properties 2 | # jdk handlers 3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler 4 | 5 | # default log level 6 | .level=DEBUG 7 | 8 | # Specific logger level 9 | #MyClassLogger.level=FINE 10 | 11 | # FileHandler options - can also be set to the ConsoleHandler 12 | # FileHandler level can be set to override the global level: 13 | #java.util.logging.FileHandler.level=WARN 14 | 15 | # log file name for the 
File Handler 16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log 17 | 18 | # Specify the style of output (simple or xml) 19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter 20 | 21 | # Optional - Limit the size of the file (in bytes) 22 | java.util.logging.FileHandler.limit=50000 23 | 24 | # Optional - The number of files to cycle through, by 25 | # appending an integer to the base file name: 26 | java.util.logging.FileHandler.count=1 27 | -------------------------------------------------------------------------------- /examples/external_join/src/main/resources/commons-logging.properties: -------------------------------------------------------------------------------- 1 | # commons-logging.properties 2 | # jdk handlers 3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler 4 | 5 | # default log level 6 | .level=DEBUG 7 | 8 | # Specific logger level 9 | #MyClassLogger.level=FINE 10 | 11 | # FileHandler options - can also be set to the ConsoleHandler 12 | # FileHandler level can be set to override the global level: 13 | #java.util.logging.FileHandler.level=WARN 14 | 15 | # log file name for the File Handler 16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log 17 | 18 | # Specify the style of output (simple or xml) 19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter 20 | 21 | # Optional - Limit the size of the file (in bytes) 22 | java.util.logging.FileHandler.limit=50000 23 | 24 | # Optional - The number of files to cycle through, by 25 | # appending an integer to the base file name: 26 | java.util.logging.FileHandler.count=1 27 | -------------------------------------------------------------------------------- /examples/session_rollup/src/main/resources/commons-logging.properties: -------------------------------------------------------------------------------- 1 | # commons-logging.properties 2 | # jdk handlers 3 | handlers=java.util.logging.ConsoleHandler, 
java.util.logging.FileHandler 4 | 5 | # default log level 6 | .level=DEBUG 7 | 8 | # Specific logger level 9 | #MyClassLogger.level=FINE 10 | 11 | # FileHandler options - can also be set to the ConsoleHandler 12 | # FileHandler level can be set to override the global level: 13 | #java.util.logging.FileHandler.level=WARN 14 | 15 | # log file name for the File Handler 16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log 17 | 18 | # Specify the style of output (simple or xml) 19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter 20 | 21 | # Optional - Limit the size of the file (in bytes) 22 | java.util.logging.FileHandler.limit=50000 23 | 24 | # Optional - The number of files to cycle through, by 25 | # appending an integer to the base file name: 26 | java.util.logging.FileHandler.count=1 27 | -------------------------------------------------------------------------------- /examples/aggregate_int_input/src/main/resources/commons-logging.properties: -------------------------------------------------------------------------------- 1 | # commons-logging.properties 2 | # jdk handlers 3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler 4 | 5 | # default log level 6 | .level=DEBUG 7 | 8 | # Specific logger level 9 | #MyClassLogger.level=FINE 10 | 11 | # FileHandler options - can also be set to the ConsoleHandler 12 | # FileHandler level can be set to override the global level: 13 | #java.util.logging.FileHandler.level=WARN 14 | 15 | # log file name for the File Handler 16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log 17 | 18 | # Specify the style of output (simple or xml) 19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter 20 | 21 | # Optional - Limit the size of the file (in bytes) 22 | java.util.logging.FileHandler.limit=50000 23 | 24 | # Optional - The number of files to cycle through, by 25 | # appending an integer to the base file name: 26 | 
java.util.logging.FileHandler.count=1 27 | -------------------------------------------------------------------------------- /examples/generate_profiles/src/main/resources/commons-logging.properties: -------------------------------------------------------------------------------- 1 | # commons-logging.properties 2 | # jdk handlers 3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler 4 | 5 | # default log level 6 | .level=DEBUG 7 | 8 | # Specific logger level 9 | #MyClassLogger.level=FINE 10 | 11 | # FileHandler options - can also be set to the ConsoleHandler 12 | # FileHandler level can be set to override the global level: 13 | #java.util.logging.FileHandler.level=WARN 14 | 15 | # log file name for the File Handler 16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log 17 | 18 | # Specify the style of output (simple or xml) 19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter 20 | 21 | # Optional - Limit the size of the file (in bytes) 22 | java.util.logging.FileHandler.limit=50000 23 | 24 | # Optional - The number of files to cycle through, by 25 | # appending an integer to the base file name: 26 | java.util.logging.FileHandler.count=1 27 | -------------------------------------------------------------------------------- /examples/word_count_input/src/main/resources/commons-logging.properties: -------------------------------------------------------------------------------- 1 | # commons-logging.properties 2 | # jdk handlers 3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler 4 | 5 | # default log level 6 | .level=DEBUG 7 | 8 | # Specific logger level 9 | #MyClassLogger.level=FINE 10 | 11 | # FileHandler options - can also be set to the ConsoleHandler 12 | # FileHandler level can be set to override the global level: 13 | #java.util.logging.FileHandler.level=WARN 14 | 15 | # log file name for the File Handler 16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log 17 | 18 | # 
Specify the style of output (simple or xml) 19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter 20 | 21 | # Optional - Limit the size of the file (in bytes) 22 | java.util.logging.FileHandler.limit=50000 23 | 24 | # Optional - The number of files to cycle through, by 25 | # appending an integer to the base file name: 26 | java.util.logging.FileHandler.count=1 27 | -------------------------------------------------------------------------------- /examples/word_count_output/src/main/resources/commons-logging.properties: -------------------------------------------------------------------------------- 1 | # commons-logging.properties 2 | # jdk handlers 3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler 4 | 5 | # default log level 6 | .level=DEBUG 7 | 8 | # Specific logger level 9 | #MyClassLogger.level=FINE 10 | 11 | # FileHandler options - can also be set to the ConsoleHandler 12 | # FileHandler level can be set to override the global level: 13 | #java.util.logging.FileHandler.level=WARN 14 | 15 | # log file name for the File Handler 16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log 17 | 18 | # Specify the style of output (simple or xml) 19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter 20 | 21 | # Optional - Limit the size of the file (in bytes) 22 | java.util.logging.FileHandler.limit=50000 23 | 24 | # Optional - The number of files to cycle through, by 25 | # appending an integer to the base file name: 26 | java.util.logging.FileHandler.count=1 27 | -------------------------------------------------------------------------------- /examples/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | 4.0.0 7 | aerospike-hadoop-examples 8 | pom 9 | 10 | 11 | com.aerospike 12 | aerospike-hadoop-parent 13 | 1.1.0-SNAPSHOT 14 | 15 | 16 | 17 | word_count_input 18 | aggregate_int_input 19 | word_count_output 20 | session_rollup 21 | 
generate_profiles 22 | external_join 23 | spark_session_rollup 24 | 25 | 26 | 27 | 28 | 29 | com.aerospike 30 | aerospike-mapreduce 31 | 1.1.0-SNAPSHOT 32 | compile 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeConfigEnum.java: -------------------------------------------------------------------------------- 1 | package com.aerospike.hadoop.mapreduce; 2 | 3 | public enum AerospikeConfigEnum { 4 | 5 | // ---------------- OUTPUT ---------------- 6 | 7 | INPUT_HOST("aerospike.input.host"), 8 | DEFAULT_INPUT_HOST("localhost"), 9 | INPUT_PORT("aerospike.input.port"), 10 | INPUT_NAMESPACE("aerospike.input.namespace"), 11 | INPUT_SETNAME("aerospike.input.setname"), 12 | INPUT_BINNAMES("aerospike.input.binnames"), 13 | DEFAULT_INPUT_BINNAMES(""), 14 | INPUT_OPERATION("aerospike.input.operation"), 15 | DEFAULT_INPUT_OPERATION("scan"), 16 | INPUT_SCAN_PERCENT("aerospike.input.scan.percent"), 17 | INPUT_NUMRANGE_BIN("aerospike.input.numrange.bin"), 18 | INPUT_NUMRANGE_BEGIN("aerospike.input.numrange.begin"), 19 | INPUT_NUMRANGE_END("aerospike.input.numrange.end"), 20 | 21 | // ---------------- OUTPUT ---------------- 22 | 23 | OUTPUT_HOST("aerospike.output.host"), 24 | DEFAULT_OUTPUT_HOST("localhost"), 25 | OUTPUT_PORT("aerospike.output.port"), 26 | OUTPUT_NAMESPACE("aerospike.output.namespace"), 27 | OUTPUT_SETNAME("aerospike.output.setname"), 28 | OUTPUT_BINNAME("aerospike.output.binname"), 29 | OUTPUT_KEYNAME("aerospike.output.keyname"); 30 | 31 | public final String value; 32 | 33 | private AerospikeConfigEnum(String v){ 34 | value = v; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /sampledata/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | sampledata 6 | 7 | com.aerospike 8 | aerospike-hadoop-examples 9 | 1.1.0-SNAPSHOT 10 | 11 
| 12 | 13 | build/libs 14 | ${project.artifactId}-notfull 15 | 16 | 17 | maven-assembly-plugin 18 | 19 | 20 | 21 | com.aerospike.hadoop.sampledata.SampleData 22 | 23 | 24 | 25 | jar-with-dependencies 26 | 27 | ${project.artifactId} 28 | false 29 | 30 | 31 | 32 | make-assembly 33 | package 34 | 35 | single 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeClientSingleton.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Aerospike, Inc. 3 | * 4 | * Portions may be licensed to Aerospike, Inc. under one or more 5 | * contributor license agreements. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. You 9 | * may obtain a copy of the License at 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | * implied. See the License for the specific language governing 16 | * permissions and limitations under the License. 
17 | */ 18 | 19 | package com.aerospike.hadoop.mapreduce; 20 | 21 | import com.aerospike.client.AerospikeClient; 22 | import com.aerospike.client.policy.ClientPolicy; 23 | 24 | public final class AerospikeClientSingleton { 25 | 26 | private static volatile AerospikeClient instance = null; 27 | 28 | public static AerospikeClient getInstance(ClientPolicy policy, 29 | String host, 30 | int port) { 31 | if (instance == null) { 32 | synchronized (AerospikeClientSingleton.class) { 33 | if (instance == null) { 34 | instance = new AerospikeClient(host, port); 35 | } 36 | } 37 | } 38 | return instance; 39 | } 40 | } 41 | 42 | // Local Variables: 43 | // mode: java 44 | // c-basic-offset: 4 45 | // tab-width: 4 46 | // indent-tabs-mode: nil 47 | // End: 48 | // vim: softtabstop=4:shiftwidth=4:expandtab 49 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeLogger.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Aerospike, Inc. 3 | * 4 | * Portions may be licensed to Aerospike, Inc. under one or more 5 | * contributor license agreements. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. You 9 | * may obtain a copy of the License at 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | * implied. See the License for the specific language governing 16 | * permissions and limitations under the License. 
17 | */ 18 | 19 | package com.aerospike.hadoop.mapreduce; 20 | 21 | import org.apache.commons.logging.Log; 22 | import org.apache.commons.logging.LogFactory; 23 | 24 | import com.aerospike.client.Log.Level; 25 | 26 | public class AerospikeLogger implements com.aerospike.client.Log.Callback { 27 | 28 | private static final Log log = LogFactory.getLog(AerospikeLogger.class); 29 | 30 | public void log(Level level, String message) { 31 | switch (level) { 32 | case ERROR: 33 | log.error(message); 34 | break; 35 | case WARN: 36 | log.warn(message); 37 | break; 38 | case INFO: 39 | log.info(message); 40 | break; 41 | case DEBUG: 42 | log.debug(message); 43 | break; 44 | } 45 | } 46 | } 47 | 48 | // Local Variables: 49 | // mode: java 50 | // c-basic-offset: 4 51 | // tab-width: 4 52 | // indent-tabs-mode: nil 53 | // End: 54 | // vim: softtabstop=4:shiftwidth=4:expandtab 55 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | com.aerospike 6 | aerospike-hadoop-parent 7 | aerospike-hadoop-parent 8 | 1.1.0-SNAPSHOT 9 | pom 10 | 11 | 12 | Aerospike Inc. 
13 | http://www.aerospike.com 14 | 15 | 16 | 17 | mapreduce 18 | sampledata 19 | examples 20 | 21 | 22 | 23 | UTF-8 24 | 3.8.1 25 | 4.2.2 26 | 2.7.2 27 | 28 | 29 | 30 | 31 | org.apache.hadoop 32 | hadoop-common 33 | ${hadoop.version} 34 | compile 35 | 36 | 37 | org.apache.hadoop 38 | hadoop-client 39 | ${hadoop.version} 40 | compile 41 | 42 | 43 | com.aerospike 44 | aerospike-client 45 | ${aerospike.client.version} 46 | compile 47 | 48 | 49 | 50 | junit 51 | junit 52 | ${junit.version} 53 | test 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /examples/generate_profiles/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | generate_profiles 6 | 7 | com.aerospike 8 | aerospike-hadoop-examples 9 | 1.1.0-SNAPSHOT 10 | 11 | 12 | 13 | org.json 14 | json 15 | 20140107 16 | compile 17 | 18 | 19 | org.apache.hadoop 20 | hadoop-mapreduce-client-jobclient 21 | 2.7.2 22 | compile 23 | 24 | 25 | com.aerospike 26 | aerospike-mapreduce 27 | 28 | 29 | joda-time 30 | joda-time 31 | 2.5 32 | compile 33 | 34 | 35 | 36 | build/libs 37 | ${project.artifactId}-notfull 38 | 39 | 40 | maven-assembly-plugin 41 | 42 | 43 | 44 | com.aerospike.hadoop.examples.generateprofiles.GenerateProfiles 45 | 46 | 47 | 48 | jar-with-dependencies 49 | 50 | ${project.artifactId} 51 | false 52 | 53 | 54 | 55 | make-assembly 56 | package 57 | 58 | single 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /examples/session_rollup/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | session_rollup 6 | 7 | 8 | com.aerospike 9 | aerospike-hadoop-examples 10 | 1.1.0-SNAPSHOT 11 | 12 | 13 | 14 | 15 | org.json 16 | json 17 | 20140107 18 | compile 19 | 20 | 21 | org.apache.hadoop 22 | hadoop-mapreduce-client-jobclient 23 | 2.7.2 24 | compile 25 | 26 | 27 | com.aerospike 28 | 
aerospike-mapreduce 29 | 30 | 31 | joda-time 32 | joda-time 33 | 2.5 34 | compile 35 | 36 | 37 | 38 | build/libs 39 | ${project.artifactId}-notfull 40 | 41 | 42 | maven-assembly-plugin 43 | 44 | 45 | 46 | com.aerospike.hadoop.examples.sessionrollup.SessionRollup 47 | 48 | 49 | 50 | jar-with-dependencies 51 | 52 | ${project.artifactId} 53 | false 54 | 55 | 56 | 57 | make-assembly 58 | package 59 | 60 | single 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /examples/word_count_input/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | word_count_input 6 | 7 | com.aerospike 8 | aerospike-hadoop-examples 9 | 1.1.0-SNAPSHOT 10 | 11 | 12 | 13 | 14 | org.json 15 | json 16 | 20140107 17 | compile 18 | 19 | 20 | org.apache.hadoop 21 | hadoop-mapreduce-client-jobclient 22 | 2.7.2 23 | compile 24 | 25 | 26 | com.aerospike 27 | aerospike-mapreduce 28 | 29 | 30 | joda-time 31 | joda-time 32 | 2.5 33 | compile 34 | 35 | 36 | 37 | build/libs 38 | ${project.artifactId}-notfull 39 | 40 | 41 | maven-assembly-plugin 42 | 43 | 44 | 45 | com.aerospike.hadoop.examples.wordcountinput.WordCountInput 46 | 47 | 48 | 49 | jar-with-dependencies 50 | 51 | ${project.artifactId} 52 | false 53 | 54 | 55 | 56 | make-assembly 57 | package 58 | 59 | single 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /examples/word_count_output/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | word_count_output 6 | 7 | com.aerospike 8 | aerospike-hadoop-examples 9 | 1.1.0-SNAPSHOT 10 | 11 | 12 | 13 | 14 | org.json 15 | json 16 | 20140107 17 | compile 18 | 19 | 20 | org.apache.hadoop 21 | hadoop-mapreduce-client-jobclient 22 | 2.7.2 23 | compile 24 | 25 | 26 | com.aerospike 27 | aerospike-mapreduce 28 | 29 | 30 | joda-time 31 | joda-time 32 | 2.5 33 | 
compile 34 | 35 | 36 | 37 | build/libs 38 | ${project.artifactId}-notfull 39 | 40 | 41 | maven-assembly-plugin 42 | 43 | 44 | 45 | com.aerospike.hadoop.examples.wordcountoutput.WordCountOutput 46 | 47 | 48 | 49 | jar-with-dependencies 50 | 51 | ${project.artifactId} 52 | false 53 | 54 | 55 | 56 | make-assembly 57 | package 58 | 59 | single 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /examples/aggregate_int_input/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | aggregate_int_input 6 | 7 | 8 | com.aerospike 9 | aerospike-hadoop-examples 10 | 1.1.0-SNAPSHOT 11 | 12 | 13 | 14 | 15 | com.aerospike 16 | aerospike-mapreduce 17 | 18 | 19 | org.json 20 | json 21 | 20140107 22 | compile 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-mapreduce-client-jobclient 27 | 2.2.0 28 | compile 29 | 30 | 31 | joda-time 32 | joda-time 33 | 2.5 34 | compile 35 | 36 | 37 | 38 | build/libs 39 | ${project.artifactId}-notfull 40 | 41 | 42 | maven-assembly-plugin 43 | 44 | 45 | 46 | com.aerospike.hadoop.examples.aggregateintinput.AggregateIntInput 47 | 48 | 49 | 50 | jar-with-dependencies 51 | 52 | ${project.artifactId} 53 | false 54 | 55 | 56 | 57 | make-assembly 58 | package 59 | 60 | single 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /examples/external_join/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | com.aerospike 7 | aerospike-hadoop-examples 8 | 1.1.0-SNAPSHOT 9 | 10 | external_join 11 | 1.1.0-SNAPSHOT 12 | 13 | 14 | org.json 15 | json 16 | 20140107 17 | compile 18 | 19 | 20 | org.apache.hadoop 21 | hadoop-mapreduce-client-jobclient 22 | ${hadoop.version} 23 | compile 24 | 25 | 26 | com.aerospike 27 | aerospike-mapreduce 28 | 29 | 30 | joda-time 31 | joda-time 32 | 2.5 33 | compile 34 | 35 | 36 | 37 | 
build/libs 38 | ${project.artifactId}-notfull 39 | 40 | 41 | maven-assembly-plugin 42 | 43 | 44 | 45 | com.aerospike.hadoop.examples.externaljoin.ExternalJoin 46 | 47 | 48 | 49 | jar-with-dependencies 50 | 51 | ${project.artifactId} 52 | false 53 | 54 | 55 | 56 | make-assembly 57 | package 58 | 59 | single 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 12 | set DEFAULT_JVM_OPTS= 13 | 14 | set DIRNAME=%~dp0 15 | if "%DIRNAME%" == "" set DIRNAME=. 16 | set APP_BASE_NAME=%~n0 17 | set APP_HOME=%DIRNAME% 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 
45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windowz variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | if "%@eval[2+2]" == "4" goto 4NT_args 53 | 54 | :win9xME_args 55 | @rem Slurp the command line arguments. 56 | set CMD_LINE_ARGS= 57 | set _SKIP=2 58 | 59 | :win9xME_args_slurp 60 | if "x%~1" == "x" goto execute 61 | 62 | set CMD_LINE_ARGS=%* 63 | goto execute 64 | 65 | :4NT_args 66 | @rem Get arguments from the 4NT Shell from JP Software 67 | set CMD_LINE_ARGS=%$ 68 | 69 | :execute 70 | @rem Setup the command line 71 | 72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 73 | 74 | @rem Execute Gradle 75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 76 | 77 | :end 78 | @rem End local scope for the variables with windows NT shell 79 | if "%ERRORLEVEL%"=="0" goto mainEnd 80 | 81 | :fail 82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 83 | rem the _cmd.exe /c_ return code! 84 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 85 | exit /b 1 86 | 87 | :mainEnd 88 | if "%OS%"=="Windows_NT" endlocal 89 | 90 | :omega 91 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeRecordWriter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Aerospike, Inc. 3 | * 4 | * Portions may be licensed to Aerospike, Inc. under one or more 5 | * contributor license agreements. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. 
/*
 * Copyright 2014 Aerospike, Inc.
 *
 * Portions may be licensed to Aerospike, Inc. under one or more
 * contributor license agreements.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.aerospike.hadoop.mapreduce;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.Progressable;

import com.aerospike.client.AerospikeClient;
import com.aerospike.client.policy.ClientPolicy;
import com.aerospike.client.policy.WritePolicy;

/**
 * Base {@link RecordWriter} that sends job output to an Aerospike
 * cluster.  Implements both the mapreduce and the legacy mapred writer
 * interfaces.  Subclasses implement {@link #writeAerospike} to turn one
 * key/value pair into an Aerospike put.
 *
 * <p>Connection settings (host, port, namespace, set) are read lazily
 * from the job {@link Configuration} on the first {@link #write} call.</p>
 *
 * @param <KK> map/reduce output key type
 * @param <VV> map/reduce output value type
 */
public abstract class AerospikeRecordWriter<KK, VV>
    extends RecordWriter<KK, VV>
    implements org.apache.hadoop.mapred.RecordWriter<KK, VV> {

    private static final Log log =
        LogFactory.getLog(AerospikeRecordWriter.class);

    protected final Configuration cfg;
    protected boolean initialized = false;

    // Connection state is deliberately per-instance.  These fields were
    // previously static, which let concurrent writers configured with
    // different output namespaces/sets clobber one another in the same JVM.
    private String namespace;
    private String setName;
    private AerospikeClient client;
    private WritePolicy writePolicy;

    public AerospikeRecordWriter(Configuration cfg) {
        this.cfg = cfg;
    }

    /**
     * Persists a single key/value pair to Aerospike.
     *
     * @param key         output key
     * @param value       output value
     * @param client      connected client (process-wide singleton)
     * @param writePolicy policy to apply to puts
     * @param namespace   destination namespace
     * @param setName     destination set
     * @throws IOException if the write fails
     */
    public abstract void writeAerospike(KK key,
                                        VV value,
                                        AerospikeClient client,
                                        WritePolicy writePolicy,
                                        String namespace,
                                        String setName) throws IOException;

    @Override
    public void write(KK key, VV value) throws IOException {
        if (!initialized) {
            // Flag success only after init() returns; the previous order
            // (flag first) left a null client permanently if init() threw.
            init();
            initialized = true;
        }

        writeAerospike(key, value, client, writePolicy, namespace, setName);
    }

    /** Reads output settings from the configuration and connects the client. */
    protected void init() throws IOException {

        String host = AerospikeConfigUtil.getOutputHost(cfg);
        int port = AerospikeConfigUtil.getOutputPort(cfg);

        namespace = AerospikeConfigUtil.getOutputNamespace(cfg);
        setName = AerospikeConfigUtil.getOutputSetName(cfg);

        log.info(String.format("init: %s %d %s %s",
                               host, port, namespace, setName));

        ClientPolicy policy = new ClientPolicy();
        policy.user = "";
        policy.password = "";
        policy.failIfNotConnected = true;

        // The underlying connection is shared process-wide via the singleton;
        // only the reference (plus namespace/set) is held per writer.
        client = AerospikeClientSingleton.getInstance(policy, host, port);

        writePolicy = new WritePolicy();
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException {
        doClose(context);
    }

    /** Legacy mapred close entry point; delegates to {@link #doClose}. */
    public void close(org.apache.hadoop.mapred.Reporter reporter
                      ) throws IOException {
        doClose(reporter);
    }

    protected void doClose(Progressable progressable) {
        log.info("doClose");
        initialized = false;
    }
}

// Local Variables:
// mode: java
// c-basic-offset: 4
// tab-width: 4
// indent-tabs-mode: nil
// End:
// vim: softtabstop=4:shiftwidth=4:expandtab
/*
 * Copyright 2014 Aerospike, Inc.
 *
 * Portions may be licensed to Aerospike, Inc. under one or more
 * contributor license agreements.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.aerospike.hadoop.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.Writable;

import com.aerospike.client.Record;
import com.aerospike.client.util.Packer;
import com.aerospike.client.util.Unpacker.ObjectUnpacker;

/**
 * Hadoop {@link Writable} wrapper around an Aerospike {@link Record}.
 *
 * <p>Serialized layout: generation, expiration, bin count, then for each
 * bin its UTF-8 name followed by a length-prefixed msgpack-encoded value.
 * Note that copies share the underlying bins map (shallow copy), matching
 * the semantics of the wrapped {@link Record}.</p>
 */
public class AerospikeRecord implements Writable {

    public Map<String, Object> bins;   // bin name -> bin value
    public int generation;             // record generation counter
    public int expiration;             // record TTL/expiration

    public AerospikeRecord() {
        this.bins = null;
        this.generation = 0;
        this.expiration = 0;
    }

    public AerospikeRecord(Record rec) {
        this.bins = rec.bins;
        this.generation = rec.generation;
        this.expiration = rec.expiration;
    }

    public AerospikeRecord(AerospikeRecord rec) {
        this.bins = rec.bins;
        this.generation = rec.generation;
        this.expiration = rec.expiration;
    }

    /** Adopts the state of a client {@link Record} (shallow). */
    public void set(Record rec) {
        this.bins = rec.bins;
        this.generation = rec.generation;
        this.expiration = rec.expiration;
    }

    /** Adopts the state of another wrapper (shallow). */
    public void set(AerospikeRecord rec) {
        this.bins = rec.bins;
        this.generation = rec.generation;
        this.expiration = rec.expiration;
    }

    /** Converts back to a client {@link Record}. */
    public Record toRecord() {
        return new Record(bins, generation, expiration);
    }

    public void write(DataOutput out) throws IOException {
        try {
            out.writeInt(generation);
            out.writeInt(expiration);
            out.writeInt(bins.size());
            for (Map.Entry<String, Object> bin : bins.entrySet()) {
                out.writeUTF(bin.getKey());
                Packer packer = new Packer();
                packer.packObject(bin.getValue());
                byte[] packed = packer.toByteArray();
                out.writeInt(packed.length);
                out.write(packed);
            }
        }
        catch (Exception ex) {
            // Packer can throw unchecked; surface everything as IOException.
            throw new IOException(ex);
        }
    }

    public void readFields(DataInput in) throws IOException {
        try {
            generation = in.readInt();
            expiration = in.readInt();
            int binCount = in.readInt();
            bins = new HashMap<String, Object>();
            for (int i = 0; i < binCount; ++i) {
                String name = in.readUTF();
                int packedLen = in.readInt();
                byte[] packed = new byte[packedLen];
                in.readFully(packed);
                ObjectUnpacker unpacker =
                    new ObjectUnpacker(packed, 0, packed.length);
                bins.put(name, unpacker.unpackObject());
            }
        }
        catch (Exception ex) {
            throw new IOException(ex);
        }
    }

    /** Deserializes a fresh instance from the stream. */
    public static AerospikeRecord read(DataInput in) throws IOException {
        AerospikeRecord rec = new AerospikeRecord();
        rec.readFields(in);
        return rec;
    }
}

// Local Variables:
// mode: java
// c-basic-offset: 4
// tab-width: 4
// indent-tabs-mode: nil
// End:
// vim: softtabstop=4:shiftwidth=4:expandtab
/*
 * Copyright 2018 Aerospike, Inc.
 *
 * Portions may be licensed to Aerospike, Inc. under one or more
 * contributor license agreements.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.aerospike.hadoop.examples.wordcountinput;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.aerospike.hadoop.mapreduce.AerospikeInputFormat;
import com.aerospike.hadoop.mapreduce.AerospikeKey;
import com.aerospike.hadoop.mapreduce.AerospikeRecord;

/**
 * Classic word-count example that reads its input from an Aerospike set
 * via {@link AerospikeInputFormat} (legacy mapred API).  Tokenizes the
 * text held in the "bin1" bin of every record and writes
 * "word&lt;TAB&gt;count" text output to the path given as args[0].
 */
public class WordCountInput extends Configured implements Tool {

    private static final Log log = LogFactory.getLog(WordCountInput.class);

    // Name of the bin that holds the text to tokenize.
    private static final String binName = "bin1";

    /** Emits (token, 1) for every whitespace-separated token in the bin. */
    public static class Map
        extends MapReduceBase
        implements Mapper<AerospikeKey, AerospikeRecord, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(AerospikeKey key,
                        AerospikeRecord rec,
                        OutputCollector<Text, IntWritable> output,
                        Reporter reporter
                        ) throws IOException {
            Object bin = rec.bins.get(binName);
            if (bin == null) {
                // Record is missing the expected bin; skip it instead of
                // failing the whole task with a NullPointerException.
                return;
            }
            StringTokenizer tokenizer = new StringTokenizer(bin.toString());
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }
    }

    /** Sums the counts for a word; also used as the combiner. */
    public static class Reduce
        extends MapReduceBase
        implements Reducer<Text, IntWritable, Text, IntWritable> {

        public void reduce(Text word, Iterator<IntWritable> values,
                           OutputCollector<Text, IntWritable> output,
                           Reporter reporter)
            throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(word, new IntWritable(sum));
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args args[0] is the output directory path
     * @return 0 on success
     */
    public int run(final String[] args) throws Exception {

        log.info("run starting");

        final Configuration conf = getConf();

        JobConf job = new JobConf(conf, WordCountInput.class);
        job.setJobName("AerospikeWordCountInput");

        job.setInputFormat(AerospikeInputFormat.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setOutputFormat(TextOutputFormat.class);

        FileOutputFormat.setOutputPath(job, new Path(args[0]));

        JobClient.runJob(job);

        log.info("finished");
        return 0;
    }

    public static void main(final String[] args) throws Exception {
        System.exit(ToolRunner.run(new WordCountInput(), args));
    }
}

// Local Variables:
// mode: java
// c-basic-offset: 4
// tab-width: 4
// indent-tabs-mode: nil
// End:
// vim: softtabstop=4:shiftwidth=4:expandtab
/*
 * Copyright 2018 Aerospike, Inc.
 *
 * Portions may be licensed to Aerospike, Inc. under one or more
 * contributor license agreements.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.aerospike.hadoop.examples.aggregateintinput;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.aerospike.hadoop.mapreduce.AerospikeInputFormat;
import com.aerospike.hadoop.mapreduce.AerospikeKey;
import com.aerospike.hadoop.mapreduce.AerospikeRecord;

/**
 * Example job (new mapreduce API) that aggregates integer values stored
 * in an Aerospike bin.  Values are bucketed by value % KK; each reducer
 * emits "count min max sum" per bucket.  Output goes to the path given
 * as args[0].
 */
public class AggregateIntInput extends Configured implements Tool {

    private static final Log log = LogFactory.getLog(AggregateIntInput.class);

    // Modulus used to bucket values before aggregating.
    private static final int KK = 3163;

    // Name of the bin holding the integer value.
    private static final String binName = "bin1";

    /** Emits (value % KK, value) for every record. */
    public static class Map
        extends Mapper<AerospikeKey, AerospikeRecord,
                       LongWritable, LongWritable> {

        private LongWritable val = new LongWritable();
        private LongWritable mod = new LongWritable();

        public void map(AerospikeKey key,
                        AerospikeRecord rec, Context context)
            throws IOException, InterruptedException {
            int vv = (Integer) rec.bins.get(binName);
            val.set(vv);
            mod.set(vv % KK);
            context.write(mod, val);
        }
    }

    /** Computes count, min, max and sum of the values in one bucket. */
    public static class Reduce
        extends Reducer<LongWritable, LongWritable, LongWritable, Text> {

        public void reduce(LongWritable mod,
                           Iterable<LongWritable> values,
                           Context context)
            throws IOException, InterruptedException {

            long num = 0;              // number of elements
            long sum = 0;              // sum of elements
            long min = Long.MAX_VALUE; // minimum element
            long max = Long.MIN_VALUE; // maximum element

            for (LongWritable val : values) {
                long vv = val.get();
                num += 1;
                sum += vv;
                if (vv < min) min = vv;
                if (vv > max) max = vv;
            }

            String rec = String.format("%d %d %d %d", num, min, max, sum);

            context.write(mod, new Text(rec));
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args args[0] is the output directory path
     * @return 0 on success, 1 on failure
     */
    public int run(final String[] args) throws Exception {
        final Configuration conf = getConf();

        // Job.getInstance replaces the Job(Configuration, String)
        // constructor, deprecated since Hadoop 2.x; no suppression needed.
        final Job job = Job.getInstance(conf, "AerospikeAggregateIntInput");

        log.info("run starting on bin " + binName);

        job.setJarByClass(AggregateIntInput.class);
        job.setInputFormatClass(AerospikeInputFormat.class);
        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        // job.setCombinerClass(Reduce.class); // no combiner
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileOutputFormat.setOutputPath(job, new Path(args[0]));

        int status = job.waitForCompletion(true) ? 0 : 1;
        log.info("run finished, status=" + status);
        return status;
    }

    public static void main(final String[] args) throws Exception {
        System.exit(ToolRunner.run(new AggregateIntInput(), args));
    }
}

// Local Variables:
// mode: java
// c-basic-offset: 4
// tab-width: 4
// indent-tabs-mode: nil
// End:
// vim: softtabstop=4:shiftwidth=4:expandtab
/*
 * Copyright 2014 Aerospike, Inc.
 *
 * Portions may be licensed to Aerospike, Inc. under one or more
 * contributor license agreements.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.aerospike.hadoop.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.io.WritableComparable;

import com.aerospike.client.Key;
import com.aerospike.client.Value;
import com.aerospike.client.util.Packer;
import com.aerospike.client.util.Unpacker.ObjectUnpacker;

/**
 * Hadoop {@link WritableComparable} wrapper around an Aerospike
 * {@link Key}.  Ordering, equality and hashing are all based solely on
 * the record digest, mirroring how the server identifies records.
 *
 * <p>Serialized layout: namespace (UTF), set name (UTF), digest length +
 * bytes, one presence boolean for the user key, and — when present — a
 * length-prefixed msgpack-encoded user key.</p>
 */
public class AerospikeKey implements WritableComparable<AerospikeKey> {

    public String namespace;
    public String setName;
    public byte[] digest;
    public Value userKey;   // may be null (server does not always store it)

    public AerospikeKey() {
        this.namespace = null;
        this.setName = null;
        this.digest = null;
        this.userKey = null;
    }

    public AerospikeKey(Key key) {
        this.namespace = key.namespace;
        this.digest = key.digest;
        this.setName = key.setName;
        this.userKey = key.userKey;
    }

    public AerospikeKey(AerospikeKey key) {
        this.namespace = key.namespace;
        this.digest = key.digest;
        this.setName = key.setName;
        this.userKey = key.userKey;
    }

    /** Adopts the state of a client {@link Key}. */
    public void set(Key key) {
        this.namespace = key.namespace;
        this.digest = key.digest;
        this.setName = key.setName;
        this.userKey = key.userKey;
    }

    /** Adopts the state of another wrapper. */
    public void set(AerospikeKey key) {
        this.namespace = key.namespace;
        this.digest = key.digest;
        this.setName = key.setName;
        this.userKey = key.userKey;
    }

    /** Converts back to a client {@link Key}. */
    public Key toKey() {
        return new Key(namespace, digest, setName, userKey);
    }

    public void write(DataOutput out) throws IOException {
        try {
            out.writeUTF(namespace);
            out.writeUTF(setName);
            out.writeInt(digest.length);
            out.write(digest);
            // Exactly ONE presence flag.  The previous version wrote the
            // flag twice (writeBoolean(userKey != null) followed by another
            // writeBoolean inside each branch) while readFields() consumes
            // only one, which misaligned the stream and corrupted every
            // field that followed.
            if (userKey == null) {
                out.writeBoolean(false);
            } else {
                out.writeBoolean(true);
                Packer pack = new Packer();
                pack.packObject(userKey);
                byte[] buff = pack.toByteArray();
                out.writeInt(buff.length);
                out.write(buff);
            }
        }
        catch (Exception ex) {
            throw new IOException(ex);
        }
    }

    public void readFields(DataInput in) throws IOException {
        try {
            namespace = in.readUTF();
            setName = in.readUTF();
            int digestLen = in.readInt();
            digest = new byte[digestLen];
            in.readFully(digest);
            if (in.readBoolean()) {
                int buflen = in.readInt();
                byte[] buff = new byte[buflen];
                in.readFully(buff);
                ObjectUnpacker unpack = new ObjectUnpacker(buff, 0, buff.length);
                userKey = Value.get(unpack.unpackObject());
            } else {
                // Hadoop reuses Writable instances across records; clear any
                // stale user key left over from a previous deserialization.
                userKey = null;
            }
        }
        catch (Exception ex) {
            throw new IOException(ex);
        }
    }

    /** Deserializes a fresh instance from the stream. */
    public static AerospikeKey read(DataInput in) throws IOException {
        AerospikeKey key = new AerospikeKey();
        key.readFields(in);
        return key;
    }

    /** Lexicographic unsigned comparison of the digests. */
    public int compareTo(AerospikeKey other) {
        byte[] left = this.digest;
        byte[] right = other.digest;
        for (int i = 0, j = 0; i < left.length && j < right.length; i++, j++) {
            int a = (left[i] & 0xff);
            int b = (right[j] & 0xff);
            if (a != b) {
                return a - b;
            }
        }
        return left.length - right.length;
    }

    /**
     * Digest-based equality, consistent with {@link #compareTo}.  Without
     * equals/hashCode, HashPartitioner (which calls key.hashCode()) routed
     * logically-equal keys to arbitrary reducers via identity hashing.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof AerospikeKey)) {
            return false;
        }
        return Arrays.equals(digest, ((AerospikeKey) obj).digest);
    }

    @Override
    public int hashCode() {
        return Arrays.hashCode(digest);
    }
}

// Local Variables:
// mode: java
// c-basic-offset: 4
// tab-width: 4
// indent-tabs-mode: nil
// End:
// vim: softtabstop=4:shiftwidth=4:expandtab
6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. You 9 | * may obtain a copy of the License at 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | * implied. See the License for the specific language governing 16 | * permissions and limitations under the License. 17 | */ 18 | 19 | package com.aerospike.hadoop.sampledata; 20 | 21 | import java.io.BufferedReader; 22 | import java.io.FileReader; 23 | 24 | import org.apache.commons.logging.Log; 25 | import org.apache.commons.logging.LogFactory; 26 | 27 | import com.aerospike.client.AerospikeClient; 28 | import com.aerospike.client.Bin; 29 | import com.aerospike.client.Key; 30 | import com.aerospike.client.policy.ClientPolicy; 31 | import com.aerospike.client.policy.WritePolicy; 32 | import com.aerospike.client.query.IndexType; 33 | import com.aerospike.client.task.IndexTask; 34 | 35 | public class SampleData { 36 | 37 | // aql> CREATE INDEX bin1ndx ON test.sample (bin1) NUMERIC 38 | 39 | private static final Log log = LogFactory.getLog(SampleData.class); 40 | 41 | private static String host; 42 | private static int port; 43 | private static String namespace; 44 | private static String setName; 45 | private static String binName; 46 | private static AerospikeClient client; 47 | private static WritePolicy writePolicy; 48 | 49 | public static void run(String[] args) throws Exception { 50 | 51 | int argi = 0; 52 | String asspec = args[argi++]; 53 | String dataType = args[argi++]; 54 | 55 | log.info(String.format("saw %s %s", asspec, dataType)); 56 | 57 | String[] inparam = asspec.split(":"); 58 | host = inparam[0]; 59 | port = Integer.parseInt(inparam[1]); 60 | namespace = inparam[2]; 61 | setName = 
inparam[3]; 62 | binName = inparam[4]; 63 | 64 | ClientPolicy policy = new ClientPolicy(); 65 | policy.user = ""; 66 | policy.password = ""; 67 | policy.failIfNotConnected = true; 68 | 69 | client = new AerospikeClient(policy, host, port); 70 | 71 | writePolicy = new WritePolicy(); 72 | 73 | if (dataType.equals("text-file")) 74 | runTextFile(args, argi); 75 | else if (dataType.equals("seq-int")) 76 | runSeqInt(args, argi); 77 | else 78 | throw new RuntimeException(String.format("unknown dataType \"%s\"", 79 | dataType)); 80 | } 81 | 82 | public static void runTextFile(String[] args, int argi) throws Exception { 83 | 84 | while (argi < args.length) { 85 | String path = args[argi++]; 86 | log.info("processing " + path + " ..."); 87 | int nrecs = 0; 88 | BufferedReader br = new BufferedReader(new FileReader(path)); 89 | for (String line; (line = br.readLine()) != null; ) { 90 | // The key is "path:linenum". 91 | String keystr = path + ':' + Long.toString(nrecs++); 92 | Key key = new Key(namespace, setName, keystr); 93 | Bin bin = new Bin(binName, line); 94 | client.put(writePolicy, key, bin); 95 | } 96 | log.info("inserted " + nrecs + " records"); 97 | br.close(); 98 | } 99 | } 100 | 101 | public static void runSeqInt(String[] args, int argi) throws Exception { 102 | 103 | int offset = Integer.parseInt(args[argi++]); 104 | int nrecs = Integer.parseInt(args[argi++]); 105 | 106 | String ndxname = binName + "ndx"; 107 | 108 | IndexTask task = 109 | client.createIndex(null, namespace, setName, 110 | ndxname, binName, IndexType.NUMERIC); 111 | 112 | task.waitTillComplete(); 113 | log.info("created secondary index on " + binName); 114 | 115 | for (long ll = offset; ll < offset + nrecs; ++ll) { 116 | 117 | String keystr = "key-" + ll; 118 | 119 | Key key = new Key(namespace, setName, keystr); 120 | Bin bin1 = new Bin(binName, ll); 121 | Bin bin2 = new Bin("bin2", "value2"); 122 | 123 | client.put(writePolicy, key, bin1, bin2); 124 | } 125 | 126 | log.info("inserted " + nrecs 
+ " records"); 127 | } 128 | 129 | public static void main(String[] args) { 130 | 131 | try { 132 | log.info("starting"); 133 | run(args); 134 | log.info("finished"); 135 | } catch (Exception ex) { 136 | 137 | log.error(ex.getMessage()); 138 | ex.printStackTrace(); 139 | } 140 | } 141 | 142 | } 143 | 144 | // Local Variables: 145 | // mode: java 146 | // c-basic-offset: 4 147 | // tab-width: 4 148 | // indent-tabs-mode: nil 149 | // End: 150 | // vim: softtabstop=4:shiftwidth=4:expandtab 151 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeSplit.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Aerospike, Inc. 3 | * 4 | * Portions may be licensed to Aerospike, Inc. under one or more 5 | * contributor license agreements. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. You 9 | * may obtain a copy of the License at 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | * implied. See the License for the specific language governing 16 | * permissions and limitations under the License. 
17 | */ 18 | 19 | package com.aerospike.hadoop.mapreduce; 20 | 21 | import java.io.DataInput; 22 | import java.io.DataOutput; 23 | import java.io.IOException; 24 | 25 | import org.apache.hadoop.io.Text; 26 | import org.apache.hadoop.mapreduce.InputSplit; 27 | 28 | public class AerospikeSplit 29 | extends InputSplit 30 | implements org.apache.hadoop.mapred.InputSplit { 31 | 32 | private String type; 33 | private String node; 34 | private String host; 35 | private int port; 36 | private String namespace; 37 | private String setName; 38 | private String[] binNames; 39 | private String numrangeBin; 40 | private long numrangeBegin; 41 | private long numrangeEnd; 42 | private int scanPercent; 43 | 44 | AerospikeSplit() { 45 | } 46 | 47 | public AerospikeSplit(String type, String node, String host, int port, 48 | String ns, String setName, String[] binNames, 49 | String numrangeBin, long numrangeBegin, long numrangeEnd) { 50 | this(type, node, host, port, ns, setName, binNames, numrangeBin, numrangeBegin, numrangeEnd, AerospikeConfigUtil.DEFAULT_INPUT_SCAN_PERCENT); 51 | } 52 | 53 | public AerospikeSplit(String type, String node, String host, int port, 54 | String ns, String setName, String[] binNames, 55 | String numrangeBin, long numrangeBegin, 56 | long numrangeEnd, int scanPercent) { 57 | this.type = type; 58 | this.node = node; 59 | this.host = host; 60 | this.port = port; 61 | this.namespace = ns; 62 | this.setName = setName; 63 | this.binNames = binNames; 64 | this.numrangeBin = numrangeBin; 65 | this.numrangeBegin = numrangeBegin; 66 | this.numrangeEnd = numrangeEnd; 67 | this.scanPercent = scanPercent; 68 | } 69 | 70 | public String getType() { 71 | return type; 72 | } 73 | 74 | public String getNode() { 75 | return node; 76 | } 77 | 78 | public String getHost() { 79 | return host; 80 | } 81 | 82 | public int getPort() { 83 | return port; 84 | } 85 | 86 | public String getNameSpace() { 87 | return namespace; 88 | } 89 | 90 | public String getSetName() { 91 | 
return setName; 92 | } 93 | 94 | public String[] getBinNames() { 95 | return binNames; 96 | } 97 | 98 | public String getNumRangeBin() { 99 | return numrangeBin; 100 | } 101 | 102 | public long getNumRangeBegin() { 103 | return numrangeBegin; 104 | } 105 | 106 | public long getNumRangeEnd() { 107 | return numrangeEnd; 108 | } 109 | 110 | public int getScanPercent() { return scanPercent; } 111 | 112 | public long getLength() { 113 | return 1; 114 | } 115 | 116 | public String toString() { 117 | return type + ':' + node + ":" + host + ":" + port + ":" 118 | + namespace + ":" + setName; 119 | } 120 | 121 | public void write(DataOutput out) throws IOException { 122 | Text.writeString(out, type); 123 | Text.writeString(out, node); 124 | Text.writeString(out, host); 125 | out.writeInt(port); 126 | Text.writeString(out, namespace); 127 | Text.writeString(out, setName); 128 | if (binNames == null) { 129 | out.writeInt(0); 130 | } else { 131 | out.writeInt(binNames.length); 132 | for (String binName : binNames) 133 | Text.writeString(out, binName); 134 | } 135 | Text.writeString(out, numrangeBin); 136 | out.writeLong(numrangeBegin); 137 | out.writeLong(numrangeEnd); 138 | out.writeInt(scanPercent); 139 | } 140 | 141 | public void readFields(DataInput in) throws IOException { 142 | type = new String(Text.readString(in)); 143 | node = new String(Text.readString(in)); 144 | host = new String(Text.readString(in)); 145 | port = in.readInt(); 146 | namespace = new String(Text.readString(in)); 147 | setName = new String(Text.readString(in)); 148 | int nBinNames = in.readInt(); 149 | if (nBinNames == 0) { 150 | binNames = null; 151 | } else { 152 | binNames = new String[nBinNames]; 153 | for (int ii = 0; ii < nBinNames; ++ii) 154 | binNames[ii] = new String(Text.readString(in)); 155 | } 156 | numrangeBin = new String(Text.readString(in)); 157 | numrangeBegin = in.readLong(); 158 | numrangeEnd = in.readLong(); 159 | scanPercent = in.readInt(); 160 | } 161 | 162 | public String[] 
getLocations() throws IOException { 163 | return new String[]{ host }; 164 | } 165 | } 166 | 167 | // Local Variables: 168 | // mode: java 169 | // c-basic-offset: 4 170 | // tab-width: 4 171 | // indent-tabs-mode: nil 172 | // End: 173 | // vim: softtabstop=4:shiftwidth=4:expandtab 174 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 10 | DEFAULT_JVM_OPTS="-Xmx512m" 11 | 12 | APP_NAME="Gradle" 13 | APP_BASE_NAME=`basename "$0"` 14 | 15 | # Use the maximum available, or set MAX_FD != -1 to use that value. 16 | MAX_FD="maximum" 17 | 18 | warn ( ) { 19 | echo "$*" 20 | } 21 | 22 | die ( ) { 23 | echo 24 | echo "$*" 25 | echo 26 | exit 1 27 | } 28 | 29 | # OS specific support (must be 'true' or 'false'). 30 | cygwin=false 31 | msys=false 32 | darwin=false 33 | case "`uname`" in 34 | CYGWIN* ) 35 | cygwin=true 36 | ;; 37 | Darwin* ) 38 | darwin=true 39 | ;; 40 | MINGW* ) 41 | msys=true 42 | ;; 43 | esac 44 | 45 | # For Cygwin, ensure paths are in UNIX format before anything is touched. 46 | if $cygwin ; then 47 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 48 | fi 49 | 50 | # Attempt to set APP_HOME 51 | # Resolve links: $0 may be a link 52 | PRG="$0" 53 | # Need this for relative symlinks. 
54 | while [ -h "$PRG" ] ; do 55 | ls=`ls -ld "$PRG"` 56 | link=`expr "$ls" : '.*-> \(.*\)$'` 57 | if expr "$link" : '/.*' > /dev/null; then 58 | PRG="$link" 59 | else 60 | PRG=`dirname "$PRG"`"/$link" 61 | fi 62 | done 63 | SAVED="`pwd`" 64 | cd "`dirname \"$PRG\"`/" >&- 65 | APP_HOME="`pwd -P`" 66 | cd "$SAVED" >&- 67 | 68 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 69 | 70 | # Determine the Java command to use to start the JVM. 71 | if [ -n "$JAVA_HOME" ] ; then 72 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 73 | # IBM's JDK on AIX uses strange locations for the executables 74 | JAVACMD="$JAVA_HOME/jre/sh/java" 75 | else 76 | JAVACMD="$JAVA_HOME/bin/java" 77 | fi 78 | if [ ! -x "$JAVACMD" ] ; then 79 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 80 | 81 | Please set the JAVA_HOME variable in your environment to match the 82 | location of your Java installation." 83 | fi 84 | else 85 | JAVACMD="java" 86 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 87 | 88 | Please set the JAVA_HOME variable in your environment to match the 89 | location of your Java installation." 90 | fi 91 | 92 | # Increase the maximum file descriptors if we can. 93 | if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then 94 | MAX_FD_LIMIT=`ulimit -H -n` 95 | if [ $? -eq 0 ] ; then 96 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 97 | MAX_FD="$MAX_FD_LIMIT" 98 | fi 99 | ulimit -n $MAX_FD 100 | if [ $? 
-ne 0 ] ; then 101 | warn "Could not set maximum file descriptor limit: $MAX_FD" 102 | fi 103 | else 104 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 105 | fi 106 | fi 107 | 108 | # For Darwin, add options to specify how the application appears in the dock 109 | if $darwin; then 110 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 111 | fi 112 | 113 | # For Cygwin, switch paths to Windows format before running java 114 | if $cygwin ; then 115 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 116 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) 
set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules 158 | function splitJvmOpts() { 159 | JVM_OPTS=("$@") 160 | } 161 | eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS 162 | JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" 163 | 164 | exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" 165 | -------------------------------------------------------------------------------- /examples/word_count_output/src/main/java/com/aerospike/hadoop/examples/wordcountoutput/WordCountOutput.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Aerospike, Inc. 3 | * 4 | * Portions may be licensed to Aerospike, Inc. under one or more 5 | * contributor license agreements. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. You 9 | * may obtain a copy of the License at 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | * implied. See the License for the specific language governing 16 | * permissions and limitations under the License. 
17 | */ 18 | 19 | package com.aerospike.hadoop.examples.wordcountoutput; 20 | 21 | import java.io.IOException; 22 | import java.util.Iterator; 23 | import java.util.StringTokenizer; 24 | 25 | import org.apache.commons.logging.Log; 26 | import org.apache.commons.logging.LogFactory; 27 | import org.apache.hadoop.conf.Configuration; 28 | import org.apache.hadoop.conf.Configured; 29 | import org.apache.hadoop.fs.Path; 30 | import org.apache.hadoop.io.IntWritable; 31 | import org.apache.hadoop.io.LongWritable; 32 | import org.apache.hadoop.io.Text; 33 | import org.apache.hadoop.mapred.FileInputFormat; 34 | import org.apache.hadoop.mapred.JobClient; 35 | import org.apache.hadoop.mapred.JobConf; 36 | import org.apache.hadoop.mapred.MapReduceBase; 37 | import org.apache.hadoop.mapred.Mapper; 38 | import org.apache.hadoop.mapred.OutputCollector; 39 | import org.apache.hadoop.mapred.RecordWriter; 40 | import org.apache.hadoop.mapred.Reducer; 41 | import org.apache.hadoop.mapred.Reporter; 42 | // These are all needed by MyOutputFormat. 
43 | import org.apache.hadoop.util.Progressable; 44 | import org.apache.hadoop.util.Tool; 45 | import org.apache.hadoop.util.ToolRunner; 46 | 47 | import com.aerospike.client.AerospikeClient; 48 | import com.aerospike.client.Bin; 49 | import com.aerospike.client.Key; 50 | import com.aerospike.client.policy.WritePolicy; 51 | import com.aerospike.hadoop.mapreduce.AerospikeOutputFormat; 52 | import com.aerospike.hadoop.mapreduce.AerospikeRecordWriter; 53 | 54 | public class WordCountOutput extends Configured implements Tool { 55 | 56 | private static final Log log = LogFactory.getLog(WordCountOutput.class); 57 | 58 | public static class Map 59 | extends MapReduceBase 60 | implements Mapper { 61 | private final static IntWritable one = new IntWritable(1); 62 | private Text word = new Text(); 63 | 64 | public void map(LongWritable key, Text value, 65 | OutputCollector output, 66 | Reporter reporter) 67 | throws IOException { 68 | String line = value.toString(); 69 | StringTokenizer tokenizer = new StringTokenizer(line); 70 | while (tokenizer.hasMoreTokens()) { 71 | word.set(tokenizer.nextToken()); 72 | output.collect(word, one); 73 | } 74 | } 75 | } 76 | 77 | public static class Reduce 78 | extends MapReduceBase 79 | implements Reducer { 80 | 81 | public void reduce(Text key, Iterator values, 82 | OutputCollector output, 83 | Reporter reporter) 84 | throws IOException { 85 | int sum = 0; 86 | while (values.hasNext()) { 87 | sum += values.next().get(); 88 | } 89 | output.collect(key, new IntWritable(sum)); 90 | } 91 | } 92 | 93 | public static class MyOutputFormat 94 | extends AerospikeOutputFormat { 95 | 96 | public static class MyRecordWriter 97 | extends AerospikeRecordWriter { 98 | 99 | public MyRecordWriter(Configuration cfg, Progressable progressable) { 100 | super(cfg); 101 | } 102 | 103 | @Override 104 | public void writeAerospike(Text key, 105 | IntWritable value, 106 | AerospikeClient client, 107 | WritePolicy writePolicy, 108 | String namespace, 109 | String 
setName) throws IOException { 110 | Key kk = new Key(namespace, setName, key.toString()); 111 | Bin bin1 = new Bin("word", key.toString()); 112 | Bin bin2 = new Bin("count", value.get()); 113 | client.put(writePolicy, kk, bin1, bin2); 114 | } 115 | } 116 | 117 | public RecordWriter 118 | getAerospikeRecordWriter(Configuration conf, Progressable prog) { 119 | return new MyRecordWriter(conf, prog); 120 | } 121 | } 122 | 123 | public int run(final String[] args) throws Exception { 124 | 125 | log.info("run starting"); 126 | 127 | final Configuration conf = getConf(); 128 | 129 | JobConf job = new JobConf(conf, WordCountOutput.class); 130 | job.setJobName("AerospikeWordCountOutput"); 131 | 132 | for (int ii = 0; ii < args.length; ++ii) { 133 | FileInputFormat.addInputPath(job, new Path(args[ii])); 134 | } 135 | 136 | job.setMapperClass(Map.class); 137 | job.setCombinerClass(Reduce.class); 138 | job.setReducerClass(Reduce.class); 139 | job.setOutputKeyClass(Text.class); 140 | job.setOutputValueClass(IntWritable.class); 141 | 142 | job.setOutputFormat(MyOutputFormat.class); 143 | 144 | JobClient.runJob(job); 145 | 146 | log.info("finished"); 147 | return 0; 148 | } 149 | 150 | public static void main(final String[] args) throws Exception { 151 | System.exit(ToolRunner.run(new WordCountOutput(), args)); 152 | } 153 | } 154 | 155 | // Local Variables: 156 | // mode: java 157 | // c-basic-offset: 4 158 | // tab-width: 4 159 | // indent-tabs-mode: nil 160 | // End: 161 | // vim: softtabstop=4:shiftwidth=4:expandtab 162 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeOutputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Aerospike, Inc. 3 | * 4 | * Portions may be licensed to Aerospike, Inc. under one or more 5 | * contributor license agreements. 
6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. You 9 | * may obtain a copy of the License at 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | * implied. See the License for the specific language governing 16 | * permissions and limitations under the License. 17 | */ 18 | 19 | package com.aerospike.hadoop.mapreduce; 20 | 21 | import java.io.IOException; 22 | 23 | import org.apache.commons.logging.Log; 24 | import org.apache.commons.logging.LogFactory; 25 | import org.apache.hadoop.conf.Configuration; 26 | import org.apache.hadoop.fs.FileSystem; 27 | import org.apache.hadoop.mapreduce.JobContext; 28 | import org.apache.hadoop.mapreduce.OutputCommitter; 29 | import org.apache.hadoop.mapreduce.OutputFormat; 30 | import org.apache.hadoop.mapreduce.RecordWriter; 31 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 32 | import org.apache.hadoop.util.Progressable; 33 | 34 | public abstract class AerospikeOutputFormat 35 | extends OutputFormat 36 | implements org.apache.hadoop.mapred.OutputFormat { 37 | 38 | private static final Log log = 39 | LogFactory.getLog(AerospikeOutputFormat.class); 40 | 41 | public static class AerospikeOutputCommitter extends OutputCommitter { 42 | 43 | @Override 44 | public void setupJob(JobContext jobContext) 45 | throws IOException {} 46 | 47 | // compatibility check with Hadoop 0.20.2 48 | @Deprecated 49 | public void cleanupJob(JobContext jobContext) 50 | throws IOException {} 51 | 52 | @Override 53 | public void setupTask(TaskAttemptContext taskContext) 54 | throws IOException { 55 | //no-op 56 | } 57 | 58 | @Override 59 | public boolean needsTaskCommit(TaskAttemptContext taskContext) 60 | throws IOException 
{ 61 | //no-op 62 | return false; 63 | } 64 | 65 | @Override 66 | public void commitTask(TaskAttemptContext taskContext) 67 | throws IOException { 68 | //no-op 69 | } 70 | 71 | @Override 72 | public void abortTask(TaskAttemptContext taskContext) 73 | throws IOException { 74 | //no-op 75 | } 76 | 77 | } 78 | 79 | public static class AerospikeOldAPIOutputCommitter 80 | extends org.apache.hadoop.mapred.OutputCommitter { 81 | 82 | @Override 83 | public void setupJob(org.apache.hadoop.mapred.JobContext jobContext) 84 | throws IOException { 85 | //no-op 86 | } 87 | 88 | @Override 89 | public void setupTask( 90 | org.apache.hadoop.mapred.TaskAttemptContext taskContext) 91 | throws IOException { 92 | //no-op 93 | } 94 | 95 | @Override 96 | public boolean needsTaskCommit( 97 | org.apache.hadoop.mapred.TaskAttemptContext taskContext) 98 | throws IOException { 99 | //no-op 100 | return false; 101 | } 102 | 103 | @Override 104 | public void commitTask( 105 | org.apache.hadoop.mapred.TaskAttemptContext taskContext) 106 | throws IOException { 107 | //no-op 108 | } 109 | 110 | @Override 111 | public void abortTask( 112 | org.apache.hadoop.mapred.TaskAttemptContext taskContext) 113 | throws IOException { 114 | //no-op 115 | } 116 | 117 | @Override 118 | @Deprecated 119 | public void cleanupJob(org.apache.hadoop.mapred.JobContext context) 120 | throws IOException { 121 | // no-op 122 | // added for compatibility with hadoop 0.20.x (used by old 123 | // tools, such as Cascalog) 124 | } 125 | } 126 | 127 | public abstract org.apache.hadoop.mapred.RecordWriter 128 | getAerospikeRecordWriter(Configuration conf, Progressable progress); 129 | 130 | // 131 | // new API - just delegates to the Old API 132 | // 133 | @SuppressWarnings("unchecked") 134 | @Override 135 | public RecordWriter getRecordWriter(TaskAttemptContext context) { 136 | Configuration conf = context.getConfiguration(); 137 | return (RecordWriter) getAerospikeRecordWriter(conf, context); 138 | } 139 | 140 | @Override 141 | 
public void checkOutputSpecs(JobContext context) throws IOException { 142 | // careful as it seems the info here saved by in the config is discarded 143 | Configuration cfg = context.getConfiguration(); 144 | init(cfg); 145 | } 146 | 147 | @Override 148 | public OutputCommitter getOutputCommitter(TaskAttemptContext context) { 149 | return new AerospikeOutputCommitter(); 150 | } 151 | 152 | // 153 | // old API 154 | // 155 | @Deprecated 156 | public org.apache.hadoop.mapred.RecordWriter 157 | getRecordWriter(FileSystem ignored, 158 | org.apache.hadoop.mapred.JobConf job, 159 | String name, Progressable progress) { 160 | return getAerospikeRecordWriter(job, progress); 161 | } 162 | 163 | @Deprecated 164 | public void checkOutputSpecs(FileSystem ignored, 165 | org.apache.hadoop.mapred.JobConf cfg) 166 | throws IOException { 167 | init(cfg); 168 | } 169 | 170 | // NB: all changes to the config objects are discarded before the 171 | // job is submitted if _the old MR api_ is used 172 | private void init(Configuration cfg) throws IOException { 173 | log.info(String.format("init")); 174 | } 175 | } 176 | 177 | // Local Variables: 178 | // mode: java 179 | // c-basic-offset: 4 180 | // tab-width: 4 181 | // indent-tabs-mode: nil 182 | // End: 183 | // vim: softtabstop=4:shiftwidth=4:expandtab 184 | -------------------------------------------------------------------------------- /WORLDCUP_FILELIST: -------------------------------------------------------------------------------- 1 | /worldcup/wc_day10_1.log 2 | /worldcup/wc_day11_1.log 3 | /worldcup/wc_day12_1.log 4 | /worldcup/wc_day13_1.log 5 | /worldcup/wc_day14_1.log 6 | /worldcup/wc_day15_1.log 7 | /worldcup/wc_day16_1.log 8 | /worldcup/wc_day17_1.log 9 | /worldcup/wc_day18_1.log 10 | /worldcup/wc_day19_1.log 11 | /worldcup/wc_day20_1.log 12 | /worldcup/wc_day21_1.log 13 | /worldcup/wc_day22_1.log 14 | /worldcup/wc_day23_1.log 15 | /worldcup/wc_day24_1.log 16 | /worldcup/wc_day25_1.log 17 | /worldcup/wc_day26_1.log 18 | 
/worldcup/wc_day27_1.log 19 | /worldcup/wc_day28_1.log 20 | /worldcup/wc_day29_1.log 21 | /worldcup/wc_day30_1.log 22 | /worldcup/wc_day31_1.log 23 | /worldcup/wc_day32_1.log 24 | /worldcup/wc_day33_1.log 25 | /worldcup/wc_day34_1.log 26 | /worldcup/wc_day35_1.log 27 | /worldcup/wc_day36_1.log 28 | /worldcup/wc_day37_1.log 29 | /worldcup/wc_day38_1.log 30 | /worldcup/wc_day38_2.log 31 | /worldcup/wc_day39_1.log 32 | /worldcup/wc_day39_2.log 33 | /worldcup/wc_day40_1.log 34 | /worldcup/wc_day40_2.log 35 | /worldcup/wc_day41_1.log 36 | /worldcup/wc_day41_2.log 37 | /worldcup/wc_day42_1.log 38 | /worldcup/wc_day43_1.log 39 | /worldcup/wc_day44_1.log 40 | /worldcup/wc_day44_2.log 41 | /worldcup/wc_day44_3.log 42 | /worldcup/wc_day45_1.log 43 | /worldcup/wc_day45_2.log 44 | /worldcup/wc_day45_3.log 45 | /worldcup/wc_day46_1.log 46 | /worldcup/wc_day46_2.log 47 | /worldcup/wc_day46_3.log 48 | /worldcup/wc_day46_4.log 49 | /worldcup/wc_day46_5.log 50 | /worldcup/wc_day46_6.log 51 | /worldcup/wc_day46_7.log 52 | /worldcup/wc_day46_8.log 53 | /worldcup/wc_day47_1.log 54 | /worldcup/wc_day47_2.log 55 | /worldcup/wc_day47_3.log 56 | /worldcup/wc_day47_4.log 57 | /worldcup/wc_day47_5.log 58 | /worldcup/wc_day47_6.log 59 | /worldcup/wc_day47_7.log 60 | /worldcup/wc_day47_8.log 61 | /worldcup/wc_day48_1.log 62 | /worldcup/wc_day48_2.log 63 | /worldcup/wc_day48_3.log 64 | /worldcup/wc_day48_4.log 65 | /worldcup/wc_day48_5.log 66 | /worldcup/wc_day48_6.log 67 | /worldcup/wc_day48_7.log 68 | /worldcup/wc_day49_1.log 69 | /worldcup/wc_day49_2.log 70 | /worldcup/wc_day49_3.log 71 | /worldcup/wc_day49_4.log 72 | /worldcup/wc_day50_1.log 73 | /worldcup/wc_day50_2.log 74 | /worldcup/wc_day50_3.log 75 | /worldcup/wc_day50_4.log 76 | /worldcup/wc_day51_1.log 77 | /worldcup/wc_day51_2.log 78 | /worldcup/wc_day51_3.log 79 | /worldcup/wc_day51_4.log 80 | /worldcup/wc_day51_5.log 81 | /worldcup/wc_day51_6.log 82 | /worldcup/wc_day51_7.log 83 | /worldcup/wc_day51_8.log 84 | 
/worldcup/wc_day51_9.log 85 | /worldcup/wc_day52_1.log 86 | /worldcup/wc_day52_2.log 87 | /worldcup/wc_day52_3.log 88 | /worldcup/wc_day52_4.log 89 | /worldcup/wc_day52_5.log 90 | /worldcup/wc_day52_6.log 91 | /worldcup/wc_day53_1.log 92 | /worldcup/wc_day53_2.log 93 | /worldcup/wc_day53_3.log 94 | /worldcup/wc_day53_4.log 95 | /worldcup/wc_day53_5.log 96 | /worldcup/wc_day53_6.log 97 | /worldcup/wc_day54_1.log 98 | /worldcup/wc_day54_2.log 99 | /worldcup/wc_day54_3.log 100 | /worldcup/wc_day54_4.log 101 | /worldcup/wc_day54_5.log 102 | /worldcup/wc_day54_6.log 103 | /worldcup/wc_day55_1.log 104 | /worldcup/wc_day55_2.log 105 | /worldcup/wc_day55_3.log 106 | /worldcup/wc_day55_4.log 107 | /worldcup/wc_day55_5.log 108 | /worldcup/wc_day56_1.log 109 | /worldcup/wc_day56_2.log 110 | /worldcup/wc_day56_3.log 111 | /worldcup/wc_day57_1.log 112 | /worldcup/wc_day57_2.log 113 | /worldcup/wc_day57_3.log 114 | /worldcup/wc_day58_1.log 115 | /worldcup/wc_day58_2.log 116 | /worldcup/wc_day58_3.log 117 | /worldcup/wc_day58_4.log 118 | /worldcup/wc_day58_5.log 119 | /worldcup/wc_day58_6.log 120 | /worldcup/wc_day59_1.log 121 | /worldcup/wc_day59_2.log 122 | /worldcup/wc_day59_3.log 123 | /worldcup/wc_day59_4.log 124 | /worldcup/wc_day59_5.log 125 | /worldcup/wc_day59_6.log 126 | /worldcup/wc_day59_7.log 127 | /worldcup/wc_day5_1.log 128 | /worldcup/wc_day60_1.log 129 | /worldcup/wc_day60_2.log 130 | /worldcup/wc_day60_3.log 131 | /worldcup/wc_day60_4.log 132 | /worldcup/wc_day60_5.log 133 | /worldcup/wc_day60_6.log 134 | /worldcup/wc_day60_7.log 135 | /worldcup/wc_day61_1.log 136 | /worldcup/wc_day61_2.log 137 | /worldcup/wc_day61_3.log 138 | /worldcup/wc_day61_4.log 139 | /worldcup/wc_day61_5.log 140 | /worldcup/wc_day61_6.log 141 | /worldcup/wc_day61_7.log 142 | /worldcup/wc_day61_8.log 143 | /worldcup/wc_day62_1.log 144 | /worldcup/wc_day62_10.log 145 | /worldcup/wc_day62_2.log 146 | /worldcup/wc_day62_3.log 147 | /worldcup/wc_day62_4.log 148 | /worldcup/wc_day62_5.log 149 | 
/worldcup/wc_day62_6.log 150 | /worldcup/wc_day62_7.log 151 | /worldcup/wc_day62_8.log 152 | /worldcup/wc_day62_9.log 153 | /worldcup/wc_day63_1.log 154 | /worldcup/wc_day63_2.log 155 | /worldcup/wc_day63_3.log 156 | /worldcup/wc_day63_4.log 157 | /worldcup/wc_day64_1.log 158 | /worldcup/wc_day64_2.log 159 | /worldcup/wc_day64_3.log 160 | /worldcup/wc_day65_1.log 161 | /worldcup/wc_day65_2.log 162 | /worldcup/wc_day65_3.log 163 | /worldcup/wc_day65_4.log 164 | /worldcup/wc_day65_5.log 165 | /worldcup/wc_day65_6.log 166 | /worldcup/wc_day65_7.log 167 | /worldcup/wc_day65_8.log 168 | /worldcup/wc_day65_9.log 169 | /worldcup/wc_day66_1.log 170 | /worldcup/wc_day66_10.log 171 | /worldcup/wc_day66_11.log 172 | /worldcup/wc_day66_2.log 173 | /worldcup/wc_day66_3.log 174 | /worldcup/wc_day66_4.log 175 | /worldcup/wc_day66_5.log 176 | /worldcup/wc_day66_6.log 177 | /worldcup/wc_day66_7.log 178 | /worldcup/wc_day66_8.log 179 | /worldcup/wc_day66_9.log 180 | /worldcup/wc_day67_1.log 181 | /worldcup/wc_day67_2.log 182 | /worldcup/wc_day67_3.log 183 | /worldcup/wc_day67_4.log 184 | /worldcup/wc_day67_5.log 185 | /worldcup/wc_day68_1.log 186 | /worldcup/wc_day68_2.log 187 | /worldcup/wc_day68_3.log 188 | /worldcup/wc_day69_1.log 189 | /worldcup/wc_day69_2.log 190 | /worldcup/wc_day69_3.log 191 | /worldcup/wc_day69_4.log 192 | /worldcup/wc_day69_5.log 193 | /worldcup/wc_day69_6.log 194 | /worldcup/wc_day69_7.log 195 | /worldcup/wc_day6_1.log 196 | /worldcup/wc_day70_1.log 197 | /worldcup/wc_day70_2.log 198 | /worldcup/wc_day70_3.log 199 | /worldcup/wc_day71_1.log 200 | /worldcup/wc_day71_2.log 201 | /worldcup/wc_day72_1.log 202 | /worldcup/wc_day72_2.log 203 | /worldcup/wc_day72_3.log 204 | /worldcup/wc_day73_1.log 205 | /worldcup/wc_day73_2.log 206 | /worldcup/wc_day73_3.log 207 | /worldcup/wc_day73_4.log 208 | /worldcup/wc_day73_5.log 209 | /worldcup/wc_day73_6.log 210 | /worldcup/wc_day74_1.log 211 | /worldcup/wc_day74_2.log 212 | /worldcup/wc_day74_3.log 213 | 
/worldcup/wc_day74_4.log 214 | /worldcup/wc_day74_5.log 215 | /worldcup/wc_day74_6.log 216 | /worldcup/wc_day75_1.log 217 | /worldcup/wc_day75_2.log 218 | /worldcup/wc_day75_3.log 219 | /worldcup/wc_day76_1.log 220 | /worldcup/wc_day76_2.log 221 | /worldcup/wc_day77_1.log 222 | /worldcup/wc_day77_2.log 223 | /worldcup/wc_day78_1.log 224 | /worldcup/wc_day78_2.log 225 | /worldcup/wc_day79_1.log 226 | /worldcup/wc_day79_2.log 227 | /worldcup/wc_day79_3.log 228 | /worldcup/wc_day79_4.log 229 | /worldcup/wc_day7_1.log 230 | /worldcup/wc_day80_1.log 231 | /worldcup/wc_day80_2.log 232 | /worldcup/wc_day81_1.log 233 | /worldcup/wc_day82_1.log 234 | /worldcup/wc_day83_1.log 235 | /worldcup/wc_day84_1.log 236 | /worldcup/wc_day85_1.log 237 | /worldcup/wc_day86_1.log 238 | /worldcup/wc_day87_1.log 239 | /worldcup/wc_day88_1.log 240 | /worldcup/wc_day89_1.log 241 | /worldcup/wc_day8_1.log 242 | /worldcup/wc_day90_1.log 243 | /worldcup/wc_day91_1.log 244 | /worldcup/wc_day92_1.log 245 | /worldcup/wc_day9_1.log 246 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Aerospike, Inc. 3 | * 4 | * Portions may be licensed to Aerospike, Inc. under one or more 5 | * contributor license agreements. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. You 9 | * may obtain a copy of the License at 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | * implied. 
See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.aerospike.hadoop.mapreduce;

import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import com.aerospike.client.AerospikeClient;
import com.aerospike.client.AerospikeException;
import com.aerospike.client.Host;
import com.aerospike.client.cluster.Node;
import com.aerospike.client.policy.ClientPolicy;

/**
 * An {@link InputFormat} for data stored in an Aerospike database.
 *
 * <p>Implements both the "new" (org.apache.hadoop.mapreduce) and the
 * "old" (org.apache.hadoop.mapred) Hadoop APIs; the new-API entry
 * points delegate to the old-API implementations.  One split is
 * produced per Aerospike cluster node, so each mapper scans exactly
 * one node's data.
 *
 * <p>NOTE(review): the generic type parameters (presumably
 * {@code <AerospikeKey, AerospikeRecord>}) appear to have been
 * stripped from this copy of the file; confirm against upstream
 * before relying on the raw types below.
 */
public class AerospikeInputFormat
    extends InputFormat
    implements org.apache.hadoop.mapred.InputFormat {

    private static final Log log =
        LogFactory.getLog(AerospikeInputFormat.class);

    // ---------------- NEW API ----------------

    /**
     * New-API split enumeration.  Delegates to the old-API
     * {@link #getSplits(JobConf, int)}; the cast works because the
     * concrete array element type (AerospikeSplit) implements both
     * split interfaces.
     */
    public List getSplits(JobContext context) throws IOException {
        // Delegate to the old API.
        Configuration cfg = context.getConfiguration();
        JobConf jobconf = AerospikeConfigUtil.asJobConf(cfg);
        return Arrays.asList((InputSplit[]) getSplits(jobconf,
                                                      jobconf.getNumMapTasks()));
    }

    /**
     * New-API reader factory.  The framework initializes the returned
     * reader with the split, so no arguments are needed here.
     */
    public RecordReader
        createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
        return new AerospikeRecordReader();
    }

    // ---------------- OLD API ----------------

    /**
     * Builds one split per Aerospike cluster node.
     *
     * <p>Connection and scan parameters come from the job
     * configuration (see AerospikeConfigUtil).  The supported
     * operations are "scan" (optionally with a scan percent) and
     * "numrange" (secondary-index range query over one numeric bin).
     * The {@code numSplits} hint is ignored; the cluster topology
     * determines the split count.
     *
     * @throws IOException if no nodes are found or any lookup fails
     */
    public org.apache.hadoop.mapred.InputSplit[]
        getSplits(JobConf job, int numSplits) throws IOException {
        try {

            String oper = AerospikeConfigUtil.getInputOperation(job);
            String host = AerospikeConfigUtil.getInputHost(job);
            int port = AerospikeConfigUtil.getInputPort(job);
            String namespace = AerospikeConfigUtil.getInputNamespace(job);
            String setName = AerospikeConfigUtil.getInputSetName(job);
            String[] binNames = AerospikeConfigUtil.getInputBinNames(job);
            // Operation-specific parameters; defaults cover "scan".
            String numrangeBin = "";
            long numrangeBegin = 0;
            long numrangeEnd = 0;
            int scanPercent = 100;
            if (oper.equals("numrange")) {
                numrangeBin = AerospikeConfigUtil.getInputNumRangeBin(job);
                numrangeBegin = AerospikeConfigUtil.getInputNumRangeBegin(job);
                numrangeEnd = AerospikeConfigUtil.getInputNumRangeEnd(job);
            } else if (oper.equals("scan")) {
                scanPercent = AerospikeConfigUtil.getInputScanPercent(job);
            }

            log.info(String.format("using: %s %d %s %s",
                                   host, port, namespace, setName));

            AerospikeClient client =
                AerospikeClientSingleton.getInstance(new ClientPolicy(),
                                                     host, port);
            Node[] nodes = client.getNodes();
            int nsplits = nodes.length;
            if (nsplits == 0) {
                throw new IOException("no Aerospike nodes found");
            }
            log.info(String.format("found %d nodes", nsplits));
            AerospikeSplit[] splits = new AerospikeSplit[nsplits];
            for (int ii = 0; ii < nsplits; ii++) {
                Node node = nodes[ii];
                String nodeName = node.getName();

                // We want to avoid 127.0.0.1 as a hostname
                // because this value will be transferred to a
                // different hadoop node to be processed.
                //
                List aliases = getAliases(node.getHost());
                Host nodehost = aliases.get(0);
                if (aliases.size() > 1) {
                    for (Host a : aliases) {
                        if (!a.name.equals("127.0.0.1")) {
                            nodehost = a;
                            break;
                        }
                    }
                }
                splits[ii] = new AerospikeSplit(oper, nodeName,
                                                nodehost.name, nodehost.port,
                                                namespace, setName, binNames,
                                                numrangeBin, numrangeBegin,
                                                numrangeEnd, scanPercent);
                log.info("split: " + splits[ii]);
            }
            return splits;
        }
        catch (Exception ex) {
            // Wrap everything (config errors, client errors) so the
            // framework sees a single checked exception type.
            throw new IOException("exception in getSplits", ex);
        }
    }

    /** Old-API reader factory; the split carries all scan parameters. */
    public org.apache.hadoop.mapred.RecordReader
        getRecordReader(org.apache.hadoop.mapred.InputSplit split,
                        JobConf job,
                        Reporter reporter
                        ) throws IOException {
        return new AerospikeRecordReader((AerospikeSplit) split);
    }

    /**
     * Resolves all addresses for the given host so a non-loopback
     * alias can be picked for use on remote Hadoop workers.
     *
     * <p>NOTE(review): the UnknownHostException cause is dropped when
     * re-throwing; only the host name survives in the message.
     */
    private List getAliases(Host host) {
        InetAddress[] addresses;

        try {
            addresses = InetAddress.getAllByName(host.name);
        }
        catch (UnknownHostException uhe) {
            throw new AerospikeException.Connection("Invalid host: " + host);
        }

        if (addresses.length == 0) {
            throw new AerospikeException.Connection("Failed to find addresses for " + host);
        }

        // Add capacity for current address aliases plus IPV6 address and hostname.
        List aliases = new ArrayList(addresses.length + 2);

        for (InetAddress address : addresses) {
            aliases.add(new Host(address.getHostAddress(), host.tlsName, host.port));
        }

        return aliases;
    }

}

// Local Variables:
// mode: java
// c-basic-offset: 4
// tab-width: 4
// indent-tabs-mode: nil
// End:
// vim: softtabstop=4:shiftwidth=4:expandtab
--------------------------------------------------------------------------------
/examples/generate_profiles/src/main/java/com/aerospike/hadoop/examples/generateprofiles/GenerateProfiles.java:
--------------------------------------------------------------------------------
/*
 * Copyright 2018 Aerospike, Inc.
 *
 * Portions may be licensed to Aerospike, Inc. under one or more
 * contributor license agreements.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
17 | */ 18 | 19 | package com.aerospike.hadoop.examples.generateprofiles; 20 | 21 | import java.io.DataInput; 22 | import java.io.DataOutput; 23 | import java.io.IOException; 24 | import java.util.Iterator; 25 | import java.util.regex.Matcher; 26 | import java.util.regex.Pattern; 27 | 28 | import org.apache.commons.logging.Log; 29 | import org.apache.commons.logging.LogFactory; 30 | import org.apache.hadoop.conf.Configuration; 31 | import org.apache.hadoop.conf.Configured; 32 | import org.apache.hadoop.fs.Path; 33 | import org.apache.hadoop.io.IntWritable; 34 | import org.apache.hadoop.io.LongWritable; 35 | import org.apache.hadoop.io.Text; 36 | import org.apache.hadoop.io.Writable; 37 | import org.apache.hadoop.mapred.FileInputFormat; 38 | import org.apache.hadoop.mapred.JobClient; 39 | import org.apache.hadoop.mapred.JobConf; 40 | import org.apache.hadoop.mapred.MapReduceBase; 41 | import org.apache.hadoop.mapred.Mapper; 42 | import org.apache.hadoop.mapred.OutputCollector; 43 | import org.apache.hadoop.mapred.RecordWriter; 44 | import org.apache.hadoop.mapred.Reducer; 45 | import org.apache.hadoop.mapred.Reporter; 46 | import org.apache.hadoop.util.Progressable; 47 | import org.apache.hadoop.util.Tool; 48 | import org.apache.hadoop.util.ToolRunner; 49 | 50 | import com.aerospike.client.AerospikeClient; 51 | import com.aerospike.client.Bin; 52 | import com.aerospike.client.Key; 53 | import com.aerospike.client.policy.WritePolicy; 54 | import com.aerospike.hadoop.mapreduce.AerospikeOutputFormat; 55 | import com.aerospike.hadoop.mapreduce.AerospikeRecordWriter; 56 | 57 | public class GenerateProfiles extends Configured implements Tool { 58 | 59 | private static final Log log = LogFactory.getLog(GenerateProfiles.class); 60 | 61 | // Sample line format: 62 | // 37518 - - [16/Jun/1998:02:48:36 +0000] \ 63 | // "GET /images/hm_hola.gif HTTP/1.0" 200 2240 64 | 65 | private static final String logEntryRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] 
\"(.+?)\" (\\d{3}) (\\S+)"; 66 | private static final Pattern pat = Pattern.compile(logEntryRegex); 67 | 68 | private final static IntWritable one = new IntWritable(1); 69 | 70 | public static class Map extends MapReduceBase implements 71 | Mapper { 72 | 73 | int mapcount = 0; 74 | 75 | public void map(LongWritable key, 76 | Text rec, 77 | OutputCollector output, 78 | Reporter reporter) throws IOException { 79 | try { 80 | String line = rec.toString(); 81 | Matcher matcher = pat.matcher(line); 82 | if (!matcher.matches() || 7 != matcher.groupCount()) { 83 | throw new RuntimeException("match failed on: " + line); 84 | } 85 | long userid = Long.parseLong(matcher.group(1)); 86 | output.collect(new LongWritable(userid), one); 87 | } 88 | catch (Exception ex) { 89 | // log.error("exception in map", ex); 90 | } 91 | } 92 | } 93 | 94 | private static class Profile implements Writable { 95 | public long userid; 96 | public int age; 97 | public int isMale; 98 | 99 | public Profile(long userid, int age, int isMale) { 100 | this.userid = userid; 101 | this.age = age; 102 | this.isMale = isMale; 103 | } 104 | 105 | public void readFields(DataInput in) throws IOException { 106 | userid = in.readLong(); 107 | age = in.readInt(); 108 | isMale = in.readInt(); 109 | } 110 | 111 | public void write(DataOutput out) throws IOException { 112 | out.writeLong(userid); 113 | out.writeInt(age); 114 | out.writeInt(isMale); 115 | } 116 | } 117 | 118 | public static class Reduce 119 | extends MapReduceBase 120 | implements Reducer { 121 | 122 | public void reduce(LongWritable userid, 123 | Iterator ones, 124 | OutputCollector output, 125 | Reporter reporter 126 | ) throws IOException { 127 | 128 | // Fake age based on userid. 129 | int age = ((int) userid.get() % 40) + 20; 130 | 131 | // Fake gender based on userid. 
132 | int isMale = (int) userid.get() % 2; 133 | 134 | Profile profile = new Profile(userid.get(), age, isMale); 135 | output.collect(userid, profile); 136 | } 137 | } 138 | 139 | public static class ProfileOutputFormat 140 | extends AerospikeOutputFormat { 141 | 142 | public static class ProfileRecordWriter 143 | extends AerospikeRecordWriter { 144 | 145 | public ProfileRecordWriter(Configuration cfg, 146 | Progressable progressable) { 147 | super(cfg); 148 | } 149 | 150 | @Override 151 | public void writeAerospike(LongWritable userid, 152 | Profile profile, 153 | AerospikeClient client, 154 | WritePolicy writePolicy, 155 | String namespace, 156 | String setName) throws IOException { 157 | writePolicy.totalTimeout = 10000; 158 | Key kk = new Key(namespace, setName, userid.get()); 159 | Bin bin0 = new Bin("userid", profile.userid); 160 | Bin bin1 = new Bin("age", profile.age); 161 | Bin bin2 = new Bin("isMale", profile.isMale); 162 | client.put(writePolicy, kk, bin0, bin1, bin2); 163 | } 164 | } 165 | 166 | public RecordWriter 167 | getAerospikeRecordWriter(Configuration conf, Progressable prog) { 168 | return new ProfileRecordWriter(conf, prog); 169 | } 170 | } 171 | 172 | public int run(final String[] args) throws Exception { 173 | 174 | log.info("run starting"); 175 | 176 | final Configuration conf = getConf(); 177 | 178 | JobConf job = new JobConf(conf, GenerateProfiles.class); 179 | job.setJobName("AerospikeGenerateProfiles"); 180 | 181 | job.setMapperClass(Map.class); 182 | job.setMapOutputKeyClass(LongWritable.class); 183 | job.setMapOutputValueClass(IntWritable.class); 184 | // job.setCombinerClass(Reduce.class); // Reduce changes format. 
185 | job.setReducerClass(Reduce.class); 186 | job.setOutputKeyClass(Text.class); 187 | job.setOutputValueClass(Profile.class); 188 | 189 | job.setOutputFormat(ProfileOutputFormat.class); 190 | 191 | for (int ii = 0; ii < args.length; ++ii) 192 | FileInputFormat.addInputPath(job, new Path(args[ii])); 193 | 194 | JobClient.runJob(job); 195 | 196 | log.info("finished"); 197 | return 0; 198 | } 199 | 200 | public static void main(final String[] args) throws Exception { 201 | System.exit(ToolRunner.run(new GenerateProfiles(), args)); 202 | } 203 | } 204 | 205 | // Local Variables: 206 | // mode: java 207 | // c-basic-offset: 4 208 | // tab-width: 4 209 | // indent-tabs-mode: nil 210 | // End: 211 | // vim: softtabstop=4:shiftwidth=4:expandtab 212 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Aerospike Hadoop Connector 2 | 3 | This repository contains AerospikeInputFormat.java and 4 | AerospikeOutputFormat.java, and several examples of processing using 5 | Hadoop. 6 | 7 | The system allows putting WorkerNodes on Aerospike servers. By 8 | default, the AerospikeInputMapper will split according to the nodes on 9 | the cluster, avoiding network traffic. The InputMapper also supports 10 | using secondary indexes, thus pulling only a few of the records in the 11 | Aerospike database. 12 | 13 | Both new and old Hadoop interfaces are supported, and there are 14 | examples for both. 15 | 16 | In the case of using AerospikeOutputMapper, the Aerospike cluster is 17 | likely to be outside the Hadoop worker nodes. This allows immediate 18 | use of the Hadoop output in your application. 19 | 20 | Check out the examples. The classic word count examples are included - 21 | for both input and output. The "aggregate int example" uses a 22 | secondary index to pull data from Aerospike, and runs the InputFormat 23 | on the local node if available. 
24 | 25 | The most interesting example is likely the session rollup example. In 26 | this example, the session management state is output to Aerospike as 27 | the sessions are found. 28 | 29 | See the [Wiki](https://github.com/aerospike-community/aerospike-hadoop/wiki) for more. 30 | 31 | Install Hadoop 32 | ---------------------------------------------------------------- 33 | Examples below are tested with Aerospike Java Client (version: 4.2.2) and Hadoop (version: 2.7.2) 34 | 35 | Hadoop installation guide [link](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html) 36 | 37 | Then set up environment variable: 38 | ---------------------------------------------------------------- 39 | 40 | Hadoop Installation Directory: 41 | export HADOOP_PREFIX=/usr/local/hadoop 42 | 43 | Development Directory: 44 | export AEROSPIKE_HADOOP=~/aerospike/aerospike-hadoop 45 | 46 | 47 | Build w/ Gradle 48 | ---------------------------------------------------------------- 49 | 50 | cd ${AEROSPIKE_HADOOP} 51 | 52 | # Build the mapreduce input and output connectors. 53 | ./gradlew :mapreduce:jar 54 | 55 | # Build the example programs. 
56 | ./gradlew :sampledata:installApp 57 | ./gradlew :examples:word_count_input:installApp 58 | ./gradlew :examples:aggregate_int_input:installApp 59 | ./gradlew :examples:word_count_output:installApp 60 | ./gradlew :examples:session_rollup:installApp 61 | ./gradlew :examples:generate_profiles:installApp 62 | ./gradlew :examples:external_join:installApp 63 | 64 | 65 | 66 | Setup Target Input Text File 67 | ---------------------------------------------------------------- 68 | 69 | # Make a copy of /var/log/messages 70 | sudo cp /var/log/messages /tmp/input 71 | sudo chown $USER:$USER /tmp/input 72 | chmod 644 /tmp/input 73 | 74 | 75 | Start Aerospike 76 | ---------------------------------------------------------------- 77 | 78 | sudo /etc/init.d/aerospike start 79 | 80 | 81 | Setup Sample Data in Aerospike for Input Examples 82 | ---------------------------------------------------------------- 83 | 84 | cd ${AEROSPIKE_HADOOP}/sampledata 85 | 86 | # Loads a text file for word_count_input demo. 87 | java -jar build/libs/sampledata.jar \ 88 | localhost:3000:test:words:bin1 \ 89 | text-file \ 90 | /tmp/input 91 | 92 | # Generates sequential integers for aggregate_int_input demo. 93 | java -jar build/libs/sampledata.jar \ 94 | localhost:3000:test:integers:bin1 seq-int 0 100000 95 | 96 | 97 | Run Input Examples 98 | ---------------------------------------------------------------- 99 | 100 | export HADOOP_PREFIX=/usr/local/hadoop 101 | 102 | cd ${AEROSPIKE_HADOOP} 103 | 104 | # Format HDFS 105 | rm -rf /tmp/hadoop-$USER/dfs/data 106 | $HADOOP_PREFIX/bin/hdfs namenode -format 107 | 108 | # Start HDFS 109 | $HADOOP_PREFIX/sbin/start-dfs.sh 110 | 111 | # Check for {Secondary,}NameNode and DataNode 112 | jps 113 | 114 | # Make some directories 115 | $HADOOP_PREFIX/bin/hdfs dfs -mkdir /tmp 116 | 117 | # Run the Hadoop job. 
118 | cd ${AEROSPIKE_HADOOP} 119 | 120 | # Run the word_count_input example (Old Hadoop API) 121 | $HADOOP_PREFIX/bin/hdfs dfs -rm -r /tmp/output 122 | $HADOOP_PREFIX/bin/hadoop \ 123 | jar \ 124 | ./examples/word_count_input/build/libs/word_count_input.jar \ 125 | -D aerospike.input.namespace=test \ 126 | -D aerospike.input.setname=words \ 127 | -D aerospike.input.operation=scan \ 128 | /tmp/output 129 | 130 | # Jump to "Inspect the results" below ... 131 | 132 | # -- OR -- 133 | 134 | # Run the aggregate_int_input range example (New Hadoop API) 135 | $HADOOP_PREFIX/bin/hdfs dfs -rm -r /tmp/output 136 | $HADOOP_PREFIX/bin/hadoop \ 137 | jar \ 138 | ./examples/aggregate_int_input/build/libs/aggregate_int_input.jar \ 139 | -D aerospike.input.namespace=test \ 140 | -D aerospike.input.setname=integers \ 141 | -D aerospike.input.binnames=bin1 \ 142 | -D aerospike.input.operation=scan \ 143 | /tmp/output 144 | 145 | # Jump to "Inspect the results" below ... 146 | 147 | # -- OR -- 148 | 149 | # Run the aggregate_int_input range example (New Hadoop API) 150 | $HADOOP_PREFIX/bin/hdfs dfs -rm -r /tmp/output 151 | $HADOOP_PREFIX/bin/hadoop \ 152 | jar \ 153 | ./examples/aggregate_int_input/build/libs/aggregate_int_input.jar \ 154 | -D aerospike.input.namespace=test \ 155 | -D aerospike.input.setname=integers \ 156 | -D aerospike.input.binnames=bin1,bin2 \ 157 | -D aerospike.input.operation=numrange \ 158 | -D aerospike.input.numrange.bin=bin1 \ 159 | -D aerospike.input.numrange.begin=100 \ 160 | -D aerospike.input.numrange.end=200 \ 161 | /tmp/output 162 | 163 | # Inspect the results. 164 | $HADOOP_PREFIX/bin/hadoop fs -ls /tmp/output 165 | rm -rf /tmp/output 166 | $HADOOP_PREFIX/bin/hadoop fs -copyToLocal /tmp/output /tmp 167 | less /tmp/output/part*00000 168 | 169 | 170 | Setup Sample Data in HDFS for Output Examples 171 | ---------------------------------------------------------------- 172 | 173 | export HADOOP_PREFIX=/usr/local/hadoop 174 | 175 | # Create a directory. 
$HADOOP_PREFIX/bin/hdfs dfs -mkdir /tmp

# Load the test words into HDFS.
$HADOOP_PREFIX/bin/hdfs dfs -rm /tmp/words
$HADOOP_PREFIX/bin/hadoop fs -copyFromLocal /tmp/input /tmp/words

# Load the World Cup log data into HDFS
$HADOOP_PREFIX/bin/hdfs dfs -rm -r /worldcup
$HADOOP_PREFIX/bin/hdfs dfs -mkdir /worldcup
$HADOOP_PREFIX/bin/hadoop fs -copyFromLocal \
    data/worldcup \
    /worldcup/access.log

# Create the secondary indexes in Aerospike.
aql -c 'CREATE INDEX useridndx ON test.sessions (userid) NUMERIC'
aql -c 'CREATE INDEX startndx ON test.sessions (start) NUMERIC'


Run Output Examples
----------------------------------------------------------------

# Run the Hadoop job.
cd ${AEROSPIKE_HADOOP}

# Run the word_count_output example (Old Hadoop API)
$HADOOP_PREFIX/bin/hadoop \
    jar \
    ./examples/word_count_output/build/libs/word_count_output.jar \
    -D aerospike.output.namespace=test \
    -D aerospike.output.setname=counts \
    /tmp/words

# Inspect the results:
aql -c 'SELECT * FROM test.counts'

# -- OR --

# Run the session_rollup example (Old Hadoop API, small dataset)
$HADOOP_PREFIX/bin/hadoop \
    jar \
    ./examples/session_rollup/build/libs/session_rollup.jar \
    -D aerospike.output.namespace=test \
    -D aerospike.output.setname=sessions \
    -D mapred.reduce.tasks=30 \
    /worldcup/access.log

# Inspect the results:
aql -c 'SELECT * FROM test.sessions'

# -- OR --

# Run generate_profiles to build sample data for external_join.
228 | $HADOOP_PREFIX/bin/hadoop \ 229 | jar \ 230 | ./examples/generate_profiles/build/libs/generate_profiles.jar \ 231 | -D aerospike.output.namespace=test \ 232 | -D aerospike.output.setname=profiles \ 233 | -D mapred.reduce.tasks=30 \ 234 | /worldcup/access.log 235 | 236 | # Inspect the results: 237 | aql -c 'SELECT * FROM test.profiles' 238 | 239 | # -- AND -- 240 | 241 | # Run the external_join example (Old Hadoop API, small dataset) 242 | $HADOOP_PREFIX/bin/hadoop \ 243 | jar \ 244 | ./examples/external_join/build/libs/external_join.jar \ 245 | -D aerospike.input.namespace=test \ 246 | -D aerospike.input.setname=profiles \ 247 | -D aerospike.output.namespace=test \ 248 | -D aerospike.output.setname=sessions2 \ 249 | -D mapred.reduce.tasks=30 \ 250 | /worldcup/access.log 251 | 252 | # Inspect the results: 253 | aql -c 'SELECT * FROM test.sessions2' 254 | ./gradlew :examples:word_count_input:installApp 255 | 256 | Done with HDFS 257 | ---------------------------------------------------------------- 258 | 259 | # Stop HDFS 260 | $HADOOP_PREFIX/sbin/stop-dfs.sh 261 | 262 | -------------------------------------------------------------------------------- /examples/spark_session_rollup/src/main/java/com/aerospike/spark/examples/SparkSessionRollup.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Aerospike, Inc. 3 | * 4 | * Portions may be licensed to Aerospike, Inc. under one or more 5 | * contributor license agreements. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. You 9 | * may obtain a copy of the License at 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | * implied. 
See the License for the specific language governing 16 | * permissions and limitations under the License. 17 | */ 18 | 19 | package com.aerospike.spark.examples; 20 | 21 | import java.io.IOException; 22 | import java.nio.ByteBuffer; 23 | import java.security.MessageDigest; 24 | import java.security.NoSuchAlgorithmException; 25 | import java.text.ParsePosition; 26 | import java.text.SimpleDateFormat; 27 | import java.util.ArrayList; 28 | import java.util.Collections; 29 | import java.util.Date; 30 | import java.util.Iterator; 31 | import java.util.List; 32 | import java.util.regex.Matcher; 33 | import java.util.regex.Pattern; 34 | 35 | import org.apache.commons.codec.binary.Hex; 36 | import org.apache.hadoop.conf.Configuration; 37 | import org.apache.hadoop.mapred.JobConf; 38 | import org.apache.hadoop.mapred.RecordWriter; 39 | import org.apache.hadoop.util.Progressable; 40 | import org.apache.spark.SparkConf; 41 | import org.apache.spark.api.java.JavaPairRDD; 42 | import org.apache.spark.api.java.JavaRDD; 43 | import org.apache.spark.api.java.JavaSparkContext; 44 | import org.apache.spark.api.java.function.PairFlatMapFunction; 45 | import org.apache.spark.api.java.function.PairFunction; 46 | 47 | import com.aerospike.client.AerospikeClient; 48 | import com.aerospike.client.Bin; 49 | import com.aerospike.client.Key; 50 | import com.aerospike.client.policy.WritePolicy; 51 | import com.aerospike.hadoop.mapreduce.AerospikeConfigUtil; 52 | import com.aerospike.hadoop.mapreduce.AerospikeLogger; 53 | import com.aerospike.hadoop.mapreduce.AerospikeOutputFormat; 54 | import com.aerospike.hadoop.mapreduce.AerospikeRecordWriter; 55 | 56 | import scala.Tuple2; 57 | 58 | public class SparkSessionRollup { 59 | 60 | public static final String appName = "spark_session_rollup"; 61 | public static final String master = "spark://as0:7077"; 62 | 63 | public static class ExtractHits 64 | implements PairFunction { 65 | private static final long serialVersionUID = 1L; 66 | 67 | // Sample 
line format: 68 | // 37518 - - [16/Jun/1998:02:48:36 +0000] "GET /images/hm_hola.gif HTTP/1.0" 200 2240 69 | final String logEntryRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\S+)"; 70 | final Pattern pat = Pattern.compile(logEntryRegex); 71 | 72 | final SimpleDateFormat dateTimeParser = 73 | new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z"); 74 | 75 | public Tuple2 call(String line) { 76 | 77 | Matcher matcher = pat.matcher(line); 78 | if (!matcher.matches() || 7 != matcher.groupCount()) 79 | return new Tuple2(0L, 0L); 80 | 81 | long userid = Long.parseLong(matcher.group(1)); 82 | String tstamp = matcher.group(4); 83 | ParsePosition pos = new ParsePosition(0); 84 | Date date = dateTimeParser.parse(tstamp, pos); 85 | long msec = date.getTime(); 86 | 87 | return new Tuple2(userid, msec); 88 | } 89 | } 90 | 91 | private static class Session { 92 | public long userid; 93 | public long start; 94 | public long end; 95 | public int nhits; 96 | 97 | public Session(long userid, long start, long end, int nhits) { 98 | this.userid = userid; 99 | this.start = start; 100 | this.end = end; 101 | this.nhits = nhits; 102 | } 103 | } 104 | 105 | public static class FindSessions 106 | implements PairFlatMapFunction>, 107 | String, Session> { 108 | private static final long serialVersionUID = 1L; 109 | 110 | private static final long SESSION_GAP_MSEC = 20 * 60 * 1000; 111 | 112 | public Iterator> 113 | call(Tuple2> tup) { 114 | 115 | List> results = 116 | new ArrayList>(); 117 | 118 | // Copy the iterator to an array. 119 | ArrayList tsarray = new ArrayList(); 120 | for (Long val : tup._2()) 121 | tsarray.add(val); 122 | 123 | // Sort the timestamps. 124 | Collections.sort(tsarray); 125 | 126 | // Scan the array looking for session boundaries. 
127 | long t0 = 0; 128 | long session_start = 0; 129 | long session_end = 0; 130 | int session_hits = 0; 131 | for (Long tstamp: tsarray) { 132 | long tt = tstamp; 133 | 134 | // How long since the prior hit? 135 | long delta = tt - t0; 136 | 137 | // Is this a new session? 138 | if (delta > SESSION_GAP_MSEC) { 139 | 140 | // Is there a prior session? 141 | if (session_start != 0) 142 | collect_session(tup._1(), session_start, session_end, 143 | session_hits, results); 144 | 145 | // Reset for the new session. 146 | session_start = tt; 147 | session_hits = 0; 148 | } 149 | 150 | // Extend the current session. 151 | session_hits += 1; 152 | session_end = tt; 153 | 154 | // On to the next hit ... 155 | t0 = tt; 156 | } 157 | 158 | // Write out the last session. 159 | if (session_start != 0) 160 | collect_session(tup._1(), session_start, session_end, 161 | session_hits, results); 162 | 163 | return results.iterator(); 164 | } 165 | 166 | private void collect_session(long userid, long start, 167 | long end, int nhits, 168 | List> results) { 169 | 170 | try { 171 | // Generate a sessionid from the hash of the userid and start. 
172 | MessageDigest md = MessageDigest.getInstance("SHA-256"); 173 | md.update(ByteBuffer.allocate(8).putLong(userid).array()); 174 | md.update(ByteBuffer.allocate(8).putLong(start).array()); 175 | String sessid = Hex.encodeHexString(md.digest()).substring(0,16); 176 | 177 | Session session = new Session(userid, start, end, nhits); 178 | 179 | results.add(new Tuple2(sessid, session)); 180 | } 181 | catch (NoSuchAlgorithmException ex) { 182 | throw new RuntimeException(ex); 183 | } 184 | } 185 | } 186 | 187 | public static class SessionOutputFormat 188 | extends AerospikeOutputFormat { 189 | 190 | public static class SessionRecordWriter 191 | extends AerospikeRecordWriter { 192 | 193 | public SessionRecordWriter(Configuration cfg, 194 | Progressable progressable) { 195 | super(cfg); 196 | } 197 | 198 | @Override 199 | public void writeAerospike(String sessid, 200 | Session session, 201 | AerospikeClient client, 202 | WritePolicy writePolicy, 203 | String namespace, 204 | String setName) throws IOException { 205 | Key kk = new Key(namespace, setName, sessid.toString()); 206 | Bin bin0 = new Bin("userid", session.userid); 207 | Bin bin1 = new Bin("start", session.start); 208 | Bin bin2 = new Bin("end", session.end); 209 | Bin bin3 = new Bin("nhits", session.nhits); 210 | client.put(writePolicy, kk, bin0, bin1, bin2, bin3); 211 | } 212 | } 213 | 214 | public RecordWriter 215 | getAerospikeRecordWriter(Configuration conf, Progressable prog) { 216 | return new SessionRecordWriter(conf, prog); 217 | } 218 | } 219 | 220 | public static void main(String[] args) { 221 | com.aerospike.client.Log.setCallback(new AerospikeLogger()); 222 | com.aerospike.client.Log.setLevel(com.aerospike.client.Log.Level.DEBUG); 223 | 224 | SparkConf conf = new SparkConf() 225 | .setAppName(appName) 226 | .set("spark.executor.memory", "2g") 227 | .setMaster(master); 228 | JavaSparkContext sc = new JavaSparkContext(conf); 229 | sc.addJar("build/libs/spark_session_rollup.jar"); 230 | 231 | JavaRDD 
entries = sc.textFile("hdfs://localhost:54310/tmp/input"); 232 | 233 | JavaPairRDD> userhits = 234 | entries.mapToPair(new ExtractHits()).groupByKey(); 235 | 236 | JavaPairRDD sessions = 237 | userhits.flatMapToPair(new FindSessions()); 238 | 239 | System.err.println(sessions.count()); 240 | 241 | JobConf job = new JobConf(); 242 | job.setOutputKeyClass(String.class); 243 | job.setOutputValueClass(Session.class); 244 | job.setOutputFormat(SessionOutputFormat.class); 245 | 246 | AerospikeConfigUtil.setOutputHost(job, "localhost"); 247 | AerospikeConfigUtil.setOutputPort(job, 3000); 248 | AerospikeConfigUtil.setOutputNamespace(job, "test"); 249 | AerospikeConfigUtil.setOutputSetName(job, "sessions3"); 250 | 251 | sessions.saveAsHadoopDataset(job); 252 | } 253 | } 254 | 255 | // Local Variables: 256 | // mode: java 257 | // c-basic-offset: 4 258 | // tab-width: 4 259 | // indent-tabs-mode: nil 260 | // End: 261 | // vim: softtabstop=4:shiftwidth=4:expandtab 262 | -------------------------------------------------------------------------------- /examples/session_rollup/src/main/java/com/aerospike/hadoop/examples/sessionrollup/SessionRollup.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Aerospike, Inc. 3 | * 4 | * Portions may be licensed to Aerospike, Inc. under one or more 5 | * contributor license agreements. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. You 9 | * may obtain a copy of the License at 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | * implied. See the License for the specific language governing 16 | * permissions and limitations under the License. 
17 | */ 18 | 19 | package com.aerospike.hadoop.examples.sessionrollup; 20 | 21 | import java.io.DataInput; 22 | import java.io.DataOutput; 23 | import java.io.IOException; 24 | import java.nio.ByteBuffer; 25 | import java.security.MessageDigest; 26 | import java.security.NoSuchAlgorithmException; 27 | import java.text.ParsePosition; 28 | import java.text.SimpleDateFormat; 29 | import java.util.ArrayList; 30 | import java.util.Collections; 31 | import java.util.Date; 32 | import java.util.Iterator; 33 | import java.util.regex.Matcher; 34 | import java.util.regex.Pattern; 35 | 36 | import org.apache.commons.codec.binary.Hex; 37 | import org.apache.commons.logging.Log; 38 | import org.apache.commons.logging.LogFactory; 39 | import org.apache.hadoop.conf.Configuration; 40 | import org.apache.hadoop.conf.Configured; 41 | import org.apache.hadoop.fs.Path; 42 | import org.apache.hadoop.io.LongWritable; 43 | import org.apache.hadoop.io.Text; 44 | import org.apache.hadoop.io.Writable; 45 | import org.apache.hadoop.mapred.FileInputFormat; 46 | import org.apache.hadoop.mapred.JobClient; 47 | import org.apache.hadoop.mapred.JobConf; 48 | import org.apache.hadoop.mapred.MapReduceBase; 49 | import org.apache.hadoop.mapred.Mapper; 50 | import org.apache.hadoop.mapred.OutputCollector; 51 | import org.apache.hadoop.mapred.RecordWriter; 52 | import org.apache.hadoop.mapred.Reducer; 53 | import org.apache.hadoop.mapred.Reporter; 54 | import org.apache.hadoop.util.Progressable; 55 | import org.apache.hadoop.util.Tool; 56 | import org.apache.hadoop.util.ToolRunner; 57 | 58 | import com.aerospike.client.AerospikeClient; 59 | import com.aerospike.client.Bin; 60 | import com.aerospike.client.Key; 61 | import com.aerospike.client.policy.WritePolicy; 62 | import com.aerospike.hadoop.mapreduce.AerospikeOutputFormat; 63 | import com.aerospike.hadoop.mapreduce.AerospikeRecordWriter; 64 | 65 | public class SessionRollup extends Configured implements Tool { 66 | 67 | private static final Log log 
= LogFactory.getLog(SessionRollup.class); 68 | 69 | private static final long SESSION_GAP_MSEC = 20 * 60 * 1000; 70 | 71 | // Sample line format: 72 | // 37518 - - [16/Jun/1998:02:48:36 +0000] \ 73 | // "GET /images/hm_hola.gif HTTP/1.0" 200 2240 74 | 75 | private static final String logEntryRegex = "^([\\w.-]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\S+)"; 76 | private static final Pattern pat = Pattern.compile(logEntryRegex); 77 | 78 | private static final SimpleDateFormat dateTimeParser = 79 | new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z"); 80 | 81 | public static class Map extends MapReduceBase implements 82 | Mapper { 83 | 84 | int mapcount = 0; 85 | 86 | public void map(LongWritable key, 87 | Text rec, 88 | OutputCollector output, 89 | Reporter reporter) throws IOException { 90 | try { 91 | String line = rec.toString(); 92 | Matcher matcher = pat.matcher(line); 93 | if (!matcher.matches() || 7 != matcher.groupCount()) { 94 | throw new RuntimeException("match failed on: " + line); 95 | } 96 | long userid = 10001; 97 | try{userid = Long.parseLong(matcher.group(7));}catch(Exception e){} 98 | String tstamp = matcher.group(4); 99 | ParsePosition pos = new ParsePosition(0); 100 | Date date = dateTimeParser.parse(tstamp, pos); 101 | long msec = date.getTime(); 102 | output.collect(new LongWritable(userid), new LongWritable(msec)); 103 | } 104 | catch (Exception ex) { 105 | // log.error("exception in map: " + ex); 106 | } 107 | } 108 | } 109 | 110 | private static class Session implements Writable { 111 | public long userid; 112 | public long start; 113 | public long end; 114 | public int nhits; 115 | 116 | public Session(long userid, long start, long end, int nhits) { 117 | this.userid = userid; 118 | this.start = start; 119 | this.end = end; 120 | this.nhits = nhits; 121 | } 122 | 123 | public void readFields(DataInput in) throws IOException { 124 | userid = in.readLong(); 125 | start = in.readLong(); 126 | end = in.readLong(); 127 | 
nhits = in.readInt(); 128 | } 129 | 130 | public void write(DataOutput out) throws IOException { 131 | out.writeLong(userid); 132 | out.writeLong(start); 133 | out.writeLong(end); 134 | out.writeInt(nhits); 135 | } 136 | } 137 | 138 | public static class Reduce 139 | extends MapReduceBase 140 | implements Reducer { 141 | 142 | public void reduce(LongWritable userid, 143 | Iterator tstamps, 144 | OutputCollector output, 145 | Reporter reporter 146 | ) throws IOException { 147 | 148 | // Copy the iterator to an array. 149 | ArrayList tsarray = new ArrayList(); 150 | while (tstamps.hasNext()) 151 | tsarray.add(new LongWritable(tstamps.next().get())); 152 | 153 | // Sort the timestamps. 154 | Collections.sort(tsarray); 155 | 156 | // Scan the array looking for session boundaries. 157 | long t0 = 0; 158 | long session_start = 0; 159 | long session_end = 0; 160 | int session_hits = 0; 161 | for (LongWritable tstamp: tsarray) { 162 | long tt = tstamp.get(); 163 | 164 | // How long since the prior hit? 165 | long delta = tt - t0; 166 | 167 | // Is this a new session? 168 | if (delta > SESSION_GAP_MSEC) { 169 | 170 | // Is there a prior session? 171 | if (session_start != 0) 172 | collect_session(userid.get(), session_start, session_end, 173 | session_hits, output); 174 | 175 | // Reset for the new session. 176 | session_start = tt; 177 | session_hits = 0; 178 | } 179 | 180 | // Extend the current session. 181 | session_hits += 1; 182 | session_end = tt; 183 | 184 | // On to the next hit ... 185 | t0 = tt; 186 | } 187 | 188 | // Write out the last session. 189 | if (session_start != 0) 190 | collect_session(userid.get(), session_start, session_end, 191 | session_hits, output); 192 | } 193 | 194 | private void collect_session(long userid, long start, 195 | long end, int nhits, 196 | OutputCollector output) 197 | throws IOException { 198 | 199 | try { 200 | // Generate a sessionid from the hash of the userid and start. 
201 | MessageDigest md = MessageDigest.getInstance("SHA-256"); 202 | md.update(ByteBuffer.allocate(8).putLong(userid).array()); 203 | md.update(ByteBuffer.allocate(8).putLong(start).array()); 204 | String sessid = Hex.encodeHexString(md.digest()).substring(0,16); 205 | 206 | Session session = new Session(userid, start, end, nhits); 207 | output.collect(new Text(sessid), session); 208 | } 209 | catch (NoSuchAlgorithmException ex) { 210 | throw new RuntimeException(ex); 211 | } 212 | } 213 | } 214 | 215 | public static class SessionOutputFormat 216 | extends AerospikeOutputFormat { 217 | 218 | public static class SessionRecordWriter 219 | extends AerospikeRecordWriter { 220 | 221 | public SessionRecordWriter(Configuration cfg, 222 | Progressable progressable) { 223 | super(cfg); 224 | } 225 | 226 | @Override 227 | public void writeAerospike(Text sessid, 228 | Session session, 229 | AerospikeClient client, 230 | WritePolicy writePolicy, 231 | String namespace, 232 | String setName) throws IOException { 233 | Key kk = new Key(namespace, setName, sessid.toString()); 234 | Bin bin0 = new Bin("userid", session.userid); 235 | Bin bin1 = new Bin("start", session.start); 236 | Bin bin2 = new Bin("end", session.end); 237 | Bin bin3 = new Bin("nhits", session.nhits); 238 | client.put(writePolicy, kk, bin0, bin1, bin2, bin3); 239 | } 240 | } 241 | 242 | public RecordWriter 243 | getAerospikeRecordWriter(Configuration conf, Progressable prog) { 244 | return new SessionRecordWriter(conf, prog); 245 | } 246 | } 247 | 248 | public int run(final String[] args) throws Exception { 249 | 250 | log.info("run starting"); 251 | 252 | final Configuration conf = getConf(); 253 | 254 | JobConf job = new JobConf(conf, SessionRollup.class); 255 | job.setJobName("AerospikeSessionRollup"); 256 | 257 | job.setMapperClass(Map.class); 258 | job.setMapOutputKeyClass(LongWritable.class); 259 | job.setMapOutputValueClass(LongWritable.class); 260 | // job.setCombinerClass(Reduce.class); // Reduce 
changes format. 261 | job.setReducerClass(Reduce.class); 262 | job.setOutputKeyClass(Text.class); 263 | job.setOutputValueClass(Session.class); 264 | 265 | job.setOutputFormat(SessionOutputFormat.class); 266 | 267 | for (int ii = 0; ii < args.length; ++ii) 268 | FileInputFormat.addInputPath(job, new Path(args[ii])); 269 | 270 | JobClient.runJob(job); 271 | 272 | log.info("finished"); 273 | return 0; 274 | } 275 | 276 | public static void main(final String[] args) throws Exception { 277 | System.exit(ToolRunner.run(new SessionRollup(), args)); 278 | } 279 | } 280 | 281 | // Local Variables: 282 | // mode: java 283 | // c-basic-offset: 4 284 | // tab-width: 4 285 | // indent-tabs-mode: nil 286 | // End: 287 | // vim: softtabstop=4:shiftwidth=4:expandtab 288 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeConfigUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Aerospike, Inc. 3 | * 4 | * Portions may be licensed to Aerospike, Inc. under one or more 5 | * contributor license agreements. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. You 9 | * may obtain a copy of the License at 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | * implied. See the License for the specific language governing 16 | * permissions and limitations under the License. 
17 | */ 18 | 19 | package com.aerospike.hadoop.mapreduce; 20 | 21 | import org.apache.commons.logging.Log; 22 | import org.apache.commons.logging.LogFactory; 23 | 24 | import org.apache.hadoop.conf.Configuration; 25 | 26 | public class AerospikeConfigUtil { 27 | private static final Log log = LogFactory.getLog(AerospikeConfigUtil.class); 28 | 29 | // ---------------- INPUT ---------------- 30 | 31 | public static final int DEFAULT_INPUT_PORT = 3000; 32 | public static final long INVALID_LONG = 762492121482318889L; 33 | public static final int DEFAULT_INPUT_SCAN_PERCENT = 100; 34 | // ---------------- OUTPUT ---------------- 35 | 36 | public static final int DEFAULT_OUTPUT_PORT = 3000; 37 | 38 | // ---------------- INPUT ---------------- 39 | 40 | public static void setInputHost(Configuration conf, String host) { 41 | log.info("setting " + AerospikeConfigEnum.INPUT_HOST.value + " to " + host); 42 | conf.set(AerospikeConfigEnum.INPUT_HOST.value, host); 43 | } 44 | 45 | public static String getInputHost(Configuration conf) { 46 | String host = conf.get(AerospikeConfigEnum.INPUT_HOST.value, AerospikeConfigEnum.DEFAULT_INPUT_HOST.value); 47 | log.info("using " + AerospikeConfigEnum.INPUT_HOST.value + " = " + host); 48 | return host; 49 | } 50 | 51 | public static void setInputPort(Configuration conf, int port) { 52 | log.info("setting " + AerospikeConfigEnum.INPUT_PORT.value + " to " + port); 53 | conf.setInt(AerospikeConfigEnum.INPUT_PORT.value, port); 54 | } 55 | 56 | public static int getInputPort(Configuration conf) { 57 | int port = conf.getInt(AerospikeConfigEnum.INPUT_PORT.value, DEFAULT_INPUT_PORT); 58 | log.info("using " + AerospikeConfigEnum.INPUT_PORT.value + " = " + port); 59 | return port; 60 | } 61 | 62 | public static void setInputNamespace(Configuration conf, String namespace) { 63 | log.info("setting " + AerospikeConfigEnum.INPUT_NAMESPACE.value + " to " + namespace); 64 | conf.set(AerospikeConfigEnum.INPUT_NAMESPACE.value, namespace); 65 | } 66 | 67 | 
public static String getInputNamespace(Configuration conf) { 68 | String namespace = conf.get(AerospikeConfigEnum.INPUT_NAMESPACE.value); 69 | if (namespace == null) 70 | throw new UnsupportedOperationException 71 | ("you must set the input namespace"); 72 | log.info("using " + AerospikeConfigEnum.INPUT_NAMESPACE.value + " = " + namespace); 73 | return namespace; 74 | } 75 | 76 | public static void setInputSetName(Configuration conf, String setname) { 77 | log.info("setting " + AerospikeConfigEnum.INPUT_SETNAME.value + " to " + setname); 78 | conf.set(AerospikeConfigEnum.INPUT_SETNAME.value, setname); 79 | } 80 | 81 | public static String getInputSetName(Configuration conf) { 82 | String setname = conf.get(AerospikeConfigEnum.INPUT_SETNAME.value); 83 | log.info("using " + AerospikeConfigEnum.INPUT_SETNAME.value + " = " + setname); 84 | return setname; 85 | } 86 | 87 | public static void setInputBinNames(Configuration conf, String bins) { 88 | log.info("setting " + AerospikeConfigEnum.INPUT_BINNAMES.value + " to " + bins); 89 | conf.set(AerospikeConfigEnum.INPUT_BINNAMES.value, bins); 90 | } 91 | 92 | public static String[] getInputBinNames(Configuration conf) { 93 | String bins = conf.get(AerospikeConfigEnum.INPUT_BINNAMES.value); 94 | log.info("using " + AerospikeConfigEnum.INPUT_BINNAMES.value + " = " + bins); 95 | if (bins == null || bins.equals("")) 96 | return null; 97 | else 98 | return bins.split(","); 99 | } 100 | 101 | public static void setInputOperation(Configuration conf, String operation) { 102 | if (!operation.equals("scan") && 103 | !operation.equals("numrange")) 104 | throw new UnsupportedOperationException 105 | ("input operation must be 'scan' or 'numrange'"); 106 | log.info("setting " + AerospikeConfigEnum.INPUT_OPERATION.value + " to " + operation); 107 | conf.set(AerospikeConfigEnum.INPUT_OPERATION.value, operation); 108 | } 109 | 110 | public static String getInputOperation(Configuration conf) { 111 | String operation = 
conf.get(AerospikeConfigEnum.INPUT_OPERATION.value, AerospikeConfigEnum.DEFAULT_INPUT_OPERATION.value); 112 | if (!operation.equals("scan") && 113 | !operation.equals("numrange")) 114 | throw new UnsupportedOperationException 115 | ("input operation must be 'scan' or 'numrange'"); 116 | log.info("using " + AerospikeConfigEnum.INPUT_OPERATION.value + " = " + operation); 117 | return operation; 118 | } 119 | 120 | public static void setInputNumRangeBin(Configuration conf, String binname) { 121 | log.info("setting " + AerospikeConfigEnum.INPUT_NUMRANGE_BIN.value + " to " + binname); 122 | conf.set(AerospikeConfigEnum.INPUT_NUMRANGE_BIN.value, binname); 123 | } 124 | 125 | public static String getInputNumRangeBin(Configuration conf) { 126 | String binname = conf.get(AerospikeConfigEnum.INPUT_NUMRANGE_BIN.value); 127 | log.info("using " + AerospikeConfigEnum.INPUT_NUMRANGE_BIN.value + " = " + binname); 128 | return binname; 129 | } 130 | 131 | public static void setInputNumRangeBegin(Configuration conf, long begin) { 132 | log.info("setting " + AerospikeConfigEnum.INPUT_NUMRANGE_BEGIN.value + " to " + begin); 133 | conf.setLong(AerospikeConfigEnum.INPUT_NUMRANGE_BEGIN.value, begin); 134 | } 135 | 136 | public static long getInputNumRangeBegin(Configuration conf) { 137 | long begin = conf.getLong(AerospikeConfigEnum.INPUT_NUMRANGE_BEGIN.value, INVALID_LONG); 138 | if (begin == INVALID_LONG && getInputOperation(conf).equals("numrange")) 139 | throw new UnsupportedOperationException 140 | ("missing input numrange begin"); 141 | log.info("using " + AerospikeConfigEnum.INPUT_NUMRANGE_BEGIN.value + " = " + begin); 142 | return begin; 143 | } 144 | 145 | public static void setInputNumRangeEnd(Configuration conf, long end) { 146 | log.info("setting " + AerospikeConfigEnum.INPUT_NUMRANGE_END.value + " to " + end); 147 | conf.setLong(AerospikeConfigEnum.INPUT_NUMRANGE_END.value, end); 148 | } 149 | 150 | public static long getInputNumRangeEnd(Configuration conf) { 151 | long end 
= conf.getLong(AerospikeConfigEnum.INPUT_NUMRANGE_END.value, INVALID_LONG); 152 | if (end == INVALID_LONG && getInputOperation(conf).equals("numrange")) 153 | throw new UnsupportedOperationException 154 | ("missing input numrange end"); 155 | log.info("using " + AerospikeConfigEnum.INPUT_NUMRANGE_END.value + " = " + end); 156 | return end; 157 | } 158 | 159 | public static int getInputScanPercent(Configuration conf) { 160 | int scanPercent = conf.getInt(AerospikeConfigEnum.INPUT_SCAN_PERCENT.value, DEFAULT_INPUT_SCAN_PERCENT); 161 | if (scanPercent <= 0) 162 | throw new UnsupportedOperationException 163 | ("scan percent is less than 1%"); 164 | log.info("using " + AerospikeConfigEnum.INPUT_SCAN_PERCENT.value + " = " + scanPercent + "%"); 165 | return scanPercent; 166 | } 167 | // ---------------- OUTPUT ---------------- 168 | 169 | public static void setOutputHost(Configuration conf, String host) { 170 | log.info("setting " + AerospikeConfigEnum.OUTPUT_HOST.value + " to " + host); 171 | conf.set(AerospikeConfigEnum.OUTPUT_HOST.value, host); 172 | } 173 | 174 | public static String getOutputHost(Configuration conf) { 175 | String host = conf.get(AerospikeConfigEnum.OUTPUT_HOST.value, AerospikeConfigEnum.DEFAULT_OUTPUT_HOST.value); 176 | log.info("using " + AerospikeConfigEnum.OUTPUT_HOST.value + " = " + host); 177 | return host; 178 | } 179 | 180 | public static void setOutputPort(Configuration conf, int port) { 181 | log.info("setting " + AerospikeConfigEnum.OUTPUT_PORT.value + " to " + port); 182 | conf.setInt(AerospikeConfigEnum.OUTPUT_PORT.value, port); 183 | } 184 | 185 | public static int getOutputPort(Configuration conf) { 186 | int port = conf.getInt(AerospikeConfigEnum.OUTPUT_PORT.value, DEFAULT_OUTPUT_PORT); 187 | log.info("using " + AerospikeConfigEnum.OUTPUT_PORT.value + " = " + port); 188 | return port; 189 | } 190 | 191 | public static void setOutputNamespace(Configuration conf, String namespace) { 192 | log.info("setting " + 
AerospikeConfigEnum.OUTPUT_NAMESPACE.value + " to " + namespace); 193 | conf.set(AerospikeConfigEnum.OUTPUT_NAMESPACE.value, namespace); 194 | } 195 | 196 | public static String getOutputNamespace(Configuration conf) { 197 | String namespace = conf.get(AerospikeConfigEnum.OUTPUT_NAMESPACE.value); 198 | if (namespace == null) 199 | throw new UnsupportedOperationException 200 | ("you must set the output namespace"); 201 | log.info("using " + AerospikeConfigEnum.OUTPUT_NAMESPACE.value + " = " + namespace); 202 | return namespace; 203 | } 204 | 205 | public static void setOutputSetName(Configuration conf, String setname) { 206 | log.info("setting " + AerospikeConfigEnum.OUTPUT_SETNAME.value + " to " + setname); 207 | conf.set(AerospikeConfigEnum.OUTPUT_SETNAME.value, setname); 208 | } 209 | 210 | public static String getOutputSetName(Configuration conf) { 211 | String setname = conf.get(AerospikeConfigEnum.OUTPUT_SETNAME.value); 212 | log.info("using " + AerospikeConfigEnum.OUTPUT_SETNAME.value + " = " + setname); 213 | return setname; 214 | } 215 | 216 | public static void setOutputBinName(Configuration conf, String binname) { 217 | log.info("setting " + AerospikeConfigEnum.OUTPUT_BINNAME.value + " to " + binname); 218 | conf.set(AerospikeConfigEnum.OUTPUT_BINNAME.value, binname); 219 | } 220 | 221 | public static String getOutputBinName(Configuration conf) { 222 | String binname = conf.get(AerospikeConfigEnum.OUTPUT_BINNAME.value); 223 | log.info("using " + AerospikeConfigEnum.OUTPUT_BINNAME.value + " = " + binname); 224 | return binname; 225 | } 226 | 227 | public static void setOutputKeyName(Configuration conf, String keyname) { 228 | log.info("setting " + AerospikeConfigEnum.OUTPUT_KEYNAME.value + " to " + keyname); 229 | conf.set(AerospikeConfigEnum.OUTPUT_KEYNAME.value, keyname); 230 | } 231 | 232 | public static String getOutputKeyName(Configuration conf) { 233 | String keyname = conf.get(AerospikeConfigEnum.OUTPUT_KEYNAME.value); 234 | log.info("using " + 
AerospikeConfigEnum.OUTPUT_KEYNAME.value + " = " + keyname); 235 | return keyname; 236 | } 237 | 238 | // ---------------- COMMON ---------------- 239 | 240 | public static org.apache.hadoop.mapred.JobConf asJobConf(Configuration cfg) { 241 | return cfg instanceof org.apache.hadoop.mapred.JobConf 242 | ? (org.apache.hadoop.mapred.JobConf) cfg 243 | : new org.apache.hadoop.mapred.JobConf(cfg); 244 | } 245 | } 246 | 247 | // Local Variables: 248 | // mode: java 249 | // c-basic-offset: 4 250 | // tab-width: 4 251 | // indent-tabs-mode: nil 252 | // End: 253 | // vim: softtabstop=4:shiftwidth=4:expandtab 254 | -------------------------------------------------------------------------------- /examples/external_join/src/main/java/com/aerospike/hadoop/examples/externaljoin/ExternalJoin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Aerospike, Inc. 3 | * 4 | * Portions may be licensed to Aerospike, Inc. under one or more 5 | * contributor license agreements. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. You 9 | * may obtain a copy of the License at 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | * implied. See the License for the specific language governing 16 | * permissions and limitations under the License. 
17 | */ 18 | 19 | package com.aerospike.hadoop.examples.externaljoin; 20 | 21 | import java.io.DataInput; 22 | import java.io.DataOutput; 23 | import java.io.IOException; 24 | import java.nio.ByteBuffer; 25 | import java.security.MessageDigest; 26 | import java.security.NoSuchAlgorithmException; 27 | import java.text.ParsePosition; 28 | import java.text.SimpleDateFormat; 29 | import java.util.ArrayList; 30 | import java.util.Collections; 31 | import java.util.Date; 32 | import java.util.Iterator; 33 | import java.util.regex.Matcher; 34 | import java.util.regex.Pattern; 35 | 36 | import org.apache.commons.codec.binary.Hex; 37 | import org.apache.commons.logging.Log; 38 | import org.apache.commons.logging.LogFactory; 39 | import org.apache.hadoop.conf.Configuration; 40 | import org.apache.hadoop.conf.Configured; 41 | import org.apache.hadoop.fs.Path; 42 | import org.apache.hadoop.io.LongWritable; 43 | import org.apache.hadoop.io.Text; 44 | import org.apache.hadoop.io.Writable; 45 | import org.apache.hadoop.mapred.FileInputFormat; 46 | import org.apache.hadoop.mapred.JobClient; 47 | import org.apache.hadoop.mapred.JobConf; 48 | import org.apache.hadoop.mapred.MapReduceBase; 49 | import org.apache.hadoop.mapred.Mapper; 50 | import org.apache.hadoop.mapred.OutputCollector; 51 | import org.apache.hadoop.mapred.RecordWriter; 52 | import org.apache.hadoop.mapred.Reducer; 53 | import org.apache.hadoop.mapred.Reporter; 54 | import org.apache.hadoop.util.Progressable; 55 | import org.apache.hadoop.util.Tool; 56 | import org.apache.hadoop.util.ToolRunner; 57 | 58 | import com.aerospike.client.AerospikeClient; 59 | import com.aerospike.client.Bin; 60 | import com.aerospike.client.Key; 61 | import com.aerospike.client.Record; 62 | import com.aerospike.client.policy.Policy; 63 | import com.aerospike.client.policy.WritePolicy; 64 | import com.aerospike.hadoop.mapreduce.AerospikeConfigUtil; 65 | import com.aerospike.hadoop.mapreduce.AerospikeOutputFormat; 66 | import 
com.aerospike.hadoop.mapreduce.AerospikeRecordWriter; 67 | 68 | public class ExternalJoin extends Configured implements Tool { 69 | 70 | private static final Log log = LogFactory.getLog(ExternalJoin.class); 71 | 72 | private static final long SESSION_GAP_MSEC = 20 * 60 * 1000; 73 | 74 | // Sample line format: 75 | // 37518 - - [16/Jun/1998:02:48:36 +0000] \ 76 | // "GET /images/hm_hola.gif HTTP/1.0" 200 2240 77 | 78 | private static final String logEntryRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\S+)"; 79 | private static final Pattern pat = Pattern.compile(logEntryRegex); 80 | 81 | private static final SimpleDateFormat dateTimeParser = 82 | new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z"); 83 | 84 | public static class Map extends MapReduceBase implements 85 | Mapper { 86 | 87 | int mapcount = 0; 88 | 89 | public void map(LongWritable key, 90 | Text rec, 91 | OutputCollector output, 92 | Reporter reporter) throws IOException { 93 | try { 94 | String line = rec.toString(); 95 | Matcher matcher = pat.matcher(line); 96 | if (!matcher.matches() || 7 != matcher.groupCount()) { 97 | throw new RuntimeException("match failed on: " + line); 98 | } 99 | long userid = Long.parseLong(matcher.group(1)); 100 | String tstamp = matcher.group(4); 101 | ParsePosition pos = new ParsePosition(0); 102 | Date date = dateTimeParser.parse(tstamp, pos); 103 | long msec = date.getTime(); 104 | output.collect(new LongWritable(userid), new LongWritable(msec)); 105 | } 106 | catch (Exception ex) { 107 | // log.error("exception in map: " + ex); 108 | } 109 | } 110 | } 111 | 112 | private static class Session implements Writable { 113 | public long userid; 114 | public long start; 115 | public long end; 116 | public int nhits; 117 | public int age; 118 | public int isMale; 119 | 120 | public Session(long userid, long start, long end, int nhits, int age, int isMale) { 121 | this.userid = userid; 122 | this.start = start; 123 | this.end = end; 124 | 
this.nhits = nhits; 125 | this.age = age; 126 | this.isMale = isMale; 127 | } 128 | 129 | public void readFields(DataInput in) throws IOException { 130 | userid = in.readLong(); 131 | start = in.readLong(); 132 | end = in.readLong(); 133 | nhits = in.readInt(); 134 | age = in.readInt(); 135 | isMale = in.readInt(); 136 | } 137 | 138 | public void write(DataOutput out) throws IOException { 139 | out.writeLong(userid); 140 | out.writeLong(start); 141 | out.writeLong(end); 142 | out.writeInt(nhits); 143 | out.writeInt(age); 144 | out.writeInt(isMale); 145 | } 146 | } 147 | 148 | public static class Reduce 149 | extends MapReduceBase 150 | implements Reducer { 151 | 152 | private Policy policy; 153 | private AerospikeClient client; 154 | private String namespace; 155 | private String setName; 156 | 157 | @Override 158 | public void configure(JobConf job) { 159 | String host = AerospikeConfigUtil.getInputHost(job); 160 | int port = AerospikeConfigUtil.getInputPort(job); 161 | 162 | policy = new Policy(); 163 | policy.totalTimeout = 10000; 164 | client = new AerospikeClient(host, port); 165 | 166 | namespace = AerospikeConfigUtil.getInputNamespace(job); 167 | setName = AerospikeConfigUtil.getInputSetName(job); 168 | } 169 | 170 | @Override 171 | public void close() { 172 | client.close(); 173 | } 174 | 175 | public void reduce(LongWritable userid, 176 | Iterator tstamps, 177 | OutputCollector output, 178 | Reporter reporter 179 | ) throws IOException { 180 | 181 | // Copy the iterator to an array. 182 | ArrayList tsarray = new ArrayList(); 183 | while (tstamps.hasNext()) 184 | tsarray.add(new LongWritable(tstamps.next().get())); 185 | 186 | // Sort the timestamps. 187 | Collections.sort(tsarray); 188 | 189 | // Scan the array looking for session boundaries. 
190 | long t0 = 0; 191 | long session_start = 0; 192 | long session_end = 0; 193 | int session_hits = 0; 194 | for (LongWritable tstamp: tsarray) { 195 | long tt = tstamp.get(); 196 | 197 | // How long since the prior hit? 198 | long delta = tt - t0; 199 | 200 | // Is this a new session? 201 | if (delta > SESSION_GAP_MSEC) { 202 | 203 | // Is there a prior session? 204 | if (session_start != 0) 205 | collect_session(userid.get(), session_start, session_end, 206 | session_hits, output); 207 | 208 | // Reset for the new session. 209 | session_start = tt; 210 | session_hits = 0; 211 | } 212 | 213 | // Extend the current session. 214 | session_hits += 1; 215 | session_end = tt; 216 | 217 | // On to the next hit ... 218 | t0 = tt; 219 | } 220 | 221 | // Write out the last session. 222 | if (session_start != 0) 223 | collect_session(userid.get(), session_start, session_end, 224 | session_hits, output); 225 | } 226 | 227 | private void collect_session(long userid, long start, 228 | long end, int nhits, 229 | OutputCollector output) 230 | throws IOException { 231 | 232 | Key kk = new Key(namespace, setName, userid); 233 | Record rec = client.get(policy, kk); 234 | 235 | int age = (Integer) rec.bins.get("age"); 236 | int isMale = (Integer) rec.bins.get("isMale"); 237 | 238 | try { 239 | // Generate a sessionid from the hash of the userid and start. 
240 | MessageDigest md = MessageDigest.getInstance("SHA-256"); 241 | md.update(ByteBuffer.allocate(8).putLong(userid).array()); 242 | md.update(ByteBuffer.allocate(8).putLong(start).array()); 243 | String sessid = Hex.encodeHexString(md.digest()).substring(0,16); 244 | 245 | Session session = 246 | new Session(userid, start, end, nhits, age, isMale); 247 | 248 | output.collect(new Text(sessid), session); 249 | } 250 | catch (NoSuchAlgorithmException ex) { 251 | throw new RuntimeException(ex); 252 | } 253 | } 254 | } 255 | 256 | public static class SessionOutputFormat 257 | extends AerospikeOutputFormat { 258 | 259 | public static class SessionRecordWriter 260 | extends AerospikeRecordWriter { 261 | 262 | public SessionRecordWriter(Configuration cfg, 263 | Progressable progressable) { 264 | super(cfg); 265 | } 266 | 267 | @Override 268 | public void writeAerospike(Text sessid, 269 | Session session, 270 | AerospikeClient client, 271 | WritePolicy writePolicy, 272 | String namespace, 273 | String setName) throws IOException { 274 | writePolicy.totalTimeout = 10000; 275 | Key kk = new Key(namespace, setName, sessid.toString()); 276 | Bin bin0 = new Bin("userid", session.userid); 277 | Bin bin1 = new Bin("start", session.start); 278 | Bin bin2 = new Bin("end", session.end); 279 | Bin bin3 = new Bin("nhits", session.nhits); 280 | Bin bin4 = new Bin("age", session.age); 281 | Bin bin5 = new Bin("isMale", session.isMale); 282 | client.put(writePolicy, kk, bin0, bin1, bin2, bin3, bin4, bin5); 283 | } 284 | } 285 | 286 | public RecordWriter 287 | getAerospikeRecordWriter(Configuration conf, Progressable prog) { 288 | return new SessionRecordWriter(conf, prog); 289 | } 290 | } 291 | 292 | public int run(final String[] args) throws Exception { 293 | 294 | log.info("run starting"); 295 | 296 | final Configuration conf = getConf(); 297 | 298 | JobConf job = new JobConf(conf, ExternalJoin.class); 299 | job.setJobName("AerospikeExternalJoin"); 300 | 301 | 
job.setMapperClass(Map.class); 302 | job.setMapOutputKeyClass(LongWritable.class); 303 | job.setMapOutputValueClass(LongWritable.class); 304 | // job.setCombinerClass(Reduce.class); // Reduce changes format. 305 | job.setReducerClass(Reduce.class); 306 | job.setOutputKeyClass(Text.class); 307 | job.setOutputValueClass(Session.class); 308 | 309 | job.setOutputFormat(SessionOutputFormat.class); 310 | 311 | for (int ii = 0; ii < args.length; ++ii) 312 | FileInputFormat.addInputPath(job, new Path(args[ii])); 313 | 314 | JobClient.runJob(job); 315 | 316 | log.info("finished"); 317 | return 0; 318 | } 319 | 320 | public static void main(final String[] args) throws Exception { 321 | System.exit(ToolRunner.run(new ExternalJoin(), args)); 322 | } 323 | } 324 | 325 | // Local Variables: 326 | // mode: java 327 | // c-basic-offset: 4 328 | // tab-width: 4 329 | // indent-tabs-mode: nil 330 | // End: 331 | // vim: softtabstop=4:shiftwidth=4:expandtab 332 | -------------------------------------------------------------------------------- /mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeRecordReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Aerospike, Inc. 3 | * 4 | * Portions may be licensed to Aerospike, Inc. under one or more 5 | * contributor license agreements. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); you 8 | * may not use this file except in compliance with the License. You 9 | * may obtain a copy of the License at 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | * implied. See the License for the specific language governing 16 | * permissions and limitations under the License. 
17 | */ 18 | 19 | package com.aerospike.hadoop.mapreduce; 20 | 21 | import java.io.IOException; 22 | import java.util.concurrent.ArrayBlockingQueue; 23 | 24 | import org.apache.commons.logging.Log; 25 | import org.apache.commons.logging.LogFactory; 26 | import org.apache.hadoop.mapreduce.InputSplit; 27 | import org.apache.hadoop.mapreduce.RecordReader; 28 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 29 | 30 | import com.aerospike.client.AerospikeClient; 31 | import com.aerospike.client.AerospikeException; 32 | import com.aerospike.client.AerospikeException.ScanTerminated; 33 | import com.aerospike.client.Key; 34 | import com.aerospike.client.Record; 35 | import com.aerospike.client.ScanCallback; 36 | import com.aerospike.client.policy.ClientPolicy; 37 | import com.aerospike.client.policy.QueryPolicy; 38 | import com.aerospike.client.policy.ScanPolicy; 39 | import com.aerospike.client.query.Filter; 40 | import com.aerospike.client.query.RecordSet; 41 | import com.aerospike.client.query.Statement; 42 | 43 | public class AerospikeRecordReader 44 | extends RecordReader 45 | implements org.apache.hadoop.mapred.RecordReader { 47 | 48 | private class KeyRecPair { 49 | public AerospikeKey key; 50 | public AerospikeRecord rec; 51 | public KeyRecPair(AerospikeKey key, AerospikeRecord rec) { 52 | this.key = key; 53 | this.rec = rec; 54 | } 55 | } 56 | 57 | private static final Log log = 58 | LogFactory.getLog(AerospikeRecordReader.class); 59 | 60 | private ASSCanReader scanReader = null; 61 | private ASQueryReader queryReader = null; 62 | 63 | private ArrayBlockingQueue queue = 64 | new ArrayBlockingQueue(16 * 1024); 65 | 66 | private boolean isFinished = false; 67 | private boolean isError = false; 68 | private boolean isRunning = false; 69 | private String numrangeBin; 70 | private long numrangeBegin; 71 | private long numrangeEnd; 72 | private int scanPercent; 73 | 74 | private AerospikeKey currentKey; 75 | private AerospikeRecord currentValue; 76 | 77 | public 
class CallBack implements ScanCallback { 78 | public void scanCallback(Key key, Record record) 79 | throws AerospikeException { 80 | try { 81 | queue.put(new KeyRecPair(new AerospikeKey(key), 82 | new AerospikeRecord(record))); 83 | } catch (Exception ex) { 84 | throw new ScanTerminated(ex); 85 | } 86 | } 87 | } 88 | 89 | public class ASSCanReader extends java.lang.Thread { 90 | 91 | String node; 92 | String host; 93 | int port; 94 | String namespace; 95 | String setName; 96 | String[] binNames; 97 | int scanPercent; 98 | 99 | ASSCanReader(String node, String host, int port, 100 | String ns, String setName, String[] binNames, int scanPercent) { 101 | this.node = node; 102 | this.host = host; 103 | this.port = port; 104 | this.namespace = ns; 105 | this.setName = setName; 106 | this.binNames = binNames; 107 | this.scanPercent = scanPercent; 108 | } 109 | 110 | public void run() { 111 | try { 112 | AerospikeClient client = 113 | AerospikeClientSingleton.getInstance(new ClientPolicy(), 114 | host, port); 115 | 116 | log.info(String.format("scanNode %s:%d:%s:%s", 117 | host, port, namespace, setName)); 118 | ScanPolicy scanPolicy = new ScanPolicy(); 119 | scanPolicy.scanPercent = scanPercent; 120 | CallBack cb = new CallBack(); 121 | log.info("scan starting with scan percent: " + scanPolicy.scanPercent + "%"); 122 | isRunning = true; 123 | if (binNames != null) 124 | client.scanNode(scanPolicy, node, namespace, setName, 125 | cb, binNames); 126 | else 127 | client.scanNode(scanPolicy, node, namespace, setName, 128 | cb); 129 | isFinished = true; 130 | log.info("scan finished"); 131 | } 132 | catch (Exception ex) { 133 | log.error("exception in ASSCanReader.run: " + ex); 134 | isError = true; 135 | return; 136 | } 137 | } 138 | } 139 | 140 | public class ASQueryReader extends java.lang.Thread { 141 | 142 | String node; 143 | String host; 144 | int port; 145 | String namespace; 146 | String setName; 147 | String[] binNames; 148 | String numrangeBin; 149 | long 
numrangeBegin; 150 | long numrangeEnd; 151 | 152 | ASQueryReader(String node, String host, int port, 153 | String ns, String setName, String[] binNames, 154 | String numrangeBin, long numrangeBegin, long numrangeEnd) { 155 | this.node = node; 156 | this.host = host; 157 | this.port = port; 158 | this.namespace = ns; 159 | this.setName = setName; 160 | this.binNames = binNames; 161 | this.numrangeBin = numrangeBin; 162 | this.numrangeBegin = numrangeBegin; 163 | this.numrangeEnd = numrangeEnd; 164 | } 165 | 166 | public void run() { 167 | try { 168 | AerospikeClient client = 169 | AerospikeClientSingleton.getInstance(new ClientPolicy(), 170 | host, port); 171 | log.info(String.format("queryNode %s:%d %s:%s:%s[%d:%d]", 172 | host, port, namespace, setName, 173 | numrangeBin, numrangeBegin, 174 | numrangeEnd)); 175 | Statement stmt = new Statement(); 176 | stmt.setNamespace(namespace); 177 | stmt.setSetName(setName); 178 | stmt.setFilters(Filter.range(numrangeBin, 179 | numrangeBegin, 180 | numrangeEnd)); 181 | if (binNames != null) 182 | stmt.setBinNames(binNames); 183 | QueryPolicy queryPolicy = new QueryPolicy(); 184 | RecordSet rs = client.queryNode(queryPolicy, 185 | stmt, 186 | client.getNode(node)); 187 | isRunning = true; 188 | try { 189 | log.info("query starting"); 190 | while (rs.next()) { 191 | Key key = rs.getKey(); 192 | Record record = rs.getRecord(); 193 | queue.put(new KeyRecPair(new AerospikeKey(key), 194 | new AerospikeRecord(record))); 195 | } 196 | } 197 | finally { 198 | rs.close(); 199 | isFinished = true; 200 | log.info("query finished"); 201 | } 202 | } 203 | catch (Exception ex) { 204 | isError = true; 205 | return; 206 | } 207 | } 208 | } 209 | 210 | public AerospikeRecordReader() 211 | throws IOException { 212 | log.info("NEW CTOR"); 213 | } 214 | 215 | public AerospikeRecordReader(AerospikeSplit split) 216 | throws IOException { 217 | log.info("OLD CTOR"); 218 | init(split); 219 | } 220 | 221 | public void init(AerospikeSplit split) 222 | 
throws IOException { 223 | final String type = split.getType(); 224 | final String node = split.getNode(); 225 | final String host = split.getHost(); 226 | final int port = split.getPort(); 227 | final String namespace = split.getNameSpace(); 228 | final String setName = split.getSetName(); 229 | final String[] binNames = split.getBinNames(); 230 | this.numrangeBin = split.getNumRangeBin(); 231 | this.numrangeBegin = split.getNumRangeBegin(); 232 | this.numrangeEnd = split.getNumRangeEnd(); 233 | this.scanPercent = split.getScanPercent(); 234 | 235 | if (type.equals("scan")) { 236 | scanReader = new ASSCanReader(node, host, port, namespace, 237 | setName, binNames, scanPercent); 238 | scanReader.start(); 239 | } else if (type.equals("numrange")) { 240 | queryReader = new ASQueryReader(node, host, port, namespace, 241 | setName, binNames, numrangeBin, 242 | numrangeBegin, numrangeEnd); 243 | queryReader.start(); 244 | } 245 | 246 | log.info("node: " + node); 247 | } 248 | 249 | public AerospikeKey createKey() { return new AerospikeKey(); } 250 | 251 | public AerospikeRecord createValue() { return new AerospikeRecord(); } 252 | 253 | protected AerospikeKey setCurrentKey(AerospikeKey oldApiKey, 254 | AerospikeKey newApiKey, 255 | AerospikeKey keyval) { 256 | 257 | if (oldApiKey == null) { 258 | oldApiKey = new AerospikeKey(); 259 | oldApiKey.set(keyval); 260 | } 261 | 262 | // new API might not be used 263 | if (newApiKey != null) { 264 | newApiKey.set(keyval); 265 | } 266 | return oldApiKey; 267 | } 268 | 269 | protected AerospikeRecord setCurrentValue(AerospikeRecord oldApiVal, 270 | AerospikeRecord newApiVal, 271 | AerospikeRecord val) { 272 | if (oldApiVal == null) { 273 | oldApiVal = new AerospikeRecord(); 274 | oldApiVal.set(val); 275 | } 276 | 277 | // new API might not be used 278 | if (newApiVal != null) { 279 | newApiVal.set(val); 280 | } 281 | return oldApiVal; 282 | } 283 | 284 | public synchronized boolean next(AerospikeKey key, AerospikeRecord value) 285 
| throws IOException { 286 | 287 | final int waitMSec = 1000; 288 | int trials = 5; 289 | 290 | try { 291 | KeyRecPair pair; 292 | while (true) { 293 | if (isError) 294 | return false; 295 | 296 | if (!isRunning) { 297 | Thread.sleep(100); 298 | continue; 299 | } 300 | 301 | if (!isFinished && queue.size() == 0) { 302 | if (trials == 0) { 303 | log.error("SCAN TIMEOUT"); 304 | return false; 305 | } 306 | log.info("queue empty: waiting..."); 307 | Thread.sleep(waitMSec); 308 | trials--; 309 | } else if (isFinished && queue.size() == 0) { 310 | return false; 311 | } else if (queue.size() != 0) { 312 | pair = queue.take(); 313 | break; 314 | } 315 | } 316 | 317 | // log.info("key=" + pair.key + ", val=" + pair.rec); 318 | 319 | currentKey = setCurrentKey(currentKey, key, pair.key); 320 | currentValue = setCurrentValue(currentValue, value, pair.rec); 321 | } 322 | catch (Exception ex) { 323 | log.error("exception in AerospikeRecordReader.next: " + ex); 324 | throw new IOException("exception in AerospikeRecordReader.next", ex); 325 | } 326 | return true; 327 | } 328 | 329 | public float getProgress() { 330 | if (isFinished) 331 | return 1.0f; 332 | else 333 | return 0.0f; 334 | } 335 | 336 | public synchronized long getPos() throws IOException { 337 | return 0; 338 | } 339 | 340 | public synchronized void close() throws IOException { 341 | if (scanReader != null) { 342 | try { 343 | scanReader.join(); 344 | } 345 | catch (Exception ex) { 346 | throw new IOException("exception in AerospikeRecordReader.close", 347 | ex); 348 | } 349 | scanReader = null; 350 | } 351 | if (queryReader != null) { 352 | try { 353 | queryReader.join(); 354 | } 355 | catch (Exception ex) { 356 | throw new IOException("exception in AerospikeRecordReader.close", 357 | ex); 358 | } 359 | queryReader = null; 360 | } 361 | } 362 | 363 | // ---------------- NEW API ---------------- 364 | 365 | @Override 366 | public void initialize(InputSplit split, TaskAttemptContext context) 367 | throws 
IOException { 368 | log.info("INITIALIZE"); 369 | init((AerospikeSplit) split); 370 | } 371 | 372 | @Override 373 | public boolean nextKeyValue() throws IOException { 374 | // new API call routed to old API 375 | if (currentKey == null) { 376 | currentKey = createKey(); 377 | } 378 | if (currentValue == null) { 379 | currentValue = createValue(); 380 | } 381 | 382 | // FIXME: does the new API mandate a new instance each time (?) 383 | return next(currentKey, currentValue); 384 | } 385 | 386 | @Override 387 | public AerospikeKey getCurrentKey() throws IOException { 388 | return currentKey; 389 | } 390 | 391 | @Override 392 | public AerospikeRecord getCurrentValue() { 393 | return currentValue; 394 | } 395 | } 396 | 397 | // Local Variables: 398 | // mode: java 399 | // c-basic-offset: 4 400 | // tab-width: 4 401 | // indent-tabs-mode: nil 402 | // End: 403 | // vim: softtabstop=4:shiftwidth=4:expandtab 404 | --------------------------------------------------------------------------------