├── gradle
│   └── wrapper
│       ├── gradle-wrapper.jar
│       └── gradle-wrapper.properties
├── .gitignore
├── mapreduce
│   ├── build.gradle
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── aerospike
│                       └── hadoop
│                           └── mapreduce
│                               ├── AerospikeConfigEnum.java
│                               ├── AerospikeClientSingleton.java
│                               ├── AerospikeLogger.java
│                               ├── AerospikeRecordWriter.java
│                               ├── AerospikeRecord.java
│                               ├── AerospikeKey.java
│                               ├── AerospikeSplit.java
│                               ├── AerospikeOutputFormat.java
│                               ├── AerospikeInputFormat.java
│                               ├── AerospikeConfigUtil.java
│                               └── AerospikeRecordReader.java
├── settings.gradle
├── examples
│   ├── external_join
│   │   ├── build.gradle
│   │   ├── src
│   │   │   └── main
│   │   │       ├── resources
│   │   │       │   ├── log4j.properties
│   │   │       │   └── commons-logging.properties
│   │   │       └── java
│   │   │           └── com
│   │   │               └── aerospike
│   │   │                   └── hadoop
│   │   │                       └── examples
│   │   │                           └── externaljoin
│   │   │                               └── ExternalJoin.java
│   │   └── pom.xml
│   ├── session_rollup
│   │   ├── build.gradle
│   │   ├── src
│   │   │   └── main
│   │   │       ├── resources
│   │   │       │   ├── log4j.properties
│   │   │       │   └── commons-logging.properties
│   │   │       └── java
│   │   │           └── com
│   │   │               └── aerospike
│   │   │                   └── hadoop
│   │   │                       └── examples
│   │   │                           └── sessionrollup
│   │   │                               └── SessionRollup.java
│   │   └── pom.xml
│   ├── word_count_input
│   │   ├── build.gradle
│   │   ├── src
│   │   │   └── main
│   │   │       ├── resources
│   │   │       │   ├── log4j.properties
│   │   │       │   └── commons-logging.properties
│   │   │       └── java
│   │   │           └── com
│   │   │               └── aerospike
│   │   │                   └── hadoop
│   │   │                       └── examples
│   │   │                           └── wordcountinput
│   │   │                               └── WordCountInput.java
│   │   └── pom.xml
│   ├── aggregate_int_input
│   │   ├── src
│   │   │   └── main
│   │   │       ├── resources
│   │   │       │   ├── log4j.properties
│   │   │       │   └── commons-logging.properties
│   │   │       └── java
│   │   │           └── com
│   │   │               └── aerospike
│   │   │                   └── hadoop
│   │   │                       └── examples
│   │   │                           └── aggregateintinput
│   │   │                               └── AggregateIntInput.java
│   │   ├── build.gradle
│   │   └── pom.xml
│   ├── generate_profiles
│   │   ├── src
│   │   │   └── main
│   │   │       ├── resources
│   │   │       │   ├── log4j.properties
│   │   │       │   └── commons-logging.properties
│   │   │       └── java
│   │   │           └── com
│   │   │               └── aerospike
│   │   │                   └── hadoop
│   │   │                       └── examples
│   │   │                           └── generateprofiles
│   │   │                               └── GenerateProfiles.java
│   │   ├── build.gradle
│   │   └── pom.xml
│   ├── word_count_output
│   │   ├── build.gradle
│   │   ├── src
│   │   │   └── main
│   │   │       ├── resources
│   │   │       │   ├── log4j.properties
│   │   │       │   └── commons-logging.properties
│   │   │       └── java
│   │   │           └── com
│   │   │               └── aerospike
│   │   │                   └── hadoop
│   │   │                       └── examples
│   │   │                           └── wordcountoutput
│   │   │                               └── WordCountOutput.java
│   │   └── pom.xml
│   ├── build.gradle
│   ├── spark_session_rollup
│   │   ├── build.gradle
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           └── java
│   │               └── com
│   │                   └── aerospike
│   │                       └── spark
│   │                           └── examples
│   │                               └── SparkSessionRollup.java
│   └── pom.xml
├── sampledata
│   ├── src
│   │   └── main
│   │       ├── resources
│   │       │   ├── log4j.properties
│   │       │   └── commons-logging.properties
│   │       └── java
│   │           └── com
│   │               └── aerospike
│   │                   └── hadoop
│   │                       └── sampledata
│   │                           └── SampleData.java
│   ├── build.gradle
│   └── pom.xml
├── TODO.md
├── pom.xml
├── gradlew.bat
├── gradlew
├── WORLDCUP_FILELIST
├── README.md
└── LICENSE
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aerospike-community/aerospike-hadoop/HEAD/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .gradle
2 | bin
3 | build
4 | target
5 | .settings
6 | .classpath
7 | .project
8 | *.iml
9 | *.ipr
10 | *.iws
11 | *.log
12 | metastore_db
13 | .idea
14 |
15 | # Ignore Gradle GUI config
16 | gradle-app.setting
17 |
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | #Wed Feb 12 07:28:02 CST 2014
2 | distributionBase=GRADLE_USER_HOME
3 | distributionPath=wrapper/dists
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 | distributionUrl=http\://services.gradle.org/distributions/gradle-1.11-bin.zip
7 |
--------------------------------------------------------------------------------
/mapreduce/build.gradle:
--------------------------------------------------------------------------------
1 | apply plugin: 'java'
2 | apply plugin: 'application'
3 |
4 | dependencies {
5 | compile "com.aerospike:aerospike-client:3.3.0"
6 | compile "org.apache.hadoop:hadoop-common:2.7.2"
7 | compile "org.apache.hadoop:hadoop-mapreduce-client-jobclient:2.7.2"
8 | }
9 |
10 |
--------------------------------------------------------------------------------
/settings.gradle:
--------------------------------------------------------------------------------
1 |
2 | include ':mapreduce'
3 | include ':sampledata'
4 | include ':examples:word_count_input'
5 | include ':examples:aggregate_int_input'
6 | include ':examples:word_count_output'
7 | include ':examples:session_rollup'
8 | include ':examples:generate_profiles'
9 | include ':examples:external_join'
10 | include ':examples:spark_session_rollup'
11 |
--------------------------------------------------------------------------------
/examples/external_join/build.gradle:
--------------------------------------------------------------------------------
1 | apply plugin: 'java'
2 | apply plugin: 'application'
3 |
4 | mainClassName = 'com.aerospike.hadoop.examples.externaljoin.ExternalJoin'
5 |
6 | jar {
7 | manifest {
8 | attributes 'Main-Class': 'com.aerospike.hadoop.examples.externaljoin.ExternalJoin'
9 | }
10 | from configurations.compile.collect { it.isDirectory() ? it : zipTree(it) }
11 | }
12 |
--------------------------------------------------------------------------------
/sampledata/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | General
2 | ----------------------------------------------------------------
3 |
4 | * Switch all builds to Maven?
5 |
6 | * Simple torture testing.
7 |
8 | * Sunil's object all the way through example.
9 |
10 | * Add docs to www.aerospike.com website.
11 |
12 | * Benchmark vs HDFS.
13 |
14 | ----------------
15 |
16 | * LDT support.
17 |
18 | * Hive example.
19 |
20 | * Another example.
21 |
--------------------------------------------------------------------------------
/examples/session_rollup/build.gradle:
--------------------------------------------------------------------------------
1 | apply plugin: 'java'
2 | apply plugin: 'application'
3 |
4 | mainClassName = 'com.aerospike.hadoop.examples.sessionrollup.SessionRollup'
5 |
6 | jar {
7 | manifest {
8 | attributes 'Main-Class': 'com.aerospike.hadoop.examples.sessionrollup.SessionRollup'
9 | }
10 | from configurations.compile.collect { it.isDirectory() ? it : zipTree(it) }
11 | }
12 |
--------------------------------------------------------------------------------
/examples/external_join/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/examples/session_rollup/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/examples/word_count_input/build.gradle:
--------------------------------------------------------------------------------
1 | apply plugin: 'java'
2 | apply plugin: 'application'
3 |
4 | mainClassName = 'com.aerospike.hadoop.examples.wordcountinput.WordCountInput'
5 |
6 | jar {
7 | manifest {
8 | attributes 'Main-Class': 'com.aerospike.hadoop.examples.wordcountinput.WordCountInput'
9 | }
10 | from configurations.compile.collect { it.isDirectory() ? it : zipTree(it) }
11 | }
12 |
--------------------------------------------------------------------------------
/examples/aggregate_int_input/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/examples/generate_profiles/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/examples/word_count_input/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/examples/word_count_output/build.gradle:
--------------------------------------------------------------------------------
1 | apply plugin: 'java'
2 | apply plugin: 'application'
3 |
4 | mainClassName = 'com.aerospike.hadoop.examples.wordcountoutput.WordCountOutput'
5 |
6 | jar {
7 | manifest {
8 | attributes 'Main-Class': 'com.aerospike.hadoop.examples.wordcountoutput.WordCountOutput'
9 | }
10 | from configurations.compile.collect { it.isDirectory() ? it : zipTree(it) }
11 | }
12 |
--------------------------------------------------------------------------------
/examples/word_count_output/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %-5p %c{1}:%L - %m%n
9 |
--------------------------------------------------------------------------------
/examples/generate_profiles/build.gradle:
--------------------------------------------------------------------------------
1 | apply plugin: 'java'
2 | apply plugin: 'application'
3 |
4 | mainClassName = 'com.aerospike.hadoop.examples.generateprofiles.GenerateProfiles'
5 |
6 | jar {
7 | manifest {
8 | attributes 'Main-Class': 'com.aerospike.hadoop.examples.generateprofiles.GenerateProfiles'
9 | }
10 | from configurations.compile.collect { it.isDirectory() ? it : zipTree(it) }
11 | }
12 |
--------------------------------------------------------------------------------
/examples/aggregate_int_input/build.gradle:
--------------------------------------------------------------------------------
1 | apply plugin: 'java'
2 | apply plugin: 'application'
3 |
4 | mainClassName = 'com.aerospike.hadoop.examples.aggregateintinput.AggregateIntInput'
5 |
6 | jar {
7 | manifest {
8 | attributes 'Main-Class': 'com.aerospike.hadoop.examples.aggregateintinput.AggregateIntInput'
9 | }
10 | from configurations.compile.collect { it.isDirectory() ? it : zipTree(it) }
11 | }
12 |
--------------------------------------------------------------------------------
/examples/build.gradle:
--------------------------------------------------------------------------------
1 | apply plugin:'java'
2 |
3 | subprojects{
4 | apply plugin:'java'
5 |
6 | dependencies {
7 | compile project(':mapreduce')
8 | compile "com.aerospike:aerospike-client:3.3.0"
9 | compile "org.apache.hadoop:hadoop-common:2.7.2"
10 | compile "org.apache.hadoop:hadoop-mapreduce-client-jobclient:2.7.2"
11 | compile "joda-time:joda-time:2.5"
12 | compile "org.json:json:20140107"
13 | }
14 | }
15 |
16 |
--------------------------------------------------------------------------------
/examples/spark_session_rollup/build.gradle:
--------------------------------------------------------------------------------
1 | apply plugin: 'java'
2 | apply plugin: 'application'
3 |
4 | mainClassName = 'com.aerospike.spark.examples.SparkSessionRollup'
5 |
6 | repositories {
7 | mavenCentral()
8 | }
9 |
10 | dependencies {
11 | compile "org.apache.spark:spark-core_2.10:1.1.0"
12 | }
13 |
14 | jar {
15 | manifest {
16 | attributes 'Main-Class': 'com.aerospike.spark.examples.SparkSessionRollup'
17 | }
18 | from configurations.compile.collect { it.isDirectory() ? it : zipTree(it) }
19 | exclude 'META-INF/*.RSA', 'META-INF/*.SF','META-INF/*.DSA'
20 | }
21 |
--------------------------------------------------------------------------------
/sampledata/build.gradle:
--------------------------------------------------------------------------------
1 | apply plugin: 'java'
2 | apply plugin: 'application'
3 |
4 | mainClassName = 'com.aerospike.hadoop.sampledata.SampleData'
5 |
6 | repositories {
7 | mavenCentral()
8 | }
9 |
10 | dependencies {
11 | compile "com.aerospike:aerospike-client:3.3.0"
12 | compile "org.apache.hadoop:hadoop-common:2.7.2"
13 | }
14 |
15 | run {
16 | if ( project.hasProperty("appArgs") ) {
17 | args Eval.me(appArgs)
18 | }
19 | }
20 |
21 | jar {
22 | manifest {
23 | attributes 'Main-Class': 'com.aerospike.hadoop.sampledata.SampleData'
24 | }
25 | from configurations.compile.collect { it.isDirectory() ? it : zipTree(it) }
26 | }
--------------------------------------------------------------------------------
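
Note: the `run` block in this build file forwards command-line arguments only when the `appArgs` project property is set; `Eval.me` evaluates the property as a Groovy list literal. An invocation would therefore look something like `./gradlew :sampledata:run -PappArgs="['localhost', 3000]"` — the argument values here are illustrative placeholders, not taken from this repository.
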
/mapreduce/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 |
3 | <project xmlns="http://maven.apache.org/POM/4.0.0"
4 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
5 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
6 |   <modelVersion>4.0.0</modelVersion>
7 |   <artifactId>aerospike-mapreduce</artifactId>
8 |
9 |   <parent>
10 |     <groupId>com.aerospike</groupId>
11 |     <artifactId>aerospike-hadoop-parent</artifactId>
12 |     <version>1.1.0-SNAPSHOT</version>
13 |   </parent>
14 |
15 |   <dependencies>
16 |     <dependency>
17 |       <groupId>org.apache.hadoop</groupId>
18 |       <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
19 |       <version>2.7.2</version>
20 |       <scope>compile</scope>
21 |     </dependency>
22 |   </dependencies>
23 | </project>
24 |
--------------------------------------------------------------------------------
/sampledata/src/main/resources/commons-logging.properties:
--------------------------------------------------------------------------------
1 | # commons-logging.properties
2 | # jdk handlers
3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler
4 |
5 | # default log level
6 | .level=DEBUG
7 |
8 | # Specific logger level
9 | #MyClassLogger.level=FINE
10 |
11 | # FileHandler options - can also be set to the ConsoleHandler
12 | # FileHandler level can be set to override the global level:
13 | #java.util.logging.FileHandler.level=WARN
14 |
15 | # log file name for the File Handler
16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log
17 |
18 | # Specify the style of output (simple or xml)
19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter
20 |
21 | # Optional - Limit the size of the file (in bytes)
22 | java.util.logging.FileHandler.limit=50000
23 |
24 | # Optional - The number of files to cycle through, by
25 | # appending an integer to the base file name:
26 | java.util.logging.FileHandler.count=1
27 |
--------------------------------------------------------------------------------
/examples/external_join/src/main/resources/commons-logging.properties:
--------------------------------------------------------------------------------
1 | # commons-logging.properties
2 | # jdk handlers
3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler
4 |
5 | # default log level
6 | .level=DEBUG
7 |
8 | # Specific logger level
9 | #MyClassLogger.level=FINE
10 |
11 | # FileHandler options - can also be set to the ConsoleHandler
12 | # FileHandler level can be set to override the global level:
13 | #java.util.logging.FileHandler.level=WARN
14 |
15 | # log file name for the File Handler
16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log
17 |
18 | # Specify the style of output (simple or xml)
19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter
20 |
21 | # Optional - Limit the size of the file (in bytes)
22 | java.util.logging.FileHandler.limit=50000
23 |
24 | # Optional - The number of files to cycle through, by
25 | # appending an integer to the base file name:
26 | java.util.logging.FileHandler.count=1
27 |
--------------------------------------------------------------------------------
/examples/session_rollup/src/main/resources/commons-logging.properties:
--------------------------------------------------------------------------------
1 | # commons-logging.properties
2 | # jdk handlers
3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler
4 |
5 | # default log level
6 | .level=DEBUG
7 |
8 | # Specific logger level
9 | #MyClassLogger.level=FINE
10 |
11 | # FileHandler options - can also be set to the ConsoleHandler
12 | # FileHandler level can be set to override the global level:
13 | #java.util.logging.FileHandler.level=WARN
14 |
15 | # log file name for the File Handler
16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log
17 |
18 | # Specify the style of output (simple or xml)
19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter
20 |
21 | # Optional - Limit the size of the file (in bytes)
22 | java.util.logging.FileHandler.limit=50000
23 |
24 | # Optional - The number of files to cycle through, by
25 | # appending an integer to the base file name:
26 | java.util.logging.FileHandler.count=1
27 |
--------------------------------------------------------------------------------
/examples/aggregate_int_input/src/main/resources/commons-logging.properties:
--------------------------------------------------------------------------------
1 | # commons-logging.properties
2 | # jdk handlers
3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler
4 |
5 | # default log level
6 | .level=DEBUG
7 |
8 | # Specific logger level
9 | #MyClassLogger.level=FINE
10 |
11 | # FileHandler options - can also be set to the ConsoleHandler
12 | # FileHandler level can be set to override the global level:
13 | #java.util.logging.FileHandler.level=WARN
14 |
15 | # log file name for the File Handler
16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log
17 |
18 | # Specify the style of output (simple or xml)
19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter
20 |
21 | # Optional - Limit the size of the file (in bytes)
22 | java.util.logging.FileHandler.limit=50000
23 |
24 | # Optional - The number of files to cycle through, by
25 | # appending an integer to the base file name:
26 | java.util.logging.FileHandler.count=1
27 |
--------------------------------------------------------------------------------
/examples/generate_profiles/src/main/resources/commons-logging.properties:
--------------------------------------------------------------------------------
1 | # commons-logging.properties
2 | # jdk handlers
3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler
4 |
5 | # default log level
6 | .level=DEBUG
7 |
8 | # Specific logger level
9 | #MyClassLogger.level=FINE
10 |
11 | # FileHandler options - can also be set to the ConsoleHandler
12 | # FileHandler level can be set to override the global level:
13 | #java.util.logging.FileHandler.level=WARN
14 |
15 | # log file name for the File Handler
16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log
17 |
18 | # Specify the style of output (simple or xml)
19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter
20 |
21 | # Optional - Limit the size of the file (in bytes)
22 | java.util.logging.FileHandler.limit=50000
23 |
24 | # Optional - The number of files to cycle through, by
25 | # appending an integer to the base file name:
26 | java.util.logging.FileHandler.count=1
27 |
--------------------------------------------------------------------------------
/examples/word_count_input/src/main/resources/commons-logging.properties:
--------------------------------------------------------------------------------
1 | # commons-logging.properties
2 | # jdk handlers
3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler
4 |
5 | # default log level
6 | .level=DEBUG
7 |
8 | # Specific logger level
9 | #MyClassLogger.level=FINE
10 |
11 | # FileHandler options - can also be set to the ConsoleHandler
12 | # FileHandler level can be set to override the global level:
13 | #java.util.logging.FileHandler.level=WARN
14 |
15 | # log file name for the File Handler
16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log
17 |
18 | # Specify the style of output (simple or xml)
19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter
20 |
21 | # Optional - Limit the size of the file (in bytes)
22 | java.util.logging.FileHandler.limit=50000
23 |
24 | # Optional - The number of files to cycle through, by
25 | # appending an integer to the base file name:
26 | java.util.logging.FileHandler.count=1
27 |
--------------------------------------------------------------------------------
/examples/word_count_output/src/main/resources/commons-logging.properties:
--------------------------------------------------------------------------------
1 | # commons-logging.properties
2 | # jdk handlers
3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler
4 |
5 | # default log level
6 | .level=DEBUG
7 |
8 | # Specific logger level
9 | #MyClassLogger.level=FINE
10 |
11 | # FileHandler options - can also be set to the ConsoleHandler
12 | # FileHandler level can be set to override the global level:
13 | #java.util.logging.FileHandler.level=WARN
14 |
15 | # log file name for the File Handler
16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log
17 |
18 | # Specify the style of output (simple or xml)
19 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter
20 |
21 | # Optional - Limit the size of the file (in bytes)
22 | java.util.logging.FileHandler.limit=50000
23 |
24 | # Optional - The number of files to cycle through, by
25 | # appending an integer to the base file name:
26 | java.util.logging.FileHandler.count=1
27 |
--------------------------------------------------------------------------------
/examples/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 |
3 | <project xmlns="http://maven.apache.org/POM/4.0.0"
4 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
5 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
6 |   <modelVersion>4.0.0</modelVersion>
7 |   <artifactId>aerospike-hadoop-examples</artifactId>
8 |   <packaging>pom</packaging>
9 |
10 |   <parent>
11 |     <groupId>com.aerospike</groupId>
12 |     <artifactId>aerospike-hadoop-parent</artifactId>
13 |     <version>1.1.0-SNAPSHOT</version>
14 |   </parent>
15 |
16 |   <modules>
17 |     <module>word_count_input</module>
18 |     <module>aggregate_int_input</module>
19 |     <module>word_count_output</module>
20 |     <module>session_rollup</module>
21 |     <module>generate_profiles</module>
22 |     <module>external_join</module>
23 |     <module>spark_session_rollup</module>
24 |   </modules>
25 |
26 |   <dependencyManagement>
27 |     <dependencies>
28 |       <dependency>
29 |         <groupId>com.aerospike</groupId>
30 |         <artifactId>aerospike-mapreduce</artifactId>
31 |         <version>1.1.0-SNAPSHOT</version>
32 |         <scope>compile</scope>
33 |       </dependency>
34 |     </dependencies>
35 |   </dependencyManagement>
36 | </project>
37 |
38 |
--------------------------------------------------------------------------------
/mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeConfigEnum.java:
--------------------------------------------------------------------------------
1 | package com.aerospike.hadoop.mapreduce;
2 |
3 | public enum AerospikeConfigEnum {
4 |
5 | // ---------------- OUTPUT ----------------
6 |
7 | INPUT_HOST("aerospike.input.host"),
8 | DEFAULT_INPUT_HOST("localhost"),
9 | INPUT_PORT("aerospike.input.port"),
10 | INPUT_NAMESPACE("aerospike.input.namespace"),
11 | INPUT_SETNAME("aerospike.input.setname"),
12 | INPUT_BINNAMES("aerospike.input.binnames"),
13 | DEFAULT_INPUT_BINNAMES(""),
14 | INPUT_OPERATION("aerospike.input.operation"),
15 | DEFAULT_INPUT_OPERATION("scan"),
16 | INPUT_SCAN_PERCENT("aerospike.input.scan.percent"),
17 | INPUT_NUMRANGE_BIN("aerospike.input.numrange.bin"),
18 | INPUT_NUMRANGE_BEGIN("aerospike.input.numrange.begin"),
19 | INPUT_NUMRANGE_END("aerospike.input.numrange.end"),
20 |
21 | // ---------------- OUTPUT ----------------
22 |
23 | OUTPUT_HOST("aerospike.output.host"),
24 | DEFAULT_OUTPUT_HOST("localhost"),
25 | OUTPUT_PORT("aerospike.output.port"),
26 | OUTPUT_NAMESPACE("aerospike.output.namespace"),
27 | OUTPUT_SETNAME("aerospike.output.setname"),
28 | OUTPUT_BINNAME("aerospike.output.binname"),
29 | OUTPUT_KEYNAME("aerospike.output.keyname");
30 |
31 | public final String value;
32 |
33 | private AerospikeConfigEnum(String v){
34 | value = v;
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
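
Note: the enum above only declares configuration key names (plus a few defaults); a job driver still has to put values for them into the Hadoop Configuration, either as -D options parsed by ToolRunner or programmatically. A minimal sketch of the programmatic route — the host, port, namespace, and set names are illustrative placeholders:

    import org.apache.hadoop.conf.Configuration;

    public class ConfigSketch {
        public static void main(String[] args) {
            Configuration conf = new Configuration();

            // Input side, consumed by AerospikeInputFormat.
            conf.set("aerospike.input.host", "localhost");
            conf.setInt("aerospike.input.port", 3000);
            conf.set("aerospike.input.namespace", "test");
            conf.set("aerospike.input.setname", "words");
            conf.set("aerospike.input.operation", "scan"); // the default per the enum

            // Output side, consumed by AerospikeOutputFormat/AerospikeRecordWriter.
            conf.set("aerospike.output.host", "localhost");
            conf.setInt("aerospike.output.port", 3000);
            conf.set("aerospike.output.namespace", "test");
            conf.set("aerospike.output.setname", "counts");
        }
    }

AerospikeConfigUtil (listed in the tree above) presumably wraps these raw get/set calls; its source is not included in this excerpt.
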
/sampledata/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 |   <artifactId>sampledata</artifactId>
6 |   <parent>
7 |     <groupId>com.aerospike</groupId>
8 |     <artifactId>aerospike-hadoop-examples</artifactId>
9 |     <version>1.1.0-SNAPSHOT</version>
10 |   </parent>
11 |
12 |   <build>
13 |     <directory>build/libs</directory>
14 |     <finalName>${project.artifactId}-notfull</finalName>
15 |     <plugins>
16 |       <plugin>
17 |         <artifactId>maven-assembly-plugin</artifactId>
18 |         <configuration>
19 |           <archive>
20 |             <manifest>
21 |               <mainClass>com.aerospike.hadoop.sampledata.SampleData</mainClass>
22 |             </manifest>
23 |           </archive>
24 |           <descriptorRefs>
25 |             <descriptorRef>jar-with-dependencies</descriptorRef>
26 |           </descriptorRefs>
27 |           <finalName>${project.artifactId}</finalName>
28 |           <appendAssemblyId>false</appendAssemblyId>
29 |         </configuration>
30 |         <executions>
31 |           <execution>
32 |             <id>make-assembly</id>
33 |             <phase>package</phase>
34 |             <goals>
35 |               <goal>single</goal>
36 |             </goals>
37 |           </execution>
38 |         </executions>
39 |       </plugin>
40 |     </plugins>
41 |   </build>
42 | </project>
43 |
--------------------------------------------------------------------------------
/mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeClientSingleton.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.mapreduce;
20 |
21 | import com.aerospike.client.AerospikeClient;
22 | import com.aerospike.client.policy.ClientPolicy;
23 |
24 | public final class AerospikeClientSingleton {
25 |
26 | private static volatile AerospikeClient instance = null;
27 |
28 | public static AerospikeClient getInstance(ClientPolicy policy,
29 | String host,
30 | int port) {
31 | if (instance == null) {
32 | synchronized (AerospikeClientSingleton.class) {
33 | if (instance == null) {
34 | instance = new AerospikeClient(host, port);
35 | }
36 | }
37 | }
38 | return instance;
39 | }
40 | }
41 |
42 | // Local Variables:
43 | // mode: java
44 | // c-basic-offset: 4
45 | // tab-width: 4
46 | // indent-tabs-mode: nil
47 | // End:
48 | // vim: softtabstop=4:shiftwidth=4:expandtab
49 |
--------------------------------------------------------------------------------
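
Note: two details are worth flagging in the singleton above. The double-checked locking is sound here because `instance` is declared volatile, but the ClientPolicy argument is accepted and never passed to the AerospikeClient constructor, so policy settings such as failIfNotConnected are silently ignored. A minimal usage sketch (host and port are placeholders):

    import com.aerospike.client.AerospikeClient;
    import com.aerospike.client.policy.ClientPolicy;
    import com.aerospike.hadoop.mapreduce.AerospikeClientSingleton;

    public class SingletonSketch {
        public static void main(String[] args) {
            ClientPolicy policy = new ClientPolicy();
            policy.failIfNotConnected = true; // not honored by the version above

            // Every task in the same JVM shares one underlying client,
            // which keeps the number of cluster connections down.
            AerospikeClient client =
                AerospikeClientSingleton.getInstance(policy, "localhost", 3000);
            System.out.println("connected: " + client.isConnected());
        }
    }
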
/mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeLogger.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.mapreduce;
20 |
21 | import org.apache.commons.logging.Log;
22 | import org.apache.commons.logging.LogFactory;
23 |
24 | import com.aerospike.client.Log.Level;
25 |
26 | public class AerospikeLogger implements com.aerospike.client.Log.Callback {
27 |
28 | private static final Log log = LogFactory.getLog(AerospikeLogger.class);
29 |
30 | public void log(Level level, String message) {
31 | switch (level) {
32 | case ERROR:
33 | log.error(message);
34 | break;
35 | case WARN:
36 | log.warn(message);
37 | break;
38 | case INFO:
39 | log.info(message);
40 | break;
41 | case DEBUG:
42 | log.debug(message);
43 | break;
44 | }
45 | }
46 | }
47 |
48 | // Local Variables:
49 | // mode: java
50 | // c-basic-offset: 4
51 | // tab-width: 4
52 | // indent-tabs-mode: nil
53 | // End:
54 | // vim: softtabstop=4:shiftwidth=4:expandtab
55 |
--------------------------------------------------------------------------------
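
Note: this callback has to be registered with the Aerospike client's static Log facade, typically before the first client is created so that startup messages are captured. A minimal sketch:

    import com.aerospike.client.Log;
    import com.aerospike.hadoop.mapreduce.AerospikeLogger;

    public class LoggingSketch {
        public static void main(String[] args) {
            // Route client-library log messages into commons-logging,
            // which Hadoop's own logging is already configured around.
            Log.setCallback(new AerospikeLogger());
            Log.setLevel(Log.Level.INFO);
        }
    }
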
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 |   <groupId>com.aerospike</groupId>
6 |   <artifactId>aerospike-hadoop-parent</artifactId>
7 |   <name>aerospike-hadoop-parent</name>
8 |   <version>1.1.0-SNAPSHOT</version>
9 |   <packaging>pom</packaging>
10 |
11 |   <organization>
12 |     <name>Aerospike Inc.</name>
13 |     <url>http://www.aerospike.com</url>
14 |   </organization>
15 |
16 |   <modules>
17 |     <module>mapreduce</module>
18 |     <module>sampledata</module>
19 |     <module>examples</module>
20 |   </modules>
21 |
22 |   <properties>
23 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
24 |     <junit.version>3.8.1</junit.version>
25 |     <aerospike.client.version>4.2.2</aerospike.client.version>
26 |     <hadoop.version>2.7.2</hadoop.version>
27 |   </properties>
28 |
29 |   <dependencies>
30 |     <dependency>
31 |       <groupId>org.apache.hadoop</groupId>
32 |       <artifactId>hadoop-common</artifactId>
33 |       <version>${hadoop.version}</version>
34 |       <scope>compile</scope>
35 |     </dependency>
36 |     <dependency>
37 |       <groupId>org.apache.hadoop</groupId>
38 |       <artifactId>hadoop-client</artifactId>
39 |       <version>${hadoop.version}</version>
40 |       <scope>compile</scope>
41 |     </dependency>
42 |     <dependency>
43 |       <groupId>com.aerospike</groupId>
44 |       <artifactId>aerospike-client</artifactId>
45 |       <version>${aerospike.client.version}</version>
46 |       <scope>compile</scope>
47 |     </dependency>
48 |
49 |     <dependency>
50 |       <groupId>junit</groupId>
51 |       <artifactId>junit</artifactId>
52 |       <version>${junit.version}</version>
53 |       <scope>test</scope>
54 |     </dependency>
55 |   </dependencies>
56 |
57 | </project>
58 |
--------------------------------------------------------------------------------
/examples/generate_profiles/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 |   <artifactId>generate_profiles</artifactId>
6 |   <parent>
7 |     <groupId>com.aerospike</groupId>
8 |     <artifactId>aerospike-hadoop-examples</artifactId>
9 |     <version>1.1.0-SNAPSHOT</version>
10 |   </parent>
11 |   <dependencies>
12 |     <dependency>
13 |       <groupId>org.json</groupId>
14 |       <artifactId>json</artifactId>
15 |       <version>20140107</version>
16 |       <scope>compile</scope>
17 |     </dependency>
18 |     <dependency>
19 |       <groupId>org.apache.hadoop</groupId>
20 |       <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
21 |       <version>2.7.2</version>
22 |       <scope>compile</scope>
23 |     </dependency>
24 |     <dependency>
25 |       <groupId>com.aerospike</groupId>
26 |       <artifactId>aerospike-mapreduce</artifactId>
27 |     </dependency>
28 |     <dependency>
29 |       <groupId>joda-time</groupId>
30 |       <artifactId>joda-time</artifactId>
31 |       <version>2.5</version>
32 |       <scope>compile</scope>
33 |     </dependency>
34 |   </dependencies>
35 |   <build>
36 |     <directory>build/libs</directory>
37 |     <finalName>${project.artifactId}-notfull</finalName>
38 |     <plugins>
39 |       <plugin>
40 |         <artifactId>maven-assembly-plugin</artifactId>
41 |         <configuration>
42 |           <archive>
43 |             <manifest>
44 |               <mainClass>com.aerospike.hadoop.examples.generateprofiles.GenerateProfiles</mainClass>
45 |             </manifest>
46 |           </archive>
47 |           <descriptorRefs>
48 |             <descriptorRef>jar-with-dependencies</descriptorRef>
49 |           </descriptorRefs>
50 |           <finalName>${project.artifactId}</finalName>
51 |           <appendAssemblyId>false</appendAssemblyId>
52 |         </configuration>
53 |         <executions>
54 |           <execution>
55 |             <id>make-assembly</id>
56 |             <phase>package</phase>
57 |             <goals>
58 |               <goal>single</goal>
59 |             </goals>
60 |           </execution>
61 |         </executions>
62 |       </plugin>
63 |     </plugins>
64 |   </build>
65 | </project>
66 |
--------------------------------------------------------------------------------
/examples/session_rollup/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 |   <artifactId>session_rollup</artifactId>
6 |
7 |   <parent>
8 |     <groupId>com.aerospike</groupId>
9 |     <artifactId>aerospike-hadoop-examples</artifactId>
10 |     <version>1.1.0-SNAPSHOT</version>
11 |   </parent>
12 |
13 |   <dependencies>
14 |     <dependency>
15 |       <groupId>org.json</groupId>
16 |       <artifactId>json</artifactId>
17 |       <version>20140107</version>
18 |       <scope>compile</scope>
19 |     </dependency>
20 |     <dependency>
21 |       <groupId>org.apache.hadoop</groupId>
22 |       <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
23 |       <version>2.7.2</version>
24 |       <scope>compile</scope>
25 |     </dependency>
26 |     <dependency>
27 |       <groupId>com.aerospike</groupId>
28 |       <artifactId>aerospike-mapreduce</artifactId>
29 |     </dependency>
30 |     <dependency>
31 |       <groupId>joda-time</groupId>
32 |       <artifactId>joda-time</artifactId>
33 |       <version>2.5</version>
34 |       <scope>compile</scope>
35 |     </dependency>
36 |   </dependencies>
37 |   <build>
38 |     <directory>build/libs</directory>
39 |     <finalName>${project.artifactId}-notfull</finalName>
40 |     <plugins>
41 |       <plugin>
42 |         <artifactId>maven-assembly-plugin</artifactId>
43 |         <configuration>
44 |           <archive>
45 |             <manifest>
46 |               <mainClass>com.aerospike.hadoop.examples.sessionrollup.SessionRollup</mainClass>
47 |             </manifest>
48 |           </archive>
49 |           <descriptorRefs>
50 |             <descriptorRef>jar-with-dependencies</descriptorRef>
51 |           </descriptorRefs>
52 |           <finalName>${project.artifactId}</finalName>
53 |           <appendAssemblyId>false</appendAssemblyId>
54 |         </configuration>
55 |         <executions>
56 |           <execution>
57 |             <id>make-assembly</id>
58 |             <phase>package</phase>
59 |             <goals>
60 |               <goal>single</goal>
61 |             </goals>
62 |           </execution>
63 |         </executions>
64 |       </plugin>
65 |     </plugins>
66 |   </build>
67 | </project>
68 |
--------------------------------------------------------------------------------
/examples/word_count_input/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 |   <artifactId>word_count_input</artifactId>
6 |   <parent>
7 |     <groupId>com.aerospike</groupId>
8 |     <artifactId>aerospike-hadoop-examples</artifactId>
9 |     <version>1.1.0-SNAPSHOT</version>
10 |   </parent>
11 |
12 |   <dependencies>
13 |     <dependency>
14 |       <groupId>org.json</groupId>
15 |       <artifactId>json</artifactId>
16 |       <version>20140107</version>
17 |       <scope>compile</scope>
18 |     </dependency>
19 |     <dependency>
20 |       <groupId>org.apache.hadoop</groupId>
21 |       <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
22 |       <version>2.7.2</version>
23 |       <scope>compile</scope>
24 |     </dependency>
25 |     <dependency>
26 |       <groupId>com.aerospike</groupId>
27 |       <artifactId>aerospike-mapreduce</artifactId>
28 |     </dependency>
29 |     <dependency>
30 |       <groupId>joda-time</groupId>
31 |       <artifactId>joda-time</artifactId>
32 |       <version>2.5</version>
33 |       <scope>compile</scope>
34 |     </dependency>
35 |   </dependencies>
36 |   <build>
37 |     <directory>build/libs</directory>
38 |     <finalName>${project.artifactId}-notfull</finalName>
39 |     <plugins>
40 |       <plugin>
41 |         <artifactId>maven-assembly-plugin</artifactId>
42 |         <configuration>
43 |           <archive>
44 |             <manifest>
45 |               <mainClass>com.aerospike.hadoop.examples.wordcountinput.WordCountInput</mainClass>
46 |             </manifest>
47 |           </archive>
48 |           <descriptorRefs>
49 |             <descriptorRef>jar-with-dependencies</descriptorRef>
50 |           </descriptorRefs>
51 |           <finalName>${project.artifactId}</finalName>
52 |           <appendAssemblyId>false</appendAssemblyId>
53 |         </configuration>
54 |         <executions>
55 |           <execution>
56 |             <id>make-assembly</id>
57 |             <phase>package</phase>
58 |             <goals>
59 |               <goal>single</goal>
60 |             </goals>
61 |           </execution>
62 |         </executions>
63 |       </plugin>
64 |     </plugins>
65 |   </build>
66 | </project>
67 |
--------------------------------------------------------------------------------
/examples/word_count_output/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 |   <artifactId>word_count_output</artifactId>
6 |   <parent>
7 |     <groupId>com.aerospike</groupId>
8 |     <artifactId>aerospike-hadoop-examples</artifactId>
9 |     <version>1.1.0-SNAPSHOT</version>
10 |   </parent>
11 |
12 |   <dependencies>
13 |     <dependency>
14 |       <groupId>org.json</groupId>
15 |       <artifactId>json</artifactId>
16 |       <version>20140107</version>
17 |       <scope>compile</scope>
18 |     </dependency>
19 |     <dependency>
20 |       <groupId>org.apache.hadoop</groupId>
21 |       <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
22 |       <version>2.7.2</version>
23 |       <scope>compile</scope>
24 |     </dependency>
25 |     <dependency>
26 |       <groupId>com.aerospike</groupId>
27 |       <artifactId>aerospike-mapreduce</artifactId>
28 |     </dependency>
29 |     <dependency>
30 |       <groupId>joda-time</groupId>
31 |       <artifactId>joda-time</artifactId>
32 |       <version>2.5</version>
33 |       <scope>compile</scope>
34 |     </dependency>
35 |   </dependencies>
36 |   <build>
37 |     <directory>build/libs</directory>
38 |     <finalName>${project.artifactId}-notfull</finalName>
39 |     <plugins>
40 |       <plugin>
41 |         <artifactId>maven-assembly-plugin</artifactId>
42 |         <configuration>
43 |           <archive>
44 |             <manifest>
45 |               <mainClass>com.aerospike.hadoop.examples.wordcountoutput.WordCountOutput</mainClass>
46 |             </manifest>
47 |           </archive>
48 |           <descriptorRefs>
49 |             <descriptorRef>jar-with-dependencies</descriptorRef>
50 |           </descriptorRefs>
51 |           <finalName>${project.artifactId}</finalName>
52 |           <appendAssemblyId>false</appendAssemblyId>
53 |         </configuration>
54 |         <executions>
55 |           <execution>
56 |             <id>make-assembly</id>
57 |             <phase>package</phase>
58 |             <goals>
59 |               <goal>single</goal>
60 |             </goals>
61 |           </execution>
62 |         </executions>
63 |       </plugin>
64 |     </plugins>
65 |   </build>
66 | </project>
67 |
--------------------------------------------------------------------------------
/examples/aggregate_int_input/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 |   <artifactId>aggregate_int_input</artifactId>
6 |
7 |   <parent>
8 |     <groupId>com.aerospike</groupId>
9 |     <artifactId>aerospike-hadoop-examples</artifactId>
10 |     <version>1.1.0-SNAPSHOT</version>
11 |   </parent>
12 |
13 |   <dependencies>
14 |     <dependency>
15 |       <groupId>com.aerospike</groupId>
16 |       <artifactId>aerospike-mapreduce</artifactId>
17 |     </dependency>
18 |     <dependency>
19 |       <groupId>org.json</groupId>
20 |       <artifactId>json</artifactId>
21 |       <version>20140107</version>
22 |       <scope>compile</scope>
23 |     </dependency>
24 |     <dependency>
25 |       <groupId>org.apache.hadoop</groupId>
26 |       <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
27 |       <version>2.2.0</version>
28 |       <scope>compile</scope>
29 |     </dependency>
30 |     <dependency>
31 |       <groupId>joda-time</groupId>
32 |       <artifactId>joda-time</artifactId>
33 |       <version>2.5</version>
34 |       <scope>compile</scope>
35 |     </dependency>
36 |   </dependencies>
37 |   <build>
38 |     <directory>build/libs</directory>
39 |     <finalName>${project.artifactId}-notfull</finalName>
40 |     <plugins>
41 |       <plugin>
42 |         <artifactId>maven-assembly-plugin</artifactId>
43 |         <configuration>
44 |           <archive>
45 |             <manifest>
46 |               <mainClass>com.aerospike.hadoop.examples.aggregateintinput.AggregateIntInput</mainClass>
47 |             </manifest>
48 |           </archive>
49 |           <descriptorRefs>
50 |             <descriptorRef>jar-with-dependencies</descriptorRef>
51 |           </descriptorRefs>
52 |           <finalName>${project.artifactId}</finalName>
53 |           <appendAssemblyId>false</appendAssemblyId>
54 |         </configuration>
55 |         <executions>
56 |           <execution>
57 |             <id>make-assembly</id>
58 |             <phase>package</phase>
59 |             <goals>
60 |               <goal>single</goal>
61 |             </goals>
62 |           </execution>
63 |         </executions>
64 |       </plugin>
65 |     </plugins>
66 |   </build>
67 | </project>
68 |
--------------------------------------------------------------------------------
/examples/external_join/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 |   <parent>
6 |     <groupId>com.aerospike</groupId>
7 |     <artifactId>aerospike-hadoop-examples</artifactId>
8 |     <version>1.1.0-SNAPSHOT</version>
9 |   </parent>
10 |   <artifactId>external_join</artifactId>
11 |   <version>1.1.0-SNAPSHOT</version>
12 |   <dependencies>
13 |     <dependency>
14 |       <groupId>org.json</groupId>
15 |       <artifactId>json</artifactId>
16 |       <version>20140107</version>
17 |       <scope>compile</scope>
18 |     </dependency>
19 |     <dependency>
20 |       <groupId>org.apache.hadoop</groupId>
21 |       <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
22 |       <version>${hadoop.version}</version>
23 |       <scope>compile</scope>
24 |     </dependency>
25 |     <dependency>
26 |       <groupId>com.aerospike</groupId>
27 |       <artifactId>aerospike-mapreduce</artifactId>
28 |     </dependency>
29 |     <dependency>
30 |       <groupId>joda-time</groupId>
31 |       <artifactId>joda-time</artifactId>
32 |       <version>2.5</version>
33 |       <scope>compile</scope>
34 |     </dependency>
35 |   </dependencies>
36 |   <build>
37 |     <directory>build/libs</directory>
38 |     <finalName>${project.artifactId}-notfull</finalName>
39 |     <plugins>
40 |       <plugin>
41 |         <artifactId>maven-assembly-plugin</artifactId>
42 |         <configuration>
43 |           <archive>
44 |             <manifest>
45 |               <mainClass>com.aerospike.hadoop.examples.externaljoin.ExternalJoin</mainClass>
46 |             </manifest>
47 |           </archive>
48 |           <descriptorRefs>
49 |             <descriptorRef>jar-with-dependencies</descriptorRef>
50 |           </descriptorRefs>
51 |           <finalName>${project.artifactId}</finalName>
52 |           <appendAssemblyId>false</appendAssemblyId>
53 |         </configuration>
54 |         <executions>
55 |           <execution>
56 |             <id>make-assembly</id>
57 |             <phase>package</phase>
58 |             <goals>
59 |               <goal>single</goal>
60 |             </goals>
61 |           </execution>
62 |         </executions>
63 |       </plugin>
64 |     </plugins>
65 |   </build>
66 | </project>
67 |
--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
1 | @if "%DEBUG%" == "" @echo off
2 | @rem ##########################################################################
3 | @rem
4 | @rem Gradle startup script for Windows
5 | @rem
6 | @rem ##########################################################################
7 |
8 | @rem Set local scope for the variables with windows NT shell
9 | if "%OS%"=="Windows_NT" setlocal
10 |
11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
12 | set DEFAULT_JVM_OPTS=
13 |
14 | set DIRNAME=%~dp0
15 | if "%DIRNAME%" == "" set DIRNAME=.
16 | set APP_BASE_NAME=%~n0
17 | set APP_HOME=%DIRNAME%
18 |
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 |
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 |
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 |
32 | goto fail
33 |
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 |
38 | if exist "%JAVA_EXE%" goto init
39 |
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 |
46 | goto fail
47 |
48 | :init
49 | @rem Get command-line arguments, handling Windowz variants
50 |
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 | if "%@eval[2+2]" == "4" goto 4NT_args
53 |
54 | :win9xME_args
55 | @rem Slurp the command line arguments.
56 | set CMD_LINE_ARGS=
57 | set _SKIP=2
58 |
59 | :win9xME_args_slurp
60 | if "x%~1" == "x" goto execute
61 |
62 | set CMD_LINE_ARGS=%*
63 | goto execute
64 |
65 | :4NT_args
66 | @rem Get arguments from the 4NT Shell from JP Software
67 | set CMD_LINE_ARGS=%$
68 |
69 | :execute
70 | @rem Setup the command line
71 |
72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73 |
74 | @rem Execute Gradle
75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76 |
77 | :end
78 | @rem End local scope for the variables with windows NT shell
79 | if "%ERRORLEVEL%"=="0" goto mainEnd
80 |
81 | :fail
82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83 | rem the _cmd.exe /c_ return code!
84 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
85 | exit /b 1
86 |
87 | :mainEnd
88 | if "%OS%"=="Windows_NT" endlocal
89 |
90 | :omega
91 |
--------------------------------------------------------------------------------
/mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeRecordWriter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.mapreduce;
20 |
21 | import java.io.IOException;
22 |
23 | import org.apache.commons.logging.Log;
24 | import org.apache.commons.logging.LogFactory;
25 | import org.apache.hadoop.conf.Configuration;
26 | import org.apache.hadoop.mapreduce.RecordWriter;
27 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
28 | import org.apache.hadoop.util.Progressable;
29 |
30 | import com.aerospike.client.AerospikeClient;
31 | import com.aerospike.client.policy.ClientPolicy;
32 | import com.aerospike.client.policy.WritePolicy;
33 |
34 | public abstract class AerospikeRecordWriter<KK, VV>
35 | extends RecordWriter<KK, VV>
36 | implements org.apache.hadoop.mapred.RecordWriter<KK, VV> {
37 |
38 | private static final Log log =
39 | LogFactory.getLog(AerospikeRecordWriter.class);
40 |
41 | protected final Configuration cfg;
42 | protected boolean initialized = false;
43 |
44 | private static String namespace;
45 | private static String setName;
46 | private static AerospikeClient client;
47 | private static WritePolicy writePolicy;
48 |
49 | public AerospikeRecordWriter(Configuration cfg) {
50 | this.cfg = cfg;
51 | }
52 |
53 | public abstract void writeAerospike(KK key,
54 | VV value,
55 | AerospikeClient client,
56 | WritePolicy writePolicy,
57 | String namespace,
58 | String setName) throws IOException;
59 |
60 | @Override
61 | public void write(KK key, VV value) throws IOException {
62 | if (!initialized) {
63 | initialized = true;
64 | init();
65 | }
66 |
67 | writeAerospike(key, value, client, writePolicy, namespace, setName);
68 | }
69 |
70 | protected void init() throws IOException {
71 |
72 | String host = AerospikeConfigUtil.getOutputHost(cfg);
73 | int port = AerospikeConfigUtil.getOutputPort(cfg);
74 |
75 | namespace = AerospikeConfigUtil.getOutputNamespace(cfg);
76 | setName = AerospikeConfigUtil.getOutputSetName(cfg);
77 |
78 | log.info(String.format("init: %s %d %s %s",
79 | host, port, namespace, setName));
80 |
81 | ClientPolicy policy = new ClientPolicy();
82 | policy.user = "";
83 | policy.password = "";
84 | policy.failIfNotConnected = true;
85 |
86 | client = AerospikeClientSingleton.getInstance(policy, host, port);
87 |
88 | writePolicy = new WritePolicy();
89 | }
90 |
91 | @Override
92 | public void close(TaskAttemptContext context) throws IOException {
93 | doClose(context);
94 | }
95 |
96 | public void close(org.apache.hadoop.mapred.Reporter reporter
97 | ) throws IOException {
98 | doClose(reporter);
99 | }
100 |
101 | protected void doClose(Progressable progressable) {
102 | log.info("doClose");
103 | initialized = false;
104 | }
105 | }
106 |
107 | // Local Variables:
108 | // mode: java
109 | // c-basic-offset: 4
110 | // tab-width: 4
111 | // indent-tabs-mode: nil
112 | // End:
113 | // vim: softtabstop=4:shiftwidth=4:expandtab
114 |
--------------------------------------------------------------------------------
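
Note: concrete jobs subclass this writer and supply the actual put. A hypothetical subclass for a word-count style job — the bin name "count" and the single-bin record layout are illustrative assumptions, not taken from the examples:

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;

    import com.aerospike.client.AerospikeClient;
    import com.aerospike.client.Bin;
    import com.aerospike.client.Key;
    import com.aerospike.client.policy.WritePolicy;
    import com.aerospike.hadoop.mapreduce.AerospikeRecordWriter;

    public class WordCountWriter extends AerospikeRecordWriter<Text, IntWritable> {

        public WordCountWriter(Configuration cfg) {
            super(cfg);
        }

        @Override
        public void writeAerospike(Text key, IntWritable value,
                                   AerospikeClient client, WritePolicy writePolicy,
                                   String namespace, String setName)
            throws IOException {
            // One Aerospike record per distinct word, keyed by the word itself;
            // namespace and set name come from the job configuration via init().
            Key kk = new Key(namespace, setName, key.toString());
            Bin bin = new Bin("count", value.get());
            client.put(writePolicy, kk, bin);
        }
    }

A matching output format would pair this writer with AerospikeOutputFormat, which is listed in the tree above but not included in this excerpt.
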
/mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeRecord.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.mapreduce;
20 |
21 | import java.io.DataInput;
22 | import java.io.DataOutput;
23 | import java.io.IOException;
24 | import java.util.HashMap;
25 | import java.util.Map;
26 |
27 | import org.apache.hadoop.io.Writable;
28 |
29 | import com.aerospike.client.Record;
30 | import com.aerospike.client.util.Packer;
31 | import com.aerospike.client.util.Unpacker.ObjectUnpacker;
32 |
33 | public class AerospikeRecord implements Writable {
34 |
35 | public Map<String, Object> bins;
36 | public int generation;
37 | public int expiration;
38 |
39 | public AerospikeRecord() {
40 | this.bins = null;
41 | this.generation = 0;
42 | this.expiration = 0;
43 | }
44 |
45 | public AerospikeRecord(Record rec) {
46 | this.bins = rec.bins;
47 | this.generation = rec.generation;
48 | this.expiration = rec.expiration;
49 | }
50 |
51 | public AerospikeRecord(AerospikeRecord rec) {
52 | this.bins = rec.bins;
53 | this.generation = rec.generation;
54 | this.expiration = rec.expiration;
55 | }
56 |
57 | public void set(Record rec) {
58 | this.bins = rec.bins;
59 | this.generation = rec.generation;
60 | this.expiration = rec.expiration;
61 | }
62 |
63 | public void set(AerospikeRecord rec) {
64 | this.bins = rec.bins;
65 | this.generation = rec.generation;
66 | this.expiration = rec.expiration;
67 | }
68 |
69 | public Record toRecord() {
70 | return new Record(bins, generation, expiration);
71 | }
72 |
73 | public void write(DataOutput out) throws IOException {
74 | try {
75 | out.writeInt(generation);
76 | out.writeInt(expiration);
77 | out.writeInt(bins.size());
78 | for (Map.Entry<String, Object> entry : bins.entrySet()) {
79 | out.writeUTF(entry.getKey());
80 | Packer pack = new Packer();
81 | pack.packObject(entry.getValue());
82 | byte[] buff = pack.toByteArray();
83 | out.writeInt(buff.length);
84 | out.write(buff);
85 | }
86 | }
87 | catch (Exception ex) {
88 | throw new IOException(ex);
89 | }
90 | }
91 |
92 | public void readFields(DataInput in) throws IOException {
93 | try {
94 | generation = in.readInt();
95 | expiration = in.readInt();
96 | int nbins = in.readInt();
97 | bins = new HashMap<String, Object>();
98 | for (int ii = 0; ii < nbins; ++ii) {
99 | String key = in.readUTF();
100 | int buflen = in.readInt();
101 | byte[] buff = new byte[buflen];
102 | in.readFully(buff);
103 | ObjectUnpacker unpack = new ObjectUnpacker(buff, 0, buff.length);
104 | Object obj = unpack.unpackObject();
105 | bins.put(key, obj);
106 | }
107 | }
108 | catch (Exception ex) {
109 | throw new IOException(ex);
110 | }
111 | }
112 |
113 | public static AerospikeRecord read(DataInput in) throws IOException {
114 | AerospikeRecord rec = new AerospikeRecord();
115 | rec.readFields(in);
116 | return rec;
117 | }
118 | }
119 |
120 | // Local Variables:
121 | // mode: java
122 | // c-basic-offset: 4
123 | // tab-width: 4
124 | // indent-tabs-mode: nil
125 | // End:
126 | // vim: softtabstop=4:shiftwidth=4:expandtab
127 |
--------------------------------------------------------------------------------
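
Note: because write and readFields pack every bin value through the client's Packer/ObjectUnpacker, an AerospikeRecord survives Hadoop's serialization boundary intact. A small round-trip sketch (bin name and value are placeholders):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.util.HashMap;

    import com.aerospike.hadoop.mapreduce.AerospikeRecord;

    public class RecordRoundTrip {
        public static void main(String[] args) throws IOException {
            AerospikeRecord rec = new AerospikeRecord();
            rec.bins = new HashMap<String, Object>();
            rec.bins.put("bin1", 42L);
            rec.generation = 1;

            // Serialize exactly as Hadoop would between map and reduce.
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            rec.write(new DataOutputStream(bos));

            // Deserialize and confirm the bin value came back.
            AerospikeRecord copy = AerospikeRecord.read(
                new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
            System.out.println(copy.bins.get("bin1")); // 42
        }
    }
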
/examples/spark_session_rollup/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 |   <artifactId>spark_session_rollup</artifactId>
6 |
7 |   <parent>
8 |     <groupId>com.aerospike</groupId>
9 |     <artifactId>aerospike-hadoop-examples</artifactId>
10 |     <version>1.1.0-SNAPSHOT</version>
11 |   </parent>
12 |
13 |   <dependencies>
14 |     <dependency>
15 |       <groupId>org.apache.spark</groupId>
16 |       <artifactId>spark-core_2.11</artifactId>
17 |       <version>2.4.0</version>
18 |       <scope>compile</scope>
19 |     </dependency>
20 |     <dependency>
21 |       <groupId>commons-codec</groupId>
22 |       <artifactId>commons-codec</artifactId>
23 |       <version>1.9</version>
24 |       <scope>compile</scope>
25 |     </dependency>
26 |     <dependency>
27 |       <groupId>com.aerospike</groupId>
28 |       <artifactId>aerospike-mapreduce</artifactId>
29 |     </dependency>
30 |   </dependencies>
31 |   <build>
32 |     <directory>build/libs</directory>
33 |     <finalName>${project.artifactId}-notfull</finalName>
34 |     <plugins>
35 |       <plugin>
36 |         <groupId>org.apache.maven.plugins</groupId>
37 |         <artifactId>maven-shade-plugin</artifactId>
38 |         <version>2.2</version>
39 |         <configuration>
40 |           <filters>
41 |             <filter>
42 |               <artifact>*:*</artifact>
43 |               <excludes>
44 |                 <exclude>META-INF/*.SF</exclude>
45 |                 <exclude>META-INF/*.DSA</exclude>
46 |                 <exclude>META-INF/*.RSA</exclude>
47 |               </excludes>
48 |             </filter>
49 |           </filters>
50 |         </configuration>
51 |         <executions>
52 |           <execution>
53 |             <id>job-driver-jar</id>
54 |             <phase>package</phase>
55 |             <goals>
56 |               <goal>shade</goal>
57 |             </goals>
58 |             <configuration>
59 |               <shadedArtifactAttached>true</shadedArtifactAttached>
60 |               <shadedClassifierName>driver</shadedClassifierName>
61 |               <transformers>
62 |                 <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
63 |                   <resource>reference.conf</resource>
64 |                 </transformer>
65 |                 <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
66 |                   <mainClass>com.aerospike.spark.examples.SparkSessionRollup</mainClass>
67 |                 </transformer>
68 |               </transformers>
69 |             </configuration>
70 |           </execution>
71 |           <execution>
72 |             <id>worker-library-jar</id>
73 |             <phase>package</phase>
74 |             <goals>
75 |               <goal>shade</goal>
76 |             </goals>
77 |             <configuration>
78 |               <shadedArtifactAttached>true</shadedArtifactAttached>
79 |               <shadedClassifierName>worker</shadedClassifierName>
80 |               <artifactSet>
81 |                 <includes>
82 |                   <include>commons-codec:commons-codec</include>
83 |                   <include>com.aerospike:aerospike-client</include>
84 |                   <include>com.aerospike:aerospike-mapreduce</include>
85 |                   <include>org.gnu:gnu-crypto</include>
86 |                   <include>org.luaj:luaj-jse</include>
87 |                   <include>org.mindrot:jbcrypt</include>
88 |                 </includes>
89 |               </artifactSet>
90 |             </configuration>
91 |           </execution>
92 |         </executions>
93 |       </plugin>
94 |     </plugins>
95 |   </build>
96 | </project>
--------------------------------------------------------------------------------
/examples/word_count_input/src/main/java/com/aerospike/hadoop/examples/wordcountinput/WordCountInput.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.examples.wordcountinput;
20 |
21 | import java.io.IOException;
22 | import java.util.Iterator;
23 | import java.util.StringTokenizer;
24 |
25 | import org.apache.commons.logging.Log;
26 | import org.apache.commons.logging.LogFactory;
27 | import org.apache.hadoop.conf.Configuration;
28 | import org.apache.hadoop.conf.Configured;
29 | import org.apache.hadoop.fs.Path;
30 | import org.apache.hadoop.io.IntWritable;
31 | import org.apache.hadoop.io.Text;
32 | import org.apache.hadoop.mapred.FileOutputFormat;
33 | import org.apache.hadoop.mapred.JobClient;
34 | import org.apache.hadoop.mapred.JobConf;
35 | import org.apache.hadoop.mapred.MapReduceBase;
36 | import org.apache.hadoop.mapred.Mapper;
37 | import org.apache.hadoop.mapred.OutputCollector;
38 | import org.apache.hadoop.mapred.Reducer;
39 | import org.apache.hadoop.mapred.Reporter;
40 | import org.apache.hadoop.mapred.TextOutputFormat;
41 | import org.apache.hadoop.util.Tool;
42 | import org.apache.hadoop.util.ToolRunner;
43 |
44 | import com.aerospike.hadoop.mapreduce.AerospikeInputFormat;
45 | import com.aerospike.hadoop.mapreduce.AerospikeKey;
46 | import com.aerospike.hadoop.mapreduce.AerospikeRecord;
47 |
48 | public class WordCountInput extends Configured implements Tool {
49 |
50 | private static final Log log = LogFactory.getLog(WordCountInput.class);
51 |
52 | private static String binName = "bin1";
53 |
54 | public static class Map
55 | extends MapReduceBase
56 | implements Mapper<AerospikeKey, AerospikeRecord, Text, IntWritable> {
57 |
58 | private final static IntWritable one = new IntWritable(1);
59 | private Text word = new Text();
60 |
61 | public void map(AerospikeKey key,
62 | AerospikeRecord rec,
63 | OutputCollector<Text, IntWritable> output,
64 | Reporter reporter
65 | ) throws IOException {
66 | String line = rec.bins.get(binName).toString();
67 | StringTokenizer tokenizer = new StringTokenizer(line);
68 | while (tokenizer.hasMoreTokens()) {
69 | word.set(tokenizer.nextToken());
70 | output.collect(word, one);
71 | }
72 | }
73 | }
74 |
75 | public static class Reduce
76 | extends MapReduceBase
77 | implements Reducer<Text, IntWritable, Text, IntWritable> {
78 |
79 | public void reduce(Text word, Iterator<IntWritable> values,
80 | OutputCollector<Text, IntWritable> output,
81 | Reporter reporter)
82 | throws IOException {
83 | int sum = 0;
84 | while (values.hasNext()) {
85 | sum += values.next().get();
86 | }
87 | output.collect(word, new IntWritable(sum));
88 | }
89 | }
90 |
91 | public int run(final String[] args) throws Exception {
92 |
93 | log.info("run starting");
94 |
95 | final Configuration conf = getConf();
96 |
97 | JobConf job = new JobConf(conf, WordCountInput.class);
98 | job.setJobName("AerospikeWordCountInput");
99 |
100 | job.setInputFormat(AerospikeInputFormat.class);
101 | job.setMapperClass(Map.class);
102 | job.setCombinerClass(Reduce.class);
103 | job.setReducerClass(Reduce.class);
104 | job.setOutputKeyClass(Text.class);
105 | job.setOutputValueClass(IntWritable.class);
106 | job.setOutputFormat(TextOutputFormat.class);
107 |
108 | FileOutputFormat.setOutputPath(job, new Path(args[0]));
109 |
110 | JobClient.runJob(job);
111 |
112 | log.info("finished");
113 | return 0;
114 | }
115 |
116 | public static void main(final String[] args) throws Exception {
117 | System.exit(ToolRunner.run(new WordCountInput(), args));
118 | }
119 | }
120 |
121 | // Local Variables:
122 | // mode: java
123 | // c-basic-offset: 4
124 | // tab-width: 4
125 | // indent-tabs-mode: nil
126 | // End:
127 | // vim: softtabstop=4:shiftwidth=4:expandtab
128 |
--------------------------------------------------------------------------------
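
Note: run() above never sets any aerospike.input.* keys itself, so they must arrive through the Configuration — typically as -D options parsed by ToolRunner on the hadoop command line. A hypothetical launcher that supplies them programmatically instead; all connection values are placeholders:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ToolRunner;

    import com.aerospike.hadoop.examples.wordcountinput.WordCountInput;

    public class RunWordCount {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            conf.set("aerospike.input.host", "localhost");
            conf.setInt("aerospike.input.port", 3000);
            conf.set("aerospike.input.namespace", "test");
            conf.set("aerospike.input.setname", "words");
            conf.set("aerospike.input.binnames", "bin1"); // the bin Map reads
            // args[0] is the HDFS output path consumed by run().
            System.exit(ToolRunner.run(conf, new WordCountInput(), args));
        }
    }
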
/examples/aggregate_int_input/src/main/java/com/aerospike/hadoop/examples/aggregateintinput/AggregateIntInput.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.examples.aggregateintinput;
20 |
21 | import java.io.IOException;
22 |
23 | import org.apache.commons.logging.Log;
24 | import org.apache.commons.logging.LogFactory;
25 | import org.apache.hadoop.conf.Configuration;
26 | import org.apache.hadoop.conf.Configured;
27 | import org.apache.hadoop.fs.Path;
28 | import org.apache.hadoop.io.LongWritable;
29 | import org.apache.hadoop.io.Text;
30 | import org.apache.hadoop.mapreduce.Job;
31 | import org.apache.hadoop.mapreduce.Mapper;
32 | import org.apache.hadoop.mapreduce.Reducer;
33 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
34 | import org.apache.hadoop.util.Tool;
35 | import org.apache.hadoop.util.ToolRunner;
36 |
37 | import com.aerospike.hadoop.mapreduce.AerospikeInputFormat;
38 | import com.aerospike.hadoop.mapreduce.AerospikeKey;
39 | import com.aerospike.hadoop.mapreduce.AerospikeRecord;
40 |
41 | public class AggregateIntInput extends Configured implements Tool {
42 |
43 | private static final Log log = LogFactory.getLog(AggregateIntInput.class);
44 |
45 | private static final int KK = 3163;
46 |
47 | private static final String binName = "bin1";
48 |
49 | public static class Map
50 | extends Mapper<AerospikeKey, AerospikeRecord, LongWritable, LongWritable> {
52 |
53 | private LongWritable val = new LongWritable();
54 | private LongWritable mod = new LongWritable();
55 |
56 | public void map(AerospikeKey key, AerospikeRecord rec, Context context)
57 | throws IOException, InterruptedException {
58 | int vv = (Integer) rec.bins.get(binName);
59 | val.set(vv);
60 | mod.set(vv % KK);
61 | context.write(mod, val);
62 | }
63 | }
64 |
65 | public static class Reduce
66 | extends Reducer<LongWritable, LongWritable, LongWritable, Text> {
67 |
68 | public void reduce(LongWritable mod,
69 | Iterable<LongWritable> values,
70 | Context context)
71 | throws IOException, InterruptedException {
72 |
73 | long num = 0; // number of elements
74 | long sum = 0; // sum of elements
75 | long min = Long.MAX_VALUE; // minimum element
76 | long max = Long.MIN_VALUE; // maximum element
77 |
78 | for (LongWritable val : values) {
79 | long vv = val.get();
80 | num += 1;
81 | sum += vv;
82 | if (vv < min) min = vv;
83 | if (vv > max) max = vv;
84 | }
85 |
86 | String rec = String.format("%d %d %d %d", num, min, max, sum);
87 |
88 | context.write(mod, new Text(rec));
89 | }
90 | }
91 |
92 | public int run(final String[] args) throws Exception {
93 | final Configuration conf = getConf();
94 |
95 | @SuppressWarnings("deprecation")
96 | final Job job = new Job(conf, "AerospikeAggregateIntInput");
97 |
98 | log.info("run starting on bin " + binName);
99 |
100 | job.setJarByClass(AggregateIntInput.class);
101 | job.setInputFormatClass(AerospikeInputFormat.class);
102 | job.setMapperClass(Map.class);
103 | job.setMapOutputKeyClass(LongWritable.class);
104 | job.setMapOutputValueClass(LongWritable.class);
105 | // job.setCombinerClass(Reduce.class); // no combiner
106 | job.setReducerClass(Reduce.class);
107 | job.setOutputKeyClass(LongWritable.class);
108 | job.setOutputValueClass(Text.class);
109 |
110 | FileOutputFormat.setOutputPath(job, new Path(args[0]));
111 |
112 | int status = job.waitForCompletion(true) ? 0 : 1;
113 | log.info("run finished, status=" + status);
114 | return status;
115 | }
116 |
117 | public static void main(final String[] args) throws Exception {
118 | System.exit(ToolRunner.run(new AggregateIntInput(), args));
119 | }
120 | }
121 |
122 | // Local Variables:
123 | // mode: java
124 | // c-basic-offset: 4
125 | // tab-width: 4
126 | // indent-tabs-mode: nil
127 | // End:
128 | // vim: softtabstop=4:shiftwidth=4:expandtab
129 |
--------------------------------------------------------------------------------
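For reference, the job above can also be driven programmatically rather than
with -D flags. A minimal driver sketch, assuming the configuration keys match
the aerospike.input.* properties used in the README (the driver class name and
output path are illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ToolRunner;

    public class AggregateIntInputDriver {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            // Same keys as the -D flags shown in the README.
            conf.set("aerospike.input.namespace", "test");
            conf.set("aerospike.input.setname", "integers");
            conf.set("aerospike.input.binnames", "bin1");
            conf.set("aerospike.input.operation", "scan");
            // args[0] is the HDFS output path, e.g. /tmp/output.
            System.exit(ToolRunner.run(conf, new AggregateIntInput(), args));
        }
    }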
/mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeKey.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.mapreduce;
20 |
21 | import java.io.DataInput;
22 | import java.io.DataOutput;
23 | import java.io.IOException;
24 |
25 | import org.apache.hadoop.io.WritableComparable;
26 |
27 | import com.aerospike.client.Key;
28 | import com.aerospike.client.Value;
29 | import com.aerospike.client.util.Packer;
30 | import com.aerospike.client.util.Unpacker.ObjectUnpacker;
31 |
32 | public class AerospikeKey implements WritableComparable<AerospikeKey> {
33 |
34 | public String namespace;
35 | public String setName;
36 | public byte[] digest;
37 | public Value userKey;
38 |
39 | public AerospikeKey() {
40 | this.namespace = null;
41 | this.setName = null;
42 | this.digest = null;
43 | this.userKey = null;
44 | }
45 |
46 | public AerospikeKey(Key key) {
47 | this.namespace = key.namespace;
48 | this.digest = key.digest;
49 | this.setName = key.setName;
50 | this.userKey = key.userKey;
51 | }
52 |
53 | public AerospikeKey(AerospikeKey key) {
54 | this.namespace = key.namespace;
55 | this.digest = key.digest;
56 | this.setName = key.setName;
57 | this.userKey = key.userKey;
58 | }
59 |
60 | public void set(Key key) {
61 | this.namespace = key.namespace;
62 | this.digest = key.digest;
63 | this.setName = key.setName;
64 | this.userKey = key.userKey;
65 | }
66 |
67 | public void set(AerospikeKey key) {
68 | this.namespace = key.namespace;
69 | this.digest = key.digest;
70 | this.setName = key.setName;
71 | this.userKey = key.userKey;
72 | }
73 |
74 | public Key toKey() {
75 | return new Key(namespace, digest, setName, userKey);
76 | }
77 |
78 | public void write(DataOutput out) throws IOException {
79 | try {
80 | out.writeUTF(namespace);
81 | out.writeUTF(setName);
82 | out.writeInt(digest.length);
83 | out.write(digest);
84 |             // A presence flag for the optional user key follows the digest.
85 | if (userKey == null) {
86 | out.writeBoolean(false);
87 | } else {
88 | out.writeBoolean(true);
89 | Packer pack = new Packer();
90 | pack.packObject(userKey);
91 | byte[] buff = pack.toByteArray();
92 | out.writeInt(buff.length);
93 | out.write(buff);
94 | }
95 | }
96 | catch (Exception ex) {
97 | throw new IOException(ex);
98 | }
99 | }
100 |
101 | public void readFields(DataInput in) throws IOException {
102 | try {
103 | namespace = in.readUTF();
104 | setName = in.readUTF();
105 | int digestLen = in.readInt();
106 | digest = new byte[digestLen];
107 | in.readFully(digest);
108 | if (in.readBoolean()) {
109 | int buflen = in.readInt();
110 | byte[] buff = new byte[buflen];
111 | in.readFully(buff);
112 | ObjectUnpacker unpack = new ObjectUnpacker(buff, 0, buff.length);
113 | userKey = Value.get(unpack.unpackObject());
114 | }
115 | }
116 | catch (Exception ex) {
117 | throw new IOException(ex);
118 | }
119 | }
120 |
121 | public static AerospikeKey read(DataInput in) throws IOException {
122 | AerospikeKey key = new AerospikeKey();
123 | key.readFields(in);
124 | return key;
125 | }
126 |
127 | public int compareTo(AerospikeKey other) {
128 | byte[] left = this.digest;
129 | byte[] right = other.digest;
130 | for (int i = 0, j = 0; i < left.length && j < right.length; i++, j++) {
131 | int a = (left[i] & 0xff);
132 | int b = (right[j] & 0xff);
133 | if (a != b) {
134 | return a - b;
135 | }
136 | }
137 | return left.length - right.length;
138 | }
139 | }
140 |
141 | // Local Variables:
142 | // mode: java
143 | // c-basic-offset: 4
144 | // tab-width: 4
145 | // indent-tabs-mode: nil
146 | // End:
147 | // vim: softtabstop=4:shiftwidth=4:expandtab
148 |
--------------------------------------------------------------------------------
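The Writable layout used by AerospikeKey above is: namespace (UTF), set name
(UTF), digest length and bytes, then a presence flag followed, when the flag
is true, by the packed user key. A minimal round-trip sketch using in-memory
streams (a fragment, not part of the repo):

    import java.io.*;
    import com.aerospike.client.Key;

    AerospikeKey k1 = new AerospikeKey(new Key("test", "words", "key-1"));
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    k1.write(new DataOutputStream(bos));
    AerospikeKey k2 = AerospikeKey.read(
        new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
    assert k1.compareTo(k2) == 0; // identical digests compare equal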
/sampledata/src/main/java/com/aerospike/hadoop/sampledata/SampleData.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.sampledata;
20 |
21 | import java.io.BufferedReader;
22 | import java.io.FileReader;
23 |
24 | import org.apache.commons.logging.Log;
25 | import org.apache.commons.logging.LogFactory;
26 |
27 | import com.aerospike.client.AerospikeClient;
28 | import com.aerospike.client.Bin;
29 | import com.aerospike.client.Key;
30 | import com.aerospike.client.policy.ClientPolicy;
31 | import com.aerospike.client.policy.WritePolicy;
32 | import com.aerospike.client.query.IndexType;
33 | import com.aerospike.client.task.IndexTask;
34 |
35 | public class SampleData {
36 |
37 | // aql> CREATE INDEX bin1ndx ON test.sample (bin1) NUMERIC
38 |
39 | private static final Log log = LogFactory.getLog(SampleData.class);
40 |
41 | private static String host;
42 | private static int port;
43 | private static String namespace;
44 | private static String setName;
45 | private static String binName;
46 | private static AerospikeClient client;
47 | private static WritePolicy writePolicy;
48 |
49 | public static void run(String[] args) throws Exception {
50 |
51 | int argi = 0;
52 | String asspec = args[argi++];
53 | String dataType = args[argi++];
54 |
55 | log.info(String.format("saw %s %s", asspec, dataType));
56 |
57 | String[] inparam = asspec.split(":");
58 | host = inparam[0];
59 | port = Integer.parseInt(inparam[1]);
60 | namespace = inparam[2];
61 | setName = inparam[3];
62 | binName = inparam[4];
63 |
64 | ClientPolicy policy = new ClientPolicy();
65 | policy.user = "";
66 | policy.password = "";
67 | policy.failIfNotConnected = true;
68 |
69 | client = new AerospikeClient(policy, host, port);
70 |
71 | writePolicy = new WritePolicy();
72 |
73 | if (dataType.equals("text-file"))
74 | runTextFile(args, argi);
75 | else if (dataType.equals("seq-int"))
76 | runSeqInt(args, argi);
77 | else
78 | throw new RuntimeException(String.format("unknown dataType \"%s\"",
79 | dataType));
80 | }
81 |
82 | public static void runTextFile(String[] args, int argi) throws Exception {
83 |
84 | while (argi < args.length) {
85 | String path = args[argi++];
86 | log.info("processing " + path + " ...");
87 | int nrecs = 0;
88 | BufferedReader br = new BufferedReader(new FileReader(path));
89 | for (String line; (line = br.readLine()) != null; ) {
90 | // The key is "path:linenum".
91 | String keystr = path + ':' + Long.toString(nrecs++);
92 | Key key = new Key(namespace, setName, keystr);
93 | Bin bin = new Bin(binName, line);
94 | client.put(writePolicy, key, bin);
95 | }
96 | log.info("inserted " + nrecs + " records");
97 | br.close();
98 | }
99 | }
100 |
101 | public static void runSeqInt(String[] args, int argi) throws Exception {
102 |
103 | int offset = Integer.parseInt(args[argi++]);
104 | int nrecs = Integer.parseInt(args[argi++]);
105 |
106 | String ndxname = binName + "ndx";
107 |
108 | IndexTask task =
109 | client.createIndex(null, namespace, setName,
110 | ndxname, binName, IndexType.NUMERIC);
111 |
112 | task.waitTillComplete();
113 | log.info("created secondary index on " + binName);
114 |
115 | for (long ll = offset; ll < offset + nrecs; ++ll) {
116 |
117 | String keystr = "key-" + ll;
118 |
119 | Key key = new Key(namespace, setName, keystr);
120 | Bin bin1 = new Bin(binName, ll);
121 | Bin bin2 = new Bin("bin2", "value2");
122 |
123 | client.put(writePolicy, key, bin1, bin2);
124 | }
125 |
126 | log.info("inserted " + nrecs + " records");
127 | }
128 |
129 | public static void main(String[] args) {
130 |
131 | try {
132 | log.info("starting");
133 | run(args);
134 | log.info("finished");
135 | } catch (Exception ex) {
136 |
137 | log.error(ex.getMessage());
138 | ex.printStackTrace();
139 | }
140 | }
141 |
142 | }
143 |
144 | // Local Variables:
145 | // mode: java
146 | // c-basic-offset: 4
147 | // tab-width: 4
148 | // indent-tabs-mode: nil
149 | // End:
150 | // vim: softtabstop=4:shiftwidth=4:expandtab
151 |
--------------------------------------------------------------------------------
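SampleData packs its connection target into a single colon-separated first
argument, host:port:namespace:set:bin, followed by the data type and its
parameters. The two invocations used by the README:

    # Load a text file, one record per line, keyed by "path:linenum".
    java -jar build/libs/sampledata.jar \
        localhost:3000:test:words:bin1 text-file /tmp/input

    # Generate 100000 sequential integers starting at 0 (also builds the index).
    java -jar build/libs/sampledata.jar \
        localhost:3000:test:integers:bin1 seq-int 0 100000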
/mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeSplit.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.mapreduce;
20 |
21 | import java.io.DataInput;
22 | import java.io.DataOutput;
23 | import java.io.IOException;
24 |
25 | import org.apache.hadoop.io.Text;
26 | import org.apache.hadoop.mapreduce.InputSplit;
27 |
28 | public class AerospikeSplit
29 | extends InputSplit
30 | implements org.apache.hadoop.mapred.InputSplit {
31 |
32 | private String type;
33 | private String node;
34 | private String host;
35 | private int port;
36 | private String namespace;
37 | private String setName;
38 | private String[] binNames;
39 | private String numrangeBin;
40 | private long numrangeBegin;
41 | private long numrangeEnd;
42 | private int scanPercent;
43 |
44 | AerospikeSplit() {
45 | }
46 |
47 | public AerospikeSplit(String type, String node, String host, int port,
48 | String ns, String setName, String[] binNames,
49 | String numrangeBin, long numrangeBegin, long numrangeEnd) {
50 | this(type, node, host, port, ns, setName, binNames, numrangeBin, numrangeBegin, numrangeEnd, AerospikeConfigUtil.DEFAULT_INPUT_SCAN_PERCENT);
51 | }
52 |
53 | public AerospikeSplit(String type, String node, String host, int port,
54 | String ns, String setName, String[] binNames,
55 | String numrangeBin, long numrangeBegin,
56 | long numrangeEnd, int scanPercent) {
57 | this.type = type;
58 | this.node = node;
59 | this.host = host;
60 | this.port = port;
61 | this.namespace = ns;
62 | this.setName = setName;
63 | this.binNames = binNames;
64 | this.numrangeBin = numrangeBin;
65 | this.numrangeBegin = numrangeBegin;
66 | this.numrangeEnd = numrangeEnd;
67 | this.scanPercent = scanPercent;
68 | }
69 |
70 | public String getType() {
71 | return type;
72 | }
73 |
74 | public String getNode() {
75 | return node;
76 | }
77 |
78 | public String getHost() {
79 | return host;
80 | }
81 |
82 | public int getPort() {
83 | return port;
84 | }
85 |
86 | public String getNameSpace() {
87 | return namespace;
88 | }
89 |
90 | public String getSetName() {
91 | return setName;
92 | }
93 |
94 | public String[] getBinNames() {
95 | return binNames;
96 | }
97 |
98 | public String getNumRangeBin() {
99 | return numrangeBin;
100 | }
101 |
102 | public long getNumRangeBegin() {
103 | return numrangeBegin;
104 | }
105 |
106 | public long getNumRangeEnd() {
107 | return numrangeEnd;
108 | }
109 |
110 | public int getScanPercent() { return scanPercent; }
111 |
112 | public long getLength() {
113 | return 1;
114 | }
115 |
116 | public String toString() {
117 | return type + ':' + node + ":" + host + ":" + port + ":"
118 | + namespace + ":" + setName;
119 | }
120 |
121 | public void write(DataOutput out) throws IOException {
122 | Text.writeString(out, type);
123 | Text.writeString(out, node);
124 | Text.writeString(out, host);
125 | out.writeInt(port);
126 | Text.writeString(out, namespace);
127 | Text.writeString(out, setName);
128 | if (binNames == null) {
129 | out.writeInt(0);
130 | } else {
131 | out.writeInt(binNames.length);
132 | for (String binName : binNames)
133 | Text.writeString(out, binName);
134 | }
135 | Text.writeString(out, numrangeBin);
136 | out.writeLong(numrangeBegin);
137 | out.writeLong(numrangeEnd);
138 | out.writeInt(scanPercent);
139 | }
140 |
141 | public void readFields(DataInput in) throws IOException {
142 | type = new String(Text.readString(in));
143 | node = new String(Text.readString(in));
144 | host = new String(Text.readString(in));
145 | port = in.readInt();
146 | namespace = new String(Text.readString(in));
147 | setName = new String(Text.readString(in));
148 | int nBinNames = in.readInt();
149 | if (nBinNames == 0) {
150 | binNames = null;
151 | } else {
152 | binNames = new String[nBinNames];
153 | for (int ii = 0; ii < nBinNames; ++ii)
154 | binNames[ii] = new String(Text.readString(in));
155 | }
156 | numrangeBin = new String(Text.readString(in));
157 | numrangeBegin = in.readLong();
158 | numrangeEnd = in.readLong();
159 | scanPercent = in.readInt();
160 | }
161 |
162 | public String[] getLocations() throws IOException {
163 | return new String[]{ host };
164 | }
165 | }
166 |
167 | // Local Variables:
168 | // mode: java
169 | // c-basic-offset: 4
170 | // tab-width: 4
171 | // indent-tabs-mode: nil
172 | // End:
173 | // vim: softtabstop=4:shiftwidth=4:expandtab
174 |
--------------------------------------------------------------------------------
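Each split targets a single Aerospike node, and getLocations() reports that
node's host so the Hadoop scheduler can place the map task on it. A sketch of
a full-scan split (the node name and address here are hypothetical):

    // A "scan" split over all bins (binNames == null), default scan percent.
    AerospikeSplit split = new AerospikeSplit(
        "scan",             // operation type
        "BB9040011AC4202",  // Aerospike node name (hypothetical)
        "10.0.0.1", 3000,   // node host and port (hypothetical)
        "test", "words",    // namespace and set
        null,               // all bins
        "", 0, 0);          // no numrange parameters
    String[] hosts = split.getLocations(); // ["10.0.0.1"]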
/gradlew:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ##############################################################################
4 | ##
5 | ## Gradle start up script for UN*X
6 | ##
7 | ##############################################################################
8 |
9 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
10 | DEFAULT_JVM_OPTS="-Xmx512m"
11 |
12 | APP_NAME="Gradle"
13 | APP_BASE_NAME=`basename "$0"`
14 |
15 | # Use the maximum available, or set MAX_FD != -1 to use that value.
16 | MAX_FD="maximum"
17 |
18 | warn ( ) {
19 | echo "$*"
20 | }
21 |
22 | die ( ) {
23 | echo
24 | echo "$*"
25 | echo
26 | exit 1
27 | }
28 |
29 | # OS specific support (must be 'true' or 'false').
30 | cygwin=false
31 | msys=false
32 | darwin=false
33 | case "`uname`" in
34 | CYGWIN* )
35 | cygwin=true
36 | ;;
37 | Darwin* )
38 | darwin=true
39 | ;;
40 | MINGW* )
41 | msys=true
42 | ;;
43 | esac
44 |
45 | # For Cygwin, ensure paths are in UNIX format before anything is touched.
46 | if $cygwin ; then
47 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
48 | fi
49 |
50 | # Attempt to set APP_HOME
51 | # Resolve links: $0 may be a link
52 | PRG="$0"
53 | # Need this for relative symlinks.
54 | while [ -h "$PRG" ] ; do
55 | ls=`ls -ld "$PRG"`
56 | link=`expr "$ls" : '.*-> \(.*\)$'`
57 | if expr "$link" : '/.*' > /dev/null; then
58 | PRG="$link"
59 | else
60 | PRG=`dirname "$PRG"`"/$link"
61 | fi
62 | done
63 | SAVED="`pwd`"
64 | cd "`dirname \"$PRG\"`/" >&-
65 | APP_HOME="`pwd -P`"
66 | cd "$SAVED" >&-
67 |
68 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
69 |
70 | # Determine the Java command to use to start the JVM.
71 | if [ -n "$JAVA_HOME" ] ; then
72 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
73 | # IBM's JDK on AIX uses strange locations for the executables
74 | JAVACMD="$JAVA_HOME/jre/sh/java"
75 | else
76 | JAVACMD="$JAVA_HOME/bin/java"
77 | fi
78 | if [ ! -x "$JAVACMD" ] ; then
79 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
80 |
81 | Please set the JAVA_HOME variable in your environment to match the
82 | location of your Java installation."
83 | fi
84 | else
85 | JAVACMD="java"
86 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
87 |
88 | Please set the JAVA_HOME variable in your environment to match the
89 | location of your Java installation."
90 | fi
91 |
92 | # Increase the maximum file descriptors if we can.
93 | if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
94 | MAX_FD_LIMIT=`ulimit -H -n`
95 | if [ $? -eq 0 ] ; then
96 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
97 | MAX_FD="$MAX_FD_LIMIT"
98 | fi
99 | ulimit -n $MAX_FD
100 | if [ $? -ne 0 ] ; then
101 | warn "Could not set maximum file descriptor limit: $MAX_FD"
102 | fi
103 | else
104 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
105 | fi
106 | fi
107 |
108 | # For Darwin, add options to specify how the application appears in the dock
109 | if $darwin; then
110 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
111 | fi
112 |
113 | # For Cygwin, switch paths to Windows format before running java
114 | if $cygwin ; then
115 | APP_HOME=`cygpath --path --mixed "$APP_HOME"`
116 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
117 |
118 | # We build the pattern for arguments to be converted via cygpath
119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120 | SEP=""
121 | for dir in $ROOTDIRSRAW ; do
122 | ROOTDIRS="$ROOTDIRS$SEP$dir"
123 | SEP="|"
124 | done
125 | OURCYGPATTERN="(^($ROOTDIRS))"
126 | # Add a user-defined pattern to the cygpath arguments
127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129 | fi
130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
131 | i=0
132 | for arg in "$@" ; do
133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135 |
136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138 | else
139 | eval `echo args$i`="\"$arg\""
140 | fi
141 | i=$((i+1))
142 | done
143 | case $i in
144 | (0) set -- ;;
145 | (1) set -- "$args0" ;;
146 | (2) set -- "$args0" "$args1" ;;
147 | (3) set -- "$args0" "$args1" "$args2" ;;
148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154 | esac
155 | fi
156 |
157 | # Split up the JVM_OPTS and GRADLE_OPTS values into an array, following the shell quoting and substitution rules
158 | function splitJvmOpts() {
159 | JVM_OPTS=("$@")
160 | }
161 | eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
162 | JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
163 |
164 | exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
165 |
--------------------------------------------------------------------------------
/examples/word_count_output/src/main/java/com/aerospike/hadoop/examples/wordcountoutput/WordCountOutput.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.examples.wordcountoutput;
20 |
21 | import java.io.IOException;
22 | import java.util.Iterator;
23 | import java.util.StringTokenizer;
24 |
25 | import org.apache.commons.logging.Log;
26 | import org.apache.commons.logging.LogFactory;
27 | import org.apache.hadoop.conf.Configuration;
28 | import org.apache.hadoop.conf.Configured;
29 | import org.apache.hadoop.fs.Path;
30 | import org.apache.hadoop.io.IntWritable;
31 | import org.apache.hadoop.io.LongWritable;
32 | import org.apache.hadoop.io.Text;
33 | import org.apache.hadoop.mapred.FileInputFormat;
34 | import org.apache.hadoop.mapred.JobClient;
35 | import org.apache.hadoop.mapred.JobConf;
36 | import org.apache.hadoop.mapred.MapReduceBase;
37 | import org.apache.hadoop.mapred.Mapper;
38 | import org.apache.hadoop.mapred.OutputCollector;
39 | import org.apache.hadoop.mapred.RecordWriter;
40 | import org.apache.hadoop.mapred.Reducer;
41 | import org.apache.hadoop.mapred.Reporter;
42 | // These are all needed by MyOutputFormat.
43 | import org.apache.hadoop.util.Progressable;
44 | import org.apache.hadoop.util.Tool;
45 | import org.apache.hadoop.util.ToolRunner;
46 |
47 | import com.aerospike.client.AerospikeClient;
48 | import com.aerospike.client.Bin;
49 | import com.aerospike.client.Key;
50 | import com.aerospike.client.policy.WritePolicy;
51 | import com.aerospike.hadoop.mapreduce.AerospikeOutputFormat;
52 | import com.aerospike.hadoop.mapreduce.AerospikeRecordWriter;
53 |
54 | public class WordCountOutput extends Configured implements Tool {
55 |
56 | private static final Log log = LogFactory.getLog(WordCountOutput.class);
57 |
58 | public static class Map
59 | extends MapReduceBase
60 |         implements Mapper<LongWritable, Text, Text, IntWritable> {
61 | private final static IntWritable one = new IntWritable(1);
62 | private Text word = new Text();
63 |
64 | public void map(LongWritable key, Text value,
65 |                         OutputCollector<Text, IntWritable> output,
66 | Reporter reporter)
67 | throws IOException {
68 | String line = value.toString();
69 | StringTokenizer tokenizer = new StringTokenizer(line);
70 | while (tokenizer.hasMoreTokens()) {
71 | word.set(tokenizer.nextToken());
72 | output.collect(word, one);
73 | }
74 | }
75 | }
76 |
77 | public static class Reduce
78 | extends MapReduceBase
79 |         implements Reducer<Text, IntWritable, Text, IntWritable> {
80 |
81 |         public void reduce(Text key, Iterator<IntWritable> values,
82 |                            OutputCollector<Text, IntWritable> output,
83 | Reporter reporter)
84 | throws IOException {
85 | int sum = 0;
86 | while (values.hasNext()) {
87 | sum += values.next().get();
88 | }
89 | output.collect(key, new IntWritable(sum));
90 | }
91 | }
92 |
93 | public static class MyOutputFormat
94 |         extends AerospikeOutputFormat<Text, IntWritable> {
95 |
96 | public static class MyRecordWriter
97 |             extends AerospikeRecordWriter<Text, IntWritable> {
98 |
99 | public MyRecordWriter(Configuration cfg, Progressable progressable) {
100 | super(cfg);
101 | }
102 |
103 | @Override
104 | public void writeAerospike(Text key,
105 | IntWritable value,
106 | AerospikeClient client,
107 | WritePolicy writePolicy,
108 | String namespace,
109 | String setName) throws IOException {
110 | Key kk = new Key(namespace, setName, key.toString());
111 | Bin bin1 = new Bin("word", key.toString());
112 | Bin bin2 = new Bin("count", value.get());
113 | client.put(writePolicy, kk, bin1, bin2);
114 | }
115 | }
116 |
117 |     public RecordWriter<Text, IntWritable>
118 | getAerospikeRecordWriter(Configuration conf, Progressable prog) {
119 | return new MyRecordWriter(conf, prog);
120 | }
121 | }
122 |
123 | public int run(final String[] args) throws Exception {
124 |
125 | log.info("run starting");
126 |
127 | final Configuration conf = getConf();
128 |
129 | JobConf job = new JobConf(conf, WordCountOutput.class);
130 | job.setJobName("AerospikeWordCountOutput");
131 |
132 | for (int ii = 0; ii < args.length; ++ii) {
133 | FileInputFormat.addInputPath(job, new Path(args[ii]));
134 | }
135 |
136 | job.setMapperClass(Map.class);
137 | job.setCombinerClass(Reduce.class);
138 | job.setReducerClass(Reduce.class);
139 | job.setOutputKeyClass(Text.class);
140 | job.setOutputValueClass(IntWritable.class);
141 |
142 | job.setOutputFormat(MyOutputFormat.class);
143 |
144 | JobClient.runJob(job);
145 |
146 | log.info("finished");
147 | return 0;
148 | }
149 |
150 | public static void main(final String[] args) throws Exception {
151 | System.exit(ToolRunner.run(new WordCountOutput(), args));
152 | }
153 | }
154 |
155 | // Local Variables:
156 | // mode: java
157 | // c-basic-offset: 4
158 | // tab-width: 4
159 | // indent-tabs-mode: nil
160 | // End:
161 | // vim: softtabstop=4:shiftwidth=4:expandtab
162 |
--------------------------------------------------------------------------------
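After this job completes, each distinct word becomes one Aerospike record
keyed by the word itself, with "word" and "count" bins. A quick spot check in
aql, assuming a hypothetical output set named counts in namespace test:

    aql> SELECT word, count FROM test.counts WHERE PK = 'the'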
/mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeOutputFormat.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.mapreduce;
20 |
21 | import java.io.IOException;
22 |
23 | import org.apache.commons.logging.Log;
24 | import org.apache.commons.logging.LogFactory;
25 | import org.apache.hadoop.conf.Configuration;
26 | import org.apache.hadoop.fs.FileSystem;
27 | import org.apache.hadoop.mapreduce.JobContext;
28 | import org.apache.hadoop.mapreduce.OutputCommitter;
29 | import org.apache.hadoop.mapreduce.OutputFormat;
30 | import org.apache.hadoop.mapreduce.RecordWriter;
31 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
32 | import org.apache.hadoop.util.Progressable;
33 |
34 | public abstract class AerospikeOutputFormat<KK, VV>
35 |     extends OutputFormat<KK, VV>
36 |     implements org.apache.hadoop.mapred.OutputFormat<KK, VV> {
37 |
38 | private static final Log log =
39 | LogFactory.getLog(AerospikeOutputFormat.class);
40 |
41 | public static class AerospikeOutputCommitter extends OutputCommitter {
42 |
43 | @Override
44 | public void setupJob(JobContext jobContext)
45 | throws IOException {}
46 |
47 | // compatibility check with Hadoop 0.20.2
48 | @Deprecated
49 | public void cleanupJob(JobContext jobContext)
50 | throws IOException {}
51 |
52 | @Override
53 | public void setupTask(TaskAttemptContext taskContext)
54 | throws IOException {
55 | //no-op
56 | }
57 |
58 | @Override
59 | public boolean needsTaskCommit(TaskAttemptContext taskContext)
60 | throws IOException {
61 | //no-op
62 | return false;
63 | }
64 |
65 | @Override
66 | public void commitTask(TaskAttemptContext taskContext)
67 | throws IOException {
68 | //no-op
69 | }
70 |
71 | @Override
72 | public void abortTask(TaskAttemptContext taskContext)
73 | throws IOException {
74 | //no-op
75 | }
76 |
77 | }
78 |
79 | public static class AerospikeOldAPIOutputCommitter
80 | extends org.apache.hadoop.mapred.OutputCommitter {
81 |
82 | @Override
83 | public void setupJob(org.apache.hadoop.mapred.JobContext jobContext)
84 | throws IOException {
85 | //no-op
86 | }
87 |
88 | @Override
89 | public void setupTask(
90 | org.apache.hadoop.mapred.TaskAttemptContext taskContext)
91 | throws IOException {
92 | //no-op
93 | }
94 |
95 | @Override
96 | public boolean needsTaskCommit(
97 | org.apache.hadoop.mapred.TaskAttemptContext taskContext)
98 | throws IOException {
99 | //no-op
100 | return false;
101 | }
102 |
103 | @Override
104 | public void commitTask(
105 | org.apache.hadoop.mapred.TaskAttemptContext taskContext)
106 | throws IOException {
107 | //no-op
108 | }
109 |
110 | @Override
111 | public void abortTask(
112 | org.apache.hadoop.mapred.TaskAttemptContext taskContext)
113 | throws IOException {
114 | //no-op
115 | }
116 |
117 | @Override
118 | @Deprecated
119 | public void cleanupJob(org.apache.hadoop.mapred.JobContext context)
120 | throws IOException {
121 | // no-op
122 | // added for compatibility with hadoop 0.20.x (used by old
123 | // tools, such as Cascalog)
124 | }
125 | }
126 |
127 |     public abstract org.apache.hadoop.mapred.RecordWriter<KK, VV>
128 | getAerospikeRecordWriter(Configuration conf, Progressable progress);
129 |
130 | //
131 | // new API - just delegates to the Old API
132 | //
133 | @SuppressWarnings("unchecked")
134 | @Override
135 |     public RecordWriter<KK, VV> getRecordWriter(TaskAttemptContext context) {
136 |         Configuration conf = context.getConfiguration();
137 |         return (RecordWriter<KK, VV>) getAerospikeRecordWriter(conf, context);
138 | }
139 |
140 | @Override
141 | public void checkOutputSpecs(JobContext context) throws IOException {
142 |         // Careful: configuration changes made here appear to be discarded.
143 | Configuration cfg = context.getConfiguration();
144 | init(cfg);
145 | }
146 |
147 | @Override
148 | public OutputCommitter getOutputCommitter(TaskAttemptContext context) {
149 | return new AerospikeOutputCommitter();
150 | }
151 |
152 | //
153 | // old API
154 | //
155 | @Deprecated
156 |     public org.apache.hadoop.mapred.RecordWriter<KK, VV>
157 | getRecordWriter(FileSystem ignored,
158 | org.apache.hadoop.mapred.JobConf job,
159 | String name, Progressable progress) {
160 | return getAerospikeRecordWriter(job, progress);
161 | }
162 |
163 | @Deprecated
164 | public void checkOutputSpecs(FileSystem ignored,
165 | org.apache.hadoop.mapred.JobConf cfg)
166 | throws IOException {
167 | init(cfg);
168 | }
169 |
170 | // NB: all changes to the config objects are discarded before the
171 | // job is submitted if _the old MR api_ is used
172 | private void init(Configuration cfg) throws IOException {
173 |         log.info("init");
174 | }
175 | }
176 |
177 | // Local Variables:
178 | // mode: java
179 | // c-basic-offset: 4
180 | // tab-width: 4
181 | // indent-tabs-mode: nil
182 | // End:
183 | // vim: softtabstop=4:shiftwidth=4:expandtab
184 |
--------------------------------------------------------------------------------
/WORLDCUP_FILELIST:
--------------------------------------------------------------------------------
1 | /worldcup/wc_day10_1.log
2 | /worldcup/wc_day11_1.log
3 | /worldcup/wc_day12_1.log
4 | /worldcup/wc_day13_1.log
5 | /worldcup/wc_day14_1.log
6 | /worldcup/wc_day15_1.log
7 | /worldcup/wc_day16_1.log
8 | /worldcup/wc_day17_1.log
9 | /worldcup/wc_day18_1.log
10 | /worldcup/wc_day19_1.log
11 | /worldcup/wc_day20_1.log
12 | /worldcup/wc_day21_1.log
13 | /worldcup/wc_day22_1.log
14 | /worldcup/wc_day23_1.log
15 | /worldcup/wc_day24_1.log
16 | /worldcup/wc_day25_1.log
17 | /worldcup/wc_day26_1.log
18 | /worldcup/wc_day27_1.log
19 | /worldcup/wc_day28_1.log
20 | /worldcup/wc_day29_1.log
21 | /worldcup/wc_day30_1.log
22 | /worldcup/wc_day31_1.log
23 | /worldcup/wc_day32_1.log
24 | /worldcup/wc_day33_1.log
25 | /worldcup/wc_day34_1.log
26 | /worldcup/wc_day35_1.log
27 | /worldcup/wc_day36_1.log
28 | /worldcup/wc_day37_1.log
29 | /worldcup/wc_day38_1.log
30 | /worldcup/wc_day38_2.log
31 | /worldcup/wc_day39_1.log
32 | /worldcup/wc_day39_2.log
33 | /worldcup/wc_day40_1.log
34 | /worldcup/wc_day40_2.log
35 | /worldcup/wc_day41_1.log
36 | /worldcup/wc_day41_2.log
37 | /worldcup/wc_day42_1.log
38 | /worldcup/wc_day43_1.log
39 | /worldcup/wc_day44_1.log
40 | /worldcup/wc_day44_2.log
41 | /worldcup/wc_day44_3.log
42 | /worldcup/wc_day45_1.log
43 | /worldcup/wc_day45_2.log
44 | /worldcup/wc_day45_3.log
45 | /worldcup/wc_day46_1.log
46 | /worldcup/wc_day46_2.log
47 | /worldcup/wc_day46_3.log
48 | /worldcup/wc_day46_4.log
49 | /worldcup/wc_day46_5.log
50 | /worldcup/wc_day46_6.log
51 | /worldcup/wc_day46_7.log
52 | /worldcup/wc_day46_8.log
53 | /worldcup/wc_day47_1.log
54 | /worldcup/wc_day47_2.log
55 | /worldcup/wc_day47_3.log
56 | /worldcup/wc_day47_4.log
57 | /worldcup/wc_day47_5.log
58 | /worldcup/wc_day47_6.log
59 | /worldcup/wc_day47_7.log
60 | /worldcup/wc_day47_8.log
61 | /worldcup/wc_day48_1.log
62 | /worldcup/wc_day48_2.log
63 | /worldcup/wc_day48_3.log
64 | /worldcup/wc_day48_4.log
65 | /worldcup/wc_day48_5.log
66 | /worldcup/wc_day48_6.log
67 | /worldcup/wc_day48_7.log
68 | /worldcup/wc_day49_1.log
69 | /worldcup/wc_day49_2.log
70 | /worldcup/wc_day49_3.log
71 | /worldcup/wc_day49_4.log
72 | /worldcup/wc_day50_1.log
73 | /worldcup/wc_day50_2.log
74 | /worldcup/wc_day50_3.log
75 | /worldcup/wc_day50_4.log
76 | /worldcup/wc_day51_1.log
77 | /worldcup/wc_day51_2.log
78 | /worldcup/wc_day51_3.log
79 | /worldcup/wc_day51_4.log
80 | /worldcup/wc_day51_5.log
81 | /worldcup/wc_day51_6.log
82 | /worldcup/wc_day51_7.log
83 | /worldcup/wc_day51_8.log
84 | /worldcup/wc_day51_9.log
85 | /worldcup/wc_day52_1.log
86 | /worldcup/wc_day52_2.log
87 | /worldcup/wc_day52_3.log
88 | /worldcup/wc_day52_4.log
89 | /worldcup/wc_day52_5.log
90 | /worldcup/wc_day52_6.log
91 | /worldcup/wc_day53_1.log
92 | /worldcup/wc_day53_2.log
93 | /worldcup/wc_day53_3.log
94 | /worldcup/wc_day53_4.log
95 | /worldcup/wc_day53_5.log
96 | /worldcup/wc_day53_6.log
97 | /worldcup/wc_day54_1.log
98 | /worldcup/wc_day54_2.log
99 | /worldcup/wc_day54_3.log
100 | /worldcup/wc_day54_4.log
101 | /worldcup/wc_day54_5.log
102 | /worldcup/wc_day54_6.log
103 | /worldcup/wc_day55_1.log
104 | /worldcup/wc_day55_2.log
105 | /worldcup/wc_day55_3.log
106 | /worldcup/wc_day55_4.log
107 | /worldcup/wc_day55_5.log
108 | /worldcup/wc_day56_1.log
109 | /worldcup/wc_day56_2.log
110 | /worldcup/wc_day56_3.log
111 | /worldcup/wc_day57_1.log
112 | /worldcup/wc_day57_2.log
113 | /worldcup/wc_day57_3.log
114 | /worldcup/wc_day58_1.log
115 | /worldcup/wc_day58_2.log
116 | /worldcup/wc_day58_3.log
117 | /worldcup/wc_day58_4.log
118 | /worldcup/wc_day58_5.log
119 | /worldcup/wc_day58_6.log
120 | /worldcup/wc_day59_1.log
121 | /worldcup/wc_day59_2.log
122 | /worldcup/wc_day59_3.log
123 | /worldcup/wc_day59_4.log
124 | /worldcup/wc_day59_5.log
125 | /worldcup/wc_day59_6.log
126 | /worldcup/wc_day59_7.log
127 | /worldcup/wc_day5_1.log
128 | /worldcup/wc_day60_1.log
129 | /worldcup/wc_day60_2.log
130 | /worldcup/wc_day60_3.log
131 | /worldcup/wc_day60_4.log
132 | /worldcup/wc_day60_5.log
133 | /worldcup/wc_day60_6.log
134 | /worldcup/wc_day60_7.log
135 | /worldcup/wc_day61_1.log
136 | /worldcup/wc_day61_2.log
137 | /worldcup/wc_day61_3.log
138 | /worldcup/wc_day61_4.log
139 | /worldcup/wc_day61_5.log
140 | /worldcup/wc_day61_6.log
141 | /worldcup/wc_day61_7.log
142 | /worldcup/wc_day61_8.log
143 | /worldcup/wc_day62_1.log
144 | /worldcup/wc_day62_10.log
145 | /worldcup/wc_day62_2.log
146 | /worldcup/wc_day62_3.log
147 | /worldcup/wc_day62_4.log
148 | /worldcup/wc_day62_5.log
149 | /worldcup/wc_day62_6.log
150 | /worldcup/wc_day62_7.log
151 | /worldcup/wc_day62_8.log
152 | /worldcup/wc_day62_9.log
153 | /worldcup/wc_day63_1.log
154 | /worldcup/wc_day63_2.log
155 | /worldcup/wc_day63_3.log
156 | /worldcup/wc_day63_4.log
157 | /worldcup/wc_day64_1.log
158 | /worldcup/wc_day64_2.log
159 | /worldcup/wc_day64_3.log
160 | /worldcup/wc_day65_1.log
161 | /worldcup/wc_day65_2.log
162 | /worldcup/wc_day65_3.log
163 | /worldcup/wc_day65_4.log
164 | /worldcup/wc_day65_5.log
165 | /worldcup/wc_day65_6.log
166 | /worldcup/wc_day65_7.log
167 | /worldcup/wc_day65_8.log
168 | /worldcup/wc_day65_9.log
169 | /worldcup/wc_day66_1.log
170 | /worldcup/wc_day66_10.log
171 | /worldcup/wc_day66_11.log
172 | /worldcup/wc_day66_2.log
173 | /worldcup/wc_day66_3.log
174 | /worldcup/wc_day66_4.log
175 | /worldcup/wc_day66_5.log
176 | /worldcup/wc_day66_6.log
177 | /worldcup/wc_day66_7.log
178 | /worldcup/wc_day66_8.log
179 | /worldcup/wc_day66_9.log
180 | /worldcup/wc_day67_1.log
181 | /worldcup/wc_day67_2.log
182 | /worldcup/wc_day67_3.log
183 | /worldcup/wc_day67_4.log
184 | /worldcup/wc_day67_5.log
185 | /worldcup/wc_day68_1.log
186 | /worldcup/wc_day68_2.log
187 | /worldcup/wc_day68_3.log
188 | /worldcup/wc_day69_1.log
189 | /worldcup/wc_day69_2.log
190 | /worldcup/wc_day69_3.log
191 | /worldcup/wc_day69_4.log
192 | /worldcup/wc_day69_5.log
193 | /worldcup/wc_day69_6.log
194 | /worldcup/wc_day69_7.log
195 | /worldcup/wc_day6_1.log
196 | /worldcup/wc_day70_1.log
197 | /worldcup/wc_day70_2.log
198 | /worldcup/wc_day70_3.log
199 | /worldcup/wc_day71_1.log
200 | /worldcup/wc_day71_2.log
201 | /worldcup/wc_day72_1.log
202 | /worldcup/wc_day72_2.log
203 | /worldcup/wc_day72_3.log
204 | /worldcup/wc_day73_1.log
205 | /worldcup/wc_day73_2.log
206 | /worldcup/wc_day73_3.log
207 | /worldcup/wc_day73_4.log
208 | /worldcup/wc_day73_5.log
209 | /worldcup/wc_day73_6.log
210 | /worldcup/wc_day74_1.log
211 | /worldcup/wc_day74_2.log
212 | /worldcup/wc_day74_3.log
213 | /worldcup/wc_day74_4.log
214 | /worldcup/wc_day74_5.log
215 | /worldcup/wc_day74_6.log
216 | /worldcup/wc_day75_1.log
217 | /worldcup/wc_day75_2.log
218 | /worldcup/wc_day75_3.log
219 | /worldcup/wc_day76_1.log
220 | /worldcup/wc_day76_2.log
221 | /worldcup/wc_day77_1.log
222 | /worldcup/wc_day77_2.log
223 | /worldcup/wc_day78_1.log
224 | /worldcup/wc_day78_2.log
225 | /worldcup/wc_day79_1.log
226 | /worldcup/wc_day79_2.log
227 | /worldcup/wc_day79_3.log
228 | /worldcup/wc_day79_4.log
229 | /worldcup/wc_day7_1.log
230 | /worldcup/wc_day80_1.log
231 | /worldcup/wc_day80_2.log
232 | /worldcup/wc_day81_1.log
233 | /worldcup/wc_day82_1.log
234 | /worldcup/wc_day83_1.log
235 | /worldcup/wc_day84_1.log
236 | /worldcup/wc_day85_1.log
237 | /worldcup/wc_day86_1.log
238 | /worldcup/wc_day87_1.log
239 | /worldcup/wc_day88_1.log
240 | /worldcup/wc_day89_1.log
241 | /worldcup/wc_day8_1.log
242 | /worldcup/wc_day90_1.log
243 | /worldcup/wc_day91_1.log
244 | /worldcup/wc_day92_1.log
245 | /worldcup/wc_day9_1.log
246 |
--------------------------------------------------------------------------------
/mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeInputFormat.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.mapreduce;
20 |
21 | import java.io.IOException;
22 | import java.net.InetAddress;
23 | import java.net.UnknownHostException;
24 | import java.util.ArrayList;
25 | import java.util.Arrays;
26 | import java.util.List;
27 |
28 | import org.apache.commons.logging.Log;
29 | import org.apache.commons.logging.LogFactory;
30 | import org.apache.hadoop.conf.Configuration;
31 | import org.apache.hadoop.mapred.JobConf;
32 | import org.apache.hadoop.mapred.Reporter;
33 | import org.apache.hadoop.mapreduce.InputFormat;
34 | import org.apache.hadoop.mapreduce.InputSplit;
35 | import org.apache.hadoop.mapreduce.JobContext;
36 | import org.apache.hadoop.mapreduce.RecordReader;
37 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
38 |
39 | import com.aerospike.client.AerospikeClient;
40 | import com.aerospike.client.AerospikeException;
41 | import com.aerospike.client.Host;
42 | import com.aerospike.client.cluster.Node;
43 | import com.aerospike.client.policy.ClientPolicy;
44 |
45 | /**
46 | * An {@link InputFormat} for data stored in an Aerospike database.
47 | */
48 | public class AerospikeInputFormat
49 |     extends InputFormat<AerospikeKey, AerospikeRecord>
50 |     implements org.apache.hadoop.mapred.InputFormat<AerospikeKey,
51 |                                                      AerospikeRecord> {
52 |
53 | private static final Log log =
54 | LogFactory.getLog(AerospikeInputFormat.class);
55 |
56 | // ---------------- NEW API ----------------
57 |
58 |     public List<InputSplit> getSplits(JobContext context) throws IOException {
59 | // Delegate to the old API.
60 | Configuration cfg = context.getConfiguration();
61 | JobConf jobconf = AerospikeConfigUtil.asJobConf(cfg);
62 | return Arrays.asList((InputSplit[]) getSplits(jobconf,
63 | jobconf.getNumMapTasks()));
64 | }
65 |
66 |     public RecordReader<AerospikeKey, AerospikeRecord>
67 | createRecordReader(InputSplit split, TaskAttemptContext context)
68 | throws IOException, InterruptedException {
69 | return new AerospikeRecordReader();
70 | }
71 |
72 | // ---------------- OLD API ----------------
73 |
74 | public org.apache.hadoop.mapred.InputSplit[]
75 | getSplits(JobConf job, int numSplits) throws IOException {
76 | try {
77 |
78 | String oper = AerospikeConfigUtil.getInputOperation(job);
79 | String host = AerospikeConfigUtil.getInputHost(job);
80 | int port = AerospikeConfigUtil.getInputPort(job);
81 | String namespace = AerospikeConfigUtil.getInputNamespace(job);
82 | String setName = AerospikeConfigUtil.getInputSetName(job);
83 | String[] binNames = AerospikeConfigUtil.getInputBinNames(job);
84 | String numrangeBin = "";
85 | long numrangeBegin = 0;
86 | long numrangeEnd = 0;
87 | int scanPercent = 100;
88 | if (oper.equals("numrange")) {
89 | numrangeBin = AerospikeConfigUtil.getInputNumRangeBin(job);
90 | numrangeBegin = AerospikeConfigUtil.getInputNumRangeBegin(job);
91 | numrangeEnd = AerospikeConfigUtil.getInputNumRangeEnd(job);
92 | } else if (oper.equals("scan")) {
93 | scanPercent = AerospikeConfigUtil.getInputScanPercent(job);
94 | }
95 |
96 | log.info(String.format("using: %s %d %s %s",
97 | host, port, namespace, setName));
98 |
99 | AerospikeClient client =
100 | AerospikeClientSingleton.getInstance(new ClientPolicy(),
101 | host, port);
102 | Node[] nodes = client.getNodes();
103 | int nsplits = nodes.length;
104 | if (nsplits == 0) {
105 | throw new IOException("no Aerospike nodes found");
106 | }
107 | log.info(String.format("found %d nodes", nsplits));
108 | AerospikeSplit[] splits = new AerospikeSplit[nsplits];
109 | for (int ii = 0; ii < nsplits; ii++) {
110 | Node node = nodes[ii];
111 | String nodeName = node.getName();
112 |
113 | // We want to avoid 127.0.0.1 as a hostname
114 | // because this value will be transferred to a
115 | // different hadoop node to be processed.
116 | //
117 |             List<Host> aliases = getAliases(node.getHost());
118 | Host nodehost = aliases.get(0);
119 | if (aliases.size() > 1) {
120 | for (Host a : aliases) {
121 | if (!a.name.equals("127.0.0.1")) {
122 | nodehost = a;
123 | break;
124 | }
125 | }
126 | }
127 | splits[ii] = new AerospikeSplit(oper, nodeName,
128 | nodehost.name, nodehost.port,
129 | namespace, setName, binNames,
130 | numrangeBin, numrangeBegin,
131 | numrangeEnd, scanPercent);
132 | log.info("split: " + splits[ii]);
133 | }
134 | return splits;
135 | }
136 | catch (Exception ex) {
137 | throw new IOException("exception in getSplits", ex);
138 | }
139 | }
140 |
141 |     public org.apache.hadoop.mapred.RecordReader<AerospikeKey, AerospikeRecord>
142 | getRecordReader(org.apache.hadoop.mapred.InputSplit split,
143 | JobConf job,
144 | Reporter reporter
145 | ) throws IOException {
146 | return new AerospikeRecordReader((AerospikeSplit) split);
147 | }
148 |
149 |     private List<Host> getAliases(Host host) {
150 | InetAddress[] addresses;
151 |
152 | try {
153 | addresses = InetAddress.getAllByName(host.name);
154 | }
155 | catch (UnknownHostException uhe) {
156 | throw new AerospikeException.Connection("Invalid host: " + host);
157 | }
158 |
159 | if (addresses.length == 0) {
160 | throw new AerospikeException.Connection("Failed to find addresses for " + host);
161 | }
162 |
163 | // Add capacity for current address aliases plus IPV6 address and hostname.
164 |         List<Host> aliases = new ArrayList<Host>(addresses.length + 2);
165 |
166 | for (InetAddress address : addresses) {
167 | aliases.add(new Host(address.getHostAddress(), host.tlsName, host.port));
168 | }
169 |
170 | return aliases;
171 | }
172 |
173 | }
174 |
175 | // Local Variables:
176 | // mode: java
177 | // c-basic-offset: 4
178 | // tab-width: 4
179 | // indent-tabs-mode: nil
180 | // End:
181 | // vim: softtabstop=4:shiftwidth=4:expandtab
182 |
--------------------------------------------------------------------------------
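The numrange operation is served by a secondary-index query, so the range bin
must be indexed before the job runs. SampleData's seq-int mode creates the
index itself; to create it by hand in aql (set name matches the README's
integers example):

    aql> CREATE INDEX bin1ndx ON test.integers (bin1) NUMERIC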
/examples/generate_profiles/src/main/java/com/aerospike/hadoop/examples/generateprofiles/GenerateProfiles.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.examples.generateprofiles;
20 |
21 | import java.io.DataInput;
22 | import java.io.DataOutput;
23 | import java.io.IOException;
24 | import java.util.Iterator;
25 | import java.util.regex.Matcher;
26 | import java.util.regex.Pattern;
27 |
28 | import org.apache.commons.logging.Log;
29 | import org.apache.commons.logging.LogFactory;
30 | import org.apache.hadoop.conf.Configuration;
31 | import org.apache.hadoop.conf.Configured;
32 | import org.apache.hadoop.fs.Path;
33 | import org.apache.hadoop.io.IntWritable;
34 | import org.apache.hadoop.io.LongWritable;
35 | import org.apache.hadoop.io.Text;
36 | import org.apache.hadoop.io.Writable;
37 | import org.apache.hadoop.mapred.FileInputFormat;
38 | import org.apache.hadoop.mapred.JobClient;
39 | import org.apache.hadoop.mapred.JobConf;
40 | import org.apache.hadoop.mapred.MapReduceBase;
41 | import org.apache.hadoop.mapred.Mapper;
42 | import org.apache.hadoop.mapred.OutputCollector;
43 | import org.apache.hadoop.mapred.RecordWriter;
44 | import org.apache.hadoop.mapred.Reducer;
45 | import org.apache.hadoop.mapred.Reporter;
46 | import org.apache.hadoop.util.Progressable;
47 | import org.apache.hadoop.util.Tool;
48 | import org.apache.hadoop.util.ToolRunner;
49 |
50 | import com.aerospike.client.AerospikeClient;
51 | import com.aerospike.client.Bin;
52 | import com.aerospike.client.Key;
53 | import com.aerospike.client.policy.WritePolicy;
54 | import com.aerospike.hadoop.mapreduce.AerospikeOutputFormat;
55 | import com.aerospike.hadoop.mapreduce.AerospikeRecordWriter;
56 |
57 | public class GenerateProfiles extends Configured implements Tool {
58 |
59 | private static final Log log = LogFactory.getLog(GenerateProfiles.class);
60 |
61 | // Sample line format:
62 | // 37518 - - [16/Jun/1998:02:48:36 +0000] \
63 | // "GET /images/hm_hola.gif HTTP/1.0" 200 2240
64 |
65 | private static final String logEntryRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\S+)";
66 | private static final Pattern pat = Pattern.compile(logEntryRegex);
67 |
68 | private final static IntWritable one = new IntWritable(1);
69 |
70 | public static class Map extends MapReduceBase implements
71 |         Mapper<LongWritable, Text, LongWritable, IntWritable> {
72 |
73 | int mapcount = 0;
74 |
75 | public void map(LongWritable key,
76 | Text rec,
77 |                     OutputCollector<LongWritable, IntWritable> output,
78 | Reporter reporter) throws IOException {
79 | try {
80 | String line = rec.toString();
81 | Matcher matcher = pat.matcher(line);
82 | if (!matcher.matches() || 7 != matcher.groupCount()) {
83 | throw new RuntimeException("match failed on: " + line);
84 | }
85 | long userid = Long.parseLong(matcher.group(1));
86 | output.collect(new LongWritable(userid), one);
87 | }
88 | catch (Exception ex) {
89 | // log.error("exception in map", ex);
90 | }
91 | }
92 | }
93 |
94 | private static class Profile implements Writable {
95 | public long userid;
96 | public int age;
97 | public int isMale;
98 |
99 | public Profile(long userid, int age, int isMale) {
100 | this.userid = userid;
101 | this.age = age;
102 | this.isMale = isMale;
103 | }
104 |
105 | public void readFields(DataInput in) throws IOException {
106 | userid = in.readLong();
107 | age = in.readInt();
108 | isMale = in.readInt();
109 | }
110 |
111 | public void write(DataOutput out) throws IOException {
112 | out.writeLong(userid);
113 | out.writeInt(age);
114 | out.writeInt(isMale);
115 | }
116 | }
117 |
118 | public static class Reduce
119 | extends MapReduceBase
120 |         implements Reducer<LongWritable, IntWritable, LongWritable, Profile> {
121 |
122 | public void reduce(LongWritable userid,
123 |                            Iterator<IntWritable> ones,
124 |                            OutputCollector<LongWritable, Profile> output,
125 | Reporter reporter
126 | ) throws IOException {
127 |
128 | // Fake age based on userid.
129 | int age = ((int) userid.get() % 40) + 20;
130 |
131 | // Fake gender based on userid.
132 | int isMale = (int) userid.get() % 2;
133 |
134 | Profile profile = new Profile(userid.get(), age, isMale);
135 | output.collect(userid, profile);
136 | }
137 | }
138 |
139 | public static class ProfileOutputFormat
140 |         extends AerospikeOutputFormat<LongWritable, Profile> {
141 |
142 | public static class ProfileRecordWriter
143 |             extends AerospikeRecordWriter<LongWritable, Profile> {
144 |
145 | public ProfileRecordWriter(Configuration cfg,
146 | Progressable progressable) {
147 | super(cfg);
148 | }
149 |
150 | @Override
151 | public void writeAerospike(LongWritable userid,
152 | Profile profile,
153 | AerospikeClient client,
154 | WritePolicy writePolicy,
155 | String namespace,
156 | String setName) throws IOException {
157 | writePolicy.totalTimeout = 10000;
158 | Key kk = new Key(namespace, setName, userid.get());
159 | Bin bin0 = new Bin("userid", profile.userid);
160 | Bin bin1 = new Bin("age", profile.age);
161 | Bin bin2 = new Bin("isMale", profile.isMale);
162 | client.put(writePolicy, kk, bin0, bin1, bin2);
163 | }
164 | }
165 |
166 |     public RecordWriter<LongWritable, Profile>
167 | getAerospikeRecordWriter(Configuration conf, Progressable prog) {
168 | return new ProfileRecordWriter(conf, prog);
169 | }
170 | }
171 |
172 | public int run(final String[] args) throws Exception {
173 |
174 | log.info("run starting");
175 |
176 | final Configuration conf = getConf();
177 |
178 | JobConf job = new JobConf(conf, GenerateProfiles.class);
179 | job.setJobName("AerospikeGenerateProfiles");
180 |
181 | job.setMapperClass(Map.class);
182 | job.setMapOutputKeyClass(LongWritable.class);
183 | job.setMapOutputValueClass(IntWritable.class);
184 | // job.setCombinerClass(Reduce.class); // Reduce changes format.
185 | job.setReducerClass(Reduce.class);
186 |         job.setOutputKeyClass(LongWritable.class);
187 | job.setOutputValueClass(Profile.class);
188 |
189 | job.setOutputFormat(ProfileOutputFormat.class);
190 |
191 | for (int ii = 0; ii < args.length; ++ii)
192 | FileInputFormat.addInputPath(job, new Path(args[ii]));
193 |
194 | JobClient.runJob(job);
195 |
196 | log.info("finished");
197 | return 0;
198 | }
199 |
200 | public static void main(final String[] args) throws Exception {
201 | System.exit(ToolRunner.run(new GenerateProfiles(), args));
202 | }
203 | }
204 |
205 | // Local Variables:
206 | // mode: java
207 | // c-basic-offset: 4
208 | // tab-width: 4
209 | // indent-tabs-mode: nil
210 | // End:
211 | // vim: softtabstop=4:shiftwidth=4:expandtab
212 |
--------------------------------------------------------------------------------
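Each reduced profile lands in Aerospike as one record keyed by the numeric
userid, with "userid", "age" and "isMale" bins. A spot check in aql, assuming
a hypothetical output set named profiles (37518 is the userid from the sample
log line in the source above):

    aql> SELECT userid, age, isMale FROM test.profiles WHERE PK = 37518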
/README.md:
--------------------------------------------------------------------------------
1 | # Aerospike Hadoop Connector
2 |
3 | This repository contains AerospikeInputFormat.java and
4 | AerospikeOutputFormat.java, and several examples of processing using
5 | Hadoop.
6 |
7 | The system allows placing Hadoop worker nodes on Aerospike servers. By
8 | default, the AerospikeInputFormat creates one split per node in the
9 | cluster, avoiding network traffic. The input format also supports
10 | using secondary indexes, thus pulling only a subset of the records in
11 | the Aerospike database.
12 |
13 | Both new and old Hadoop interfaces are supported, and there are
14 | examples for both.
15 |
16 | When using the AerospikeOutputFormat, the Aerospike cluster will
17 | typically sit outside the Hadoop worker nodes. This makes the Hadoop
18 | output immediately usable by your application.
19 |
20 | Check out the examples. The classic word count examples are included -
21 | for both input and output. The "aggregate int example" uses a
22 | secondary index to pull data from Aerospike, and runs the InputFormat
23 | on the local node if available.
24 |
25 | The most interesting example is likely the session rollup example. In
26 | this example, the session management state is output to Aerospike as
27 | the sessions are found.
28 |
29 | See the [Wiki](https://github.com/aerospike-community/aerospike-hadoop/wiki) for more.
30 |
31 | Install Hadoop
32 | ----------------------------------------------------------------
33 | Examples below are tested with Aerospike Java Client (version: 4.2.2) and Hadoop (version: 2.7.2)
34 |
35 | Hadoop installation guide [link](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html)
36 |
37 | Then set up environment variable:
38 | ----------------------------------------------------------------
39 |
40 | Hadoop Installation Directory:
41 | export HADOOP_PREFIX=/usr/local/hadoop
42 |
43 | Development Directory:
44 | export AEROSPIKE_HADOOP=~/aerospike/aerospike-hadoop
45 |
46 |
47 | Build w/ Gradle
48 | ----------------------------------------------------------------
49 |
50 | cd ${AEROSPIKE_HADOOP}
51 |
52 | # Build the mapreduce input and output connectors.
53 | ./gradlew :mapreduce:jar
54 |
55 | # Build the example programs.
56 | ./gradlew :sampledata:installApp
57 | ./gradlew :examples:word_count_input:installApp
58 | ./gradlew :examples:aggregate_int_input:installApp
59 | ./gradlew :examples:word_count_output:installApp
60 | ./gradlew :examples:session_rollup:installApp
61 | ./gradlew :examples:generate_profiles:installApp
62 | ./gradlew :examples:external_join:installApp
63 |
64 |
65 |
66 | Setup Target Input Text File
67 | ----------------------------------------------------------------
68 |
69 | # Make a copy of /var/log/messages
70 | sudo cp /var/log/messages /tmp/input
71 | sudo chown $USER:$USER /tmp/input
72 | chmod 644 /tmp/input
73 |
74 |
75 | Start Aerospike
76 | ----------------------------------------------------------------
77 |
78 | sudo /etc/init.d/aerospike start
79 |
80 |
81 | Setup Sample Data in Aerospike for Input Examples
82 | ----------------------------------------------------------------
83 |
84 | cd ${AEROSPIKE_HADOOP}/sampledata
85 |
86 | # Loads a text file for word_count_input demo.
87 | java -jar build/libs/sampledata.jar \
88 | localhost:3000:test:words:bin1 \
89 | text-file \
90 | /tmp/input
91 |
92 | # Generates sequential integers for aggregate_int_input demo.
93 | java -jar build/libs/sampledata.jar \
94 | localhost:3000:test:integers:bin1 seq-int 0 100000
95 |
96 |
97 | Run Input Examples
98 | ----------------------------------------------------------------
99 |
100 | export HADOOP_PREFIX=/usr/local/hadoop
101 |
102 | cd ${AEROSPIKE_HADOOP}
103 |
104 | # Format HDFS
105 | rm -rf /tmp/hadoop-$USER/dfs/data
106 | $HADOOP_PREFIX/bin/hdfs namenode -format
107 |
108 | # Start HDFS
109 | $HADOOP_PREFIX/sbin/start-dfs.sh
110 |
111 | # Check for {Secondary,}NameNode and DataNode
112 | jps
113 |
114 | # Make some directories
115 | $HADOOP_PREFIX/bin/hdfs dfs -mkdir /tmp
116 |
117 | # Run the Hadoop job.
118 | cd ${AEROSPIKE_HADOOP}
119 |
120 | # Run the word_count_input example (Old Hadoop API)
121 | $HADOOP_PREFIX/bin/hdfs dfs -rm -r /tmp/output
122 | $HADOOP_PREFIX/bin/hadoop \
123 | jar \
124 | ./examples/word_count_input/build/libs/word_count_input.jar \
125 | -D aerospike.input.namespace=test \
126 | -D aerospike.input.setname=words \
127 | -D aerospike.input.operation=scan \
128 | /tmp/output
129 |
130 | # Jump to "Inspect the results" below ...
131 |
132 | # -- OR --
133 |
134 | # Run the aggregate_int_input scan example (New Hadoop API)
135 | $HADOOP_PREFIX/bin/hdfs dfs -rm -r /tmp/output
136 | $HADOOP_PREFIX/bin/hadoop \
137 | jar \
138 | ./examples/aggregate_int_input/build/libs/aggregate_int_input.jar \
139 | -D aerospike.input.namespace=test \
140 | -D aerospike.input.setname=integers \
141 | -D aerospike.input.binnames=bin1 \
142 | -D aerospike.input.operation=scan \
143 | /tmp/output
144 |
145 | # Jump to "Inspect the results" below ...
146 |
147 | # -- OR --
148 |
149 | # Run the aggregate_int_input range example (New Hadoop API)
150 | $HADOOP_PREFIX/bin/hdfs dfs -rm -r /tmp/output
151 | $HADOOP_PREFIX/bin/hadoop \
152 | jar \
153 | ./examples/aggregate_int_input/build/libs/aggregate_int_input.jar \
154 | -D aerospike.input.namespace=test \
155 | -D aerospike.input.setname=integers \
156 | -D aerospike.input.binnames=bin1,bin2 \
157 | -D aerospike.input.operation=numrange \
158 | -D aerospike.input.numrange.bin=bin1 \
159 | -D aerospike.input.numrange.begin=100 \
160 | -D aerospike.input.numrange.end=200 \
161 | /tmp/output
162 |
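# Note: numrange queries a secondary index on the range bin, so one
# may need to be created first (the index name is illustrative):
aql -c 'CREATE INDEX bin1ndx ON test.integers (bin1) NUMERIC'
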
163 | # Inspect the results.
164 | $HADOOP_PREFIX/bin/hadoop fs -ls /tmp/output
165 | rm -rf /tmp/output  # remove any stale local copy first
166 | $HADOOP_PREFIX/bin/hadoop fs -copyToLocal /tmp/output /tmp
167 | less /tmp/output/part*00000
168 |
169 |
170 | Setup Sample Data in HDFS for Output Examples
171 | ----------------------------------------------------------------
172 |
173 | export HADOOP_PREFIX=/usr/local/hadoop
174 |
175 | # Create a directory.
176 | $HADOOP_PREFIX/bin/hdfs dfs -mkdir /tmp
177 |
178 | # Load the test words into HDFS.
179 | $HADOOP_PREFIX/bin/hdfs dfs -rm /tmp/words
180 | $HADOOP_PREFIX/bin/hadoop fs -copyFromLocal /tmp/input /tmp/words
181 |
182 | # Load the World Cup log data into HDFS
183 | $HADOOP_PREFIX/bin/hdfs dfs -rm -r /worldcup
184 | $HADOOP_PREFIX/bin/hdfs dfs -mkdir /worldcup
185 | $HADOOP_PREFIX/bin/hadoop fs -copyFromLocal \
186 |     data/worldcup \
187 | /worldcup/access.log
188 |
189 | # Create the secondary indexes in Aerospike.
190 | aql -c 'CREATE INDEX useridndx ON test.sessions (userid) NUMERIC'
191 | aql -c 'CREATE INDEX startndx ON test.sessions (start) NUMERIC'
192 |
193 |
194 | Run Output Examples
195 | ----------------------------------------------------------------
196 |
197 | # Run the Hadoop job.
198 | cd ${AEROSPIKE_HADOOP}
199 |
200 | # Run the word_count_output example (Old Hadoop API)
201 | $HADOOP_PREFIX/bin/hadoop \
202 | jar \
203 | ./examples/word_count_output/build/libs/word_count_output.jar \
204 | -D aerospike.output.namespace=test \
205 | -D aerospike.output.setname=counts \
206 | /tmp/words
207 |
208 | # Inspect the results:
209 | aql -c 'SELECT * FROM test.counts'
210 |
211 | # -- OR --
212 |
213 | # Run the session_rollup example (Old Hadoop API, small dataset)
214 | $HADOOP_PREFIX/bin/hadoop \
215 | jar \
216 | ./examples/session_rollup/build/libs/session_rollup.jar \
217 | -D aerospike.output.namespace=test \
218 | -D aerospike.output.setname=sessions \
219 | -D mapred.reduce.tasks=30 \
220 | /worldcup/access.log
221 |
222 | # Inspect the results:
223 | aql -c 'SELECT * FROM test.sessions'
224 |
225 | # -- OR --
226 |
227 | # Run generate_profiles to build sample data for external_join.
228 | $HADOOP_PREFIX/bin/hadoop \
229 | jar \
230 | ./examples/generate_profiles/build/libs/generate_profiles.jar \
231 | -D aerospike.output.namespace=test \
232 | -D aerospike.output.setname=profiles \
233 | -D mapred.reduce.tasks=30 \
234 | /worldcup/access.log
235 |
236 | # Inspect the results:
237 | aql -c 'SELECT * FROM test.profiles'
238 |
239 | # -- AND --
240 |
241 | # Run the external_join example (Old Hadoop API, small dataset)
242 | $HADOOP_PREFIX/bin/hadoop \
243 | jar \
244 | ./examples/external_join/build/libs/external_join.jar \
245 | -D aerospike.input.namespace=test \
246 | -D aerospike.input.setname=profiles \
247 | -D aerospike.output.namespace=test \
248 | -D aerospike.output.setname=sessions2 \
249 | -D mapred.reduce.tasks=30 \
250 | /worldcup/access.log
251 |
252 | # Inspect the results:
253 | aql -c 'SELECT * FROM test.sessions2'
255 |
256 | Done with HDFS
257 | ----------------------------------------------------------------
258 |
259 | # Stop HDFS
260 | $HADOOP_PREFIX/sbin/stop-dfs.sh
261 |
262 |
--------------------------------------------------------------------------------
/examples/spark_session_rollup/src/main/java/com/aerospike/spark/examples/SparkSessionRollup.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.spark.examples;
20 |
21 | import java.io.IOException;
22 | import java.nio.ByteBuffer;
23 | import java.security.MessageDigest;
24 | import java.security.NoSuchAlgorithmException;
25 | import java.text.ParsePosition;
26 | import java.text.SimpleDateFormat;
27 | import java.util.ArrayList;
28 | import java.util.Collections;
29 | import java.util.Date;
30 | import java.util.Iterator;
31 | import java.util.List;
32 | import java.util.regex.Matcher;
33 | import java.util.regex.Pattern;
34 |
35 | import org.apache.commons.codec.binary.Hex;
36 | import org.apache.hadoop.conf.Configuration;
37 | import org.apache.hadoop.mapred.JobConf;
38 | import org.apache.hadoop.mapred.RecordWriter;
39 | import org.apache.hadoop.util.Progressable;
40 | import org.apache.spark.SparkConf;
41 | import org.apache.spark.api.java.JavaPairRDD;
42 | import org.apache.spark.api.java.JavaRDD;
43 | import org.apache.spark.api.java.JavaSparkContext;
44 | import org.apache.spark.api.java.function.PairFlatMapFunction;
45 | import org.apache.spark.api.java.function.PairFunction;
46 |
47 | import com.aerospike.client.AerospikeClient;
48 | import com.aerospike.client.Bin;
49 | import com.aerospike.client.Key;
50 | import com.aerospike.client.policy.WritePolicy;
51 | import com.aerospike.hadoop.mapreduce.AerospikeConfigUtil;
52 | import com.aerospike.hadoop.mapreduce.AerospikeLogger;
53 | import com.aerospike.hadoop.mapreduce.AerospikeOutputFormat;
54 | import com.aerospike.hadoop.mapreduce.AerospikeRecordWriter;
55 |
56 | import scala.Tuple2;
57 |
58 | public class SparkSessionRollup {
59 |
60 | public static final String appName = "spark_session_rollup";
61 | public static final String master = "spark://as0:7077";
62 |
63 | public static class ExtractHits
64 | implements PairFunction<String, Long, Long> {
65 | private static final long serialVersionUID = 1L;
66 |
67 | // Sample line format:
68 | // 37518 - - [16/Jun/1998:02:48:36 +0000] "GET /images/hm_hola.gif HTTP/1.0" 200 2240
69 | final String logEntryRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\S+)";
70 | final Pattern pat = Pattern.compile(logEntryRegex);
71 |
72 | final SimpleDateFormat dateTimeParser =
73 | new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z");
74 |
75 | public Tuple2<Long, Long> call(String line) {
76 |
77 | Matcher matcher = pat.matcher(line);
78 | if (!matcher.matches() || 7 != matcher.groupCount())
79 | return new Tuple2<Long, Long>(0L, 0L);
80 |
81 | long userid = Long.parseLong(matcher.group(1));
82 | String tstamp = matcher.group(4);
83 | ParsePosition pos = new ParsePosition(0);
84 | Date date = dateTimeParser.parse(tstamp, pos);
85 | long msec = date.getTime();
86 |
87 | return new Tuple2<Long, Long>(userid, msec);
88 | }
89 | }
90 |
91 | private static class Session {
92 | public long userid;
93 | public long start;
94 | public long end;
95 | public int nhits;
96 |
97 | public Session(long userid, long start, long end, int nhits) {
98 | this.userid = userid;
99 | this.start = start;
100 | this.end = end;
101 | this.nhits = nhits;
102 | }
103 | }
104 |
105 | public static class FindSessions
106 | implements PairFlatMapFunction<Tuple2<Long, Iterable<Long>>,
107 | String, Session> {
108 | private static final long serialVersionUID = 1L;
109 |
110 | private static final long SESSION_GAP_MSEC = 20 * 60 * 1000;
111 |
112 | public Iterator<Tuple2<String, Session>>
113 | call(Tuple2<Long, Iterable<Long>> tup) {
114 |
115 | List<Tuple2<String, Session>> results =
116 | new ArrayList<Tuple2<String, Session>>();
117 |
118 | // Copy the iterator to an array.
119 | ArrayList<Long> tsarray = new ArrayList<Long>();
120 | for (Long val : tup._2())
121 | tsarray.add(val);
122 |
123 | // Sort the timestamps.
124 | Collections.sort(tsarray);
125 |
126 | // Scan the array looking for session boundaries.
127 | long t0 = 0;
128 | long session_start = 0;
129 | long session_end = 0;
130 | int session_hits = 0;
131 | for (Long tstamp: tsarray) {
132 | long tt = tstamp;
133 |
134 | // How long since the prior hit?
135 | long delta = tt - t0;
136 |
137 | // Is this a new session?
138 | if (delta > SESSION_GAP_MSEC) {
139 |
140 | // Is there a prior session?
141 | if (session_start != 0)
142 | collect_session(tup._1(), session_start, session_end,
143 | session_hits, results);
144 |
145 | // Reset for the new session.
146 | session_start = tt;
147 | session_hits = 0;
148 | }
149 |
150 | // Extend the current session.
151 | session_hits += 1;
152 | session_end = tt;
153 |
154 | // On to the next hit ...
155 | t0 = tt;
156 | }
157 |
158 | // Write out the last session.
159 | if (session_start != 0)
160 | collect_session(tup._1(), session_start, session_end,
161 | session_hits, results);
162 |
163 | return results.iterator();
164 | }
165 |
166 | private void collect_session(long userid, long start,
167 | long end, int nhits,
168 | List<Tuple2<String, Session>> results) {
169 |
170 | try {
171 | // Generate a sessionid from the hash of the userid and start.
172 | MessageDigest md = MessageDigest.getInstance("SHA-256");
173 | md.update(ByteBuffer.allocate(8).putLong(userid).array());
174 | md.update(ByteBuffer.allocate(8).putLong(start).array());
175 | String sessid = Hex.encodeHexString(md.digest()).substring(0,16);
176 |
177 | Session session = new Session(userid, start, end, nhits);
178 |
179 | results.add(new Tuple2<String, Session>(sessid, session));
180 | }
181 | catch (NoSuchAlgorithmException ex) {
182 | throw new RuntimeException(ex);
183 | }
184 | }
185 | }
186 |
187 | public static class SessionOutputFormat
188 | extends AerospikeOutputFormat<String, Session> {
189 |
190 | public static class SessionRecordWriter
191 | extends AerospikeRecordWriter<String, Session> {
192 |
193 | public SessionRecordWriter(Configuration cfg,
194 | Progressable progressable) {
195 | super(cfg);
196 | }
197 |
198 | @Override
199 | public void writeAerospike(String sessid,
200 | Session session,
201 | AerospikeClient client,
202 | WritePolicy writePolicy,
203 | String namespace,
204 | String setName) throws IOException {
205 | Key kk = new Key(namespace, setName, sessid.toString());
206 | Bin bin0 = new Bin("userid", session.userid);
207 | Bin bin1 = new Bin("start", session.start);
208 | Bin bin2 = new Bin("end", session.end);
209 | Bin bin3 = new Bin("nhits", session.nhits);
210 | client.put(writePolicy, kk, bin0, bin1, bin2, bin3);
211 | }
212 | }
213 |
214 | public RecordWriter<String, Session>
215 | getAerospikeRecordWriter(Configuration conf, Progressable prog) {
216 | return new SessionRecordWriter(conf, prog);
217 | }
218 | }
219 |
220 | public static void main(String[] args) {
221 | com.aerospike.client.Log.setCallback(new AerospikeLogger());
222 | com.aerospike.client.Log.setLevel(com.aerospike.client.Log.Level.DEBUG);
223 |
224 | SparkConf conf = new SparkConf()
225 | .setAppName(appName)
226 | .set("spark.executor.memory", "2g")
227 | .setMaster(master);
228 | JavaSparkContext sc = new JavaSparkContext(conf);
229 | sc.addJar("build/libs/spark_session_rollup.jar");
230 |
231 | JavaRDD<String> entries = sc.textFile("hdfs://localhost:54310/tmp/input");
232 |
233 | JavaPairRDD<Long, Iterable<Long>> userhits =
234 | entries.mapToPair(new ExtractHits()).groupByKey();
235 |
236 | JavaPairRDD<String, Session> sessions =
237 | userhits.flatMapToPair(new FindSessions());
238 |
239 | System.err.println(sessions.count());
240 |
241 | JobConf job = new JobConf();
242 | job.setOutputKeyClass(String.class);
243 | job.setOutputValueClass(Session.class);
244 | job.setOutputFormat(SessionOutputFormat.class);
245 |
246 | AerospikeConfigUtil.setOutputHost(job, "localhost");
247 | AerospikeConfigUtil.setOutputPort(job, 3000);
248 | AerospikeConfigUtil.setOutputNamespace(job, "test");
249 | AerospikeConfigUtil.setOutputSetName(job, "sessions3");
250 |
251 | sessions.saveAsHadoopDataset(job);
252 | }
253 | }
254 |
255 | // Local Variables:
256 | // mode: java
257 | // c-basic-offset: 4
258 | // tab-width: 4
259 | // indent-tabs-mode: nil
260 | // End:
261 | // vim: softtabstop=4:shiftwidth=4:expandtab
262 |
--------------------------------------------------------------------------------
/examples/session_rollup/src/main/java/com/aerospike/hadoop/examples/sessionrollup/SessionRollup.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.examples.sessionrollup;
20 |
21 | import java.io.DataInput;
22 | import java.io.DataOutput;
23 | import java.io.IOException;
24 | import java.nio.ByteBuffer;
25 | import java.security.MessageDigest;
26 | import java.security.NoSuchAlgorithmException;
27 | import java.text.ParsePosition;
28 | import java.text.SimpleDateFormat;
29 | import java.util.ArrayList;
30 | import java.util.Collections;
31 | import java.util.Date;
32 | import java.util.Iterator;
33 | import java.util.regex.Matcher;
34 | import java.util.regex.Pattern;
35 |
36 | import org.apache.commons.codec.binary.Hex;
37 | import org.apache.commons.logging.Log;
38 | import org.apache.commons.logging.LogFactory;
39 | import org.apache.hadoop.conf.Configuration;
40 | import org.apache.hadoop.conf.Configured;
41 | import org.apache.hadoop.fs.Path;
42 | import org.apache.hadoop.io.LongWritable;
43 | import org.apache.hadoop.io.Text;
44 | import org.apache.hadoop.io.Writable;
45 | import org.apache.hadoop.mapred.FileInputFormat;
46 | import org.apache.hadoop.mapred.JobClient;
47 | import org.apache.hadoop.mapred.JobConf;
48 | import org.apache.hadoop.mapred.MapReduceBase;
49 | import org.apache.hadoop.mapred.Mapper;
50 | import org.apache.hadoop.mapred.OutputCollector;
51 | import org.apache.hadoop.mapred.RecordWriter;
52 | import org.apache.hadoop.mapred.Reducer;
53 | import org.apache.hadoop.mapred.Reporter;
54 | import org.apache.hadoop.util.Progressable;
55 | import org.apache.hadoop.util.Tool;
56 | import org.apache.hadoop.util.ToolRunner;
57 |
58 | import com.aerospike.client.AerospikeClient;
59 | import com.aerospike.client.Bin;
60 | import com.aerospike.client.Key;
61 | import com.aerospike.client.policy.WritePolicy;
62 | import com.aerospike.hadoop.mapreduce.AerospikeOutputFormat;
63 | import com.aerospike.hadoop.mapreduce.AerospikeRecordWriter;
64 |
65 | public class SessionRollup extends Configured implements Tool {
66 |
67 | private static final Log log = LogFactory.getLog(SessionRollup.class);
68 |
69 | private static final long SESSION_GAP_MSEC = 20 * 60 * 1000;
70 |
71 | // Sample line format:
72 | // 37518 - - [16/Jun/1998:02:48:36 +0000] \
73 | // "GET /images/hm_hola.gif HTTP/1.0" 200 2240
74 |
75 | private static final String logEntryRegex = "^([\\w.-]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\S+)";
76 | private static final Pattern pat = Pattern.compile(logEntryRegex);
77 |
78 | private static final SimpleDateFormat dateTimeParser =
79 | new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z");
80 |
81 | public static class Map extends MapReduceBase implements
82 | Mapper<LongWritable, Text, LongWritable, LongWritable> {
83 |
84 | int mapcount = 0;
85 |
86 | public void map(LongWritable key,
87 | Text rec,
88 | OutputCollector<LongWritable, LongWritable> output,
89 | Reporter reporter) throws IOException {
90 | try {
91 | String line = rec.toString();
92 | Matcher matcher = pat.matcher(line);
93 | if (!matcher.matches() || 7 != matcher.groupCount()) {
94 | throw new RuntimeException("match failed on: " + line);
95 | }
96 | long userid = 10001;
97 | try { userid = Long.parseLong(matcher.group(7)); } catch (Exception e) { /* keep the default userid */ }
98 | String tstamp = matcher.group(4);
99 | ParsePosition pos = new ParsePosition(0);
100 | Date date = dateTimeParser.parse(tstamp, pos);
101 | long msec = date.getTime();
102 | output.collect(new LongWritable(userid), new LongWritable(msec));
103 | }
104 | catch (Exception ex) {
105 | // log.error("exception in map: " + ex);
106 | }
107 | }
108 | }
109 |
110 | private static class Session implements Writable {
111 | public long userid;
112 | public long start;
113 | public long end;
114 | public int nhits;
115 |
116 | public Session(long userid, long start, long end, int nhits) {
117 | this.userid = userid;
118 | this.start = start;
119 | this.end = end;
120 | this.nhits = nhits;
121 | }
122 |
123 | public void readFields(DataInput in) throws IOException {
124 | userid = in.readLong();
125 | start = in.readLong();
126 | end = in.readLong();
127 | nhits = in.readInt();
128 | }
129 |
130 | public void write(DataOutput out) throws IOException {
131 | out.writeLong(userid);
132 | out.writeLong(start);
133 | out.writeLong(end);
134 | out.writeInt(nhits);
135 | }
136 | }
137 |
138 | public static class Reduce
139 | extends MapReduceBase
140 | implements Reducer<LongWritable, LongWritable, Text, Session> {
141 |
142 | public void reduce(LongWritable userid,
143 | Iterator<LongWritable> tstamps,
144 | OutputCollector<Text, Session> output,
145 | Reporter reporter
146 | ) throws IOException {
147 |
148 | // Copy the iterator to an array.
149 | ArrayList<LongWritable> tsarray = new ArrayList<LongWritable>();
150 | while (tstamps.hasNext())
151 | tsarray.add(new LongWritable(tstamps.next().get()));
152 |
153 | // Sort the timestamps.
154 | Collections.sort(tsarray);
155 |
156 | // Scan the array looking for session boundaries.
157 | long t0 = 0;
158 | long session_start = 0;
159 | long session_end = 0;
160 | int session_hits = 0;
161 | for (LongWritable tstamp: tsarray) {
162 | long tt = tstamp.get();
163 |
164 | // How long since the prior hit?
165 | long delta = tt - t0;
166 |
167 | // Is this a new session?
168 | if (delta > SESSION_GAP_MSEC) {
169 |
170 | // Is there a prior session?
171 | if (session_start != 0)
172 | collect_session(userid.get(), session_start, session_end,
173 | session_hits, output);
174 |
175 | // Reset for the new session.
176 | session_start = tt;
177 | session_hits = 0;
178 | }
179 |
180 | // Extend the current session.
181 | session_hits += 1;
182 | session_end = tt;
183 |
184 | // On to the next hit ...
185 | t0 = tt;
186 | }
187 |
188 | // Write out the last session.
189 | if (session_start != 0)
190 | collect_session(userid.get(), session_start, session_end,
191 | session_hits, output);
192 | }
193 |
194 | private void collect_session(long userid, long start,
195 | long end, int nhits,
196 | OutputCollector<Text, Session> output)
197 | throws IOException {
198 |
199 | try {
200 | // Generate a sessionid from the hash of the userid and start.
201 | MessageDigest md = MessageDigest.getInstance("SHA-256");
202 | md.update(ByteBuffer.allocate(8).putLong(userid).array());
203 | md.update(ByteBuffer.allocate(8).putLong(start).array());
204 | String sessid = Hex.encodeHexString(md.digest()).substring(0,16);
205 |
206 | Session session = new Session(userid, start, end, nhits);
207 | output.collect(new Text(sessid), session);
208 | }
209 | catch (NoSuchAlgorithmException ex) {
210 | throw new RuntimeException(ex);
211 | }
212 | }
213 | }
214 |
215 | public static class SessionOutputFormat
216 | extends AerospikeOutputFormat<Text, Session> {
217 |
218 | public static class SessionRecordWriter
219 | extends AerospikeRecordWriter<Text, Session> {
220 |
221 | public SessionRecordWriter(Configuration cfg,
222 | Progressable progressable) {
223 | super(cfg);
224 | }
225 |
226 | @Override
227 | public void writeAerospike(Text sessid,
228 | Session session,
229 | AerospikeClient client,
230 | WritePolicy writePolicy,
231 | String namespace,
232 | String setName) throws IOException {
233 | Key kk = new Key(namespace, setName, sessid.toString());
234 | Bin bin0 = new Bin("userid", session.userid);
235 | Bin bin1 = new Bin("start", session.start);
236 | Bin bin2 = new Bin("end", session.end);
237 | Bin bin3 = new Bin("nhits", session.nhits);
238 | client.put(writePolicy, kk, bin0, bin1, bin2, bin3);
239 | }
240 | }
241 |
242 | public RecordWriter<Text, Session>
243 | getAerospikeRecordWriter(Configuration conf, Progressable prog) {
244 | return new SessionRecordWriter(conf, prog);
245 | }
246 | }
247 |
248 | public int run(final String[] args) throws Exception {
249 |
250 | log.info("run starting");
251 |
252 | final Configuration conf = getConf();
253 |
254 | JobConf job = new JobConf(conf, SessionRollup.class);
255 | job.setJobName("AerospikeSessionRollup");
256 |
257 | job.setMapperClass(Map.class);
258 | job.setMapOutputKeyClass(LongWritable.class);
259 | job.setMapOutputValueClass(LongWritable.class);
260 | // job.setCombinerClass(Reduce.class); // Reduce changes format.
261 | job.setReducerClass(Reduce.class);
262 | job.setOutputKeyClass(Text.class);
263 | job.setOutputValueClass(Session.class);
264 |
265 | job.setOutputFormat(SessionOutputFormat.class);
266 |
267 | for (int ii = 0; ii < args.length; ++ii)
268 | FileInputFormat.addInputPath(job, new Path(args[ii]));
269 |
270 | JobClient.runJob(job);
271 |
272 | log.info("finished");
273 | return 0;
274 | }
275 |
276 | public static void main(final String[] args) throws Exception {
277 | System.exit(ToolRunner.run(new SessionRollup(), args));
278 | }
279 | }
280 |
281 | // Local Variables:
282 | // mode: java
283 | // c-basic-offset: 4
284 | // tab-width: 4
285 | // indent-tabs-mode: nil
286 | // End:
287 | // vim: softtabstop=4:shiftwidth=4:expandtab
288 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeConfigUtil.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.mapreduce;
20 |
21 | import org.apache.commons.logging.Log;
22 | import org.apache.commons.logging.LogFactory;
23 |
24 | import org.apache.hadoop.conf.Configuration;
25 |
26 | public class AerospikeConfigUtil {
27 | private static final Log log = LogFactory.getLog(AerospikeConfigUtil.class);
28 |
29 | // ---------------- INPUT ----------------
30 |
31 | public static final int DEFAULT_INPUT_PORT = 3000;
32 | public static final long INVALID_LONG = 762492121482318889L;
33 | public static final int DEFAULT_INPUT_SCAN_PERCENT = 100;
34 | // ---------------- OUTPUT ----------------
35 |
36 | public static final int DEFAULT_OUTPUT_PORT = 3000;
37 |
38 | // ---------------- INPUT ----------------
39 |
40 | public static void setInputHost(Configuration conf, String host) {
41 | log.info("setting " + AerospikeConfigEnum.INPUT_HOST.value + " to " + host);
42 | conf.set(AerospikeConfigEnum.INPUT_HOST.value, host);
43 | }
44 |
45 | public static String getInputHost(Configuration conf) {
46 | String host = conf.get(AerospikeConfigEnum.INPUT_HOST.value, AerospikeConfigEnum.DEFAULT_INPUT_HOST.value);
47 | log.info("using " + AerospikeConfigEnum.INPUT_HOST.value + " = " + host);
48 | return host;
49 | }
50 |
51 | public static void setInputPort(Configuration conf, int port) {
52 | log.info("setting " + AerospikeConfigEnum.INPUT_PORT.value + " to " + port);
53 | conf.setInt(AerospikeConfigEnum.INPUT_PORT.value, port);
54 | }
55 |
56 | public static int getInputPort(Configuration conf) {
57 | int port = conf.getInt(AerospikeConfigEnum.INPUT_PORT.value, DEFAULT_INPUT_PORT);
58 | log.info("using " + AerospikeConfigEnum.INPUT_PORT.value + " = " + port);
59 | return port;
60 | }
61 |
62 | public static void setInputNamespace(Configuration conf, String namespace) {
63 | log.info("setting " + AerospikeConfigEnum.INPUT_NAMESPACE.value + " to " + namespace);
64 | conf.set(AerospikeConfigEnum.INPUT_NAMESPACE.value, namespace);
65 | }
66 |
67 | public static String getInputNamespace(Configuration conf) {
68 | String namespace = conf.get(AerospikeConfigEnum.INPUT_NAMESPACE.value);
69 | if (namespace == null)
70 | throw new UnsupportedOperationException
71 | ("you must set the input namespace");
72 | log.info("using " + AerospikeConfigEnum.INPUT_NAMESPACE.value + " = " + namespace);
73 | return namespace;
74 | }
75 |
76 | public static void setInputSetName(Configuration conf, String setname) {
77 | log.info("setting " + AerospikeConfigEnum.INPUT_SETNAME.value + " to " + setname);
78 | conf.set(AerospikeConfigEnum.INPUT_SETNAME.value, setname);
79 | }
80 |
81 | public static String getInputSetName(Configuration conf) {
82 | String setname = conf.get(AerospikeConfigEnum.INPUT_SETNAME.value);
83 | log.info("using " + AerospikeConfigEnum.INPUT_SETNAME.value + " = " + setname);
84 | return setname;
85 | }
86 |
87 | public static void setInputBinNames(Configuration conf, String bins) {
88 | log.info("setting " + AerospikeConfigEnum.INPUT_BINNAMES.value + " to " + bins);
89 | conf.set(AerospikeConfigEnum.INPUT_BINNAMES.value, bins);
90 | }
91 |
92 | public static String[] getInputBinNames(Configuration conf) {
93 | String bins = conf.get(AerospikeConfigEnum.INPUT_BINNAMES.value);
94 | log.info("using " + AerospikeConfigEnum.INPUT_BINNAMES.value + " = " + bins);
95 | if (bins == null || bins.equals(""))
96 | return null;
97 | else
98 | return bins.split(",");
99 | }
100 |
101 | public static void setInputOperation(Configuration conf, String operation) {
102 | if (!operation.equals("scan") &&
103 | !operation.equals("numrange"))
104 | throw new UnsupportedOperationException
105 | ("input operation must be 'scan' or 'numrange'");
106 | log.info("setting " + AerospikeConfigEnum.INPUT_OPERATION.value + " to " + operation);
107 | conf.set(AerospikeConfigEnum.INPUT_OPERATION.value, operation);
108 | }
109 |
110 | public static String getInputOperation(Configuration conf) {
111 | String operation = conf.get(AerospikeConfigEnum.INPUT_OPERATION.value, AerospikeConfigEnum.DEFAULT_INPUT_OPERATION.value);
112 | if (!operation.equals("scan") &&
113 | !operation.equals("numrange"))
114 | throw new UnsupportedOperationException
115 | ("input operation must be 'scan' or 'numrange'");
116 | log.info("using " + AerospikeConfigEnum.INPUT_OPERATION.value + " = " + operation);
117 | return operation;
118 | }
119 |
120 | public static void setInputNumRangeBin(Configuration conf, String binname) {
121 | log.info("setting " + AerospikeConfigEnum.INPUT_NUMRANGE_BIN.value + " to " + binname);
122 | conf.set(AerospikeConfigEnum.INPUT_NUMRANGE_BIN.value, binname);
123 | }
124 |
125 | public static String getInputNumRangeBin(Configuration conf) {
126 | String binname = conf.get(AerospikeConfigEnum.INPUT_NUMRANGE_BIN.value);
127 | log.info("using " + AerospikeConfigEnum.INPUT_NUMRANGE_BIN.value + " = " + binname);
128 | return binname;
129 | }
130 |
131 | public static void setInputNumRangeBegin(Configuration conf, long begin) {
132 | log.info("setting " + AerospikeConfigEnum.INPUT_NUMRANGE_BEGIN.value + " to " + begin);
133 | conf.setLong(AerospikeConfigEnum.INPUT_NUMRANGE_BEGIN.value, begin);
134 | }
135 |
136 | public static long getInputNumRangeBegin(Configuration conf) {
137 | long begin = conf.getLong(AerospikeConfigEnum.INPUT_NUMRANGE_BEGIN.value, INVALID_LONG);
138 | if (begin == INVALID_LONG && getInputOperation(conf).equals("numrange"))
139 | throw new UnsupportedOperationException
140 | ("missing input numrange begin");
141 | log.info("using " + AerospikeConfigEnum.INPUT_NUMRANGE_BEGIN.value + " = " + begin);
142 | return begin;
143 | }
144 |
145 | public static void setInputNumRangeEnd(Configuration conf, long end) {
146 | log.info("setting " + AerospikeConfigEnum.INPUT_NUMRANGE_END.value + " to " + end);
147 | conf.setLong(AerospikeConfigEnum.INPUT_NUMRANGE_END.value, end);
148 | }
149 |
150 | public static long getInputNumRangeEnd(Configuration conf) {
151 | long end = conf.getLong(AerospikeConfigEnum.INPUT_NUMRANGE_END.value, INVALID_LONG);
152 | if (end == INVALID_LONG && getInputOperation(conf).equals("numrange"))
153 | throw new UnsupportedOperationException
154 | ("missing input numrange end");
155 | log.info("using " + AerospikeConfigEnum.INPUT_NUMRANGE_END.value + " = " + end);
156 | return end;
157 | }
158 |
159 | public static int getInputScanPercent(Configuration conf) {
160 | int scanPercent = conf.getInt(AerospikeConfigEnum.INPUT_SCAN_PERCENT.value, DEFAULT_INPUT_SCAN_PERCENT);
161 | if (scanPercent <= 0)
162 | throw new UnsupportedOperationException
163 | ("scan percent is less than 1%");
164 | log.info("using " + AerospikeConfigEnum.INPUT_SCAN_PERCENT.value + " = " + scanPercent + "%");
165 | return scanPercent;
166 | }
167 | // ---------------- OUTPUT ----------------
168 |
169 | public static void setOutputHost(Configuration conf, String host) {
170 | log.info("setting " + AerospikeConfigEnum.OUTPUT_HOST.value + " to " + host);
171 | conf.set(AerospikeConfigEnum.OUTPUT_HOST.value, host);
172 | }
173 |
174 | public static String getOutputHost(Configuration conf) {
175 | String host = conf.get(AerospikeConfigEnum.OUTPUT_HOST.value, AerospikeConfigEnum.DEFAULT_OUTPUT_HOST.value);
176 | log.info("using " + AerospikeConfigEnum.OUTPUT_HOST.value + " = " + host);
177 | return host;
178 | }
179 |
180 | public static void setOutputPort(Configuration conf, int port) {
181 | log.info("setting " + AerospikeConfigEnum.OUTPUT_PORT.value + " to " + port);
182 | conf.setInt(AerospikeConfigEnum.OUTPUT_PORT.value, port);
183 | }
184 |
185 | public static int getOutputPort(Configuration conf) {
186 | int port = conf.getInt(AerospikeConfigEnum.OUTPUT_PORT.value, DEFAULT_OUTPUT_PORT);
187 | log.info("using " + AerospikeConfigEnum.OUTPUT_PORT.value + " = " + port);
188 | return port;
189 | }
190 |
191 | public static void setOutputNamespace(Configuration conf, String namespace) {
192 | log.info("setting " + AerospikeConfigEnum.OUTPUT_NAMESPACE.value + " to " + namespace);
193 | conf.set(AerospikeConfigEnum.OUTPUT_NAMESPACE.value, namespace);
194 | }
195 |
196 | public static String getOutputNamespace(Configuration conf) {
197 | String namespace = conf.get(AerospikeConfigEnum.OUTPUT_NAMESPACE.value);
198 | if (namespace == null)
199 | throw new UnsupportedOperationException
200 | ("you must set the output namespace");
201 | log.info("using " + AerospikeConfigEnum.OUTPUT_NAMESPACE.value + " = " + namespace);
202 | return namespace;
203 | }
204 |
205 | public static void setOutputSetName(Configuration conf, String setname) {
206 | log.info("setting " + AerospikeConfigEnum.OUTPUT_SETNAME.value + " to " + setname);
207 | conf.set(AerospikeConfigEnum.OUTPUT_SETNAME.value, setname);
208 | }
209 |
210 | public static String getOutputSetName(Configuration conf) {
211 | String setname = conf.get(AerospikeConfigEnum.OUTPUT_SETNAME.value);
212 | log.info("using " + AerospikeConfigEnum.OUTPUT_SETNAME.value + " = " + setname);
213 | return setname;
214 | }
215 |
216 | public static void setOutputBinName(Configuration conf, String binname) {
217 | log.info("setting " + AerospikeConfigEnum.OUTPUT_BINNAME.value + " to " + binname);
218 | conf.set(AerospikeConfigEnum.OUTPUT_BINNAME.value, binname);
219 | }
220 |
221 | public static String getOutputBinName(Configuration conf) {
222 | String binname = conf.get(AerospikeConfigEnum.OUTPUT_BINNAME.value);
223 | log.info("using " + AerospikeConfigEnum.OUTPUT_BINNAME.value + " = " + binname);
224 | return binname;
225 | }
226 |
227 | public static void setOutputKeyName(Configuration conf, String keyname) {
228 | log.info("setting " + AerospikeConfigEnum.OUTPUT_KEYNAME.value + " to " + keyname);
229 | conf.set(AerospikeConfigEnum.OUTPUT_KEYNAME.value, keyname);
230 | }
231 |
232 | public static String getOutputKeyName(Configuration conf) {
233 | String keyname = conf.get(AerospikeConfigEnum.OUTPUT_KEYNAME.value);
234 | log.info("using " + AerospikeConfigEnum.OUTPUT_KEYNAME.value + " = " + keyname);
235 | return keyname;
236 | }
237 |
238 | // ---------------- COMMON ----------------
239 |
240 | public static org.apache.hadoop.mapred.JobConf asJobConf(Configuration cfg) {
241 | return cfg instanceof org.apache.hadoop.mapred.JobConf
242 | ? (org.apache.hadoop.mapred.JobConf) cfg
243 | : new org.apache.hadoop.mapred.JobConf(cfg);
244 | }
245 | }
246 |
247 | // Local Variables:
248 | // mode: java
249 | // c-basic-offset: 4
250 | // tab-width: 4
251 | // indent-tabs-mode: nil
252 | // End:
253 | // vim: softtabstop=4:shiftwidth=4:expandtab
254 |
--------------------------------------------------------------------------------
/examples/external_join/src/main/java/com/aerospike/hadoop/examples/externaljoin/ExternalJoin.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.examples.externaljoin;
20 |
21 | import java.io.DataInput;
22 | import java.io.DataOutput;
23 | import java.io.IOException;
24 | import java.nio.ByteBuffer;
25 | import java.security.MessageDigest;
26 | import java.security.NoSuchAlgorithmException;
27 | import java.text.ParsePosition;
28 | import java.text.SimpleDateFormat;
29 | import java.util.ArrayList;
30 | import java.util.Collections;
31 | import java.util.Date;
32 | import java.util.Iterator;
33 | import java.util.regex.Matcher;
34 | import java.util.regex.Pattern;
35 |
36 | import org.apache.commons.codec.binary.Hex;
37 | import org.apache.commons.logging.Log;
38 | import org.apache.commons.logging.LogFactory;
39 | import org.apache.hadoop.conf.Configuration;
40 | import org.apache.hadoop.conf.Configured;
41 | import org.apache.hadoop.fs.Path;
42 | import org.apache.hadoop.io.LongWritable;
43 | import org.apache.hadoop.io.Text;
44 | import org.apache.hadoop.io.Writable;
45 | import org.apache.hadoop.mapred.FileInputFormat;
46 | import org.apache.hadoop.mapred.JobClient;
47 | import org.apache.hadoop.mapred.JobConf;
48 | import org.apache.hadoop.mapred.MapReduceBase;
49 | import org.apache.hadoop.mapred.Mapper;
50 | import org.apache.hadoop.mapred.OutputCollector;
51 | import org.apache.hadoop.mapred.RecordWriter;
52 | import org.apache.hadoop.mapred.Reducer;
53 | import org.apache.hadoop.mapred.Reporter;
54 | import org.apache.hadoop.util.Progressable;
55 | import org.apache.hadoop.util.Tool;
56 | import org.apache.hadoop.util.ToolRunner;
57 |
58 | import com.aerospike.client.AerospikeClient;
59 | import com.aerospike.client.Bin;
60 | import com.aerospike.client.Key;
61 | import com.aerospike.client.Record;
62 | import com.aerospike.client.policy.Policy;
63 | import com.aerospike.client.policy.WritePolicy;
64 | import com.aerospike.hadoop.mapreduce.AerospikeConfigUtil;
65 | import com.aerospike.hadoop.mapreduce.AerospikeOutputFormat;
66 | import com.aerospike.hadoop.mapreduce.AerospikeRecordWriter;
67 |
68 | public class ExternalJoin extends Configured implements Tool {
69 |
70 | private static final Log log = LogFactory.getLog(ExternalJoin.class);
71 |
72 | private static final long SESSION_GAP_MSEC = 20 * 60 * 1000;
73 |
74 | // Sample line format:
75 | // 37518 - - [16/Jun/1998:02:48:36 +0000] \
76 | // "GET /images/hm_hola.gif HTTP/1.0" 200 2240
77 |
78 | private static final String logEntryRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\S+)";
79 | private static final Pattern pat = Pattern.compile(logEntryRegex);
80 |
81 | private static final SimpleDateFormat dateTimeParser =
82 | new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z");
83 |
84 | public static class Map extends MapReduceBase implements
85 | Mapper<LongWritable, Text, LongWritable, LongWritable> {
86 |
87 | int mapcount = 0;
88 |
89 | public void map(LongWritable key,
90 | Text rec,
91 | OutputCollector<LongWritable, LongWritable> output,
92 | Reporter reporter) throws IOException {
93 | try {
94 | String line = rec.toString();
95 | Matcher matcher = pat.matcher(line);
96 | if (!matcher.matches() || 7 != matcher.groupCount()) {
97 | throw new RuntimeException("match failed on: " + line);
98 | }
99 | long userid = Long.parseLong(matcher.group(1));
100 | String tstamp = matcher.group(4);
101 | ParsePosition pos = new ParsePosition(0);
102 | Date date = dateTimeParser.parse(tstamp, pos);
103 | long msec = date.getTime();
104 | output.collect(new LongWritable(userid), new LongWritable(msec));
105 | }
106 | catch (Exception ex) {
107 | // log.error("exception in map: " + ex);
108 | }
109 | }
110 | }
111 |
112 | private static class Session implements Writable {
113 | public long userid;
114 | public long start;
115 | public long end;
116 | public int nhits;
117 | public int age;
118 | public int isMale;
119 |
120 | public Session(long userid, long start, long end, int nhits, int age, int isMale) {
121 | this.userid = userid;
122 | this.start = start;
123 | this.end = end;
124 | this.nhits = nhits;
125 | this.age = age;
126 | this.isMale = isMale;
127 | }
128 |
129 | public void readFields(DataInput in) throws IOException {
130 | userid = in.readLong();
131 | start = in.readLong();
132 | end = in.readLong();
133 | nhits = in.readInt();
134 | age = in.readInt();
135 | isMale = in.readInt();
136 | }
137 |
138 | public void write(DataOutput out) throws IOException {
139 | out.writeLong(userid);
140 | out.writeLong(start);
141 | out.writeLong(end);
142 | out.writeInt(nhits);
143 | out.writeInt(age);
144 | out.writeInt(isMale);
145 | }
146 | }
147 |
148 | public static class Reduce
149 | extends MapReduceBase
150 | implements Reducer<LongWritable, LongWritable, Text, Session> {
151 |
152 | private Policy policy;
153 | private AerospikeClient client;
154 | private String namespace;
155 | private String setName;
156 |
157 | @Override
158 | public void configure(JobConf job) {
159 | String host = AerospikeConfigUtil.getInputHost(job);
160 | int port = AerospikeConfigUtil.getInputPort(job);
161 |
162 | policy = new Policy();
163 | policy.totalTimeout = 10000;
164 | client = new AerospikeClient(host, port);
165 |
166 | namespace = AerospikeConfigUtil.getInputNamespace(job);
167 | setName = AerospikeConfigUtil.getInputSetName(job);
168 | }
169 |
170 | @Override
171 | public void close() {
172 | client.close();
173 | }
174 |
175 | public void reduce(LongWritable userid,
176 | Iterator<LongWritable> tstamps,
177 | OutputCollector<Text, Session> output,
178 | Reporter reporter
179 | ) throws IOException {
180 |
181 | // Copy the iterator to an array.
182 | ArrayList<LongWritable> tsarray = new ArrayList<LongWritable>();
183 | while (tstamps.hasNext())
184 | tsarray.add(new LongWritable(tstamps.next().get()));
185 |
186 | // Sort the timestamps.
187 | Collections.sort(tsarray);
188 |
189 | // Scan the array looking for session boundaries.
190 | long t0 = 0;
191 | long session_start = 0;
192 | long session_end = 0;
193 | int session_hits = 0;
194 | for (LongWritable tstamp: tsarray) {
195 | long tt = tstamp.get();
196 |
197 | // How long since the prior hit?
198 | long delta = tt - t0;
199 |
200 | // Is this a new session?
201 | if (delta > SESSION_GAP_MSEC) {
202 |
203 | // Is there a prior session?
204 | if (session_start != 0)
205 | collect_session(userid.get(), session_start, session_end,
206 | session_hits, output);
207 |
208 | // Reset for the new session.
209 | session_start = tt;
210 | session_hits = 0;
211 | }
212 |
213 | // Extend the current session.
214 | session_hits += 1;
215 | session_end = tt;
216 |
217 | // On to the next hit ...
218 | t0 = tt;
219 | }
220 |
221 | // Write out the last session.
222 | if (session_start != 0)
223 | collect_session(userid.get(), session_start, session_end,
224 | session_hits, output);
225 | }
226 |
227 | private void collect_session(long userid, long start,
228 | long end, int nhits,
229 | OutputCollector<Text, Session> output)
230 | throws IOException {
231 |
232 | Key kk = new Key(namespace, setName, userid);
233 | Record rec = client.get(policy, kk);
234 |
235 | int age = (Integer) rec.bins.get("age");
236 | int isMale = (Integer) rec.bins.get("isMale");
237 |
238 | try {
239 | // Generate a sessionid from the hash of the userid and start.
240 | MessageDigest md = MessageDigest.getInstance("SHA-256");
241 | md.update(ByteBuffer.allocate(8).putLong(userid).array());
242 | md.update(ByteBuffer.allocate(8).putLong(start).array());
243 | String sessid = Hex.encodeHexString(md.digest()).substring(0,16);
244 |
245 | Session session =
246 | new Session(userid, start, end, nhits, age, isMale);
247 |
248 | output.collect(new Text(sessid), session);
249 | }
250 | catch (NoSuchAlgorithmException ex) {
251 | throw new RuntimeException(ex);
252 | }
253 | }
254 | }
255 |
256 | public static class SessionOutputFormat
257 | extends AerospikeOutputFormat<Text, Session> {
258 |
259 | public static class SessionRecordWriter
260 | extends AerospikeRecordWriter<Text, Session> {
261 |
262 | public SessionRecordWriter(Configuration cfg,
263 | Progressable progressable) {
264 | super(cfg);
265 | }
266 |
267 | @Override
268 | public void writeAerospike(Text sessid,
269 | Session session,
270 | AerospikeClient client,
271 | WritePolicy writePolicy,
272 | String namespace,
273 | String setName) throws IOException {
274 | writePolicy.totalTimeout = 10000;
275 | Key kk = new Key(namespace, setName, sessid.toString());
276 | Bin bin0 = new Bin("userid", session.userid);
277 | Bin bin1 = new Bin("start", session.start);
278 | Bin bin2 = new Bin("end", session.end);
279 | Bin bin3 = new Bin("nhits", session.nhits);
280 | Bin bin4 = new Bin("age", session.age);
281 | Bin bin5 = new Bin("isMale", session.isMale);
282 | client.put(writePolicy, kk, bin0, bin1, bin2, bin3, bin4, bin5);
283 | }
284 | }
285 |
286 | public RecordWriter<Text, Session>
287 | getAerospikeRecordWriter(Configuration conf, Progressable prog) {
288 | return new SessionRecordWriter(conf, prog);
289 | }
290 | }
291 |
292 | public int run(final String[] args) throws Exception {
293 |
294 | log.info("run starting");
295 |
296 | final Configuration conf = getConf();
297 |
298 | JobConf job = new JobConf(conf, ExternalJoin.class);
299 | job.setJobName("AerospikeExternalJoin");
300 |
301 | job.setMapperClass(Map.class);
302 | job.setMapOutputKeyClass(LongWritable.class);
303 | job.setMapOutputValueClass(LongWritable.class);
304 | // job.setCombinerClass(Reduce.class); // Reduce changes format.
305 | job.setReducerClass(Reduce.class);
306 | job.setOutputKeyClass(Text.class);
307 | job.setOutputValueClass(Session.class);
308 |
309 | job.setOutputFormat(SessionOutputFormat.class);
310 |
311 | for (int ii = 0; ii < args.length; ++ii)
312 | FileInputFormat.addInputPath(job, new Path(args[ii]));
313 |
314 | JobClient.runJob(job);
315 |
316 | log.info("finished");
317 | return 0;
318 | }
319 |
320 | public static void main(final String[] args) throws Exception {
321 | System.exit(ToolRunner.run(new ExternalJoin(), args));
322 | }
323 | }
324 |
325 | // Local Variables:
326 | // mode: java
327 | // c-basic-offset: 4
328 | // tab-width: 4
329 | // indent-tabs-mode: nil
330 | // End:
331 | // vim: softtabstop=4:shiftwidth=4:expandtab
332 |
--------------------------------------------------------------------------------
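The core of ExternalJoin's Reduce stage is the gap-based sessionization pass above: sort a user's hit timestamps, then close the current session and open a new one whenever the gap between consecutive hits exceeds SESSION_GAP_MSEC. Below is a minimal self-contained sketch of that pass; the timestamps and the 20-minute gap are hypothetical values chosen for illustration, not taken from the example's data.

    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    public class SessionizeSketch {
        // Hypothetical gap; ExternalJoin defines its own SESSION_GAP_MSEC.
        static final long SESSION_GAP_MSEC = 20 * 60 * 1000;

        public static void main(String[] args) {
            long base = 1_400_000_000_000L; // epoch msec, like real hit logs
            // One user's hits: three close together, one after a long gap.
            List<Long> tstamps = Arrays.asList(
                base, base + 60_000, base + 90_000,
                base + 90_000 + SESSION_GAP_MSEC + 1);
            Collections.sort(tstamps);

            long t0 = 0, start = 0, end = 0;
            int hits = 0;
            for (long tt : tstamps) {
                if (tt - t0 > SESSION_GAP_MSEC) { // gap too large: new session
                    if (start != 0) // flush the prior session, if any
                        System.out.printf("session [%d..%d] hits=%d%n",
                                          start, end, hits);
                    start = tt;
                    hits = 0;
                }
                hits += 1; // this hit extends the current session
                end = tt;
                t0 = tt;
            }
            if (start != 0) // flush the session still open at end of input
                System.out.printf("session [%d..%d] hits=%d%n", start, end, hits);
        }
    }

This prints two sessions (3 hits, then 1). Two boundary conditions carry over to the reducer unchanged: epoch-scale timestamps guarantee the first hit's delta from t0 = 0 exceeds the gap, so the first session always opens, and the trailing flush emits the session left open when the iterator is exhausted.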
/mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeRecordReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Aerospike, Inc.
3 | *
4 | * Portions may be licensed to Aerospike, Inc. under one or more
5 | * contributor license agreements.
6 | *
7 | * Licensed under the Apache License, Version 2.0 (the "License"); you
8 | * may not use this file except in compliance with the License. You
9 | * may obtain a copy of the License at
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | * implied. See the License for the specific language governing
16 | * permissions and limitations under the License.
17 | */
18 |
19 | package com.aerospike.hadoop.mapreduce;
20 |
21 | import java.io.IOException;
22 | import java.util.concurrent.ArrayBlockingQueue;
23 |
24 | import org.apache.commons.logging.Log;
25 | import org.apache.commons.logging.LogFactory;
26 | import org.apache.hadoop.mapreduce.InputSplit;
27 | import org.apache.hadoop.mapreduce.RecordReader;
28 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
29 |
30 | import com.aerospike.client.AerospikeClient;
31 | import com.aerospike.client.AerospikeException;
32 | import com.aerospike.client.AerospikeException.ScanTerminated;
33 | import com.aerospike.client.Key;
34 | import com.aerospike.client.Record;
35 | import com.aerospike.client.ScanCallback;
36 | import com.aerospike.client.policy.ClientPolicy;
37 | import com.aerospike.client.policy.QueryPolicy;
38 | import com.aerospike.client.policy.ScanPolicy;
39 | import com.aerospike.client.query.Filter;
40 | import com.aerospike.client.query.RecordSet;
41 | import com.aerospike.client.query.Statement;
42 |
43 | public class AerospikeRecordReader
44 | extends RecordReader<AerospikeKey, AerospikeRecord>
45 | implements org.apache.hadoop.mapred.RecordReader<AerospikeKey,
46 | AerospikeRecord> {
47 |
48 | private class KeyRecPair {
49 | public AerospikeKey key;
50 | public AerospikeRecord rec;
51 | public KeyRecPair(AerospikeKey key, AerospikeRecord rec) {
52 | this.key = key;
53 | this.rec = rec;
54 | }
55 | }
56 |
57 | private static final Log log =
58 | LogFactory.getLog(AerospikeRecordReader.class);
59 |
60 | private ASSCanReader scanReader = null;
61 | private ASQueryReader queryReader = null;
62 |
63 | private ArrayBlockingQueue<KeyRecPair> queue =
64 | new ArrayBlockingQueue<KeyRecPair>(16 * 1024);
65 |
66 | private boolean isFinished = false;
67 | private boolean isError = false;
68 | private boolean isRunning = false;
69 | private String numrangeBin;
70 | private long numrangeBegin;
71 | private long numrangeEnd;
72 | private int scanPercent;
73 |
74 | private AerospikeKey currentKey;
75 | private AerospikeRecord currentValue;
76 |
77 | public class CallBack implements ScanCallback {
78 | public void scanCallback(Key key, Record record)
79 | throws AerospikeException {
80 | try {
81 | queue.put(new KeyRecPair(new AerospikeKey(key),
82 | new AerospikeRecord(record)));
83 | } catch (Exception ex) {
84 | throw new ScanTerminated(ex);
85 | }
86 | }
87 | }
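// Design note: the reader is a bounded producer/consumer pair. The
// ASSCanReader / ASQueryReader worker thread produces KeyRecPair items
// into the ArrayBlockingQueue (capacity 16 * 1024, so a fast scan
// blocks rather than overruns the consumer), while next() drains the
// queue on the Hadoop task thread, polling until the worker signals
// isFinished or isError.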
88 |
89 | public class ASSCanReader extends java.lang.Thread {
90 |
91 | String node;
92 | String host;
93 | int port;
94 | String namespace;
95 | String setName;
96 | String[] binNames;
97 | int scanPercent;
98 |
99 | ASSCanReader(String node, String host, int port,
100 | String ns, String setName, String[] binNames, int scanPercent) {
101 | this.node = node;
102 | this.host = host;
103 | this.port = port;
104 | this.namespace = ns;
105 | this.setName = setName;
106 | this.binNames = binNames;
107 | this.scanPercent = scanPercent;
108 | }
109 |
110 | public void run() {
111 | try {
112 | AerospikeClient client =
113 | AerospikeClientSingleton.getInstance(new ClientPolicy(),
114 | host, port);
115 |
116 | log.info(String.format("scanNode %s:%d:%s:%s",
117 | host, port, namespace, setName));
118 | ScanPolicy scanPolicy = new ScanPolicy();
119 | scanPolicy.scanPercent = scanPercent;
120 | CallBack cb = new CallBack();
121 | log.info("scan starting with scan percent: " + scanPolicy.scanPercent + "%");
122 | isRunning = true;
123 | if (binNames != null)
124 | client.scanNode(scanPolicy, node, namespace, setName,
125 | cb, binNames);
126 | else
127 | client.scanNode(scanPolicy, node, namespace, setName,
128 | cb);
129 | isFinished = true;
130 | log.info("scan finished");
131 | }
132 | catch (Exception ex) {
133 | log.error("exception in ASSCanReader.run: " + ex);
134 | isError = true;
135 | return;
136 | }
137 | }
138 | }
139 |
140 | public class ASQueryReader extends java.lang.Thread {
141 |
142 | String node;
143 | String host;
144 | int port;
145 | String namespace;
146 | String setName;
147 | String[] binNames;
148 | String numrangeBin;
149 | long numrangeBegin;
150 | long numrangeEnd;
151 |
152 | ASQueryReader(String node, String host, int port,
153 | String ns, String setName, String[] binNames,
154 | String numrangeBin, long numrangeBegin, long numrangeEnd) {
155 | this.node = node;
156 | this.host = host;
157 | this.port = port;
158 | this.namespace = ns;
159 | this.setName = setName;
160 | this.binNames = binNames;
161 | this.numrangeBin = numrangeBin;
162 | this.numrangeBegin = numrangeBegin;
163 | this.numrangeEnd = numrangeEnd;
164 | }
165 |
166 | public void run() {
167 | try {
168 | AerospikeClient client =
169 | AerospikeClientSingleton.getInstance(new ClientPolicy(),
170 | host, port);
171 | log.info(String.format("queryNode %s:%d %s:%s:%s[%d:%d]",
172 | host, port, namespace, setName,
173 | numrangeBin, numrangeBegin,
174 | numrangeEnd));
175 | Statement stmt = new Statement();
176 | stmt.setNamespace(namespace);
177 | stmt.setSetName(setName);
178 | stmt.setFilters(Filter.range(numrangeBin,
179 | numrangeBegin,
180 | numrangeEnd));
181 | if (binNames != null)
182 | stmt.setBinNames(binNames);
183 | QueryPolicy queryPolicy = new QueryPolicy();
184 | RecordSet rs = client.queryNode(queryPolicy,
185 | stmt,
186 | client.getNode(node));
187 | isRunning = true;
188 | try {
189 | log.info("query starting");
190 | while (rs.next()) {
191 | Key key = rs.getKey();
192 | Record record = rs.getRecord();
193 | queue.put(new KeyRecPair(new AerospikeKey(key),
194 | new AerospikeRecord(record)));
195 | }
196 | }
197 | finally {
198 | rs.close();
199 | isFinished = true;
200 | log.info("query finished");
201 | }
202 | }
203 | catch (Exception ex) {
204 | log.error("exception in ASQueryReader.run: " + ex);
205 | isError = true;
206 | }
207 | }
208 | }
209 |
210 | public AerospikeRecordReader()
211 | throws IOException {
212 | log.info("NEW CTOR");
213 | }
214 |
215 | public AerospikeRecordReader(AerospikeSplit split)
216 | throws IOException {
217 | log.info("OLD CTOR");
218 | init(split);
219 | }
220 |
221 | public void init(AerospikeSplit split)
222 | throws IOException {
223 | final String type = split.getType();
224 | final String node = split.getNode();
225 | final String host = split.getHost();
226 | final int port = split.getPort();
227 | final String namespace = split.getNameSpace();
228 | final String setName = split.getSetName();
229 | final String[] binNames = split.getBinNames();
230 | this.numrangeBin = split.getNumRangeBin();
231 | this.numrangeBegin = split.getNumRangeBegin();
232 | this.numrangeEnd = split.getNumRangeEnd();
233 | this.scanPercent = split.getScanPercent();
234 |
235 | if (type.equals("scan")) {
236 | scanReader = new ASSCanReader(node, host, port, namespace,
237 | setName, binNames, scanPercent);
238 | scanReader.start();
239 | } else if (type.equals("numrange")) {
240 | queryReader = new ASQueryReader(node, host, port, namespace,
241 | setName, binNames, numrangeBin,
242 | numrangeBegin, numrangeEnd);
243 | queryReader.start();
244 | }
245 |
246 | log.info("node: " + node);
247 | }
248 |
249 | public AerospikeKey createKey() { return new AerospikeKey(); }
250 |
251 | public AerospikeRecord createValue() { return new AerospikeRecord(); }
252 |
253 | protected AerospikeKey setCurrentKey(AerospikeKey oldApiKey,
254 | AerospikeKey newApiKey,
255 | AerospikeKey keyval) {
256 |
257 | if (oldApiKey == null) {
258 | oldApiKey = new AerospikeKey();
259 | oldApiKey.set(keyval);
260 | }
261 |
262 | // new API might not be used
263 | if (newApiKey != null) {
264 | newApiKey.set(keyval);
265 | }
266 | return oldApiKey;
267 | }
268 |
269 | protected AerospikeRecord setCurrentValue(AerospikeRecord oldApiVal,
270 | AerospikeRecord newApiVal,
271 | AerospikeRecord val) {
272 | if (oldApiVal == null) {
273 | oldApiVal = new AerospikeRecord();
274 | oldApiVal.set(val);
275 | }
276 |
277 | // new API might not be used
278 | if (newApiVal != null) {
279 | newApiVal.set(val);
280 | }
281 | return oldApiVal;
282 | }
283 |
284 | public synchronized boolean next(AerospikeKey key, AerospikeRecord value)
285 | throws IOException {
286 |
287 | final int waitMSec = 1000;
288 | int trials = 5;
289 |
290 | try {
291 | KeyRecPair pair;
292 | while (true) {
293 | if (isError)
294 | return false;
295 |
296 | if (!isRunning) {
297 | Thread.sleep(100);
298 | continue;
299 | }
300 |
301 | if (!isFinished && queue.size() == 0) {
302 | if (trials == 0) {
303 | log.error("SCAN TIMEOUT");
304 | return false;
305 | }
306 | log.info("queue empty: waiting...");
307 | Thread.sleep(waitMSec);
308 | trials--;
309 | } else if (isFinished && queue.size() == 0) {
310 | return false;
311 | } else if (queue.size() != 0) {
312 | pair = queue.take();
313 | break;
314 | }
315 | }
316 |
317 | // log.info("key=" + pair.key + ", val=" + pair.rec);
318 |
319 | currentKey = setCurrentKey(currentKey, key, pair.key);
320 | currentValue = setCurrentValue(currentValue, value, pair.rec);
321 | }
322 | catch (Exception ex) {
323 | log.error("exception in AerospikeRecordReader.next: " + ex);
324 | throw new IOException("exception in AerospikeRecordReader.next", ex);
325 | }
326 | return true;
327 | }
328 |
329 | public float getProgress() {
330 | if (isFinished)
331 | return 1.0f;
332 | else
333 | return 0.0f;
334 | }
335 |
336 | public synchronized long getPos() throws IOException {
337 | return 0;
338 | }
339 |
340 | public synchronized void close() throws IOException {
341 | if (scanReader != null) {
342 | try {
343 | scanReader.join();
344 | }
345 | catch (Exception ex) {
346 | throw new IOException("exception in AerospikeRecordReader.close",
347 | ex);
348 | }
349 | scanReader = null;
350 | }
351 | if (queryReader != null) {
352 | try {
353 | queryReader.join();
354 | }
355 | catch (Exception ex) {
356 | throw new IOException("exception in AerospikeRecordReader.close",
357 | ex);
358 | }
359 | queryReader = null;
360 | }
361 | }
362 |
363 | // ---------------- NEW API ----------------
364 |
365 | @Override
366 | public void initialize(InputSplit split, TaskAttemptContext context)
367 | throws IOException {
368 | log.info("INITIALIZE");
369 | init((AerospikeSplit) split);
370 | }
371 |
372 | @Override
373 | public boolean nextKeyValue() throws IOException {
374 | // new API call routed to old API
375 | if (currentKey == null) {
376 | currentKey = createKey();
377 | }
378 | if (currentValue == null) {
379 | currentValue = createValue();
380 | }
381 |
382 | // FIXME: does the new API mandate a new instance each time (?)
383 | return next(currentKey, currentValue);
384 | }
385 |
386 | @Override
387 | public AerospikeKey getCurrentKey() throws IOException {
388 | return currentKey;
389 | }
390 |
391 | @Override
392 | public AerospikeRecord getCurrentValue() {
393 | return currentValue;
394 | }
395 | }
396 |
397 | // Local Variables:
398 | // mode: java
399 | // c-basic-offset: 4
400 | // tab-width: 4
401 | // indent-tabs-mode: nil
402 | // End:
403 | // vim: softtabstop=4:shiftwidth=4:expandtab
404 |
--------------------------------------------------------------------------------
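For old-API (org.apache.hadoop.mapred) consumers, the reader above is driven with the classic createKey/createValue/next loop. The following sketch assumes `split` comes from AerospikeInputFormat.getSplits() and uses a hypothetical process() callback standing in for a mapper.

    import java.io.IOException;

    import com.aerospike.hadoop.mapreduce.AerospikeKey;
    import com.aerospike.hadoop.mapreduce.AerospikeRecord;
    import com.aerospike.hadoop.mapreduce.AerospikeRecordReader;
    import com.aerospike.hadoop.mapreduce.AerospikeSplit;

    public class ReaderLoopSketch {

        static void drain(AerospikeSplit split) throws IOException {
            // "OLD CTOR" path: constructing with a split starts the
            // background scan/query thread immediately via init().
            AerospikeRecordReader reader = new AerospikeRecordReader(split);
            AerospikeKey key = reader.createKey();
            AerospikeRecord rec = reader.createValue();
            while (reader.next(key, rec)) {
                // key/rec are reused across iterations (mapred contract);
                // copy them if they must outlive the loop body.
                process(key, rec);
            }
            reader.close(); // joins the background thread
        }

        static void process(AerospikeKey key, AerospikeRecord rec) {
            // Hypothetical consumer; a real job would emit to a collector.
        }
    }

The new-API path is equivalent: initialize() routes to the same init(), and nextKeyValue() delegates to next() with reader-owned key/value instances.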