├── MRAadhaarAnalysis
├── .gitignore
├── pom.xml
└── src
│ ├── main
│ └── java
│ │ └── com
│ │ └── stdatalabs
│ │ └── MRAadhaarAnalysis
│ │ ├── Driver.java
│ │ ├── NumUIDMapper.java
│ │ ├── NumUIDReducer.java
│ │ ├── SortComparator.java
│ │ ├── SortMapper.java
│ │ └── SortReducer.java
│ └── test
│ └── java
│ └── com
│ └── stdatalabs
│ └── MRAadhaarAnalysis
│ └── AppTest.java
├── README.md
├── SparkAadhaarAnalysis
├── .gitignore
├── pom.xml
└── src
│ ├── main
│ └── scala
│ │ └── com
│ │ └── stdatalabs
│ │ └── SparkAadhaarAnalysis
│ │ └── UIDStats.scala
│ └── test
│ └── java
│ └── com
│ └── stdatalabs
│ └── SparkAadhaarAnalysis
│ └── AppTest.java
└── data
└── UIDAI-ENR-DETAIL-20170308.csv
/MRAadhaarAnalysis/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .*/
3 | target/
4 | .classpath
5 | .cache-main
6 | .cache-tests
7 | .settings
8 | .project
9 | tweets.txt
10 | checkpoint/
11 |
12 | # Compiled source #
13 | ###################
14 | *.com
15 | *.class
16 | *.dll
17 | *.exe
18 | *.o
19 | *.so
20 | checkpoint
21 |
22 | # Packages #
23 | ############
24 | # it's better to unpack these files and commit the raw source
25 | # git has its own built in compression methods
26 | *.7z
27 | *.dmg
28 | *.gz
29 | *.iso
30 | *.jar
31 | *.rar
32 | *.tar
33 | *.zip
34 |
35 | # Logs and databases #
36 | ######################
37 | *.log
38 | *.sql
39 | *.sqlite
40 |
41 | # OS generated files #
42 | ######################
43 | .DS_Store
44 | .DS_Store?
45 | ._*
46 | .Spotlight-V100
47 | .Trashes
48 | ehthumbs.db
49 | Thumbs.db
--------------------------------------------------------------------------------
/MRAadhaarAnalysis/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.stdatalabs
6 | MRAadhaarAnalysis
7 | 0.0.1-SNAPSHOT
8 | jar
9 |
10 | MRAadhaarAnalysis
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 | 2.6.0-cdh5.5.5
16 | 1.2.17
17 | 2.4
18 |
19 |
20 |
21 |
22 | junit
23 | junit
24 | 3.8.1
25 | test
26 |
27 |
28 | org.apache.hadoop
29 | hadoop-client
30 | ${hadoop.version}
31 |
32 |
33 |
34 |
35 |
36 | org.apache.maven.plugins
37 | maven-jar-plugin
38 | ${maven_jar_plugin.version}
39 |
40 |
41 |
42 |
43 |
44 |
45 | cloudera-repo
46 | http://repository.cloudera.com/artifactory/cloudera-repos/
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/MRAadhaarAnalysis/src/main/java/com/stdatalabs/MRAadhaarAnalysis/Driver.java:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.MRAadhaarAnalysis;
2 |
3 | /*#############################################################################################
4 | # Description: Aadhaar dataset analysis using MapReduce
5 | #
6 | # Input:
7 | # 1. /user/cloudera/UIDAI-ENR-DETAIL-20170308.csv
8 | #
9 | # To Run this code use the command:
10 | # yarn jar MRAadhaarAnalysis-0.0.1-SNAPSHOT.jar \
11 | # com.stdatalabs.MRAadhaarAnalysis.Driver \
12 | # UIDAI-ENR-DETAIL-20170308.csv \
13 | # MRStateWiseUIDCount \
14 | # MRStateWiseUIDCount_sorted
15 | #############################################################################################*/
16 |
17 | import org.apache.hadoop.conf.Configured;
18 | import org.apache.hadoop.fs.FileSystem;
19 | import org.apache.hadoop.fs.Path;
20 | import org.apache.hadoop.io.IntWritable;
21 | import org.apache.hadoop.io.Text;
22 | import org.apache.hadoop.mapreduce.Job;
23 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
24 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
25 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
26 | import org.apache.hadoop.util.Tool;
27 | import org.apache.hadoop.util.ToolRunner;
28 |
29 | public class Driver extends Configured implements Tool {
30 |
31 | @Override
32 | public int run(String[] args) throws Exception {
33 |
34 | if (args.length != 3) {
35 | System.out.println("Usage: [input] [output1] [output2]");
36 | System.exit(-1);
37 | }
38 | Job stateWiseCount = Job.getInstance(getConf());
39 | stateWiseCount.setJobName("Aadhaar Data Analysis");
40 | stateWiseCount.setJarByClass(Driver.class);
41 |
42 | /* Field separator for reducer output*/
43 | stateWiseCount.getConfiguration().set("mapreduce.output.textoutputformat.separator", " | ");
44 |
45 | stateWiseCount.setMapperClass(NumUIDMapper.class);
46 | stateWiseCount.setReducerClass(NumUIDReducer.class);
47 |
48 | stateWiseCount.setInputFormatClass(TextInputFormat.class);
49 | stateWiseCount.setMapOutputKeyClass(Text.class);
50 | stateWiseCount.setMapOutputValueClass(IntWritable.class);
51 |
52 | stateWiseCount.setOutputKeyClass(Text.class);
53 | stateWiseCount.setOutputValueClass(IntWritable.class);
54 |
55 | Path inputFilePath = new Path(args[0]);
56 | Path outputFilePath = new Path(args[1]);
57 |
58 | FileInputFormat.addInputPath(stateWiseCount, inputFilePath);
59 | FileOutputFormat.setOutputPath(stateWiseCount, outputFilePath);
60 |
61 | FileSystem fs = FileSystem.newInstance(getConf());
62 |
63 | if (fs.exists(outputFilePath)) {
64 | fs.delete(outputFilePath, true);
65 | }
66 |
67 | stateWiseCount.waitForCompletion(true);
68 |
69 |
70 | Job sort = Job.getInstance(getConf());
71 | sort.setJobName("Sorting States on Num Aadhaars generated");
72 | sort.setJarByClass(Driver.class);
73 |
74 | sort.setOutputKeyClass(Text.class);
75 | sort.setOutputValueClass(IntWritable.class);
76 |
77 | sort.setMapperClass(SortMapper.class);
78 | sort.setReducerClass(SortReducer.class);
79 | sort.setSortComparatorClass(SortComparator.class);
80 |
81 | sort.setMapOutputKeyClass(IntWritable.class);
82 | sort.setMapOutputValueClass(Text.class);
83 |
84 | FileInputFormat.addInputPath(sort, new Path(args[1]));
85 | FileOutputFormat.setOutputPath(sort, new Path(args[2]));
86 |
87 | if (fs.exists(new Path(args[2]))) {
88 | fs.delete(new Path(args[2]), true);
89 | }
90 |
91 | return sort.waitForCompletion(true) ? 0 : 1;
92 |
93 | }
94 |
95 | public static void main(String[] args) throws Exception {
96 | ToolRunner.run(new Driver(), args);
97 | }
98 |
99 | }
100 |
--------------------------------------------------------------------------------
/MRAadhaarAnalysis/src/main/java/com/stdatalabs/MRAadhaarAnalysis/NumUIDMapper.java:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.MRAadhaarAnalysis;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.LongWritable;
7 | import org.apache.hadoop.io.Text;
8 | import org.apache.hadoop.mapreduce.Mapper;
9 | import org.apache.log4j.Logger;
10 |
11 | public class NumUIDMapper extends Mapper {
12 |
13 | Text state = new Text();
14 | IntWritable count = new IntWritable();
15 | private static final Logger LOG = Logger.getLogger(NumUIDMapper.class);
16 |
17 | public void map(LongWritable key, Text value, Context context)
18 | throws IOException, InterruptedException {
19 | String[] list = value.toString().split(",");
20 |
21 | if (key.get() != 0) {
22 | state.set(list[2]);
23 | count.set(Integer.parseInt(list[8]));
24 | LOG.info("Reading line: " + state + "Aadhaars generated: "
25 | + list[8]);
26 | context.write(state, count);
27 | }
28 | LOG.info("Skipped line with key:" + key.get());
29 | }
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/MRAadhaarAnalysis/src/main/java/com/stdatalabs/MRAadhaarAnalysis/NumUIDReducer.java:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.MRAadhaarAnalysis;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Text;
7 | import org.apache.hadoop.mapreduce.Reducer;
8 |
9 | public class NumUIDReducer extends
10 | Reducer {
11 |
12 | public void reduce(Text key, Iterable values, Context context)
13 | throws IOException, InterruptedException {
14 | int sum = 0;
15 | for (IntWritable count : values) {
16 | sum += count.get();
17 | }
18 |
19 | context.write(key, new IntWritable(sum));
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/MRAadhaarAnalysis/src/main/java/com/stdatalabs/MRAadhaarAnalysis/SortComparator.java:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.MRAadhaarAnalysis;
2 |
3 | import org.apache.hadoop.io.IntWritable;
4 | import org.apache.hadoop.io.WritableComparable;
5 | import org.apache.hadoop.io.WritableComparator;
6 |
7 | public class SortComparator extends WritableComparator {
8 |
9 | @Override
10 | public int compare(WritableComparable k1, WritableComparable k2) {
11 | IntWritable v1 = (IntWritable) k1;
12 | IntWritable v2 = (IntWritable) k2;
13 |
14 | return v1.get() < v2.get() ? 1 : v1.get() == v2.get() ? 0 : -1;
15 |
16 | }
17 |
18 | protected SortComparator() {
19 | super(IntWritable.class, true);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/MRAadhaarAnalysis/src/main/java/com/stdatalabs/MRAadhaarAnalysis/SortMapper.java:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.MRAadhaarAnalysis;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.LongWritable;
7 | import org.apache.hadoop.io.Text;
8 | import org.apache.hadoop.mapreduce.Mapper;
9 |
10 | public class SortMapper extends Mapper {
11 |
12 | Text state = new Text();
13 | IntWritable count = new IntWritable();
14 |
15 | public void map(LongWritable key, Text value, Context context)
16 | throws IOException, InterruptedException {
17 | String[] splits = value.toString().split("\\|");
18 |
19 | state.set(splits[0].trim());
20 | count.set(Integer.parseInt(splits[1].trim()));
21 |
22 | context.write(count, state);
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/MRAadhaarAnalysis/src/main/java/com/stdatalabs/MRAadhaarAnalysis/SortReducer.java:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.MRAadhaarAnalysis;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Text;
7 | import org.apache.hadoop.mapreduce.Reducer;
8 |
9 | public class SortReducer extends Reducer {
10 |
11 | public void reduce(IntWritable key, Iterable values, Context context)
12 | throws IOException, InterruptedException {
13 | for (Text val : values) {
14 |
15 | context.write(val, key);
16 | }
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/MRAadhaarAnalysis/src/test/java/com/stdatalabs/MRAadhaarAnalysis/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.MRAadhaarAnalysis;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Create the test case
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return the suite of tests being tested
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Rigourous Test :-)
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MapReduce VS Spark - Aadhaar dataset analysis
2 |
3 | Analyzing Aadhaar dataset using MapReduce and Spark
4 |
5 | ## Requirements
6 | - IDE
7 | - Apache Maven 3.x
8 | - JVM 6 or 7
9 |
10 | ## Objectives
11 | * Count the number of identities (Aadhaar) generated in each state
12 | * Count the number of identities (Aadhaar) generated by each Enrollment Agency
13 | * Top 10 districts with the maximum identities generated for both Male and Female
14 |
15 | ## General Info
16 | The repository contains both the MapReduce and Spark projects: MRAadhaarAnalysis and SparkAadhaarAnalysis
17 | * com/stdatalabs/SparkAadhaarAnalysis
18 | * UIDStats.scala -- Spark code to analyze Aadhaar dataset
19 | * com/stdatalabs/MRAadhaarAnalysis
20 | * NumUIDMapper.java -- Filters the header and writes (State, Aadhaar_generated) to mapper output
21 | * NumUIDReducer.java -- Aggregates values for each State that is received as key from the mapper and outputs the State wise identities generated
22 | * SortMapper.java -- Receives output from previous MR job and swaps the (K, V) pair
23 | * SortComparator.java -- Sorts the mapper output in descending order before passing to reducer
24 | * SortReducer.java -- Swaps the (K, V) pair into (State, count) and sends to output file
25 | * Driver.java -- Driver program for MapReduce jobs
26 |
27 | ## Description
28 | * A comparison between MapReduce and Apache Spark Dataframes code for analyzing Aadhaar dataset
29 | Discussed in blog --
30 | [MapReduce VS Spark - Aadhaar dataset analysis](http://stdatalabs.blogspot.in/2017/03/mapreduce-vs-spark-aadhaar-dataset-analysis.html)
31 |
32 | ### More articles on the hadoop technology stack at [stdatalabs](http://stdatalabs.blogspot.com)
33 |
34 |
--------------------------------------------------------------------------------
/SparkAadhaarAnalysis/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .*/
3 | target/
4 | .classpath
5 | .cache-main
6 | .cache-tests
7 | .settings
8 | .project
9 | tweets.txt
10 | checkpoint/
11 |
12 | # Compiled source #
13 | ###################
14 | *.com
15 | *.class
16 | *.dll
17 | *.exe
18 | *.o
19 | *.so
20 | checkpoint
21 |
22 | # Packages #
23 | ############
24 | # it's better to unpack these files and commit the raw source
25 | # git has its own built in compression methods
26 | *.7z
27 | *.dmg
28 | *.gz
29 | *.iso
30 | *.jar
31 | *.rar
32 | *.tar
33 | *.zip
34 |
35 | # Logs and databases #
36 | ######################
37 | *.log
38 | *.sql
39 | *.sqlite
40 |
41 | # OS generated files #
42 | ######################
43 | .DS_Store
44 | .DS_Store?
45 | ._*
46 | .Spotlight-V100
47 | .Trashes
48 | ehthumbs.db
49 | Thumbs.db
--------------------------------------------------------------------------------
/SparkAadhaarAnalysis/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.stdatalabs
6 | SparkAadhaarAnalysis
7 | 0.0.1-SNAPSHOT
8 | jar
9 |
10 | SparkAadhaarAnalysis
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 |
16 |
17 |
18 |
19 | junit
20 | junit
21 | 3.8.1
22 | test
23 |
24 |
25 | org.apache.spark
26 | spark-core_2.10
27 | 1.5.0
28 | compile
29 |
30 |
31 | org.apache.spark
32 | spark-mllib_2.10
33 | 1.5.0
34 |
35 |
36 | org.apache.spark
37 | spark-sql_2.10
38 | 1.5.0
39 |
40 |
41 | org.apache.spark
42 | spark-hive_2.10
43 | 1.5.0
44 |
45 |
46 | org.apache.hadoop
47 | hadoop-client
48 | 2.6.0
49 | compile
50 |
51 |
52 | org.apache.hadoop
53 | hadoop-common
54 | 2.6.0
55 | compile
56 |
57 |
58 | com.databricks
59 | spark-csv_2.10
60 | 1.5.0
61 |
62 |
63 |
64 |
--------------------------------------------------------------------------------
/SparkAadhaarAnalysis/src/main/scala/com/stdatalabs/SparkAadhaarAnalysis/UIDStats.scala:
--------------------------------------------------------------------------------
package com.stdatalabs.SparkAadhaarAnalysis

/*#############################################################################################
# Description: Aadhaar dataset analysis using Spark
##
# Input:
#   1. /user/cloudera/UIDAI-ENR-DETAIL-20170308.csv
#
# To Run this code use the command:
# spark-submit --class com.stdatalabs.SparkAadhaarAnalysis.UIDStats \
#   --packages com.databricks:spark-csv_2.10:1.5.0 \
#   --master yarn-client \
#   SparkAadhaarAnalysis-0.0.1-SNAPSHOT.jar \
#   /user/cloudera/UIDAI-ENR-DETAIL-20170308.csv \
#############################################################################################*/

// Scala Imports
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.sql._
import org.apache.spark.sql.SQLContext._
import org.apache.spark.sql.hive.HiveContext

/**
 * Reads the Aadhaar enrolment CSV (via spark-csv) and writes three Hive
 * tables: uid.state_wise_count, uid.agency_wise_count and
 * uid.district_wise_gndr_count.
 * NOTE(review): assumes the Hive database `uid` already exists — confirm.
 */
object UIDStats {

  // Object-level vals are initialized lazily on first access to UIDStats,
  // i.e. when spark-submit invokes main.
  val conf = new SparkConf().setAppName("Aadhaar dataset analysis using Spark")
  val sc = new SparkContext(conf)

  val hiveContext = new HiveContext(sc)
  import hiveContext.implicits._

  def main(args: Array[String]) {

    // FIX: fail fast with a usage message instead of an
    // ArrayIndexOutOfBoundsException when the input path is missing.
    if (args.length < 1) {
      System.err.println("Usage: UIDStats <input-csv-path>")
      System.exit(1)
    }

    // Register dataset as a temp table
    val uidEnrolmentDF = hiveContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load(args(0))
    uidEnrolmentDF.registerTempTable("uid_enrolments_detail")

    // Create a hive table with Total Aadhaar's generated for each state
    val stateWiseCountDF = hiveContext.sql("""
       | SELECT State,
       | SUM(`Aadhaar generated`) as count
       | FROM uid_enrolments_detail
       | GROUP BY state
       | ORDER BY count DESC""".stripMargin)

    stateWiseCountDF.write.mode("overwrite").saveAsTable("uid.state_wise_count")

    // Create a hive table with Total Aadhaar's generated by each enrolment agency
    val maxEnrolmentAgencyDF = hiveContext.sql("""
       | SELECT `Enrolment Agency` as Enrolment_Agency,
       | SUM(`Aadhaar generated`) as count
       | FROM uid_enrolments_detail
       | GROUP BY `Enrolment Agency`
       | ORDER BY count DESC""".stripMargin)

    maxEnrolmentAgencyDF.write.mode("overwrite").saveAsTable("uid.agency_wise_count")

    // Create hive table with top 10 districts with maximum Aadhaar's generated
    // for both Male and Female.
    // NOTE(review): the mixed-case FEMALE_count column name is kept as-is —
    // renaming it would change the saved table's schema.
    val districtWiseGenderCountDF = hiveContext.sql("""
       | SELECT District,
       | count(CASE WHEN Gender='M' THEN 1 END) as male_count,
       | count(CASE WHEN Gender='F' THEN 1 END) as FEMALE_count
       | FROM uid_enrolments_detail
       | GROUP BY District
       | ORDER BY male_count DESC, FEMALE_count DESC
       | LIMIT 10""".stripMargin)

    districtWiseGenderCountDF.write.mode("overwrite").saveAsTable("uid.district_wise_gndr_count")
  }

}
--------------------------------------------------------------------------------
/SparkAadhaarAnalysis/src/test/java/com/stdatalabs/SparkAadhaarAnalysis/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.SparkAadhaarAnalysis;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Create the test case
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return the suite of tests being tested
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Rigourous Test :-)
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------