├── MRAadhaarAnalysis ├── .gitignore ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── stdatalabs │ │ └── MRAadhaarAnalysis │ │ ├── Driver.java │ │ ├── NumUIDMapper.java │ │ ├── NumUIDReducer.java │ │ ├── SortComparator.java │ │ ├── SortMapper.java │ │ └── SortReducer.java │ └── test │ └── java │ └── com │ └── stdatalabs │ └── MRAadhaarAnalysis │ └── AppTest.java ├── README.md ├── SparkAadhaarAnalysis ├── .gitignore ├── pom.xml └── src │ ├── main │ └── scala │ │ └── com │ │ └── stdatalabs │ │ └── SparkAadhaarAnalysis │ │ └── UIDStats.scala │ └── test │ └── java │ └── com │ └── stdatalabs │ └── SparkAadhaarAnalysis │ └── AppTest.java └── data └── UIDAI-ENR-DETAIL-20170308.csv /MRAadhaarAnalysis/.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .*/ 3 | target/ 4 | .classpath 5 | .cache-main 6 | .cache-tests 7 | .settings 8 | .project 9 | tweets.txt 10 | checkpoint/ 11 | 12 | # Compiled source # 13 | ################### 14 | *.com 15 | *.class 16 | *.dll 17 | *.exe 18 | *.o 19 | *.so 20 | checkpoint 21 | 22 | # Packages # 23 | ############ 24 | # it's better to unpack these files and commit the raw source 25 | # git has its own built in compression methods 26 | *.7z 27 | *.dmg 28 | *.gz 29 | *.iso 30 | *.jar 31 | *.rar 32 | *.tar 33 | *.zip 34 | 35 | # Logs and databases # 36 | ###################### 37 | *.log 38 | *.sql 39 | *.sqlite 40 | 41 | # OS generated files # 42 | ###################### 43 | .DS_Store 44 | .DS_Store? 
45 | ._* 46 | .Spotlight-V100 47 | .Trashes 48 | ehthumbs.db 49 | Thumbs.db -------------------------------------------------------------------------------- /MRAadhaarAnalysis/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.stdatalabs 6 | MRAadhaarAnalysis 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | MRAadhaarAnalysis 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 2.6.0-cdh5.5.5 16 | 1.2.17 17 | 2.4 18 | 19 | 20 | 21 | 22 | junit 23 | junit 24 | 3.8.1 25 | test 26 | 27 | 28 | org.apache.hadoop 29 | hadoop-client 30 | ${hadoop.version} 31 | 32 | 33 | 34 | 35 | 36 | org.apache.maven.plugins 37 | maven-jar-plugin 38 | ${maven_jar_plugin.version} 39 | 40 | 41 | 42 | 43 | 44 | 45 | cloudera-repo 46 | http://repository.cloudera.com/artifactory/cloudera-repos/ 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /MRAadhaarAnalysis/src/main/java/com/stdatalabs/MRAadhaarAnalysis/Driver.java: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.MRAadhaarAnalysis; 2 | 3 | /*############################################################################################# 4 | # Description: Aadhaar dataset analysis using MapReduce 5 | # 6 | # Input: 7 | # 1. 
/user/cloudera/UIDAI-ENR-DETAIL-20170308.csv 8 | # 9 | # To Run this code use the command: 10 | # yarn jar MRAadhaarAnalysis-0.0.1-SNAPSHOT.jar \ 11 | # com.stdatalabs.MRAadhaarAnalysis.Driver \ 12 | # UIDAI-ENR-DETAIL-20170308.csv \ 13 | # MRStateWiseUIDCount \ 14 | # MRStateWiseUIDCount_sorted 15 | #############################################################################################*/ 16 | 17 | import org.apache.hadoop.conf.Configured; 18 | import org.apache.hadoop.fs.FileSystem; 19 | import org.apache.hadoop.fs.Path; 20 | import org.apache.hadoop.io.IntWritable; 21 | import org.apache.hadoop.io.Text; 22 | import org.apache.hadoop.mapreduce.Job; 23 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 24 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 25 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 26 | import org.apache.hadoop.util.Tool; 27 | import org.apache.hadoop.util.ToolRunner; 28 | 29 | public class Driver extends Configured implements Tool { 30 | 31 | @Override 32 | public int run(String[] args) throws Exception { 33 | 34 | if (args.length != 3) { 35 | System.out.println("Usage: [input] [output1] [output2]"); 36 | System.exit(-1); 37 | } 38 | Job stateWiseCount = Job.getInstance(getConf()); 39 | stateWiseCount.setJobName("Aadhaar Data Analysis"); 40 | stateWiseCount.setJarByClass(Driver.class); 41 | 42 | /* Field separator for reducer output*/ 43 | stateWiseCount.getConfiguration().set("mapreduce.output.textoutputformat.separator", " | "); 44 | 45 | stateWiseCount.setMapperClass(NumUIDMapper.class); 46 | stateWiseCount.setReducerClass(NumUIDReducer.class); 47 | 48 | stateWiseCount.setInputFormatClass(TextInputFormat.class); 49 | stateWiseCount.setMapOutputKeyClass(Text.class); 50 | stateWiseCount.setMapOutputValueClass(IntWritable.class); 51 | 52 | stateWiseCount.setOutputKeyClass(Text.class); 53 | stateWiseCount.setOutputValueClass(IntWritable.class); 54 | 55 | Path inputFilePath = new 
Path(args[0]); 56 | Path outputFilePath = new Path(args[1]); 57 | 58 | FileInputFormat.addInputPath(stateWiseCount, inputFilePath); 59 | FileOutputFormat.setOutputPath(stateWiseCount, outputFilePath); 60 | 61 | FileSystem fs = FileSystem.newInstance(getConf()); 62 | 63 | if (fs.exists(outputFilePath)) { 64 | fs.delete(outputFilePath, true); 65 | } 66 | 67 | stateWiseCount.waitForCompletion(true); 68 | 69 | 70 | Job sort = Job.getInstance(getConf()); 71 | sort.setJobName("Sorting States on Num Aadhaars generated"); 72 | sort.setJarByClass(Driver.class); 73 | 74 | sort.setOutputKeyClass(Text.class); 75 | sort.setOutputValueClass(IntWritable.class); 76 | 77 | sort.setMapperClass(SortMapper.class); 78 | sort.setReducerClass(SortReducer.class); 79 | sort.setSortComparatorClass(SortComparator.class); 80 | 81 | sort.setMapOutputKeyClass(IntWritable.class); 82 | sort.setMapOutputValueClass(Text.class); 83 | 84 | FileInputFormat.addInputPath(sort, new Path(args[1])); 85 | FileOutputFormat.setOutputPath(sort, new Path(args[2])); 86 | 87 | if (fs.exists(new Path(args[2]))) { 88 | fs.delete(new Path(args[2]), true); 89 | } 90 | 91 | return sort.waitForCompletion(true) ? 
0 : 1; 92 | 93 | } 94 | 95 | public static void main(String[] args) throws Exception { 96 | ToolRunner.run(new Driver(), args); 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /MRAadhaarAnalysis/src/main/java/com/stdatalabs/MRAadhaarAnalysis/NumUIDMapper.java: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.MRAadhaarAnalysis; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.apache.log4j.Logger; 10 | 11 | public class NumUIDMapper extends Mapper { 12 | 13 | Text state = new Text(); 14 | IntWritable count = new IntWritable(); 15 | private static final Logger LOG = Logger.getLogger(NumUIDMapper.class); 16 | 17 | public void map(LongWritable key, Text value, Context context) 18 | throws IOException, InterruptedException { 19 | String[] list = value.toString().split(","); 20 | 21 | if (key.get() != 0) { 22 | state.set(list[2]); 23 | count.set(Integer.parseInt(list[8])); 24 | LOG.info("Reading line: " + state + "Aadhaars generated: " 25 | + list[8]); 26 | context.write(state, count); 27 | } 28 | LOG.info("Skipped line with key:" + key.get()); 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /MRAadhaarAnalysis/src/main/java/com/stdatalabs/MRAadhaarAnalysis/NumUIDReducer.java: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.MRAadhaarAnalysis; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | public class NumUIDReducer extends 10 | Reducer { 11 | 12 | public void reduce(Text key, Iterable values, Context context) 13 | throws IOException, 
InterruptedException { 14 | int sum = 0; 15 | for (IntWritable count : values) { 16 | sum += count.get(); 17 | } 18 | 19 | context.write(key, new IntWritable(sum)); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /MRAadhaarAnalysis/src/main/java/com/stdatalabs/MRAadhaarAnalysis/SortComparator.java: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.MRAadhaarAnalysis; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.WritableComparable; 5 | import org.apache.hadoop.io.WritableComparator; 6 | 7 | public class SortComparator extends WritableComparator { 8 | 9 | @Override 10 | public int compare(WritableComparable k1, WritableComparable k2) { 11 | IntWritable v1 = (IntWritable) k1; 12 | IntWritable v2 = (IntWritable) k2; 13 | 14 | return v1.get() < v2.get() ? 1 : v1.get() == v2.get() ? 0 : -1; 15 | 16 | } 17 | 18 | protected SortComparator() { 19 | super(IntWritable.class, true); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /MRAadhaarAnalysis/src/main/java/com/stdatalabs/MRAadhaarAnalysis/SortMapper.java: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.MRAadhaarAnalysis; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | 10 | public class SortMapper extends Mapper { 11 | 12 | Text state = new Text(); 13 | IntWritable count = new IntWritable(); 14 | 15 | public void map(LongWritable key, Text value, Context context) 16 | throws IOException, InterruptedException { 17 | String[] splits = value.toString().split("\\|"); 18 | 19 | state.set(splits[0].trim()); 20 | count.set(Integer.parseInt(splits[1].trim())); 21 | 22 | context.write(count, state); 23 | } 24 
| } 25 | -------------------------------------------------------------------------------- /MRAadhaarAnalysis/src/main/java/com/stdatalabs/MRAadhaarAnalysis/SortReducer.java: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.MRAadhaarAnalysis; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | public class SortReducer extends Reducer { 10 | 11 | public void reduce(IntWritable key, Iterable values, Context context) 12 | throws IOException, InterruptedException { 13 | for (Text val : values) { 14 | 15 | context.write(val, key); 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /MRAadhaarAnalysis/src/test/java/com/stdatalabs/MRAadhaarAnalysis/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.MRAadhaarAnalysis; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MapReduce VS Spark - Aadhaar dataset analysis 2 | 3 | Analyzing Aadhaar dataset using MapReduce and Spark 4 | 5 | ## Requirements 6 | - IDE 7 | - Apache Maven 3.x 8 | - JVM 6 or 7 9 | 10 | ## Objectives 11 | * Count the number of identities(Aadhaar) generated in each state 12 | * Count the number of identities(Aadhaar) generated by each Enrollment Agency 13 | * Top 10 districts with maximum identities generated for both Male and Female 14 | 15 | ## General Info 16 | The repository contains both MapReduce and Spark projects MRAadhaarAnalysis and SparkAadhaarAnalysis 17 | * com/stdatalabs/SparkAadhaarAnalysis 18 | * UIDStats.scala -- Spark code to analyze Aadhaar dataset 19 | * com/stdatalabs/MRAadhaarAnalysis 20 | * NumUIDMapper.java -- Filters the header and writes (State, Aadhaar_generated) to mapper output 21 | * NumUIDReducer.java -- Aggregates values for each State that is received as key from the mapper and outputs the State wise identities generated 22 | * SortMapper.java -- Receives output from previous MR job and swaps the (K, V) pair 23 | * SortComparator.java -- Sorts the mapper output in descending order before passing to reducer 24 | * SortReducer.java -- Swaps the (K, V) pair into (State, count) and sends to output file 25 | * Driver.java -- Driver program for 
MapReduce jobs 26 | 27 | ## Description 28 | * A comparison between MapReduce and Apache Spark Dataframes code for analyzing Aadhaar dataset 29 | Discussed in blog -- 30 | [MapReduce VS Spark - Aadhaar dataset analysis](http://stdatalabs.blogspot.in/2017/03/mapreduce-vs-spark-aadhaar-dataset-analysis.html) 31 | 32 | ### More articles on hadoop technology stack at [stdatalabs](stdatalabs.blogspot.com) 33 | 34 | -------------------------------------------------------------------------------- /SparkAadhaarAnalysis/.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .*/ 3 | target/ 4 | .classpath 5 | .cache-main 6 | .cache-tests 7 | .settings 8 | .project 9 | tweets.txt 10 | checkpoint/ 11 | 12 | # Compiled source # 13 | ################### 14 | *.com 15 | *.class 16 | *.dll 17 | *.exe 18 | *.o 19 | *.so 20 | checkpoint 21 | 22 | # Packages # 23 | ############ 24 | # it's better to unpack these files and commit the raw source 25 | # git has its own built in compression methods 26 | *.7z 27 | *.dmg 28 | *.gz 29 | *.iso 30 | *.jar 31 | *.rar 32 | *.tar 33 | *.zip 34 | 35 | # Logs and databases # 36 | ###################### 37 | *.log 38 | *.sql 39 | *.sqlite 40 | 41 | # OS generated files # 42 | ###################### 43 | .DS_Store 44 | .DS_Store? 
45 | ._* 46 | .Spotlight-V100 47 | .Trashes 48 | ehthumbs.db 49 | Thumbs.db -------------------------------------------------------------------------------- /SparkAadhaarAnalysis/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.stdatalabs 6 | SparkAadhaarAnalysis 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | SparkAadhaarAnalysis 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.spark 26 | spark-core_2.10 27 | 1.5.0 28 | compile 29 | 30 | 31 | org.apache.spark 32 | spark-mllib_2.10 33 | 1.5.0 34 | 35 | 36 | org.apache.spark 37 | spark-sql_2.10 38 | 1.5.0 39 | 40 | 41 | org.apache.spark 42 | spark-hive_2.10 43 | 1.5.0 44 | 45 | 46 | org.apache.hadoop 47 | hadoop-client 48 | 2.6.0 49 | compile 50 | 51 | 52 | org.apache.hadoop 53 | hadoop-common 54 | 2.6.0 55 | compile 56 | 57 | 58 | com.databricks 59 | spark-csv_2.10 60 | 1.5.0 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /SparkAadhaarAnalysis/src/main/scala/com/stdatalabs/SparkAadhaarAnalysis/UIDStats.scala: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.SparkAadhaarAnalysis 2 | 3 | /*############################################################################################# 4 | # Description: Aadhaar dataset analysis using Spark 5 | ## 6 | # Input: 7 | # 1. 
# /user/cloudera/UIDAI-ENR-DETAIL-20170308.csv
#
# To Run this code use the command:
# spark-submit --class com.stdatalabs.SparkAadhaarAnalysis.UIDStats \
#   --packages com.databricks:spark-csv_2.10:1.5.0 \
#   --master yarn-client \
#   SparkAadhaarAnalysis-0.0.1-SNAPSHOT.jar \
#   /user/cloudera/UIDAI-ENR-DETAIL-20170308.csv \
#############################################################################################*/

// Scala Imports
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.sql._
import org.apache.spark.sql.SQLContext._
import org.apache.spark.sql.hive.HiveContext

/**
 * Aadhaar enrolment analysis: loads the UIDAI CSV via spark-csv and writes
 * three aggregate Hive tables in the `uid` database.
 */
object UIDStats {

  // Driver-side Spark wiring; initialized once when the object is first referenced.
  val conf = new SparkConf().setAppName("Aadhaar dataset analysis using Spark")
  val sc = new SparkContext(conf)

  // HiveContext so results can be persisted with saveAsTable into Hive.
  val hiveContext = new HiveContext(sc)
  import hiveContext.implicits._

  def main(args: Array[String]) {

    // BUG FIX: fail fast with a usage message instead of an
    // ArrayIndexOutOfBoundsException when the input path is missing.
    if (args.length < 1) {
      System.err.println("Usage: UIDStats <input csv path>")
      System.exit(1)
    }

    // Register dataset as a temp table (header row used for column names,
    // column types inferred by spark-csv).
    val uidEnrolmentDF = hiveContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load(args(0))
    uidEnrolmentDF.registerTempTable("uid_enrolments_detail")

    // Create a hive table with Total Aadhaar's generated for each state
    val stateWiseCountDF = hiveContext.sql("""
      | SELECT State,
      | SUM(`Aadhaar generated`) as count
      | FROM uid_enrolments_detail
      | GROUP BY state
      | ORDER BY count DESC""".stripMargin)

    stateWiseCountDF.write.mode("overwrite").saveAsTable("uid.state_wise_count")

    // Create a hive table with Total Aadhaar's generated by each enrolment agency
    val maxEnrolmentAgencyDF = hiveContext.sql("""
      | SELECT `Enrolment Agency` as Enrolment_Agency,
      | SUM(`Aadhaar generated`) as count
      | FROM uid_enrolments_detail
      | GROUP BY `Enrolment Agency`
      | ORDER BY count DESC""".stripMargin)

    maxEnrolmentAgencyDF.write.mode("overwrite").saveAsTable("uid.agency_wise_count")

    // Create hive table with top 10 districts with maximum Aadhaar's generated
    // for both Male and Female
    val districtWiseGenderCountDF = hiveContext.sql("""
      | SELECT District,
      | count(CASE WHEN Gender='M' THEN 1 END) as male_count,
      | count(CASE WHEN Gender='F' THEN 1 END) as FEMALE_count
      | FROM uid_enrolments_detail
      | GROUP BY District
      | ORDER BY male_count DESC, FEMALE_count DESC
      | LIMIT 10""".stripMargin)

    districtWiseGenderCountDF.write.mode("overwrite").saveAsTable("uid.district_wise_gndr_count")

    // BUG FIX: release cluster resources; the original never stopped the
    // SparkContext, leaving the YARN application running.
    sc.stop()
  }

}