├── HashtagAnalysis ├── .classpath ├── .project ├── .settings │ └── org.eclipse.jdt.core.prefs ├── HashtagAnalysis.xls ├── bin │ ├── HMapper.class │ ├── HReducer.class │ └── HashtagAnalysis.class ├── build.xml ├── classes │ ├── HMapper.class │ ├── HReducer.class │ └── HashtagAnalysis.class ├── dist │ └── HashtagAnalysis.jar ├── out │ └── HashtagAnalysis.txt └── src │ ├── HMapper.java │ ├── HReducer.java │ └── HashtagAnalysis.java ├── README.md ├── Report.pdf ├── TextAnalysis ├── .classpath ├── .project ├── .settings │ └── org.eclipse.jdt.core.prefs ├── TextAnalysis.xls ├── bin │ ├── TMapper.class │ ├── TReducer.class │ └── TextAnalysis.class ├── build.xml ├── classes │ ├── TMapper.class │ ├── TReducer.class │ └── TextAnalysis.class ├── dist │ └── TextAnalysis.jar ├── out │ ├── TextAnalysis0.txt │ └── TextAnalysis1.txt └── src │ ├── TMapper.java │ ├── TReducer.java │ └── TextAnalysis.java ├── TextAnalysisAvg ├── .classpath ├── .project ├── .settings │ └── org.eclipse.jdt.core.prefs ├── bin │ ├── TAvgMapper.class │ ├── TAvgReducer.class │ ├── TextAnalysisAvg.class │ └── inintpair │ │ └── IntIntPair.class ├── build.xml ├── classes │ ├── TAvgMapper.class │ ├── TAvgReducer.class │ ├── TextAnalysisAvg.class │ └── inintpair │ │ └── IntIntPair.class ├── dist │ └── TextAnalysisAvg.jar ├── out │ └── TextAnalysisAvg.txt └── src │ ├── TAvgMapper.java │ ├── TAvgReducer.java │ ├── TextAnalysisAvg.java │ └── inintpair │ └── IntIntPair.java └── TimeAnalysis ├── .classpath ├── .project ├── .settings └── org.eclipse.jdt.core.prefs ├── TimeAnalysis.xls ├── bin ├── TimeAnalysis.class ├── TimeMapper.class └── TimeReducer.class ├── build.xml ├── classes ├── TimeAnalysis.class ├── TimeMapper.class └── TimeReducer.class ├── dist └── TimeAnalysis.jar ├── out └── TimeAnalysis.txt └── src ├── TimeAnalysis.java ├── TimeMapper.java └── TimeReducer.java /HashtagAnalysis/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 
| 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /HashtagAnalysis/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | HashtagAnalysis 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /HashtagAnalysis/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.7 12 | -------------------------------------------------------------------------------- /HashtagAnalysis/HashtagAnalysis.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/HashtagAnalysis/HashtagAnalysis.xls -------------------------------------------------------------------------------- /HashtagAnalysis/bin/HMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/HashtagAnalysis/bin/HMapper.class -------------------------------------------------------------------------------- /HashtagAnalysis/bin/HReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/HashtagAnalysis/bin/HReducer.class -------------------------------------------------------------------------------- /HashtagAnalysis/bin/HashtagAnalysis.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/HashtagAnalysis/bin/HashtagAnalysis.class -------------------------------------------------------------------------------- /HashtagAnalysis/build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Sample MapReduce project build file 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /HashtagAnalysis/classes/HMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/HashtagAnalysis/classes/HMapper.class 
-------------------------------------------------------------------------------- /HashtagAnalysis/classes/HReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/HashtagAnalysis/classes/HReducer.class -------------------------------------------------------------------------------- /HashtagAnalysis/classes/HashtagAnalysis.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/HashtagAnalysis/classes/HashtagAnalysis.class -------------------------------------------------------------------------------- /HashtagAnalysis/dist/HashtagAnalysis.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/HashtagAnalysis/dist/HashtagAnalysis.jar -------------------------------------------------------------------------------- /HashtagAnalysis/out/HashtagAnalysis.txt: -------------------------------------------------------------------------------- 1 | 224 2 | 22indonesia 204 3 | 4gold 1916 4 | 4goldindia 255 5 | _gb 499 6 | africa 770 7 | al 394 8 | america 572 9 | at 989 10 | aus 920 11 | aussie 2261 12 | aussies 8216 13 | australia 1212 14 | autofollow 403 15 | bahamas 559 16 | bears 485 17 | belgium 505 18 | bg 244 19 | blake 200 20 | bletoffire 5276 21 | blue 662 22 | bolt 2007 23 | boomers 341 24 | boomersgo 255 25 | botswana 227 26 | brasil 749 27 | brazil 1441 28 | brijamada 300 29 | cana 218 30 | canad 216 31 | canada 19218 32 | canadago 9067 33 | caribbean 807 34 | caster 1426 35 | cav 510 36 | china 1177 37 | croatia 431 38 | d 333 39 | daley 671 40 | dblessamerica 451 41 | dsavethequeen 809 42 | ducks 469 43 | egypt 1107 44 | ennis 256 45 | ethiopia 1651 46 | fair2012 
675 47 | federer 502 48 | follback 292 49 | followback 9000 50 | forgold 6163 51 | forthegold 865 52 | france 1224 53 | g 367 54 | gabby 1270 55 | gators 483 56 | gav 201 57 | gb 398581 58 | gbboxing 247 59 | gbfootball 224 60 | gbgymnastics 508 61 | gbr 624 62 | germany 532 63 | ghana 426 64 | girl 886 65 | girls 1194 66 | gold 386 67 | greatbritain 288 68 | green 331 69 | grenada 945 70 | haiti 740 71 | holland 206 72 | hungary 827 73 | ina 470 74 | india 1434 75 | indonesia 2442 76 | inforgold 400 77 | ingforgold 3608 78 | ingforthegold 294 79 | iran 570 80 | ireland 17798 81 | israel 1048 82 | italy 1052 83 | ja 1167 84 | jamaica 21033 85 | japan 985 86 | jess 561 87 | jet 223 88 | jo 370 89 | katie 1355 90 | kb24 215 91 | kenya 11152 92 | korea 515 93 | l 252 94 | latvia 582 95 | ld 63486 96 | ldbitches 1111 97 | lden 2393 98 | ldengirl 652 99 | ldengirls 514 100 | ldenhoy 1895 101 | ldformalaysia 233 102 | ldforsure 513 103 | ldgoldgold 258 104 | ldmedal 8328 105 | ldmedalgame 306 106 | ldmedalist 3350 107 | ldmedalmatch 398 108 | ldmedals 731 109 | ldmedalstamp 354 110 | ldmedalstamps 1051 111 | ldmetal 231 112 | ldorbust 1009 113 | ldpostbox 252 114 | ldrush 2005 115 | lds 227 116 | ldwednesday 9431 117 | lebron 1069 118 | lf 624 119 | lfclap 228 120 | libra 233 121 | lochte 1450 122 | ltu 392 123 | malaysia 9077 124 | matty 287 125 | mexico 1355 126 | mo 9936 127 | mogo 260 128 | mongolia 1673 129 | murray 519 130 | naija 403 131 | ne 334 132 | nieto 241 133 | nigeria 5716 134 | nl 2632 135 | nz 695 136 | od 355 137 | odbye 268 138 | odday 222 139 | odjob 326 140 | odluck 1196 141 | odlucktom 206 142 | odnight 407 143 | odolddays 1925 144 | odtimes 726 145 | odyearformgar 13590 146 | ogle 3787 147 | ogledoodle 853 148 | olympicmas 408 149 | on 202 150 | oner 241 151 | onerontour 300 152 | onjj 204 153 | opals 625 154 | opalsgo 221 155 | osebumps 1272 156 | p 969 157 | pakistan 475 158 | paralympicsgb 1128 159 | ph 2716 160 | phelps 2270 161 | philippines 
1604 162 | pilipinas 215 163 | pinas 340 164 | poland 575 165 | puertorico 753 166 | pursuit 337 167 | qatar 1077 168 | rgeous 513 169 | rin 21317 170 | roger 255 171 | rsa 247 172 | russia 2093 173 | sa 10801 174 | sally 310 175 | samsung 208 176 | scotland 203 177 | serbia 303 178 | sheikie 247 179 | singapore 785 180 | skank 230 181 | somalia 324 182 | southafrica 717 183 | spain 550 184 | sprint 227 185 | ssip 1072 186 | supermodel 653 187 | sweden 206 188 | tea 3314 189 | tn 764 190 | tom 2335 191 | turkey 391 192 | twin 352 193 | u 488 194 | ud 558 195 | uganda 728 196 | uk 232 197 | us 545 198 | usa 203459 199 | usabasketball 265 200 | ve 302 201 | wiggo 544 202 | wor 274 203 | world 583 204 | yorkshire 729 205 | zimbabwe 286 206 | -------------------------------------------------------------------------------- /HashtagAnalysis/src/HMapper.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | 3 | import org.apache.commons.lang.StringUtils; 4 | import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | import java.util.regex.Matcher; 9 | import java.util.regex.Pattern; 10 | 11 | public class HMapper extends Mapper { 12 | 13 | private final IntWritable one = new IntWritable(1); 14 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 15 | 16 | String tweets = value.toString(); 17 | 18 | //if there is a 4th semicolon... 
19 | if(StringUtils.ordinalIndexOf(tweets,";",4)>-1){ 20 | 21 | int startIndex = StringUtils.ordinalIndexOf(tweets,";",2) + 1; 22 | 23 | String tweet = tweets.substring(startIndex,tweets.lastIndexOf(';')); 24 | tweet = tweet.toLowerCase(); 25 | 26 | Matcher matcher = Pattern.compile("#go\\s*(\\w+)").matcher(tweet); 27 | 28 | while(matcher.find()){ 29 | try{ 30 | 31 | String team = tweet.substring(matcher.start() + 3, matcher.start() + 7); 32 | if(team.equals("team")){ 33 | 34 | context.write(new Text(tweet.substring(matcher.start() + 7, matcher.end())), one); 35 | break; 36 | } 37 | 38 | } catch(StringIndexOutOfBoundsException e){ 39 | 40 | } 41 | context.write(new Text(matcher.group(1)), one); 42 | 43 | } 44 | 45 | matcher = Pattern.compile("#team\\*s*(\\w+)").matcher(tweet); 46 | 47 | while(matcher.find()){ 48 | context.write(new Text(matcher.group(1)), one); 49 | 50 | } 51 | 52 | } 53 | 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /HashtagAnalysis/src/HReducer.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.Iterator; 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | 8 | import java.io.IOException; 9 | import java.util.Iterator; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | 14 | 15 | public class HReducer extends Reducer { 16 | 17 | public void reduce(Text key, Iterable values, Context context) 18 | 19 | throws IOException, InterruptedException { 20 | 21 | int sum = 0; 22 | 23 | for (IntWritable value : values) { 24 | 25 | sum = sum + value.get(); 26 | 27 | } 28 | context.write(key, new IntWritable(sum)); 29 | 30 | } 31 | 32 | } -------------------------------------------------------------------------------- 
/HashtagAnalysis/src/HashtagAnalysis.java: -------------------------------------------------------------------------------- 1 | import java.util.Arrays; 2 | 3 | import org.apache.commons.lang.StringUtils; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | public class HashtagAnalysis { 14 | 15 | public static void runJob(String[] input, String output) throws Exception { 16 | 17 | Configuration conf = new Configuration(); 18 | 19 | Job job = new Job(conf); 20 | job.setJarByClass(HashtagAnalysis.class); 21 | job.setMapperClass(HMapper.class); 22 | job.setReducerClass(HReducer.class); 23 | job.setMapOutputKeyClass(Text.class); 24 | job.setMapOutputValueClass(IntWritable.class); 25 | Path outputPath = new Path(output); 26 | FileInputFormat.setInputPaths(job, StringUtils.join(input, ",")); 27 | FileOutputFormat.setOutputPath(job, outputPath); 28 | outputPath.getFileSystem(conf).delete(outputPath,true); 29 | job.waitForCompletion(true); 30 | job.setNumReduceTasks(3); 31 | } 32 | 33 | public static void main(String[] args) throws Exception { 34 | runJob(Arrays.copyOfRange(args, 0, args.length-1), args[args.length-1]); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Big Data Project 2 | ###Big Data: Twitter Analysis with Hadoop MapReduce 3 | 4 | The goal of this project is to develop several simple Map/Reduce programs to analyze one provided dataset. 5 | The dataset contained 18 million Twitter messages captured during the London 2012 Olympics period. 
All messages are related in some way to the events happening in London (as they contain a term such as London2012). 6 | The format for the input of a MapReduce job is as follows: 7 | 8 | ``` 9 | tweetId;date;hashtags;tweet 10 | ``` 11 | 12 | Hashtags are space-separated within their own field, with the hash symbol already removed. 13 | 14 | 15 | An example entry for the dataset (with no hashtags) is: 16 | ``` 17 | 228940055878701056;2012-07-27, 20:48:57, BST;;RT @Mets: R.A. Dickey (1996 in Atlanta), Jon Rauch (2000 in Sydney) and first base coach Tom Goodwin (1988 in Seoul) - all on Olympic teams.; 18 | ``` 19 | ##### Components: 20 | The set of Map/Reduce jobs developed within this project processes the given input and generates the data required to answer the following questions: 21 | 22 | ###### A. TEXT ANALYSIS 23 | The PDF report contains a histogram that depicts the distribution of tweet sizes (measured in number of characters) across the Twitter dataset. 24 | 25 | ###### B. TIME ANALYSIS 26 | The PDF report contains a time-series plot of the number of tweets that were posted on each day of the event (one bar per day). 27 | 28 | ###### C. HASHTAG ANALYSIS 29 | During the Olympics, supporters from several countries expressed their support by adding specific hashtags to their messages, in many cases of the form team___ (e.g. #teamgb) or go___ (e.g. #gousa). 30 | The PDF report contains a table with all the team-support hashtags that were found (based on these patterns), together with the number of support messages for each. 
31 | 32 | For complete set of analysis results, please check the pdf report 33 | -------------------------------------------------------------------------------- /Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/Report.pdf -------------------------------------------------------------------------------- /TextAnalysis/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /TextAnalysis/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | TextAnalysis 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /TextAnalysis/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | 
org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.7 12 | -------------------------------------------------------------------------------- /TextAnalysis/TextAnalysis.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysis/TextAnalysis.xls -------------------------------------------------------------------------------- /TextAnalysis/bin/TMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysis/bin/TMapper.class -------------------------------------------------------------------------------- /TextAnalysis/bin/TReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysis/bin/TReducer.class -------------------------------------------------------------------------------- /TextAnalysis/bin/TextAnalysis.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysis/bin/TextAnalysis.class -------------------------------------------------------------------------------- /TextAnalysis/build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Sample MapReduce project build file 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 
| 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /TextAnalysis/classes/TMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysis/classes/TMapper.class -------------------------------------------------------------------------------- /TextAnalysis/classes/TReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysis/classes/TReducer.class -------------------------------------------------------------------------------- /TextAnalysis/classes/TextAnalysis.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysis/classes/TextAnalysis.class -------------------------------------------------------------------------------- /TextAnalysis/dist/TextAnalysis.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysis/dist/TextAnalysis.jar -------------------------------------------------------------------------------- /TextAnalysis/out/TextAnalysis0.txt: -------------------------------------------------------------------------------- 1 | 10 3263 2 | 100 177497 3 | 101 146324 4 | 102 157106 5 | 103 162544 6 | 104 147539 7 | 105 156907 8 | 106 139689 9 | 107 137532 10 | 108 135789 11 | 109 153300 12 | 11 7190 13 | 110 140325 14 | 111 158538 15 | 112 151096 16 | 113 141802 17 | 114 139099 18 | 115 137109 19 | 116 139532 20 | 117 137908 
21 | 118 137070 22 | 119 131898 23 | 12 2061 24 | 120 151776 25 | 121 147601 26 | 122 150836 27 | 123 134502 28 | 124 147475 29 | 125 147787 30 | 126 136015 31 | 127 159992 32 | 128 147059 33 | 129 162509 34 | 13 3203 35 | 130 169696 36 | 131 165719 37 | 132 189185 38 | 133 184192 39 | 134 202242 40 | 135 243482 41 | 136 538165 42 | 137 373816 43 | 138 328376 44 | 139 346826 45 | 14 4116 46 | 140 1840929 47 | 141 709617 48 | 142 220262 49 | 143 47523 50 | 144 17204 51 | 145 1280 52 | 146 630 53 | 147 515 54 | 148 53 55 | 149 48 56 | 15 4872 57 | 150 22 58 | 151 28 59 | 152 8 60 | 153 6 61 | 154 21 62 | 155 10 63 | 156 14 64 | 157 10 65 | 158 8 66 | 159 10 67 | 16 6467 68 | 160 15 69 | 161 19 70 | 162 24 71 | 163 6 72 | 164 6 73 | 165 2 74 | 166 2 75 | 167 6 76 | 168 2 77 | 169 4 78 | 17 7399 79 | 170 7 80 | 171 4 81 | 172 3 82 | 173 5 83 | 174 3 84 | 175 2 85 | 176 3 86 | 177 6 87 | 178 2 88 | 179 5 89 | 18 8818 90 | 180 5 91 | 181 3 92 | 182 6 93 | 183 3 94 | 184 11 95 | 185 8 96 | 186 6 97 | 187 4 98 | 188 1 99 | 189 3 100 | 19 10670 101 | 190 2 102 | 192 1 103 | 193 9 104 | 194 24 105 | 195 4 106 | 196 7 107 | 197 2 108 | 198 3 109 | 199 3 110 | 20 30291 111 | 200 2 112 | 201 2 113 | 202 3 114 | 203 6 115 | 204 1 116 | 205 4 117 | 206 7 118 | 207 3 119 | 208 1 120 | 209 44 121 | 21 14522 122 | 210 1 123 | 211 1 124 | 212 1 125 | 213 1 126 | 214 2 127 | 215 3 128 | 216 1 129 | 217 2 130 | 22 16212 131 | 220 1 132 | 221 2 133 | 222 2 134 | 224 3 135 | 225 2 136 | 226 1 137 | 227 1 138 | 228 5 139 | 229 1 140 | 23 18915 141 | 231 1 142 | 232 2 143 | 235 1 144 | 237 1 145 | 24 25213 146 | 240 1 147 | 242 1 148 | 243 2 149 | 244 1 150 | 245 2 151 | 248 1 152 | 25 24764 153 | 250 9 154 | 251 1 155 | 252 2 156 | 253 1 157 | 254 1 158 | 256 3 159 | 26 28145 160 | 260 1 161 | 261 1 162 | 262 1 163 | 263 1 164 | 264 2 165 | 265 1 166 | 268 3 167 | 269 1 168 | 27 30762 169 | 271 1 170 | 272 1 171 | 278 2 172 | 28 33945 173 | 281 1 174 | 287 2 175 | 29 36845 176 | 291 1 177 
| 293 1 178 | 294 1 179 | 30 38463 180 | 303 1 181 | 305 1 182 | 308 1 183 | 309 1 184 | 31 42391 185 | 32 48664 186 | 325 1 187 | 33 48914 188 | 332 1 189 | 336 1 190 | 34 55048 191 | 344 1 192 | 349 1 193 | 35 54549 194 | 359 1 195 | 36 61057 196 | 37 67183 197 | 376 1 198 | 38 64615 199 | 39 64371 200 | 4 1 201 | 40 67757 202 | 41 71834 203 | 42 73371 204 | 426 1 205 | 43 74896 206 | 44 78717 207 | 45 80369 208 | 46 82193 209 | 47 86439 210 | 48 89963 211 | 483 1 212 | 49 91028 213 | 50 95337 214 | 51 97259 215 | 52 105368 216 | 53 130447 217 | 54 107594 218 | 543 1 219 | 547 1 220 | 55 106645 221 | 550 1 222 | 551 1 223 | 56 108674 224 | 57 119258 225 | 58 113394 226 | 59 117142 227 | 60 120344 228 | 61 119538 229 | 62 125297 230 | 63 125766 231 | 64 124467 232 | 65 127694 233 | 66 136435 234 | 67 128879 235 | 68 134941 236 | 69 139029 237 | 7 938 238 | 70 135227 239 | 71 135301 240 | 72 139376 241 | 73 144021 242 | 74 144107 243 | 75 139711 244 | 76 173563 245 | 77 143326 246 | 78 144767 247 | 79 157629 248 | 8 694 249 | 80 152856 250 | 81 149882 251 | 82 144417 252 | 83 139591 253 | 84 158433 254 | 85 147091 255 | 86 140497 256 | 87 143046 257 | 88 150478 258 | 89 141753 259 | 9 180 260 | 90 145901 261 | 91 149605 262 | 92 139729 263 | 93 143025 264 | 94 146571 265 | 95 150826 266 | 96 148090 267 | 97 158436 268 | 98 158378 269 | 99 156953 270 | -------------------------------------------------------------------------------- /TextAnalysis/out/TextAnalysis1.txt: -------------------------------------------------------------------------------- 1 | 1-5 1 2 | 101-105 747463 3 | 106-110 715430 4 | 11-15 92830 5 | 111-115 730900 6 | 116-120 801666 7 | 121-125 1179217 8 | 126-130 2165125 9 | 131-135 2132412 10 | 136-140 231288 11 | 16-20 143744 12 | 21-25 179342 13 | 26-30 253581 14 | 31-35 376048 15 | 36-40 427137 16 | 41-45 452827 17 | 46-50 572528 18 | 51-55 581480 19 | 56-60 664597 20 | 6-10 35112 21 | 61-65 690296 22 | 66-70 758927 23 | 71-75 756451 24 | 76-80 
718662 25 | 81-85 753295 26 | 86-90 748296 27 | 91-95 759277 28 | 96-100 793444 29 | >140 481 30 | -------------------------------------------------------------------------------- /TextAnalysis/src/TMapper.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.math.*; 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.NullWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | import org.apache.hadoop.mapreduce.Mapper.Context; 9 | import org.apache.commons.lang.StringUtils; 10 | 11 | public class TMapper extends Mapper { 12 | private Text upperbound = new Text("0"); 13 | private final IntWritable one = new IntWritable(1); 14 | 15 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 16 | // Format per tweet is id;date;hashtags;tweet; 17 | String tweets = value.toString(); 18 | if(StringUtils.ordinalIndexOf(tweets,";",4)>-1){ 19 | int startIndex = StringUtils.ordinalIndexOf(tweets,";",3) + 1; 20 | String tweet = tweets.substring(startIndex,tweets.lastIndexOf(';')); 21 | tweet = tweet.replaceAll("[^a-zA-Z0-9]+", "1"); 22 | if (tweet.length()<=140){ 23 | int upperb = (int) (Math.ceil((float)tweet.length()/5)*5); 24 | int lowerb = (int) ((Math.ceil((float)tweet.length()/5)-1)*5 + 1); 25 | upperbound.set(String.valueOf(lowerb) + "-"+ String.valueOf(upperb)); 26 | }else{ 27 | upperbound.set(">140"); 28 | } 29 | context.write(upperbound, one); 30 | 31 | } 32 | } 33 | 34 | } 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /TextAnalysis/src/TReducer.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.Iterator; 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Text; 5 | 
import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | 8 | import java.io.IOException; 9 | import java.util.Iterator; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | 14 | 15 | public class TReducer extends Reducer { 16 | 17 | public void reduce(Text key, Iterable values, Context context) 18 | 19 | throws IOException, InterruptedException { 20 | 21 | int sum = 0; 22 | 23 | for (IntWritable value : values) { 24 | 25 | sum = sum + value.get(); 26 | 27 | } 28 | 29 | context.write(key, new IntWritable(sum)); 30 | 31 | } 32 | 33 | } -------------------------------------------------------------------------------- /TextAnalysis/src/TextAnalysis.java: -------------------------------------------------------------------------------- 1 | import java.util.Arrays; 2 | 3 | import org.apache.commons.lang.StringUtils; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | public class TextAnalysis { 14 | 15 | public static void runJob(String[] input, String output) throws Exception { 16 | 17 | Configuration conf = new Configuration(); 18 | 19 | Job job = new Job(conf); 20 | job.setJarByClass(TextAnalysis.class); 21 | job.setMapperClass(TMapper.class); 22 | job.setReducerClass(TReducer.class); 23 | job.setMapOutputKeyClass(Text.class); 24 | job.setMapOutputValueClass(IntWritable.class); 25 | Path outputPath = new Path(output); 26 | FileInputFormat.setInputPaths(job, StringUtils.join(input, ",")); 27 | FileOutputFormat.setOutputPath(job, outputPath); 28 | outputPath.getFileSystem(conf).delete(outputPath,true); 29 | job.waitForCompletion(true); 30 | 
job.setNumReduceTasks(3); 31 | } 32 | 33 | public static void main(String[] args) throws Exception { 34 | runJob(Arrays.copyOfRange(args, 0, args.length-1), args[args.length-1]); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /TextAnalysisAvg/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /TextAnalysisAvg/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | TextAnalysisAvg 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /TextAnalysisAvg/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | 
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.7 12 | -------------------------------------------------------------------------------- /TextAnalysisAvg/bin/TAvgMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysisAvg/bin/TAvgMapper.class -------------------------------------------------------------------------------- /TextAnalysisAvg/bin/TAvgReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysisAvg/bin/TAvgReducer.class -------------------------------------------------------------------------------- /TextAnalysisAvg/bin/TextAnalysisAvg.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysisAvg/bin/TextAnalysisAvg.class -------------------------------------------------------------------------------- /TextAnalysisAvg/bin/inintpair/IntIntPair.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysisAvg/bin/inintpair/IntIntPair.class -------------------------------------------------------------------------------- /TextAnalysisAvg/build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Sample MapReduce project build file 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 
| 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /TextAnalysisAvg/classes/TAvgMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysisAvg/classes/TAvgMapper.class -------------------------------------------------------------------------------- /TextAnalysisAvg/classes/TAvgReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysisAvg/classes/TAvgReducer.class -------------------------------------------------------------------------------- /TextAnalysisAvg/classes/TextAnalysisAvg.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysisAvg/classes/TextAnalysisAvg.class -------------------------------------------------------------------------------- /TextAnalysisAvg/classes/inintpair/IntIntPair.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysisAvg/classes/inintpair/IntIntPair.class -------------------------------------------------------------------------------- /TextAnalysisAvg/dist/TextAnalysisAvg.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TextAnalysisAvg/dist/TextAnalysisAvg.jar -------------------------------------------------------------------------------- /TextAnalysisAvg/out/TextAnalysisAvg.txt: 
-------------------------------------------------------------------------------- 1 | avg 92 2 | -------------------------------------------------------------------------------- /TextAnalysisAvg/src/TAvgMapper.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import inintpair.IntIntPair; 3 | 4 | import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.apache.hadoop.mapreduce.Mapper.Context; 10 | import org.apache.commons.lang.StringUtils; 11 | 12 | public class TAvgMapper extends Mapper { 13 | private Text emit_key = new Text("avg"); 14 | private IntIntPair data = new IntIntPair(); 15 | private final IntWritable one = new IntWritable(1); 16 | 17 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 18 | // Format per tweet is id;date;hashtags;tweet; 19 | String tweets = value.toString(); 20 | if(StringUtils.ordinalIndexOf(tweets,";",4)>-1){ 21 | int startIndex = StringUtils.ordinalIndexOf(tweets,";",3) + 1; 22 | String tweet = tweets.substring(startIndex,tweets.lastIndexOf(';')); 23 | tweet = tweet.replaceAll("[^a-zA-Z0-9]+", "1"); 24 | data.set(new IntWritable(tweet.length()), one); 25 | context.write(emit_key, data); 26 | } 27 | } 28 | 29 | } 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /TextAnalysisAvg/src/TAvgReducer.java: -------------------------------------------------------------------------------- 1 | import inintpair.IntIntPair; 2 | import java.io.IOException; 3 | import java.util.Iterator; 4 | import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | 9 | import java.io.IOException; 10 | import java.util.Iterator; 11 | import 
org.apache.hadoop.io.IntWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | 15 | 16 | public class TAvgReducer extends Reducer { 17 | 18 | private IntWritable result = new IntWritable(); 19 | 20 | public void reduce(Text key, Iterable values, Context context) 21 | 22 | throws IOException, InterruptedException { 23 | 24 | int sum_length = 0; 25 | int sum_tweets = 0; 26 | 27 | for (IntIntPair value : values) { 28 | 29 | sum_length = sum_length + Integer.parseInt(value.getFirst().toString()); 30 | sum_tweets = sum_tweets + Integer.parseInt(value.getSecond().toString()); 31 | 32 | } 33 | 34 | result.set(sum_length/sum_tweets); 35 | 36 | context.write(key, result); 37 | 38 | } 39 | 40 | } -------------------------------------------------------------------------------- /TextAnalysisAvg/src/TextAnalysisAvg.java: -------------------------------------------------------------------------------- 1 | import inintpair.IntIntPair; 2 | 3 | import java.util.Arrays; 4 | 5 | import org.apache.commons.lang.StringUtils; 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class TextAnalysisAvg { 16 | 17 | public static void runJob(String[] input, String output) throws Exception { 18 | 19 | Configuration conf = new Configuration(); 20 | 21 | Job job = new Job(conf); 22 | job.setJarByClass(TextAnalysisAvg.class); 23 | job.setMapperClass(TAvgMapper.class); 24 | job.setReducerClass(TAvgReducer.class); 25 | job.setMapOutputKeyClass(Text.class); 26 | job.setMapOutputValueClass(IntIntPair.class); 27 | Path outputPath = new Path(output); 28 | FileInputFormat.setInputPaths(job, 
StringUtils.join(input, ",")); 29 | FileOutputFormat.setOutputPath(job, outputPath); 30 | outputPath.getFileSystem(conf).delete(outputPath,true); 31 | job.waitForCompletion(true); 32 | job.setNumReduceTasks(3); //set Num of reducers 33 | // job.setCombinerClass(StockReducer.class); 34 | } 35 | 36 | public static void main(String[] args) throws Exception { 37 | runJob(Arrays.copyOfRange(args, 0, args.length-1), args[args.length-1]); 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /TextAnalysisAvg/src/inintpair/IntIntPair.java: -------------------------------------------------------------------------------- 1 | package inintpair; 2 | import java.io.*; 3 | import org.apache.hadoop.io.*; 4 | 5 | public class IntIntPair implements WritableComparable { 6 | 7 | private IntWritable first; 8 | private IntWritable second; 9 | 10 | public IntIntPair() { 11 | set(new IntWritable(), new IntWritable()); 12 | } 13 | 14 | public IntIntPair(int first, int second) { 15 | set(new IntWritable(first), new IntWritable(second)); 16 | } 17 | 18 | public void set(IntWritable first, IntWritable second) { 19 | this.first = first; 20 | this.second = second; 21 | } 22 | 23 | public IntWritable getFirst() { 24 | return first; 25 | } 26 | 27 | public IntWritable getSecond() { 28 | return second; 29 | } 30 | 31 | @Override 32 | public void write(DataOutput out) throws IOException { 33 | first.write(out); 34 | second.write(out); 35 | } 36 | 37 | @Override 38 | public void readFields(DataInput in) throws IOException { 39 | first.readFields(in); 40 | second.readFields(in); 41 | } 42 | 43 | @Override 44 | public int hashCode() { 45 | return first.hashCode() * 163 + second.hashCode(); 46 | } 47 | 48 | @Override 49 | public boolean equals(Object o) { 50 | if (o instanceof IntIntPair) { 51 | IntIntPair tp = (IntIntPair) o; 52 | return first.equals(tp.first) && second.equals(tp.second); 53 | } 54 | return false; 55 | } 56 | 57 | @Override 58 | 
public String toString() { 59 | return first + "\t" + second; 60 | } 61 | 62 | @Override 63 | public int compareTo(IntIntPair tp) { 64 | int cmp = first.compareTo(tp.first); 65 | if (cmp != 0) { 66 | return cmp; 67 | } 68 | return second.compareTo(tp.second); 69 | } 70 | } -------------------------------------------------------------------------------- /TimeAnalysis/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /TimeAnalysis/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | TimeAnalysis 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /TimeAnalysis/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | 
org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.7 12 | -------------------------------------------------------------------------------- /TimeAnalysis/TimeAnalysis.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TimeAnalysis/TimeAnalysis.xls -------------------------------------------------------------------------------- /TimeAnalysis/bin/TimeAnalysis.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TimeAnalysis/bin/TimeAnalysis.class -------------------------------------------------------------------------------- /TimeAnalysis/bin/TimeMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TimeAnalysis/bin/TimeMapper.class -------------------------------------------------------------------------------- /TimeAnalysis/bin/TimeReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TimeAnalysis/bin/TimeReducer.class -------------------------------------------------------------------------------- /TimeAnalysis/build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Sample MapReduce project build file 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 
45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /TimeAnalysis/classes/TimeAnalysis.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TimeAnalysis/classes/TimeAnalysis.class -------------------------------------------------------------------------------- /TimeAnalysis/classes/TimeMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TimeAnalysis/classes/TimeMapper.class -------------------------------------------------------------------------------- /TimeAnalysis/classes/TimeReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TimeAnalysis/classes/TimeReducer.class -------------------------------------------------------------------------------- /TimeAnalysis/dist/TimeAnalysis.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhanelya/Big-Data-Project/98396ba359a10c2b90b22bcce8d8cf8f014c1f05/TimeAnalysis/dist/TimeAnalysis.jar -------------------------------------------------------------------------------- /TimeAnalysis/out/TimeAnalysis.txt: -------------------------------------------------------------------------------- 1 | 2012-07-27 588875 2 | 2012-07-28 1881517 3 | 2012-07-29 1121121 4 | 2012-07-30 1107913 5 | 2012-07-31 1140441 6 | 2012-08-01 1130209 7 | 2012-08-02 943907 8 | 2012-08-03 1014570 9 | 2012-08-04 1079688 10 | 2012-08-05 1037284 11 | 2012-08-06 791144 12 | 2012-08-07 744324 13 | 2012-08-08 583282 14 | 2012-08-09 751563 15 | 2012-08-10 596955 16 | 2012-08-11 776756 17 | 2012-08-12 1312742 18 | 2012-08-13 
776575 19 | 2012-08-14 227225 20 | 2012-08-15 138725 21 | 2012-08-16 121467 22 | 2012-08-17 112921 23 | 2012-08-18 73410 24 | 2012-08-19 47941 25 | 2012-08-20 54335 26 | 2012-08-21 49278 27 | 2012-08-22 51905 28 | 2012-08-23 49431 29 | 2012-08-24 47550 30 | 2012-08-25 34594 31 | 2012-08-26 31741 32 | 2012-08-27 33167 33 | 2012-08-28 9301 34 | -------------------------------------------------------------------------------- /TimeAnalysis/src/TimeAnalysis.java: -------------------------------------------------------------------------------- 1 | import java.util.Arrays; 2 | 3 | import org.apache.commons.lang.StringUtils; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | public class TimeAnalysis { 14 | 15 | public static void runJob(String[] input, String output) throws Exception { 16 | 17 | Configuration conf = new Configuration(); 18 | 19 | Job job = new Job(conf); 20 | job.setJarByClass(TimeAnalysis.class); 21 | job.setMapperClass(TimeMapper.class); 22 | job.setReducerClass(TimeReducer.class); 23 | job.setMapOutputKeyClass(Text.class); 24 | job.setMapOutputValueClass(IntWritable.class); 25 | Path outputPath = new Path(output); 26 | FileInputFormat.setInputPaths(job, StringUtils.join(input, ",")); 27 | FileOutputFormat.setOutputPath(job, outputPath); 28 | outputPath.getFileSystem(conf).delete(outputPath,true); 29 | job.waitForCompletion(true); 30 | job.setNumReduceTasks(3); 31 | } 32 | 33 | public static void main(String[] args) throws Exception { 34 | runJob(Arrays.copyOfRange(args, 0, args.length-1), args[args.length-1]); 35 | } 36 | 37 | } 38 | 
-------------------------------------------------------------------------------- /TimeAnalysis/src/TimeMapper.java: -------------------------------------------------------------------------------- 1 | import java.text.SimpleDateFormat; 2 | import java.util.Date; 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import org.apache.hadoop.mapreduce.Mapper.Context; 11 | import org.apache.http.ParseException; 12 | import org.apache.commons.lang.StringUtils; 13 | 14 | public class TimeMapper extends Mapper { 15 | private Text data = new Text(); 16 | private final IntWritable one = new IntWritable(1); 17 | 18 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 19 | // Format per tweet is id;date;hashtags;tweet; 20 | String tweets = value.toString(); 21 | if(StringUtils.ordinalIndexOf(tweets,";",4)>-1){ 22 | int startIndex = StringUtils.ordinalIndexOf(tweets,";",1) + 1; 23 | int finishIndex = StringUtils.ordinalIndexOf(tweets, ";", 2); 24 | //split by ',' and take the first element (2012-07-27, 20:48:57, BST) 25 | String tweet_date = tweets.substring(startIndex,finishIndex).split(", ")[0]; 26 | data.set(tweet_date); 27 | context.write(data, one); 28 | } 29 | } 30 | 31 | } 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /TimeAnalysis/src/TimeReducer.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.Iterator; 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | 8 | import java.io.IOException; 9 | import java.util.Iterator; 10 | import org.apache.hadoop.io.IntWritable; 11 | import 
org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | 14 | 15 | public class TimeReducer extends Reducer { 16 | 17 | public void reduce(Text key, Iterable values, Context context) 18 | 19 | throws IOException, InterruptedException { 20 | 21 | int sum = 0; 22 | 23 | for (IntWritable value : values) { 24 | 25 | sum = sum + value.get(); 26 | 27 | } 28 | 29 | context.write(key, new IntWritable(sum)); 30 | 31 | } 32 | 33 | } --------------------------------------------------------------------------------