├── python
│   ├── Luigi
│   │   ├── luigi_s3_target.py
│   │   ├── client.cfg.template
│   │   ├── wordcount.py
│   │   ├── luigi_mapreduce.py
│   │   └── luigi_pig.py
│   ├── HDFS
│   │   ├── list_directory.py
│   │   ├── list_file.py
│   │   ├── text.py
│   │   ├── delete.py
│   │   ├── copy_to_local.py
│   │   └── mkdir.py
│   ├── MapReduce
│   │   ├── mrjob
│   │   │   ├── word_count.py
│   │   │   ├── top_salary.py
│   │   │   └── avg.py
│   │   └── HadoopStreaming
│   │       ├── mapper.py
│   │       └── reducer.py
│   └── Spark
│       ├── word_count.py
│       └── text_search.py
├── .gitignore
├── pig
│   ├── .gitignore
│   ├── user_id.pig
│   ├── simple_udf.pig
│   ├── udfs
│   │   ├── my_first_udf.py
│   │   ├── string_funcs.py
│   │   └── movies_udf.py
│   ├── wordcount.pig
│   ├── recent_movies.pig
│   └── playing_with_words.pig
├── resources
│   ├── input.txt
│   ├── movies
│   └── students
├── README.md
└── LICENSE

/python/Luigi/luigi_s3_target.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
client.cfg
--------------------------------------------------------------------------------
/pig/.gitignore:
--------------------------------------------------------------------------------
*.log
attempt_local*
cpython*
--------------------------------------------------------------------------------
/resources/input.txt:
--------------------------------------------------------------------------------
jack be nimble
jack be quick
jack jumped over the candlestick
--------------------------------------------------------------------------------
/resources/movies:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zdata-inc/HadoopWithPython/HEAD/resources/movies
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# HadoopWithPython
Repository for Hadoop with Python including example source code
--------------------------------------------------------------------------------
/resources/students:
--------------------------------------------------------------------------------
john 21 3.89
sally 19 2.56
alice 22 3.76
doug 19 1.98
susan 26 3.25
--------------------------------------------------------------------------------
/pig/user_id.pig:
--------------------------------------------------------------------------------
A = LOAD 'passwd' USING PigStorage(':');
B = FOREACH A GENERATE $0 AS username;
STORE B INTO 'user_id.out';
--------------------------------------------------------------------------------
/python/Luigi/client.cfg.template:
--------------------------------------------------------------------------------
[hadoop]
streaming-jar: /usr/lib/hadoop-xyz/hadoop-streaming-xyz-123.jar

[pig]
home: /usr/lib/pig
--------------------------------------------------------------------------------
/python/HDFS/list_directory.py:
--------------------------------------------------------------------------------
from snakebite.client import Client

client = Client('localhost', 9000)
for x in client.ls(['/']):
    print(x)
--------------------------------------------------------------------------------
/python/HDFS/list_file.py:
--------------------------------------------------------------------------------
from snakebite.client import Client

client = Client('localhost', 9000)
for x in client.ls(['/input.txt']):
    print(x)
--------------------------------------------------------------------------------
/python/HDFS/text.py:
--------------------------------------------------------------------------------
from snakebite.client import Client

client = Client('localhost', 9000)
for l in client.text(['/input/input.txt']):
    print(l)
--------------------------------------------------------------------------------
/python/HDFS/delete.py:
--------------------------------------------------------------------------------
from snakebite.client import Client

client = Client('localhost', 9000)
for p in client.delete(['/foo', '/input'], recurse=True):
    print(p)
--------------------------------------------------------------------------------
/pig/simple_udf.pig:
--------------------------------------------------------------------------------
REGISTER 'udfs/my_first_udf.py' USING streaming_python AS pyudfs;

A = LOAD '../resources/input.txt';
B = FOREACH A GENERATE pyudfs.return_one();
DUMP B;
--------------------------------------------------------------------------------
/pig/udfs/my_first_udf.py:
--------------------------------------------------------------------------------
from pig_util import outputSchema

@outputSchema('value:int')
def return_one():
    """
    Return the integer value 1
    """
    return 1
--------------------------------------------------------------------------------
/python/HDFS/copy_to_local.py:
--------------------------------------------------------------------------------
from snakebite.client import Client

client = Client('localhost', 9000)
for f in client.copyToLocal(['/input/input.txt'], '/tmp'):
    print(f)
--------------------------------------------------------------------------------
/python/HDFS/mkdir.py:
--------------------------------------------------------------------------------
from snakebite.client import Client

client = Client('localhost', 9000)
for p in client.mkdir(['/foo/bar', '/input'], create_parent=True):
    print(p)
--------------------------------------------------------------------------------
/python/MapReduce/mrjob/word_count.py:
--------------------------------------------------------------------------------
from mrjob.job import MRJob

class MRWordCount(MRJob):

    def mapper(self, _, line):
        for word in line.split():
            yield word, 1

    def reducer(self, word, counts):
        yield word, sum(counts)

if __name__ == '__main__':
    MRWordCount.run()
--------------------------------------------------------------------------------
/pig/udfs/string_funcs.py:
--------------------------------------------------------------------------------
from pig_util import outputSchema

@outputSchema('word:chararray')
def reverse(word):
    """
    Return the reversed text of the provided word
    """
    return word[::-1]


@outputSchema('length:int')
def num_chars(word):
    """
    Return the length of the provided word
    """
    return len(word)
--------------------------------------------------------------------------------
/python/Spark/word_count.py:
--------------------------------------------------------------------------------
from pyspark import SparkContext

def main():

    sc = SparkContext(appName='SparkWordCount')

    input_file = sc.textFile('/user/hduser/input/input.txt')
    counts = input_file.flatMap(lambda line: line.split()) \
                       .map(lambda word: (word, 1)) \
                       .reduceByKey(lambda a, b: a + b)
    counts.saveAsTextFile('/user/hduser/output')

    sc.stop()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
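The Spark word count above reads from and writes to HDFS. A quick way to exercise the same flatMap/map/reduceByKey pipeline without a cluster is to run it on an in-memory RDD; the local[*] master and the sample lines below are assumptions added for illustration, not part of the repository.

# Minimal local sketch of the word-count pipeline used in word_count.py.
# The local[*] master and the in-memory sample lines are assumptions.
from pyspark import SparkContext

sc = SparkContext('local[*]', appName='WordCountSketch')
lines = sc.parallelize(['jack be nimble', 'jack be quick'])
counts = lines.flatMap(lambda line: line.split()) \
              .map(lambda word: (word, 1)) \
              .reduceByKey(lambda a, b: a + b)
print(counts.collect())   # e.g. [('jack', 2), ('be', 2), ('nimble', 1), ('quick', 1)]
sc.stop()
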
/python/MapReduce/HadoopStreaming/mapper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import sys

# Read each line from STDIN
for line in sys.stdin:

    # Get the words in each line
    words = line.split()

    # Generate the count for each word
    for word in words:

        # Write the key-value pair to STDOUT to be processed by the reducer.
        # The key is anything before the first tab character and the value is
        # anything after the first tab character.
        print('{0}\t{1}'.format(word, 1))
--------------------------------------------------------------------------------
/pig/wordcount.pig:
--------------------------------------------------------------------------------
%default INPUT '/user/hduser/input/input.txt';
%default OUTPUT '/user/hduser/output';

-- Load the data from the file system into the relation records
records = LOAD '$INPUT';

-- Split each line of text and eliminate nesting
terms = FOREACH records GENERATE FLATTEN(TOKENIZE((chararray) $0)) AS word;

-- Group similar terms
grouped_terms = GROUP terms BY word;

-- Count the number of tuples in each group
word_counts = FOREACH grouped_terms GENERATE COUNT(terms), group;

-- Store the result
STORE word_counts INTO '$OUTPUT';
--------------------------------------------------------------------------------
/pig/udfs/movies_udf.py:
--------------------------------------------------------------------------------
from pig_util import outputSchema
from datetime import datetime
import re


@outputSchema('title:chararray')
def parse_title(title):
    """
    Return the title without the year
    """
    return re.sub(r'\s*\(\d{4}\)', '', title)

@outputSchema('days_since_release:int')
def days_since_release(date):
    """
    Calculate the number of days since the title's release
    """
    if date is None:
        return None

    today = datetime.today()
    release_date = datetime.strptime(date, '%d-%b-%Y')
    delta = today - release_date
    return delta.days
--------------------------------------------------------------------------------
/pig/recent_movies.pig:
--------------------------------------------------------------------------------
REGISTER 'udfs/movies_udf.py' USING streaming_python AS movies_udf;

-- Load the data from the file system
records = LOAD '../resources/movies' USING PigStorage('|')
    AS (id:int, title:chararray, release_date:chararray);

-- Parse the titles and determine how many days since the release date
titles = FOREACH records GENERATE movies_udf.parse_title(title), movies_udf.days_since_release(release_date);

-- Order the movies by the time since release
most_recent = ORDER titles BY days_since_release ASC;

-- Get the ten most recent movies
top_ten = LIMIT most_recent 10;

-- Display the top ten most recent movies
DUMP top_ten;
--------------------------------------------------------------------------------
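The two UDFs in movies_udf.py can also be checked outside of Pig; pig_util is only available inside Pig's streaming_python runtime, so the sketch below repeats the underlying expressions directly. The sample title and date are made up for illustration.

# Standalone illustration of the logic in movies_udf.py (sample values are made up).
import re
from datetime import datetime

print(re.sub(r'\s*\(\d{4}\)', '', 'Toy Story (1995)'))                          # -> Toy Story
print((datetime.today() - datetime.strptime('01-Jan-1995', '%d-%b-%Y')).days)   # days since release
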
/pig/playing_with_words.pig:
--------------------------------------------------------------------------------
REGISTER 'udfs/string_funcs.py' USING streaming_python AS string_udf;

-- Load the data from the file system
records = LOAD '../resources/input.txt';

-- Split each line of text and eliminate nesting
terms = FOREACH records GENERATE FLATTEN(TOKENIZE((chararray) $0)) AS word;

-- Group similar terms
grouped_terms = GROUP terms BY word;

-- Extract the unique terms from each group
unique_terms = FOREACH grouped_terms GENERATE group AS word;

-- Calculate the number of characters in each term
term_length = FOREACH unique_terms GENERATE word, string_udf.num_chars(word) AS length;

-- Display the terms and their length
DUMP term_length;

-- Reverse each word
reverse_terms = FOREACH unique_terms GENERATE word, string_udf.reverse(word) AS reverse_word;

-- Display the terms and the reversed terms
DUMP reverse_terms;
--------------------------------------------------------------------------------
/python/MapReduce/HadoopStreaming/reducer.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys

curr_word = None
curr_count = 0

# Process each key-value pair from the mapper
for line in sys.stdin:

    # Get the key and value from the current line
    word, count = line.split('\t')

    # Convert the count to an int
    count = int(count)

    # If the current word is the same as the previous word, increment its
    # count, otherwise print the word's count to STDOUT
    if word == curr_word:
        curr_count += count
    else:

        # Write word and its number of occurrences as a key-value pair to STDOUT
        if curr_word:
            print('{0}\t{1}'.format(curr_word, curr_count))

        curr_word = word
        curr_count = count

# Output the count for the last word
if curr_word == word:
    print('{0}\t{1}'.format(curr_word, curr_count))
--------------------------------------------------------------------------------
/python/MapReduce/mrjob/top_salary.py:
--------------------------------------------------------------------------------
from mrjob.job import MRJob
from mrjob.step import MRStep
import csv

cols = 'Name,JobTitle,AgencyID,Agency,HireDate,AnnualSalary,GrossPay'.split(',')

class salarymax(MRJob):

    def mapper(self, _, line):
        # Convert each line into a dictionary
        row = dict(zip(cols, [a.strip() for a in next(csv.reader([line]))]))

        # Yield the salary
        yield 'salary', (float(row['AnnualSalary'][1:]), line)

        # Yield the gross pay
        try:
            yield 'gross', (float(row['GrossPay'][1:]), line)
        except ValueError:
            self.increment_counter('warn', 'missing gross', 1)

    def reducer(self, key, values):
        topten = []

        # For 'salary' and 'gross' compute the top 10
        for p in values:
            topten.append(p)
            topten.sort()
            topten = topten[-10:]

        for p in topten:
            yield key, p

    combiner = reducer

if __name__ == '__main__':
    salarymax.run()
--------------------------------------------------------------------------------
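top_salary.py (and avg.py below) parse each CSV line into a dictionary keyed by the column names and strip the leading '$' from the salary fields with [1:]. A small standalone check of that parsing, using a made-up input line, might look like:

# Illustration of the row parsing used in top_salary.py and avg.py (the sample line is made up).
import csv

cols = 'Name,JobTitle,AgencyID,Agency,HireDate,AnnualSalary,GrossPay'.split(',')
line = 'Doe John,POLICE OFFICER,A99,Police Department,01/01/2010,$60000.00,$62000.00'
row = dict(zip(cols, [a.strip() for a in next(csv.reader([line]))]))
print(float(row['AnnualSalary'][1:]))   # -> 60000.0 (the [1:] drops the leading '$')
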
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 MinerKasch

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/python/Luigi/wordcount.py:
--------------------------------------------------------------------------------
import luigi

class InputFile(luigi.Task):
    """
    A task wrapping a Target
    """
    input_file = luigi.Parameter()

    def output(self):
        """
        Return the target for this task
        """
        return luigi.LocalTarget(self.input_file)

class WordCount(luigi.Task):
    """
    A task that counts the number of words in a file
    """
    input_file = luigi.Parameter()
    output_file = luigi.Parameter(default='/tmp/wordcount')

    def requires(self):
        """
        The task's dependencies
        """
        return InputFile(self.input_file)

    def output(self):
        """
        The task's output
        """
        return luigi.LocalTarget(self.output_file)

    def run(self):
        """
        The task's logic
        """
        count = {}

        ifp = self.input().open('r')

        for line in ifp:
            for word in line.strip().split():
                count[word] = count.get(word, 0) + 1
        ifp.close()

        ofp = self.output().open('w')
        for k, v in count.items():
            ofp.write('{}\t{}\n'.format(k, v))
        ofp.close()

if __name__ == '__main__':
    luigi.run()
--------------------------------------------------------------------------------
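The WordCount task above is normally launched from the command line through luigi.run(); it can also be started programmatically with luigi.build(). The import below assumes the snippet is run from the python/Luigi directory, and the input path is a placeholder.

# Hypothetical programmatic invocation of the WordCount task (paths are placeholders).
import luigi
from wordcount import WordCount

luigi.build([WordCount(input_file='/tmp/input.txt')], local_scheduler=True)
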
/python/Spark/text_search.py:
--------------------------------------------------------------------------------
from pyspark import SparkContext
import re
import sys

def main():

    # Ensure a search term was supplied at the command line
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: {} <search_term>\n'.format(sys.argv[0]))
        sys.exit()

    # Create the SparkContext
    sc = SparkContext(appName='SparkWordCount')

    # Broadcast the requested term
    requested_movie = sc.broadcast(sys.argv[1])

    # Load the input file
    source_file = sc.textFile('/user/hduser/input/movies')

    # Get the movie title from the second field
    titles = source_file.map(lambda line: line.split('|')[1])

    # Create a map of the normalized title to the raw title
    normalized_title = titles.map(lambda title: (re.sub(r'\s*\(\d{4}\)', '', title).lower(), title))

    # Find all movies matching the requested_movie
    matches = normalized_title.filter(lambda x: requested_movie.value in x[0])

    # Collect all the matching titles
    matching_titles = matches.map(lambda x: x[1]).distinct().collect()

    # Display the result
    print('{} Matching titles found:'.format(len(matching_titles)))
    for title in matching_titles:
        print(title)

    sc.stop()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/python/Luigi/luigi_mapreduce.py:
--------------------------------------------------------------------------------
import luigi
import luigi.contrib.hadoop
import luigi.contrib.hdfs

class InputFile(luigi.ExternalTask):
    """
    A task wrapping the HDFS target
    """
    input_file = luigi.Parameter()

    def output(self):
        """
        Return the target on HDFS
        """
        return luigi.contrib.hdfs.HdfsTarget(self.input_file)

class WordCount(luigi.contrib.hadoop.JobTask):
    """
    A task that uses Hadoop streaming to perform WordCount
    """
    input_file = luigi.Parameter()
    output_file = luigi.Parameter()

    # Set the number of reduce tasks
    n_reduce_tasks = 1

    def requires(self):
        """
        Read from the output of the InputFile task
        """
        return InputFile(self.input_file)

    def output(self):
        """
        Write the output to HDFS
        """
        return luigi.contrib.hdfs.HdfsTarget(self.output_file)

    def mapper(self, line):
        """
        Read each line and produce a word and 1
        """
        for word in line.strip().split():
            yield word, 1

    def reducer(self, key, values):
        """
        Read each word and produce the word and the sum of its values
        """
        yield key, sum(values)

if __name__ == '__main__':
    luigi.run(main_task_cls=WordCount)
--------------------------------------------------------------------------------
/python/Luigi/luigi_pig.py:
--------------------------------------------------------------------------------
import luigi
import luigi.contrib.pig
import luigi.contrib.hdfs

class InputFile(luigi.ExternalTask):
    """
    A task wrapping the HDFS target
    """
    input_file = luigi.Parameter()

    def output(self):
        return luigi.contrib.hdfs.HdfsTarget(self.input_file)

class WordCount(luigi.contrib.pig.PigJobTask):
    """
    A task that uses Pig to perform WordCount
    """
    input_file = luigi.Parameter()
    output_file = luigi.Parameter()
    script_path = luigi.Parameter(default='../../pig/wordcount.pig')

    def requires(self):
        """
        Read from the output of the InputFile task
        """
        return InputFile(self.input_file)

    def output(self):
        """
        Write the output to HDFS
        """
        return luigi.contrib.hdfs.HdfsTarget(self.output_file)

    def pig_parameters(self):
        """
        A dictionary of parameters to pass to Pig
        """
        return {'INPUT': self.input_file, 'OUTPUT': self.output_file}

    def pig_options(self):
        """
        A list of options to pass to Pig
        """
        return ['-x', 'mapreduce']

    def pig_script_path(self):
        """
        The path to the Pig script to run
        """
        return self.script_path

if __name__ == '__main__':
    luigi.run(main_task_cls=WordCount)
--------------------------------------------------------------------------------
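python/Luigi/luigi_s3_target.py is empty in the repository. A minimal sketch of what an S3-backed Luigi task might look like is shown below; it assumes a luigi release that ships luigi.contrib.s3 (plus boto/boto3 and configured AWS credentials), and the bucket and key names are placeholders, not part of the repository.

# Hypothetical sketch only -- the repository's luigi_s3_target.py is empty.
# Assumes luigi.contrib.s3 is available and AWS credentials are configured.
import luigi
from luigi.contrib.s3 import S3Target

class WriteToS3(luigi.Task):
    """
    A task that writes a line of text to a target on S3
    """

    def output(self):
        # Placeholder bucket and key; replace with a real S3 path
        return S3Target('s3://example-bucket/hadoop-with-python/output.txt')

    def run(self):
        with self.output().open('w') as ofp:
            ofp.write('example output\n')

if __name__ == '__main__':
    luigi.run(main_task_cls=WriteToS3)
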
/python/MapReduce/mrjob/avg.py:
--------------------------------------------------------------------------------
from mrjob.job import MRJob
from mrjob.step import MRStep
import csv

cols = 'Name,JobTitle,AgencyID,Agency,HireDate,AnnualSalary,GrossPay'.split(',')


class salaryavg(MRJob):

    def avgmapper(self, _, line):
        row = dict(zip(cols, [a.strip() for a in next(csv.reader([line]))]))

        self.increment_counter('depts', row['Agency'], 1)

        yield row['JobTitle'], (int(float(row['AnnualSalary'][1:])), 1)

    def avgreducer(self, key, values):
        s = 0
        c = 0

        for average, count in values:
            s += average * count
            c += count

        if c > 3:
            self.increment_counter('stats', 'below3', 1)
            yield key, (s/c, c)

    def ttmapper(self, key, value):
        yield None, (value[0], key)  # group by all, keep average and job title

    def ttreducer(self, key, values):
        topten = []
        for average, job in values:
            topten.append((average, job))
            topten.sort()
            topten = topten[-10:]

        for average, job in topten:
            yield None, (average, job)

    def steps(self):
        return [
            MRStep(mapper=self.avgmapper,
                   combiner=self.avgreducer,
                   reducer=self.avgreducer),
            MRStep(mapper=self.ttmapper,
                   combiner=self.ttreducer,
                   reducer=self.ttreducer)]


if __name__ == '__main__':
    salaryavg.run()
--------------------------------------------------------------------------------