├── python
│   ├── Luigi
│   │   ├── luigi_s3_target.py
│   │   ├── client.cfg.template
│   │   ├── wordcount.py
│   │   ├── luigi_mapreduce.py
│   │   └── luigi_pig.py
│   ├── HDFS
│   │   ├── list_directory.py
│   │   ├── list_file.py
│   │   ├── text.py
│   │   ├── delete.py
│   │   ├── copy_to_local.py
│   │   └── mkdir.py
│   ├── MapReduce
│   │   ├── mrjob
│   │   │   ├── word_count.py
│   │   │   ├── top_salary.py
│   │   │   └── avg.py
│   │   └── HadoopStreaming
│   │       ├── mapper.py
│   │       └── reducer.py
│   └── Spark
│       ├── word_count.py
│       └── text_search.py
├── .gitignore
├── pig
│   ├── .gitignore
│   ├── user_id.pig
│   ├── simple_udf.pig
│   ├── udfs
│   │   ├── my_first_udf.py
│   │   ├── string_funcs.py
│   │   └── movies_udf.py
│   ├── wordcount.pig
│   ├── recent_movies.pig
│   └── playing_with_words.pig
├── resources
│   ├── input.txt
│   ├── movies
│   └── students
├── README.md
└── LICENSE

/python/Luigi/luigi_s3_target.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
client.cfg
--------------------------------------------------------------------------------
/pig/.gitignore:
--------------------------------------------------------------------------------
*.log
attempt_local*
cpython*
--------------------------------------------------------------------------------
/resources/input.txt:
--------------------------------------------------------------------------------
jack be nimble
jack be quick
jack jumped over the candlestick
--------------------------------------------------------------------------------
/resources/movies:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zdata-inc/HadoopWithPython/HEAD/resources/movies
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# HadoopWithPython
Repository for Hadoop with Python including example source code
--------------------------------------------------------------------------------
/resources/students:
--------------------------------------------------------------------------------
john 21 3.89
sally 19 2.56
alice 22 3.76
doug 19 1.98
susan 26 3.25
--------------------------------------------------------------------------------
/pig/user_id.pig:
--------------------------------------------------------------------------------
A = LOAD 'passwd' USING PigStorage(':');
B = FOREACH A GENERATE $0 AS username;
STORE B INTO 'user_id.out';
--------------------------------------------------------------------------------
/python/Luigi/client.cfg.template:
--------------------------------------------------------------------------------
[hadoop]
streaming-jar: /usr/lib/hadoop-xyz/hadoop-streaming-xyz-123.jar

[pig]
home: /usr/lib/pig
--------------------------------------------------------------------------------
/python/HDFS/list_directory.py:
--------------------------------------------------------------------------------
from snakebite.client import Client

client = Client('localhost', 9000)
for x in client.ls(['/']):
    print(x)
--------------------------------------------------------------------------------
/python/HDFS/list_file.py:
--------------------------------------------------------------------------------
from snakebite.client import Client

client = Client('localhost', 9000)
for x in client.ls(['/input.txt']):
    print(x)
--------------------------------------------------------------------------------
/python/HDFS/text.py:
--------------------------------------------------------------------------------
from snakebite.client import Client

client = Client('localhost', 9000)
for l in client.text(['/input/input.txt']):
    print(l)
--------------------------------------------------------------------------------
/python/HDFS/delete.py:
--------------------------------------------------------------------------------
from snakebite.client import Client

client = Client('localhost', 9000)
for p in client.delete(['/foo', '/input'], recurse=True):
    print(p)
--------------------------------------------------------------------------------
/pig/simple_udf.pig:
--------------------------------------------------------------------------------
REGISTER 'udfs/my_first_udf.py' USING streaming_python AS pyudfs;

A = LOAD '../resources/input.txt';
B = FOREACH A GENERATE pyudfs.return_one();
DUMP B;
--------------------------------------------------------------------------------
/pig/udfs/my_first_udf.py:
--------------------------------------------------------------------------------
from pig_util import outputSchema

@outputSchema('value:int')
def return_one():
    """
    Return the integer value 1
    """
    return 1
--------------------------------------------------------------------------------
/python/HDFS/copy_to_local.py:
--------------------------------------------------------------------------------
from snakebite.client import Client

client = Client('localhost', 9000)
for f in client.copyToLocal(['/input/input.txt'], '/tmp'):
    print(f)
--------------------------------------------------------------------------------
/python/HDFS/mkdir.py:
--------------------------------------------------------------------------------
from snakebite.client import Client

client = Client('localhost', 9000)
for p in client.mkdir(['/foo/bar', '/input'], create_parent=True):
    print(p)
--------------------------------------------------------------------------------
/python/MapReduce/mrjob/word_count.py:
--------------------------------------------------------------------------------
from mrjob.job import MRJob

class MRWordCount(MRJob):

    def mapper(self, _, line):
        for word in line.split():
            yield word, 1

    def reducer(self, word, counts):
        yield word, sum(counts)

if __name__ == '__main__':
    MRWordCount.run()
--------------------------------------------------------------------------------
/pig/udfs/string_funcs.py:
--------------------------------------------------------------------------------
from pig_util import outputSchema

@outputSchema('word:chararray')
def reverse(word):
    """
    Return the reversed text of the provided word
    """
    return word[::-1]


@outputSchema('length:int')
def num_chars(word):
    """
    Return the length of the provided word
    """
    return len(word)
--------------------------------------------------------------------------------
/python/Spark/word_count.py:
--------------------------------------------------------------------------------
from pyspark import SparkContext

def main():

    sc = SparkContext(appName='SparkWordCount')

    input_file = sc.textFile('/user/hduser/input/input.txt')
    counts = input_file.flatMap(lambda line: line.split()) \
                       .map(lambda word: (word, 1)) \
                       .reduceByKey(lambda a, b: a + b)
    counts.saveAsTextFile('/user/hduser/output')

    sc.stop()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
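The Spark word count above reads from and writes to HDFS. A quick way to exercise the same flatMap/map/reduceByKey pipeline without a cluster is to run it on an in-memory RDD; the local[*] master and the sample lines below are assumptions added for illustration, not part of the repository.

# Minimal local sketch of the word-count pipeline used in word_count.py.
# The local[*] master and the in-memory sample lines are assumptions.
from pyspark import SparkContext

sc = SparkContext('local[*]', appName='WordCountSketch')
lines = sc.parallelize(['jack be nimble', 'jack be quick'])
counts = lines.flatMap(lambda line: line.split()) \
              .map(lambda word: (word, 1)) \
              .reduceByKey(lambda a, b: a + b)
print(counts.collect())   # e.g. [('jack', 2), ('be', 2), ('nimble', 1), ('quick', 1)]
sc.stop()
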
/python/MapReduce/HadoopStreaming/mapper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import sys

# Read each line from STDIN
for line in sys.stdin:

    # Get the words in each line
    words = line.split()

    # Generate the count for each word
    for word in words:

        # Write the key-value pair to STDOUT to be processed by the reducer.
        # The key is anything before the first tab character and the value is
        # anything after the first tab character.
        print('{0}\t{1}'.format(word, 1))
--------------------------------------------------------------------------------
/pig/wordcount.pig:
--------------------------------------------------------------------------------
%default INPUT '/user/hduser/input/input.txt';
%default OUTPUT '/user/hduser/output';

-- Load the data from the file system into the relation records
records = LOAD '$INPUT';

-- Split each line of text and eliminate nesting
terms = FOREACH records GENERATE FLATTEN(TOKENIZE((chararray) $0)) AS word;

-- Group similar terms
grouped_terms = GROUP terms BY word;

-- Count the number of tuples in each group
word_counts = FOREACH grouped_terms GENERATE COUNT(terms), group;

-- Store the result
STORE word_counts INTO '$OUTPUT';
--------------------------------------------------------------------------------
/pig/udfs/movies_udf.py:
--------------------------------------------------------------------------------
from pig_util import outputSchema
from datetime import datetime
import re


@outputSchema('title:chararray')
def parse_title(title):
    """
    Return the title without the year
    """
    return re.sub(r'\s*\(\d{4}\)', '', title)

@outputSchema('days_since_release:int')
def days_since_release(date):
    """
    Calculate the number of days since the title's release
    """
    if date is None:
        return None

    today = datetime.today()
    release_date = datetime.strptime(date, '%d-%b-%Y')
    delta = today - release_date
    return delta.days
--------------------------------------------------------------------------------
/pig/recent_movies.pig:
--------------------------------------------------------------------------------
REGISTER 'udfs/movies_udf.py' USING streaming_python AS movies_udf;

-- Load the data from the file system
records = LOAD '../resources/movies' USING PigStorage('|')
    AS (id:int, title:chararray, release_date:chararray);

-- Parse the titles and determine how many days since the release date
titles = FOREACH records GENERATE movies_udf.parse_title(title), movies_udf.days_since_release(release_date);

-- Order the movies by the time since release
most_recent = ORDER titles BY days_since_release ASC;

-- Get the ten most recent movies
top_ten = LIMIT most_recent 10;

-- Display the top ten most recent movies
DUMP top_ten;
--------------------------------------------------------------------------------
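The two UDFs in movies_udf.py can also be checked outside of Pig; pig_util is only available inside Pig's streaming_python runtime, so the sketch below repeats the underlying expressions directly. The sample title and date are made up for illustration.

# Standalone illustration of the logic in movies_udf.py (sample values are made up).
import re
from datetime import datetime

print(re.sub(r'\s*\(\d{4}\)', '', 'Toy Story (1995)'))                          # -> Toy Story
print((datetime.today() - datetime.strptime('01-Jan-1995', '%d-%b-%Y')).days)   # days since release
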
/pig/playing_with_words.pig:
--------------------------------------------------------------------------------
REGISTER 'udfs/string_funcs.py' USING streaming_python AS string_udf;

-- Load the data from the file system
records = LOAD '../resources/input.txt';

-- Split each line of text and eliminate nesting
terms = FOREACH records GENERATE FLATTEN(TOKENIZE((chararray) $0)) AS word;

-- Group similar terms
grouped_terms = GROUP terms BY word;

-- Extract the unique terms from each group
unique_terms = FOREACH grouped_terms GENERATE group AS word;

-- Calculate the number of characters in each term
term_length = FOREACH unique_terms GENERATE word, string_udf.num_chars(word) AS length;

-- Display the terms and their length
DUMP term_length;

-- Reverse each word
reverse_terms = FOREACH unique_terms GENERATE word, string_udf.reverse(word) AS reverse_word;

-- Display the terms and the reversed terms
DUMP reverse_terms;
--------------------------------------------------------------------------------
/python/MapReduce/HadoopStreaming/reducer.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys

curr_word = None
curr_count = 0

# Process each key-value pair from the mapper
for line in sys.stdin:

    # Get the key and value from the current line
    word, count = line.split('\t')

    # Convert the count to an int
    count = int(count)

    # If the current word is the same as the previous word, increment its
    # count, otherwise print the word's count to STDOUT
    if word == curr_word:
        curr_count += count
    else:

        # Write word and its number of occurrences as a key-value pair to STDOUT
        if curr_word:
            print('{0}\t{1}'.format(curr_word, curr_count))

        curr_word = word
        curr_count = count

# Output the count for the last word
if curr_word == word:
    print('{0}\t{1}'.format(curr_word, curr_count))
--------------------------------------------------------------------------------
/python/MapReduce/mrjob/top_salary.py:
--------------------------------------------------------------------------------
from mrjob.job import MRJob
from mrjob.step import MRStep
import csv

cols = 'Name,JobTitle,AgencyID,Agency,HireDate,AnnualSalary,GrossPay'.split(',')

class salarymax(MRJob):

    def mapper(self, _, line):
        # Convert each line into a dictionary
        row = dict(zip(cols, [a.strip() for a in next(csv.reader([line]))]))

        # Yield the salary
        yield 'salary', (float(row['AnnualSalary'][1:]), line)

        # Yield the gross pay
        try:
            yield 'gross', (float(row['GrossPay'][1:]), line)
        except ValueError:
            self.increment_counter('warn', 'missing gross', 1)

    def reducer(self, key, values):
        topten = []

        # For 'salary' and 'gross' compute the top 10
        for p in values:
            topten.append(p)
            topten.sort()
            topten = topten[-10:]

        for p in topten:
            yield key, p

    combiner = reducer

if __name__ == '__main__':
    salarymax.run()
--------------------------------------------------------------------------------
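top_salary.py (and avg.py below) parse each CSV line into a dictionary keyed by the column names and strip the leading '$' from the salary fields with [1:]. A small standalone check of that parsing, using a made-up input line, might look like:

# Illustration of the row parsing used in top_salary.py and avg.py (the sample line is made up).
import csv

cols = 'Name,JobTitle,AgencyID,Agency,HireDate,AnnualSalary,GrossPay'.split(',')
line = 'Doe John,POLICE OFFICER,A99,Police Department,01/01/2010,$60000.00,$62000.00'
row = dict(zip(cols, [a.strip() for a in next(csv.reader([line]))]))
print(float(row['AnnualSalary'][1:]))   # -> 60000.0 (the [1:] drops the leading '$')
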
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 MinerKasch

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/python/Luigi/wordcount.py:
--------------------------------------------------------------------------------
import luigi

class InputFile(luigi.Task):
    """
    A task wrapping a Target
    """
    input_file = luigi.Parameter()

    def output(self):
        """
        Return the target for this task
        """
        return luigi.LocalTarget(self.input_file)

class WordCount(luigi.Task):
    """
    A task that counts the number of words in a file
    """
    input_file = luigi.Parameter()
    output_file = luigi.Parameter(default='/tmp/wordcount')

    def requires(self):
        """
        The task's dependencies
        """
        return InputFile(self.input_file)

    def output(self):
        """
        The task's output
        """
        return luigi.LocalTarget(self.output_file)

    def run(self):
        """
        The task's logic
        """
        count = {}

        ifp = self.input().open('r')

        for line in ifp:
            for word in line.strip().split():
                count[word] = count.get(word, 0) + 1
        ifp.close()

        ofp = self.output().open('w')
        for k, v in count.items():
            ofp.write('{}\t{}\n'.format(k, v))
        ofp.close()

if __name__ == '__main__':
    luigi.run()
--------------------------------------------------------------------------------
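The WordCount task above is normally launched from the command line through luigi.run(); it can also be started programmatically with luigi.build(). The import below assumes the snippet is run from the python/Luigi directory, and the input path is a placeholder.

# Hypothetical programmatic invocation of the WordCount task (paths are placeholders).
import luigi
from wordcount import WordCount

luigi.build([WordCount(input_file='/tmp/input.txt')], local_scheduler=True)
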
/python/Spark/text_search.py:
--------------------------------------------------------------------------------
from pyspark import SparkContext
import re
import sys

def main():

    # Ensure a search term was supplied at the command line
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: {} <search_term>\n'.format(sys.argv[0]))
        sys.exit()

    # Create the SparkContext
    sc = SparkContext(appName='SparkWordCount')

    # Broadcast the requested term
    requested_movie = sc.broadcast(sys.argv[1])

    # Load the input file
    source_file = sc.textFile('/user/hduser/input/movies')

    # Get the movie title from the second field
    titles = source_file.map(lambda line: line.split('|')[1])

    # Create a map of the normalized title to the raw title
    normalized_title = titles.map(lambda title: (re.sub(r'\s*\(\d{4}\)', '', title).lower(), title))

    # Find all movies matching the requested_movie
    matches = normalized_title.filter(lambda x: requested_movie.value in x[0])

    # Collect all the matching titles
    matching_titles = matches.map(lambda x: x[1]).distinct().collect()

    # Display the result
    print('{} Matching titles found:'.format(len(matching_titles)))
    for title in matching_titles:
        print(title)

    sc.stop()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/python/Luigi/luigi_mapreduce.py:
--------------------------------------------------------------------------------
import luigi
import luigi.contrib.hadoop
import luigi.contrib.hdfs

class InputFile(luigi.ExternalTask):
    """
    A task wrapping the HDFS target
    """
    input_file = luigi.Parameter()

    def output(self):
        """
        Return the target on HDFS
        """
        return luigi.contrib.hdfs.HdfsTarget(self.input_file)

class WordCount(luigi.contrib.hadoop.JobTask):
    """
    A task that uses Hadoop streaming to perform WordCount
    """
    input_file = luigi.Parameter()
    output_file = luigi.Parameter()

    # Set the number of reduce tasks
    n_reduce_tasks = 1

    def requires(self):
        """
        Read from the output of the InputFile task
        """
        return InputFile(self.input_file)

    def output(self):
        """
        Write the output to HDFS
        """
        return luigi.contrib.hdfs.HdfsTarget(self.output_file)

    def mapper(self, line):
        """
        Read each line and produce a word and 1
        """
        for word in line.strip().split():
            yield word, 1

    def reducer(self, key, values):
        """
        Read each word and produce the word and the sum of its values
        """
        yield key, sum(values)

if __name__ == '__main__':
    luigi.run(main_task_cls=WordCount)
--------------------------------------------------------------------------------
/python/Luigi/luigi_pig.py:
--------------------------------------------------------------------------------
import luigi
import luigi.contrib.pig
import luigi.contrib.hdfs

class InputFile(luigi.ExternalTask):
    """
    A task wrapping the HDFS target
    """
    input_file = luigi.Parameter()

    def output(self):
        return luigi.contrib.hdfs.HdfsTarget(self.input_file)

class WordCount(luigi.contrib.pig.PigJobTask):
    """
    A task that uses Pig to perform WordCount
    """
    input_file = luigi.Parameter()
    output_file = luigi.Parameter()
    script_path = luigi.Parameter(default='../../pig/wordcount.pig')

    def requires(self):
        """
        Read from the output of the InputFile task
        """
        return InputFile(self.input_file)

    def output(self):
        """
        Write the output to HDFS
        """
        return luigi.contrib.hdfs.HdfsTarget(self.output_file)

    def pig_parameters(self):
        """
        A dictionary of parameters to pass to Pig
        """
        return {'INPUT': self.input_file, 'OUTPUT': self.output_file}

    def pig_options(self):
        """
        A list of options to pass to Pig
        """
        return ['-x', 'mapreduce']

    def pig_script_path(self):
        """
        The path to the Pig script to run
        """
        return self.script_path

if __name__ == '__main__':
    luigi.run(main_task_cls=WordCount)
--------------------------------------------------------------------------------
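python/Luigi/luigi_s3_target.py is empty in the repository. A minimal sketch of what an S3-backed Luigi task might look like is shown below; it assumes a luigi release that ships luigi.contrib.s3 (plus boto/boto3 and configured AWS credentials), and the bucket and key names are placeholders, not part of the repository.

# Hypothetical sketch only -- the repository's luigi_s3_target.py is empty.
# Assumes luigi.contrib.s3 is available and AWS credentials are configured.
import luigi
from luigi.contrib.s3 import S3Target

class WriteToS3(luigi.Task):
    """
    A task that writes a line of text to a target on S3
    """

    def output(self):
        # Placeholder bucket and key; replace with a real S3 path
        return S3Target('s3://example-bucket/hadoop-with-python/output.txt')

    def run(self):
        with self.output().open('w') as ofp:
            ofp.write('example output\n')

if __name__ == '__main__':
    luigi.run(main_task_cls=WriteToS3)
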
/python/MapReduce/mrjob/avg.py:
--------------------------------------------------------------------------------
from mrjob.job import MRJob
from mrjob.step import MRStep
import csv

cols = 'Name,JobTitle,AgencyID,Agency,HireDate,AnnualSalary,GrossPay'.split(',')


class salaryavg(MRJob):

    def avgmapper(self, _, line):
        row = dict(zip(cols, [a.strip() for a in next(csv.reader([line]))]))

        self.increment_counter('depts', row['Agency'], 1)

        yield row['JobTitle'], (int(float(row['AnnualSalary'][1:])), 1)

    def avgreducer(self, key, values):
        s = 0
        c = 0

        for average, count in values:
            s += average * count
            c += count

        if c > 3:
            self.increment_counter('stats', 'below3', 1)
            yield key, (s/c, c)

    def ttmapper(self, key, value):
        yield None, (value[0], key)  # group by all, keep average and job title

    def ttreducer(self, key, values):
        topten = []
        for average, job in values:
            topten.append((average, job))
            topten.sort()
            topten = topten[-10:]

        for average, job in topten:
            yield None, (average, job)

    def steps(self):
        return [
            MRStep(mapper=self.avgmapper,
                   combiner=self.avgreducer,
                   reducer=self.avgreducer),
            MRStep(mapper=self.ttmapper,
                   combiner=self.ttreducer,
                   reducer=self.ttreducer)]


if __name__ == '__main__':
    salaryavg.run()
--------------------------------------------------------------------------------