├── .gitignore
├── README.md
├── dumbo
│   └── ngrams.py
├── hadoopy
│   ├── launch_frozen_hadoopy.py
│   ├── launch_hadoopy.py
│   └── ngrams.py
├── luigi
│   ├── client.cfg
│   └── ngrams.py
├── mrjob
│   └── ngrams.py
├── native
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               ├── NgramsDriver.java
│               ├── NgramsMapper.java
│               ├── NgramsReducer.java
│               └── TextTriple.java
├── send_data_to_hdfs.py
└── streaming
    ├── mapper.py
    └── reducer.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg

# Java
*.class
*.jar
*.war
*.ear

*.pydevproject
.project
.metadata
bin/**
tmp/**
tmp/**/*
*.tmp
*.bak
*.swp
*~.nib
local.properties
.classpath
.settings/
.loadpath

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# PDT-specific
.buildpath

target/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Source code for the Cloudera blog post on using Python with Hadoop. The blog post can be found here:

[http://blog.cloudera.com/blog/2013/01/a-guide-to-python-frameworks-for-hadoop](http://blog.cloudera.com/blog/2013/01/a-guide-to-python-frameworks-for-hadoop)

--------------------------------------------------------------------------------
/dumbo/ngrams.py:
--------------------------------------------------------------------------------
import os
import re

class NgramMapper(object):

    def __init__(self):
        # determine value of n in the current block of ngrams
        input_file = os.environ['map_input_file']
        self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])

    def __call__(self, key, value):
        data = value.split('\t')

        if len(data) < 3:
            return

        ngram = data[0].split()
        year = data[1]
        count = int(data[2])

        if len(ngram) != self.expected_tokens:
            return

        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]

        yield (k, count)

def combiner(key, values):
    yield (key, sum(values))

def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))


if __name__ == '__main__':
    import dumbo
    # import pdb
    # pdb.set_trace()
    # dumbo.run(NgramMapper, reducer, combiner=combiner)
    dumbo.run(NgramMapper, reducer)
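A note on running the Dumbo version: the script above is not executed directly but handed to Dumbo's command-line launcher, which wraps Hadoop Streaming. A minimal invocation sketch follows; the Hadoop home directory and the HDFS paths are placeholder assumptions rather than values taken from this repository:

    dumbo start ngrams.py \
        -hadoop /usr/lib/hadoop-0.20-mapreduce \
        -input /ngrams \
        -output /output-dumbo \
        -numreducetasks 10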
--------------------------------------------------------------------------------
/hadoopy/launch_frozen_hadoopy.py:
--------------------------------------------------------------------------------
from hadoopy import launch_frozen

input_path = 'hdfs://laserson-1.ent.cloudera.com/ngrams'
output_path = 'hdfs://laserson-1.ent.cloudera.com/output-hadoopy-frozen'

launch_frozen(input_path,
              output_path,
              'ngrams.py',
              use_seqoutput=False,
              num_reducers=10,
              hstreaming='/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar')

--------------------------------------------------------------------------------
/hadoopy/launch_hadoopy.py:
--------------------------------------------------------------------------------
from hadoopy import launch

input_path = 'hdfs://laserson-1.ent.cloudera.com/ngrams'
output_path = 'hdfs://laserson-1.ent.cloudera.com/output-hadoopy'

launch(input_path,
       output_path,
       'ngrams.py',
       use_seqoutput=False,
       num_reducers=10,
       hstreaming='/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar')

--------------------------------------------------------------------------------
/hadoopy/ngrams.py:
--------------------------------------------------------------------------------
import os
import re

import hadoopy

class Mapper(object):

    def __init__(self):
        # determine value of n in the current block of ngrams
        input_file = os.environ['map_input_file']
        self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])

    def map(self, key, value):
        data = value.split('\t')

        if len(data) < 3:
            return

        ngram = data[0].split()
        year = data[1]
        count = int(data[2])

        if len(ngram) != self.expected_tokens:
            return

        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]

        yield (k, count)

def combiner(key, values):
    yield (key, sum(values))

def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))

if __name__ == '__main__':
    hadoopy.run(Mapper, reducer, combiner)

--------------------------------------------------------------------------------
/luigi/client.cfg:
--------------------------------------------------------------------------------
[hadoop]
streaming-jar: /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.2.0.jar

--------------------------------------------------------------------------------
/luigi/ngrams.py:
--------------------------------------------------------------------------------
import os
import re

import luigi
import luigi.hadoop
import luigi.hdfs

class InputText(luigi.ExternalTask):
    path = luigi.Parameter()

    def output(self):
        return luigi.hdfs.HdfsTarget(self.path)

class Ngrams(luigi.hadoop.JobTask):
    source = luigi.Parameter()
    destination = luigi.Parameter()
    # overrides superclass; gets set as jobconf:
    n_reduce_tasks = luigi.IntParameter(default=10)

    def requires(self):
        tasks = []
        paths = luigi.hdfs.HdfsClient().listdir(self.source, ignore_directories=True, recursive=True)
        for path in paths:
            tasks.append(InputText(path))
        return tasks

    def output(self):
        return luigi.hdfs.HdfsTarget(self.destination)

    def init_mapper(self):
        input_file = os.environ['map_input_file']
        self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])

    def mapper(self, line):
        data = line.split('\t')

        if len(data) < 3:
            return

        # unpack data
        ngram = data[0].split()
        year = data[1]
        count = int(data[2])

        if len(ngram) != self.expected_tokens:
            return

        # generate key
        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]

        yield (k, count)

    def combiner(self, key, values):
        yield (key, sum(values))

    def reducer(self, key, values):
        yield "%s\t%s\t%s" % tuple(key), str(sum(values))

if __name__ == '__main__':
    luigi.run()
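A note on running the Luigi version: the Ngrams task is launched through luigi.run() from the command line, with client.cfg (above) in the working directory so the streaming jar can be located. The sketch below is one plausible invocation; the HDFS paths are placeholders and the exact flag spellings depend on the Luigi release in use:

    python ngrams.py Ngrams \
        --local-scheduler \
        --source /ngrams \
        --destination /output-luigi \
        --n-reduce-tasks 10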
--------------------------------------------------------------------------------
/mrjob/ngrams.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python

import os
import re

from mrjob.job import MRJob
from mrjob.protocol import RawProtocol, ReprProtocol

class NgramNeighbors(MRJob):

    # mrjob allows you to specify input/intermediate/output serialization
    # default output protocol is JSON; here we set it to text
    OUTPUT_PROTOCOL = RawProtocol

    def mapper_init(self):
        # determine value of n in the current block of ngrams by parsing filename
        input_file = os.environ['map_input_file']
        self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])

    def mapper(self, key, line):
        data = line.split('\t')

        # error checking
        if len(data) < 3:
            return

        # unpack data
        ngram = data[0].split()
        year = data[1]
        count = int(data[2])

        # more error checking
        if len(ngram) != self.expected_tokens:
            return

        # generate key
        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]

        # note that the key is an object (a list in this case)
        # that mrjob will serialize as JSON text
        yield (k, count)

    def combiner(self, key, counts):
        # the combiner must be separate from the reducer because the input
        # and output must both be JSON
        yield (key, sum(counts))

    def reducer(self, key, counts):
        # the final output is encoded as text
        yield "%s\t%s\t%s" % tuple(key), str(sum(counts))

if __name__ == '__main__':
    # sets up a runner, based on command line options
    NgramNeighbors.run()
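A note on running the mrjob version: because NgramNeighbors subclasses MRJob, the same script can be tested locally or submitted to a cluster by switching the runner. The commands below are a sketch; the sample file name, HDFS paths, and jobconf value are placeholder assumptions:

    # quick local test against a small sample file
    python ngrams.py data_sample.txt

    # run on the Hadoop cluster
    python ngrams.py -r hadoop \
        --jobconf mapred.reduce.tasks=10 \
        -o hdfs:///output-mrjob --no-output \
        hdfs:///ngrams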
--------------------------------------------------------------------------------
/native/pom.xml:
--------------------------------------------------------------------------------
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.cloudera</groupId>
  <artifactId>NgramsComparison</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>Ngrams Comparison</name>

  <properties>
    <hadoop.version>2.0.0-mr1-cdh4.0.1</hadoop.version>
  </properties>

  <repositories>
    <repository>
      <id>cloudera-releases</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
      <releases>
        <enabled>true</enabled>
      </releases>
      <snapshots>
        <enabled>false</enabled>
      </snapshots>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
  </dependencies>
</project>

--------------------------------------------------------------------------------
/native/src/main/java/NgramsDriver.java:
--------------------------------------------------------------------------------
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class NgramsDriver extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJarByClass(getClass());

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(NgramsMapper.class);
        job.setCombinerClass(NgramsReducer.class);
        job.setReducerClass(NgramsReducer.class);

        job.setOutputKeyClass(TextTriple.class);
        job.setOutputValueClass(IntWritable.class);

        job.setNumReduceTasks(10);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new NgramsDriver(), args);
        System.exit(exitCode);
    }
}

--------------------------------------------------------------------------------
/native/src/main/java/NgramsMapper.java:
--------------------------------------------------------------------------------
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.log4j.Logger;


public class NgramsMapper extends Mapper<LongWritable, Text, TextTriple, IntWritable> {

    private Logger LOG = Logger.getLogger(getClass());

    private int expectedTokens;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        String inputFile = ((FileSplit) context.getInputSplit()).getPath().getName();
        LOG.info("inputFile: " + inputFile);
        Pattern c = Pattern.compile("([\\d]+)gram");
        Matcher m = c.matcher(inputFile);
        m.find();
        expectedTokens = Integer.parseInt(m.group(1));
        return;
    }

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] data = value.toString().split("\\t");

        if (data.length < 3) {
            return;
        }

        String[] ngram = data[0].split("\\s+");
        String year = data[1];
        IntWritable count = new IntWritable(Integer.parseInt(data[2]));

        if (ngram.length != this.expectedTokens) {
            return;
        }

        // build keyOut
        List<String> triple = new ArrayList<String>(3);
        triple.add(ngram[0]);
        triple.add(ngram[expectedTokens - 1]);
        Collections.sort(triple);
        triple.add(year);
        TextTriple keyOut = new TextTriple(triple);

        context.write(keyOut, count);
    }
}

--------------------------------------------------------------------------------
/native/src/main/java/NgramsReducer.java:
--------------------------------------------------------------------------------
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;


public class NgramsReducer extends Reducer<TextTriple, IntWritable, TextTriple, IntWritable> {

    @Override
    protected void reduce(TextTriple key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

--------------------------------------------------------------------------------
/native/src/main/java/TextTriple.java:
--------------------------------------------------------------------------------
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;


public class TextTriple implements WritableComparable<TextTriple> {

    private Text first;
    private Text second;
    private Text third;

    public TextTriple() {
        set(new Text(), new Text(), new Text());
    }

    public TextTriple(List<String> list) {
        set(new Text(list.get(0)),
            new Text(list.get(1)),
            new Text(list.get(2)));
    }

    public void set(Text first, Text second, Text third) {
        this.first = first;
        this.second = second;
        this.third = third;
    }

    public void write(DataOutput out) throws IOException {
        first.write(out);
        second.write(out);
        third.write(out);
    }

    public void readFields(DataInput in) throws IOException {
        first.readFields(in);
        second.readFields(in);
        third.readFields(in);
    }

    @Override
    public int hashCode() {
        return first.hashCode() * 163 + second.hashCode() * 31 + third.hashCode();
    }

    @Override
    public boolean equals(Object obj) {
        if (obj instanceof TextTriple) {
            TextTriple tt = (TextTriple) obj;
            return first.equals(tt.first) && second.equals(tt.second) && third.equals(tt.third);
        }
        return false;
    }

    @Override
    public String toString() {
        return first + "\t" + second + "\t" + third;
    }

    public int compareTo(TextTriple other) {
        int comp = first.compareTo(other.first);
        if (comp != 0) {
            return comp;
        }
        comp = second.compareTo(other.second);
        if (comp != 0) {
            return comp;
        }
        return third.compareTo(other.third);
    }

}
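A note on running the native Java version: it is built with Maven and submitted with hadoop jar. The steps below are a sketch; the jar name simply follows Maven's artifactId-version convention from the pom.xml above, and the HDFS paths are placeholders:

    cd native
    mvn clean package
    hadoop jar target/NgramsComparison-0.0.1-SNAPSHOT.jar NgramsDriver /ngrams /output-native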
--------------------------------------------------------------------------------
/send_data_to_hdfs.py:
--------------------------------------------------------------------------------
import sys
import random
import subprocess

GB = 1024 ** 3
def du():
    p = subprocess.Popen('hadoop fs -du -s /ngrams', shell=True, stdout=subprocess.PIPE)
    return int(p.stdout.read().split()[0])

# generate list of URLs
base_url = 'http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-%igram-20090715-%i.csv.zip'
sizes = [(2, 100), (3, 200), (4, 400), (5, 800)]
ngram_urls = []
for size in sizes:
    n = size[0]
    num_files = size[1]
    for i in xrange(num_files):
        ngram_urls.append(base_url % (n, i))

# download data directly into HDFS
stream_cmd = 'curl "%s" | funzip | hadoop fs -put - /ngrams/%s'
random.shuffle(ngram_urls)
finished = False
while not finished:
    url = ngram_urls.pop()
    filename = '.'.join(url.split('/')[-1].split('.')[:-1])
    sys.stdout.write("%s\n" % filename)
    sys.stdout.flush()
    subprocess.Popen(stream_cmd % (url, filename), shell=True).wait()
    if du() > 20 * GB:
        finished = True
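A note on send_data_to_hdfs.py: it streams randomly chosen n-gram files from Google Storage straight into HDFS (it needs curl and funzip on the client) and stops once roughly 20 GB have accumulated under /ngrams. One way to run it, assuming the target directory does not already exist:

    hadoop fs -mkdir /ngrams
    python send_data_to_hdfs.py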
--------------------------------------------------------------------------------
/streaming/mapper.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python

import os
import re
import sys

# determine value of n in the current block of ngrams by parsing the filename
input_file = os.environ['map_input_file']
expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])

for line in sys.stdin:
    data = line.split('\t')

    # perform some error checking
    if len(data) < 3:
        continue

    # unpack data
    ngram = data[0].split()
    year = data[1]
    count = data[2]

    # more error checking
    if len(ngram) != expected_tokens:
        continue

    # build key and emit
    pair = sorted([ngram[0], ngram[expected_tokens - 1]])
    print >>sys.stdout, "%s\t%s\t%s\t%s" % (pair[0], pair[1], year, count)

--------------------------------------------------------------------------------
/streaming/reducer.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python

import sys

total = 0
prev_key = False
for line in sys.stdin:
    data = line.split('\t')
    curr_key = '\t'.join(data[:3])
    count = int(data[3])

    # found a boundary; emit current sum
    if prev_key and curr_key != prev_key:
        print >>sys.stdout, "%s\t%i" % (prev_key, total)
        prev_key = curr_key
        total = count
    # same key; accumulate sum
    else:
        prev_key = curr_key
        total += count

# emit last key
if prev_key:
    print >>sys.stdout, "%s\t%i" % (prev_key, total)

--------------------------------------------------------------------------------
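A note on running the raw streaming version: mapper.py and reducer.py are wired together with the hadoop-streaming jar already referenced elsewhere in this repository (the exact jar version varies by CDH release). The command below is a sketch; the output path and reducer count are placeholder assumptions:

    hadoop jar /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar \
        -input /ngrams \
        -output /output-streaming \
        -mapper mapper.py \
        -reducer reducer.py \
        -file mapper.py \
        -file reducer.py \
        -numReduceTasks 10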