├── .gitignore
├── README.md
├── dumbo
│   └── ngrams.py
├── hadoopy
│   ├── launch_frozen_hadoopy.py
│   ├── launch_hadoopy.py
│   └── ngrams.py
├── luigi
│   ├── client.cfg
│   └── ngrams.py
├── mrjob
│   └── ngrams.py
├── native
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               ├── NgramsDriver.java
│               ├── NgramsMapper.java
│               ├── NgramsReducer.java
│               └── TextTriple.java
├── send_data_to_hdfs.py
└── streaming
    ├── mapper.py
    └── reducer.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[co]
2 |
3 | # Packages
4 | *.egg
5 | *.egg-info
6 | dist
7 | build
8 | eggs
9 | parts
10 | bin
11 | var
12 | sdist
13 | develop-eggs
14 | .installed.cfg
15 |
16 | # Installer logs
17 | pip-log.txt
18 |
19 | # Unit test / coverage reports
20 | .coverage
21 | .tox
22 |
23 | #Translations
24 | *.mo
25 |
26 | #Mr Developer
27 | .mr.developer.cfg
28 |
29 | #Java
30 | *.class
31 | *.jar
32 | *.war
33 | *.ear
34 |
35 | *.pydevproject
36 | .project
37 | .metadata
38 | bin/**
39 | tmp/**
40 | tmp/**/*
41 | *.tmp
42 | *.bak
43 | *.swp
44 | *~.nib
45 | local.properties
46 | .classpath
47 | .settings/
48 | .loadpath
49 |
50 | # External tool builders
51 | .externalToolBuilders/
52 |
53 | # Locally stored "Eclipse launch configurations"
54 | *.launch
55 |
56 | # CDT-specific
57 | .cproject
58 |
59 | # PDT-specific
60 | .buildpath
61 |
62 | target/
63 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Source code for the Cloudera blog post on using Python frameworks with Hadoop. The blog post can be found here:
2 |
3 | [http://blog.cloudera.com/blog/2013/01/a-guide-to-python-frameworks-for-hadoop](http://blog.cloudera.com/blog/2013/01/a-guide-to-python-frameworks-for-hadoop)
4 |
--------------------------------------------------------------------------------
/dumbo/ngrams.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | class NgramMapper(object):
5 |
6 | def __init__(self):
7 | # determine value of n in the current block of ngrams
8 | input_file = os.environ['map_input_file']
9 | self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])
10 |
11 | def __call__(self, key, value):
12 | data = value.split('\t')
13 |
14 | if len(data) < 3:
15 | return
16 |
17 | ngram = data[0].split()
18 | year = data[1]
19 | count = int(data[2])
20 |
21 | if len(ngram) != self.expected_tokens:
22 | return
23 |
24 | pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
25 | k = pair + [year]
26 |
27 | yield (k, count)
28 |
29 | def combiner(key, values):
30 | yield (key, sum(values))
31 |
32 | def reducer(key, values):
33 | yield "%s\t%s\t%s" % tuple(key), str(sum(values))
34 |
35 |
36 | if __name__ == '__main__':
37 | import dumbo
38 | # import pdb
39 | # pdb.set_trace()
40 | # dumbo.run(NgramMapper, reducer, combiner=combiner)
41 | dumbo.run(NgramMapper, reducer)
--------------------------------------------------------------------------------
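
The mapper/combiner/reducer logic above can be exercised without a Hadoop cluster. Below is a minimal local smoke test (an illustration, not part of the original repo): it fakes the map_input_file environment variable that Hadoop Streaming normally sets, then pushes one made-up 2-gram record through NgramMapper, combiner, and reducer.

import os

# Hadoop Streaming exposes the current split's filename via this variable;
# here we fake it with a hypothetical Google Books 2-gram filename.
os.environ['map_input_file'] = 'googlebooks-eng-all-2gram-20090715-0.csv'

from ngrams import NgramMapper, combiner, reducer  # assumes dumbo/ngrams.py is importable

mapper = NgramMapper()
record = 'analysis is\t1991\t10\t1\t1'              # made-up 2-gram record
(key, count), = mapper(None, record)                # key == ['analysis', 'is', '1991'], count == 10
print(list(combiner(key, [count, count])))          # [(['analysis', 'is', '1991'], 20)]
print(list(reducer(key, [count, count])))           # [('analysis\tis\t1991', '20')]
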
/hadoopy/launch_frozen_hadoopy.py:
--------------------------------------------------------------------------------
1 | from hadoopy import launch_frozen
2 |
3 | input_path = 'hdfs://laserson-1.ent.cloudera.com/ngrams'
4 | output_path = 'hdfs://laserson-1.ent.cloudera.com/output-hadoopy-frozen'
5 |
6 | launch_frozen(input_path,
7 | output_path,
8 | 'ngrams.py',
9 | use_seqoutput=False,
10 | num_reducers=10,
11 | hstreaming='/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar')
12 |
--------------------------------------------------------------------------------
/hadoopy/launch_hadoopy.py:
--------------------------------------------------------------------------------
1 | from hadoopy import launch
2 |
3 | input_path = 'hdfs://laserson-1.ent.cloudera.com/ngrams'
4 | output_path = 'hdfs://laserson-1.ent.cloudera.com/output-hadoopy'
5 |
6 | launch(input_path,
7 | output_path,
8 | 'ngrams.py',
9 | use_seqoutput=False,
10 | num_reducers=10,
11 | hstreaming='/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar')
12 |
--------------------------------------------------------------------------------
/hadoopy/ngrams.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | import hadoopy
5 |
6 | class Mapper(object):
7 |
8 | def __init__(self):
9 | # determine value of n in the current block of ngrams
10 | input_file = os.environ['map_input_file']
11 | self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])
12 |
13 | def map(self, key, value):
14 | data = value.split('\t')
15 |
16 | if len(data) < 3:
17 | return
18 |
19 | ngram = data[0].split()
20 | year = data[1]
21 | count = int(data[2])
22 |
23 | if len(ngram) != self.expected_tokens:
24 | return
25 |
26 | pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
27 | k = pair + [year]
28 |
29 | yield (k, count)
30 |
31 | def combiner(key, values):
32 | yield (key, sum(values))
33 |
34 | def reducer(key, values):
35 | yield "%s\t%s\t%s" % tuple(key), str(sum(values))
36 |
37 | if __name__ == '__main__':
38 | hadoopy.run(Mapper, reducer, combiner)
39 |
--------------------------------------------------------------------------------
/luigi/client.cfg:
--------------------------------------------------------------------------------
1 | [hadoop]
2 | streaming-jar: /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.2.0.jar
--------------------------------------------------------------------------------
/luigi/ngrams.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | import luigi
5 | import luigi.hadoop
6 | import luigi.hdfs
7 |
8 | class InputText(luigi.ExternalTask):
9 | path = luigi.Parameter()
10 |
11 | def output(self):
12 | return luigi.hdfs.HdfsTarget(self.path)
13 |
14 | class Ngrams(luigi.hadoop.JobTask):
15 | source = luigi.Parameter()
16 | destination = luigi.Parameter()
17 | # overrides superclass; gets set as jobconf:
18 | n_reduce_tasks = luigi.IntParameter(default=10)
19 |
20 | def requires(self):
21 | tasks = []
22 | paths = luigi.hdfs.HdfsClient().listdir(self.source, ignore_directories=True, recursive=True)
23 | for path in paths:
24 | tasks.append(InputText(path))
25 | return tasks
26 |
27 | def output(self):
28 | return luigi.hdfs.HdfsTarget(self.destination)
29 |
30 | def init_mapper(self):
31 | input_file = os.environ['map_input_file']
32 | self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])
33 |
34 | def mapper(self, line):
35 | data = line.split('\t')
36 |
37 | if len(data) < 3:
38 | return
39 |
40 | # unpack data
41 | ngram = data[0].split()
42 | year = data[1]
43 | count = int(data[2])
44 |
45 | if len(ngram) != self.expected_tokens:
46 | return
47 |
48 | # generate key
49 | pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
50 | k = pair + [year]
51 |
52 | yield (k, count)
53 |
54 | def combiner(self, key, values):
55 | yield (key, sum(values))
56 |
57 | def reducer(self, key, values):
58 | yield "%s\t%s\t%s" % tuple(key), str(sum(values))
59 |
60 | if __name__ == '__main__':
61 | luigi.run()
62 |
--------------------------------------------------------------------------------
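
For reference, Luigi jobs like this one are normally kicked off through luigi.run(), which exposes each Parameter as a command-line flag; an invocation along the lines of "python ngrams.py Ngrams --source /ngrams --destination /output-luigi --n-reduce-tasks 10 --local-scheduler" (the paths and flag spellings are assumptions about standard Luigi usage, not something this repo documents) would build the dependency graph from requires() and submit the Hadoop Streaming job using the jar configured in client.cfg.
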
/mrjob/ngrams.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | import os
4 | import re
5 |
6 | from mrjob.job import MRJob
7 | from mrjob.protocol import RawProtocol, ReprProtocol
8 |
9 | class NgramNeighbors(MRJob):
10 |
11 | # mrjob allows you to specify input/intermediate/output serialization
12 | # default output protocol is JSON; here we set it to text
13 | OUTPUT_PROTOCOL = RawProtocol
14 |
15 | def mapper_init(self):
16 | # determine value of n in the current block of ngrams by parsing filename
17 | input_file = os.environ['map_input_file']
18 | self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])
19 |
20 | def mapper(self, key, line):
21 | data = line.split('\t')
22 |
23 | # error checking
24 | if len(data) < 3:
25 | return
26 |
27 | # unpack data
28 | ngram = data[0].split()
29 | year = data[1]
30 | count = int(data[2])
31 |
32 | # more error checking
33 | if len(ngram) != self.expected_tokens:
34 | return
35 |
36 | # generate key
37 | pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
38 | k = pair + [year]
39 |
40 | # note that the key is an object (a list in this case)
41 | # that mrjob will serialize as JSON text
42 | yield (k, count)
43 |
44 | def combiner(self, key, counts):
45 | # the combiner must be separate from the reducer because the input
46 | # and output must both be JSON
47 | yield (key, sum(counts))
48 |
49 | def reducer(self, key, counts):
50 | # the final output is encoded as text
51 | yield "%s\t%s\t%s" % tuple(key), str(sum(counts))
52 |
53 | if __name__ == '__main__':
54 | # sets up a runner, based on command line options
55 | NgramNeighbors.run()
56 |
--------------------------------------------------------------------------------
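
Besides the command line, an mrjob job can be driven from another Python script through its runner API. A hedged sketch follows: make_runner() and stream_output() are standard mrjob calls, but the '-r local' choice and the input filename are assumptions, and the sketch relies on the local runner simulating the map_input_file variable that mapper_init() reads (the filename must therefore contain the 'Ngram' pattern).

from ngrams import NgramNeighbors

# Hypothetical input file; its name must contain '2gram' so that
# mapper_init() can parse the expected token count from it.
job = NgramNeighbors(args=['-r', 'local', 'googlebooks-eng-all-2gram-20090715-0.csv'])
with job.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():   # raw, tab-delimited output records
        print(line.rstrip())
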
/native/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 2 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 |   <modelVersion>4.0.0</modelVersion>
 4 |   <groupId>com.cloudera</groupId>
 5 |   <artifactId>NgramsComparison</artifactId>
 6 |   <version>0.0.1-SNAPSHOT</version>
 7 |   <name>Ngrams Comparison</name>
 8 |
 9 |   <properties>
10 |     <hadoop.version>2.0.0-mr1-cdh4.0.1</hadoop.version>
11 |   </properties>
12 |
13 |   <repositories>
14 |     <repository>
15 |       <id>cloudera-releases</id>
16 |       <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
17 |       <releases>
18 |         <enabled>true</enabled>
19 |       </releases>
20 |       <snapshots>
21 |         <enabled>false</enabled>
22 |       </snapshots>
23 |     </repository>
24 |   </repositories>
25 |
26 |   <dependencies>
27 |     <dependency>
28 |       <groupId>org.apache.hadoop</groupId>
29 |       <artifactId>hadoop-client</artifactId>
30 |       <version>${hadoop.version}</version>
31 |     </dependency>
32 |     <dependency>
33 |       <groupId>log4j</groupId>
34 |       <artifactId>log4j</artifactId>
35 |       <version>1.2.17</version>
36 |     </dependency>
37 |   </dependencies>
38 | </project>
--------------------------------------------------------------------------------
/native/src/main/java/NgramsDriver.java:
--------------------------------------------------------------------------------
1 | import org.apache.hadoop.conf.Configured;
2 | import org.apache.hadoop.fs.Path;
3 | import org.apache.hadoop.io.IntWritable;
4 | import org.apache.hadoop.mapreduce.Job;
5 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
6 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
7 | import org.apache.hadoop.util.Tool;
8 | import org.apache.hadoop.util.ToolRunner;
9 |
10 |
11 | public class NgramsDriver extends Configured implements Tool {
12 |
13 | public int run(String[] args) throws Exception {
14 | Job job = new Job(getConf());
15 | job.setJarByClass(getClass());
16 |
17 | FileInputFormat.addInputPath(job, new Path(args[0]));
18 | FileOutputFormat.setOutputPath(job, new Path(args[1]));
19 |
20 | job.setMapperClass(NgramsMapper.class);
21 | job.setCombinerClass(NgramsReducer.class);
22 | job.setReducerClass(NgramsReducer.class);
23 |
24 | job.setOutputKeyClass(TextTriple.class);
25 | job.setOutputValueClass(IntWritable.class);
26 |
27 | job.setNumReduceTasks(10);
28 |
29 | return job.waitForCompletion(true) ? 0 : 1;
30 | }
31 |
32 | public static void main(String[] args) throws Exception {
33 | int exitCode = ToolRunner.run(new NgramsDriver(), args);
34 | System.exit(exitCode);
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/native/src/main/java/NgramsMapper.java:
--------------------------------------------------------------------------------
1 | import java.io.IOException;
2 | import java.util.ArrayList;
3 | import java.util.Collections;
4 | import java.util.List;
5 | import java.util.regex.Matcher;
6 | import java.util.regex.Pattern;
7 |
8 | import org.apache.hadoop.io.IntWritable;
9 | import org.apache.hadoop.io.LongWritable;
10 | import org.apache.hadoop.io.Text;
11 | import org.apache.hadoop.mapreduce.Mapper;
12 | import org.apache.hadoop.mapreduce.lib.input.FileSplit;
13 | import org.apache.log4j.Logger;
14 |
15 |
16 | public class NgramsMapper extends Mapper<LongWritable, Text, TextTriple, IntWritable> {
17 |
18 | private Logger LOG = Logger.getLogger(getClass());
19 |
20 | private int expectedTokens;
21 |
22 | @Override
23 | protected void setup(Context context) throws IOException, InterruptedException {
24 | String inputFile = ((FileSplit) context.getInputSplit()).getPath().getName();
25 | LOG.info("inputFile: " + inputFile);
26 | Pattern c = Pattern.compile("([\\d]+)gram");
27 | Matcher m = c.matcher(inputFile);
28 | m.find();
29 | expectedTokens = Integer.parseInt(m.group(1));
30 | return;
31 | }
32 |
33 | @Override
34 | public void map(LongWritable key, Text value, Context context)
35 | throws IOException, InterruptedException {
36 | String[] data = value.toString().split("\\t");
37 |
38 | if (data.length < 3) {
39 | return;
40 | }
41 |
42 | String[] ngram = data[0].split("\\s+");
43 | String year = data[1];
44 | IntWritable count = new IntWritable(Integer.parseInt(data[2]));
45 |
46 | if (ngram.length != this.expectedTokens) {
47 | return;
48 | }
49 |
50 | // build keyOut
51 | List<String> triple = new ArrayList<String>(3);
52 | triple.add(ngram[0]);
53 | triple.add(ngram[expectedTokens - 1]);
54 | Collections.sort(triple);
55 | triple.add(year);
56 | TextTriple keyOut = new TextTriple(triple);
57 |
58 | context.write(keyOut, count);
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/native/src/main/java/NgramsReducer.java:
--------------------------------------------------------------------------------
1 | import java.io.IOException;
2 |
3 | import org.apache.hadoop.io.IntWritable;
4 | import org.apache.hadoop.mapreduce.Reducer;
5 |
6 |
7 | public class NgramsReducer extends Reducer<TextTriple, IntWritable, TextTriple, IntWritable> {
8 |
9 | @Override
10 | protected void reduce(TextTriple key, Iterable<IntWritable> values, Context context)
11 | throws IOException, InterruptedException {
12 | int sum = 0;
13 | for (IntWritable value : values) {
14 | sum += value.get();
15 | }
16 | context.write(key, new IntWritable(sum));
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/native/src/main/java/TextTriple.java:
--------------------------------------------------------------------------------
1 | import java.io.DataInput;
2 | import java.io.DataOutput;
3 | import java.io.IOException;
4 | import java.util.List;
5 |
6 | import org.apache.hadoop.io.Text;
7 | import org.apache.hadoop.io.WritableComparable;
8 |
9 |
10 | public class TextTriple implements WritableComparable<TextTriple> {
11 |
12 | private Text first;
13 | private Text second;
14 | private Text third;
15 |
16 | public TextTriple() {
17 | set(new Text(), new Text(), new Text());
18 | }
19 |
20 | public TextTriple(List<String> list) {
21 | set(new Text(list.get(0)),
22 | new Text(list.get(1)),
23 | new Text(list.get(2)));
24 | }
25 |
26 | public void set(Text first, Text second, Text third) {
27 | this.first = first;
28 | this.second = second;
29 | this.third = third;
30 | }
31 |
32 | public void write(DataOutput out) throws IOException {
33 | first.write(out);
34 | second.write(out);
35 | third.write(out);
36 | }
37 |
38 | public void readFields(DataInput in) throws IOException {
39 | first.readFields(in);
40 | second.readFields(in);
41 | third.readFields(in);
42 | }
43 |
44 | @Override
45 | public int hashCode() {
46 | return first.hashCode() * 163 + second.hashCode() * 31 + third.hashCode();
47 | }
48 |
49 | @Override
50 | public boolean equals(Object obj) {
51 | if (obj instanceof TextTriple) {
52 | TextTriple tt = (TextTriple) obj;
53 | return first.equals(tt.first) && second.equals(tt.second) && third.equals(tt.third);
54 | }
55 | return false;
56 | }
57 |
58 | @Override
59 | public String toString() {
60 | return first + "\t" + second + "\t" + third;
61 | }
62 |
63 | public int compareTo(TextTriple other) {
64 | int comp = first.compareTo(other.first);
65 | if (comp != 0) {
66 | return comp;
67 | }
68 | comp = second.compareTo(other.second);
69 | if (comp != 0) {
70 | return comp;
71 | }
72 | return third.compareTo(other.third);
73 | }
74 |
75 |
76 | }
77 |
--------------------------------------------------------------------------------
/send_data_to_hdfs.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import random
3 | import subprocess
4 |
5 | GB = 1024 ** 3
6 | def du():
7 | p = subprocess.Popen('hadoop fs -du -s /ngrams', shell=True, stdout=subprocess.PIPE)
8 | return int(p.stdout.read().split()[0])
9 |
10 | # generate list of URLs
11 | base_url = 'http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-%igram-20090715-%i.csv.zip'
12 | sizes = [(2, 100), (3, 200), (4, 400), (5, 800)]
13 | ngram_urls = []
14 | for size in sizes:
15 | n = size[0]
16 | num_files = size[1]
17 | for i in xrange(num_files):
18 | ngram_urls.append(base_url % (n, i))
19 |
20 | # download data directly into HDFS
21 | stream_cmd = 'curl "%s" | funzip | hadoop fs -put - /ngrams/%s'
22 | random.shuffle(ngram_urls)
23 | finished = False
24 | while not finished:
25 | url = ngram_urls.pop()
26 | filename = '.'.join(url.split('/')[-1].split('.')[:-1])
27 | sys.stdout.write("%s\n" % filename)
28 | sys.stdout.flush()
29 | subprocess.Popen(stream_cmd % (url, filename), shell=True).wait()
30 | if du() > 20 * GB:
31 | finished = True
32 |
--------------------------------------------------------------------------------
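
As a quick illustration of what the script above generates (this snippet is not part of the original and only prints commands instead of running them), the first few download pipelines expand as follows:

# Dry run: print a couple of the curl | funzip | hadoop pipelines without executing them.
base_url = 'http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-%igram-20090715-%i.csv.zip'
stream_cmd = 'curl "%s" | funzip | hadoop fs -put - /ngrams/%s'
for n, i in [(2, 0), (3, 0)]:
    url = base_url % (n, i)
    filename = '.'.join(url.split('/')[-1].split('.')[:-1])   # strips the trailing '.zip'
    print(stream_cmd % (url, filename))
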
/streaming/mapper.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | import os
4 | import re
5 | import sys
6 |
7 | # determine value of n in the current block of ngrams by parsing the filename
8 | input_file = os.environ['map_input_file']
9 | expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])
10 |
11 | for line in sys.stdin:
12 | data = line.split('\t')
13 |
14 | # perform some error checking
15 | if len(data) < 3:
16 | continue
17 |
18 | # unpack data
19 | ngram = data[0].split()
20 | year = data[1]
21 | count = data[2]
22 |
23 | # more error checking
24 | if len(ngram) != expected_tokens:
25 | continue
26 |
27 | # build key and emit
28 | pair = sorted([ngram[0], ngram[expected_tokens - 1]])
29 | print >>sys.stdout, "%s\t%s\t%s\t%s" % (pair[0], pair[1], year, count)
30 |
--------------------------------------------------------------------------------
/streaming/reducer.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | import sys
4 |
5 | total = 0
6 | prev_key = False
7 | for line in sys.stdin:
8 | data = line.split('\t')
9 | curr_key = '\t'.join(data[:3])
10 | count = int(data[3])
11 |
12 | # found a boundary; emit current sum
13 | if prev_key and curr_key != prev_key:
14 | print >>sys.stdout, "%s\t%i" % (prev_key, total)
15 | prev_key = curr_key
16 | total = count
17 | # same key; accumulate sum
18 | else:
19 | prev_key = curr_key
20 | total += count
21 |
22 | # emit last key
23 | if prev_key:
24 | print >>sys.stdout, "%s\t%i" % (prev_key, total)
25 |
--------------------------------------------------------------------------------
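
These two scripts are meant to be shipped to a Hadoop Streaming job. A hedged sketch of submitting them via subprocess, in the same spirit as send_data_to_hdfs.py, is shown below; the streaming jar path mirrors the one used elsewhere in this repo, while the HDFS input and output paths are illustrative.

import subprocess

streaming_jar = '/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar'

# Standard Hadoop Streaming options: ship both scripts with -file and use them
# as mapper and reducer; the paths under /ngrams and /output-streaming are examples.
cmd = ('hadoop jar %s '
       '-input /ngrams -output /output-streaming '
       '-mapper mapper.py -reducer reducer.py '
       '-file mapper.py -file reducer.py '
       '-numReduceTasks 10' % streaming_jar)
subprocess.Popen(cmd, shell=True).wait()
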