├── .gitignore
├── README.md
├── dumbo
│   └── ngrams.py
├── hadoopy
│   ├── launch_frozen_hadoopy.py
│   ├── launch_hadoopy.py
│   └── ngrams.py
├── luigi
│   ├── client.cfg
│   └── ngrams.py
├── mrjob
│   └── ngrams.py
├── native
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               ├── NgramsDriver.java
│               ├── NgramsMapper.java
│               ├── NgramsReducer.java
│               └── TextTriple.java
├── send_data_to_hdfs.py
└── streaming
    ├── mapper.py
    └── reducer.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg

# Java
*.class
*.jar
*.war
*.ear

*.pydevproject
.project
.metadata
bin/**
tmp/**
tmp/**/*
*.tmp
*.bak
*.swp
*~.nib
local.properties
.classpath
.settings/
.loadpath

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# PDT-specific
.buildpath

target/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Source code for the Cloudera blog post on using Python with Hadoop. The blog post can be found here:

[http://blog.cloudera.com/blog/2013/01/a-guide-to-python-frameworks-for-hadoop](http://blog.cloudera.com/blog/2013/01/a-guide-to-python-frameworks-for-hadoop)

--------------------------------------------------------------------------------
/dumbo/ngrams.py:
--------------------------------------------------------------------------------
import os
import re

class NgramMapper(object):

    def __init__(self):
        # determine value of n in the current block of ngrams
        input_file = os.environ['map_input_file']
        self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])

    def __call__(self, key, value):
        data = value.split('\t')

        if len(data) < 3:
            return

        ngram = data[0].split()
        year = data[1]
        count = int(data[2])

        if len(ngram) != self.expected_tokens:
            return

        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]

        yield (k, count)

def combiner(key, values):
    yield (key, sum(values))

def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))


if __name__ == '__main__':
    import dumbo
    # import pdb
    # pdb.set_trace()
    # dumbo.run(NgramMapper, reducer, combiner=combiner)
    dumbo.run(NgramMapper, reducer)
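A note on running the Dumbo version: the script above is not executed directly but handed to Dumbo's command-line launcher, which wraps Hadoop Streaming. A minimal invocation sketch follows; the Hadoop home directory and the HDFS paths are placeholder assumptions rather than values taken from this repository:

    dumbo start ngrams.py \
        -hadoop /usr/lib/hadoop-0.20-mapreduce \
        -input /ngrams \
        -output /output-dumbo \
        -numreducetasks 10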
--------------------------------------------------------------------------------
/hadoopy/launch_frozen_hadoopy.py:
--------------------------------------------------------------------------------
from hadoopy import launch_frozen

input_path = 'hdfs://laserson-1.ent.cloudera.com/ngrams'
output_path = 'hdfs://laserson-1.ent.cloudera.com/output-hadoopy-frozen'

launch_frozen(input_path,
              output_path,
              'ngrams.py',
              use_seqoutput=False,
              num_reducers=10,
              hstreaming='/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar')

--------------------------------------------------------------------------------
/hadoopy/launch_hadoopy.py:
--------------------------------------------------------------------------------
from hadoopy import launch

input_path = 'hdfs://laserson-1.ent.cloudera.com/ngrams'
output_path = 'hdfs://laserson-1.ent.cloudera.com/output-hadoopy'

launch(input_path,
       output_path,
       'ngrams.py',
       use_seqoutput=False,
       num_reducers=10,
       hstreaming='/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar')

--------------------------------------------------------------------------------
/hadoopy/ngrams.py:
--------------------------------------------------------------------------------
import os
import re

import hadoopy

class Mapper(object):

    def __init__(self):
        # determine value of n in the current block of ngrams
        input_file = os.environ['map_input_file']
        self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])

    def map(self, key, value):
        data = value.split('\t')

        if len(data) < 3:
            return

        ngram = data[0].split()
        year = data[1]
        count = int(data[2])

        if len(ngram) != self.expected_tokens:
            return

        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]

        yield (k, count)

def combiner(key, values):
    yield (key, sum(values))

def reducer(key, values):
    yield "%s\t%s\t%s" % tuple(key), str(sum(values))

if __name__ == '__main__':
    hadoopy.run(Mapper, reducer, combiner)

--------------------------------------------------------------------------------
/luigi/client.cfg:
--------------------------------------------------------------------------------
[hadoop]
streaming-jar: /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.2.0.jar

--------------------------------------------------------------------------------
/luigi/ngrams.py:
--------------------------------------------------------------------------------
import os
import re

import luigi
import luigi.hadoop
import luigi.hdfs

class InputText(luigi.ExternalTask):
    path = luigi.Parameter()

    def output(self):
        return luigi.hdfs.HdfsTarget(self.path)

class Ngrams(luigi.hadoop.JobTask):
    source = luigi.Parameter()
    destination = luigi.Parameter()
    # overrides superclass; gets set as jobconf:
    n_reduce_tasks = luigi.IntParameter(default=10)

    def requires(self):
        tasks = []
        paths = luigi.hdfs.HdfsClient().listdir(self.source, ignore_directories=True, recursive=True)
        for path in paths:
            tasks.append(InputText(path))
        return tasks

    def output(self):
        return luigi.hdfs.HdfsTarget(self.destination)

    def init_mapper(self):
        input_file = os.environ['map_input_file']
        self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])

    def mapper(self, line):
        data = line.split('\t')

        if len(data) < 3:
            return

        # unpack data
        ngram = data[0].split()
        year = data[1]
        count = int(data[2])

        if len(ngram) != self.expected_tokens:
            return

        # generate key
        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]

        yield (k, count)

    def combiner(self, key, values):
        yield (key, sum(values))

    def reducer(self, key, values):
        yield "%s\t%s\t%s" % tuple(key), str(sum(values))

if __name__ == '__main__':
    luigi.run()
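A note on running the Luigi version: the Ngrams task is launched through luigi.run() from the command line, with client.cfg (above) in the working directory so the streaming jar can be located. The sketch below is one plausible invocation; the HDFS paths are placeholders and the exact flag spellings depend on the Luigi release in use:

    python ngrams.py Ngrams \
        --local-scheduler \
        --source /ngrams \
        --destination /output-luigi \
        --n-reduce-tasks 10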
--------------------------------------------------------------------------------
/mrjob/ngrams.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python

import os
import re

from mrjob.job import MRJob
from mrjob.protocol import RawProtocol, ReprProtocol

class NgramNeighbors(MRJob):

    # mrjob allows you to specify input/intermediate/output serialization
    # default output protocol is JSON; here we set it to text
    OUTPUT_PROTOCOL = RawProtocol

    def mapper_init(self):
        # determine value of n in the current block of ngrams by parsing filename
        input_file = os.environ['map_input_file']
        self.expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])

    def mapper(self, key, line):
        data = line.split('\t')

        # error checking
        if len(data) < 3:
            return

        # unpack data
        ngram = data[0].split()
        year = data[1]
        count = int(data[2])

        # more error checking
        if len(ngram) != self.expected_tokens:
            return

        # generate key
        pair = sorted([ngram[0], ngram[self.expected_tokens - 1]])
        k = pair + [year]

        # note that the key is an object (a list in this case)
        # that mrjob will serialize as JSON text
        yield (k, count)

    def combiner(self, key, counts):
        # the combiner must be separate from the reducer because the input
        # and output must both be JSON
        yield (key, sum(counts))

    def reducer(self, key, counts):
        # the final output is encoded as text
        yield "%s\t%s\t%s" % tuple(key), str(sum(counts))

if __name__ == '__main__':
    # sets up a runner, based on command line options
    NgramNeighbors.run()
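A note on running the mrjob version: because NgramNeighbors subclasses MRJob, the same script can be tested locally or submitted to a cluster by switching the runner. The commands below are a sketch; the sample file name, HDFS paths, and jobconf value are placeholder assumptions:

    # quick local test against a small sample file
    python ngrams.py data_sample.txt

    # run on the Hadoop cluster
    python ngrams.py -r hadoop \
        --jobconf mapred.reduce.tasks=10 \
        -o hdfs:///output-mrjob --no-output \
        hdfs:///ngrams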
--------------------------------------------------------------------------------
/native/pom.xml:
--------------------------------------------------------------------------------
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.cloudera</groupId>
  <artifactId>NgramsComparison</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>Ngrams Comparison</name>

  <properties>
    <hadoop.version>2.0.0-mr1-cdh4.0.1</hadoop.version>
  </properties>

  <repositories>
    <repository>
      <id>cloudera-releases</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
      <releases>
        <enabled>true</enabled>
      </releases>
      <snapshots>
        <enabled>false</enabled>
      </snapshots>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
  </dependencies>
</project>

--------------------------------------------------------------------------------
/native/src/main/java/NgramsDriver.java:
--------------------------------------------------------------------------------
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class NgramsDriver extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJarByClass(getClass());

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(NgramsMapper.class);
        job.setCombinerClass(NgramsReducer.class);
        job.setReducerClass(NgramsReducer.class);

        job.setOutputKeyClass(TextTriple.class);
        job.setOutputValueClass(IntWritable.class);

        job.setNumReduceTasks(10);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new NgramsDriver(), args);
        System.exit(exitCode);
    }
}

--------------------------------------------------------------------------------
/native/src/main/java/NgramsMapper.java:
--------------------------------------------------------------------------------
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.log4j.Logger;


public class NgramsMapper extends Mapper<LongWritable, Text, TextTriple, IntWritable> {

    private Logger LOG = Logger.getLogger(getClass());

    private int expectedTokens;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        String inputFile = ((FileSplit) context.getInputSplit()).getPath().getName();
        LOG.info("inputFile: " + inputFile);
        Pattern c = Pattern.compile("([\\d]+)gram");
        Matcher m = c.matcher(inputFile);
        m.find();
        expectedTokens = Integer.parseInt(m.group(1));
        return;
    }

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] data = value.toString().split("\\t");

        if (data.length < 3) {
            return;
        }

        String[] ngram = data[0].split("\\s+");
        String year = data[1];
        IntWritable count = new IntWritable(Integer.parseInt(data[2]));

        if (ngram.length != this.expectedTokens) {
            return;
        }

        // build keyOut
        List<String> triple = new ArrayList<String>(3);
        triple.add(ngram[0]);
        triple.add(ngram[expectedTokens - 1]);
        Collections.sort(triple);
        triple.add(year);
        TextTriple keyOut = new TextTriple(triple);

        context.write(keyOut, count);
    }
}

--------------------------------------------------------------------------------
/native/src/main/java/NgramsReducer.java:
--------------------------------------------------------------------------------
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;


public class NgramsReducer extends Reducer<TextTriple, IntWritable, TextTriple, IntWritable> {

    @Override
    protected void reduce(TextTriple key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

--------------------------------------------------------------------------------
/native/src/main/java/TextTriple.java:
--------------------------------------------------------------------------------
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;


public class TextTriple implements WritableComparable<TextTriple> {

    private Text first;
    private Text second;
    private Text third;

    public TextTriple() {
        set(new Text(), new Text(), new Text());
    }

    public TextTriple(List<String> list) {
        set(new Text(list.get(0)),
            new Text(list.get(1)),
            new Text(list.get(2)));
    }

    public void set(Text first, Text second, Text third) {
        this.first = first;
        this.second = second;
        this.third = third;
    }

    public void write(DataOutput out) throws IOException {
        first.write(out);
        second.write(out);
        third.write(out);
    }

    public void readFields(DataInput in) throws IOException {
        first.readFields(in);
        second.readFields(in);
        third.readFields(in);
    }

    @Override
    public int hashCode() {
        return first.hashCode() * 163 + second.hashCode() * 31 + third.hashCode();
    }

    @Override
    public boolean equals(Object obj) {
        if (obj instanceof TextTriple) {
            TextTriple tt = (TextTriple) obj;
            return first.equals(tt.first) && second.equals(tt.second) && third.equals(tt.third);
        }
        return false;
    }

    @Override
    public String toString() {
        return first + "\t" + second + "\t" + third;
    }

    public int compareTo(TextTriple other) {
        int comp = first.compareTo(other.first);
        if (comp != 0) {
            return comp;
        }
        comp = second.compareTo(other.second);
        if (comp != 0) {
            return comp;
        }
        return third.compareTo(other.third);
    }

}
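A note on running the native Java version: it is built with Maven and submitted with hadoop jar. The steps below are a sketch; the jar name simply follows Maven's artifactId-version convention from the pom.xml above, and the HDFS paths are placeholders:

    cd native
    mvn clean package
    hadoop jar target/NgramsComparison-0.0.1-SNAPSHOT.jar NgramsDriver /ngrams /output-native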
--------------------------------------------------------------------------------
/send_data_to_hdfs.py:
--------------------------------------------------------------------------------
import sys
import random
import subprocess

GB = 1024 ** 3
def du():
    p = subprocess.Popen('hadoop fs -du -s /ngrams', shell=True, stdout=subprocess.PIPE)
    return int(p.stdout.read().split()[0])

# generate list of URLs
base_url = 'http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-%igram-20090715-%i.csv.zip'
sizes = [(2, 100), (3, 200), (4, 400), (5, 800)]
ngram_urls = []
for size in sizes:
    n = size[0]
    num_files = size[1]
    for i in xrange(num_files):
        ngram_urls.append(base_url % (n, i))

# download data directly into HDFS
stream_cmd = 'curl "%s" | funzip | hadoop fs -put - /ngrams/%s'
random.shuffle(ngram_urls)
finished = False
while not finished:
    url = ngram_urls.pop()
    filename = '.'.join(url.split('/')[-1].split('.')[:-1])
    sys.stdout.write("%s\n" % filename)
    sys.stdout.flush()
    subprocess.Popen(stream_cmd % (url, filename), shell=True).wait()
    if du() > 20 * GB:
        finished = True
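A note on send_data_to_hdfs.py: it streams randomly chosen n-gram files from Google Storage straight into HDFS (it needs curl and funzip on the client) and stops once roughly 20 GB have accumulated under /ngrams. One way to run it, assuming the target directory does not already exist:

    hadoop fs -mkdir /ngrams
    python send_data_to_hdfs.py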
--------------------------------------------------------------------------------
/streaming/mapper.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python

import os
import re
import sys

# determine value of n in the current block of ngrams by parsing the filename
input_file = os.environ['map_input_file']
expected_tokens = int(re.findall(r'([\d]+)gram', os.path.basename(input_file))[0])

for line in sys.stdin:
    data = line.split('\t')

    # perform some error checking
    if len(data) < 3:
        continue

    # unpack data
    ngram = data[0].split()
    year = data[1]
    count = data[2]

    # more error checking
    if len(ngram) != expected_tokens:
        continue

    # build key and emit
    pair = sorted([ngram[0], ngram[expected_tokens - 1]])
    print >>sys.stdout, "%s\t%s\t%s\t%s" % (pair[0], pair[1], year, count)

--------------------------------------------------------------------------------
/streaming/reducer.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python

import sys

total = 0
prev_key = False
for line in sys.stdin:
    data = line.split('\t')
    curr_key = '\t'.join(data[:3])
    count = int(data[3])

    # found a boundary; emit current sum
    if prev_key and curr_key != prev_key:
        print >>sys.stdout, "%s\t%i" % (prev_key, total)
        prev_key = curr_key
        total = count
    # same key; accumulate sum
    else:
        prev_key = curr_key
        total += count

# emit last key
if prev_key:
    print >>sys.stdout, "%s\t%i" % (prev_key, total)

--------------------------------------------------------------------------------
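A note on running the raw streaming version: mapper.py and reducer.py are wired together with the hadoop-streaming jar already referenced elsewhere in this repository (the exact jar version varies by CDH release). The command below is a sketch; the output path and reducer count are placeholder assumptions:

    hadoop jar /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar \
        -input /ngrams \
        -output /output-streaming \
        -mapper mapper.py \
        -reducer reducer.py \
        -file mapper.py \
        -file reducer.py \
        -numReduceTasks 10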