├── homework
│   ├── week1
│   │   ├── dict.txt
│   │   ├── compress.patch
│   │   ├── sequencefile.patch
│   │   ├── partitioner.patch
│   │   └── dict.patch
│   ├── week2
│   │   ├── bootstrap
│   │   │   ├── test.pig
│   │   │   ├── src
│   │   │   │   └── java
│   │   │   │       └── com
│   │   │   │           └── example
│   │   │   │               └── pig
│   │   │   │                   └── BootstrapSampleLoader.java
│   │   │   └── pom.xml
│   │   └── extract_time.pig
│   ├── profile_notes.txt
│   └── week3
│       └── sqoop.patch
├── week3
│   ├── oozie
│   │   ├── cleanup.sql
│   │   ├── aggr.pig
│   │   ├── job.properties
│   │   ├── job-all.properties
│   │   ├── hive-config.xml
│   │   ├── workflow.xml
│   │   └── workflow-all.xml
│   └── hive
│       ├── evalfunc
│       │   ├── src
│       │   │   └── java
│       │   │       └── com
│       │   │           └── example
│       │   │               └── hive
│       │   │                   └── evalfunc
│       │   │                       └── Hello.java
│       │   └── pom.xml
│       └── tpcds
│           ├── tpcds.patch
│           ├── upload.sh
│           ├── insert.sql
│           ├── all-tables-orc.sql
│           ├── all-tables.sql
│           └── all-tables-base.sql
├── week1
│   ├── conf
│   │   ├── mapred-site.xml
│   │   ├── core-site.xml
│   │   ├── yarn-site.xml
│   │   └── hdfs-site.xml
│   ├── docker
│   │   └── Dockerfile
│   ├── wordcount
│   │   ├── patches
│   │   │   ├── combiner.patch
│   │   │   ├── nummapreduce.patch
│   │   │   ├── config.patches
│   │   │   ├── counters.patch
│   │   │   └── distributedcache.patch
│   │   ├── pom.xml
│   │   └── src
│   │       └── java
│   │           └── com
│   │               └── example
│   │                   └── WordCount.java
│   └── googleplaycrawler
│       ├── fixskew.patch
│       └── googleplaycrawler.patch
├── week2
│   ├── loadfunc
│   │   ├── loadgoogle.pig
│   │   ├── pom.xml
│   │   └── src
│   │       └── java
│   │           └── com
│   │               └── example
│   │                   └── NutchParsedDataLoader.java
│   ├── python
│   │   ├── demo.py
│   │   └── kmeans.py
│   ├── pigserver
│   │   ├── log4j.properties
│   │   ├── src
│   │   │   └── java
│   │   │       └── com
│   │   │           └── example
│   │   │               └── pig
│   │   │                   └── TestPigServer.java
│   │   └── pom.xml
│   └── evalfunc
│       ├── pom.xml
│       ├── src
│       │   └── java
│       │       └── com
│       │           └── example
│       │               └── pig
│       │                   └── GetCountry.java
│       └── patches
│           └── country_city.patch
├── druid
│   ├── topn.json
│   ├── sessionize.pig
│   └── cloudacl-index.json
├── week4
│   ├── fixdoccompile.patch
│   ├── PIG-3399-2.patch
│   ├── doc.patch
│   ├── jobname.patch
│   └── set.patch
└── capstone
    └── track1
        └── data_description.txt
/homework/week1/dict.txt:
--------------------------------------------------------------------------------
1 | 我 I
2 | 爱 love
3 |
--------------------------------------------------------------------------------
/week3/oozie/cleanup.sql:
--------------------------------------------------------------------------------
1 | drop table if exists student;
2 |
--------------------------------------------------------------------------------
/homework/week2/bootstrap/test.pig:
--------------------------------------------------------------------------------
1 | register target/bootstrap-0.0.1-SNAPSHOT.jar
2 |
3 | a = load 'studenttab10k' using com.example.pig.BootstrapSampleLoader();
4 | dump a;
5 |
--------------------------------------------------------------------------------
/week1/conf/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>mapreduce.framework.name</name>
4 |     <value>yarn</value>
5 |   </property>
6 | </configuration>
7 |
--------------------------------------------------------------------------------
/week3/oozie/aggr.pig:
--------------------------------------------------------------------------------
1 | A = load 'student' using org.apache.hive.hcatalog.pig.HCatLoader();
2 | B = group A by name;
3 | C = foreach B generate group as name, AVG(A.gpa) as gpa;
4 | store C into '$OUTPUT' USING PigStorage();
5 |
--------------------------------------------------------------------------------
/week3/oozie/job.properties:
--------------------------------------------------------------------------------
1 | nameNode=hdfs://localhost:9000
2 | jobTracker=localhost:8032
3 | queueName=default
4 |
5 | oozie.use.system.libpath=true
6 | oozie.wf.application.path=${nameNode}/user/${user.name}/oozie/apps/workflow.xml
7 |
--------------------------------------------------------------------------------
/week3/oozie/job-all.properties:
--------------------------------------------------------------------------------
1 | nameNode=hdfs://localhost:9000
2 | jobTracker=localhost:8032
3 | queueName=default
4 |
5 | oozie.use.system.libpath=true
6 | oozie.wf.application.path=${nameNode}/user/${user.name}/oozie/apps/workflow-all.xml
7 |
--------------------------------------------------------------------------------
/week3/oozie/hive-config.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>javax.jdo.option.ConnectionURL</name>
4 |     <value>jdbc:derby:;databaseName=/home/hadoop/apache-hive-1.2.1-bin/metastore_db;create=true</value>
5 |     <description>JDBC connect string for a JDBC metastore</description>
6 |   </property>
7 | </configuration>
8 |
--------------------------------------------------------------------------------
/week3/hive/evalfunc/src/java/com/example/hive/evalfunc/Hello.java:
--------------------------------------------------------------------------------
1 | package com.example.hive.evalfunc;
2 |
3 | import org.apache.hadoop.hive.ql.exec.UDF;
4 | import org.apache.hadoop.io.Text;
5 |
6 | public class Hello extends UDF {
7 | public Text evaluate(Text input) {
8 | return new Text("Hello " + input.toString());
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
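
Note: the Hello UDF above can be smoke-tested off-cluster by calling evaluate() directly. A minimal sketch, assuming hive-exec and hadoop-common are on the classpath (HelloDemo is a hypothetical scratch class, not a file in this repo):

    package com.example.hive.evalfunc;

    import org.apache.hadoop.io.Text;

    // Hypothetical local smoke test for the Hello UDF.
    public class HelloDemo {
        public static void main(String[] args) {
            Hello udf = new Hello();
            System.out.println(udf.evaluate(new Text("world"))); // prints: Hello world
        }
    }

In Hive itself, the built jar would be registered with ADD JAR and exposed through CREATE TEMPORARY FUNCTION before calling it in a query.
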
/homework/week2/extract_time.pig:
--------------------------------------------------------------------------------
1 | a = LOAD 'access_logs' AS (line:chararray);
2 | b = FOREACH a GENERATE flatten(REGEX_EXTRACT_ALL(line, '(.*?) .*?\\[(.*?)\\].*')) as (ip:chararray, dt:chararray);
3 | c = FOREACH b GENERATE ip, ToDate(dt, 'yyyy-MM-dd HH:mm:ss.SSSSSS') as dt;
4 | d = FOREACH c GENERATE ip, GetYear(dt), GetMonth(dt), GetDay(dt), GetHour(dt), GetMinute(dt), GetSecond(dt);
5 | dump d;
6 |
--------------------------------------------------------------------------------
/week2/loadfunc/loadgoogle.pig:
--------------------------------------------------------------------------------
1 | register target/nutchdbloader-0.0.1-SNAPSHOT.jar
2 | register /home/hadoop/hadoop-2.7.3/share/hadoop/tools/lib/hadoop-aws-2.7.3.jar
3 | register nutch-1.12.jar
4 |
5 | rmf output
6 | loaded = load 's3n://daijytest/nutchdb/segments/*/parse_data/part-*/data' using com.example.NutchParsedDataLoader();
7 | filtered = filter loaded by $0 is not null;
8 | store filtered into 'output';
9 |
--------------------------------------------------------------------------------
/druid/topn.json:
--------------------------------------------------------------------------------
1 | {
2 | "queryType": "topN",
3 | "dataSource": "cloudacl_accesslog",
4 | "dimension": "country_code",
5 | "threshold": 5,
6 | "metric": "count",
7 | "granularity": "all",
8 | "aggregations": [
9 | {
10 | "type": "longSum",
11 | "name": "count",
12 | "fieldName": "count"
13 | }
14 | ],
15 | "intervals": [
16 | "2017-03-05T00:00:00.000/2017-03-12T00:00:00.000"
17 | ]
18 | }
19 |
--------------------------------------------------------------------------------
/homework/profile_notes.txt:
--------------------------------------------------------------------------------
1 | YourKit CPU profile:
2 | export PIG_OPTS="-agentpath:/home/hadoop/yjp-2016.02/bin/linux-x86-64/libyjpagent.so=onexit=snapshot,sampling,dir=/tmp"
3 |
4 | Java memory dump:
5 | jmap -dump:file=<dump-file> <pid>
6 |
7 | Java memory dump upon OOM:
8 | export PIG_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp"
9 |
10 | MAT reading:
11 | http://eclipsesource.com/blogs/2013/01/21/10-tips-for-using-the-eclipse-memory-analyzer/
12 |
--------------------------------------------------------------------------------
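
Note: the heap-dump-on-OOM flags in the notes can be verified with a throwaway allocator before attaching them to a real Pig job. A minimal sketch (OomDemo is hypothetical, not part of this repo):

    import java.util.ArrayList;
    import java.util.List;

    // Run as: java -Xmx64m -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp OomDemo
    // On OutOfMemoryError the JVM writes an .hprof snapshot under /tmp, which MAT can open.
    public class OomDemo {
        public static void main(String[] args) {
            List<long[]> hog = new ArrayList<long[]>();
            while (true) {
                hog.add(new long[1000000]); // allocate until the heap is exhausted
            }
        }
    }
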
/week3/hive/tpcds/tpcds.patch:
--------------------------------------------------------------------------------
1 | diff --git a/query_templates/netezza.tpl b/query_templates/netezza.tpl
2 | index 75488d2..0ff3ce1 100755
3 | --- a/query_templates/netezza.tpl
4 | +++ b/query_templates/netezza.tpl
5 | @@ -35,3 +35,5 @@
6 | define __LIMITA = "";
7 | define __LIMITB = "";
8 | define __LIMITC = "limit %d";
9 | +define _BEGIN = "-- start query " + [_QUERY] + " in stream " + [_STREAM] + " using template " + [_TEMPLATE];
10 | +define _END = "-- end query " + [_QUERY] + " in stream " + [_STREAM] + " using template " + [_TEMPLATE];
11 |
--------------------------------------------------------------------------------
/week1/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && apt-get install -y openssh-server
4 | RUN apt-get install -y vim
5 | RUN mkdir /var/run/sshd
6 | RUN echo 'root:hadoop' | chpasswd
7 | RUN sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
8 |
9 | # SSH login fix. Otherwise user is kicked off after login
10 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
11 |
12 | ENV NOTVISIBLE "in users profile"
13 | RUN echo "export VISIBLE=now" >> /etc/profile
14 |
15 | EXPOSE 22
16 | EXPOSE 50070
17 | EXPOSE 8088
18 | EXPOSE 8000
19 | CMD ["/usr/sbin/sshd", "-D"]
20 |
--------------------------------------------------------------------------------
/week1/wordcount/patches/combiner.patch:
--------------------------------------------------------------------------------
1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java
2 | index 954aaab..39ffb71 100644
3 | --- a/week1/wordcount/src/java/com/example/WordCount.java
4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java
5 | @@ -75,6 +75,7 @@ public class WordCount {
6 | Job job = Job.getInstance(conf, "word count");
7 | job.setJarByClass(WordCount.class);
8 | job.setMapperClass(TokenizerMapper.class);
9 | + job.setCombinerClass(IntSumReducer.class);
10 | job.setReducerClass(IntSumReducer.class);
11 | job.setOutputKeyClass(Text.class);
12 | job.setOutputValueClass(IntWritable.class);
13 |
--------------------------------------------------------------------------------
/homework/week3/sqoop.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/java/org/apache/sqoop/mapreduce/db/TextSplitter.java b/src/java/org/apache/sqoop/mapreduce/db/TextSplitter.java
2 | index d3085cd..54dfac8 100644
3 | --- a/src/java/org/apache/sqoop/mapreduce/db/TextSplitter.java
4 | +++ b/src/java/org/apache/sqoop/mapreduce/db/TextSplitter.java
5 | @@ -156,7 +156,8 @@
6 |      List<String> splitStrings = new ArrayList<String>();
7 |
8 | // Convert the BigDecimal splitPoints into their string representations.
9 | - for (BigDecimal bd : splitPoints) {
10 | +    for (int i=1;i<splitPoints.size();i++) {
14 |
--------------------------------------------------------------------------------
/homework/week1/compress.patch:
--------------------------------------------------------------------------------
1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java
2 | index 954aaab..808e61a 100644
3 | --- a/week1/wordcount/src/java/com/example/WordCount.java
4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java
5 | @@ -67,6 +67,8 @@ public class WordCount {
6 |
7 | public static void main(String[] args) throws Exception {
8 | Configuration conf = new Configuration();
9 | + conf.setBoolean("mapreduce.output.fileoutputformat.compress", true);
10 | + conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
11 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
12 | if (otherArgs.length < 2) {
13 |       System.err.println("Usage: wordcount <in> [<in>...] <out>");
14 |
--------------------------------------------------------------------------------
/homework/week2/bootstrap/src/java/com/example/pig/BootstrapSampleLoader.java:
--------------------------------------------------------------------------------
1 | package com.example.pig;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.commons.math3.distribution.PoissonDistribution;
6 | import org.apache.pig.builtin.PigStorage;
7 | import org.apache.pig.data.Tuple;
8 |
9 | public class BootstrapSampleLoader extends PigStorage
10 | {
11 | PoissonDistribution pd = new PoissonDistribution(1);
12 | Tuple originalTuple;
13 | int remaining = 0;
14 | @Override
15 | public Tuple getNext() throws IOException {
16 | if (remaining > 0) {
17 | remaining --;
18 | return originalTuple;
19 | }
20 |
21 | do {
22 | remaining = pd.sample();
23 | originalTuple = super.getNext();
24 | } while (originalTuple!=null && remaining == 0);
25 | remaining--;
26 | return originalTuple;
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
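
Note: the loader above implements a Poisson bootstrap: rather than materializing a sample with replacement, each input tuple is emitted k times with k ~ Poisson(1), which approximates an n-out-of-n resample in a single streaming pass. A minimal standalone sketch of the same idea, assuming commons-math3 on the classpath (PoissonBootstrapDemo is hypothetical, not part of this repo):

    import org.apache.commons.math3.distribution.PoissonDistribution;

    // Each record is replicated k ~ Poisson(1) times; records that draw k = 0 are
    // skipped, mirroring the do/while loop in BootstrapSampleLoader.getNext().
    public class PoissonBootstrapDemo {
        public static void main(String[] args) {
            PoissonDistribution pd = new PoissonDistribution(1);
            String[] records = {"alice", "bob", "carol", "dave"};
            for (String r : records) {
                int k = pd.sample();
                for (int i = 0; i < k; i++) {
                    System.out.println(r);
                }
            }
        }
    }
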
/druid/sessionize.pig:
--------------------------------------------------------------------------------
1 | register datafu-pig-incubating-1.3.0-SNAPSHOT.jar
2 | DEFINE Sessionize datafu.pig.sessions.Sessionize('30m');
3 |
4 | rmf ooo
5 |
6 | a = LOAD 'sample.txt' AS (ip:chararray, dt:chararray, category:chararray);
7 | b = FILTER a BY ip IS NOT NULL;
8 | c = FOREACH b GENERATE ToDate(dt, 'dd/MMM/yyyy:HH:mm:ss') as dt, ip, category;
9 | d = FOREACH c GENERATE ToMilliSeconds(dt) as ts, dt, ip, category;
10 | e = GROUP d BY (ip, category);
11 | f = FOREACH e {
12 | ordered = ORDER d BY ts;
13 | GENERATE FLATTEN(Sessionize(ordered)) AS (ts,dt,ip,category,sessionId);
14 | }
15 | g = group f by (sessionId, ip, category);
16 | h = foreach g generate group.ip, group.category, MIN(f.dt) as start_time, COUNT(f) as session_count, ((MAX(f.ts) - MIN(f.ts))/ 1000.0/ 60.0) as session_length;
17 | i = foreach h generate ip, category, ToString(start_time, 'dd/MMM/yyyy:HH:mm:ss') as start_time, session_count, session_length;
18 | store i into 'ooo';
19 |
--------------------------------------------------------------------------------
/week1/conf/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <!--
4 |   Licensed under the Apache License, Version 2.0 (the "License");
5 |   you may not use this file except in compliance with the License.
6 |   You may obtain a copy of the License at
7 |
8 |     http://www.apache.org/licenses/LICENSE-2.0
9 |
10 |   Unless required by applicable law or agreed to in writing, software
11 |   distributed under the License is distributed on an "AS IS" BASIS,
12 |   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |   See the License for the specific language governing permissions and
14 |   limitations under the License. See accompanying LICENSE file.
15 | -->
16 |
17 | <!-- Put site-specific property overrides in this file. -->
18 |
19 | <configuration>
20 |   <property>
21 |     <name>fs.defaultFS</name>
22 |     <value>hdfs://localhost:9000</value>
23 |   </property>
24 | </configuration>
25 |
--------------------------------------------------------------------------------
/week2/python/demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | # explicitly import Pig class
4 | from org.apache.pig.scripting import Pig
5 |
6 | # COMPILE: compile method returns a Pig object that represents the pipeline
7 | P = Pig.compile("""a = load '$input' using PigStorage() as (name:chararray, age:int, gpa:double);
8 | a1 = filter a by age > 18;
9 | a2 = foreach a1 generate name, ROUND(gpa) as gpa;
10 | b = load 'votertab10k' using PigStorage() as (name:chararray, age:int, registration:chararray, contributions:double);
11 | c = join a2 by name, b by name;
12 | d = group c by registration;
13 | e = foreach d generate group, AVG(c.gpa) as gpa;
14 | f = order e by gpa desc;
15 | store f into '$output';
16 | """)
17 |
18 | results = P.bind({'input':'studenttab10k', 'output':'output'}).runSingle()
19 |
20 | if not results.isSuccessful():
21 |     raise Exception("Pig job failed")
22 | iter = results.result("f").iterator()
23 | while iter.hasNext():
24 | tuple = iter.next()
25 | print tuple
26 |
--------------------------------------------------------------------------------
/week1/conf/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <!--
3 |   Licensed under the Apache License, Version 2.0 (the "License");
4 |   you may not use this file except in compliance with the License.
5 |   You may obtain a copy of the License at
6 |
7 |     http://www.apache.org/licenses/LICENSE-2.0
8 |
9 |   Unless required by applicable law or agreed to in writing, software
10 |   distributed under the License is distributed on an "AS IS" BASIS,
11 |   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |   See the License for the specific language governing permissions and
13 |   limitations under the License. See accompanying LICENSE file.
14 | -->
15 | <configuration>
16 |
17 |   <!-- Site specific YARN configuration properties -->
18 |   <property>
19 |     <name>yarn.nodemanager.aux-services</name>
20 |     <value>mapreduce_shuffle</value>
21 |   </property>
22 |   <property>
23 |     <name>yarn.log-aggregation-enable</name>
24 |     <value>true</value>
25 |   </property>
26 |
27 | </configuration>
--------------------------------------------------------------------------------
/homework/week2/bootstrap/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |
5 |   <groupId>com.example.pig</groupId>
6 |   <artifactId>bootstrap</artifactId>
7 |   <version>0.0.1-SNAPSHOT</version>
8 |   <name>bootstrap</name>
9 |   <packaging>jar</packaging>
10 |
11 |   <build>
12 |     <sourceDirectory>${basedir}/src/java</sourceDirectory>
13 |   </build>
14 |
15 |   <dependencies>
16 |
17 |     <dependency>
18 |       <groupId>org.apache.pig</groupId>
19 |       <artifactId>pig</artifactId>
20 |       <version>0.16.0</version>
21 |       <classifier>h2</classifier>
22 |     </dependency>
23 |
24 |     <dependency>
25 |       <groupId>org.apache.hadoop</groupId>
26 |       <artifactId>hadoop-common</artifactId>
27 |       <version>2.7.3</version>
28 |     </dependency>
29 |
30 |   </dependencies>
31 |
32 | </project>
33 |
--------------------------------------------------------------------------------
/week4/fixdoccompile.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/docs/src/documentation/content/xdocs/basic.xml b/src/docs/src/documentation/content/xdocs/basic.xml
2 | index a631607..0264089 100644
3 | --- a/src/docs/src/documentation/content/xdocs/basic.xml
4 | +++ b/src/docs/src/documentation/content/xdocs/basic.xml
5 | @@ -5424,7 +5424,7 @@ D = foreach C generate A::y, z; -- Cannot simply refer to y as it can refer to A
6 |         In cases where the schema is stored as part of the StoreFunc like PigStorage, JsonStorage, AvroStorage or OrcStorage,
7 | users generally have to use an extra FOREACH before STORE to rename the field names and remove the disambiguate
8 | operator from the names. To automatically remove the disambiguate operator from the schema for the STORE operation,
9 | - the pig.store.schema.disambiguate Pig property can be set to "false". It is the responsibility of the user
10 | + the pig.store.schema.disambiguate Pig property can be set to "false". It is the responsibility of the user
11 | to make sure that there is no conflict in the field names when using this setting.
12 |
13 |
14 |
--------------------------------------------------------------------------------
/homework/week1/sequencefile.patch:
--------------------------------------------------------------------------------
1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java
2 | index 954aaab..ce90bce 100644
3 | --- a/week1/wordcount/src/java/com/example/WordCount.java
4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java
5 | @@ -29,6 +29,7 @@ import org.apache.hadoop.mapreduce.Mapper;
6 | import org.apache.hadoop.mapreduce.Reducer;
7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
9 | +import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
10 | import org.apache.hadoop.util.GenericOptionsParser;
11 |
12 | public class WordCount {
13 | @@ -78,6 +79,7 @@ public class WordCount {
14 | job.setReducerClass(IntSumReducer.class);
15 | job.setOutputKeyClass(Text.class);
16 | job.setOutputValueClass(IntWritable.class);
17 | + job.setOutputFormatClass(SequenceFileOutputFormat.class);
18 | for (int i = 0; i < otherArgs.length - 1; ++i) {
19 | FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
20 | }
21 |
--------------------------------------------------------------------------------
/week3/hive/evalfunc/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |
5 |   <groupId>com.example.hive</groupId>
6 |   <artifactId>evalfunc</artifactId>
7 |   <version>0.0.1-SNAPSHOT</version>
8 |   <name>citylookup</name>
9 |   <packaging>jar</packaging>
10 |
11 |   <properties>
12 |     <hadoop.version>2.7.3</hadoop.version>
13 |   </properties>
14 |
15 |   <build>
16 |     <sourceDirectory>${basedir}/src/java</sourceDirectory>
17 |   </build>
18 |
19 |   <dependencies>
20 |
21 |     <dependency>
22 |       <groupId>org.apache.hive</groupId>
23 |       <artifactId>hive-exec</artifactId>
24 |       <version>1.2.1</version>
25 |     </dependency>
26 |
27 |     <dependency>
28 |       <groupId>org.apache.hadoop</groupId>
29 |       <artifactId>hadoop-common</artifactId>
30 |       <version>2.7.3</version>
31 |     </dependency>
32 |
33 |   </dependencies>
34 |
35 | </project>
--------------------------------------------------------------------------------
/week3/oozie/workflow.xml:
--------------------------------------------------------------------------------
1 | <workflow-app xmlns="uri:oozie:workflow:0.2" name="sqoop-wf">
2 |     <start to="sqoop-node"/>
3 |     <action name="sqoop-node">
4 |         <sqoop xmlns="uri:oozie:sqoop-action:0.2">
5 |             <job-tracker>${jobTracker}</job-tracker>
6 |             <name-node>${nameNode}</name-node>
7 |             <configuration>
8 |                 <property>
9 |                     <name>mapred.job.queue.name</name>
10 |                     <value>${queueName}</value>
11 |                 </property>
12 |             </configuration>
13 |             <command>import --connect jdbc:mysql://localhost/cs502 --username hadoop --password hadoop --table student --hive-import --hive-home /home/hadoop/apache-hive-1.2.1-bin --create-hive-table --hive-table student --m 2 --split-by age</command>
14 |         </sqoop>
15 |         <ok to="end"/>
16 |         <error to="fail"/>
17 |     </action>
18 |     <kill name="fail">
19 |         <message>Sqoop failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
20 |     </kill>
21 |     <end name="end"/>
22 | </workflow-app>
23 |
--------------------------------------------------------------------------------
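
Note: assuming a default local Oozie install, the workflow above would be submitted with the standard client, e.g.
oozie job -oozie http://localhost:11000/oozie -config job.properties -run
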
/week2/pigserver/log4j.properties:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | # ***** Set the org.apache.pig logger level to INFO and its only appender to A.
19 | log4j.logger.org.apache.pig=info, A
20 |
21 | # ***** A is set to be a ConsoleAppender.
22 | log4j.appender.A=org.apache.log4j.ConsoleAppender
23 | # ***** A uses PatternLayout.
24 | log4j.appender.A.layout=org.apache.log4j.PatternLayout
25 | log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
26 |
--------------------------------------------------------------------------------
/week1/conf/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <!--
4 |   Licensed under the Apache License, Version 2.0 (the "License");
5 |   you may not use this file except in compliance with the License.
6 |   You may obtain a copy of the License at
7 |
8 |     http://www.apache.org/licenses/LICENSE-2.0
9 |
10 |   Unless required by applicable law or agreed to in writing, software
11 |   distributed under the License is distributed on an "AS IS" BASIS,
12 |   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |   See the License for the specific language governing permissions and
14 |   limitations under the License. See accompanying LICENSE file.
15 | -->
16 |
17 | <!-- Put site-specific property overrides in this file. -->
18 |
19 | <configuration>
20 |   <property>
21 |     <name>dfs.replication</name>
22 |     <value>1</value>
23 |   </property>
24 |   <property>
25 |     <name>dfs.name.dir</name>
26 |     <value>/home/hadoop/hadoop-2.7.3/data/name</value>
27 |   </property>
28 |   <property>
29 |     <name>dfs.data.dir</name>
30 |     <value>/home/hadoop/hadoop-2.7.3/data/data</value>
31 |   </property>
32 | </configuration>
33 |
--------------------------------------------------------------------------------
/week2/evalfunc/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |
5 |   <groupId>com.example</groupId>
6 |   <artifactId>citylookup</artifactId>
7 |   <version>0.0.1-SNAPSHOT</version>
8 |   <name>citylookup</name>
9 |   <packaging>jar</packaging>
10 |
11 |   <build>
12 |     <sourceDirectory>${basedir}/src/java</sourceDirectory>
13 |   </build>
14 |
15 |   <dependencies>
16 |
17 |     <dependency>
18 |       <groupId>org.apache.pig</groupId>
19 |       <artifactId>pig</artifactId>
20 |       <version>0.16.0</version>
21 |       <classifier>h2</classifier>
22 |     </dependency>
23 |
24 |     <dependency>
25 |       <groupId>org.apache.hadoop</groupId>
26 |       <artifactId>hadoop-common</artifactId>
27 |       <version>2.7.3</version>
28 |     </dependency>
29 |
30 |     <dependency>
31 |       <groupId>com.maxmind.geoip</groupId>
32 |       <artifactId>geoip-api</artifactId>
33 |       <version>1.3.1</version>
34 |     </dependency>
35 |
36 |   </dependencies>
37 |
38 | </project>
--------------------------------------------------------------------------------
/week1/wordcount/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |
5 |   <groupId>com.example</groupId>
6 |   <artifactId>wordcount</artifactId>
7 |   <version>0.0.1-SNAPSHOT</version>
8 |   <name>wordcount</name>
9 |   <packaging>jar</packaging>
10 |
11 |   <build>
12 |     <sourceDirectory>${basedir}/src/java</sourceDirectory>
13 |   </build>
14 |
15 |   <dependencies>
16 |
17 |     <dependency>
18 |       <groupId>org.apache.hadoop</groupId>
19 |       <artifactId>hadoop-hdfs</artifactId>
20 |       <version>2.7.3</version>
21 |     </dependency>
22 |
23 |     <dependency>
24 |       <groupId>org.apache.hadoop</groupId>
25 |       <artifactId>hadoop-common</artifactId>
26 |       <version>2.7.3</version>
27 |     </dependency>
28 |
29 |     <dependency>
30 |       <groupId>org.apache.hadoop</groupId>
31 |       <artifactId>hadoop-mapreduce-client-core</artifactId>
32 |       <version>2.7.3</version>
33 |     </dependency>
34 |
35 |   </dependencies>
36 |
37 | </project>
38 |
--------------------------------------------------------------------------------
/week2/pigserver/src/java/com/example/pig/TestPigServer.java:
--------------------------------------------------------------------------------
1 | package com.example.pig;
2 |
3 | import java.io.IOException;
4 | import java.util.Iterator;
5 |
6 | import org.apache.pig.PigServer;
7 | import org.apache.pig.data.Tuple;
8 |
9 | public class TestPigServer {
10 | static public void main(String[] args) throws IOException {
11 |
12 | PigServer pigServer = new PigServer("local");
13 |
14 | pigServer.registerQuery("a = load 'studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);");
15 |
16 | pigServer.registerQuery("a1 = filter a by age > 18;");
17 |
18 | pigServer.registerQuery("a2 = foreach a1 generate name, ROUND(gpa) as gpa;");
19 |
20 | pigServer.registerQuery("b = load 'votertab10k' using PigStorage() as (name:chararray, age:int, registration:chararray, contributions:double);");
21 |
22 | pigServer.registerQuery("c = join a2 by name, b by name;");
23 |
24 | pigServer.registerQuery("d = group c by registration;");
25 |
26 | pigServer.registerQuery("e = foreach d generate group, AVG(c.gpa) as gpa;");
27 |
28 | pigServer.registerQuery("f = order e by gpa desc;");
29 |
30 | Iterator<Tuple> iter = pigServer.openIterator("f");
31 |
32 | while (iter.hasNext()) {
33 | System.out.println(iter.next());
34 | }
35 |
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/week1/wordcount/patches/config.patches:
--------------------------------------------------------------------------------
1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java
2 | index 6d47026..6af3784 100644
3 | --- a/week1/wordcount/src/java/com/example/WordCount.java
4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java
5 | @@ -36,15 +36,18 @@ public class WordCount {
6 | public static class TokenizerMapper
7 |        extends Mapper<Object, Text, Text, IntWritable>{