├── homework
├── week1
│   ├── dict.txt
│   ├── compress.patch
│   ├── sequencefile.patch
│   ├── partitioner.patch
│   └── dict.patch
├── week2
│   ├── bootstrap
│   │   ├── test.pig
│   │   ├── src
│   │   │   └── java
│   │   │   │   └── com
│   │   │   │   └── example
│   │   │   │   └── pig
│   │   │   │   └── BootstrapSampleLoader.java
│   │   └── pom.xml
│   └── extract_time.pig
├── profile_notes.txt
└── week3
│   └── sqoop.patch
├── week3
├── oozie
│   ├── cleanup.sql
│   ├── aggr.pig
│   ├── job.properties
│   ├── job-all.properties
│   ├── hive-config.xml
│   ├── workflow.xml
│   └── workflow-all.xml
└── hive
│   ├── evalfunc
│   ├── src
│   │   └── java
│   │   │   └── com
│   │   │   └── example
│   │   │   └── hive
│   │   │   └── evalfunc
│   │   │   └── Hello.java
│   └── pom.xml
│   └── tpcds
│   ├── tpcds.patch
│   ├── upload.sh
│   ├── insert.sql
│   ├── all-tables-orc.sql
│   ├── all-tables.sql
│   └── all-tables-base.sql
├── week1
├── conf
│   ├── mapred-site.xml
│   ├── core-site.xml
│   ├── yarn-site.xml
│   └── hdfs-site.xml
├── docker
│   └── Dockerfile
├── wordcount
│   ├── patches
│   │   ├── combiner.patch
│   │   ├── nummapreduce.patch
│   │   ├── config.patches
│   │   ├── counters.patch
│   │   └── distributedcache.patch
│   ├── pom.xml
│   └── src
│   │   └── java
│   │   └── com
│   │   └── example
│   │   └── WordCount.java
└── googleplaycrawler
│   ├── fixskew.patch
│   └── googleplaycrawler.patch
├── week2
├── loadfunc
│   ├── loadgoogle.pig
│   ├── pom.xml
│   └── src
│   │   └── java
│   │   └── com
│   │   └── example
│   │   └── NutchParsedDataLoader.java
├── python
│   ├── demo.py
│   └── kmeans.py
├── pigserver
│   ├── log4j.properties
│   ├── src
│   │   └── java
│   │   │   └── com
│   │   │   └── example
│   │   │   └── pig
│   │   │   └── TestPigServer.java
│   └── pom.xml
└── evalfunc
│   ├── pom.xml
│   ├── src
│   └── java
│   │   └── com
│   │   └── example
│   │   └── pig
│   │   └── GetCountry.java
│   └── patches
│   └── country_city.patch
├── druid
├── topn.json
├── sessionize.pig
└── cloudacl-index.json
├── week4
├── fixdoccompile.patch
├── PIG-3399-2.patch
├── doc.patch
├── jobname.patch
└── set.patch
└── capstone
└── track1
└── data_description.txt
/homework/week1/dict.txt:
--------------------------------------------------------------------------------
1 | 我 I
2 | 爱 love
3 |
--------------------------------------------------------------------------------
/week3/oozie/cleanup.sql:
--------------------------------------------------------------------------------
1 | drop table if exists student;
2 |
--------------------------------------------------------------------------------
/homework/week2/bootstrap/test.pig:
--------------------------------------------------------------------------------
1 | register target/bootstrap-0.0.1-SNAPSHOT.jar
2 |
3 | a = load 'studenttab10k' using com.example.pig.BootstrapSampleLoader();
4 | dump a;
5 |
--------------------------------------------------------------------------------
/week1/conf/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>mapreduce.framework.name</name>
4 |     <value>yarn</value>
5 |   </property>
6 | </configuration>
7 |
--------------------------------------------------------------------------------
/week3/oozie/aggr.pig:
--------------------------------------------------------------------------------
1 | A = load 'student' using org.apache.hive.hcatalog.pig.HCatLoader();
2 | B = group A by name;
3 | C = foreach B generate group as name, AVG(A.gpa) as gpa;
4 | store C into '$OUTPUT' USING PigStorage();
5 |
--------------------------------------------------------------------------------
/week3/oozie/job.properties:
--------------------------------------------------------------------------------
1 | nameNode=hdfs://localhost:9000
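# jobTracker below is the YARN ResourceManager address (port 8032 on Hadoop 2); Oozie submits its launcher jobs there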
2 | jobTracker=localhost:8032
3 | queueName=default
4 |
5 | oozie.use.system.libpath=true
6 | oozie.wf.application.path=${nameNode}/user/${user.name}/oozie/apps/workflow.xml
7 |
--------------------------------------------------------------------------------
/week3/oozie/job-all.properties:
--------------------------------------------------------------------------------
1 | nameNode=hdfs://localhost:9000
2 | jobTracker=localhost:8032
3 | queueName=default
4 |
5 | oozie.use.system.libpath=true
6 | oozie.wf.application.path=${nameNode}/user/${user.name}/oozie/apps/workflow-all.xml
7 |
--------------------------------------------------------------------------------
/week3/oozie/hive-config.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>javax.jdo.option.ConnectionURL</name>
4 |     <value>jdbc:derby:;databaseName=/home/hadoop/apache-hive-1.2.1-bin/metastore_db;create=true</value>
5 |     <description>JDBC connect string for a JDBC metastore</description>
6 |   </property>
7 | </configuration>
8 |
--------------------------------------------------------------------------------
/week3/hive/evalfunc/src/java/com/example/hive/evalfunc/Hello.java:
--------------------------------------------------------------------------------
1 | package com.example.hive.evalfunc;
2 |
3 | import org.apache.hadoop.hive.ql.exec.UDF;
4 | import org.apache.hadoop.io.Text;
5 |
6 | public class Hello extends UDF {
7 |   public Text evaluate(Text input) {
8 |     return new Text("Hello " + input.toString());
9 |   }
10 | }
11 |
--------------------------------------------------------------------------------
/homework/week2/extract_time.pig:
--------------------------------------------------------------------------------
1 | a = LOAD 'access_logs' AS (line:chararray);
2 | b = FOREACH a GENERATE flatten(REGEX_EXTRACT_ALL(line, '(.*?) .*?\\[(.*?)\\].*')) as (ip:chararray, dt:chararray);
3 | c = FOREACH b GENERATE ip, ToDate(dt, 'yyyy-MM-dd HH:mm:ss.SSSSSS') as dt;
4 | d = FOREACH c GENERATE ip, GetYear(dt), GetMonth(dt), GetDay(dt), GetHour(dt), GetMinute(dt), GetSecond(dt);
5 | dump d;
6 |
--------------------------------------------------------------------------------
/week2/loadfunc/loadgoogle.pig:
--------------------------------------------------------------------------------
1 | register target/nutchdbloader-0.0.1-SNAPSHOT.jar
2 | register /home/hadoop/hadoop-2.7.3/share/hadoop/tools/lib/hadoop-aws-2.7.3.jar
3 | register nutch-1.12.jar
4 |
5 | rmf output
6 | loaded = load 's3n://daijytest/nutchdb/segments/*/parse_data/part-*/data' using com.example.NutchParsedDataLoader();
7 | filtered = filter loaded by $0 is not null;
8 | store filtered into 'output';
9 |
--------------------------------------------------------------------------------
/druid/topn.json:
--------------------------------------------------------------------------------
1 | {
2 |   "queryType": "topN",
3 |   "dataSource": "cloudacl_accesslog",
4 |   "dimension": "country_code",
5 |   "threshold": 5,
6 |   "metric": "count",
7 |   "granularity": "all",
8 |   "aggregations": [
9 |     {
10 |       "type": "longSum",
11 |       "name": "count",
12 |       "fieldName": "count"
13 |     }
14 |   ],
15 |   "intervals": [
16 |     "2017-03-05T00:00:00.000/2017-03-12T00:00:00.000"
17 |   ]
18 | }
19 |
--------------------------------------------------------------------------------
/homework/profile_notes.txt:
--------------------------------------------------------------------------------
1 | YourKit CPU profile:
2 | export PIG_OPTS="-agentpath:/home/hadoop/yjp-2016.02/bin/linux-x86-64/libyjpagent.so=onexit=snapshot,sampling,dir=/tmp"
3 |
4 | Java memory dump:
5 | jmap -dump:file=<dump file> <pid>
6 |
7 | Java memory dump upon OOM:
8 | export PIG_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp"
9 |
10 | MAT reading:
11 | http://eclipsesource.com/blogs/2013/01/21/10-tips-for-using-the-eclipse-memory-analyzer/
12 |
--------------------------------------------------------------------------------
/week3/hive/tpcds/tpcds.patch:
--------------------------------------------------------------------------------
1 | diff --git a/query_templates/netezza.tpl b/query_templates/netezza.tpl
2 | index 75488d2..0ff3ce1 100755
3 | --- a/query_templates/netezza.tpl
4 | +++ b/query_templates/netezza.tpl
5 | @@ -35,3 +35,5 @@
6 |  define __LIMITA = "";
7 |  define __LIMITB = "";
8 |  define __LIMITC = "limit %d";
9 | +define _BEGIN = "-- start query " + [_QUERY] + " in stream " + [_STREAM] + " using template " + [_TEMPLATE];
10 | +define _END = "-- end query " + [_QUERY] + " in stream " + [_STREAM] + " using template " + [_TEMPLATE];
11 |
--------------------------------------------------------------------------------
/week1/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && apt-get install -y openssh-server
4 | RUN apt-get install -y vim
5 | RUN mkdir /var/run/sshd
6 | RUN echo 'root:hadoop' | chpasswd
7 | RUN sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
8 |
9 | # SSH login fix. Otherwise user is kicked off after login
10 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
11 |
12 | ENV NOTVISIBLE "in users profile"
13 | RUN echo "export VISIBLE=now" >> /etc/profile
14 |
15 | EXPOSE 22
16 | EXPOSE 50070
17 | EXPOSE 8088
18 | EXPOSE 8000
19 | CMD ["/usr/sbin/sshd", "-D"]
20 |
--------------------------------------------------------------------------------
/week1/wordcount/patches/combiner.patch:
--------------------------------------------------------------------------------
1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java
2 | index 954aaab..39ffb71 100644
3 | --- a/week1/wordcount/src/java/com/example/WordCount.java
4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java
5 | @@ -75,6 +75,7 @@ public class WordCount {
6 |      Job job = Job.getInstance(conf, "word count");
7 |      job.setJarByClass(WordCount.class);
8 |      job.setMapperClass(TokenizerMapper.class);
9 | +    job.setCombinerClass(IntSumReducer.class);
10 |     job.setReducerClass(IntSumReducer.class);
11 |     job.setOutputKeyClass(Text.class);
12 |     job.setOutputValueClass(IntWritable.class);
13 |
--------------------------------------------------------------------------------
/homework/week3/sqoop.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/java/org/apache/sqoop/mapreduce/db/TextSplitter.java b/src/java/org/apache/sqoop/mapreduce/db/TextSplitter.java
2 | index d3085cd..54dfac8 100644
3 | --- a/src/java/org/apache/sqoop/mapreduce/db/TextSplitter.java
4 | +++ b/src/java/org/apache/sqoop/mapreduce/db/TextSplitter.java
5 | @@ -156,7 +156,8 @@
6 |     List<String> splitStrings = new ArrayList<String>();
7 |
8 |     // Convert the BigDecimal splitPoints into their string representations.
9 | -   for (BigDecimal bd : splitPoints) {
10 | +  for (int i=1;i [...] ");
14 |
--------------------------------------------------------------------------------
/homework/week1/compress.patch:
--------------------------------------------------------------------------------
1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java
2 | index 954aaab..808e61a 100644
3 | --- a/week1/wordcount/src/java/com/example/WordCount.java
4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java
5 | @@ -67,6 +67,8 @@ public class WordCount {
6 |
7 |    public static void main(String[] args) throws Exception {
8 |      Configuration conf = new Configuration();
9 | +    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true);
10 | +   conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
11 |    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
12 |    if (otherArgs.length < 2) {
13 |      System.err.println("Usage: wordcount <in> [<in>...] <out>");
14 |
--------------------------------------------------------------------------------
/homework/week2/bootstrap/src/java/com/example/pig/BootstrapSampleLoader.java:
--------------------------------------------------------------------------------
1 | package com.example.pig;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.commons.math3.distribution.PoissonDistribution;
6 | import org.apache.pig.builtin.PigStorage;
7 | import org.apache.pig.data.Tuple;
8 |
9 | public class BootstrapSampleLoader extends PigStorage
10 | {
11 |   PoissonDistribution pd = new PoissonDistribution(1);
12 |   Tuple originalTuple;
13 |   int remaining = 0;
14 |   @Override
15 |   public Tuple getNext() throws IOException {
16 |     if (remaining > 0) {
17 |       remaining --;
18 |       return originalTuple;
19 |     }
20 |
21 |     do {
22 |       remaining = pd.sample();
23 |       originalTuple = super.getNext();
24 |     } while (originalTuple!=null && remaining == 0);
25 |     remaining--;
26 |     return originalTuple;
27 |   }
28 | }
29 |
--------------------------------------------------------------------------------
/druid/sessionize.pig:
--------------------------------------------------------------------------------
1 | register datafu-pig-incubating-1.3.0-SNAPSHOT.jar
2 | DEFINE Sessionize datafu.pig.sessions.Sessionize('30m');
3 |
4 | rmf ooo
5 |
6 | a = LOAD 'sample.txt' AS (ip:chararray, dt:chararray, category:chararray);
7 | b = FILTER a BY ip IS NOT NULL;
8 | c = FOREACH b GENERATE ToDate(dt, 'dd/MMM/yyyy:HH:mm:ss') as dt, ip, category;
9 | d = FOREACH c GENERATE ToMilliSeconds(dt) as ts, dt, ip, category;
10 | e = GROUP d BY (ip, category);
11 | f = FOREACH e {
12 |   ordered = ORDER d BY ts;
13 |   GENERATE FLATTEN(Sessionize(ordered)) AS (ts,dt,ip,category,sessionId);
14 | }
15 | g = group f by (sessionId, ip, category);
16 | h = foreach g generate group.ip, group.category, MIN(f.dt) as start_time, COUNT(f) as session_count, ((MAX(f.ts) - MIN(f.ts))/ 1000.0/ 60.0) as session_length;
17 | i = foreach h generate ip, category, ToString(start_time, 'dd/MMM/yyyy:HH:mm:ss') as start_time, session_count, session_length;
18 | store i into 'ooo';
19 |
--------------------------------------------------------------------------------
/week1/conf/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <!--
4 |   Licensed under the Apache License, Version 2.0 (the "License");
5 |   you may not use this file except in compliance with the License.
6 |   You may obtain a copy of the License at
7 |
8 |     http://www.apache.org/licenses/LICENSE-2.0
9 |
10 |   Unless required by applicable law or agreed to in writing, software
11 |   distributed under the License is distributed on an "AS IS" BASIS,
12 |   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |   See the License for the specific language governing permissions and
14 |   limitations under the License. See accompanying LICENSE file.
15 | -->
16 |
17 | <!-- Put site-specific property overrides in this file. -->
18 |
19 | <configuration>
20 |   <property>
21 |     <name>fs.defaultFS</name>
22 |     <value>hdfs://localhost:9000</value>
23 |   </property>
24 | </configuration>
25 |
--------------------------------------------------------------------------------
/week2/python/demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | # explicitly import Pig class
4 | from org.apache.pig.scripting import Pig
5 |
6 | # COMPILE: compile method returns a Pig object that represents the pipeline
7 | P = Pig.compile("""a = load '$input' using PigStorage() as (name:chararray, age:int, gpa:double);
8 | a1 = filter a by age > 18;
9 | a2 = foreach a1 generate name, ROUND(gpa) as gpa;
10 | b = load 'votertab10k' using PigStorage() as (name:chararray, age:int, registration:chararray, contributions:double);
11 | c = join a2 by name, b by name;
12 | d = group c by registration;
13 | e = foreach d generate group, AVG(c.gpa) as gpa;
14 | f = order e by gpa desc;
15 | store f into '$output';
16 | """)
17 |
18 | results = P.bind({'input':'studenttab10k', 'output':'output'}).runSingle()
19 |
20 | if not results.isSuccessful():
21 |     raise Exception("Pig job failed")
22 | iter = results.result("f").iterator()
23 | while iter.hasNext():
24 |     tuple = iter.next()
25 |     print tuple
26 |
--------------------------------------------------------------------------------
/week1/conf/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <!--
3 |   Licensed under the Apache License, Version 2.0 (the "License");
4 |   you may not use this file except in compliance with the License.
5 |   You may obtain a copy of the License at
6 |
7 |     http://www.apache.org/licenses/LICENSE-2.0
8 |
9 |   Unless required by applicable law or agreed to in writing, software
10 |   distributed under the License is distributed on an "AS IS" BASIS,
11 |   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |   See the License for the specific language governing permissions and
13 |   limitations under the License. See accompanying LICENSE file.
14 | -->
15 | <configuration>
16 |
17 | <!-- Site specific YARN configuration properties -->
18 |   <property>
19 |     <name>yarn.nodemanager.aux-services</name>
20 |     <value>mapreduce_shuffle</value>
21 |   </property>
22 |   <property>
23 |     <name>yarn.log-aggregation-enable</name>
24 |     <value>true</value>
25 |   </property>
26 | </configuration>
27 |
--------------------------------------------------------------------------------
/homework/week2/bootstrap/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |
5 |   <groupId>com.example.pig</groupId>
6 |   <artifactId>bootstrap</artifactId>
7 |   <version>0.0.1-SNAPSHOT</version>
8 |   <name>bootstrap</name>
9 |   <packaging>jar</packaging>
10 |
11 |   <build>
12 |     <sourceDirectory>${basedir}/src/java</sourceDirectory>
13 |   </build>
14 |
15 |   <dependencies>
16 |
17 |     <dependency>
18 |       <groupId>org.apache.pig</groupId>
19 |       <artifactId>pig</artifactId>
20 |       <version>0.16.0</version>
21 |       <classifier>h2</classifier>
22 |     </dependency>
23 |
24 |     <dependency>
25 |       <groupId>org.apache.hadoop</groupId>
26 |       <artifactId>hadoop-common</artifactId>
27 |       <version>2.7.3</version>
28 |     </dependency>
29 |
30 |   </dependencies>
31 |
32 | </project>
33 |
--------------------------------------------------------------------------------
/week4/fixdoccompile.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/docs/src/documentation/content/xdocs/basic.xml b/src/docs/src/documentation/content/xdocs/basic.xml
2 | index a631607..0264089 100644
3 | --- a/src/docs/src/documentation/content/xdocs/basic.xml
4 | +++ b/src/docs/src/documentation/content/xdocs/basic.xml
5 | @@ -5424,7 +5424,7 @@
6 |

In cases where the schema is stored as part of the StoreFunc like PigStorage, JsonStorage, AvroStorage or OrcStorage, 7 | users generally have to use an extra FOREACH before STORE to rename the field names and remove the disambiguate 8 | operator from the names. To automatically remove the disambiguate operator from the schema for the STORE operation, 9 | - the pig.store.schema.disambiguate Pig property can be set to "false". It is the responsibility of the user 10 | + the pig.store.schema.disambiguate Pig property can be set to "false". It is the responsibility of the user 11 | to make sure that there is no conflict in the field names when using this setting. 12 |

13 | 14 | -------------------------------------------------------------------------------- /homework/week1/sequencefile.patch: -------------------------------------------------------------------------------- 1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java 2 | index 954aaab..ce90bce 100644 3 | --- a/week1/wordcount/src/java/com/example/WordCount.java 4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java 5 | @@ -29,6 +29,7 @@ import org.apache.hadoop.mapreduce.Mapper; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | +import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 10 | import org.apache.hadoop.util.GenericOptionsParser; 11 | 12 | public class WordCount { 13 | @@ -78,6 +79,7 @@ public class WordCount { 14 | job.setReducerClass(IntSumReducer.class); 15 | job.setOutputKeyClass(Text.class); 16 | job.setOutputValueClass(IntWritable.class); 17 | + job.setOutputFormatClass(SequenceFileOutputFormat.class); 18 | for (int i = 0; i < otherArgs.length - 1; ++i) { 19 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 20 | } 21 | -------------------------------------------------------------------------------- /week3/hive/evalfunc/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.example.hive 6 | evalfunc 7 | 0.0.1-SNAPSHOT 8 | citylookup 9 | jar 10 | 11 | 12 | 2.7.3 13 | 14 | 15 | 16 | ${basedir}/src/java 17 | 18 | 19 | 20 | 21 | 22 | org.apache.hive 23 | hive-exec 24 | 1.2.1 25 | 26 | 27 | 28 | org.apache.hadoop 29 | hadoop-common 30 | 2.7.3 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /week3/oozie/workflow.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ${jobTracker} 6 | ${nameNode} 7 | 8 | 9 | 10 | 11 | 12 | mapred.job.queue.name 13 | ${queueName} 14 | 15 | 16 | import --connect jdbc:mysql://localhost/cs502 --username hadoop --password hadoop --table student --hive-import --hive-home /home/hadoop/apache-hive-1.2.1-bin --create-hive-table --hive-table student --m 2 --split-by age 17 | 18 | 19 | 20 | 21 | 22 | Sqoop failed, error message[${wf:errorMessage(wf:lastErrorNode())}] 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /week2/pigserver/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | # ***** Set root logger level to DEBUG and its only appender to A. 19 | log4j.logger.org.apache.pig=info, A 20 | 21 | # ***** A is set to be a ConsoleAppender. 22 | log4j.appender.A=org.apache.log4j.ConsoleAppender 23 | # ***** A uses PatternLayout. 24 | log4j.appender.A.layout=org.apache.log4j.PatternLayout 25 | log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 26 | -------------------------------------------------------------------------------- /week1/conf/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | dfs.replication 22 | 1 23 | 24 | 25 | dfs.name.dir 26 | /home/hadoop/hadoop-2.7.3/data/name 27 | 28 | 29 | dfs.data.dir 30 | /home/hadoop/hadoop-2.7.3/data/data 31 | 32 | 33 | -------------------------------------------------------------------------------- /week2/evalfunc/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.example 6 | citylookup 7 | 0.0.1-SNAPSHOT 8 | citylookup 9 | jar 10 | 11 | 12 | ${basedir}/src/java 13 | 14 | 15 | 16 | 17 | 18 | org.apache.pig 19 | pig 20 | 0.16.0 21 | h2 22 | 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-common 27 | 2.7.3 28 | 29 | 30 | 31 | com.maxmind.geoip 32 | geoip-api 33 | 1.3.1 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /week1/wordcount/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.example 6 | wordcount 7 | 0.0.1-SNAPSHOT 8 | wordcount 9 | jar 10 | 11 | 12 | ${basedir}/src/java 13 | 14 | 15 | 16 | 17 | 18 | 19 | org.apache.hadoop 20 | hadoop-hdfs 21 | 2.7.3 22 | 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-common 27 | 2.7.3 28 | 29 | 30 | 31 | org.apache.hadoop 32 | hadoop-mapreduce-client-core 33 | 2.7.3 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /week2/pigserver/src/java/com/example/pig/TestPigServer.java: -------------------------------------------------------------------------------- 1 | package com.example.pig; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.pig.PigServer; 7 | import org.apache.pig.data.Tuple; 8 | 9 | public class TestPigServer { 10 | static public void main(String[] args) throws IOException { 11 | 12 | PigServer pigServer = new PigServer("local"); 13 | 14 | pigServer.registerQuery("a = load 'studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);"); 15 | 16 | pigServer.registerQuery("a1 = filter a by age > 18;"); 17 | 18 | pigServer.registerQuery("a2 = foreach a1 generate name, ROUND(gpa) as gpa;"); 19 | 20 | pigServer.registerQuery("b = load 'votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);"); 21 | 22 | pigServer.registerQuery("c = join a2 by name, b by name;"); 23 | 24 | pigServer.registerQuery("d = group c by registration;"); 25 | 26 | pigServer.registerQuery("e = foreach d generate group, AVG(c.gpa) as gpa;"); 27 | 28 | pigServer.registerQuery("f = order e by gpa desc;"); 29 | 30 | Iterator iter = pigServer.openIterator("f"); 31 | 32 | while (iter.hasNext()) { 33 | System.out.println(iter.next()); 34 | } 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /week1/wordcount/patches/config.patches: -------------------------------------------------------------------------------- 
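A minimal, self-contained sketch of the technique this patch applies (class and property names here are illustrative, not part of the repo): the driver publishes a value through the job Configuration, and each mapper reads it back once in setup() instead of hard-coding a constant.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MultiplierMapper extends Mapper<Object, Text, Text, IntWritable> {
    private int multiplier;

    @Override
    protected void setup(Context context) {
        // Read the driver-supplied value once per task attempt; -1 marks "unset".
        multiplier = context.getConfiguration().getInt("multiplier", -1);
    }

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Every emitted count carries the configured weight.
        context.write(value, new IntWritable(multiplier));
    }
}
// Driver side, before Job.getInstance(conf, "word count"):
//     conf.setInt("multiplier", 2);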
1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java 2 | index 6d47026..6af3784 100644 3 | --- a/week1/wordcount/src/java/com/example/WordCount.java 4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java 5 | @@ -36,15 +36,18 @@ public class WordCount { 6 | public static class TokenizerMapper 7 | extends Mapper{ 8 | 9 | - private final static IntWritable one = new IntWritable(1); 10 | private Text word = new Text(); 11 | + int multiplier = -1; 12 | 13 | public void map(Object key, Text value, Context context 14 | ) throws IOException, InterruptedException { 15 | + if (multiplier == -1) { 16 | + multiplier = context.getConfiguration().getInt("multiplier", -1); 17 | + } 18 | StringTokenizer itr = new StringTokenizer(value.toString()); 19 | while (itr.hasMoreTokens()) { 20 | word.set(itr.nextToken()); 21 | - context.write(word, one); 22 | + context.write(word, new IntWritable(multiplier)); 23 | } 24 | } 25 | } 26 | @@ -72,6 +75,7 @@ public class WordCount { 27 | System.err.println("Usage: wordcount [...] "); 28 | System.exit(2); 29 | } 30 | + conf.setInt("multiplier", 2); 31 | Job job = Job.getInstance(conf, "word count"); 32 | job.setJarByClass(WordCount.class); 33 | job.setMapperClass(TokenizerMapper.class); 34 | -------------------------------------------------------------------------------- /week2/loadfunc/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.example 6 | nutchdbloader 7 | 0.0.1-SNAPSHOT 8 | nutchdbloader 9 | jar 10 | 11 | 12 | ${basedir}/src/java 13 | 14 | 15 | 16 | 17 | 18 | org.apache.pig 19 | pig 20 | 0.16.0 21 | h2 22 | 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-common 27 | 2.7.3 28 | 29 | 30 | 31 | org.apache.hadoop 32 | hadoop-mapreduce-client-core 33 | 2.7.3 34 | 35 | 36 | 37 | org.apache.nutch 38 | nutch 39 | 1.12 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /druid/cloudacl-index.json: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "index_hadoop", 3 | "spec" : { 4 | "ioConfig" : { 5 | "type" : "hadoop", 6 | "inputSpec" : { 7 | "type" : "static", 8 | "paths" : "file:///Users/daijy/capstone/cloudacl.txt" 9 | } 10 | }, 11 | "dataSchema" : { 12 | "dataSource" : "cloudacl_accesslog", 13 | "granularitySpec" : { 14 | "type" : "uniform", 15 | "segmentGranularity" : "day", 16 | "queryGranularity" : "none", 17 | "intervals" : ["2017-03-05/2017-03-11"] 18 | }, 19 | "parser" : { 20 | "type" : "hadoopyString", 21 | "parseSpec" : { 22 | "format" : "tsv", 23 | "columns" : [ 24 | "country_code", 25 | "country", 26 | "city", 27 | "timestamp", 28 | "category" 29 | ], 30 | "dimensionsSpec" : { 31 | "dimensions" : [ 32 | "country_code", 33 | "city", 34 | "category" 35 | ] 36 | }, 37 | "timestampSpec" : { 38 | "format": "dd/MMM/yyyy:HH:mm:ss", 39 | "column" : "timestamp" 40 | } 41 | } 42 | }, 43 | "metricsSpec" : [ 44 | { 45 | "name" : "count", 46 | "type" : "count" 47 | } 48 | ] 49 | }, 50 | "tuningConfig" : { 51 | "type" : "hadoop", 52 | "partitionsSpec" : { 53 | "type" : "hashed", 54 | "targetPartitionSize" : 5000000 55 | }, 56 | "jobProperties" : {} 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /week2/evalfunc/src/java/com/example/pig/GetCountry.java: -------------------------------------------------------------------------------- 1 | package com.example.pig; 2 | 3 | 
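// UDF lifecycle notes: exec() runs once per input tuple, so the MaxMind
// LookupService is opened lazily on first use rather than in the constructor,
// which also runs on the Pig front end where GeoLiteCity.dat may not exist.
// getShipFiles() makes Pig ship GeoLiteCity.dat to each task's working
// directory through the distributed cache, which is why the relative path works.
// outputSchema() advertises the (country:chararray, city:chararray) result schema.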
import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.pig.EvalFunc; 8 | import org.apache.pig.data.Tuple; 9 | import org.apache.pig.data.TupleFactory; 10 | import org.apache.pig.impl.logicalLayer.schema.Schema; 11 | import org.apache.pig.impl.util.Utils; 12 | import org.apache.pig.parser.ParserException; 13 | 14 | import com.maxmind.geoip.Location; 15 | import com.maxmind.geoip.LookupService; 16 | 17 | public class GetCountry extends EvalFunc { 18 | LookupService cl; 19 | @Override 20 | public Tuple exec(Tuple t) throws IOException { 21 | if (cl == null) { 22 | cl = new LookupService("GeoLiteCity.dat", 23 | LookupService.GEOIP_MEMORY_CACHE ); 24 | } 25 | Location loc = cl.getLocation((String)t.get(0)); 26 | if (loc == null) { 27 | return null; 28 | } 29 | Tuple r = TupleFactory.getInstance().newTuple(); 30 | r.append(loc.countryName); 31 | r.append(loc.city); 32 | return r; 33 | } 34 | @Override 35 | public List getShipFiles() { 36 | List shipFiles = new ArrayList(); 37 | shipFiles.add("GeoLiteCity.dat"); 38 | return shipFiles; 39 | } 40 | @Override 41 | public Schema outputSchema(Schema input) { 42 | try { 43 | return Utils.getSchemaFromString("(country:chararray, city:chararray)"); 44 | } catch (ParserException e) { 45 | throw new RuntimeException(e); 46 | } 47 | } 48 | } -------------------------------------------------------------------------------- /week1/wordcount/patches/counters.patch: -------------------------------------------------------------------------------- 1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java 2 | index 6d47026..bb7127a 100644 3 | --- a/week1/wordcount/src/java/com/example/WordCount.java 4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java 5 | @@ -24,6 +24,7 @@ import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | +import org.apache.hadoop.mapreduce.Counters; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | @@ -32,12 +33,14 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.util.GenericOptionsParser; 15 | 16 | public class WordCount { 17 | + enum WordRange {A_M, N_Z} 18 | 19 | public static class TokenizerMapper 20 | extends Mapper{ 21 | 22 | private final static IntWritable one = new IntWritable(1); 23 | private Text word = new Text(); 24 | + Counters counters = new Counters(); 25 | 26 | public void map(Object key, Text value, Context context 27 | ) throws IOException, InterruptedException { 28 | @@ -45,6 +48,11 @@ public class WordCount { 29 | while (itr.hasMoreTokens()) { 30 | word.set(itr.nextToken()); 31 | context.write(word, one); 32 | + if (word.toString().toUpperCase().compareTo("N") < 0) { 33 | + context.getCounter(WordRange.A_M).increment(1); 34 | + } else { 35 | + context.getCounter(WordRange.N_Z).increment(1); 36 | + } 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /homework/week1/partitioner.patch: -------------------------------------------------------------------------------- 1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java 2 | index 954aaab..49a1ea0 100644 3 | --- a/week1/wordcount/src/java/com/example/WordCount.java 4 | 
+++ b/week1/wordcount/src/java/com/example/WordCount.java 5 | @@ -26,6 +26,7 @@ import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | +import org.apache.hadoop.mapreduce.Partitioner; 10 | import org.apache.hadoop.mapreduce.Reducer; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | @@ -65,6 +66,16 @@ public class WordCount { 14 | } 15 | } 16 | 17 | + public static class MyPartitioner extends Partitioner { 18 | + @Override 19 | + public int getPartition(Text key, IntWritable value, int numPartitions) { 20 | + if (key.charAt(0)<='n') { 21 | + return 1; 22 | + } else { 23 | + return 0; 24 | + } 25 | + } 26 | + } 27 | public static void main(String[] args) throws Exception { 28 | Configuration conf = new Configuration(); 29 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 30 | @@ -78,6 +89,8 @@ public class WordCount { 31 | job.setReducerClass(IntSumReducer.class); 32 | job.setOutputKeyClass(Text.class); 33 | job.setOutputValueClass(IntWritable.class); 34 | + job.setPartitionerClass(MyPartitioner.class); 35 | + job.setNumReduceTasks(2); 36 | for (int i = 0; i < otherArgs.length - 1; ++i) { 37 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 38 | } 39 | -------------------------------------------------------------------------------- /week1/wordcount/patches/distributedcache.patch: -------------------------------------------------------------------------------- 1 | diff --git a/week1/wordcount/pom.xml b/week1/wordcount/pom.xml 2 | index 416f1d4..5c73b93 100644 3 | --- a/week1/wordcount/pom.xml 4 | +++ b/week1/wordcount/pom.xml 5 | @@ -33,6 +33,12 @@ 6 | 2.7.3 7 | 8 | 9 | + 10 | + dk.brics.automaton 11 | + automaton 12 | + 1.11-8 13 | + 14 | + 15 | 16 | 17 | 18 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java 19 | index 6d47026..23a7f24 100644 20 | --- a/week1/wordcount/src/java/com/example/WordCount.java 21 | +++ b/week1/wordcount/src/java/com/example/WordCount.java 22 | @@ -31,6 +31,9 @@ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 23 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 24 | import org.apache.hadoop.util.GenericOptionsParser; 25 | 26 | +import dk.brics.automaton.Automaton; 27 | +import dk.brics.automaton.RegExp; 28 | + 29 | public class WordCount { 30 | 31 | public static class TokenizerMapper 32 | @@ -42,9 +45,13 @@ public class WordCount { 33 | public void map(Object key, Text value, Context context 34 | ) throws IOException, InterruptedException { 35 | StringTokenizer itr = new StringTokenizer(value.toString()); 36 | + Automaton automaton = new RegExp("h(.*)").toAutomaton(); 37 | while (itr.hasMoreTokens()) { 38 | - word.set(itr.nextToken()); 39 | - context.write(word, one); 40 | + String w = itr.nextToken(); 41 | + if (automaton.run(w)) { 42 | + word.set(w); 43 | + context.write(word, one); 44 | + } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /week2/evalfunc/patches/country_city.patch: -------------------------------------------------------------------------------- 1 | diff --git a/week2/evalfunc/src/java/com/example/pig/GetCountry.java b/week2/evalfunc/src/java/com/example/pig/GetCountry.java 2 | index 2526dec..2f8756f 100644 3 | --- 
a/week2/evalfunc/src/java/com/example/pig/GetCountry.java
4 | +++ b/week2/evalfunc/src/java/com/example/pig/GetCountry.java
5 | @@ -6,20 +6,30 @@ import java.util.List;
6 |
7 |  import org.apache.pig.EvalFunc;
8 |  import org.apache.pig.data.Tuple;
9 | +import org.apache.pig.data.TupleFactory;
10 | +import org.apache.pig.impl.logicalLayer.schema.Schema;
11 | +import org.apache.pig.impl.util.Utils;
12 | +import org.apache.pig.parser.ParserException;
13 |
14 |  import com.maxmind.geoip.Location;
15 |  import com.maxmind.geoip.LookupService;
16 |
17 | -public class GetCountry extends EvalFunc<String> {
18 | +public class GetCountry extends EvalFunc<Tuple> {
19 |    LookupService cl;
20 |    @Override
21 | -  public String exec(Tuple t) throws IOException {
22 | +  public Tuple exec(Tuple t) throws IOException {
23 |    if (cl == null) {
24 |      cl = new LookupService("GeoLiteCity.dat",
25 |        LookupService.GEOIP_MEMORY_CACHE );
26 |    }
27 |    Location loc = cl.getLocation((String)t.get(0));
28 | -  return loc!=null? loc.countryName:null;
29 | +  if (loc == null) {
30 | +    return null;
31 | +  }
32 | +  Tuple r = TupleFactory.getInstance().newTuple();
33 | +  r.append(loc.countryName);
34 | +  r.append(loc.city);
35 | +  return r;
36 |  }
37 |  @Override
38 |  public List<String> getShipFiles() {
39 | @@ -27,4 +37,12 @@ public class GetCountry extends EvalFunc<Tuple> {
40 |    shipFiles.add("GeoLiteCity.dat");
41 |    return shipFiles;
42 |  }
43 | + @Override
44 | + public Schema outputSchema(Schema input) {
45 | +   try {
46 | +     return Utils.getSchemaFromString("(country:chararray, city:chararray)");
47 | +   } catch (ParserException e) {
48 | +     throw new RuntimeException(e);
49 | +   }
50 | + }
51 |  }
52 | \ No newline at end of file
53 |
--------------------------------------------------------------------------------
/capstone/track1/data_description.txt:
--------------------------------------------------------------------------------
1 | Our data is the API request log collected by CloudACL. CloudACL's front-end products include a mobile app and a browser add-on; each front end intercepts URLs and issues API requests to the back-end Tomcat server to look up each URL's category, then acts on the URL according to that category.
2 | Each downloaded tar.gz archive holds one day of Tomcat log records and contains a number of .processed files, each in plain-text format with one record per line. Records come in two formats, corresponding to two different generations (old and new) of the API, and must be handled separately according to the GET URL. One example of each:
3 | axis2:
4 | 203.87.133.189 - - [04/Mar/2017:23:59:59 +0000] "GET /axis2/services/WebFilteringService/getCategoryByUrl?app=chrome_antiporn&ver=0.19.7.1&url=https%3A//www.googleapis.com/rpc&cat=search-engine HTTP/1.1" 200 133 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
5 |
6 | jersey:
7 | 51.39.90.114 - - [05/Mar/2017:00:04:19 +0000] "GET /webapi/getcategory?uri=pt.tvtc.gov.sa&cat=government&key=d647fe2a-2193-4c2d-8cad-38e78316d020 HTTP/1.1" 200 63 "-" "Apache-HttpClient/UNAVAILABLE (java 1.4)"
8 |
9 | In the axis2 API, app is the name of the front-end app. In the jersey API, key is the front-end app's application key. The other fields should be self-explanatory.
10 | Some students may wonder why the request already contains the URL's category (cat). This field was added later to make the data richer; the original data did not have it. The possible categories are:
11 |
unknown,hacking,phishing-and-fraud,botnet,malware,spyware-and-adware,keylogger-and-monitoring,peer2peer,media-streaming,online-storage,abortion,adult-and-pornography,sex-education,nudity,abused-drugs,marijuana,healthy-and-medicine,real-estate,internet-security,financial-service,business-and-economy,computer-information,auctions,shopping,cult-and-occult,travel,home-garden,military,social-networking,dead-sites,stock-and-tool,training-and-tool,dating,religion,entertainment-and-art,personal-site-and-blog,legal,local-info,job-search,gambling,translation,research-reference,software-download,game,philosophy-and-political,weapon,pay2surf,hunting-and-fishing,society,educational-institution,online-greeting,sport,swimsuits-&-intimate-apparel,questionable,kid,search-engine,internet-portal,online-advertisement,web-mail,envasion-proxy,music,government,news-and-media,content-delivery-network,internet-communication,spam-comfirmed,spam-url,spam-unconfirmed,http-proxy,dynamically-content,parked-domain,alcohol-and-tobacco,private-ip,image-and-video-search,fashion-and-beauty,recreation-and-hobbies,motor-vehicle,web-hosting 12 | -------------------------------------------------------------------------------- /week3/oozie/workflow-all.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ${jobTracker} 6 | ${nameNode} 7 | hive-config.xml 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | ${jobTracker} 16 | ${nameNode} 17 | 18 | 19 | 20 | 21 | 22 | mapred.job.queue.name 23 | ${queueName} 24 | 25 | 26 | import --connect jdbc:mysql://localhost/cs502 --username hadoop --password hadoop --table student --hive-import --hive-home /home/hadoop/apache-hive-1.2.1-bin --create-hive-table --hive-table student --m 2 --split-by age 27 | 28 | 29 | 30 | 31 | 32 | 33 | ${jobTracker} 34 | ${nameNode} 35 | 36 | 37 | 38 | 39 | 40 | mapred.job.queue.name 41 | ${queueName} 42 | 43 | 44 | 45 | OUTPUT=/user/${wf:user()}/output 46 | 47 | 48 | 49 | 50 | 51 | Workflow failed, error message[${wf:errorMessage(wf:lastErrorNode())}] 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /week1/googleplaycrawler/fixskew.patch: -------------------------------------------------------------------------------- 1 | commit 7cf95ef3bd61497a4359f091406c8c3e186fb06b 2 | Author: Daniel Dai 3 | Date: Thu Feb 9 22:00:45 2017 -0800 4 | 5 | Fix skew 6 | 7 | diff --git a/conf/nutch-site.xml.template b/conf/nutch-site.xml.template 8 | index 85dc30b..7ae16d2 100644 9 | --- a/conf/nutch-site.xml.template 10 | +++ b/conf/nutch-site.xml.template 11 | @@ -37,6 +37,10 @@ 12 | 3600 13 | 14 | 15 | + partition.url.mode 16 | + byURL 17 | + 18 | + 19 | fetcher.threads.fetch 20 | 20 21 | 22 | diff --git a/src/java/org/apache/nutch/crawl/URLPartitioner.java b/src/java/org/apache/nutch/crawl/URLPartitioner.java 23 | index 4675f83..eb6844b 100644 24 | --- a/src/java/org/apache/nutch/crawl/URLPartitioner.java 25 | +++ b/src/java/org/apache/nutch/crawl/URLPartitioner.java 26 | @@ -42,6 +42,7 @@ public class URLPartitioner implements Partitioner { 27 | public static final String PARTITION_MODE_HOST = "byHost"; 28 | public static final String PARTITION_MODE_DOMAIN = "byDomain"; 29 | public static final String PARTITION_MODE_IP = "byIP"; 30 | + public static final String PARTITION_MODE_URL = "byURL"; 31 | 32 | private int seed; 33 | private URLNormalizers normalizers; 34 | @@ -52,7 +53,7 @@ public class URLPartitioner implements Partitioner { 35 | mode = job.get(PARTITION_MODE_KEY, 
PARTITION_MODE_HOST); 36 | // check that the mode is known 37 | if (!mode.equals(PARTITION_MODE_IP) && !mode.equals(PARTITION_MODE_DOMAIN) 38 | - && !mode.equals(PARTITION_MODE_HOST)) { 39 | + && !mode.equals(PARTITION_MODE_HOST) && !mode.equals(PARTITION_MODE_URL)) { 40 | LOG.error("Unknown partition mode : " + mode + " - forcing to byHost"); 41 | mode = PARTITION_MODE_HOST; 42 | } 43 | @@ -71,7 +72,11 @@ public class URLPartitioner implements Partitioner { 44 | urlString = normalizers.normalize(urlString, 45 | URLNormalizers.SCOPE_PARTITION); 46 | url = new URL(urlString); 47 | - hashCode = url.getHost().hashCode(); 48 | + if (mode.equals(PARTITION_MODE_URL)) { 49 | + hashCode = url.toString().hashCode(); 50 | + } else { 51 | + hashCode = url.getHost().hashCode(); 52 | + } 53 | } catch (MalformedURLException e) { 54 | LOG.warn("Malformed URL: '" + urlString + "'"); 55 | } 56 | -------------------------------------------------------------------------------- /week2/pigserver/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.example 6 | pigserver 7 | 0.0.1-SNAPSHOT 8 | pigtest 9 | jar 10 | 11 | 12 | ${basedir}/src/java 13 | 14 | 15 | org.codehaus.mojo 16 | exec-maven-plugin 17 | 1.5.0 18 | 19 | 20 | 21 | java 22 | 23 | 24 | 25 | 26 | false 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | org.apache.hadoop 37 | hadoop-common 38 | 2.7.3 39 | 40 | 41 | 42 | org.apache.hadoop 43 | hadoop-hdfs 44 | 2.7.3 45 | 46 | 47 | 48 | org.apache.hadoop 49 | hadoop-mapreduce-client-core 50 | 2.7.3 51 | 52 | 53 | 54 | org.apache.hadoop 55 | hadoop-mapreduce-client-common 56 | 2.7.3 57 | 58 | 59 | 60 | org.apache.hadoop 61 | hadoop-mapreduce-client-jobclient 62 | 2.7.3 63 | 64 | 65 | 66 | org.apache.pig 67 | pig 68 | 0.16.0 69 | h2 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /homework/week1/dict.patch: -------------------------------------------------------------------------------- 1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java 2 | index 954aaab..d1fa7f6 100644 3 | --- a/week1/wordcount/src/java/com/example/WordCount.java 4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java 5 | @@ -17,7 +17,13 @@ 6 | */ 7 | package com.example; 8 | 9 | +import java.io.BufferedReader; 10 | +import java.io.File; 11 | +import java.io.FileReader; 12 | import java.io.IOException; 13 | +import java.net.URI; 14 | +import java.util.HashMap; 15 | +import java.util.Map; 16 | import java.util.StringTokenizer; 17 | 18 | import org.apache.hadoop.conf.Configuration; 19 | @@ -27,6 +33,7 @@ import org.apache.hadoop.io.Text; 20 | import org.apache.hadoop.mapreduce.Job; 21 | import org.apache.hadoop.mapreduce.Mapper; 22 | import org.apache.hadoop.mapreduce.Reducer; 23 | +import org.apache.hadoop.mapreduce.Mapper.Context; 24 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 25 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 26 | import org.apache.hadoop.util.GenericOptionsParser; 27 | @@ -38,12 +45,27 @@ public class WordCount { 28 | 29 | private final static IntWritable one = new IntWritable(1); 30 | private Text word = new Text(); 31 | - 32 | + Map dict = new HashMap(); 33 | + 34 | + @Override 35 | + public void setup(Context context) throws IOException, InterruptedException { 36 | + BufferedReader reader = new BufferedReader(new FileReader("dict.txt")); 37 | + String line; 38 | + while 
((line=reader.readLine())!=null) { 39 | + String[] items = line.split("\t"); 40 | + dict.put(items[0], items[1]); 41 | + } 42 | + reader.close(); 43 | + } 44 | public void map(Object key, Text value, Context context 45 | ) throws IOException, InterruptedException { 46 | StringTokenizer itr = new StringTokenizer(value.toString()); 47 | while (itr.hasMoreTokens()) { 48 | - word.set(itr.nextToken()); 49 | + String w = itr.nextToken(); 50 | + if (dict.containsKey(w)) { 51 | + w = dict.get(w); 52 | + } 53 | + word.set(w); 54 | context.write(word, one); 55 | } 56 | } 57 | @@ -78,6 +100,7 @@ public class WordCount { 58 | job.setReducerClass(IntSumReducer.class); 59 | job.setOutputKeyClass(Text.class); 60 | job.setOutputValueClass(IntWritable.class); 61 | + job.setCacheArchives(new URI[] {new File("dict.txt").toURI()}); 62 | for (int i = 0; i < otherArgs.length - 1; ++i) { 63 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 64 | } 65 | -------------------------------------------------------------------------------- /week3/hive/tpcds/upload.sh: -------------------------------------------------------------------------------- 1 | hadoop fs -mkdir /data/call_center 2 | hadoop fs -put /home/hadoop/data/call_center.dat /data/call_center 3 | hadoop fs -mkdir /data/catalog_page 4 | hadoop fs -put /home/hadoop/data/catalog_page.dat /data/catalog_page 5 | hadoop fs -mkdir /data/catalog_returns 6 | hadoop fs -put /home/hadoop/data/catalog_returns.dat /data/catalog_returns 7 | hadoop fs -mkdir /data/catalog_sales 8 | hadoop fs -put /home/hadoop/data/catalog_sales.dat /data/catalog_sales 9 | hadoop fs -mkdir /data/customer 10 | hadoop fs -put /home/hadoop/data/customer.dat /data/customer 11 | hadoop fs -mkdir /data/customer_address 12 | hadoop fs -put /home/hadoop/data/customer_address.dat /data/customer_address 13 | hadoop fs -mkdir /data/customer_demographics 14 | hadoop fs -put /home/hadoop/data/customer_demographics.dat /data/customer_demographics 15 | hadoop fs -mkdir /data/date_dim 16 | hadoop fs -put /home/hadoop/data/date_dim.dat /data/date_dim 17 | hadoop fs -mkdir /data/dbgen_version 18 | hadoop fs -put /home/hadoop/data/dbgen_version.dat /data/dbgen_version 19 | hadoop fs -mkdir /data/household_demographics 20 | hadoop fs -put /home/hadoop/data/household_demographics.dat /data/household_demographics 21 | hadoop fs -mkdir /data/income_band 22 | hadoop fs -put /home/hadoop/data/income_band.dat /data/income_band 23 | hadoop fs -mkdir /data/inventory 24 | hadoop fs -put /home/hadoop/data/inventory.dat /data/inventory 25 | hadoop fs -mkdir /data/item 26 | hadoop fs -put /home/hadoop/data/item.dat /data/item 27 | hadoop fs -mkdir /data/promotion 28 | hadoop fs -put /home/hadoop/data/promotion.dat /data/promotion 29 | hadoop fs -mkdir /data/reason 30 | hadoop fs -put /home/hadoop/data/reason.dat /data/reason 31 | hadoop fs -mkdir /data/ship_mode 32 | hadoop fs -put /home/hadoop/data/ship_mode.dat /data/ship_mode 33 | hadoop fs -mkdir /data/store 34 | hadoop fs -put /home/hadoop/data/store.dat /data/store 35 | hadoop fs -mkdir /data/store_returns 36 | hadoop fs -put /home/hadoop/data/store_returns.dat /data/store_returns 37 | hadoop fs -mkdir /data/store_sales 38 | hadoop fs -put /home/hadoop/data/store_sales.dat /data/store_sales 39 | hadoop fs -mkdir /data/time_dim 40 | hadoop fs -put /home/hadoop/data/time_dim.dat /data/time_dim 41 | hadoop fs -mkdir /data/warehouse 42 | hadoop fs -put /home/hadoop/data/warehouse.dat /data/warehouse 43 | hadoop fs -mkdir /data/web_page 44 | hadoop fs -put 
/home/hadoop/data/web_page.dat /data/web_page 45 | hadoop fs -mkdir /data/web_returns 46 | hadoop fs -put /home/hadoop/data/web_returns.dat /data/web_returns 47 | hadoop fs -mkdir /data/web_sales 48 | hadoop fs -put /home/hadoop/data/web_sales.dat /data/web_sales 49 | hadoop fs -mkdir /data/web_site 50 | hadoop fs -put /home/hadoop/data/web_site.dat /data/web_site 51 | -------------------------------------------------------------------------------- /week4/PIG-3399-2.patch: -------------------------------------------------------------------------------- 1 | Index: build.xml 2 | =================================================================== 3 | --- build.xml (revision 1636705) 4 | +++ build.xml (working copy) 5 | @@ -310,7 +310,7 @@ 6 | 7 | 8 | 12 | 13 | 14 | @@ -334,7 +334,7 @@ 15 | 16 | 17 | 18 | - 19 | + 20 | 21 | 22 | 23 | @@ -399,7 +399,6 @@ 24 | 25 | 26 | 27 | - 28 | 29 | 30 | 31 | @@ -482,11 +481,13 @@ 32 | 33 | 34 | 35 | + 36 | 37 | 38 | 39 | 40 | 41 | + 42 | 43 | 44 | { 38 | 39 | private final static IntWritable one = new IntWritable(1); 40 | private Text word = new Text(); 41 | 42 | public void map(Object key, Text value, Context context 43 | ) throws IOException, InterruptedException { 44 | StringTokenizer itr = new StringTokenizer(value.toString()); 45 | while (itr.hasMoreTokens()) { 46 | word.set(itr.nextToken()); 47 | context.write(word, one); 48 | } 49 | } 50 | } 51 | 52 | public static class IntSumReducer 53 | extends Reducer { 54 | private IntWritable result = new IntWritable(); 55 | 56 | public void reduce(Text key, Iterable values, 57 | Context context 58 | ) throws IOException, InterruptedException { 59 | int sum = 0; 60 | for (IntWritable val : values) { 61 | sum += val.get(); 62 | } 63 | result.set(sum); 64 | context.write(key, result); 65 | } 66 | } 67 | 68 | public static void main(String[] args) throws Exception { 69 | Configuration conf = new Configuration(); 70 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 71 | if (otherArgs.length < 2) { 72 | System.err.println("Usage: wordcount [...] "); 73 | System.exit(2); 74 | } 75 | Job job = Job.getInstance(conf, "word count"); 76 | job.setJarByClass(WordCount.class); 77 | job.setMapperClass(TokenizerMapper.class); 78 | job.setReducerClass(IntSumReducer.class); 79 | job.setOutputKeyClass(Text.class); 80 | job.setOutputValueClass(IntWritable.class); 81 | for (int i = 0; i < otherArgs.length - 1; ++i) { 82 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 83 | } 84 | FileOutputFormat.setOutputPath(job, 85 | new Path(otherArgs[otherArgs.length - 1])); 86 | System.exit(job.waitForCompletion(true) ? 0 : 1); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /week4/doc.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/docs/src/documentation/content/xdocs/basic.xml b/src/docs/src/documentation/content/xdocs/basic.xml 2 | index a631607..854d511 100644 3 | --- a/src/docs/src/documentation/content/xdocs/basic.xml 4 | +++ b/src/docs/src/documentation/content/xdocs/basic.xml 5 | @@ -7629,16 +7629,16 @@ ILLUSTRATE A; 6 | 7 | 8 | 9 | -
10 | - MAPREDUCE 11 | -

Executes native MapReduce jobs inside a Pig script.

12 | +
13 | + NATIVE 14 | +

Executes native MapReduce/Tez jobs inside a Pig script.

15 | 16 |
17 | Syntax 18 | 19 | 20 | 25 | 26 | @@ -7658,11 +7658,11 @@ ILLUSTRATE A; 27 | 28 | 29 | 33 | 40 | 41 | @@ -7684,7 +7684,7 @@ ILLUSTRATE A; 42 | 43 | 48 | 49 | 50 | @@ -7693,7 +7693,7 @@ ILLUSTRATE A; 51 |

`params, ...`

52 | 53 | 57 | 58 | 59 | @@ -7702,20 +7702,20 @@ ILLUSTRATE A; 60 | 61 |
62 | Usage 63 | -

Use the MAPREDUCE operator to run native MapReduce jobs from inside a Pig script.

64 | +

Use the NATIVE operator to run native MapReduce/Tez jobs from inside a Pig script.

65 | 66 | -

The input and output locations for the MapReduce program are conveyed to Pig using the STORE/LOAD clauses. 67 | -Pig, however, does not pass this information (nor require that this information be passed) to the MapReduce program. 68 | -If you want to pass the input and output locations to the MapReduce program you can use the params clause or you can hardcode the locations in the MapReduce program.

69 | +

The input and output locations for the MapReduce/tez program are conveyed to Pig using the STORE/LOAD clauses. 70 | +Pig, however, does not pass this information (nor require that this information be passed) to the MapReduce/Tez program. 71 | +If you want to pass the input and output locations to the MapReduce/tez program you can use the params clause or you can hardcode the locations in the MapReduce/Tez program.

72 |
73 | 74 |
75 | Example 76 | -

This example demonstrates how to run the wordcount MapReduce progam from Pig. 77 | -Note that the files specified as input and output locations in the MAPREDUCE statement will NOT be deleted by Pig automatically. You will need to delete them manually.

78 | +

This example demonstrates how to run the wordcount MapReduce progam from Pig (if exectype=mapreduce). 79 | +Note that the files specified as input and output locations in the NATIVE statement will NOT be deleted by Pig automatically. You will need to delete them manually.

80 | 
81 |  A = LOAD 'WordcountInput.txt';
82 | -B = MAPREDUCE 'wordcount.jar' STORE A INTO 'inputDir' LOAD 'outputDir'
83 | +B = NATIVE 'wordcount.jar' STORE A INTO 'inputDir' LOAD 'outputDir'
84 |      AS (word:chararray, count: int) `org.myorg.WordCount inputDir outputDir`;
85 | 
86 | 
87 | 


--------------------------------------------------------------------------------
/week3/hive/tpcds/insert.sql:
--------------------------------------------------------------------------------
 1 | set hive.exec.max.dynamic.partitions=3000;
 2 | set hive.exec.max.dynamic.partitions.pernode=3000;
 3 | set hive.exec.dynamic.partition.mode=nonstrict;
 4 | 
 5 | insert into tpcds.call_center select * from tpcds_base.call_center;
 6 | 
 7 | insert into tpcds.catalog_page select * from tpcds_base.catalog_page;
 8 | 
 9 | insert into tpcds.catalog_returns partition(cr_returned_date_sk) select cr_returned_time_sk,cr_item_sk,cr_refunded_customer_sk,cr_refunded_cdemo_sk,cr_refunded_hdemo_sk,cr_refunded_addr_sk,cr_returning_customer_sk,cr_returning_cdemo_sk,cr_returning_hdemo_sk,cr_returning_addr_sk,cr_call_center_sk,cr_catalog_page_sk,cr_ship_mode_sk,cr_warehouse_sk,cr_reason_sk,cr_order_number,cr_return_quantity,cr_return_amount,cr_return_tax,cr_return_amt_inc_tax,cr_fee,cr_return_ship_cost,cr_refunded_cash,cr_reversed_charge,cr_store_credit,cr_net_loss,cr_returned_date_sk from tpcds_base.catalog_returns;
10 | 
11 | insert into tpcds.catalog_sales partition(cs_sold_date_sk) select cs_sold_time_sk,cs_ship_date_sk,cs_bill_customer_sk,cs_bill_cdemo_sk,cs_bill_hdemo_sk,cs_bill_addr_sk,cs_ship_customer_sk,cs_ship_cdemo_sk,cs_ship_hdemo_sk,cs_ship_addr_sk,cs_call_center_sk,cs_catalog_page_sk,cs_ship_mode_sk,cs_warehouse_sk,cs_item_sk,cs_promo_sk,cs_order_number,cs_quantity,cs_wholesale_cost,cs_list_price,cs_sales_price,cs_ext_discount_amt,cs_ext_sales_price,cs_ext_wholesale_cost,cs_ext_list_price,cs_ext_tax,cs_coupon_amt,cs_ext_ship_cost,cs_net_paid,cs_net_paid_inc_tax,cs_net_paid_inc_ship,cs_net_paid_inc_ship_tax,cs_net_profit,cs_sold_date_sk from tpcds_base.catalog_sales;
12 | 
13 | insert into tpcds.customer select * from tpcds_base.customer;
14 | 
15 | insert into tpcds.customer_address select * from tpcds_base.customer_address;
16 | 
17 | insert into tpcds.customer_demographics select * from tpcds_base.customer_demographics;
18 | 
19 | insert into tpcds.date_dim select * from tpcds_base.date_dim;
20 | 
21 | insert into tpcds.household_demographics select * from tpcds_base.household_demographics;
22 | 
23 | insert into tpcds.income_band select * from tpcds_base.income_band;
24 | 
25 | insert into tpcds.inventory partition(inv_date_sk) select inv_item_sk,inv_warehouse_sk,inv_quantity_on_hand,inv_date_sk from tpcds_base.inventory;
26 | 
27 | insert into tpcds.item select * from tpcds_base.item;
28 | 
29 | insert into tpcds.promotion select * from tpcds_base.promotion;
30 | 
31 | insert into tpcds.reason select * from tpcds_base.reason;
32 | 
33 | insert into tpcds.ship_mode select * from tpcds_base.ship_mode;
34 | 
35 | insert into tpcds.store select * from tpcds_base.store;
36 | 
37 | insert into tpcds.store_returns partition(sr_returned_date_sk) select sr_return_time_sk,sr_item_sk,sr_customer_sk,sr_cdemo_sk,sr_hdemo_sk,sr_addr_sk,sr_store_sk,sr_reason_sk,sr_ticket_number,sr_return_quantity,sr_return_amt,sr_return_tax,sr_return_amt_inc_tax,sr_fee,sr_return_ship_cost,sr_refunded_cash,sr_reversed_charge,sr_store_credit,sr_net_loss,sr_returned_date_sk from tpcds_base.store_returns;
38 | 
39 | insert into tpcds.store_sales partition(ss_sold_date_sk) select ss_sold_time_sk,ss_item_sk,ss_customer_sk,ss_cdemo_sk,ss_hdemo_sk,ss_addr_sk,ss_store_sk,ss_promo_sk,ss_ticket_number,ss_quantity,ss_wholesale_cost,ss_list_price,ss_sales_price,ss_ext_discount_amt,ss_ext_sales_price,ss_ext_wholesale_cost,ss_ext_list_price,ss_ext_tax,ss_coupon_amt,ss_net_paid,ss_net_paid_inc_tax,ss_net_profit,ss_sold_date_sk from tpcds_base.store_sales;
40 | 
41 | insert into tpcds.time_dim select * from tpcds_base.time_dim;
42 | 
43 | insert into tpcds.warehouse select * from tpcds_base.warehouse;
44 | 
45 | insert into tpcds.web_page select * from tpcds_base.web_page;
46 | 
47 | insert into tpcds.web_returns partition(wr_returned_date_sk) select wr_returned_time_sk,wr_item_sk,wr_refunded_customer_sk,wr_refunded_cdemo_sk,wr_refunded_hdemo_sk,wr_refunded_addr_sk,wr_returning_customer_sk,wr_returning_cdemo_sk,wr_returning_hdemo_sk,wr_returning_addr_sk,wr_web_page_sk,wr_reason_sk,wr_order_number,wr_return_quantity,wr_return_amt,wr_return_tax,wr_return_amt_inc_tax,wr_fee,wr_return_ship_cost,wr_refunded_cash,wr_reversed_charge,wr_account_credit,wr_net_loss,wr_returned_date_sk from tpcds_base.web_returns;
48 | 
49 | insert into tpcds.web_sales partition(ws_sold_date_sk) select ws_sold_time_sk,ws_ship_date_sk,ws_item_sk,ws_bill_customer_sk,ws_bill_cdemo_sk,ws_bill_hdemo_sk,ws_bill_addr_sk,ws_ship_customer_sk,ws_ship_cdemo_sk,ws_ship_hdemo_sk,ws_ship_addr_sk,ws_web_page_sk,ws_web_site_sk,ws_ship_mode_sk,ws_warehouse_sk,ws_promo_sk,ws_order_number,ws_quantity,ws_wholesale_cost,ws_list_price,ws_sales_price,ws_ext_discount_amt,ws_ext_sales_price,ws_ext_wholesale_cost,ws_ext_list_price,ws_ext_tax,ws_coupon_amt,ws_ext_ship_cost,ws_net_paid,ws_net_paid_inc_tax,ws_net_paid_inc_ship,ws_net_paid_inc_ship_tax,ws_net_profit,ws_sold_date_sk from tpcds_base.web_sales;
50 | 
51 | insert into tpcds.web_site select * from tpcds_base.web_site;
52 | 
53 | 
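A note on the pattern above: the three leading set commands raise Hive's dynamic-partition limits and switch to nonstrict mode so the inserts can be fully dynamic (no static partition value). The partitioned targets cannot simply use select * because Hive requires the dynamic partition column to be the last expression in the select list, while the tpcds_base text tables (see all-tables.sql below) carry it as their first column; hence the explicit column lists ending in cr_returned_date_sk, cs_sold_date_sk, and so on. A minimal sketch of the same pattern on a hypothetical staging/target pair (these table and column names are illustrative, not part of TPC-DS):

set hive.exec.dynamic.partition.mode=nonstrict;

create table sales_by_day (amount double)
partitioned by (sold_date_sk bigint)
stored as orc;

-- The dynamic partition column goes last in the select list;
-- Hive routes each row to the partition named by its value.
insert into sales_by_day partition(sold_date_sk)
select amount, sold_date_sk
from sales_staging;
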
--------------------------------------------------------------------------------
/week4/jobname.patch:
--------------------------------------------------------------------------------
  1 | Index: src/org/apache/pig/scripting/BoundScript.java
  2 | ===================================================================
  3 | --- src/org/apache/pig/scripting/BoundScript.java	(revision 1785219)
  4 | +++ src/org/apache/pig/scripting/BoundScript.java	(working copy)
  5 | @@ -17,6 +17,7 @@
  6 |   */
  7 |  package org.apache.pig.scripting;
  8 |  
  9 | +import java.io.File;
 10 |  import java.io.FileInputStream;
 11 |  import java.io.IOException;
 12 |  import java.io.StringReader;
 13 | @@ -264,7 +265,9 @@
 14 |          LOG.info("Query to run:\n" + query);
 15 |          List listeners = ScriptState.get().getAllListeners();
 16 |          PigContext pc = scriptContext.getPigContext();
 17 | +        String scriptName = new File(ScriptState.get().getFileName()).getName();
 18 |          ScriptState scriptState = pc.getExecutionEngine().instantiateScriptState();
 19 | +        scriptState.setFileName(scriptName);
 20 |          ScriptState.start(scriptState);
 21 |          ScriptState.get().setScript(query);
 22 |          for (PigProgressNotificationListener listener : listeners) {
 23 | @@ -271,6 +274,9 @@
 24 |              ScriptState.get().registerListener(listener);
 25 |          }
 26 |          PigServer pigServer = new PigServer(scriptContext.getPigContext(), false);
 27 | +        if (pc.getProperties().getProperty(PigContext.JOB_NAME) == null) {
 28 | +            pigServer.setJobName(scriptName);
 29 | +        }
 30 |          GruntParser grunt = new GruntParser(new StringReader(query), pigServer);
 31 |          grunt.setInteractive(false);
 32 |          try {
 33 | Index: test/org/apache/pig/test/TestScriptLanguage.java
 34 | ===================================================================
 35 | --- test/org/apache/pig/test/TestScriptLanguage.java	(revision 1785219)
 36 | +++ test/org/apache/pig/test/TestScriptLanguage.java	(working copy)
 37 | @@ -31,6 +31,7 @@
 38 |  import org.apache.pig.PigRunner;
 39 |  import org.apache.pig.PigServer;
 40 |  import org.apache.pig.data.Tuple;
 41 | +import org.apache.pig.impl.PigContext;
 42 |  import org.apache.pig.scripting.ScriptEngine;
 43 |  import org.apache.pig.tools.pigstats.OutputStats;
 44 |  import org.apache.pig.tools.pigstats.PigStats;
 45 | @@ -669,6 +670,59 @@
 46 |          assertFileNotExists(file1, file2);
 47 |      }
 48 |  
 49 | +    @Test
 50 | +    public void testJobName() throws Exception {
 51 | +        String[] script1 = {
 52 | +            "#!/usr/bin/python",
 53 | +            "from org.apache.pig.scripting import *",
 54 | +            "Pig.fs(\"rmr simple_out\")",
 55 | +            "input = 'simple_table'",
 56 | +            "output = 'simple_out'",
 57 | +            "P = Pig.compile(\"\"\"a = load '$input';store a into '$output';\"\"\")",
 58 | +            "Q = P.bind({'input':input, 'output':output})",
 59 | +            "stats = Q.runSingle()",
 60 | +            "if stats.isSuccessful():",
 61 | +            "\tprint 'success!'",
 62 | +            "else:",
 63 | +            "\traise 'failed'"
 64 | +        };
 65 | +        String[] input = {
 66 | +            "1\t3",
 67 | +            "2\t4",
 68 | +            "3\t5"
 69 | +        };
 70 | +
 71 | +        File script1File = Util.createInputFile("jobname1", ".py", script1);
 72 | +        Util.createLocalInputFile("simple_table", input);
 73 | +
 74 | +        PigStats stats = PigRunner.run(new String[] { "-x", Util.getLocalTestMode().toString(),
 75 | +                "-f", script1File.getAbsolutePath()}, null);
 76 | +        String jobName = stats.getAllStats().values().iterator().next().get(0).getPigProperties().getProperty(PigContext.JOB_NAME);
 77 | +        assertTrue(jobName.contains(script1File.getName()));
 78 | +
 79 | +        // set jobName manually in script
 80 | +        String[] script2 = {
 81 | +            "#!/usr/bin/python",
 82 | +            "from org.apache.pig.scripting import *",
 83 | +            "Pig.fs(\"rmr simple_out\")",
 84 | +            "input = 'simple_table'",
 85 | +            "output = 'simple_out'",
 86 | +            "P = Pig.compile(\"\"\"a = load '$input';store a into '$output';\"\"\")",
 87 | +            "P.set(\"jobName\", \"myjob\")",
 88 | +            "Q = P.bind({'input':input, 'output':output})",
 89 | +            "stats = Q.runSingle()",
 90 | +            "if stats.isSuccessful():",
 91 | +            "\tprint 'success!'",
 92 | +            "else:",
 93 | +            "\traise 'failed'"
 94 | +        };
 95 | +        File script2File = Util.createInputFile("jobname2", ".py", script2);
 96 | +        stats = PigRunner.run(new String[] { "-x", Util.getLocalTestMode().toString(),
 97 | +                "-f", script2File.getAbsolutePath()}, null);
 98 | +        jobName = stats.getAllStats().values().iterator().next().get(0).getPigProperties().getProperty(PigContext.JOB_NAME);
 99 | +        assertTrue(jobName.contains("myjob"));
100 | +    }
101 | +
102 |      private void createEmptyFiles(String... filenames) throws IOException {
103 |          for (String file : filenames) {
104 |              Util.createInputFile(cluster, file, new String[]{""});
105 | 
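In effect, jobname.patch makes a script launched through Pig's scripting API default its MapReduce job name to the driving script's file name, while a script that calls P.set("jobName", "myjob") keeps its explicit name; the two embedded scripts in testJobName exercise exactly those two paths.
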
--------------------------------------------------------------------------------
/week2/python/kmeans.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # This code is made available under the Apache License, Version 2.0 (the
  3 | # "License"); you may not use this file except in compliance with the License.
  4 | # You may obtain a copy of the License at
  5 | #
  6 | #     http://www.apache.org/licenses/LICENSE-2.0
  7 | #
  8 | # Unless required by applicable law or agreed to in writing, software
  9 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 10 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 11 | # License for the specific language governing permissions and limitations
 12 | # under the License.
 13 | 
 14 | import sys
 15 | from math import fabs
 16 | from org.apache.pig.scripting import Pig
 17 | 
 18 | @outputSchemaFunction("findCentroidSchema")
 19 | # Assign each value to the closest centroid
 20 | def findCentroid(initialCentroid, value):
 21 |     # initialCentroid is constant per round; the centroids could be derived once and cached
 22 |     centroids = initialCentroid.split(":")
 23 | 
 24 |     min_distance = float("inf")
 25 |     closest_centroid = 0
 26 |     for centroid in centroids:
 27 |         distance = fabs(float(centroid) - value)
 28 |         if distance < min_distance:
 29 |             min_distance = distance
 30 |             closest_centroid = centroid
 31 |     return float(closest_centroid)
 32 | 
 33 | # The output schema is the second field of the input, i.e. the same type as the value parameter
 34 | @schemaFunction("findCentroidSchema")
 35 | def findCentroidSchema(input):
 36 |     return input.getField(1)
 37 | 
 38 | def main():
 39 |     filename = "studenttab10k"
 40 |     k = 4
 41 |     tolerance = 0.01
 42 | 
 43 |     MAX_SCORE = 4
 44 |     MIN_SCORE = 0
 45 |     MAX_ITERATION = 100
 46 | 
 47 |     # initial centroids: divide the space equally
 48 |     initial_centroids = ""
 49 |     last_centroids = [None] * k
 50 |     for i in range(k):
 51 |         last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
 52 |         initial_centroids = initial_centroids + str(last_centroids[i])
 53 |         if i != k-1:
 54 |             initial_centroids = initial_centroids + ":"
 55 | 
 56 |     # Compile the Pig script. Register this same script, since it contains the Jython UDF.
 57 |     # $centroids is the only binding parameter; each round it is bound to the centroid
 58 |     # estimates from the previous round. We then average each new cluster to get the
 59 |     # new centroid estimates, and store them into "output".
 60 |     P = Pig.compile("""register 'kmeans.py' using jython as util;
 61 | raw = load 'studenttab10k' as (name:chararray, age:int, gpa:double);
 62 | centroided = foreach raw generate gpa, util.findCentroid('$centroids', gpa) as centroid;
 63 | grouped = group centroided by centroid;
 64 | result = foreach grouped generate group, AVG(centroided.gpa);
 65 | store result into 'output';
 66 | """)
 67 | 
 68 |     converged = False
 69 |     iter_num = 0
 70 |     while iter_num < MAX_ITERATION:
 71 |         # Bind the $centroids parameter to the current centroids
 72 |         Q = P.bind({'centroids':initial_centroids})
 73 | 
 74 |         # Run the Pig script
 75 |         results = Q.runSingle()
 76 | 
 77 |         # Check the result of the Pig script (isSuccessful() returns a boolean)
 78 |         if not results.isSuccessful():
 79 |             raise Exception("Pig job failed")
 80 | 
 81 |         # Get the new centroids from the output
 82 |         iter = results.result("result").iterator()
 83 |         centroids = [None] * k
 84 |         distance_move = 0
 85 | 
 86 |         # Calculate the distance moved since the last iteration
 87 |         for i in range(k):
 88 |             tuple = iter.next()
 89 |             centroids[i] = float(str(tuple.get(1)))
 90 |             distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
 91 |         distance_move = distance_move / k
 92 |         Pig.fs("rmr output")
 93 |         print("iteration " + str(iter_num))
 94 |         print("average distance moved: " + str(distance_move))
 95 | 
 96 |         # Converged
 97 |         if distance_move < tolerance:
 98 |             sys.stdout.write("k-means converged at centroids: [")
 99 |             sys.stdout.write(",".join(str(v) for v in centroids))
100 |             sys.stdout.write("]\n")
101 |             converged = True
102 |             break
103 | 
104 |         # Not converged; use the new centroids as the initial centroids for the next iteration
105 |         last_centroids = centroids[:]
106 |         initial_centroids = ""
107 |         for i in range(k):
108 |             initial_centroids = initial_centroids + str(last_centroids[i])
109 |             if i != k-1:
110 |                 initial_centroids = initial_centroids + ":"
111 |         iter_num += 1
112 | 
113 |     # Did not converge within MAX_ITERATION
114 |     if not converged:
115 |         print("did not converge after " + str(iter_num) + " iterations")
116 |         sys.stdout.write("last centroids: [")
117 |         sys.stdout.write(",".join(str(v) for v in last_centroids))
118 |         sys.stdout.write("]\n")
119 | 
120 | if __name__ == "__main__":
121 |     main()
122 | 
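The loop above is the standard one-dimensional k-means update. Writing $S_j^{(t)}$ for the set of gpa values that findCentroid assigns to centroid $c_j^{(t)}$, each round computes

$$c_j^{(t+1)} = \frac{1}{|S_j^{(t)}|} \sum_{x \in S_j^{(t)}} x, \qquad \Delta^{(t)} = \frac{1}{k} \sum_{j=1}^{k} \left| c_j^{(t+1)} - c_j^{(t)} \right|,$$

where the AVG in the result alias is the inner mean and distance_move is $\Delta^{(t)}$; iteration stops once $\Delta^{(t)}$ drops below tolerance or after MAX_ITERATION rounds.
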
--------------------------------------------------------------------------------
/week2/loadfunc/src/java/com/example/NutchParsedDataLoader.java:
--------------------------------------------------------------------------------
  1 | package com.example;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | import org.apache.commons.logging.Log;
  6 | import org.apache.commons.logging.LogFactory;
  7 | import org.apache.hadoop.io.Text;
  8 | import org.apache.hadoop.io.Writable;
  9 | import org.apache.hadoop.mapreduce.InputFormat;
 10 | import org.apache.hadoop.mapreduce.Job;
 11 | import org.apache.hadoop.mapreduce.RecordReader;
 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 13 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 14 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
 15 | import org.apache.nutch.parse.ParseData;
 16 | import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
 17 | import org.apache.pig.data.DataType;
 18 | import org.apache.pig.data.Tuple;
 19 | import org.apache.pig.data.TupleFactory;
 20 | import org.apache.pig.Expression;
 21 | import org.apache.pig.FileInputLoadFunc;
 22 | import org.apache.pig.LoadMetadata;
 23 | import org.apache.pig.ResourceSchema;
 24 | import org.apache.pig.ResourceSchema.ResourceFieldSchema;
 25 | import org.apache.pig.ResourceStatistics;
 26 | 
 27 | public class NutchParsedDataLoader extends FileInputLoadFunc implements LoadMetadata {
 28 | 
 29 |     private SequenceFileRecordReader reader;
 30 | 
 31 |     private Text key;
 32 |     private ParseData value;
 33 | 
 34 |     protected static final Log LOG = LogFactory.getLog(NutchParsedDataLoader.class);
 35 |     protected TupleFactory mTupleFactory = TupleFactory.getInstance();
 36 | 
 37 |     public NutchParsedDataLoader() {
 38 |     }
 39 | 
 40 |     @Override
 41 |     public Tuple getNext() throws IOException {
 42 |         boolean next = false;
 43 |         try {
 44 |             next = reader.nextKeyValue();
 45 |         } catch (InterruptedException e) {
 46 |             throw new IOException(e);
 47 |         }
 48 | 
 49 |         if (!next) return null;
 50 | 
 51 |         key = (Text)reader.getCurrentKey();
 52 |         value = (ParseData)reader.getCurrentValue();
 53 | 
 54 |         Tuple t = mTupleFactory.newTuple(14); // one slot per schema field (was 15, leaving a stray null field)
 55 |         t.set(0, key.toString());
 56 |         t.set(1, value.getTitle());
 57 |         t.set(2, value.getMeta("name"));
 58 |         t.set(3, value.getMeta("publisher"));
 59 |         t.set(4, value.getMeta("updateTime"));
 60 |         t.set(5, value.getMeta("category"));
 61 |         t.set(6, value.getMeta("price"));
 62 |         t.set(7, value.getMeta("reviewScore"));
 63 |         t.set(8, value.getMeta("reviewCount"));
 64 |         t.set(9, value.getMeta("install"));
 65 |         t.set(10, value.getMeta("version"));
 66 |         t.set(11, value.getMeta("rating"));
 67 |         t.set(12, value.getMeta("developerSite"));
 68 |         t.set(13, value.getMeta("developerEmail"));
 69 | 
 70 |         return t;
 71 |     }
 72 | 
 73 |     @SuppressWarnings("unchecked")
 74 |     @Override
 75 |     public InputFormat getInputFormat() throws IOException {
 76 |         return new SequenceFileInputFormat();
 77 |     }
 78 | 
 79 |     @SuppressWarnings("unchecked")
 80 |     @Override
 81 |     public void prepareToRead(RecordReader reader, PigSplit split)
 82 |             throws IOException {
 83 |         this.reader = (SequenceFileRecordReader) reader;
 84 |     }
 85 | 
 86 |     @Override
 87 |     public void setLocation(String location, Job job) throws IOException {
 88 |         FileInputFormat.setInputPaths(job, location);
 89 |     }
 90 | 
 91 |     @Override
 92 |     public String[] getPartitionKeys(String location, Job job) throws IOException {
 93 |         return null;
 94 |     }
 95 | 
 96 |     @Override
 97 |     public ResourceSchema getSchema(String location, Job job) throws IOException {
 98 |         ResourceSchema schema = new ResourceSchema();
 99 |         ResourceFieldSchema fields[] = new ResourceFieldSchema[14];
100 |         fields[0] = new ResourceFieldSchema(); fields[0].setName("url"); fields[0].setType(DataType.CHARARRAY);
101 |         fields[1] = new ResourceFieldSchema(); fields[1].setName("title"); fields[1].setType(DataType.CHARARRAY);
102 |         fields[2] = new ResourceFieldSchema(); fields[2].setName("name"); fields[2].setType(DataType.CHARARRAY);
103 |         fields[3] = new ResourceFieldSchema(); fields[3].setName("publisher"); fields[3].setType(DataType.CHARARRAY);
104 |         fields[4] = new ResourceFieldSchema(); fields[4].setName("updateTime"); fields[4].setType(DataType.CHARARRAY);
105 |         fields[5] = new ResourceFieldSchema(); fields[5].setName("category"); fields[5].setType(DataType.CHARARRAY);
106 |         fields[6] = new ResourceFieldSchema(); fields[6].setName("price"); fields[6].setType(DataType.CHARARRAY);
107 |         fields[7] = new ResourceFieldSchema(); fields[7].setName("reviewScore"); fields[7].setType(DataType.CHARARRAY);
108 |         fields[8] = new ResourceFieldSchema(); fields[8].setName("reviewCount"); fields[8].setType(DataType.CHARARRAY);
109 |         fields[9] = new ResourceFieldSchema(); fields[9].setName("install"); fields[9].setType(DataType.CHARARRAY);
fields[9].setName("install"); fields[9].setType(DataType.CHARARRAY); 110 | fields[10] = new ResourceFieldSchema(); fields[10].setName("version"); fields[10].setType(DataType.CHARARRAY); 111 | fields[11] = new ResourceFieldSchema(); fields[11].setName("rating"); fields[11].setType(DataType.CHARARRAY); 112 | fields[12] = new ResourceFieldSchema(); fields[12].setName("developerSite"); fields[12].setType(DataType.CHARARRAY); 113 | fields[13] = new ResourceFieldSchema(); fields[13].setName("developerEmail"); fields[13].setType(DataType.CHARARRAY); 114 | schema.setFields(fields); 115 | return schema; 116 | } 117 | 118 | @Override 119 | public ResourceStatistics getStatistics(String location, Job job) 120 | throws IOException { 121 | return null; 122 | } 123 | 124 | @Override 125 | public void setPartitionFilter(Expression expr) throws IOException { 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /week4/set.patch: -------------------------------------------------------------------------------- 1 | Index: src/org/apache/pig/PigServer.java 2 | =================================================================== 3 | --- src/org/apache/pig/PigServer.java (revision 1785219) 4 | +++ src/org/apache/pig/PigServer.java (working copy) 5 | @@ -43,6 +43,7 @@ 6 | import java.util.Set; 7 | import java.util.concurrent.atomic.AtomicInteger; 8 | 9 | +import org.apache.commons.lang.StringUtils; 10 | import org.apache.commons.logging.Log; 11 | import org.apache.commons.logging.LogFactory; 12 | import org.apache.hadoop.conf.Configuration; 13 | @@ -102,6 +103,7 @@ 14 | import org.apache.pig.validator.BlackAndWhitelistFilter; 15 | import org.apache.pig.validator.PigCommandFilter; 16 | 17 | +import com.beust.jcommander.Strings; 18 | import com.google.common.annotations.VisibleForTesting; 19 | 20 | /** 21 | @@ -395,6 +397,12 @@ 22 | pigContext.getLog4jProperties().setProperty("log4j.logger.org.apache.pig", pigContext.getDefaultLogLevel().toString()); 23 | } 24 | 25 | + public boolean isDebugOn() { 26 | + if (Logger.getLogger("org.apache.pig").getLevel() == Level.DEBUG) { 27 | + return true; 28 | + } 29 | + return false; 30 | + } 31 | /** 32 | * Set the default parallelism for this job 33 | * @param p default number of reducers to use for this job. 34 | @@ -403,6 +411,10 @@ 35 | pigContext.defaultParallel = p; 36 | } 37 | 38 | + public int getDefaultParallel() { 39 | + return pigContext.defaultParallel; 40 | + } 41 | + 42 | /** 43 | * Starts batch execution mode. 44 | */ 45 | @@ -547,6 +559,10 @@ 46 | pigContext.addPathToSkip(path); 47 | } 48 | 49 | + public String getPathToSkip() { 50 | + return StringUtils.join(pigContext.getPathsToSkip(), ","); 51 | + } 52 | + 53 | /** 54 | * Defines an alias for the given function spec. This 55 | * is useful for functions that require arguments to the 56 | @@ -956,6 +972,10 @@ 57 | jobName = PigContext.JOB_NAME_PREFIX + ":" + name; 58 | } 59 | 60 | + public String getJobName() { 61 | + return jobName; 62 | + } 63 | + 64 | /** 65 | * Set Hadoop job priority. This value will get translated to mapred.job.priority. 66 | * @param priority valid values are found in {@link org.apache.hadoop.mapred.JobPriority} 67 | @@ -964,6 +984,10 @@ 68 | jobPriority = priority; 69 | } 70 | 71 | + public String getJobPriority() { 72 | + return jobPriority; 73 | + } 74 | + 75 | /** 76 | * Executes a Pig Latin script up to and including indicated alias. That is, if a user does: 77 | *
 78 | Index: src/org/apache/pig/tools/grunt/GruntParser.java
 79 | ===================================================================
 80 | --- src/org/apache/pig/tools/grunt/GruntParser.java	(revision 1785219)
 81 | +++ src/org/apache/pig/tools/grunt/GruntParser.java	(working copy)
 82 | @@ -572,44 +572,74 @@
 83 |      protected void processSet(String key, String value) throws IOException, ParseException {
 84 |          filter.validate(PigCommandFilter.Command.SET);
 85 |          key = parameterSubstitutionInGrunt(key);
 86 | -        value = parameterSubstitutionInGrunt(value);
 87 | +        if (value != null) {
 88 | +            value = parameterSubstitutionInGrunt(value);
 89 | +        }
 90 |          if (key.equals("debug"))
 91 |          {
 92 | -            if (value.equals("on"))
 93 | -                mPigServer.debugOn();
 94 | -            else if (value.equals("off"))
 95 | -                mPigServer.debugOff();
 96 | -            else
 97 | -                throw new ParseException("Invalid value " + value + " provided for " + key);
 98 | +            if (value == null) {
 99 | +                System.out.println("debug=" + mPigServer.isDebugOn());
100 | +            } else {
101 | +                if (value.equals("on"))
102 | +                    mPigServer.debugOn();
103 | +                else if (value.equals("off"))
104 | +                    mPigServer.debugOff();
105 | +                else
106 | +                    throw new ParseException("Invalid value " + value + " provided for " + key);
107 | +            }
108 |          }
109 |          else if (key.equals("job.name"))
110 |          {
111 | -            mPigServer.setJobName(value);
112 | +            if (value == null) {
113 | +                System.out.println("job.name=" + mPigServer.getJobName());
114 | +            } else {
115 | +                mPigServer.setJobName(value);
116 | +            }
117 |          }
118 |          else if (key.equals("job.priority"))
119 |          {
120 | -            mPigServer.setJobPriority(value);
121 | +            if (value == null) {
122 | +                System.out.println("job.priority=" + mPigServer.getJobPriority());
123 | +            } else {
124 | +                mPigServer.setJobPriority(value);
125 | +            }
126 |          }
127 |          else if (key.equals("stream.skippath")) {
128 | -            // Validate
129 | -            File file = new File(value);
130 | -            if (!file.exists() || file.isDirectory()) {
131 | -                throw new IOException("Invalid value for stream.skippath:" +
132 | -                                      value);
133 | +            if (value == null) {
134 | +                System.out.println("stream.skippath=" + mPigServer.getPathToSkip());
135 | +            } else {
136 | +                // Validate
137 | +                File file = new File(value);
138 | +                if (!file.exists() || file.isDirectory()) {
139 | +                    throw new IOException("Invalid value for stream.skippath:" +
140 | +                                          value);
141 | +                }
142 | +                mPigServer.addPathToSkip(value);
143 |              }
144 | -            mPigServer.addPathToSkip(value);
145 |          }
146 |          else if (key.equals("default_parallel")) {
147 | -            // Validate
148 | -            try {
149 | -                mPigServer.setDefaultParallel(Integer.parseInt(value));
150 | -            } catch (NumberFormatException e) {
151 | -                throw new ParseException("Invalid value for default_parallel");
152 | +            if (value == null) {
153 | +                System.out.println("default_parallel=" + mPigServer.getDefaultParallel());
154 | +            } else {
155 | +                // Validate
156 | +                try {
157 | +                    mPigServer.setDefaultParallel(Integer.parseInt(value));
158 | +                } catch (NumberFormatException e) {
159 | +                    throw new ParseException("Invalid value for default_parallel");
160 | +                }
161 |              }
162 |          }
163 |          else
164 |          {
165 | -           mPigServer.getPigContext().getExecutionEngine().setProperty(key, value);
166 | +            if (value == null) {
167 | +                if (mPigServer.getPigContext().getProperties().get(key) != null) {
168 | +                    System.out.println(key + "=" + mPigServer.getPigContext().getProperties().get(key));
169 | +                } else {
170 | +                    System.out.println(key + " is undefined");
171 | +                }
172 | +            } else {
173 | +                mPigServer.getPigContext().getExecutionEngine().setProperty(key, value);
174 | +            }
175 |          }
176 |      }
177 |  
178 | Index: src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj
179 | ===================================================================
180 | --- src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj	(revision 1785219)
181 | +++ src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj	(working copy)
182 | @@ -132,7 +132,13 @@
183 |  		else
184 |  			return s;
185 |  	}
186 | -
187 | +	static boolean eolOrSemicolon(int kind)
188 | +    {
189 | +        if (kind == EOL || kind == SEMICOLON) {
190 | +            return true;
191 | +        }
192 | +        return false;
193 | +    }
194 |  }
195 |  PARSER_END(PigScriptParser)
196 |  
197 | @@ -626,8 +632,8 @@
198 |  	
199 |  	(
200 |  		t1 = GetKey()
201 | -		t2 = GetValue()
202 | -		{processSet(t1.image, unquote(t2.image));}
203 | +		t2 = GetValueOrNull()
204 | +		{processSet(t1.image, eolOrSemicolon(t2.kind)?null:unquote(t2.image));}
205 |      	|
206 |  		{processSet();}
207 |  	)
208 | @@ -828,6 +834,21 @@
209 |  	{return t;}
210 |  }
211 |  
212 | +Token GetValueOrNull() :
213 | +{
214 | +	Token t;
215 | +}
216 | +{
217 | +    (
218 | +    t = GetValue()
219 | +    |
220 | +    t = <EOL>
221 | +    |
222 | +    t = <SEMICOLON>
223 | +    )
224 | +    {return t;}
225 | +}
226 | +
227 |  Token GetValue() :
228 |  {
229 |  	Token t;
230 | Index: test/org/apache/pig/test/TestGrunt.java
231 | ===================================================================
232 | --- test/org/apache/pig/test/TestGrunt.java	(revision 1785219)
233 | +++ test/org/apache/pig/test/TestGrunt.java	(working copy)
234 | @@ -24,6 +24,7 @@
235 |  
236 |  import java.io.BufferedReader;
237 |  import java.io.ByteArrayInputStream;
238 | +import java.io.ByteArrayOutputStream;
239 |  import java.io.File;
240 |  import java.io.FileInputStream;
241 |  import java.io.FileReader;
242 | @@ -31,6 +32,7 @@
243 |  import java.io.FilenameFilter;
244 |  import java.io.InputStream;
245 |  import java.io.InputStreamReader;
246 | +import java.io.PrintStream;
247 |  import java.io.PrintWriter;
248 |  import java.io.StringReader;
249 |  import java.util.ArrayList;
250 | @@ -1430,6 +1432,14 @@
251 |          new Grunt(new BufferedReader(reader), pc).exec();
252 |  
253 |          assertEquals("my.arbitrary.value",  pc.getProperties().getProperty("my.arbitrary.key"));
254 | +
255 | +        ByteArrayOutputStream baos = new ByteArrayOutputStream();
256 | +        System.setOut(new PrintStream(baos));
257 | +        strCmd = "set my.arbitrary.key\n";
258 | +        reader = new InputStreamReader(new ByteArrayInputStream(strCmd.getBytes()));
259 | +        new Grunt(new BufferedReader(reader), pc).exec();
260 | +
261 | +        assertEquals(baos.toString(), "my.arbitrary.key=my.arbitrary.value\n");
262 |      }
263 |  
264 |      @Test
265 | 
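
Taken together, set.patch turns a bare set <key> into a getter throughout Grunt: debug, job.name, job.priority, stream.skippath, and default_parallel print their current values through the new PigServer getters, and any other key falls back to the properties map (or reports "<key> is undefined"). The grammar change makes the value token optional by also accepting an EOL or semicolon after the key, and the TestGrunt addition pins the behavior: after set my.arbitrary.key my.arbitrary.value, a bare set my.arbitrary.key prints my.arbitrary.key=my.arbitrary.value.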


--------------------------------------------------------------------------------
/week3/hive/tpcds/all-tables-orc.sql:
--------------------------------------------------------------------------------
  1 | create database tpcds;
  2 | use tpcds;
  3 | 
  4 | drop table if exists call_center;
  5 | 
  6 | create table call_center(
  7 |       cc_call_center_sk         bigint               
  8 | ,     cc_call_center_id         string              
  9 | ,     cc_rec_start_date        string                         
 10 | ,     cc_rec_end_date          string                         
 11 | ,     cc_closed_date_sk         bigint                       
 12 | ,     cc_open_date_sk           bigint                       
 13 | ,     cc_name                   string                   
 14 | ,     cc_class                  string                   
 15 | ,     cc_employees              int                       
 16 | ,     cc_sq_ft                  int                       
 17 | ,     cc_hours                  string                      
 18 | ,     cc_manager                string                   
 19 | ,     cc_mkt_id                 int                       
 20 | ,     cc_mkt_class              string                      
 21 | ,     cc_mkt_desc               string                  
 22 | ,     cc_market_manager         string                   
 23 | ,     cc_division               int                       
 24 | ,     cc_division_name          string                   
 25 | ,     cc_company                int                       
 26 | ,     cc_company_name           string                      
 27 | ,     cc_street_number          string                      
 28 | ,     cc_street_name            string                   
 29 | ,     cc_street_type            string                      
 30 | ,     cc_suite_number           string                      
 31 | ,     cc_city                   string                   
 32 | ,     cc_county                 string                   
 33 | ,     cc_state                  string                       
 34 | ,     cc_zip                    string                      
 35 | ,     cc_country                string                   
 36 | ,     cc_gmt_offset             double                  
 37 | ,     cc_tax_percentage         double
 38 | )
 39 | stored as orc;
 40 | 
 41 | drop table if exists catalog_page;
 42 | 
 43 | create table catalog_page(
 44 |       cp_catalog_page_sk        bigint               
 45 | ,     cp_catalog_page_id        string              
 46 | ,     cp_start_date_sk          bigint                       
 47 | ,     cp_end_date_sk            bigint                       
 48 | ,     cp_department             string                   
 49 | ,     cp_catalog_number         int                       
 50 | ,     cp_catalog_page_number    int                       
 51 | ,     cp_description            string                  
 52 | ,     cp_type                   string
 53 | )
 54 | stored as orc;
 55 | 
 56 | drop table if exists catalog_returns;
 57 | 
 58 | create table catalog_returns
 59 | (
 60 |     cr_returned_time_sk       bigint,
 61 |     cr_item_sk                bigint,
 62 |     cr_refunded_customer_sk   bigint,
 63 |     cr_refunded_cdemo_sk      bigint,
 64 |     cr_refunded_hdemo_sk      bigint,
 65 |     cr_refunded_addr_sk       bigint,
 66 |     cr_returning_customer_sk  bigint,
 67 |     cr_returning_cdemo_sk     bigint,
 68 |     cr_returning_hdemo_sk     bigint,
 69 |     cr_returning_addr_sk      bigint,
 70 |     cr_call_center_sk         bigint,
 71 |     cr_catalog_page_sk        bigint,
 72 |     cr_ship_mode_sk           bigint,
 73 |     cr_warehouse_sk           bigint,
 74 |     cr_reason_sk              bigint,
 75 |     cr_order_number           bigint,
 76 |     cr_return_quantity        int,
 77 |     cr_return_amount          double,
 78 |     cr_return_tax             double,
 79 |     cr_return_amt_inc_tax     double,
 80 |     cr_fee                    double,
 81 |     cr_return_ship_cost       double,
 82 |     cr_refunded_cash          double,
 83 |     cr_reversed_charge        double,
 84 |     cr_store_credit           double,
 85 |     cr_net_loss               double
 86 | )
 87 | partitioned by (cr_returned_date_sk bigint)
 88 | stored as orc;
 89 | 
 90 | drop table if exists catalog_sales;
 91 | 
 92 | create table catalog_sales
 93 | (
 94 |     cs_sold_time_sk           bigint,
 95 |     cs_ship_date_sk           bigint,
 96 |     cs_bill_customer_sk       bigint,
 97 |     cs_bill_cdemo_sk          bigint,
 98 |     cs_bill_hdemo_sk          bigint,
 99 |     cs_bill_addr_sk           bigint,
100 |     cs_ship_customer_sk       bigint,
101 |     cs_ship_cdemo_sk          bigint,
102 |     cs_ship_hdemo_sk          bigint,
103 |     cs_ship_addr_sk           bigint,
104 |     cs_call_center_sk         bigint,
105 |     cs_catalog_page_sk        bigint,
106 |     cs_ship_mode_sk           bigint,
107 |     cs_warehouse_sk           bigint,
108 |     cs_item_sk                bigint,
109 |     cs_promo_sk               bigint,
110 |     cs_order_number           bigint,
111 |     cs_quantity               int,
112 |     cs_wholesale_cost         double,
113 |     cs_list_price             double,
114 |     cs_sales_price            double,
115 |     cs_ext_discount_amt       double,
116 |     cs_ext_sales_price        double,
117 |     cs_ext_wholesale_cost     double,
118 |     cs_ext_list_price         double,
119 |     cs_ext_tax                double,
120 |     cs_coupon_amt             double,
121 |     cs_ext_ship_cost          double,
122 |     cs_net_paid               double,
123 |     cs_net_paid_inc_tax       double,
124 |     cs_net_paid_inc_ship      double,
125 |     cs_net_paid_inc_ship_tax  double,
126 |     cs_net_profit             double
127 | )
128 | partitioned by (cs_sold_date_sk bigint)
129 | stored as orc;
130 | 
131 | drop table if exists customer_address;
132 | 
133 | create table customer_address
134 | (
135 |     ca_address_sk             bigint,
136 |     ca_address_id             string,
137 |     ca_street_number          string,
138 |     ca_street_name            string,
139 |     ca_street_type            string,
140 |     ca_suite_number           string,
141 |     ca_city                   string,
142 |     ca_county                 string,
143 |     ca_state                  string,
144 |     ca_zip                    string,
145 |     ca_country                string,
146 |     ca_gmt_offset             double,
147 |     ca_location_type          string
148 | )
149 | stored as orc;
150 | 
151 | drop table if exists customer_demographics;
152 | 
153 | create table customer_demographics
154 | (
155 |     cd_demo_sk                bigint,
156 |     cd_gender                 string,
157 |     cd_marital_status         string,
158 |     cd_education_status       string,
159 |     cd_purchase_estimate      int,
160 |     cd_credit_rating          string,
161 |     cd_dep_count              int,
162 |     cd_dep_employed_count     int,
163 |     cd_dep_college_count      int 
164 | )
165 | stored as orc;
166 | 
167 | drop table if exists customer;
168 | 
169 | create table customer
170 | (
171 |     c_customer_sk             bigint,
172 |     c_customer_id             string,
173 |     c_current_cdemo_sk        bigint,
174 |     c_current_hdemo_sk        bigint,
175 |     c_current_addr_sk         bigint,
176 |     c_first_shipto_date_sk    bigint,
177 |     c_first_sales_date_sk     bigint,
178 |     c_salutation              string,
179 |     c_first_name              string,
180 |     c_last_name               string,
181 |     c_preferred_cust_flag     string,
182 |     c_birth_day               int,
183 |     c_birth_month             int,
184 |     c_birth_year              int,
185 |     c_birth_country           string,
186 |     c_login                   string,
187 |     c_email_address           string,
188 |     c_last_review_date        string
189 | )
190 | stored as orc;
191 | 
192 | drop table if exists date_dim;
193 | 
194 | create table date_dim
195 | (
196 |     d_date_sk                 bigint,
197 |     d_date_id                 string,
198 |     d_date                    string,
199 |     d_month_seq               int,
200 |     d_week_seq                int,
201 |     d_quarter_seq             int,
202 |     d_year                    int,
203 |     d_dow                     int,
204 |     d_moy                     int,
205 |     d_dom                     int,
206 |     d_qoy                     int,
207 |     d_fy_year                 int,
208 |     d_fy_quarter_seq          int,
209 |     d_fy_week_seq             int,
210 |     d_day_name                string,
211 |     d_quarter_name            string,
212 |     d_holiday                 string,
213 |     d_weekend                 string,
214 |     d_following_holiday       string,
215 |     d_first_dom               int,
216 |     d_last_dom                int,
217 |     d_same_day_ly             int,
218 |     d_same_day_lq             int,
219 |     d_current_day             string,
220 |     d_current_week            string,
221 |     d_current_month           string,
222 |     d_current_quarter         string,
223 |     d_current_year            string 
224 | )
225 | stored as orc;
226 | 
227 | drop table if exists household_demographics;
228 | 
229 | create table household_demographics
230 | (
231 |     hd_demo_sk                bigint,
232 |     hd_income_band_sk         bigint,
233 |     hd_buy_potential          string,
234 |     hd_dep_count              int,
235 |     hd_vehicle_count          int
236 | )
237 | stored as orc;
238 | 
239 | drop table if exists income_band;
240 | 
241 | create table income_band(
242 |       ib_income_band_sk         bigint               
243 | ,     ib_lower_bound            int                       
244 | ,     ib_upper_bound            int
245 | )
246 | stored as orc;
247 | 
248 | drop table if exists inventory;
249 | 
250 | create table inventory
251 | (
252 |     inv_item_sk			bigint,
253 |     inv_warehouse_sk		bigint,
254 |     inv_quantity_on_hand	int
255 | )
256 | partitioned by (inv_date_sk bigint)
257 | stored as orc;
258 | 
259 | drop table if exists item;
260 | 
261 | create table item
262 | (
263 |     i_item_sk                 bigint,
264 |     i_item_id                 string,
265 |     i_rec_start_date          string,
266 |     i_rec_end_date            string,
267 |     i_item_desc               string,
268 |     i_current_price           double,
269 |     i_wholesale_cost          double,
270 |     i_brand_id                int,
271 |     i_brand                   string,
272 |     i_class_id                int,
273 |     i_class                   string,
274 |     i_category_id             int,
275 |     i_category                string,
276 |     i_manufact_id             int,
277 |     i_manufact                string,
278 |     i_size                    string,
279 |     i_formulation             string,
280 |     i_color                   string,
281 |     i_units                   string,
282 |     i_container               string,
283 |     i_manager_id              int,
284 |     i_product_name            string
285 | )
286 | stored as orc;
287 | 
288 | drop table if exists promotion;
289 | 
290 | create table promotion
291 | (
292 |     p_promo_sk                bigint,
293 |     p_promo_id                string,
294 |     p_start_date_sk           bigint,
295 |     p_end_date_sk             bigint,
296 |     p_item_sk                 bigint,
297 |     p_cost                    double,
298 |     p_response_target         int,
299 |     p_promo_name              string,
300 |     p_channel_dmail           string,
301 |     p_channel_email           string,
302 |     p_channel_catalog         string,
303 |     p_channel_tv              string,
304 |     p_channel_radio           string,
305 |     p_channel_press           string,
306 |     p_channel_event           string,
307 |     p_channel_demo            string,
308 |     p_channel_details         string,
309 |     p_purpose                 string,
310 |     p_discount_active         string 
311 | )
312 | stored as orc;
313 | 
314 | drop table if exists reason;
315 | 
316 | create table reason(
317 |       r_reason_sk               bigint               
318 | ,     r_reason_id               string              
319 | ,     r_reason_desc             string                
320 | )
321 | stored as orc;
322 | 
323 | drop table if exists ship_mode;
324 | 
325 | create table ship_mode(
326 |       sm_ship_mode_sk           bigint               
327 | ,     sm_ship_mode_id           string              
328 | ,     sm_type                   string                      
329 | ,     sm_code                   string                      
330 | ,     sm_carrier                string                      
331 | ,     sm_contract               string                      
332 | )
333 | stored as orc;
334 | 
335 | drop table if exists store_returns;
336 | 
337 | create table store_returns
338 | (
339 |     sr_return_time_sk         bigint,
340 |     sr_item_sk                bigint,
341 |     sr_customer_sk            bigint,
342 |     sr_cdemo_sk               bigint,
343 |     sr_hdemo_sk               bigint,
344 |     sr_addr_sk                bigint,
345 |     sr_store_sk               bigint,
346 |     sr_reason_sk              bigint,
347 |     sr_ticket_number          bigint,
348 |     sr_return_quantity        int,
349 |     sr_return_amt             double,
350 |     sr_return_tax             double,
351 |     sr_return_amt_inc_tax     double,
352 |     sr_fee                    double,
353 |     sr_return_ship_cost       double,
354 |     sr_refunded_cash          double,
355 |     sr_reversed_charge        double,
356 |     sr_store_credit           double,
357 |     sr_net_loss               double
358 | )
359 | partitioned by (sr_returned_date_sk bigint)
360 | stored as orc;
361 | 
362 | drop table if exists store_sales;
363 | 
364 | create table store_sales
365 | (
366 |     ss_sold_time_sk           bigint,
367 |     ss_item_sk                bigint,
368 |     ss_customer_sk            bigint,
369 |     ss_cdemo_sk               bigint,
370 |     ss_hdemo_sk               bigint,
371 |     ss_addr_sk                bigint,
372 |     ss_store_sk               bigint,
373 |     ss_promo_sk               bigint,
374 |     ss_ticket_number          bigint,
375 |     ss_quantity               int,
376 |     ss_wholesale_cost         double,
377 |     ss_list_price             double,
378 |     ss_sales_price            double,
379 |     ss_ext_discount_amt       double,
380 |     ss_ext_sales_price        double,
381 |     ss_ext_wholesale_cost     double,
382 |     ss_ext_list_price         double,
383 |     ss_ext_tax                double,
384 |     ss_coupon_amt             double,
385 |     ss_net_paid               double,
386 |     ss_net_paid_inc_tax       double,
387 |     ss_net_profit             double
388 | )
389 | partitioned by (ss_sold_date_sk bigint)
390 | stored as orc;
391 | 
392 | drop table if exists store;
393 | 
394 | create table store
395 | (
396 |     s_store_sk                bigint,
397 |     s_store_id                string,
398 |     s_rec_start_date          string,
399 |     s_rec_end_date            string,
400 |     s_closed_date_sk          bigint,
401 |     s_store_name              string,
402 |     s_number_employees        int,
403 |     s_floor_space             int,
404 |     s_hours                   string,
405 |     s_manager                 string,
406 |     s_market_id               int,
407 |     s_geography_class         string,
408 |     s_market_desc             string,
409 |     s_market_manager          string,
410 |     s_division_id             int,
411 |     s_division_name           string,
412 |     s_company_id              int,
413 |     s_company_name            string,
414 |     s_street_number           string,
415 |     s_street_name             string,
416 |     s_street_type             string,
417 |     s_suite_number            string,
418 |     s_city                    string,
419 |     s_county                  string,
420 |     s_state                   string,
421 |     s_zip                     string,
422 |     s_country                 string,
423 |     s_gmt_offset              double,
424 |     s_tax_precentage          double                  
425 | )
426 | stored as orc;
427 | 
428 | drop table if exists time_dim;
429 | 
430 | create table time_dim
431 | (
432 |     t_time_sk                 bigint,
433 |     t_time_id                 string,
434 |     t_time                    int,
435 |     t_hour                    int,
436 |     t_minute                  int,
437 |     t_second                  int,
438 |     t_am_pm                   string,
439 |     t_shift                   string,
440 |     t_sub_shift               string,
441 |     t_meal_time               string
442 | )
443 | stored as orc;
444 | 
445 | drop table if exists warehouse;
446 | 
447 | create table warehouse(
448 |       w_warehouse_sk            bigint               
449 | ,     w_warehouse_id            string              
450 | ,     w_warehouse_name          string                   
451 | ,     w_warehouse_sq_ft         int                       
452 | ,     w_street_number           string                      
453 | ,     w_street_name             string                   
454 | ,     w_street_type             string                      
455 | ,     w_suite_number            string                      
456 | ,     w_city                    string                   
457 | ,     w_county                  string                   
458 | ,     w_state                   string                       
459 | ,     w_zip                     string                      
460 | ,     w_country                 string                   
461 | ,     w_gmt_offset              double                  
462 | )
463 | stored as orc;
464 | 
465 | drop table if exists web_page;
466 | 
467 | create table web_page(
468 |       wp_web_page_sk            bigint               
469 | ,     wp_web_page_id            string              
470 | ,     wp_rec_start_date        string                         
471 | ,     wp_rec_end_date          string                         
472 | ,     wp_creation_date_sk       bigint                       
473 | ,     wp_access_date_sk         bigint                       
474 | ,     wp_autogen_flag           string                       
475 | ,     wp_customer_sk            bigint                       
476 | ,     wp_url                    string                  
477 | ,     wp_type                   string                      
478 | ,     wp_char_count             int                       
479 | ,     wp_link_count             int                       
480 | ,     wp_image_count            int                       
481 | ,     wp_max_ad_count           int
482 | )
483 | stored as orc;
484 | 
485 | drop table if exists web_returns;
486 | 
487 | create table web_returns
488 | (
489 |     wr_returned_time_sk       bigint,
490 |     wr_item_sk                bigint,
491 |     wr_refunded_customer_sk   bigint,
492 |     wr_refunded_cdemo_sk      bigint,
493 |     wr_refunded_hdemo_sk      bigint,
494 |     wr_refunded_addr_sk       bigint,
495 |     wr_returning_customer_sk  bigint,
496 |     wr_returning_cdemo_sk     bigint,
497 |     wr_returning_hdemo_sk     bigint,
498 |     wr_returning_addr_sk      bigint,
499 |     wr_web_page_sk            bigint,
500 |     wr_reason_sk              bigint,
501 |     wr_order_number           bigint,
502 |     wr_return_quantity        int,
503 |     wr_return_amt             double,
504 |     wr_return_tax             double,
505 |     wr_return_amt_inc_tax     double,
506 |     wr_fee                    double,
507 |     wr_return_ship_cost       double,
508 |     wr_refunded_cash          double,
509 |     wr_reversed_charge        double,
510 |     wr_account_credit         double,
511 |     wr_net_loss               double
512 | )
513 | partitioned by (wr_returned_date_sk bigint)
514 | stored as orc;
515 | 
516 | drop table if exists web_sales;
517 | 
518 | create table web_sales
519 | (
520 |     ws_sold_time_sk           bigint,
521 |     ws_ship_date_sk           bigint,
522 |     ws_item_sk                bigint,
523 |     ws_bill_customer_sk       bigint,
524 |     ws_bill_cdemo_sk          bigint,
525 |     ws_bill_hdemo_sk          bigint,
526 |     ws_bill_addr_sk           bigint,
527 |     ws_ship_customer_sk       bigint,
528 |     ws_ship_cdemo_sk          bigint,
529 |     ws_ship_hdemo_sk          bigint,
530 |     ws_ship_addr_sk           bigint,
531 |     ws_web_page_sk            bigint,
532 |     ws_web_site_sk            bigint,
533 |     ws_ship_mode_sk           bigint,
534 |     ws_warehouse_sk           bigint,
535 |     ws_promo_sk               bigint,
536 |     ws_order_number           bigint,
537 |     ws_quantity               int,
538 |     ws_wholesale_cost         double,
539 |     ws_list_price             double,
540 |     ws_sales_price            double,
541 |     ws_ext_discount_amt       double,
542 |     ws_ext_sales_price        double,
543 |     ws_ext_wholesale_cost     double,
544 |     ws_ext_list_price         double,
545 |     ws_ext_tax                double,
546 |     ws_coupon_amt             double,
547 |     ws_ext_ship_cost          double,
548 |     ws_net_paid               double,
549 |     ws_net_paid_inc_tax       double,
550 |     ws_net_paid_inc_ship      double,
551 |     ws_net_paid_inc_ship_tax  double,
552 |     ws_net_profit             double
553 | )
554 | partitioned by (ws_sold_date_sk bigint)
555 | stored as orc;
556 | 
557 | drop table if exists web_site;
558 | 
559 | create table web_site
560 | (
561 |     web_site_sk           bigint,
562 |     web_site_id           string,
563 |     web_rec_start_date    string,
564 |     web_rec_end_date      string,
565 |     web_name              string,
566 |     web_open_date_sk      bigint,
567 |     web_close_date_sk     bigint,
568 |     web_class             string,
569 |     web_manager           string,
570 |     web_mkt_id            int,
571 |     web_mkt_class         string,
572 |     web_mkt_desc          string,
573 |     web_market_manager    string,
574 |     web_company_id        int,
575 |     web_company_name      string,
576 |     web_street_number     string,
577 |     web_street_name       string,
578 |     web_street_type       string,
579 |     web_suite_number      string,
580 |     web_city              string,
581 |     web_county            string,
582 |     web_state             string,
583 |     web_zip               string,
584 |     web_country           string,
585 |     web_gmt_offset        double,
586 |     web_tax_percentage    double
587 | )
588 | stored as orc;
589 | 
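
Because every fact table above is partitioned on its *_date_sk surrogate key, a predicate on that column lets Hive prune to the matching partition directories rather than scan the whole ORC table. An illustrative query (the date_sk literal is made up, and assumes insert.sql has populated the table):

select count(*), sum(ss_net_paid)
from tpcds.store_sales
where ss_sold_date_sk = 2451545;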


--------------------------------------------------------------------------------
/week3/hive/tpcds/all-tables.sql:
--------------------------------------------------------------------------------
  1 | drop table if exists call_center;
  2 | 
  3 | create external table call_center(
  4 |       cc_call_center_sk         bigint               
  5 | ,     cc_call_center_id         string              
  6 | ,     cc_rec_start_date        string                         
  7 | ,     cc_rec_end_date          string                         
  8 | ,     cc_closed_date_sk         bigint                       
  9 | ,     cc_open_date_sk           bigint                       
 10 | ,     cc_name                   string                   
 11 | ,     cc_class                  string                   
 12 | ,     cc_employees              int                       
 13 | ,     cc_sq_ft                  int                       
 14 | ,     cc_hours                  string                      
 15 | ,     cc_manager                string                   
 16 | ,     cc_mkt_id                 int                       
 17 | ,     cc_mkt_class              string                      
 18 | ,     cc_mkt_desc               string                  
 19 | ,     cc_market_manager         string                   
 20 | ,     cc_division               int                       
 21 | ,     cc_division_name          string                   
 22 | ,     cc_company                int                       
 23 | ,     cc_company_name           string                      
 24 | ,     cc_street_number          string                      
 25 | ,     cc_street_name            string                   
 26 | ,     cc_street_type            string                      
 27 | ,     cc_suite_number           string                      
 28 | ,     cc_city                   string                   
 29 | ,     cc_county                 string                   
 30 | ,     cc_state                  string                       
 31 | ,     cc_zip                    string                      
 32 | ,     cc_country                string                   
 33 | ,     cc_gmt_offset             double                  
 34 | ,     cc_tax_percentage         double
 35 | )
 36 | row format delimited fields terminated by '|' 
 37 | location '/data/call_center';
 38 | 
 39 | drop table if exists catalog_page;
 40 | 
 41 | create external table catalog_page(
 42 |       cp_catalog_page_sk        bigint               
 43 | ,     cp_catalog_page_id        string              
 44 | ,     cp_start_date_sk          bigint                       
 45 | ,     cp_end_date_sk            bigint                       
 46 | ,     cp_department             string                   
 47 | ,     cp_catalog_number         int                       
 48 | ,     cp_catalog_page_number    int                       
 49 | ,     cp_description            string                  
 50 | ,     cp_type                   string
 51 | )
 52 | row format delimited fields terminated by '|' 
 53 | location '/data/catalog_page';
 54 | 
 55 | drop table if exists catalog_returns;
 56 | 
 57 | create external table catalog_returns
 58 | (
 59 |     cr_returned_date_sk       bigint,
 60 |     cr_returned_time_sk       bigint,
 61 |     cr_item_sk                bigint,
 62 |     cr_refunded_customer_sk   bigint,
 63 |     cr_refunded_cdemo_sk      bigint,
 64 |     cr_refunded_hdemo_sk      bigint,
 65 |     cr_refunded_addr_sk       bigint,
 66 |     cr_returning_customer_sk  bigint,
 67 |     cr_returning_cdemo_sk     bigint,
 68 |     cr_returning_hdemo_sk     bigint,
 69 |     cr_returning_addr_sk      bigint,
 70 |     cr_call_center_sk         bigint,
 71 |     cr_catalog_page_sk        bigint,
 72 |     cr_ship_mode_sk           bigint,
 73 |     cr_warehouse_sk           bigint,
 74 |     cr_reason_sk              bigint,
 75 |     cr_order_number           bigint,
 76 |     cr_return_quantity        int,
 77 |     cr_return_amount          double,
 78 |     cr_return_tax             double,
 79 |     cr_return_amt_inc_tax     double,
 80 |     cr_fee                    double,
 81 |     cr_return_ship_cost       double,
 82 |     cr_refunded_cash          double,
 83 |     cr_reversed_charge        double,
 84 |     cr_store_credit           double,
 85 |     cr_net_loss               double
 86 | )
 87 | row format delimited fields terminated by '|' 
 88 | location '/data/catalog_returns';
 89 | 
 90 | drop table if exists catalog_sales;
 91 | 
 92 | create external table catalog_sales
 93 | (
 94 |     cs_sold_date_sk           bigint,
 95 |     cs_sold_time_sk           bigint,
 96 |     cs_ship_date_sk           bigint,
 97 |     cs_bill_customer_sk       bigint,
 98 |     cs_bill_cdemo_sk          bigint,
 99 |     cs_bill_hdemo_sk          bigint,
100 |     cs_bill_addr_sk           bigint,
101 |     cs_ship_customer_sk       bigint,
102 |     cs_ship_cdemo_sk          bigint,
103 |     cs_ship_hdemo_sk          bigint,
104 |     cs_ship_addr_sk           bigint,
105 |     cs_call_center_sk         bigint,
106 |     cs_catalog_page_sk        bigint,
107 |     cs_ship_mode_sk           bigint,
108 |     cs_warehouse_sk           bigint,
109 |     cs_item_sk                bigint,
110 |     cs_promo_sk               bigint,
111 |     cs_order_number           bigint,
112 |     cs_quantity               int,
113 |     cs_wholesale_cost         double,
114 |     cs_list_price             double,
115 |     cs_sales_price            double,
116 |     cs_ext_discount_amt       double,
117 |     cs_ext_sales_price        double,
118 |     cs_ext_wholesale_cost     double,
119 |     cs_ext_list_price         double,
120 |     cs_ext_tax                double,
121 |     cs_coupon_amt             double,
122 |     cs_ext_ship_cost          double,
123 |     cs_net_paid               double,
124 |     cs_net_paid_inc_tax       double,
125 |     cs_net_paid_inc_ship      double,
126 |     cs_net_paid_inc_ship_tax  double,
127 |     cs_net_profit             double
128 | )
129 | row format delimited fields terminated by '|' 
130 | location '/data/catalog_sales';
131 | 
132 | drop table if exists customer_address;
133 | 
134 | create external table customer_address
135 | (
136 |     ca_address_sk             bigint,
137 |     ca_address_id             string,
138 |     ca_street_number          string,
139 |     ca_street_name            string,
140 |     ca_street_type            string,
141 |     ca_suite_number           string,
142 |     ca_city                   string,
143 |     ca_county                 string,
144 |     ca_state                  string,
145 |     ca_zip                    string,
146 |     ca_country                string,
147 |     ca_gmt_offset             double,
148 |     ca_location_type          string
149 | )
150 | row format delimited fields terminated by '|' 
151 | location '/data/customer_address';
152 | 
153 | drop table if exists customer_demographics;
154 | 
155 | create external table customer_demographics
156 | (
157 |     cd_demo_sk                bigint,
158 |     cd_gender                 string,
159 |     cd_marital_status         string,
160 |     cd_education_status       string,
161 |     cd_purchase_estimate      int,
162 |     cd_credit_rating          string,
163 |     cd_dep_count              int,
164 |     cd_dep_employed_count     int,
165 |     cd_dep_college_count      int 
166 | )
167 | row format delimited fields terminated by '|' 
168 | location '/data/customer_demographics';
169 | 
170 | drop table if exists customer;
171 | 
172 | create external table customer
173 | (
174 |     c_customer_sk             bigint,
175 |     c_customer_id             string,
176 |     c_current_cdemo_sk        bigint,
177 |     c_current_hdemo_sk        bigint,
178 |     c_current_addr_sk         bigint,
179 |     c_first_shipto_date_sk    bigint,
180 |     c_first_sales_date_sk     bigint,
181 |     c_salutation              string,
182 |     c_first_name              string,
183 |     c_last_name               string,
184 |     c_preferred_cust_flag     string,
185 |     c_birth_day               int,
186 |     c_birth_month             int,
187 |     c_birth_year              int,
188 |     c_birth_country           string,
189 |     c_login                   string,
190 |     c_email_address           string,
191 |     c_last_review_date        string
192 | )
193 | row format delimited fields terminated by '|' 
194 | location '/data/customer';
195 | 
196 | drop table if exists date_dim;
197 | 
198 | create external table date_dim
199 | (
200 |     d_date_sk                 bigint,
201 |     d_date_id                 string,
202 |     d_date                    string,
203 |     d_month_seq               int,
204 |     d_week_seq                int,
205 |     d_quarter_seq             int,
206 |     d_year                    int,
207 |     d_dow                     int,
208 |     d_moy                     int,
209 |     d_dom                     int,
210 |     d_qoy                     int,
211 |     d_fy_year                 int,
212 |     d_fy_quarter_seq          int,
213 |     d_fy_week_seq             int,
214 |     d_day_name                string,
215 |     d_quarter_name            string,
216 |     d_holiday                 string,
217 |     d_weekend                 string,
218 |     d_following_holiday       string,
219 |     d_first_dom               int,
220 |     d_last_dom                int,
221 |     d_same_day_ly             int,
222 |     d_same_day_lq             int,
223 |     d_current_day             string,
224 |     d_current_week            string,
225 |     d_current_month           string,
226 |     d_current_quarter         string,
227 |     d_current_year            string 
228 | )
229 | row format delimited fields terminated by '|' 
230 | location '/data/date_dim';
231 | 
232 | drop table if exists household_demographics;
233 | 
234 | create external table household_demographics
235 | (
236 |     hd_demo_sk                bigint,
237 |     hd_income_band_sk         bigint,
238 |     hd_buy_potential          string,
239 |     hd_dep_count              int,
240 |     hd_vehicle_count          int
241 | )
242 | row format delimited fields terminated by '|' 
243 | location '/data/household_demographics';
244 | 
245 | drop table if exists income_band;
246 | 
247 | create external table income_band(
248 |       ib_income_band_sk         bigint               
249 | ,     ib_lower_bound            int                       
250 | ,     ib_upper_bound            int
251 | )
252 | row format delimited fields terminated by '|' 
253 | location '/data/income_band';
254 | 
255 | drop table if exists inventory;
256 | 
257 | create external table inventory
258 | (
 259 |     inv_date_sk               bigint,
 260 |     inv_item_sk               bigint,
 261 |     inv_warehouse_sk          bigint,
 262 |     inv_quantity_on_hand      int
263 | )
264 | row format delimited fields terminated by '|' 
265 | location '/data/inventory';
266 | 
267 | drop table if exists item;
268 | 
269 | create external table item
270 | (
271 |     i_item_sk                 bigint,
272 |     i_item_id                 string,
273 |     i_rec_start_date          string,
274 |     i_rec_end_date            string,
275 |     i_item_desc               string,
276 |     i_current_price           double,
277 |     i_wholesale_cost          double,
278 |     i_brand_id                int,
279 |     i_brand                   string,
280 |     i_class_id                int,
281 |     i_class                   string,
282 |     i_category_id             int,
283 |     i_category                string,
284 |     i_manufact_id             int,
285 |     i_manufact                string,
286 |     i_size                    string,
287 |     i_formulation             string,
288 |     i_color                   string,
289 |     i_units                   string,
290 |     i_container               string,
291 |     i_manager_id              int,
292 |     i_product_name            string
293 | )
294 | row format delimited fields terminated by '|' 
295 | location '/data/item';
296 | 
297 | drop table if exists promotion;
298 | 
299 | create external table promotion
300 | (
301 |     p_promo_sk                bigint,
302 |     p_promo_id                string,
303 |     p_start_date_sk           bigint,
304 |     p_end_date_sk             bigint,
305 |     p_item_sk                 bigint,
306 |     p_cost                    double,
307 |     p_response_target         int,
308 |     p_promo_name              string,
309 |     p_channel_dmail           string,
310 |     p_channel_email           string,
311 |     p_channel_catalog         string,
312 |     p_channel_tv              string,
313 |     p_channel_radio           string,
314 |     p_channel_press           string,
315 |     p_channel_event           string,
316 |     p_channel_demo            string,
317 |     p_channel_details         string,
318 |     p_purpose                 string,
319 |     p_discount_active         string 
320 | )
321 | row format delimited fields terminated by '|' 
322 | location '/data/promotion';
323 | 
324 | drop table if exists reason;
325 | 
326 | create external table reason(
327 |       r_reason_sk               bigint               
328 | ,     r_reason_id               string              
329 | ,     r_reason_desc             string                
330 | )
331 | row format delimited fields terminated by '|' 
332 | location '/data/reason';
333 | 
334 | drop table if exists ship_mode;
335 | 
336 | create external table ship_mode(
337 |       sm_ship_mode_sk           bigint               
338 | ,     sm_ship_mode_id           string              
339 | ,     sm_type                   string                      
340 | ,     sm_code                   string                      
341 | ,     sm_carrier                string                      
342 | ,     sm_contract               string                      
343 | )
344 | row format delimited fields terminated by '|' 
345 | location '/data/ship_mode';
346 | 
347 | drop table if exists store_returns;
348 | 
349 | create external table store_returns
350 | (
351 |     sr_returned_date_sk       bigint,
352 |     sr_return_time_sk         bigint,
353 |     sr_item_sk                bigint,
354 |     sr_customer_sk            bigint,
355 |     sr_cdemo_sk               bigint,
356 |     sr_hdemo_sk               bigint,
357 |     sr_addr_sk                bigint,
358 |     sr_store_sk               bigint,
359 |     sr_reason_sk              bigint,
360 |     sr_ticket_number          bigint,
361 |     sr_return_quantity        int,
362 |     sr_return_amt             double,
363 |     sr_return_tax             double,
364 |     sr_return_amt_inc_tax     double,
365 |     sr_fee                    double,
366 |     sr_return_ship_cost       double,
367 |     sr_refunded_cash          double,
368 |     sr_reversed_charge        double,
369 |     sr_store_credit           double,
370 |     sr_net_loss               double             
371 | )
372 | row format delimited fields terminated by '|' 
373 | location '/data/store_returns';
374 | 
375 | drop table if exists store_sales;
376 | 
377 | create external table store_sales
378 | (
379 |     ss_sold_date_sk           bigint,
380 |     ss_sold_time_sk           bigint,
381 |     ss_item_sk                bigint,
382 |     ss_customer_sk            bigint,
383 |     ss_cdemo_sk               bigint,
384 |     ss_hdemo_sk               bigint,
385 |     ss_addr_sk                bigint,
386 |     ss_store_sk               bigint,
387 |     ss_promo_sk               bigint,
388 |     ss_ticket_number          bigint,
389 |     ss_quantity               int,
390 |     ss_wholesale_cost         double,
391 |     ss_list_price             double,
392 |     ss_sales_price            double,
393 |     ss_ext_discount_amt       double,
394 |     ss_ext_sales_price        double,
395 |     ss_ext_wholesale_cost     double,
396 |     ss_ext_list_price         double,
397 |     ss_ext_tax                double,
398 |     ss_coupon_amt             double,
399 |     ss_net_paid               double,
400 |     ss_net_paid_inc_tax       double,
401 |     ss_net_profit             double                  
402 | )
403 | row format delimited fields terminated by '|' 
404 | location '/data/store_sales';
405 | 
406 | drop table if exists store;
407 | 
408 | create external table store
409 | (
410 |     s_store_sk                bigint,
411 |     s_store_id                string,
412 |     s_rec_start_date          string,
413 |     s_rec_end_date            string,
414 |     s_closed_date_sk          bigint,
415 |     s_store_name              string,
416 |     s_number_employees        int,
417 |     s_floor_space             int,
418 |     s_hours                   string,
419 |     s_manager                 string,
420 |     s_market_id               int,
421 |     s_geography_class         string,
422 |     s_market_desc             string,
423 |     s_market_manager          string,
424 |     s_division_id             int,
425 |     s_division_name           string,
426 |     s_company_id              int,
427 |     s_company_name            string,
428 |     s_street_number           string,
429 |     s_street_name             string,
430 |     s_street_type             string,
431 |     s_suite_number            string,
432 |     s_city                    string,
433 |     s_county                  string,
434 |     s_state                   string,
435 |     s_zip                     string,
436 |     s_country                 string,
437 |     s_gmt_offset              double,
438 |     s_tax_precentage          double                  
439 | )
440 | row format delimited fields terminated by '|' 
441 | location '/data/store';
442 | 
443 | drop table if exists time_dim;
444 | 
445 | create external table time_dim
446 | (
447 |     t_time_sk                 bigint,
448 |     t_time_id                 string,
449 |     t_time                    int,
450 |     t_hour                    int,
451 |     t_minute                  int,
452 |     t_second                  int,
453 |     t_am_pm                   string,
454 |     t_shift                   string,
455 |     t_sub_shift               string,
456 |     t_meal_time               string
457 | )
458 | row format delimited fields terminated by '|' 
459 | location '/data/time_dim';
460 | 
461 | drop table if exists warehouse;
462 | 
463 | create external table warehouse(
464 |       w_warehouse_sk            bigint               
465 | ,     w_warehouse_id            string              
466 | ,     w_warehouse_name          string                   
467 | ,     w_warehouse_sq_ft         int                       
468 | ,     w_street_number           string                      
469 | ,     w_street_name             string                   
470 | ,     w_street_type             string                      
471 | ,     w_suite_number            string                      
472 | ,     w_city                    string                   
473 | ,     w_county                  string                   
474 | ,     w_state                   string                       
475 | ,     w_zip                     string                      
476 | ,     w_country                 string                   
477 | ,     w_gmt_offset              double                  
478 | )
479 | row format delimited fields terminated by '|' 
480 | location '/data/warehouse';
481 | 
482 | drop table if exists web_page;
483 | 
484 | create external table web_page(
485 |       wp_web_page_sk            bigint               
486 | ,     wp_web_page_id            string              
487 | ,     wp_rec_start_date        string                         
488 | ,     wp_rec_end_date          string                         
489 | ,     wp_creation_date_sk       bigint                       
490 | ,     wp_access_date_sk         bigint                       
491 | ,     wp_autogen_flag           string                       
492 | ,     wp_customer_sk            bigint                       
493 | ,     wp_url                    string                  
494 | ,     wp_type                   string                      
495 | ,     wp_char_count             int                       
496 | ,     wp_link_count             int                       
497 | ,     wp_image_count            int                       
498 | ,     wp_max_ad_count           int
499 | )
500 | row format delimited fields terminated by '|' 
501 | location '/data/web_page';
502 | 
503 | drop table if exists web_returns;
504 | 
505 | create external table web_returns
506 | (
507 |     wr_returned_date_sk       bigint,
508 |     wr_returned_time_sk       bigint,
509 |     wr_item_sk                bigint,
510 |     wr_refunded_customer_sk   bigint,
511 |     wr_refunded_cdemo_sk      bigint,
512 |     wr_refunded_hdemo_sk      bigint,
513 |     wr_refunded_addr_sk       bigint,
514 |     wr_returning_customer_sk  bigint,
515 |     wr_returning_cdemo_sk     bigint,
516 |     wr_returning_hdemo_sk     bigint,
517 |     wr_returning_addr_sk      bigint,
518 |     wr_web_page_sk            bigint,
519 |     wr_reason_sk              bigint,
520 |     wr_order_number           bigint,
521 |     wr_return_quantity        int,
522 |     wr_return_amt             double,
523 |     wr_return_tax             double,
524 |     wr_return_amt_inc_tax     double,
525 |     wr_fee                    double,
526 |     wr_return_ship_cost       double,
527 |     wr_refunded_cash          double,
528 |     wr_reversed_charge        double,
529 |     wr_account_credit         double,
530 |     wr_net_loss               double
531 | )
532 | row format delimited fields terminated by '|' 
533 | location '/data/web_returns';
534 | 
535 | drop table if exists web_sales;
536 | 
537 | create external table web_sales
538 | (
539 |     ws_sold_date_sk           bigint,
540 |     ws_sold_time_sk           bigint,
541 |     ws_ship_date_sk           bigint,
542 |     ws_item_sk                bigint,
543 |     ws_bill_customer_sk       bigint,
544 |     ws_bill_cdemo_sk          bigint,
545 |     ws_bill_hdemo_sk          bigint,
546 |     ws_bill_addr_sk           bigint,
547 |     ws_ship_customer_sk       bigint,
548 |     ws_ship_cdemo_sk          bigint,
549 |     ws_ship_hdemo_sk          bigint,
550 |     ws_ship_addr_sk           bigint,
551 |     ws_web_page_sk            bigint,
552 |     ws_web_site_sk            bigint,
553 |     ws_ship_mode_sk           bigint,
554 |     ws_warehouse_sk           bigint,
555 |     ws_promo_sk               bigint,
556 |     ws_order_number           bigint,
557 |     ws_quantity               int,
558 |     ws_wholesale_cost         double,
559 |     ws_list_price             double,
560 |     ws_sales_price            double,
561 |     ws_ext_discount_amt       double,
562 |     ws_ext_sales_price        double,
563 |     ws_ext_wholesale_cost     double,
564 |     ws_ext_list_price         double,
565 |     ws_ext_tax                double,
566 |     ws_coupon_amt             double,
567 |     ws_ext_ship_cost          double,
568 |     ws_net_paid               double,
569 |     ws_net_paid_inc_tax       double,
570 |     ws_net_paid_inc_ship      double,
571 |     ws_net_paid_inc_ship_tax  double,
572 |     ws_net_profit             double
573 | )
574 | row format delimited fields terminated by '|' 
575 | location '/data/web_sales';
576 | 
577 | drop table if exists web_site;
578 | 
579 | create external table web_site
580 | (
581 |     web_site_sk           bigint,
582 |     web_site_id           string,
583 |     web_rec_start_date    string,
584 |     web_rec_end_date      string,
585 |     web_name              string,
586 |     web_open_date_sk      bigint,
587 |     web_close_date_sk     bigint,
588 |     web_class             string,
589 |     web_manager           string,
590 |     web_mkt_id            int,
591 |     web_mkt_class         string,
592 |     web_mkt_desc          string,
593 |     web_market_manager    string,
594 |     web_company_id        int,
595 |     web_company_name      string,
596 |     web_street_number     string,
597 |     web_street_name       string,
598 |     web_street_type       string,
599 |     web_suite_number      string,
600 |     web_city              string,
601 |     web_county            string,
602 |     web_state             string,
603 |     web_zip               string,
604 |     web_country           string,
605 |     web_gmt_offset        double,
606 |     web_tax_percentage    double
607 | )
608 | row format delimited fields terminated by '|' 
609 | location '/data/web_site';
610 | 
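
A hypothetical smoke test for the tables above (not a file from this repo): because these are external tables over '|'-delimited files, a wrong upload path shows up as zero rows rather than an error, so a small fact-to-dimension join is a cheap sanity check. It assumes the database created at the top of this file and data already uploaded under /data:

-- illustrative HiveQL, not part of this script
select d_year, count(*) as sales_cnt
from store_sales
join date_dim on (ss_sold_date_sk = d_date_sk)
group by d_year;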


--------------------------------------------------------------------------------
/week3/hive/tpcds/all-tables-base.sql:
--------------------------------------------------------------------------------
  1 | drop database if exists tpcds_base cascade;
  2 | 
  3 | create database tpcds_base;
  4 | use tpcds_base;
  5 | 
  6 | drop table if exists call_center;
  7 | 
  8 | create external table call_center(
  9 |       cc_call_center_sk         bigint               
 10 | ,     cc_call_center_id         string              
 11 | ,     cc_rec_start_date        string                         
 12 | ,     cc_rec_end_date          string                         
 13 | ,     cc_closed_date_sk         bigint                       
 14 | ,     cc_open_date_sk           bigint                       
 15 | ,     cc_name                   string                   
 16 | ,     cc_class                  string                   
 17 | ,     cc_employees              int                       
 18 | ,     cc_sq_ft                  int                       
 19 | ,     cc_hours                  string                      
 20 | ,     cc_manager                string                   
 21 | ,     cc_mkt_id                 int                       
 22 | ,     cc_mkt_class              string                      
 23 | ,     cc_mkt_desc               string                  
 24 | ,     cc_market_manager         string                   
 25 | ,     cc_division               int                       
 26 | ,     cc_division_name          string                   
 27 | ,     cc_company                int                       
 28 | ,     cc_company_name           string                      
 29 | ,     cc_street_number          string                      
 30 | ,     cc_street_name            string                   
 31 | ,     cc_street_type            string                      
 32 | ,     cc_suite_number           string                      
 33 | ,     cc_city                   string                   
 34 | ,     cc_county                 string                   
 35 | ,     cc_state                  string                       
 36 | ,     cc_zip                    string                      
 37 | ,     cc_country                string                   
 38 | ,     cc_gmt_offset             double                  
 39 | ,     cc_tax_percentage         double
 40 | )
 41 | row format delimited fields terminated by '|' 
 42 | location '/data/call_center';
 43 | 
 44 | drop table if exists catalog_page;
 45 | 
 46 | create external table catalog_page(
 47 |       cp_catalog_page_sk        bigint               
 48 | ,     cp_catalog_page_id        string              
 49 | ,     cp_start_date_sk          bigint                       
 50 | ,     cp_end_date_sk            bigint                       
 51 | ,     cp_department             string                   
 52 | ,     cp_catalog_number         int                       
 53 | ,     cp_catalog_page_number    int                       
 54 | ,     cp_description            string                  
 55 | ,     cp_type                   string
 56 | )
 57 | row format delimited fields terminated by '|' 
 58 | location '/data/catalog_page';
 59 | 
 60 | drop table if exists catalog_returns;
 61 | 
 62 | create external table catalog_returns
 63 | (
 64 |     cr_returned_date_sk       bigint,
 65 |     cr_returned_time_sk       bigint,
 66 |     cr_item_sk                bigint,
 67 |     cr_refunded_customer_sk   bigint,
 68 |     cr_refunded_cdemo_sk      bigint,
 69 |     cr_refunded_hdemo_sk      bigint,
 70 |     cr_refunded_addr_sk       bigint,
 71 |     cr_returning_customer_sk  bigint,
 72 |     cr_returning_cdemo_sk     bigint,
 73 |     cr_returning_hdemo_sk     bigint,
 74 |     cr_returning_addr_sk      bigint,
 75 |     cr_call_center_sk         bigint,
 76 |     cr_catalog_page_sk        bigint,
 77 |     cr_ship_mode_sk           bigint,
 78 |     cr_warehouse_sk           bigint,
 79 |     cr_reason_sk              bigint,
 80 |     cr_order_number           bigint,
 81 |     cr_return_quantity        int,
 82 |     cr_return_amount          double,
 83 |     cr_return_tax             double,
 84 |     cr_return_amt_inc_tax     double,
 85 |     cr_fee                    double,
 86 |     cr_return_ship_cost       double,
 87 |     cr_refunded_cash          double,
 88 |     cr_reversed_charge        double,
 89 |     cr_store_credit           double,
 90 |     cr_net_loss               double
 91 | )
 92 | row format delimited fields terminated by '|' 
 93 | location '/data/catalog_returns';
 94 | 
 95 | drop table if exists catalog_sales;
 96 | 
 97 | create external table catalog_sales
 98 | (
 99 |     cs_sold_date_sk           bigint,
100 |     cs_sold_time_sk           bigint,
101 |     cs_ship_date_sk           bigint,
102 |     cs_bill_customer_sk       bigint,
103 |     cs_bill_cdemo_sk          bigint,
104 |     cs_bill_hdemo_sk          bigint,
105 |     cs_bill_addr_sk           bigint,
106 |     cs_ship_customer_sk       bigint,
107 |     cs_ship_cdemo_sk          bigint,
108 |     cs_ship_hdemo_sk          bigint,
109 |     cs_ship_addr_sk           bigint,
110 |     cs_call_center_sk         bigint,
111 |     cs_catalog_page_sk        bigint,
112 |     cs_ship_mode_sk           bigint,
113 |     cs_warehouse_sk           bigint,
114 |     cs_item_sk                bigint,
115 |     cs_promo_sk               bigint,
116 |     cs_order_number           bigint,
117 |     cs_quantity               int,
118 |     cs_wholesale_cost         double,
119 |     cs_list_price             double,
120 |     cs_sales_price            double,
121 |     cs_ext_discount_amt       double,
122 |     cs_ext_sales_price        double,
123 |     cs_ext_wholesale_cost     double,
124 |     cs_ext_list_price         double,
125 |     cs_ext_tax                double,
126 |     cs_coupon_amt             double,
127 |     cs_ext_ship_cost          double,
128 |     cs_net_paid               double,
129 |     cs_net_paid_inc_tax       double,
130 |     cs_net_paid_inc_ship      double,
131 |     cs_net_paid_inc_ship_tax  double,
132 |     cs_net_profit             double
133 | )
134 | row format delimited fields terminated by '|' 
135 | location '/data/catalog_sales';
136 | 
137 | drop table if exists customer_address;
138 | 
139 | create external table customer_address
140 | (
141 |     ca_address_sk             bigint,
142 |     ca_address_id             string,
143 |     ca_street_number          string,
144 |     ca_street_name            string,
145 |     ca_street_type            string,
146 |     ca_suite_number           string,
147 |     ca_city                   string,
148 |     ca_county                 string,
149 |     ca_state                  string,
150 |     ca_zip                    string,
151 |     ca_country                string,
152 |     ca_gmt_offset             double,
153 |     ca_location_type          string
154 | )
155 | row format delimited fields terminated by '|' 
156 | location '/data/customer_address';
157 | 
158 | drop table if exists customer_demographics;
159 | 
160 | create external table customer_demographics
161 | (
162 |     cd_demo_sk                bigint,
163 |     cd_gender                 string,
164 |     cd_marital_status         string,
165 |     cd_education_status       string,
166 |     cd_purchase_estimate      int,
167 |     cd_credit_rating          string,
168 |     cd_dep_count              int,
169 |     cd_dep_employed_count     int,
170 |     cd_dep_college_count      int 
171 | )
172 | row format delimited fields terminated by '|' 
173 | location '/data/customer_demographics';
174 | 
175 | drop table if exists customer;
176 | 
177 | create external table customer
178 | (
179 |     c_customer_sk             bigint,
180 |     c_customer_id             string,
181 |     c_current_cdemo_sk        bigint,
182 |     c_current_hdemo_sk        bigint,
183 |     c_current_addr_sk         bigint,
184 |     c_first_shipto_date_sk    bigint,
185 |     c_first_sales_date_sk     bigint,
186 |     c_salutation              string,
187 |     c_first_name              string,
188 |     c_last_name               string,
189 |     c_preferred_cust_flag     string,
190 |     c_birth_day               int,
191 |     c_birth_month             int,
192 |     c_birth_year              int,
193 |     c_birth_country           string,
194 |     c_login                   string,
195 |     c_email_address           string,
196 |     c_last_review_date        string
197 | )
198 | row format delimited fields terminated by '|' 
199 | location '/data/customer';
200 | 
201 | drop table if exists date_dim;
202 | 
203 | create external table date_dim
204 | (
205 |     d_date_sk                 bigint,
206 |     d_date_id                 string,
207 |     d_date                    string,
208 |     d_month_seq               int,
209 |     d_week_seq                int,
210 |     d_quarter_seq             int,
211 |     d_year                    int,
212 |     d_dow                     int,
213 |     d_moy                     int,
214 |     d_dom                     int,
215 |     d_qoy                     int,
216 |     d_fy_year                 int,
217 |     d_fy_quarter_seq          int,
218 |     d_fy_week_seq             int,
219 |     d_day_name                string,
220 |     d_quarter_name            string,
221 |     d_holiday                 string,
222 |     d_weekend                 string,
223 |     d_following_holiday       string,
224 |     d_first_dom               int,
225 |     d_last_dom                int,
226 |     d_same_day_ly             int,
227 |     d_same_day_lq             int,
228 |     d_current_day             string,
229 |     d_current_week            string,
230 |     d_current_month           string,
231 |     d_current_quarter         string,
232 |     d_current_year            string 
233 | )
234 | row format delimited fields terminated by '|' 
235 | location '/data/date_dim';
236 | 
237 | drop table if exists household_demographics;
238 | 
239 | create external table household_demographics
240 | (
241 |     hd_demo_sk                bigint,
242 |     hd_income_band_sk         bigint,
243 |     hd_buy_potential          string,
244 |     hd_dep_count              int,
245 |     hd_vehicle_count          int
246 | )
247 | row format delimited fields terminated by '|' 
248 | location '/data/household_demographics';
249 | 
250 | drop table if exists income_band;
251 | 
252 | create external table income_band(
253 |       ib_income_band_sk         bigint               
254 | ,     ib_lower_bound            int                       
255 | ,     ib_upper_bound            int
256 | )
257 | row format delimited fields terminated by '|' 
258 | location '/data/income_band';
259 | 
260 | drop table if exists inventory;
261 | 
262 | create external table inventory
263 | (
264 |     inv_date_sk               bigint,
265 |     inv_item_sk               bigint,
266 |     inv_warehouse_sk          bigint,
267 |     inv_quantity_on_hand      int
268 | )
269 | row format delimited fields terminated by '|' 
270 | location '/data/inventory';
271 | 
272 | drop table if exists item;
273 | 
274 | create external table item
275 | (
276 |     i_item_sk                 bigint,
277 |     i_item_id                 string,
278 |     i_rec_start_date          string,
279 |     i_rec_end_date            string,
280 |     i_item_desc               string,
281 |     i_current_price           double,
282 |     i_wholesale_cost          double,
283 |     i_brand_id                int,
284 |     i_brand                   string,
285 |     i_class_id                int,
286 |     i_class                   string,
287 |     i_category_id             int,
288 |     i_category                string,
289 |     i_manufact_id             int,
290 |     i_manufact                string,
291 |     i_size                    string,
292 |     i_formulation             string,
293 |     i_color                   string,
294 |     i_units                   string,
295 |     i_container               string,
296 |     i_manager_id              int,
297 |     i_product_name            string
298 | )
299 | row format delimited fields terminated by '|' 
300 | location '/data/item';
301 | 
302 | drop table if exists promotion;
303 | 
304 | create external table promotion
305 | (
306 |     p_promo_sk                bigint,
307 |     p_promo_id                string,
308 |     p_start_date_sk           bigint,
309 |     p_end_date_sk             bigint,
310 |     p_item_sk                 bigint,
311 |     p_cost                    double,
312 |     p_response_target         int,
313 |     p_promo_name              string,
314 |     p_channel_dmail           string,
315 |     p_channel_email           string,
316 |     p_channel_catalog         string,
317 |     p_channel_tv              string,
318 |     p_channel_radio           string,
319 |     p_channel_press           string,
320 |     p_channel_event           string,
321 |     p_channel_demo            string,
322 |     p_channel_details         string,
323 |     p_purpose                 string,
324 |     p_discount_active         string 
325 | )
326 | row format delimited fields terminated by '|' 
327 | location '/data/promotion';
328 | 
329 | drop table if exists reason;
330 | 
331 | create external table reason(
332 |       r_reason_sk               bigint               
333 | ,     r_reason_id               string              
334 | ,     r_reason_desc             string                
335 | )
336 | row format delimited fields terminated by '|' 
337 | location '/data/reason';
338 | 
339 | drop table if exists ship_mode;
340 | 
341 | create external table ship_mode(
342 |       sm_ship_mode_sk           bigint               
343 | ,     sm_ship_mode_id           string              
344 | ,     sm_type                   string                      
345 | ,     sm_code                   string                      
346 | ,     sm_carrier                string                      
347 | ,     sm_contract               string                      
348 | )
349 | row format delimited fields terminated by '|' 
350 | location '/data/ship_mode';
351 | 
352 | drop table if exists store_returns;
353 | 
354 | create external table store_returns
355 | (
356 |     sr_returned_date_sk       bigint,
357 |     sr_return_time_sk         bigint,
358 |     sr_item_sk                bigint,
359 |     sr_customer_sk            bigint,
360 |     sr_cdemo_sk               bigint,
361 |     sr_hdemo_sk               bigint,
362 |     sr_addr_sk                bigint,
363 |     sr_store_sk               bigint,
364 |     sr_reason_sk              bigint,
365 |     sr_ticket_number          bigint,
366 |     sr_return_quantity        int,
367 |     sr_return_amt             double,
368 |     sr_return_tax             double,
369 |     sr_return_amt_inc_tax     double,
370 |     sr_fee                    double,
371 |     sr_return_ship_cost       double,
372 |     sr_refunded_cash          double,
373 |     sr_reversed_charge        double,
374 |     sr_store_credit           double,
375 |     sr_net_loss               double             
376 | )
377 | row format delimited fields terminated by '|' 
378 | location '/data/store_returns';
379 | 
380 | drop table if exists store_sales;
381 | 
382 | create external table store_sales
383 | (
384 |     ss_sold_date_sk           bigint,
385 |     ss_sold_time_sk           bigint,
386 |     ss_item_sk                bigint,
387 |     ss_customer_sk            bigint,
388 |     ss_cdemo_sk               bigint,
389 |     ss_hdemo_sk               bigint,
390 |     ss_addr_sk                bigint,
391 |     ss_store_sk               bigint,
392 |     ss_promo_sk               bigint,
393 |     ss_ticket_number          bigint,
394 |     ss_quantity               int,
395 |     ss_wholesale_cost         double,
396 |     ss_list_price             double,
397 |     ss_sales_price            double,
398 |     ss_ext_discount_amt       double,
399 |     ss_ext_sales_price        double,
400 |     ss_ext_wholesale_cost     double,
401 |     ss_ext_list_price         double,
402 |     ss_ext_tax                double,
403 |     ss_coupon_amt             double,
404 |     ss_net_paid               double,
405 |     ss_net_paid_inc_tax       double,
406 |     ss_net_profit             double                  
407 | )
408 | row format delimited fields terminated by '|' 
409 | location '/data/store_sales';
410 | 
411 | drop table if exists store;
412 | 
413 | create external table store
414 | (
415 |     s_store_sk                bigint,
416 |     s_store_id                string,
417 |     s_rec_start_date          string,
418 |     s_rec_end_date            string,
419 |     s_closed_date_sk          bigint,
420 |     s_store_name              string,
421 |     s_number_employees        int,
422 |     s_floor_space             int,
423 |     s_hours                   string,
424 |     s_manager                 string,
425 |     s_market_id               int,
426 |     s_geography_class         string,
427 |     s_market_desc             string,
428 |     s_market_manager          string,
429 |     s_division_id             int,
430 |     s_division_name           string,
431 |     s_company_id              int,
432 |     s_company_name            string,
433 |     s_street_number           string,
434 |     s_street_name             string,
435 |     s_street_type             string,
436 |     s_suite_number            string,
437 |     s_city                    string,
438 |     s_county                  string,
439 |     s_state                   string,
440 |     s_zip                     string,
441 |     s_country                 string,
442 |     s_gmt_offset              double,
443 |     s_tax_precentage          double                  
444 | )
445 | row format delimited fields terminated by '|' 
446 | location '/data/store';
447 | 
448 | drop table if exists time_dim;
449 | 
450 | create external table time_dim
451 | (
452 |     t_time_sk                 bigint,
453 |     t_time_id                 string,
454 |     t_time                    int,
455 |     t_hour                    int,
456 |     t_minute                  int,
457 |     t_second                  int,
458 |     t_am_pm                   string,
459 |     t_shift                   string,
460 |     t_sub_shift               string,
461 |     t_meal_time               string
462 | )
463 | row format delimited fields terminated by '|' 
464 | location '/data/time_dim';
465 | 
466 | drop table if exists warehouse;
467 | 
468 | create external table warehouse(
469 |       w_warehouse_sk            bigint               
470 | ,     w_warehouse_id            string              
471 | ,     w_warehouse_name          string                   
472 | ,     w_warehouse_sq_ft         int                       
473 | ,     w_street_number           string                      
474 | ,     w_street_name             string                   
475 | ,     w_street_type             string                      
476 | ,     w_suite_number            string                      
477 | ,     w_city                    string                   
478 | ,     w_county                  string                   
479 | ,     w_state                   string                       
480 | ,     w_zip                     string                      
481 | ,     w_country                 string                   
482 | ,     w_gmt_offset              double                  
483 | )
484 | row format delimited fields terminated by '|' 
485 | location '/data/warehouse';
486 | 
487 | drop table if exists web_page;
488 | 
489 | create external table web_page(
490 |       wp_web_page_sk            bigint               
491 | ,     wp_web_page_id            string              
492 | ,     wp_rec_start_date        string                         
493 | ,     wp_rec_end_date          string                         
494 | ,     wp_creation_date_sk       bigint                       
495 | ,     wp_access_date_sk         bigint                       
496 | ,     wp_autogen_flag           string                       
497 | ,     wp_customer_sk            bigint                       
498 | ,     wp_url                    string                  
499 | ,     wp_type                   string                      
500 | ,     wp_char_count             int                       
501 | ,     wp_link_count             int                       
502 | ,     wp_image_count            int                       
503 | ,     wp_max_ad_count           int
504 | )
505 | row format delimited fields terminated by '|' 
506 | location '/data/web_page';
507 | 
508 | drop table if exists web_returns;
509 | 
510 | create external table web_returns
511 | (
512 |     wr_returned_date_sk       bigint,
513 |     wr_returned_time_sk       bigint,
514 |     wr_item_sk                bigint,
515 |     wr_refunded_customer_sk   bigint,
516 |     wr_refunded_cdemo_sk      bigint,
517 |     wr_refunded_hdemo_sk      bigint,
518 |     wr_refunded_addr_sk       bigint,
519 |     wr_returning_customer_sk  bigint,
520 |     wr_returning_cdemo_sk     bigint,
521 |     wr_returning_hdemo_sk     bigint,
522 |     wr_returning_addr_sk      bigint,
523 |     wr_web_page_sk            bigint,
524 |     wr_reason_sk              bigint,
525 |     wr_order_number           bigint,
526 |     wr_return_quantity        int,
527 |     wr_return_amt             double,
528 |     wr_return_tax             double,
529 |     wr_return_amt_inc_tax     double,
530 |     wr_fee                    double,
531 |     wr_return_ship_cost       double,
532 |     wr_refunded_cash          double,
533 |     wr_reversed_charge        double,
534 |     wr_account_credit         double,
535 |     wr_net_loss               double
536 | )
537 | row format delimited fields terminated by '|' 
538 | location '/data/web_returns';
539 | 
540 | drop table if exists web_sales;
541 | 
542 | create external table web_sales
543 | (
544 |     ws_sold_date_sk           bigint,
545 |     ws_sold_time_sk           bigint,
546 |     ws_ship_date_sk           bigint,
547 |     ws_item_sk                bigint,
548 |     ws_bill_customer_sk       bigint,
549 |     ws_bill_cdemo_sk          bigint,
550 |     ws_bill_hdemo_sk          bigint,
551 |     ws_bill_addr_sk           bigint,
552 |     ws_ship_customer_sk       bigint,
553 |     ws_ship_cdemo_sk          bigint,
554 |     ws_ship_hdemo_sk          bigint,
555 |     ws_ship_addr_sk           bigint,
556 |     ws_web_page_sk            bigint,
557 |     ws_web_site_sk            bigint,
558 |     ws_ship_mode_sk           bigint,
559 |     ws_warehouse_sk           bigint,
560 |     ws_promo_sk               bigint,
561 |     ws_order_number           bigint,
562 |     ws_quantity               int,
563 |     ws_wholesale_cost         double,
564 |     ws_list_price             double,
565 |     ws_sales_price            double,
566 |     ws_ext_discount_amt       double,
567 |     ws_ext_sales_price        double,
568 |     ws_ext_wholesale_cost     double,
569 |     ws_ext_list_price         double,
570 |     ws_ext_tax                double,
571 |     ws_coupon_amt             double,
572 |     ws_ext_ship_cost          double,
573 |     ws_net_paid               double,
574 |     ws_net_paid_inc_tax       double,
575 |     ws_net_paid_inc_ship      double,
576 |     ws_net_paid_inc_ship_tax  double,
577 |     ws_net_profit             double
578 | )
579 | row format delimited fields terminated by '|' 
580 | location '/data/web_sales';
581 | 
582 | drop table if exists web_site;
583 | 
584 | create external table web_site
585 | (
586 |     web_site_sk           bigint,
587 |     web_site_id           string,
588 |     web_rec_start_date    string,
589 |     web_rec_end_date      string,
590 |     web_name              string,
591 |     web_open_date_sk      bigint,
592 |     web_close_date_sk     bigint,
593 |     web_class             string,
594 |     web_manager           string,
595 |     web_mkt_id            int,
596 |     web_mkt_class         string,
597 |     web_mkt_desc          string,
598 |     web_market_manager    string,
599 |     web_company_id        int,
600 |     web_company_name      string,
601 |     web_street_number     string,
602 |     web_street_name       string,
603 |     web_street_type       string,
604 |     web_suite_number      string,
605 |     web_city              string,
606 |     web_county            string,
607 |     web_state             string,
608 |     web_zip               string,
609 |     web_country           string,
610 |     web_gmt_offset        double,
611 |     web_tax_percentage    double
612 | )
613 | row format delimited fields terminated by '|' 
614 | location '/data/web_site';
615 | 
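
The tpcds_base schema above is a text-format staging copy of the same table layout; the sibling all-tables-orc.sql and insert.sql files suggest the intended flow is to promote these staging tables into a columnar database. A minimal illustrative sketch of that step, assuming an ORC-backed tpcds database already exists (the _orc table name is a placeholder):

-- illustrative HiveQL, not the repo's insert.sql
use tpcds;
create table store_sales_orc stored as orc
as select * from tpcds_base.store_sales;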


--------------------------------------------------------------------------------
/week1/googleplaycrawler/googleplaycrawler.patch:
--------------------------------------------------------------------------------
  1 | commit 13d4f43387e58d2ba0a528c545f9d87dac9f6986
  2 | Author: Daniel Dai 
  3 | Date:   Tue Feb 21 21:22:14 2017 -0800
  4 | 
  5 |     GooglePlayCrawler
  6 | 
  7 | diff --git a/build.xml b/build.xml
  8 | index 5cff1ea..8c6d1f9 100644
  9 | --- a/build.xml
 10 | +++ b/build.xml
 11 | @@ -890,6 +890,8 @@
 12 |    
 13 |    
 14 |      
 15 | +    
 16 | +    
 17 |    
 18 |  
 19 |    
 20 | @@ -1051,6 +1053,7 @@
 21 |          
 22 |          
 23 |          
 24 | +        
 25 |          
 26 |          
 27 |          
 28 | diff --git a/conf/nutch-site.xml.template b/conf/nutch-site.xml.template
 29 | index 970c8fe..8a34a76 100644
 30 | --- a/conf/nutch-site.xml.template
 31 | +++ b/conf/nutch-site.xml.template
 32 | @@ -4,5 +4,44 @@
 33 |  
 34 |  
 35 |  
 36 | -
 37 | +
 38 | + http.agent.name
 39 | + GooglePlayCrawler
 40 | +
 41 | +
 42 | +  plugin.includes
 43 | +  protocol-httpclient|urlfilter-regex|parse-googleplay|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)
 44 | +  Regular expression naming plugin directory names to
 45 | +  include.  Any plugin not matching this expression is excluded.
 46 | +  In any case you need at least include the nutch-extensionpoints plugin. By
 47 | +  default Nutch includes crawling just HTML and plain text via HTTP,
 48 | +  and basic indexing and search plugins. In order to use HTTPS please enable
 49 | +  protocol-httpclient, but be aware of possible intermittent problems with the
 50 | +  underlying commons-httpclient library.
 51 | +  
 52 | +
 53 | +
 54 | +  db.max.outlinks.per.page
 55 | +  1000
 56 | +  The maximum number of outlinks that we'll process for a page.
 57 | +  If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks
 58 | +  will be processed for a page; otherwise, all outlinks will be processed.
 59 | +  
 60 | +
 61 | +
 62 | +  http.content.limit
 63 | +  1048576
 64 | +
 65 | +
 66 | +  parser.timeout
 67 | +  3600
 68 | +
 69 | +
 70 | +  fetcher.threads.fetch
 71 | +  20
 72 | +
 73 | +
 74 | +  mapred.reduce.tasks
 75 | +  10
 76 | +
 77 |  
 78 | diff --git a/conf/parse-plugins.xml b/conf/parse-plugins.xml
 79 | index 20c8724..56f53f8 100644
 80 | --- a/conf/parse-plugins.xml
 81 | +++ b/conf/parse-plugins.xml
 82 | @@ -68,6 +68,10 @@
 83 |  		
 84 |  	
 85 |  
 86 | +        
 87 | +                
 88 | +        
 89 | +
 90 |         
 91 |  
 92 |  	
 93 | @@ -86,6 +90,8 @@
 94 |  		
 95 |  		
 97 | +                
 99 |  		
100 |  		
102 | diff --git a/conf/regex-urlfilter.txt.template b/conf/regex-urlfilter.txt.template
103 | index 78b2b31..5b0eb81 100644
104 | --- a/conf/regex-urlfilter.txt.template
105 | +++ b/conf/regex-urlfilter.txt.template
106 | @@ -30,10 +30,10 @@
107 |  -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
108 |  
109 |  # skip URLs containing certain characters as probable queries, etc.
110 | --[?*!@=]
111 | +#-[?*!@=]
112 |  
113 |  # skip URLs with slash-delimited segment that repeats 3+ times, to break loops
114 | --.*(/[^/]+)/[^/]+\1/[^/]+\1/
115 | +#-.*(/[^/]+)/[^/]+\1/[^/]+\1/
116 |  
117 |  # accept anything else
118 |  +.
119 | diff --git a/src/java/org/apache/nutch/googleplay/GooglePlayCrawler.java b/src/java/org/apache/nutch/googleplay/GooglePlayCrawler.java
120 | new file mode 100644
121 | index 0000000..40bdf8e
122 | --- /dev/null
123 | +++ b/src/java/org/apache/nutch/googleplay/GooglePlayCrawler.java
124 | @@ -0,0 +1,122 @@
125 | +/**
126 | + * Licensed to the Apache Software Foundation (ASF) under one or more
127 | + * contributor license agreements.  See the NOTICE file distributed with
128 | + * this work for additional information regarding copyright ownership.
129 | + * The ASF licenses this file to You under the Apache License, Version 2.0
130 | + * (the "License"); you may not use this file except in compliance with
131 | + * the License.  You may obtain a copy of the License at
132 | + *
133 | + *     http://www.apache.org/licenses/LICENSE-2.0
134 | + *
135 | + * Unless required by applicable law or agreed to in writing, software
136 | + * distributed under the License is distributed on an "AS IS" BASIS,
137 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
138 | + * See the License for the specific language governing permissions and
139 | + * limitations under the License.
140 | + */
141 | +
142 | +package org.apache.nutch.googleplay;
143 | +
144 | +// Commons Logging imports
145 | +import org.slf4j.Logger;
146 | +import org.slf4j.LoggerFactory;
147 | +
148 | +import org.apache.hadoop.fs.*;
149 | +import org.apache.hadoop.conf.*;
150 | +import org.apache.hadoop.mapred.*;
151 | +import org.apache.hadoop.util.Tool;
152 | +import org.apache.hadoop.util.ToolRunner;
153 | +import org.apache.nutch.crawl.*;
154 | +import org.apache.nutch.parse.ParseSegment;
155 | +import org.apache.nutch.util.NutchConfiguration;
156 | +import org.apache.nutch.util.NutchJob;
157 | +
158 | +import org.apache.nutch.fetcher.Fetcher;
159 | +
160 | +public class GooglePlayCrawler extends Configured implements Tool {
161 | +  public static final Logger LOG = LoggerFactory.getLogger(GooglePlayCrawler.class);
162 | +
163 | +  /* Perform complete crawling and indexing (to Solr) given a set of root urls and the -solr
164 | +     parameter respectively. More information and Usage parameters can be found below. */
165 | +  public static void main(String args[]) throws Exception {
166 | +    Configuration conf = NutchConfiguration.create();
167 | +    int res = ToolRunner.run(conf, new GooglePlayCrawler(), args);
168 | +    System.exit(res);
169 | +  }
170 | +  
171 | +  @Override
172 | +  public int run(String[] args) throws Exception {
173 | +    if (args.length < 1) {
174 | +      System.out.println
175 | +      ("Usage: Crawl  [-dir d] [-depth i] [-numFetchers n]");
176 | +      return -1;
177 | +    }
178 | +    Path rootUrlDir = null;
179 | +    Path dir = new Path("nutchdb");
180 | +    Path finalOutput = null;
181 | +    int threads = getConf().getInt("fetcher.threads.fetch", 10);
182 | +    int depth = 2;
183 | +    int numFetchers = 200;
184 | +    
185 | +    for (int i = 0; i < args.length; i++) {
186 | +      if ("-dir".equals(args[i])) {
187 | +        dir = new Path(args[i+1]);
188 | +        i++;
189 | +      } else if ("-depth".equals(args[i])) {
190 | +        depth = Integer.parseInt(args[i+1]);
191 | +        i++;
192 | +      } else if ("-numFetchers".equals(args[i])) {
193 | +        numFetchers = Integer.parseInt(args[i+1]);
194 | +        i++;
195 | +      } else if ("-finalOutput".equals(args[i])) {
196 | +        finalOutput =  new Path(args[i+1]);
197 | +        i++;
198 | +      } else if (args[i] != null) {
199 | +        rootUrlDir = new Path(args[i]);
200 | +      }
201 | +    }
202 | +    
203 | +    JobConf job = new NutchJob(getConf());
204 | +
205 | +    if (LOG.isInfoEnabled()) {
206 | +      LOG.info("crawl started in: " + dir);
207 | +      LOG.info("rootUrlDir = " + rootUrlDir);
208 | +      LOG.info("depth = " + depth);      
209 | +      LOG.info("numFetchers =" + numFetchers);
210 | +    }
211 | +    
212 | +    Path crawlDb = new Path("nutchdb");
213 | +    Path segments = new Path(dir + "/segments");
214 | +
215 | +    Injector injector = new Injector(getConf());
216 | +    Generator generator = new Generator(getConf());
217 | +    Fetcher fetcher = new Fetcher(getConf());
218 | +    ParseSegment parseSegment = new ParseSegment(getConf());
219 | +    CrawlDb crawlDbTool = new CrawlDb(getConf());
220 | +      
221 | +    // initialize crawlDb
222 | +    injector.inject(crawlDb, rootUrlDir);
223 | +    int i;
224 | +    for (i = 0; i < depth; i++) {             // generate new segment
225 | +      Path[] segs = generator.generate(crawlDb, segments, numFetchers, Long.MAX_VALUE, System
226 | +          .currentTimeMillis());
227 | +      if (segs == null) {
228 | +        LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
229 | +        break;
230 | +      }
231 | +      fetcher.fetch(segs[0], threads);  // fetch it
232 | +      if (!Fetcher.isParsing(job)) {
233 | +        parseSegment.parse(segs[0]);    // parse it, if needed
234 | +      }
235 | +      crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
236 | +    }
237 | +    if (i == 0) {
238 | +      LOG.warn("No URLs to fetch - check your seed list and URL filters.");
239 | +    }
240 | +    if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
241 | +    if (finalOutput != null) {
242 | +        FsShell.main(new String[] {"-cp", dir.toString(), finalOutput.toString()});
243 | +    }
244 | +    return 0;
245 | +  }
246 | +}
247 | diff --git a/src/plugin/build.xml b/src/plugin/build.xml
248 | index 75ae2e7..5742403 100755
249 | --- a/src/plugin/build.xml
250 | +++ b/src/plugin/build.xml
251 | @@ -62,6 +62,7 @@
252 |       
253 |       
254 |       
255 | +     
256 |       
257 |       
258 |       
259 | @@ -115,6 +116,7 @@
260 |       
261 |       
262 |       
263 | +     
264 |       
265 |       
266 |       
267 | @@ -181,6 +183,7 @@
268 |      
269 |      
270 |      
271 | +    
272 |      
273 |      
274 |      
275 | diff --git a/src/plugin/parse-googleplay/build.xml b/src/plugin/parse-googleplay/build.xml
276 | new file mode 100644
277 | index 0000000..7e78ea2
278 | --- /dev/null
279 | +++ b/src/plugin/parse-googleplay/build.xml
280 | @@ -0,0 +1,28 @@
281 | +
282 | +
298 | +
299 | +
300 | +  
301 | +
302 | +  
303 | +  
304 | +    
305 | +    
306 | +  
307 | +
308 | +
309 | diff --git a/src/plugin/parse-googleplay/ivy.xml b/src/plugin/parse-googleplay/ivy.xml
310 | new file mode 100644
311 | index 0000000..1a86d68
312 | --- /dev/null
313 | +++ b/src/plugin/parse-googleplay/ivy.xml
314 | @@ -0,0 +1,41 @@
315 | +
316 | +
317 | +
333 | +
334 | +
335 | +  
336 | +    
337 | +    
338 | +    
339 | +        Apache Nutch
340 | +    
341 | +  
342 | +
343 | +  
344 | +    
345 | +  
346 | +
347 | +  
348 | +    
349 | +    
350 | +  
351 | +
352 | +  
353 | +  
354 | +  
355 | +
356 | diff --git a/src/plugin/parse-googleplay/plugin.xml b/src/plugin/parse-googleplay/plugin.xml
357 | new file mode 100644
358 | index 0000000..4b3d354
359 | --- /dev/null
360 | +++ b/src/plugin/parse-googleplay/plugin.xml
361 | @@ -0,0 +1,47 @@
362 | +
363 | +
379 | +
384 | +
385 | +
386 | +   
387 | +      
388 | +         
389 | +      
390 | +   
391 | +
392 | +   
393 | +      
394 | +   
395 | +
396 | +   
399 | +
400 | +      
402 | +        
403 | +        
404 | +      
405 | +      
406 | +   
407 | +
408 | +
409 | diff --git a/src/plugin/parse-googleplay/src/java/com/example/googleplay/GoogleplayParser.java b/src/plugin/parse-googleplay/src/java/com/example/googleplay/GoogleplayParser.java
410 | new file mode 100644
411 | index 0000000..f26aba1
412 | --- /dev/null
413 | +++ b/src/plugin/parse-googleplay/src/java/com/example/googleplay/GoogleplayParser.java
414 | @@ -0,0 +1,190 @@
415 | +package com.example.googleplay;
416 | +
417 | +import java.net.MalformedURLException;
418 | +import java.util.ArrayList;
419 | +import java.util.HashSet;
420 | +import java.util.List;
421 | +import java.util.Set;
422 | +import java.util.regex.Matcher;
423 | +import java.util.regex.Pattern;
424 | +
425 | +import org.apache.hadoop.conf.Configuration;
426 | +import org.apache.nutch.metadata.Metadata;
427 | +import org.apache.nutch.parse.Outlink;
428 | +import org.apache.nutch.parse.ParseData;
429 | +import org.apache.nutch.parse.ParseImpl;
430 | +import org.apache.nutch.parse.ParseResult;
431 | +import org.apache.nutch.parse.ParseStatus;
432 | +import org.apache.nutch.parse.Parser;
433 | +import org.apache.nutch.protocol.Content;
434 | +import org.slf4j.Logger;
435 | +import org.slf4j.LoggerFactory;
436 | +
437 | +public class GoogleplayParser implements Parser {
438 | +    public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.googleplay");
439 | +    static Pattern appUrlPattern = Pattern.compile("https://play.google.com/store/apps/details\\?id=[a-zA-Z0-9\\._]+");
440 | +    static Pattern titlePattern = Pattern.compile("(.*?)");
441 | +    static Pattern appNamePattern= Pattern.compile("(.*?)(.*?)");
444 | +    static Pattern updateTimePattern = Pattern.compile("- (.*?)");
445 | +    static Pattern categoryPattern = Pattern.compile("(.*?)");
446 | +    static Pattern pricePattern = Pattern.compile("");
447 | +    static Pattern reviewPattern = Pattern.compile("(.*?)");
448 | +    static Pattern installPattern = Pattern.compile("(.*?)");
449 | +    static Pattern versionPattern = Pattern.compile("(.*?)");
450 | +    static Pattern ratingPattern = Pattern.compile("(.*?)");
451 | +    static Pattern developerSitePattern = Pattern.compile("(.*?)");
454 | +
455 | +    private Configuration conf;
456 | +
457 | +    @Override
458 | +    public Configuration getConf() {
459 | +        return conf;
460 | +    }
461 | +
462 | +    @Override
463 | +    public void setConf(Configuration conf) {
464 | +        this.conf = conf;
465 | +    }
466 | +
467 | +    @Override
468 | +    public ParseResult getParse(Content content) {
469 | +        String thisId = content.getBaseUrl().substring(content.getBaseUrl().indexOf("=")+1);
470 | +        byte[] contentInOctets = content.getContent();
471 | +        String htmlText = new String(contentInOctets);
472 | +
473 | +        Metadata meta = content.getMetadata();
474 | +
475 | +        String title = null;
476 | +        String appName = null;
477 | +        Set ids = new HashSet();
478 | +        String publisher = null;
479 | +        String updateTime = null;
480 | +        String category = null;
481 | +        String price = null;
482 | +        String reviewScore = null;
483 | +        String reviewCount = null;
484 | +        String install = null;
485 | +        String version = null;
486 | +        String rating = null;
487 | +        String developerSite = null;
488 | +        String developerEmail = null;
489 | +        String description = null;
490 | +
491 | +        Matcher m = titlePattern.matcher(htmlText);
492 | +        if (m.find()) {
493 | +            title = m.group(1);
494 | +        }
495 | +
496 | +        m = linkPattern.matcher(htmlText);
497 | +        while (m.find()) {
498 | +            if (!m.group(1).equals(thisId)) {
499 | +                ids.add(m.group(1));
500 | +            }
501 | +        }
502 | +        List outlinks = new ArrayList();
503 | +        for (String id : ids) {
504 | +            try {
505 | +                outlinks.add(new Outlink("https://play.google.com/store/apps/details?id=" + id, ""));
506 | +            } catch (MalformedURLException mue) {
507 | +                LOG.warn("Invalid url: '" + id + "', skipping.");
508 | +            }
509 | +        }
510 | +
511 | +        m = appUrlPattern.matcher(content.getBaseUrl());
512 | +        if (m.matches()) { // App page
513 | +            m = appNamePattern.matcher(htmlText);
514 | +            if (m.find()) {
515 | +                appName = m.group(1);
516 | +            }
517 | +            meta.set("name", appName);
518 | +
519 | +            m = publisherPattern.matcher(htmlText);
520 | +            if (m.find()) {
521 | +                publisher = m.group(1);
522 | +            }
523 | +            meta.set("publisher", publisher!=null?publisher:"");
524 | +
525 | +            m = updateTimePattern.matcher(htmlText);
526 | +            if (m.find()) {
527 | +                updateTime = m.group(1);
528 | +            }
529 | +            meta.set("updateTime", updateTime!=null?updateTime:"");
530 | +
531 | +            m = categoryPattern.matcher(htmlText);
532 | +            if (m.find()) {
533 | +                category = m.group(1);
534 | +                category = category.replace("&", "and");
535 | +            }
536 | +            meta.set("category", category!=null?category:"");
537 | +
538 | +            m = pricePattern.matcher(htmlText);
539 | +            if (m.find()) {
540 | +                price = m.group(1);
541 | +            }
542 | +            meta.set("price", price!=null?price:"");
543 | +
544 | +            m = reviewPattern.matcher(htmlText);
545 | +            if (m.find()) {
546 | +                reviewScore = m.group(2);
547 | +                reviewCount = m.group(4);
548 | +            }
549 | +            meta.set("reviewScore", reviewScore!=null?reviewScore:"");
550 | +            meta.set("reviewCount", reviewCount!=null?reviewCount:"");
551 | +
552 | +            m = installPattern.matcher(htmlText);
553 | +            if (m.find()) {
554 | +                install = m.group(1)!=null?m.group(1):"";
555 | +                install = install.trim();
556 | +            }
557 | +            meta.set("install", install);
558 | +
559 | +            m = versionPattern.matcher(htmlText);
560 | +            if (m.find()) {
561 | +                version = m.group(1)!=null?m.group(1):"";
562 | +                version = version.trim();
563 | +            }
564 | +            meta.set("version", version);
565 | +
566 | +            m = ratingPattern.matcher(htmlText);
567 | +            if (m.find()) {
568 | +                rating = m.group(1)!=null?m.group(1):"";
569 | +                rating = rating.trim();
570 | +            }
571 | +            meta.set("rating", rating);
572 | +
573 | +            m = developerSitePattern.matcher(htmlText);
574 | +            if (m.find()) {
575 | +                developerSite = m.group(1)!=null?m.group(1):"";
576 | +                developerSite = developerSite.trim();
577 | +            }
578 | +            meta.set("developerSite", developerSite);
579 | +
580 | +            m = developerEmailPattern.matcher(htmlText);
581 | +            if (m.find()) {
582 | +                developerEmail = m.group(1)!=null?m.group(1):"";
583 | +                developerEmail = developerEmail.trim();
584 | +            }
585 | +            meta.set("developerEmail", developerEmail);
586 | +
587 | +            m = descriptionPattern.matcher(htmlText);
588 | +            if (m.find()) {
589 | +                description = m.group(1);
590 | +            }
591 | +            meta.set("description", description!=null?description:"");
592 | +        }
593 | +
594 | +        ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
595 | +            outlinks.toArray(new Outlink[0]), meta);
596 | +        ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
597 | +            new ParseImpl("", parseData));
598 | +        try {
599 | +            Thread.sleep(200);
600 | +        } catch (InterruptedException e) {
601 | +        }
602 | +        return parseResult;
603 | +    }
604 | +}
605 | 
--------------------------------------------------------------------------------
 21 | -alias1 = MAPREDUCE 'mr.jar' STORE alias2 INTO
 22 | +alias1 = NATIVE 'native.jar' STORE alias2 INTO
 23 |  'inputLocation' USING storeFunc LOAD 'outputLocation' USING loadFunc AS schema [`params, ... `];
 24 | 
 30 | -mr.jar
 31 | +native.jar
 32 | 
 34 | -The MapReduce jar file (enclosed in single quotes).
 35 | -You can specify any MapReduce jar file that can be run through the hadoop jar mymr.jar params command.
 36 | +The jar file containing MapReduce/Tez job (enclosed in single quotes).
 37 | +You can specify any MapReduce/Tez jar file that can be run through the yarn jar native.jar params command.
 38 |  The values for inputLocation and outputLocation can be passed in the params.
 39 | 
 44 |  See LOAD
 45 | -After running mr.jar's MapReduce job, load back the data from outputLocation into alias1 using loadFunc as schema.
 46 | +After running native.jar's MapReduce/Tez job, load back the data from outputLocation into alias1 using loadFunc as schema.
 47 | 
 54 | -Extra parameters required for the mapreduce job (enclosed in back tics).
 55 | +Extra parameters required for the mapreduce/Tez job (enclosed in back tics).
 56 | 
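
Taken together, the edits above document Pig's renamed NATIVE operator (formerly MAPREDUCE): store an alias into the job's input location, run the prebuilt MapReduce/Tez jar, then load the job's output back as a new alias. A minimal sketch of the documented syntax; the jar, main class, and paths are illustrative placeholders:

-- illustrative Pig Latin, not from the patch
A = load 'raw_lines' as (line:chararray);
B = native 'wordcount.jar'
        store A into 'nativeInput' using PigStorage()
        load 'nativeOutput' using PigStorage() as (word:chararray, cnt:long)
        `com.example.WordCount nativeInput nativeOutput`;
dump B;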