├── homework
├── week1
│   ├── dict.txt
│   ├── compress.patch
│   ├── sequencefile.patch
│   ├── partitioner.patch
│   └── dict.patch
├── week2
│   ├── bootstrap
│   │   ├── test.pig
│   │   ├── src
│   │   │   └── java
│   │   │   │   └── com
│   │   │   │   └── example
│   │   │   │   └── pig
│   │   │   │   └── BootstrapSampleLoader.java
│   │   └── pom.xml
│   └── extract_time.pig
├── profile_notes.txt
└── week3
│   └── sqoop.patch
├── week3
├── oozie
│   ├── cleanup.sql
│   ├── aggr.pig
│   ├── job.properties
│   ├── job-all.properties
│   ├── hive-config.xml
│   ├── workflow.xml
│   └── workflow-all.xml
└── hive
│   ├── evalfunc
│   ├── src
│   │   └── java
│   │   │   └── com
│   │   │   └── example
│   │   │   └── hive
│   │   │   └── evalfunc
│   │   │   └── Hello.java
│   └── pom.xml
│   └── tpcds
│   ├── tpcds.patch
│   ├── upload.sh
│   ├── insert.sql
│   ├── all-tables-orc.sql
│   ├── all-tables.sql
│   └── all-tables-base.sql
├── week1
├── conf
│   ├── mapred-site.xml
│   ├── core-site.xml
│   ├── yarn-site.xml
│   └── hdfs-site.xml
├── docker
│   └── Dockerfile
├── wordcount
│   ├── patches
│   │   ├── combiner.patch
│   │   ├── nummapreduce.patch
│   │   ├── config.patches
│   │   ├── counters.patch
│   │   └── distributedcache.patch
│   ├── pom.xml
│   └── src
│   │   └── java
│   │   └── com
│   │   └── example
│   │   └── WordCount.java
└── googleplaycrawler
│   ├── fixskew.patch
│   └── googleplaycrawler.patch
├── week2
├── loadfunc
│   ├── loadgoogle.pig
│   ├── pom.xml
│   └── src
│   │   └── java
│   │   └── com
│   │   └── example
│   │   └── NutchParsedDataLoader.java
├── python
│   ├── demo.py
│   └── kmeans.py
├── pigserver
│   ├── log4j.properties
│   ├── src
│   │   └── java
│   │   │   └── com
│   │   │   └── example
│   │   │   └── pig
│   │   │   └── TestPigServer.java
│   └── pom.xml
└── evalfunc
│   ├── pom.xml
│   ├── src
│   └── java
│   │   └── com
│   │   └── example
│   │   └── pig
│   │   └── GetCountry.java
│   └── patches
│   └── country_city.patch
├── druid
├── topn.json
├── sessionize.pig
└── cloudacl-index.json
├── week4
├── fixdoccompile.patch
├── PIG-3399-2.patch
├── doc.patch
├── jobname.patch
└── set.patch
└── capstone
└── track1
└── data_description.txt
/homework/week1/dict.txt:
--------------------------------------------------------------------------------
1 | 我 I
2 | 爱 love
3 |
--------------------------------------------------------------------------------
/week3/oozie/cleanup.sql:
--------------------------------------------------------------------------------
1 | drop table if exists student;
2 |
--------------------------------------------------------------------------------
/homework/week2/bootstrap/test.pig:
--------------------------------------------------------------------------------
1 | register target/bootstrap-0.0.1-SNAPSHOT.jar
2 |
3 | a = load 'studenttab10k' using com.example.pig.BootstrapSampleLoader();
4 | dump a;
5 |
--------------------------------------------------------------------------------
/week1/conf/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>mapreduce.framework.name</name>
4 |     <value>yarn</value>
5 |   </property>
6 | </configuration>
7 |
--------------------------------------------------------------------------------
/week3/oozie/aggr.pig:
--------------------------------------------------------------------------------
1 | A = load 'student' using org.apache.hive.hcatalog.pig.HCatLoader();
2 | B = group A by name;
3 | C = foreach B generate group as name, AVG(A.gpa) as gpa;
4 | store C into '$OUTPUT' USING PigStorage();
5 |
--------------------------------------------------------------------------------
/week3/oozie/job.properties:
--------------------------------------------------------------------------------
1 | nameNode=hdfs://localhost:9000
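# jobTracker below is the YARN ResourceManager address (port 8032 on Hadoop 2); Oozie submits its launcher jobs there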
2 | jobTracker=localhost:8032
3 | queueName=default
4 |
5 | oozie.use.system.libpath=true
6 | oozie.wf.application.path=${nameNode}/user/${user.name}/oozie/apps/workflow.xml
7 |
--------------------------------------------------------------------------------
/week3/oozie/job-all.properties:
--------------------------------------------------------------------------------
1 | nameNode=hdfs://localhost:9000
2 | jobTracker=localhost:8032
3 | queueName=default
4 |
5 | oozie.use.system.libpath=true
6 | oozie.wf.application.path=${nameNode}/user/${user.name}/oozie/apps/workflow-all.xml
7 |
--------------------------------------------------------------------------------
/week3/oozie/hive-config.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <property>
3 |     <name>javax.jdo.option.ConnectionURL</name>
4 |     <value>jdbc:derby:;databaseName=/home/hadoop/apache-hive-1.2.1-bin/metastore_db;create=true</value>
5 |     <description>JDBC connect string for a JDBC metastore</description>
6 |   </property>
7 | </configuration>
8 |
--------------------------------------------------------------------------------
/week3/hive/evalfunc/src/java/com/example/hive/evalfunc/Hello.java:
--------------------------------------------------------------------------------
1 | package com.example.hive.evalfunc;
2 |
3 | import org.apache.hadoop.hive.ql.exec.UDF;
4 | import org.apache.hadoop.io.Text;
5 |
6 | public class Hello extends UDF {
7 |   public Text evaluate(Text input) {
8 |     return new Text("Hello " + input.toString());
9 |   }
10 | }
11 |
--------------------------------------------------------------------------------
/homework/week2/extract_time.pig:
--------------------------------------------------------------------------------
1 | a = LOAD 'access_logs' AS (line:chararray);
2 | b = FOREACH a GENERATE flatten(REGEX_EXTRACT_ALL(line, '(.*?) .*?\\[(.*?)\\].*')) as (ip:chararray, dt:chararray);
3 | c = FOREACH b GENERATE ip, ToDate(dt, 'yyyy-MM-dd HH:mm:ss.SSSSSS') as dt;
4 | d = FOREACH c GENERATE ip, GetYear(dt), GetMonth(dt), GetDay(dt), GetHour(dt), GetMinute(dt), GetSecond(dt);
5 | dump d;
6 |
--------------------------------------------------------------------------------
/week2/loadfunc/loadgoogle.pig:
--------------------------------------------------------------------------------
1 | register target/nutchdbloader-0.0.1-SNAPSHOT.jar
2 | register /home/hadoop/hadoop-2.7.3/share/hadoop/tools/lib/hadoop-aws-2.7.3.jar
3 | register nutch-1.12.jar
4 |
5 | rmf output
6 | loaded = load 's3n://daijytest/nutchdb/segments/*/parse_data/part-*/data' using com.example.NutchParsedDataLoader();
7 | filtered = filter loaded by $0 is not null;
8 | store filtered into 'output';
9 |
--------------------------------------------------------------------------------
/druid/topn.json:
--------------------------------------------------------------------------------
1 | {
2 |   "queryType": "topN",
3 |   "dataSource": "cloudacl_accesslog",
4 |   "dimension": "country_code",
5 |   "threshold": 5,
6 |   "metric": "count",
7 |   "granularity": "all",
8 |   "aggregations": [
9 |     {
10 |       "type": "longSum",
11 |       "name": "count",
12 |       "fieldName": "count"
13 |     }
14 |   ],
15 |   "intervals": [
16 |     "2017-03-05T00:00:00.000/2017-03-12T00:00:00.000"
17 |   ]
18 | }
19 |
--------------------------------------------------------------------------------
/homework/profile_notes.txt:
--------------------------------------------------------------------------------
1 | YourKit CPU profile:
2 | export PIG_OPTS="-agentpath:/home/hadoop/yjp-2016.02/bin/linux-x86-64/libyjpagent.so=onexit=snapshot,sampling,dir=/tmp"
3 |
4 | Java memory dump:
5 | jmap -dump:file=<dump file> <pid>
6 |
7 | Java memory dump upon OOM:
8 | export PIG_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp"
9 |
10 | MAT reading:
11 | http://eclipsesource.com/blogs/2013/01/21/10-tips-for-using-the-eclipse-memory-analyzer/
12 |
--------------------------------------------------------------------------------
/week3/hive/tpcds/tpcds.patch:
--------------------------------------------------------------------------------
1 | diff --git a/query_templates/netezza.tpl b/query_templates/netezza.tpl
2 | index 75488d2..0ff3ce1 100755
3 | --- a/query_templates/netezza.tpl
4 | +++ b/query_templates/netezza.tpl
5 | @@ -35,3 +35,5 @@
6 |  define __LIMITA = "";
7 |  define __LIMITB = "";
8 |  define __LIMITC = "limit %d";
9 | +define _BEGIN = "-- start query " + [_QUERY] + " in stream " + [_STREAM] + " using template " + [_TEMPLATE];
10 | +define _END = "-- end query " + [_QUERY] + " in stream " + [_STREAM] + " using template " + [_TEMPLATE];
11 |
--------------------------------------------------------------------------------
/week1/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get update && apt-get install -y openssh-server
4 | RUN apt-get install -y vim
5 | RUN mkdir /var/run/sshd
6 | RUN echo 'root:hadoop' | chpasswd
7 | RUN sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
8 |
9 | # SSH login fix. Otherwise user is kicked off after login
10 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
11 |
12 | ENV NOTVISIBLE "in users profile"
13 | RUN echo "export VISIBLE=now" >> /etc/profile
14 |
15 | EXPOSE 22
16 | EXPOSE 50070
17 | EXPOSE 8088
18 | EXPOSE 8000
19 | CMD ["/usr/sbin/sshd", "-D"]
20 |
--------------------------------------------------------------------------------
/week1/wordcount/patches/combiner.patch:
--------------------------------------------------------------------------------
1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java
2 | index 954aaab..39ffb71 100644
3 | --- a/week1/wordcount/src/java/com/example/WordCount.java
4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java
5 | @@ -75,6 +75,7 @@ public class WordCount {
6 |      Job job = Job.getInstance(conf, "word count");
7 |      job.setJarByClass(WordCount.class);
8 |      job.setMapperClass(TokenizerMapper.class);
9 | +    job.setCombinerClass(IntSumReducer.class);
10 |     job.setReducerClass(IntSumReducer.class);
11 |     job.setOutputKeyClass(Text.class);
12 |     job.setOutputValueClass(IntWritable.class);
13 |
--------------------------------------------------------------------------------
/homework/week3/sqoop.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/java/org/apache/sqoop/mapreduce/db/TextSplitter.java b/src/java/org/apache/sqoop/mapreduce/db/TextSplitter.java
2 | index d3085cd..54dfac8 100644
3 | --- a/src/java/org/apache/sqoop/mapreduce/db/TextSplitter.java
4 | +++ b/src/java/org/apache/sqoop/mapreduce/db/TextSplitter.java
5 | @@ -156,7 +156,8 @@
6 |     List<String> splitStrings = new ArrayList<String>();
7 |
8 |     // Convert the BigDecimal splitPoints into their string representations.
9 | -   for (BigDecimal bd : splitPoints) {
10 | +  for (int i=1;i [...] ");
14 |
--------------------------------------------------------------------------------
/homework/week1/compress.patch:
--------------------------------------------------------------------------------
1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java
2 | index 954aaab..808e61a 100644
3 | --- a/week1/wordcount/src/java/com/example/WordCount.java
4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java
5 | @@ -67,6 +67,8 @@ public class WordCount {
6 |
7 |    public static void main(String[] args) throws Exception {
8 |      Configuration conf = new Configuration();
9 | +    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true);
10 | +   conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
11 |    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
12 |    if (otherArgs.length < 2) {
13 |      System.err.println("Usage: wordcount <in> [<in>...] <out>");
14 |
--------------------------------------------------------------------------------
/homework/week2/bootstrap/src/java/com/example/pig/BootstrapSampleLoader.java:
--------------------------------------------------------------------------------
1 | package com.example.pig;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.commons.math3.distribution.PoissonDistribution;
6 | import org.apache.pig.builtin.PigStorage;
7 | import org.apache.pig.data.Tuple;
8 |
9 | public class BootstrapSampleLoader extends PigStorage
10 | {
11 |   PoissonDistribution pd = new PoissonDistribution(1);
12 |   Tuple originalTuple;
13 |   int remaining = 0;
14 |   @Override
15 |   public Tuple getNext() throws IOException {
16 |     if (remaining > 0) {
17 |       remaining --;
18 |       return originalTuple;
19 |     }
20 |
21 |     do {
22 |       remaining = pd.sample();
23 |       originalTuple = super.getNext();
24 |     } while (originalTuple!=null && remaining == 0);
25 |     remaining--;
26 |     return originalTuple;
27 |   }
28 | }
29 |
--------------------------------------------------------------------------------
/druid/sessionize.pig:
--------------------------------------------------------------------------------
1 | register datafu-pig-incubating-1.3.0-SNAPSHOT.jar
2 | DEFINE Sessionize datafu.pig.sessions.Sessionize('30m');
3 |
4 | rmf ooo
5 |
6 | a = LOAD 'sample.txt' AS (ip:chararray, dt:chararray, category:chararray);
7 | b = FILTER a BY ip IS NOT NULL;
8 | c = FOREACH b GENERATE ToDate(dt, 'dd/MMM/yyyy:HH:mm:ss') as dt, ip, category;
9 | d = FOREACH c GENERATE ToMilliSeconds(dt) as ts, dt, ip, category;
10 | e = GROUP d BY (ip, category);
11 | f = FOREACH e {
12 |   ordered = ORDER d BY ts;
13 |   GENERATE FLATTEN(Sessionize(ordered)) AS (ts,dt,ip,category,sessionId);
14 | }
15 | g = group f by (sessionId, ip, category);
16 | h = foreach g generate group.ip, group.category, MIN(f.dt) as start_time, COUNT(f) as session_count, ((MAX(f.ts) - MIN(f.ts))/ 1000.0/ 60.0) as session_length;
17 | i = foreach h generate ip, category, ToString(start_time, 'dd/MMM/yyyy:HH:mm:ss') as start_time, session_count, session_length;
18 | store i into 'ooo';
19 |
--------------------------------------------------------------------------------
/week1/conf/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <!--
4 |   Licensed under the Apache License, Version 2.0 (the "License");
5 |   you may not use this file except in compliance with the License.
6 |   You may obtain a copy of the License at
7 |
8 |     http://www.apache.org/licenses/LICENSE-2.0
9 |
10 |   Unless required by applicable law or agreed to in writing, software
11 |   distributed under the License is distributed on an "AS IS" BASIS,
12 |   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |   See the License for the specific language governing permissions and
14 |   limitations under the License. See accompanying LICENSE file.
15 | -->
16 |
17 | <!-- Put site-specific property overrides in this file. -->
18 |
19 | <configuration>
20 |   <property>
21 |     <name>fs.defaultFS</name>
22 |     <value>hdfs://localhost:9000</value>
23 |   </property>
24 | </configuration>
25 |
--------------------------------------------------------------------------------
/week2/python/demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | # explicitly import Pig class
4 | from org.apache.pig.scripting import Pig
5 |
6 | # COMPILE: compile method returns a Pig object that represents the pipeline
7 | P = Pig.compile("""a = load '$input' using PigStorage() as (name:chararray, age:int, gpa:double);
8 | a1 = filter a by age > 18;
9 | a2 = foreach a1 generate name, ROUND(gpa) as gpa;
10 | b = load 'votertab10k' using PigStorage() as (name:chararray, age:int, registration:chararray, contributions:double);
11 | c = join a2 by name, b by name;
12 | d = group c by registration;
13 | e = foreach d generate group, AVG(c.gpa) as gpa;
14 | f = order e by gpa desc;
15 | store f into '$output';
16 | """)
17 |
18 | results = P.bind({'input':'studenttab10k', 'output':'output'}).runSingle()
19 |
20 | if not results.isSuccessful():
21 |     raise Exception("Pig job failed")
22 | iter = results.result("f").iterator()
23 | while iter.hasNext():
24 |     tuple = iter.next()
25 |     print tuple
26 |
--------------------------------------------------------------------------------
/week1/conf/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <!--
3 |   Licensed under the Apache License, Version 2.0 (the "License");
4 |   you may not use this file except in compliance with the License.
5 |   You may obtain a copy of the License at
6 |
7 |     http://www.apache.org/licenses/LICENSE-2.0
8 |
9 |   Unless required by applicable law or agreed to in writing, software
10 |   distributed under the License is distributed on an "AS IS" BASIS,
11 |   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |   See the License for the specific language governing permissions and
13 |   limitations under the License. See accompanying LICENSE file.
14 | -->
15 | <configuration>
16 |
17 | <!-- Site specific YARN configuration properties -->
18 |   <property>
19 |     <name>yarn.nodemanager.aux-services</name>
20 |     <value>mapreduce_shuffle</value>
21 |   </property>
22 |   <property>
23 |     <name>yarn.log-aggregation-enable</name>
24 |     <value>true</value>
25 |   </property>
26 | </configuration>
27 |
--------------------------------------------------------------------------------
/homework/week2/bootstrap/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |
5 |   <groupId>com.example.pig</groupId>
6 |   <artifactId>bootstrap</artifactId>
7 |   <version>0.0.1-SNAPSHOT</version>
8 |   <name>bootstrap</name>
9 |   <packaging>jar</packaging>
10 |
11 |   <build>
12 |     <sourceDirectory>${basedir}/src/java</sourceDirectory>
13 |   </build>
14 |
15 |   <dependencies>
16 |
17 |     <dependency>
18 |       <groupId>org.apache.pig</groupId>
19 |       <artifactId>pig</artifactId>
20 |       <version>0.16.0</version>
21 |       <classifier>h2</classifier>
22 |     </dependency>
23 |
24 |     <dependency>
25 |       <groupId>org.apache.hadoop</groupId>
26 |       <artifactId>hadoop-common</artifactId>
27 |       <version>2.7.3</version>
28 |     </dependency>
29 |
30 |   </dependencies>
31 |
32 | </project>
33 |
--------------------------------------------------------------------------------
/week4/fixdoccompile.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/docs/src/documentation/content/xdocs/basic.xml b/src/docs/src/documentation/content/xdocs/basic.xml
2 | index a631607..0264089 100644
3 | --- a/src/docs/src/documentation/content/xdocs/basic.xml
4 | +++ b/src/docs/src/documentation/content/xdocs/basic.xml
5 | @@ -5424,7 +5424,7 @@
6 |

In cases where the schema is stored as part of the StoreFunc like PigStorage, JsonStorage, AvroStorage or OrcStorage, 7 | users generally have to use an extra FOREACH before STORE to rename the field names and remove the disambiguate 8 | operator from the names. To automatically remove the disambiguate operator from the schema for the STORE operation, 9 | - the pig.store.schema.disambiguate Pig property can be set to "false". It is the responsibility of the user 10 | + the pig.store.schema.disambiguate Pig property can be set to "false". It is the responsibility of the user 11 | to make sure that there is no conflict in the field names when using this setting. 12 |

13 | 14 | -------------------------------------------------------------------------------- /homework/week1/sequencefile.patch: -------------------------------------------------------------------------------- 1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java 2 | index 954aaab..ce90bce 100644 3 | --- a/week1/wordcount/src/java/com/example/WordCount.java 4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java 5 | @@ -29,6 +29,7 @@ import org.apache.hadoop.mapreduce.Mapper; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | +import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 10 | import org.apache.hadoop.util.GenericOptionsParser; 11 | 12 | public class WordCount { 13 | @@ -78,6 +79,7 @@ public class WordCount { 14 | job.setReducerClass(IntSumReducer.class); 15 | job.setOutputKeyClass(Text.class); 16 | job.setOutputValueClass(IntWritable.class); 17 | + job.setOutputFormatClass(SequenceFileOutputFormat.class); 18 | for (int i = 0; i < otherArgs.length - 1; ++i) { 19 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 20 | } 21 | -------------------------------------------------------------------------------- /week3/hive/evalfunc/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.example.hive 6 | evalfunc 7 | 0.0.1-SNAPSHOT 8 | citylookup 9 | jar 10 | 11 | 12 | 2.7.3 13 | 14 | 15 | 16 | ${basedir}/src/java 17 | 18 | 19 | 20 | 21 | 22 | org.apache.hive 23 | hive-exec 24 | 1.2.1 25 | 26 | 27 | 28 | org.apache.hadoop 29 | hadoop-common 30 | 2.7.3 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /week3/oozie/workflow.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ${jobTracker} 6 | ${nameNode} 7 | 8 | 9 | 10 | 11 | 12 | mapred.job.queue.name 13 | ${queueName} 14 | 15 | 16 | import --connect jdbc:mysql://localhost/cs502 --username hadoop --password hadoop --table student --hive-import --hive-home /home/hadoop/apache-hive-1.2.1-bin --create-hive-table --hive-table student --m 2 --split-by age 17 | 18 | 19 | 20 | 21 | 22 | Sqoop failed, error message[${wf:errorMessage(wf:lastErrorNode())}] 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /week2/pigserver/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | # ***** Set root logger level to DEBUG and its only appender to A. 19 | log4j.logger.org.apache.pig=info, A 20 | 21 | # ***** A is set to be a ConsoleAppender. 22 | log4j.appender.A=org.apache.log4j.ConsoleAppender 23 | # ***** A uses PatternLayout. 24 | log4j.appender.A.layout=org.apache.log4j.PatternLayout 25 | log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 26 | -------------------------------------------------------------------------------- /week1/conf/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | dfs.replication 22 | 1 23 | 24 | 25 | dfs.name.dir 26 | /home/hadoop/hadoop-2.7.3/data/name 27 | 28 | 29 | dfs.data.dir 30 | /home/hadoop/hadoop-2.7.3/data/data 31 | 32 | 33 | -------------------------------------------------------------------------------- /week2/evalfunc/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.example 6 | citylookup 7 | 0.0.1-SNAPSHOT 8 | citylookup 9 | jar 10 | 11 | 12 | ${basedir}/src/java 13 | 14 | 15 | 16 | 17 | 18 | org.apache.pig 19 | pig 20 | 0.16.0 21 | h2 22 | 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-common 27 | 2.7.3 28 | 29 | 30 | 31 | com.maxmind.geoip 32 | geoip-api 33 | 1.3.1 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /week1/wordcount/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.example 6 | wordcount 7 | 0.0.1-SNAPSHOT 8 | wordcount 9 | jar 10 | 11 | 12 | ${basedir}/src/java 13 | 14 | 15 | 16 | 17 | 18 | 19 | org.apache.hadoop 20 | hadoop-hdfs 21 | 2.7.3 22 | 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-common 27 | 2.7.3 28 | 29 | 30 | 31 | org.apache.hadoop 32 | hadoop-mapreduce-client-core 33 | 2.7.3 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /week2/pigserver/src/java/com/example/pig/TestPigServer.java: -------------------------------------------------------------------------------- 1 | package com.example.pig; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.pig.PigServer; 7 | import org.apache.pig.data.Tuple; 8 | 9 | public class TestPigServer { 10 | static public void main(String[] args) throws IOException { 11 | 12 | PigServer pigServer = new PigServer("local"); 13 | 14 | pigServer.registerQuery("a = load 'studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);"); 15 | 16 | pigServer.registerQuery("a1 = filter a by age > 18;"); 17 | 18 | pigServer.registerQuery("a2 = foreach a1 generate name, ROUND(gpa) as gpa;"); 19 | 20 | pigServer.registerQuery("b = load 'votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);"); 21 | 22 | pigServer.registerQuery("c = join a2 by name, b by name;"); 23 | 24 | pigServer.registerQuery("d = group c by registration;"); 25 | 26 | pigServer.registerQuery("e = foreach d generate group, AVG(c.gpa) as gpa;"); 27 | 28 | pigServer.registerQuery("f = order e by gpa desc;"); 29 | 30 | Iterator iter = pigServer.openIterator("f"); 31 | 32 | while (iter.hasNext()) { 33 | System.out.println(iter.next()); 34 | } 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /week1/wordcount/patches/config.patches: -------------------------------------------------------------------------------- 
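A minimal, self-contained sketch of the technique this patch applies (class and property names here are illustrative, not part of the repo): the driver publishes a value through the job Configuration, and each mapper reads it back once in setup() instead of hard-coding a constant.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MultiplierMapper extends Mapper<Object, Text, Text, IntWritable> {
    private int multiplier;

    @Override
    protected void setup(Context context) {
        // Read the driver-supplied value once per task attempt; -1 marks "unset".
        multiplier = context.getConfiguration().getInt("multiplier", -1);
    }

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Every emitted count carries the configured weight.
        context.write(value, new IntWritable(multiplier));
    }
}
// Driver side, before Job.getInstance(conf, "word count"):
//     conf.setInt("multiplier", 2);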
1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java 2 | index 6d47026..6af3784 100644 3 | --- a/week1/wordcount/src/java/com/example/WordCount.java 4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java 5 | @@ -36,15 +36,18 @@ public class WordCount { 6 | public static class TokenizerMapper 7 | extends Mapper{ 8 | 9 | - private final static IntWritable one = new IntWritable(1); 10 | private Text word = new Text(); 11 | + int multiplier = -1; 12 | 13 | public void map(Object key, Text value, Context context 14 | ) throws IOException, InterruptedException { 15 | + if (multiplier == -1) { 16 | + multiplier = context.getConfiguration().getInt("multiplier", -1); 17 | + } 18 | StringTokenizer itr = new StringTokenizer(value.toString()); 19 | while (itr.hasMoreTokens()) { 20 | word.set(itr.nextToken()); 21 | - context.write(word, one); 22 | + context.write(word, new IntWritable(multiplier)); 23 | } 24 | } 25 | } 26 | @@ -72,6 +75,7 @@ public class WordCount { 27 | System.err.println("Usage: wordcount [...] "); 28 | System.exit(2); 29 | } 30 | + conf.setInt("multiplier", 2); 31 | Job job = Job.getInstance(conf, "word count"); 32 | job.setJarByClass(WordCount.class); 33 | job.setMapperClass(TokenizerMapper.class); 34 | -------------------------------------------------------------------------------- /week2/loadfunc/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.example 6 | nutchdbloader 7 | 0.0.1-SNAPSHOT 8 | nutchdbloader 9 | jar 10 | 11 | 12 | ${basedir}/src/java 13 | 14 | 15 | 16 | 17 | 18 | org.apache.pig 19 | pig 20 | 0.16.0 21 | h2 22 | 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-common 27 | 2.7.3 28 | 29 | 30 | 31 | org.apache.hadoop 32 | hadoop-mapreduce-client-core 33 | 2.7.3 34 | 35 | 36 | 37 | org.apache.nutch 38 | nutch 39 | 1.12 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /druid/cloudacl-index.json: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "index_hadoop", 3 | "spec" : { 4 | "ioConfig" : { 5 | "type" : "hadoop", 6 | "inputSpec" : { 7 | "type" : "static", 8 | "paths" : "file:///Users/daijy/capstone/cloudacl.txt" 9 | } 10 | }, 11 | "dataSchema" : { 12 | "dataSource" : "cloudacl_accesslog", 13 | "granularitySpec" : { 14 | "type" : "uniform", 15 | "segmentGranularity" : "day", 16 | "queryGranularity" : "none", 17 | "intervals" : ["2017-03-05/2017-03-11"] 18 | }, 19 | "parser" : { 20 | "type" : "hadoopyString", 21 | "parseSpec" : { 22 | "format" : "tsv", 23 | "columns" : [ 24 | "country_code", 25 | "country", 26 | "city", 27 | "timestamp", 28 | "category" 29 | ], 30 | "dimensionsSpec" : { 31 | "dimensions" : [ 32 | "country_code", 33 | "city", 34 | "category" 35 | ] 36 | }, 37 | "timestampSpec" : { 38 | "format": "dd/MMM/yyyy:HH:mm:ss", 39 | "column" : "timestamp" 40 | } 41 | } 42 | }, 43 | "metricsSpec" : [ 44 | { 45 | "name" : "count", 46 | "type" : "count" 47 | } 48 | ] 49 | }, 50 | "tuningConfig" : { 51 | "type" : "hadoop", 52 | "partitionsSpec" : { 53 | "type" : "hashed", 54 | "targetPartitionSize" : 5000000 55 | }, 56 | "jobProperties" : {} 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /week2/evalfunc/src/java/com/example/pig/GetCountry.java: -------------------------------------------------------------------------------- 1 | package com.example.pig; 2 | 3 | 
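// UDF lifecycle notes: exec() runs once per input tuple, so the MaxMind
// LookupService is opened lazily on first use rather than in the constructor,
// which also runs on the Pig front end where GeoLiteCity.dat may not exist.
// getShipFiles() makes Pig ship GeoLiteCity.dat to each task's working
// directory through the distributed cache, which is why the relative path works.
// outputSchema() advertises the (country:chararray, city:chararray) result schema.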
import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.pig.EvalFunc; 8 | import org.apache.pig.data.Tuple; 9 | import org.apache.pig.data.TupleFactory; 10 | import org.apache.pig.impl.logicalLayer.schema.Schema; 11 | import org.apache.pig.impl.util.Utils; 12 | import org.apache.pig.parser.ParserException; 13 | 14 | import com.maxmind.geoip.Location; 15 | import com.maxmind.geoip.LookupService; 16 | 17 | public class GetCountry extends EvalFunc { 18 | LookupService cl; 19 | @Override 20 | public Tuple exec(Tuple t) throws IOException { 21 | if (cl == null) { 22 | cl = new LookupService("GeoLiteCity.dat", 23 | LookupService.GEOIP_MEMORY_CACHE ); 24 | } 25 | Location loc = cl.getLocation((String)t.get(0)); 26 | if (loc == null) { 27 | return null; 28 | } 29 | Tuple r = TupleFactory.getInstance().newTuple(); 30 | r.append(loc.countryName); 31 | r.append(loc.city); 32 | return r; 33 | } 34 | @Override 35 | public List getShipFiles() { 36 | List shipFiles = new ArrayList(); 37 | shipFiles.add("GeoLiteCity.dat"); 38 | return shipFiles; 39 | } 40 | @Override 41 | public Schema outputSchema(Schema input) { 42 | try { 43 | return Utils.getSchemaFromString("(country:chararray, city:chararray)"); 44 | } catch (ParserException e) { 45 | throw new RuntimeException(e); 46 | } 47 | } 48 | } -------------------------------------------------------------------------------- /week1/wordcount/patches/counters.patch: -------------------------------------------------------------------------------- 1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java 2 | index 6d47026..bb7127a 100644 3 | --- a/week1/wordcount/src/java/com/example/WordCount.java 4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java 5 | @@ -24,6 +24,7 @@ import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | +import org.apache.hadoop.mapreduce.Counters; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | @@ -32,12 +33,14 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.util.GenericOptionsParser; 15 | 16 | public class WordCount { 17 | + enum WordRange {A_M, N_Z} 18 | 19 | public static class TokenizerMapper 20 | extends Mapper{ 21 | 22 | private final static IntWritable one = new IntWritable(1); 23 | private Text word = new Text(); 24 | + Counters counters = new Counters(); 25 | 26 | public void map(Object key, Text value, Context context 27 | ) throws IOException, InterruptedException { 28 | @@ -45,6 +48,11 @@ public class WordCount { 29 | while (itr.hasMoreTokens()) { 30 | word.set(itr.nextToken()); 31 | context.write(word, one); 32 | + if (word.toString().toUpperCase().compareTo("N") < 0) { 33 | + context.getCounter(WordRange.A_M).increment(1); 34 | + } else { 35 | + context.getCounter(WordRange.N_Z).increment(1); 36 | + } 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /homework/week1/partitioner.patch: -------------------------------------------------------------------------------- 1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java 2 | index 954aaab..49a1ea0 100644 3 | --- a/week1/wordcount/src/java/com/example/WordCount.java 4 | 
+++ b/week1/wordcount/src/java/com/example/WordCount.java 5 | @@ -26,6 +26,7 @@ import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | +import org.apache.hadoop.mapreduce.Partitioner; 10 | import org.apache.hadoop.mapreduce.Reducer; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | @@ -65,6 +66,16 @@ public class WordCount { 14 | } 15 | } 16 | 17 | + public static class MyPartitioner extends Partitioner { 18 | + @Override 19 | + public int getPartition(Text key, IntWritable value, int numPartitions) { 20 | + if (key.charAt(0)<='n') { 21 | + return 1; 22 | + } else { 23 | + return 0; 24 | + } 25 | + } 26 | + } 27 | public static void main(String[] args) throws Exception { 28 | Configuration conf = new Configuration(); 29 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 30 | @@ -78,6 +89,8 @@ public class WordCount { 31 | job.setReducerClass(IntSumReducer.class); 32 | job.setOutputKeyClass(Text.class); 33 | job.setOutputValueClass(IntWritable.class); 34 | + job.setPartitionerClass(MyPartitioner.class); 35 | + job.setNumReduceTasks(2); 36 | for (int i = 0; i < otherArgs.length - 1; ++i) { 37 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 38 | } 39 | -------------------------------------------------------------------------------- /week1/wordcount/patches/distributedcache.patch: -------------------------------------------------------------------------------- 1 | diff --git a/week1/wordcount/pom.xml b/week1/wordcount/pom.xml 2 | index 416f1d4..5c73b93 100644 3 | --- a/week1/wordcount/pom.xml 4 | +++ b/week1/wordcount/pom.xml 5 | @@ -33,6 +33,12 @@ 6 | 2.7.3 7 | 8 | 9 | + 10 | + dk.brics.automaton 11 | + automaton 12 | + 1.11-8 13 | + 14 | + 15 | 16 | 17 | 18 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java 19 | index 6d47026..23a7f24 100644 20 | --- a/week1/wordcount/src/java/com/example/WordCount.java 21 | +++ b/week1/wordcount/src/java/com/example/WordCount.java 22 | @@ -31,6 +31,9 @@ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 23 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 24 | import org.apache.hadoop.util.GenericOptionsParser; 25 | 26 | +import dk.brics.automaton.Automaton; 27 | +import dk.brics.automaton.RegExp; 28 | + 29 | public class WordCount { 30 | 31 | public static class TokenizerMapper 32 | @@ -42,9 +45,13 @@ public class WordCount { 33 | public void map(Object key, Text value, Context context 34 | ) throws IOException, InterruptedException { 35 | StringTokenizer itr = new StringTokenizer(value.toString()); 36 | + Automaton automaton = new RegExp("h(.*)").toAutomaton(); 37 | while (itr.hasMoreTokens()) { 38 | - word.set(itr.nextToken()); 39 | - context.write(word, one); 40 | + String w = itr.nextToken(); 41 | + if (automaton.run(w)) { 42 | + word.set(w); 43 | + context.write(word, one); 44 | + } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /week2/evalfunc/patches/country_city.patch: -------------------------------------------------------------------------------- 1 | diff --git a/week2/evalfunc/src/java/com/example/pig/GetCountry.java b/week2/evalfunc/src/java/com/example/pig/GetCountry.java 2 | index 2526dec..2f8756f 100644 3 | --- 
a/week2/evalfunc/src/java/com/example/pig/GetCountry.java
4 | +++ b/week2/evalfunc/src/java/com/example/pig/GetCountry.java
5 | @@ -6,20 +6,30 @@ import java.util.List;
6 |
7 |  import org.apache.pig.EvalFunc;
8 |  import org.apache.pig.data.Tuple;
9 | +import org.apache.pig.data.TupleFactory;
10 | +import org.apache.pig.impl.logicalLayer.schema.Schema;
11 | +import org.apache.pig.impl.util.Utils;
12 | +import org.apache.pig.parser.ParserException;
13 |
14 |  import com.maxmind.geoip.Location;
15 |  import com.maxmind.geoip.LookupService;
16 |
17 | -public class GetCountry extends EvalFunc<String> {
18 | +public class GetCountry extends EvalFunc<Tuple> {
19 |    LookupService cl;
20 |    @Override
21 | -  public String exec(Tuple t) throws IOException {
22 | +  public Tuple exec(Tuple t) throws IOException {
23 |    if (cl == null) {
24 |      cl = new LookupService("GeoLiteCity.dat",
25 |        LookupService.GEOIP_MEMORY_CACHE );
26 |    }
27 |    Location loc = cl.getLocation((String)t.get(0));
28 | -  return loc!=null? loc.countryName:null;
29 | +  if (loc == null) {
30 | +    return null;
31 | +  }
32 | +  Tuple r = TupleFactory.getInstance().newTuple();
33 | +  r.append(loc.countryName);
34 | +  r.append(loc.city);
35 | +  return r;
36 |  }
37 |  @Override
38 |  public List<String> getShipFiles() {
39 | @@ -27,4 +37,12 @@ public class GetCountry extends EvalFunc<Tuple> {
40 |    shipFiles.add("GeoLiteCity.dat");
41 |    return shipFiles;
42 |  }
43 | + @Override
44 | + public Schema outputSchema(Schema input) {
45 | +   try {
46 | +     return Utils.getSchemaFromString("(country:chararray, city:chararray)");
47 | +   } catch (ParserException e) {
48 | +     throw new RuntimeException(e);
49 | +   }
50 | + }
51 |  }
52 | \ No newline at end of file
53 |
--------------------------------------------------------------------------------
/capstone/track1/data_description.txt:
--------------------------------------------------------------------------------
1 | Our data is the API request log collected by CloudACL. CloudACL's front-end products include a mobile app and a browser add-on; each front end intercepts URLs and issues API requests to the back-end Tomcat server to look up each URL's category, then acts on the URL according to that category.
2 | Each downloaded tar.gz archive holds one day of Tomcat log records and contains a number of .processed files, each in plain-text format with one record per line. Records come in two formats, corresponding to two different generations (old and new) of the API, and must be handled separately according to the GET URL. One example of each:
3 | axis2:
4 | 203.87.133.189 - - [04/Mar/2017:23:59:59 +0000] "GET /axis2/services/WebFilteringService/getCategoryByUrl?app=chrome_antiporn&ver=0.19.7.1&url=https%3A//www.googleapis.com/rpc&cat=search-engine HTTP/1.1" 200 133 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
5 |
6 | jersey:
7 | 51.39.90.114 - - [05/Mar/2017:00:04:19 +0000] "GET /webapi/getcategory?uri=pt.tvtc.gov.sa&cat=government&key=d647fe2a-2193-4c2d-8cad-38e78316d020 HTTP/1.1" 200 63 "-" "Apache-HttpClient/UNAVAILABLE (java 1.4)"
8 |
9 | In the axis2 API, app is the name of the front-end app. In the jersey API, key is the front-end app's application key. The other fields should be self-explanatory.
10 | Some students may wonder why the request already contains the URL's category (cat). This field was added later to make the data richer; the original data did not have it. The possible categories are:
11 |
unknown,hacking,phishing-and-fraud,botnet,malware,spyware-and-adware,keylogger-and-monitoring,peer2peer,media-streaming,online-storage,abortion,adult-and-pornography,sex-education,nudity,abused-drugs,marijuana,healthy-and-medicine,real-estate,internet-security,financial-service,business-and-economy,computer-information,auctions,shopping,cult-and-occult,travel,home-garden,military,social-networking,dead-sites,stock-and-tool,training-and-tool,dating,religion,entertainment-and-art,personal-site-and-blog,legal,local-info,job-search,gambling,translation,research-reference,software-download,game,philosophy-and-political,weapon,pay2surf,hunting-and-fishing,society,educational-institution,online-greeting,sport,swimsuits-&-intimate-apparel,questionable,kid,search-engine,internet-portal,online-advertisement,web-mail,envasion-proxy,music,government,news-and-media,content-delivery-network,internet-communication,spam-comfirmed,spam-url,spam-unconfirmed,http-proxy,dynamically-content,parked-domain,alcohol-and-tobacco,private-ip,image-and-video-search,fashion-and-beauty,recreation-and-hobbies,motor-vehicle,web-hosting 12 | -------------------------------------------------------------------------------- /week3/oozie/workflow-all.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ${jobTracker} 6 | ${nameNode} 7 | hive-config.xml 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | ${jobTracker} 16 | ${nameNode} 17 | 18 | 19 | 20 | 21 | 22 | mapred.job.queue.name 23 | ${queueName} 24 | 25 | 26 | import --connect jdbc:mysql://localhost/cs502 --username hadoop --password hadoop --table student --hive-import --hive-home /home/hadoop/apache-hive-1.2.1-bin --create-hive-table --hive-table student --m 2 --split-by age 27 | 28 | 29 | 30 | 31 | 32 | 33 | ${jobTracker} 34 | ${nameNode} 35 | 36 | 37 | 38 | 39 | 40 | mapred.job.queue.name 41 | ${queueName} 42 | 43 | 44 | 45 | OUTPUT=/user/${wf:user()}/output 46 | 47 | 48 | 49 | 50 | 51 | Workflow failed, error message[${wf:errorMessage(wf:lastErrorNode())}] 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /week1/googleplaycrawler/fixskew.patch: -------------------------------------------------------------------------------- 1 | commit 7cf95ef3bd61497a4359f091406c8c3e186fb06b 2 | Author: Daniel Dai 3 | Date: Thu Feb 9 22:00:45 2017 -0800 4 | 5 | Fix skew 6 | 7 | diff --git a/conf/nutch-site.xml.template b/conf/nutch-site.xml.template 8 | index 85dc30b..7ae16d2 100644 9 | --- a/conf/nutch-site.xml.template 10 | +++ b/conf/nutch-site.xml.template 11 | @@ -37,6 +37,10 @@ 12 | 3600 13 | 14 | 15 | + partition.url.mode 16 | + byURL 17 | + 18 | + 19 | fetcher.threads.fetch 20 | 20 21 | 22 | diff --git a/src/java/org/apache/nutch/crawl/URLPartitioner.java b/src/java/org/apache/nutch/crawl/URLPartitioner.java 23 | index 4675f83..eb6844b 100644 24 | --- a/src/java/org/apache/nutch/crawl/URLPartitioner.java 25 | +++ b/src/java/org/apache/nutch/crawl/URLPartitioner.java 26 | @@ -42,6 +42,7 @@ public class URLPartitioner implements Partitioner { 27 | public static final String PARTITION_MODE_HOST = "byHost"; 28 | public static final String PARTITION_MODE_DOMAIN = "byDomain"; 29 | public static final String PARTITION_MODE_IP = "byIP"; 30 | + public static final String PARTITION_MODE_URL = "byURL"; 31 | 32 | private int seed; 33 | private URLNormalizers normalizers; 34 | @@ -52,7 +53,7 @@ public class URLPartitioner implements Partitioner { 35 | mode = job.get(PARTITION_MODE_KEY, 
PARTITION_MODE_HOST); 36 | // check that the mode is known 37 | if (!mode.equals(PARTITION_MODE_IP) && !mode.equals(PARTITION_MODE_DOMAIN) 38 | - && !mode.equals(PARTITION_MODE_HOST)) { 39 | + && !mode.equals(PARTITION_MODE_HOST) && !mode.equals(PARTITION_MODE_URL)) { 40 | LOG.error("Unknown partition mode : " + mode + " - forcing to byHost"); 41 | mode = PARTITION_MODE_HOST; 42 | } 43 | @@ -71,7 +72,11 @@ public class URLPartitioner implements Partitioner { 44 | urlString = normalizers.normalize(urlString, 45 | URLNormalizers.SCOPE_PARTITION); 46 | url = new URL(urlString); 47 | - hashCode = url.getHost().hashCode(); 48 | + if (mode.equals(PARTITION_MODE_URL)) { 49 | + hashCode = url.toString().hashCode(); 50 | + } else { 51 | + hashCode = url.getHost().hashCode(); 52 | + } 53 | } catch (MalformedURLException e) { 54 | LOG.warn("Malformed URL: '" + urlString + "'"); 55 | } 56 | -------------------------------------------------------------------------------- /week2/pigserver/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.example 6 | pigserver 7 | 0.0.1-SNAPSHOT 8 | pigtest 9 | jar 10 | 11 | 12 | ${basedir}/src/java 13 | 14 | 15 | org.codehaus.mojo 16 | exec-maven-plugin 17 | 1.5.0 18 | 19 | 20 | 21 | java 22 | 23 | 24 | 25 | 26 | false 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | org.apache.hadoop 37 | hadoop-common 38 | 2.7.3 39 | 40 | 41 | 42 | org.apache.hadoop 43 | hadoop-hdfs 44 | 2.7.3 45 | 46 | 47 | 48 | org.apache.hadoop 49 | hadoop-mapreduce-client-core 50 | 2.7.3 51 | 52 | 53 | 54 | org.apache.hadoop 55 | hadoop-mapreduce-client-common 56 | 2.7.3 57 | 58 | 59 | 60 | org.apache.hadoop 61 | hadoop-mapreduce-client-jobclient 62 | 2.7.3 63 | 64 | 65 | 66 | org.apache.pig 67 | pig 68 | 0.16.0 69 | h2 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /homework/week1/dict.patch: -------------------------------------------------------------------------------- 1 | diff --git a/week1/wordcount/src/java/com/example/WordCount.java b/week1/wordcount/src/java/com/example/WordCount.java 2 | index 954aaab..d1fa7f6 100644 3 | --- a/week1/wordcount/src/java/com/example/WordCount.java 4 | +++ b/week1/wordcount/src/java/com/example/WordCount.java 5 | @@ -17,7 +17,13 @@ 6 | */ 7 | package com.example; 8 | 9 | +import java.io.BufferedReader; 10 | +import java.io.File; 11 | +import java.io.FileReader; 12 | import java.io.IOException; 13 | +import java.net.URI; 14 | +import java.util.HashMap; 15 | +import java.util.Map; 16 | import java.util.StringTokenizer; 17 | 18 | import org.apache.hadoop.conf.Configuration; 19 | @@ -27,6 +33,7 @@ import org.apache.hadoop.io.Text; 20 | import org.apache.hadoop.mapreduce.Job; 21 | import org.apache.hadoop.mapreduce.Mapper; 22 | import org.apache.hadoop.mapreduce.Reducer; 23 | +import org.apache.hadoop.mapreduce.Mapper.Context; 24 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 25 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 26 | import org.apache.hadoop.util.GenericOptionsParser; 27 | @@ -38,12 +45,27 @@ public class WordCount { 28 | 29 | private final static IntWritable one = new IntWritable(1); 30 | private Text word = new Text(); 31 | - 32 | + Map dict = new HashMap(); 33 | + 34 | + @Override 35 | + public void setup(Context context) throws IOException, InterruptedException { 36 | + BufferedReader reader = new BufferedReader(new FileReader("dict.txt")); 37 | + String line; 38 | + while 
((line=reader.readLine())!=null) { 39 | + String[] items = line.split("\t"); 40 | + dict.put(items[0], items[1]); 41 | + } 42 | + reader.close(); 43 | + } 44 | public void map(Object key, Text value, Context context 45 | ) throws IOException, InterruptedException { 46 | StringTokenizer itr = new StringTokenizer(value.toString()); 47 | while (itr.hasMoreTokens()) { 48 | - word.set(itr.nextToken()); 49 | + String w = itr.nextToken(); 50 | + if (dict.containsKey(w)) { 51 | + w = dict.get(w); 52 | + } 53 | + word.set(w); 54 | context.write(word, one); 55 | } 56 | } 57 | @@ -78,6 +100,7 @@ public class WordCount { 58 | job.setReducerClass(IntSumReducer.class); 59 | job.setOutputKeyClass(Text.class); 60 | job.setOutputValueClass(IntWritable.class); 61 | + job.setCacheArchives(new URI[] {new File("dict.txt").toURI()}); 62 | for (int i = 0; i < otherArgs.length - 1; ++i) { 63 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 64 | } 65 | -------------------------------------------------------------------------------- /week3/hive/tpcds/upload.sh: -------------------------------------------------------------------------------- 1 | hadoop fs -mkdir /data/call_center 2 | hadoop fs -put /home/hadoop/data/call_center.dat /data/call_center 3 | hadoop fs -mkdir /data/catalog_page 4 | hadoop fs -put /home/hadoop/data/catalog_page.dat /data/catalog_page 5 | hadoop fs -mkdir /data/catalog_returns 6 | hadoop fs -put /home/hadoop/data/catalog_returns.dat /data/catalog_returns 7 | hadoop fs -mkdir /data/catalog_sales 8 | hadoop fs -put /home/hadoop/data/catalog_sales.dat /data/catalog_sales 9 | hadoop fs -mkdir /data/customer 10 | hadoop fs -put /home/hadoop/data/customer.dat /data/customer 11 | hadoop fs -mkdir /data/customer_address 12 | hadoop fs -put /home/hadoop/data/customer_address.dat /data/customer_address 13 | hadoop fs -mkdir /data/customer_demographics 14 | hadoop fs -put /home/hadoop/data/customer_demographics.dat /data/customer_demographics 15 | hadoop fs -mkdir /data/date_dim 16 | hadoop fs -put /home/hadoop/data/date_dim.dat /data/date_dim 17 | hadoop fs -mkdir /data/dbgen_version 18 | hadoop fs -put /home/hadoop/data/dbgen_version.dat /data/dbgen_version 19 | hadoop fs -mkdir /data/household_demographics 20 | hadoop fs -put /home/hadoop/data/household_demographics.dat /data/household_demographics 21 | hadoop fs -mkdir /data/income_band 22 | hadoop fs -put /home/hadoop/data/income_band.dat /data/income_band 23 | hadoop fs -mkdir /data/inventory 24 | hadoop fs -put /home/hadoop/data/inventory.dat /data/inventory 25 | hadoop fs -mkdir /data/item 26 | hadoop fs -put /home/hadoop/data/item.dat /data/item 27 | hadoop fs -mkdir /data/promotion 28 | hadoop fs -put /home/hadoop/data/promotion.dat /data/promotion 29 | hadoop fs -mkdir /data/reason 30 | hadoop fs -put /home/hadoop/data/reason.dat /data/reason 31 | hadoop fs -mkdir /data/ship_mode 32 | hadoop fs -put /home/hadoop/data/ship_mode.dat /data/ship_mode 33 | hadoop fs -mkdir /data/store 34 | hadoop fs -put /home/hadoop/data/store.dat /data/store 35 | hadoop fs -mkdir /data/store_returns 36 | hadoop fs -put /home/hadoop/data/store_returns.dat /data/store_returns 37 | hadoop fs -mkdir /data/store_sales 38 | hadoop fs -put /home/hadoop/data/store_sales.dat /data/store_sales 39 | hadoop fs -mkdir /data/time_dim 40 | hadoop fs -put /home/hadoop/data/time_dim.dat /data/time_dim 41 | hadoop fs -mkdir /data/warehouse 42 | hadoop fs -put /home/hadoop/data/warehouse.dat /data/warehouse 43 | hadoop fs -mkdir /data/web_page 44 | hadoop fs -put 
/home/hadoop/data/web_page.dat /data/web_page 45 | hadoop fs -mkdir /data/web_returns 46 | hadoop fs -put /home/hadoop/data/web_returns.dat /data/web_returns 47 | hadoop fs -mkdir /data/web_sales 48 | hadoop fs -put /home/hadoop/data/web_sales.dat /data/web_sales 49 | hadoop fs -mkdir /data/web_site 50 | hadoop fs -put /home/hadoop/data/web_site.dat /data/web_site 51 | -------------------------------------------------------------------------------- /week4/PIG-3399-2.patch: -------------------------------------------------------------------------------- 1 | Index: build.xml 2 | =================================================================== 3 | --- build.xml (revision 1636705) 4 | +++ build.xml (working copy) 5 | @@ -310,7 +310,7 @@ 6 | 7 | 8 | 12 | 13 | 14 | @@ -334,7 +334,7 @@ 15 | 16 | 17 | 18 | - 19 | + 20 | 21 | 22 | 23 | @@ -399,7 +399,6 @@ 24 | 25 | 26 | 27 | - 28 | 29 | 30 | 31 | @@ -482,11 +481,13 @@ 32 | 33 | 34 | 35 | + 36 | 37 | 38 | 39 | 40 | 41 | + 42 | 43 | 44 | { 38 | 39 | private final static IntWritable one = new IntWritable(1); 40 | private Text word = new Text(); 41 | 42 | public void map(Object key, Text value, Context context 43 | ) throws IOException, InterruptedException { 44 | StringTokenizer itr = new StringTokenizer(value.toString()); 45 | while (itr.hasMoreTokens()) { 46 | word.set(itr.nextToken()); 47 | context.write(word, one); 48 | } 49 | } 50 | } 51 | 52 | public static class IntSumReducer 53 | extends Reducer { 54 | private IntWritable result = new IntWritable(); 55 | 56 | public void reduce(Text key, Iterable values, 57 | Context context 58 | ) throws IOException, InterruptedException { 59 | int sum = 0; 60 | for (IntWritable val : values) { 61 | sum += val.get(); 62 | } 63 | result.set(sum); 64 | context.write(key, result); 65 | } 66 | } 67 | 68 | public static void main(String[] args) throws Exception { 69 | Configuration conf = new Configuration(); 70 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 71 | if (otherArgs.length < 2) { 72 | System.err.println("Usage: wordcount [...] "); 73 | System.exit(2); 74 | } 75 | Job job = Job.getInstance(conf, "word count"); 76 | job.setJarByClass(WordCount.class); 77 | job.setMapperClass(TokenizerMapper.class); 78 | job.setReducerClass(IntSumReducer.class); 79 | job.setOutputKeyClass(Text.class); 80 | job.setOutputValueClass(IntWritable.class); 81 | for (int i = 0; i < otherArgs.length - 1; ++i) { 82 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 83 | } 84 | FileOutputFormat.setOutputPath(job, 85 | new Path(otherArgs[otherArgs.length - 1])); 86 | System.exit(job.waitForCompletion(true) ? 0 : 1); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /week4/doc.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/docs/src/documentation/content/xdocs/basic.xml b/src/docs/src/documentation/content/xdocs/basic.xml 2 | index a631607..854d511 100644 3 | --- a/src/docs/src/documentation/content/xdocs/basic.xml 4 | +++ b/src/docs/src/documentation/content/xdocs/basic.xml 5 | @@ -7629,16 +7629,16 @@ ILLUSTRATE A; 6 | 7 | 8 | 9 | -
10 | - MAPREDUCE 11 | -

Executes native MapReduce jobs inside a Pig script.

12 | +
13 | + NATIVE 14 | +

Executes native MapReduce/Tez jobs inside a Pig script.

15 | 16 |
17 | Syntax 18 | 19 | 20 | 25 | 26 | @@ -7658,11 +7658,11 @@ ILLUSTRATE A; 27 | 28 | 29 | 33 | 40 | 41 | @@ -7684,7 +7684,7 @@ ILLUSTRATE A; 42 | 43 | 48 | 49 | 50 | @@ -7693,7 +7693,7 @@ ILLUSTRATE A; 51 |

`params, ...`

52 | 53 | 57 | 58 | 59 | @@ -7702,20 +7702,20 @@ ILLUSTRATE A; 60 | 61 |
62 | Usage 63 | -

Use the MAPREDUCE operator to run native MapReduce jobs from inside a Pig script.

64 | +

Use the NATIVE operator to run native MapReduce/Tez jobs from inside a Pig script.

65 | 66 | -

The input and output locations for the MapReduce program are conveyed to Pig using the STORE/LOAD clauses. 67 | -Pig, however, does not pass this information (nor require that this information be passed) to the MapReduce program. 68 | -If you want to pass the input and output locations to the MapReduce program you can use the params clause or you can hardcode the locations in the MapReduce program.

69 | +

The input and output locations for the MapReduce/tez program are conveyed to Pig using the STORE/LOAD clauses. 70 | +Pig, however, does not pass this information (nor require that this information be passed) to the MapReduce/Tez program. 71 | +If you want to pass the input and output locations to the MapReduce/tez program you can use the params clause or you can hardcode the locations in the MapReduce/Tez program.

72 |
73 | 74 |
75 | Example 76 | -

This example demonstrates how to run the wordcount MapReduce progam from Pig. 77 | -Note that the files specified as input and output locations in the MAPREDUCE statement will NOT be deleted by Pig automatically. You will need to delete them manually.

78 | +

This example demonstrates how to run the wordcount MapReduce progam from Pig (if exectype=mapreduce). 79 | +Note that the files specified as input and output locations in the NATIVE statement will NOT be deleted by Pig automatically. You will need to delete them manually.

80 | 
81 |  A = LOAD 'WordcountInput.txt';
82 | -B = MAPREDUCE 'wordcount.jar' STORE A INTO 'inputDir' LOAD 'outputDir'
83 | +B = NATIVE 'wordcount.jar' STORE A INTO 'inputDir' LOAD 'outputDir'
84 |      AS (word:chararray, count: int) `org.myorg.WordCount inputDir outputDir`;
85 | 
86 | 
87 | 


--------------------------------------------------------------------------------
/week3/hive/tpcds/insert.sql:
--------------------------------------------------------------------------------
 1 | set hive.exec.max.dynamic.partitions=3000;
 2 | set hive.exec.max.dynamic.partitions.pernode=3000;
 3 | set hive.exec.dynamic.partition.mode=nonstrict;
 4 | 
 5 | insert into tpcds.call_center select * from tpcds_base.call_center;
 6 | 
 7 | insert into tpcds.catalog_page select * from tpcds_base.catalog_page;
 8 | 
 9 | insert into tpcds.catalog_returns partition(cr_returned_date_sk) select cr_returned_time_sk,cr_item_sk,cr_refunded_customer_sk,cr_refunded_cdemo_sk,cr_refunded_hdemo_sk,cr_refunded_addr_sk,cr_returning_customer_sk,cr_returning_cdemo_sk,cr_returning_hdemo_sk,cr_returning_addr_sk,cr_call_center_sk,cr_catalog_page_sk,cr_ship_mode_sk,cr_warehouse_sk,cr_reason_sk,cr_order_number,cr_return_quantity,cr_return_amount,cr_return_tax,cr_return_amt_inc_tax,cr_fee,cr_return_ship_cost,cr_refunded_cash,cr_reversed_charge,cr_store_credit,cr_net_loss,cr_returned_date_sk from tpcds_base.catalog_returns;
10 | 
11 | insert into tpcds.catalog_sales partition(cs_sold_date_sk) select cs_sold_time_sk,cs_ship_date_sk,cs_bill_customer_sk,cs_bill_cdemo_sk,cs_bill_hdemo_sk,cs_bill_addr_sk,cs_ship_customer_sk,cs_ship_cdemo_sk,cs_ship_hdemo_sk,cs_ship_addr_sk,cs_call_center_sk,cs_catalog_page_sk,cs_ship_mode_sk,cs_warehouse_sk,cs_item_sk,cs_promo_sk,cs_order_number,cs_quantity,cs_wholesale_cost,cs_list_price,cs_sales_price,cs_ext_discount_amt,cs_ext_sales_price,cs_ext_wholesale_cost,cs_ext_list_price,cs_ext_tax,cs_coupon_amt,cs_ext_ship_cost,cs_net_paid,cs_net_paid_inc_tax,cs_net_paid_inc_ship,cs_net_paid_inc_ship_tax,cs_net_profit,cs_sold_date_sk from tpcds_base.catalog_sales;
12 | 
13 | insert into tpcds.customer select * from tpcds_base.customer;
14 | 
15 | insert into tpcds.customer_address select * from tpcds_base.customer_address;
16 | 
17 | insert into tpcds.customer_demographics select * from tpcds_base.customer_demographics;
18 | 
19 | insert into tpcds.date_dim select * from tpcds_base.date_dim;
20 | 
21 | insert into tpcds.household_demographics select * from tpcds_base.household_demographics;
22 | 
23 | insert into tpcds.income_band select * from tpcds_base.income_band;
24 | 
25 | insert into tpcds.inventory partition(inv_date_sk) select inv_item_sk,inv_warehouse_sk,inv_quantity_on_hand,inv_date_sk from tpcds_base.inventory;
26 | 
27 | insert into tpcds.item select * from tpcds_base.item;
28 | 
29 | insert into tpcds.promotion select * from tpcds_base.promotion;
30 | 
31 | insert into tpcds.reason select * from tpcds_base.reason;
32 | 
33 | insert into tpcds.ship_mode select * from tpcds_base.ship_mode;
34 | 
35 | insert into tpcds.store select * from tpcds_base.store;
36 | 
37 | insert into tpcds.store_returns partition(sr_returned_date_sk) select sr_return_time_sk,sr_item_sk,sr_customer_sk,sr_cdemo_sk,sr_hdemo_sk,sr_addr_sk,sr_store_sk,sr_reason_sk,sr_ticket_number,sr_return_quantity,sr_return_amt,sr_return_tax,sr_return_amt_inc_tax,sr_fee,sr_return_ship_cost,sr_refunded_cash,sr_reversed_charge,sr_store_credit,sr_net_loss,sr_returned_date_sk from tpcds_base.store_returns;
38 | 
39 | insert into tpcds.store_sales partition(ss_sold_date_sk) select ss_sold_time_sk,ss_item_sk,ss_customer_sk,ss_cdemo_sk,ss_hdemo_sk,ss_addr_sk,ss_store_sk,ss_promo_sk,ss_ticket_number,ss_quantity,ss_wholesale_cost,ss_list_price,ss_sales_price,ss_ext_discount_amt,ss_ext_sales_price,ss_ext_wholesale_cost,ss_ext_list_price,ss_ext_tax,ss_coupon_amt,ss_net_paid,ss_net_paid_inc_tax,ss_net_profit,ss_sold_date_sk from tpcds_base.store_sales;
40 | 
41 | insert into tpcds.time_dim select * from tpcds_base.time_dim;
42 | 
43 | insert into tpcds.warehouse select * from tpcds_base.warehouse;
44 | 
45 | insert into tpcds.web_page select * from tpcds_base.web_page;
46 | 
47 | insert into tpcds.web_returns partition(wr_returned_date_sk) select wr_returned_time_sk,wr_item_sk,wr_refunded_customer_sk,wr_refunded_cdemo_sk,wr_refunded_hdemo_sk,wr_refunded_addr_sk,wr_returning_customer_sk,wr_returning_cdemo_sk,wr_returning_hdemo_sk,wr_returning_addr_sk,wr_web_page_sk,wr_reason_sk,wr_order_number,wr_return_quantity,wr_return_amt,wr_return_tax,wr_return_amt_inc_tax,wr_fee,wr_return_ship_cost,wr_refunded_cash,wr_reversed_charge,wr_account_credit,wr_net_loss,wr_returned_date_sk from tpcds_base.web_returns;
48 | 
49 | insert into tpcds.web_sales partition(ws_sold_date_sk) select ws_sold_time_sk,ws_ship_date_sk,ws_item_sk,ws_bill_customer_sk,ws_bill_cdemo_sk,ws_bill_hdemo_sk,ws_bill_addr_sk,ws_ship_customer_sk,ws_ship_cdemo_sk,ws_ship_hdemo_sk,ws_ship_addr_sk,ws_web_page_sk,ws_web_site_sk,ws_ship_mode_sk,ws_warehouse_sk,ws_promo_sk,ws_order_number,ws_quantity,ws_wholesale_cost,ws_list_price,ws_sales_price,ws_ext_discount_amt,ws_ext_sales_price,ws_ext_wholesale_cost,ws_ext_list_price,ws_ext_tax,ws_coupon_amt,ws_ext_ship_cost,ws_net_paid,ws_net_paid_inc_tax,ws_net_paid_inc_ship,ws_net_paid_inc_ship_tax,ws_net_profit,ws_sold_date_sk from tpcds_base.web_sales;
50 | 
51 | insert into tpcds.web_site select * from tpcds_base.web_site;
52 | 
53 | 
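A note on the pattern above: the three leading set commands raise Hive's dynamic-partition limits and switch to nonstrict mode so the inserts can be fully dynamic (no static partition value). The partitioned targets cannot simply use select * because Hive requires the dynamic partition column to be the last expression in the select list, while the tpcds_base text tables (see all-tables.sql below) carry it as their first column; hence the explicit column lists ending in cr_returned_date_sk, cs_sold_date_sk, and so on. A minimal sketch of the same pattern on a hypothetical staging/target pair (these table and column names are illustrative, not part of TPC-DS):

set hive.exec.dynamic.partition.mode=nonstrict;

create table sales_by_day (amount double)
partitioned by (sold_date_sk bigint)
stored as orc;

-- The dynamic partition column goes last in the select list;
-- Hive routes each row to the partition named by its value.
insert into sales_by_day partition(sold_date_sk)
select amount, sold_date_sk
from sales_staging;
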
--------------------------------------------------------------------------------
/week4/jobname.patch:
--------------------------------------------------------------------------------
  1 | Index: src/org/apache/pig/scripting/BoundScript.java
  2 | ===================================================================
  3 | --- src/org/apache/pig/scripting/BoundScript.java	(revision 1785219)
  4 | +++ src/org/apache/pig/scripting/BoundScript.java	(working copy)
  5 | @@ -17,6 +17,7 @@
  6 |   */
  7 |  package org.apache.pig.scripting;
  8 |  
  9 | +import java.io.File;
 10 |  import java.io.FileInputStream;
 11 |  import java.io.IOException;
 12 |  import java.io.StringReader;
 13 | @@ -264,7 +265,9 @@
 14 |          LOG.info("Query to run:\n" + query);
 15 |          List listeners = ScriptState.get().getAllListeners();
 16 |          PigContext pc = scriptContext.getPigContext();
 17 | +        String scriptName = new File(ScriptState.get().getFileName()).getName();
 18 |          ScriptState scriptState = pc.getExecutionEngine().instantiateScriptState();
 19 | +        scriptState.setFileName(scriptName);
 20 |          ScriptState.start(scriptState);
 21 |          ScriptState.get().setScript(query);
 22 |          for (PigProgressNotificationListener listener : listeners) {
 23 | @@ -271,6 +274,9 @@
 24 |              ScriptState.get().registerListener(listener);
 25 |          }
 26 |          PigServer pigServer = new PigServer(scriptContext.getPigContext(), false);
 27 | +        if (pc.getProperties().getProperty(PigContext.JOB_NAME) == null) {
 28 | +            pigServer.setJobName(scriptName);
 29 | +        }
 30 |          GruntParser grunt = new GruntParser(new StringReader(query), pigServer);
 31 |          grunt.setInteractive(false);
 32 |          try {
 33 | Index: test/org/apache/pig/test/TestScriptLanguage.java
 34 | ===================================================================
 35 | --- test/org/apache/pig/test/TestScriptLanguage.java	(revision 1785219)
 36 | +++ test/org/apache/pig/test/TestScriptLanguage.java	(working copy)
 37 | @@ -31,6 +31,7 @@
 38 |  import org.apache.pig.PigRunner;
 39 |  import org.apache.pig.PigServer;
 40 |  import org.apache.pig.data.Tuple;
 41 | +import org.apache.pig.impl.PigContext;
 42 |  import org.apache.pig.scripting.ScriptEngine;
 43 |  import org.apache.pig.tools.pigstats.OutputStats;
 44 |  import org.apache.pig.tools.pigstats.PigStats;
 45 | @@ -669,6 +670,59 @@
 46 |          assertFileNotExists(file1, file2);
 47 |      }
 48 |  
 49 | +    @Test
 50 | +    public void testJobName() throws Exception {
 51 | +        String[] script1 = {
 52 | +            "#!/usr/bin/python",
 53 | +            "from org.apache.pig.scripting import *",
 54 | +            "Pig.fs(\"rmr simple_out\")",
 55 | +            "input = 'simple_table'",
 56 | +            "output = 'simple_out'",
 57 | +            "P = Pig.compile(\"\"\"a = load '$input';store a into '$output';\"\"\")",
 58 | +            "Q = P.bind({'input':input, 'output':output})",
 59 | +            "stats = Q.runSingle()",
 60 | +            "if stats.isSuccessful():",
 61 | +            "\tprint 'success!'",
 62 | +            "else:",
 63 | +            "\traise 'failed'"
 64 | +        };
 65 | +        String[] input = {
 66 | +            "1\t3",
 67 | +            "2\t4",
 68 | +            "3\t5"
 69 | +        };
 70 | +
 71 | +        File script1File = Util.createInputFile("jobname1", ".py", script1);
 72 | +        Util.createLocalInputFile("simple_table", input);
 73 | +
 74 | +        PigStats stats = PigRunner.run(new String[] { "-x", Util.getLocalTestMode().toString(),
 75 | +                "-f", script1File.getAbsolutePath()}, null);
 76 | +        String jobName = stats.getAllStats().values().iterator().next().get(0).getPigProperties().getProperty(PigContext.JOB_NAME);
 77 | +        assertTrue(jobName.contains(script1File.getName()));
 78 | +
 79 | +        // set jobName manually in script
 80 | +        String[] script2 = {
 81 | +            "#!/usr/bin/python",
 82 | +            "from org.apache.pig.scripting import *",
 83 | +            "Pig.fs(\"rmr simple_out\")",
 84 | +            "input = 'simple_table'",
 85 | +            "output = 'simple_out'",
 86 | +            "P = Pig.compile(\"\"\"a = load '$input';store a into '$output';\"\"\")",
 87 | +            "P.set(\"jobName\", \"myjob\")",
 88 | +            "Q = P.bind({'input':input, 'output':output})",
 89 | +            "stats = Q.runSingle()",
 90 | +            "if stats.isSuccessful():",
 91 | +            "\tprint 'success!'",
 92 | +            "else:",
 93 | +            "\traise 'failed'"
 94 | +        };
 95 | +        File script2File = Util.createInputFile("jobname2", ".py", script2);
 96 | +        stats = PigRunner.run(new String[] { "-x", Util.getLocalTestMode().toString(),
 97 | +                "-f", script2File.getAbsolutePath()}, null);
 98 | +        jobName = stats.getAllStats().values().iterator().next().get(0).getPigProperties().getProperty(PigContext.JOB_NAME);
 99 | +        assertTrue(jobName.contains("myjob"));
100 | +    }
101 | +
102 |      private void createEmptyFiles(String... filenames) throws IOException {
103 |          for (String file : filenames) {
104 |              Util.createInputFile(cluster, file, new String[]{""});
105 | 
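In effect, jobname.patch makes a script launched through Pig's scripting API default its MapReduce job name to the driving script's file name, while a script that calls P.set("jobName", "myjob") keeps its explicit name; the two embedded scripts in testJobName exercise exactly those two paths.
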
--------------------------------------------------------------------------------
/week2/python/kmeans.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # This code is made available under the Apache License, Version 2.0 (the
  3 | # "License"); you may not use this file except in compliance with the License.
  4 | # You may obtain a copy of the License at
  5 | #
  6 | #     http://www.apache.org/licenses/LICENSE-2.0
  7 | #
  8 | # Unless required by applicable law or agreed to in writing, software
  9 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 10 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 11 | # License for the specific language governing permissions and limitations
 12 | # under the License.
 13 | 
 14 | import sys
 15 | from math import fabs
 16 | from org.apache.pig.scripting import Pig
 17 | 
 18 | @outputSchemaFunction("findCentroidSchema")
 19 | # Assign each value to the closest centroid
 20 | def findCentroid(initialCentroid, value):
 21 |     # initialCentroid is constant per round; the centroids could be derived once and cached
 22 |     centroids = initialCentroid.split(":")
 23 | 
 24 |     min_distance = float("inf")
 25 |     closest_centroid = 0
 26 |     for centroid in centroids:
 27 |         distance = fabs(float(centroid) - value)
 28 |         if distance < min_distance:
 29 |             min_distance = distance
 30 |             closest_centroid = centroid
 31 |     return float(closest_centroid)
 32 | 
 33 | # The output schema is the second field of the input, i.e. the same type as the value parameter
 34 | @schemaFunction("findCentroidSchema")
 35 | def findCentroidSchema(input):
 36 |     return input.getField(1)
 37 | 
 38 | def main():
 39 |     filename = "studenttab10k"
 40 |     k = 4
 41 |     tolerance = 0.01
 42 | 
 43 |     MAX_SCORE = 4
 44 |     MIN_SCORE = 0
 45 |     MAX_ITERATION = 100
 46 | 
 47 |     # initial centroids: divide the space equally
 48 |     initial_centroids = ""
 49 |     last_centroids = [None] * k
 50 |     for i in range(k):
 51 |         last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
 52 |         initial_centroids = initial_centroids + str(last_centroids[i])
 53 |         if i != k-1:
 54 |             initial_centroids = initial_centroids + ":"
 55 | 
 56 |     # Compile the Pig script. Register this same script, since it contains the Jython UDF.
 57 |     # $centroids is the only binding parameter; each round it is bound to the centroid
 58 |     # estimates from the previous round. We then average each new cluster to get the
 59 |     # new centroid estimates, and store them into "output".
 60 |     P = Pig.compile("""register 'kmeans.py' using jython as util;
 61 | raw = load 'studenttab10k' as (name:chararray, age:int, gpa:double);
 62 | centroided = foreach raw generate gpa, util.findCentroid('$centroids', gpa) as centroid;
 63 | grouped = group centroided by centroid;
 64 | result = foreach grouped generate group, AVG(centroided.gpa);
 65 | store result into 'output';
 66 | """)
 67 | 
 68 |     converged = False
 69 |     iter_num = 0
 70 |     while iter_num < MAX_ITERATION:
 71 |         # Bind the $centroids parameter to the current centroids
 72 |         Q = P.bind({'centroids':initial_centroids})
 73 | 
 74 |         # Run the Pig script
 75 |         results = Q.runSingle()
 76 | 
 77 |         # Check the result of the Pig script (isSuccessful() returns a boolean)
 78 |         if not results.isSuccessful():
 79 |             raise Exception("Pig job failed")
 80 | 
 81 |         # Get the new centroids from the output
 82 |         iter = results.result("result").iterator()
 83 |         centroids = [None] * k
 84 |         distance_move = 0
 85 | 
 86 |         # Calculate the distance moved since the last iteration
 87 |         for i in range(k):
 88 |             tuple = iter.next()
 89 |             centroids[i] = float(str(tuple.get(1)))
 90 |             distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
 91 |         distance_move = distance_move / k
 92 |         Pig.fs("rmr output")
 93 |         print("iteration " + str(iter_num))
 94 |         print("average distance moved: " + str(distance_move))
 95 | 
 96 |         # Converged
 97 |         if distance_move < tolerance:
 98 |             sys.stdout.write("k-means converged at centroids: [")
 99 |             sys.stdout.write(",".join(str(v) for v in centroids))
100 |             sys.stdout.write("]\n")
101 |             converged = True
102 |             break
103 | 
104 |         # Not converged; use the new centroids as the initial centroids for the next iteration
105 |         last_centroids = centroids[:]
106 |         initial_centroids = ""
107 |         for i in range(k):
108 |             initial_centroids = initial_centroids + str(last_centroids[i])
109 |             if i != k-1:
110 |                 initial_centroids = initial_centroids + ":"
111 |         iter_num += 1
112 | 
113 |     # Did not converge within MAX_ITERATION
114 |     if not converged:
115 |         print("did not converge after " + str(iter_num) + " iterations")
116 |         sys.stdout.write("last centroids: [")
117 |         sys.stdout.write(",".join(str(v) for v in last_centroids))
118 |         sys.stdout.write("]\n")
119 | 
120 | if __name__ == "__main__":
121 |     main()
122 | 
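The loop above is the standard one-dimensional k-means update. Writing $S_j^{(t)}$ for the set of gpa values that findCentroid assigns to centroid $c_j^{(t)}$, each round computes

$$c_j^{(t+1)} = \frac{1}{|S_j^{(t)}|} \sum_{x \in S_j^{(t)}} x, \qquad \Delta^{(t)} = \frac{1}{k} \sum_{j=1}^{k} \left| c_j^{(t+1)} - c_j^{(t)} \right|,$$

where the AVG in the result alias is the inner mean and distance_move is $\Delta^{(t)}$; iteration stops once $\Delta^{(t)}$ drops below tolerance or after MAX_ITERATION rounds.
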
--------------------------------------------------------------------------------
/week2/loadfunc/src/java/com/example/NutchParsedDataLoader.java:
--------------------------------------------------------------------------------
  1 | package com.example;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | import org.apache.commons.logging.Log;
  6 | import org.apache.commons.logging.LogFactory;
  7 | import org.apache.hadoop.io.Text;
  8 | import org.apache.hadoop.io.Writable;
  9 | import org.apache.hadoop.mapreduce.InputFormat;
 10 | import org.apache.hadoop.mapreduce.Job;
 11 | import org.apache.hadoop.mapreduce.RecordReader;
 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 13 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 14 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
 15 | import org.apache.nutch.parse.ParseData;
 16 | import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
 17 | import org.apache.pig.data.DataType;
 18 | import org.apache.pig.data.Tuple;
 19 | import org.apache.pig.data.TupleFactory;
 20 | import org.apache.pig.Expression;
 21 | import org.apache.pig.FileInputLoadFunc;
 22 | import org.apache.pig.LoadMetadata;
 23 | import org.apache.pig.ResourceSchema;
 24 | import org.apache.pig.ResourceSchema.ResourceFieldSchema;
 25 | import org.apache.pig.ResourceStatistics;
 26 | 
 27 | public class NutchParsedDataLoader extends FileInputLoadFunc implements LoadMetadata {
 28 | 
 29 |     private SequenceFileRecordReader reader;
 30 | 
 31 |     private Text key;
 32 |     private ParseData value;
 33 | 
 34 |     protected static final Log LOG = LogFactory.getLog(NutchParsedDataLoader.class);
 35 |     protected TupleFactory mTupleFactory = TupleFactory.getInstance();
 36 | 
 37 |     public NutchParsedDataLoader() {
 38 |     }
 39 | 
 40 |     @Override
 41 |     public Tuple getNext() throws IOException {
 42 |         boolean next = false;
 43 |         try {
 44 |             next = reader.nextKeyValue();
 45 |         } catch (InterruptedException e) {
 46 |             throw new IOException(e);
 47 |         }
 48 | 
 49 |         if (!next) return null;
 50 | 
 51 |         key = (Text)reader.getCurrentKey();
 52 |         value = (ParseData)reader.getCurrentValue();
 53 | 
 54 |         Tuple t = mTupleFactory.newTuple(14); // one slot per schema field (was 15, leaving a stray null field)
 55 |         t.set(0, key.toString());
 56 |         t.set(1, value.getTitle());
 57 |         t.set(2, value.getMeta("name"));
 58 |         t.set(3, value.getMeta("publisher"));
 59 |         t.set(4, value.getMeta("updateTime"));
 60 |         t.set(5, value.getMeta("category"));
 61 |         t.set(6, value.getMeta("price"));
 62 |         t.set(7, value.getMeta("reviewScore"));
 63 |         t.set(8, value.getMeta("reviewCount"));
 64 |         t.set(9, value.getMeta("install"));
 65 |         t.set(10, value.getMeta("version"));
 66 |         t.set(11, value.getMeta("rating"));
 67 |         t.set(12, value.getMeta("developerSite"));
 68 |         t.set(13, value.getMeta("developerEmail"));
 69 | 
 70 |         return t;
 71 |     }
 72 | 
 73 |     @SuppressWarnings("unchecked")
 74 |     @Override
 75 |     public InputFormat getInputFormat() throws IOException {
 76 |         return new SequenceFileInputFormat();
 77 |     }
 78 | 
 79 |     @SuppressWarnings("unchecked")
 80 |     @Override
 81 |     public void prepareToRead(RecordReader reader, PigSplit split)
 82 |             throws IOException {
 83 |         this.reader = (SequenceFileRecordReader) reader;
 84 |     }
 85 | 
 86 |     @Override
 87 |     public void setLocation(String location, Job job) throws IOException {
 88 |         FileInputFormat.setInputPaths(job, location);
 89 |     }
 90 | 
 91 |     @Override
 92 |     public String[] getPartitionKeys(String location, Job job) throws IOException {
 93 |         return null;
 94 |     }
 95 | 
 96 |     @Override
 97 |     public ResourceSchema getSchema(String location, Job job) throws IOException {
 98 |         ResourceSchema schema = new ResourceSchema();
 99 |         ResourceFieldSchema fields[] = new ResourceFieldSchema[14];
100 |         fields[0] = new ResourceFieldSchema(); fields[0].setName("url"); fields[0].setType(DataType.CHARARRAY);
101 |         fields[1] = new ResourceFieldSchema(); fields[1].setName("title"); fields[1].setType(DataType.CHARARRAY);
102 |         fields[2] = new ResourceFieldSchema(); fields[2].setName("name"); fields[2].setType(DataType.CHARARRAY);
103 |         fields[3] = new ResourceFieldSchema(); fields[3].setName("publisher"); fields[3].setType(DataType.CHARARRAY);
104 |         fields[4] = new ResourceFieldSchema(); fields[4].setName("updateTime"); fields[4].setType(DataType.CHARARRAY);
105 |         fields[5] = new ResourceFieldSchema(); fields[5].setName("category"); fields[5].setType(DataType.CHARARRAY);
106 |         fields[6] = new ResourceFieldSchema(); fields[6].setName("price"); fields[6].setType(DataType.CHARARRAY);
107 |         fields[7] = new ResourceFieldSchema(); fields[7].setName("reviewScore"); fields[7].setType(DataType.CHARARRAY);
108 |         fields[8] = new ResourceFieldSchema(); fields[8].setName("reviewCount"); fields[8].setType(DataType.CHARARRAY);
109 |         fields[9] = new ResourceFieldSchema(); fields[9].setName("install"); fields[9].setType(DataType.CHARARRAY);
fields[9].setName("install"); fields[9].setType(DataType.CHARARRAY); 110 | fields[10] = new ResourceFieldSchema(); fields[10].setName("version"); fields[10].setType(DataType.CHARARRAY); 111 | fields[11] = new ResourceFieldSchema(); fields[11].setName("rating"); fields[11].setType(DataType.CHARARRAY); 112 | fields[12] = new ResourceFieldSchema(); fields[12].setName("developerSite"); fields[12].setType(DataType.CHARARRAY); 113 | fields[13] = new ResourceFieldSchema(); fields[13].setName("developerEmail"); fields[13].setType(DataType.CHARARRAY); 114 | schema.setFields(fields); 115 | return schema; 116 | } 117 | 118 | @Override 119 | public ResourceStatistics getStatistics(String location, Job job) 120 | throws IOException { 121 | return null; 122 | } 123 | 124 | @Override 125 | public void setPartitionFilter(Expression expr) throws IOException { 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /week4/set.patch: -------------------------------------------------------------------------------- 1 | Index: src/org/apache/pig/PigServer.java 2 | =================================================================== 3 | --- src/org/apache/pig/PigServer.java (revision 1785219) 4 | +++ src/org/apache/pig/PigServer.java (working copy) 5 | @@ -43,6 +43,7 @@ 6 | import java.util.Set; 7 | import java.util.concurrent.atomic.AtomicInteger; 8 | 9 | +import org.apache.commons.lang.StringUtils; 10 | import org.apache.commons.logging.Log; 11 | import org.apache.commons.logging.LogFactory; 12 | import org.apache.hadoop.conf.Configuration; 13 | @@ -102,6 +103,7 @@ 14 | import org.apache.pig.validator.BlackAndWhitelistFilter; 15 | import org.apache.pig.validator.PigCommandFilter; 16 | 17 | +import com.beust.jcommander.Strings; 18 | import com.google.common.annotations.VisibleForTesting; 19 | 20 | /** 21 | @@ -395,6 +397,12 @@ 22 | pigContext.getLog4jProperties().setProperty("log4j.logger.org.apache.pig", pigContext.getDefaultLogLevel().toString()); 23 | } 24 | 25 | + public boolean isDebugOn() { 26 | + if (Logger.getLogger("org.apache.pig").getLevel() == Level.DEBUG) { 27 | + return true; 28 | + } 29 | + return false; 30 | + } 31 | /** 32 | * Set the default parallelism for this job 33 | * @param p default number of reducers to use for this job. 34 | @@ -403,6 +411,10 @@ 35 | pigContext.defaultParallel = p; 36 | } 37 | 38 | + public int getDefaultParallel() { 39 | + return pigContext.defaultParallel; 40 | + } 41 | + 42 | /** 43 | * Starts batch execution mode. 44 | */ 45 | @@ -547,6 +559,10 @@ 46 | pigContext.addPathToSkip(path); 47 | } 48 | 49 | + public String getPathToSkip() { 50 | + return StringUtils.join(pigContext.getPathsToSkip(), ","); 51 | + } 52 | + 53 | /** 54 | * Defines an alias for the given function spec. This 55 | * is useful for functions that require arguments to the 56 | @@ -956,6 +972,10 @@ 57 | jobName = PigContext.JOB_NAME_PREFIX + ":" + name; 58 | } 59 | 60 | + public String getJobName() { 61 | + return jobName; 62 | + } 63 | + 64 | /** 65 | * Set Hadoop job priority. This value will get translated to mapred.job.priority. 66 | * @param priority valid values are found in {@link org.apache.hadoop.mapred.JobPriority} 67 | @@ -964,6 +984,10 @@ 68 | jobPriority = priority; 69 | } 70 | 71 | + public String getJobPriority() { 72 | + return jobPriority; 73 | + } 74 | + 75 | /** 76 | * Executes a Pig Latin script up to and including indicated alias. That is, if a user does: 77 | *
 78 | Index: src/org/apache/pig/tools/grunt/GruntParser.java
 79 | ===================================================================
 80 | --- src/org/apache/pig/tools/grunt/GruntParser.java	(revision 1785219)
 81 | +++ src/org/apache/pig/tools/grunt/GruntParser.java	(working copy)
 82 | @@ -572,44 +572,74 @@
 83 |      protected void processSet(String key, String value) throws IOException, ParseException {
 84 |          filter.validate(PigCommandFilter.Command.SET);
 85 |          key = parameterSubstitutionInGrunt(key);
 86 | -        value = parameterSubstitutionInGrunt(value);
 87 | +        if (value != null) {
 88 | +            value = parameterSubstitutionInGrunt(value);
 89 | +        }
 90 |          if (key.equals("debug"))
 91 |          {
 92 | -            if (value.equals("on"))
 93 | -                mPigServer.debugOn();
 94 | -            else if (value.equals("off"))
 95 | -                mPigServer.debugOff();
 96 | -            else
 97 | -                throw new ParseException("Invalid value " + value + " provided for " + key);
 98 | +            if (value == null) {
 99 | +                System.out.println("debug=" + mPigServer.isDebugOn());
100 | +            } else {
101 | +                if (value.equals("on"))
102 | +                    mPigServer.debugOn();
103 | +                else if (value.equals("off"))
104 | +                    mPigServer.debugOff();
105 | +                else
106 | +                    throw new ParseException("Invalid value " + value + " provided for " + key);
107 | +            }
108 |          }
109 |          else if (key.equals("job.name"))
110 |          {
111 | -            mPigServer.setJobName(value);
112 | +            if (value == null) {
113 | +                System.out.println("job.name=" + mPigServer.getJobName());
114 | +            } else {
115 | +                mPigServer.setJobName(value);
116 | +            }
117 |          }
118 |          else if (key.equals("job.priority"))
119 |          {
120 | -            mPigServer.setJobPriority(value);
121 | +            if (value == null) {
122 | +                System.out.println("job.priority=" + mPigServer.getJobPriority());
123 | +            } else {
124 | +                mPigServer.setJobPriority(value);
125 | +            }
126 |          }
127 |          else if (key.equals("stream.skippath")) {
128 | -            // Validate
129 | -            File file = new File(value);
130 | -            if (!file.exists() || file.isDirectory()) {
131 | -                throw new IOException("Invalid value for stream.skippath:" +
132 | -                                      value);
133 | +            if (value == null) {
134 | +                System.out.println("stream.skippath=" + mPigServer.getPathToSkip());
135 | +            } else {
136 | +                // Validate
137 | +                File file = new File(value);
138 | +                if (!file.exists() || file.isDirectory()) {
139 | +                    throw new IOException("Invalid value for stream.skippath:" +
140 | +                                          value);
141 | +                }
142 | +                mPigServer.addPathToSkip(value);
143 |              }
144 | -            mPigServer.addPathToSkip(value);
145 |          }
146 |          else if (key.equals("default_parallel")) {
147 | -            // Validate
148 | -            try {
149 | -                mPigServer.setDefaultParallel(Integer.parseInt(value));
150 | -            } catch (NumberFormatException e) {
151 | -                throw new ParseException("Invalid value for default_parallel");
152 | +            if (value == null) {
153 | +                System.out.println("default_parallel=" + mPigServer.getDefaultParallel());
154 | +            } else {
155 | +                // Validate
156 | +                try {
157 | +                    mPigServer.setDefaultParallel(Integer.parseInt(value));
158 | +                } catch (NumberFormatException e) {
159 | +                    throw new ParseException("Invalid value for default_parallel");
160 | +                }
161 |              }
162 |          }
163 |          else
164 |          {
165 | -           mPigServer.getPigContext().getExecutionEngine().setProperty(key, value);
166 | +            if (value == null) {
167 | +                if (mPigServer.getPigContext().getProperties().get(key) != null) {
168 | +                    System.out.println(key + "=" + mPigServer.getPigContext().getProperties().get(key));
169 | +                } else {
170 | +                    System.out.println(key + " is undefined");
171 | +                }
172 | +            } else {
173 | +                mPigServer.getPigContext().getExecutionEngine().setProperty(key, value);
174 | +            }
175 |          }
176 |      }
177 |  
178 | Index: src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj
179 | ===================================================================
180 | --- src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj	(revision 1785219)
181 | +++ src/org/apache/pig/tools/pigscript/parser/PigScriptParser.jj	(working copy)
182 | @@ -132,7 +132,13 @@
183 |  		else
184 |  			return s;
185 |  	}
186 | -
187 | +	static boolean eolOrSemicolon(int kind)
188 | +    {
189 | +        if (kind == EOL || kind == SEMICOLON) {
190 | +            return true;
191 | +        }
192 | +        return false;
193 | +    }
194 |  }
195 |  PARSER_END(PigScriptParser)
196 |  
197 | @@ -626,8 +632,8 @@
198 |  	
199 |  	(
200 |  		t1 = GetKey()
201 | -		t2 = GetValue()
202 | -		{processSet(t1.image, unquote(t2.image));}
203 | +		t2 = GetValueOrNull()
204 | +		{processSet(t1.image, eolOrSemicolon(t2.kind)?null:unquote(t2.image));}
205 |      	|
206 |  		{processSet();}
207 |  	)
208 | @@ -828,6 +834,21 @@
209 |  	{return t;}
210 |  }
211 |  
212 | +Token GetValueOrNull() :
213 | +{
214 | +	Token t;
215 | +}
216 | +{
217 | +    (
218 | +    t = GetValue()
219 | +    |
220 | +    t = <EOL>
221 | +    |
222 | +    t = <SEMICOLON>
223 | +    )
224 | +    {return t;}
225 | +}
226 | +
227 |  Token GetValue() :
228 |  {
229 |  	Token t;
230 | Index: test/org/apache/pig/test/TestGrunt.java
231 | ===================================================================
232 | --- test/org/apache/pig/test/TestGrunt.java	(revision 1785219)
233 | +++ test/org/apache/pig/test/TestGrunt.java	(working copy)
234 | @@ -24,6 +24,7 @@
235 |  
236 |  import java.io.BufferedReader;
237 |  import java.io.ByteArrayInputStream;
238 | +import java.io.ByteArrayOutputStream;
239 |  import java.io.File;
240 |  import java.io.FileInputStream;
241 |  import java.io.FileReader;
242 | @@ -31,6 +32,7 @@
243 |  import java.io.FilenameFilter;
244 |  import java.io.InputStream;
245 |  import java.io.InputStreamReader;
246 | +import java.io.PrintStream;
247 |  import java.io.PrintWriter;
248 |  import java.io.StringReader;
249 |  import java.util.ArrayList;
250 | @@ -1430,6 +1432,14 @@
251 |          new Grunt(new BufferedReader(reader), pc).exec();
252 |  
253 |          assertEquals("my.arbitrary.value",  pc.getProperties().getProperty("my.arbitrary.key"));
254 | +
255 | +        ByteArrayOutputStream baos = new ByteArrayOutputStream();
256 | +        System.setOut(new PrintStream(baos));
257 | +        strCmd = "set my.arbitrary.key\n";
258 | +        reader = new InputStreamReader(new ByteArrayInputStream(strCmd.getBytes()));
259 | +        new Grunt(new BufferedReader(reader), pc).exec();
260 | +
261 | +        assertEquals(baos.toString(), "my.arbitrary.key=my.arbitrary.value\n");
262 |      }
263 |  
264 |      @Test
265 | 
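
Taken together, set.patch turns a bare set <key> into a getter throughout Grunt: debug, job.name, job.priority, stream.skippath, and default_parallel print their current values through the new PigServer getters, and any other key falls back to the properties map (or reports "<key> is undefined"). The grammar change makes the value token optional by also accepting an EOL or semicolon after the key, and the TestGrunt addition pins the behavior: after set my.arbitrary.key my.arbitrary.value, a bare set my.arbitrary.key prints my.arbitrary.key=my.arbitrary.value.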


--------------------------------------------------------------------------------
/week3/hive/tpcds/all-tables-orc.sql:
--------------------------------------------------------------------------------
  1 | create database tpcds;
  2 | use tpcds;
  3 | 
  4 | drop table if exists call_center;
  5 | 
  6 | create table call_center(
  7 |       cc_call_center_sk         bigint               
  8 | ,     cc_call_center_id         string              
  9 | ,     cc_rec_start_date        string                         
 10 | ,     cc_rec_end_date          string                         
 11 | ,     cc_closed_date_sk         bigint                       
 12 | ,     cc_open_date_sk           bigint                       
 13 | ,     cc_name                   string                   
 14 | ,     cc_class                  string                   
 15 | ,     cc_employees              int                       
 16 | ,     cc_sq_ft                  int                       
 17 | ,     cc_hours                  string                      
 18 | ,     cc_manager                string                   
 19 | ,     cc_mkt_id                 int                       
 20 | ,     cc_mkt_class              string                      
 21 | ,     cc_mkt_desc               string                  
 22 | ,     cc_market_manager         string                   
 23 | ,     cc_division               int                       
 24 | ,     cc_division_name          string                   
 25 | ,     cc_company                int                       
 26 | ,     cc_company_name           string                      
 27 | ,     cc_street_number          string                      
 28 | ,     cc_street_name            string                   
 29 | ,     cc_street_type            string                      
 30 | ,     cc_suite_number           string                      
 31 | ,     cc_city                   string                   
 32 | ,     cc_county                 string                   
 33 | ,     cc_state                  string                       
 34 | ,     cc_zip                    string                      
 35 | ,     cc_country                string                   
 36 | ,     cc_gmt_offset             double                  
 37 | ,     cc_tax_percentage         double
 38 | )
 39 | stored as orc;
 40 | 
 41 | drop table if exists catalog_page;
 42 | 
 43 | create table catalog_page(
 44 |       cp_catalog_page_sk        bigint               
 45 | ,     cp_catalog_page_id        string              
 46 | ,     cp_start_date_sk          bigint                       
 47 | ,     cp_end_date_sk            bigint                       
 48 | ,     cp_department             string                   
 49 | ,     cp_catalog_number         int                       
 50 | ,     cp_catalog_page_number    int                       
 51 | ,     cp_description            string                  
 52 | ,     cp_type                   string
 53 | )
 54 | stored as orc;
 55 | 
 56 | drop table if exists catalog_returns;
 57 | 
 58 | create table catalog_returns
 59 | (
 60 |     cr_returned_time_sk       bigint,
 61 |     cr_item_sk                bigint,
 62 |     cr_refunded_customer_sk   bigint,
 63 |     cr_refunded_cdemo_sk      bigint,
 64 |     cr_refunded_hdemo_sk      bigint,
 65 |     cr_refunded_addr_sk       bigint,
 66 |     cr_returning_customer_sk  bigint,
 67 |     cr_returning_cdemo_sk     bigint,
 68 |     cr_returning_hdemo_sk     bigint,
 69 |     cr_returning_addr_sk      bigint,
 70 |     cr_call_center_sk         bigint,
 71 |     cr_catalog_page_sk        bigint,
 72 |     cr_ship_mode_sk           bigint,
 73 |     cr_warehouse_sk           bigint,
 74 |     cr_reason_sk              bigint,
 75 |     cr_order_number           bigint,
 76 |     cr_return_quantity        int,
 77 |     cr_return_amount          double,
 78 |     cr_return_tax             double,
 79 |     cr_return_amt_inc_tax     double,
 80 |     cr_fee                    double,
 81 |     cr_return_ship_cost       double,
 82 |     cr_refunded_cash          double,
 83 |     cr_reversed_charge        double,
 84 |     cr_store_credit           double,
 85 |     cr_net_loss               double
 86 | )
 87 | partitioned by (cr_returned_date_sk bigint)
 88 | stored as orc;
 89 | 
 90 | drop table if exists catalog_sales;
 91 | 
 92 | create table catalog_sales
 93 | (
 94 |     cs_sold_time_sk           bigint,
 95 |     cs_ship_date_sk           bigint,
 96 |     cs_bill_customer_sk       bigint,
 97 |     cs_bill_cdemo_sk          bigint,
 98 |     cs_bill_hdemo_sk          bigint,
 99 |     cs_bill_addr_sk           bigint,
100 |     cs_ship_customer_sk       bigint,
101 |     cs_ship_cdemo_sk          bigint,
102 |     cs_ship_hdemo_sk          bigint,
103 |     cs_ship_addr_sk           bigint,
104 |     cs_call_center_sk         bigint,
105 |     cs_catalog_page_sk        bigint,
106 |     cs_ship_mode_sk           bigint,
107 |     cs_warehouse_sk           bigint,
108 |     cs_item_sk                bigint,
109 |     cs_promo_sk               bigint,
110 |     cs_order_number           bigint,
111 |     cs_quantity               int,
112 |     cs_wholesale_cost         double,
113 |     cs_list_price             double,
114 |     cs_sales_price            double,
115 |     cs_ext_discount_amt       double,
116 |     cs_ext_sales_price        double,
117 |     cs_ext_wholesale_cost     double,
118 |     cs_ext_list_price         double,
119 |     cs_ext_tax                double,
120 |     cs_coupon_amt             double,
121 |     cs_ext_ship_cost          double,
122 |     cs_net_paid               double,
123 |     cs_net_paid_inc_tax       double,
124 |     cs_net_paid_inc_ship      double,
125 |     cs_net_paid_inc_ship_tax  double,
126 |     cs_net_profit             double
127 | )
128 | partitioned by (cs_sold_date_sk bigint)
129 | stored as orc;
130 | 
131 | drop table if exists customer_address;
132 | 
133 | create table customer_address
134 | (
135 |     ca_address_sk             bigint,
136 |     ca_address_id             string,
137 |     ca_street_number          string,
138 |     ca_street_name            string,
139 |     ca_street_type            string,
140 |     ca_suite_number           string,
141 |     ca_city                   string,
142 |     ca_county                 string,
143 |     ca_state                  string,
144 |     ca_zip                    string,
145 |     ca_country                string,
146 |     ca_gmt_offset             double,
147 |     ca_location_type          string
148 | )
149 | stored as orc;
150 | 
151 | drop table if exists customer_demographics;
152 | 
153 | create table customer_demographics
154 | (
155 |     cd_demo_sk                bigint,
156 |     cd_gender                 string,
157 |     cd_marital_status         string,
158 |     cd_education_status       string,
159 |     cd_purchase_estimate      int,
160 |     cd_credit_rating          string,
161 |     cd_dep_count              int,
162 |     cd_dep_employed_count     int,
163 |     cd_dep_college_count      int 
164 | )
165 | stored as orc;
166 | 
167 | drop table if exists customer;
168 | 
169 | create table customer
170 | (
171 |     c_customer_sk             bigint,
172 |     c_customer_id             string,
173 |     c_current_cdemo_sk        bigint,
174 |     c_current_hdemo_sk        bigint,
175 |     c_current_addr_sk         bigint,
176 |     c_first_shipto_date_sk    bigint,
177 |     c_first_sales_date_sk     bigint,
178 |     c_salutation              string,
179 |     c_first_name              string,
180 |     c_last_name               string,
181 |     c_preferred_cust_flag     string,
182 |     c_birth_day               int,
183 |     c_birth_month             int,
184 |     c_birth_year              int,
185 |     c_birth_country           string,
186 |     c_login                   string,
187 |     c_email_address           string,
188 |     c_last_review_date        string
189 | )
190 | stored as orc;
191 | 
192 | drop table if exists date_dim;
193 | 
194 | create table date_dim
195 | (
196 |     d_date_sk                 bigint,
197 |     d_date_id                 string,
198 |     d_date                    string,
199 |     d_month_seq               int,
200 |     d_week_seq                int,
201 |     d_quarter_seq             int,
202 |     d_year                    int,
203 |     d_dow                     int,
204 |     d_moy                     int,
205 |     d_dom                     int,
206 |     d_qoy                     int,
207 |     d_fy_year                 int,
208 |     d_fy_quarter_seq          int,
209 |     d_fy_week_seq             int,
210 |     d_day_name                string,
211 |     d_quarter_name            string,
212 |     d_holiday                 string,
213 |     d_weekend                 string,
214 |     d_following_holiday       string,
215 |     d_first_dom               int,
216 |     d_last_dom                int,
217 |     d_same_day_ly             int,
218 |     d_same_day_lq             int,
219 |     d_current_day             string,
220 |     d_current_week            string,
221 |     d_current_month           string,
222 |     d_current_quarter         string,
223 |     d_current_year            string 
224 | )
225 | stored as orc;
226 | 
227 | drop table if exists household_demographics;
228 | 
229 | create table household_demographics
230 | (
231 |     hd_demo_sk                bigint,
232 |     hd_income_band_sk         bigint,
233 |     hd_buy_potential          string,
234 |     hd_dep_count              int,
235 |     hd_vehicle_count          int
236 | )
237 | stored as orc;
238 | 
239 | drop table if exists income_band;
240 | 
241 | create table income_band(
242 |       ib_income_band_sk         bigint               
243 | ,     ib_lower_bound            int                       
244 | ,     ib_upper_bound            int
245 | )
246 | stored as orc;
247 | 
248 | drop table if exists inventory;
249 | 
250 | create table inventory
251 | (
252 |     inv_item_sk			bigint,
253 |     inv_warehouse_sk		bigint,
254 |     inv_quantity_on_hand	int
255 | )
256 | partitioned by (inv_date_sk bigint)
257 | stored as orc;
258 | 
259 | drop table if exists item;
260 | 
261 | create table item
262 | (
263 |     i_item_sk                 bigint,
264 |     i_item_id                 string,
265 |     i_rec_start_date          string,
266 |     i_rec_end_date            string,
267 |     i_item_desc               string,
268 |     i_current_price           double,
269 |     i_wholesale_cost          double,
270 |     i_brand_id                int,
271 |     i_brand                   string,
272 |     i_class_id                int,
273 |     i_class                   string,
274 |     i_category_id             int,
275 |     i_category                string,
276 |     i_manufact_id             int,
277 |     i_manufact                string,
278 |     i_size                    string,
279 |     i_formulation             string,
280 |     i_color                   string,
281 |     i_units                   string,
282 |     i_container               string,
283 |     i_manager_id              int,
284 |     i_product_name            string
285 | )
286 | stored as orc;
287 | 
288 | drop table if exists promotion;
289 | 
290 | create table promotion
291 | (
292 |     p_promo_sk                bigint,
293 |     p_promo_id                string,
294 |     p_start_date_sk           bigint,
295 |     p_end_date_sk             bigint,
296 |     p_item_sk                 bigint,
297 |     p_cost                    double,
298 |     p_response_target         int,
299 |     p_promo_name              string,
300 |     p_channel_dmail           string,
301 |     p_channel_email           string,
302 |     p_channel_catalog         string,
303 |     p_channel_tv              string,
304 |     p_channel_radio           string,
305 |     p_channel_press           string,
306 |     p_channel_event           string,
307 |     p_channel_demo            string,
308 |     p_channel_details         string,
309 |     p_purpose                 string,
310 |     p_discount_active         string 
311 | )
312 | stored as orc;
313 | 
314 | drop table if exists reason;
315 | 
316 | create table reason(
317 |       r_reason_sk               bigint               
318 | ,     r_reason_id               string              
319 | ,     r_reason_desc             string                
320 | )
321 | stored as orc;
322 | 
323 | drop table if exists ship_mode;
324 | 
325 | create table ship_mode(
326 |       sm_ship_mode_sk           bigint               
327 | ,     sm_ship_mode_id           string              
328 | ,     sm_type                   string                      
329 | ,     sm_code                   string                      
330 | ,     sm_carrier                string                      
331 | ,     sm_contract               string                      
332 | )
333 | stored as orc;
334 | 
335 | drop table if exists store_returns;
336 | 
337 | create table store_returns
338 | (
339 |     sr_return_time_sk         bigint,
340 |     sr_item_sk                bigint,
341 |     sr_customer_sk            bigint,
342 |     sr_cdemo_sk               bigint,
343 |     sr_hdemo_sk               bigint,
344 |     sr_addr_sk                bigint,
345 |     sr_store_sk               bigint,
346 |     sr_reason_sk              bigint,
347 |     sr_ticket_number          bigint,
348 |     sr_return_quantity        int,
349 |     sr_return_amt             double,
350 |     sr_return_tax             double,
351 |     sr_return_amt_inc_tax     double,
352 |     sr_fee                    double,
353 |     sr_return_ship_cost       double,
354 |     sr_refunded_cash          double,
355 |     sr_reversed_charge        double,
356 |     sr_store_credit           double,
357 |     sr_net_loss               double
358 | )
359 | partitioned by (sr_returned_date_sk bigint)
360 | stored as orc;
361 | 
362 | drop table if exists store_sales;
363 | 
364 | create table store_sales
365 | (
366 |     ss_sold_time_sk           bigint,
367 |     ss_item_sk                bigint,
368 |     ss_customer_sk            bigint,
369 |     ss_cdemo_sk               bigint,
370 |     ss_hdemo_sk               bigint,
371 |     ss_addr_sk                bigint,
372 |     ss_store_sk               bigint,
373 |     ss_promo_sk               bigint,
374 |     ss_ticket_number          bigint,
375 |     ss_quantity               int,
376 |     ss_wholesale_cost         double,
377 |     ss_list_price             double,
378 |     ss_sales_price            double,
379 |     ss_ext_discount_amt       double,
380 |     ss_ext_sales_price        double,
381 |     ss_ext_wholesale_cost     double,
382 |     ss_ext_list_price         double,
383 |     ss_ext_tax                double,
384 |     ss_coupon_amt             double,
385 |     ss_net_paid               double,
386 |     ss_net_paid_inc_tax       double,
387 |     ss_net_profit             double
388 | )
389 | partitioned by (ss_sold_date_sk bigint)
390 | stored as orc;
391 | 
392 | drop table if exists store;
393 | 
394 | create table store
395 | (
396 |     s_store_sk                bigint,
397 |     s_store_id                string,
398 |     s_rec_start_date          string,
399 |     s_rec_end_date            string,
400 |     s_closed_date_sk          bigint,
401 |     s_store_name              string,
402 |     s_number_employees        int,
403 |     s_floor_space             int,
404 |     s_hours                   string,
405 |     s_manager                 string,
406 |     s_market_id               int,
407 |     s_geography_class         string,
408 |     s_market_desc             string,
409 |     s_market_manager          string,
410 |     s_division_id             int,
411 |     s_division_name           string,
412 |     s_company_id              int,
413 |     s_company_name            string,
414 |     s_street_number           string,
415 |     s_street_name             string,
416 |     s_street_type             string,
417 |     s_suite_number            string,
418 |     s_city                    string,
419 |     s_county                  string,
420 |     s_state                   string,
421 |     s_zip                     string,
422 |     s_country                 string,
423 |     s_gmt_offset              double,
424 |     s_tax_precentage          double                  
425 | )
426 | stored as orc;
427 | 
428 | drop table if exists time_dim;
429 | 
430 | create table time_dim
431 | (
432 |     t_time_sk                 bigint,
433 |     t_time_id                 string,
434 |     t_time                    int,
435 |     t_hour                    int,
436 |     t_minute                  int,
437 |     t_second                  int,
438 |     t_am_pm                   string,
439 |     t_shift                   string,
440 |     t_sub_shift               string,
441 |     t_meal_time               string
442 | )
443 | stored as orc;
444 | 
445 | drop table if exists warehouse;
446 | 
447 | create table warehouse(
448 |       w_warehouse_sk            bigint               
449 | ,     w_warehouse_id            string              
450 | ,     w_warehouse_name          string                   
451 | ,     w_warehouse_sq_ft         int                       
452 | ,     w_street_number           string                      
453 | ,     w_street_name             string                   
454 | ,     w_street_type             string                      
455 | ,     w_suite_number            string                      
456 | ,     w_city                    string                   
457 | ,     w_county                  string                   
458 | ,     w_state                   string                       
459 | ,     w_zip                     string                      
460 | ,     w_country                 string                   
461 | ,     w_gmt_offset              double                  
462 | )
463 | stored as orc;
464 | 
465 | drop table if exists web_page;
466 | 
467 | create table web_page(
468 |       wp_web_page_sk            bigint               
469 | ,     wp_web_page_id            string              
470 | ,     wp_rec_start_date        string                         
471 | ,     wp_rec_end_date          string                         
472 | ,     wp_creation_date_sk       bigint                       
473 | ,     wp_access_date_sk         bigint                       
474 | ,     wp_autogen_flag           string                       
475 | ,     wp_customer_sk            bigint                       
476 | ,     wp_url                    string                  
477 | ,     wp_type                   string                      
478 | ,     wp_char_count             int                       
479 | ,     wp_link_count             int                       
480 | ,     wp_image_count            int                       
481 | ,     wp_max_ad_count           int
482 | )
483 | stored as orc;
484 | 
485 | drop table if exists web_returns;
486 | 
487 | create table web_returns
488 | (
489 |     wr_returned_time_sk       bigint,
490 |     wr_item_sk                bigint,
491 |     wr_refunded_customer_sk   bigint,
492 |     wr_refunded_cdemo_sk      bigint,
493 |     wr_refunded_hdemo_sk      bigint,
494 |     wr_refunded_addr_sk       bigint,
495 |     wr_returning_customer_sk  bigint,
496 |     wr_returning_cdemo_sk     bigint,
497 |     wr_returning_hdemo_sk     bigint,
498 |     wr_returning_addr_sk      bigint,
499 |     wr_web_page_sk            bigint,
500 |     wr_reason_sk              bigint,
501 |     wr_order_number           bigint,
502 |     wr_return_quantity        int,
503 |     wr_return_amt             double,
504 |     wr_return_tax             double,
505 |     wr_return_amt_inc_tax     double,
506 |     wr_fee                    double,
507 |     wr_return_ship_cost       double,
508 |     wr_refunded_cash          double,
509 |     wr_reversed_charge        double,
510 |     wr_account_credit         double,
511 |     wr_net_loss               double
512 | )
513 | partitioned by (wr_returned_date_sk bigint)
514 | stored as orc;
515 | 
516 | drop table if exists web_sales;
517 | 
518 | create table web_sales
519 | (
520 |     ws_sold_time_sk           bigint,
521 |     ws_ship_date_sk           bigint,
522 |     ws_item_sk                bigint,
523 |     ws_bill_customer_sk       bigint,
524 |     ws_bill_cdemo_sk          bigint,
525 |     ws_bill_hdemo_sk          bigint,
526 |     ws_bill_addr_sk           bigint,
527 |     ws_ship_customer_sk       bigint,
528 |     ws_ship_cdemo_sk          bigint,
529 |     ws_ship_hdemo_sk          bigint,
530 |     ws_ship_addr_sk           bigint,
531 |     ws_web_page_sk            bigint,
532 |     ws_web_site_sk            bigint,
533 |     ws_ship_mode_sk           bigint,
534 |     ws_warehouse_sk           bigint,
535 |     ws_promo_sk               bigint,
536 |     ws_order_number           bigint,
537 |     ws_quantity               int,
538 |     ws_wholesale_cost         double,
539 |     ws_list_price             double,
540 |     ws_sales_price            double,
541 |     ws_ext_discount_amt       double,
542 |     ws_ext_sales_price        double,
543 |     ws_ext_wholesale_cost     double,
544 |     ws_ext_list_price         double,
545 |     ws_ext_tax                double,
546 |     ws_coupon_amt             double,
547 |     ws_ext_ship_cost          double,
548 |     ws_net_paid               double,
549 |     ws_net_paid_inc_tax       double,
550 |     ws_net_paid_inc_ship      double,
551 |     ws_net_paid_inc_ship_tax  double,
552 |     ws_net_profit             double
553 | )
554 | partitioned by (ws_sold_date_sk bigint)
555 | stored as orc;
556 | 
557 | drop table if exists web_site;
558 | 
559 | create table web_site
560 | (
561 |     web_site_sk           bigint,
562 |     web_site_id           string,
563 |     web_rec_start_date    string,
564 |     web_rec_end_date      string,
565 |     web_name              string,
566 |     web_open_date_sk      bigint,
567 |     web_close_date_sk     bigint,
568 |     web_class             string,
569 |     web_manager           string,
570 |     web_mkt_id            int,
571 |     web_mkt_class         string,
572 |     web_mkt_desc          string,
573 |     web_market_manager    string,
574 |     web_company_id        int,
575 |     web_company_name      string,
576 |     web_street_number     string,
577 |     web_street_name       string,
578 |     web_street_type       string,
579 |     web_suite_number      string,
580 |     web_city              string,
581 |     web_county            string,
582 |     web_state             string,
583 |     web_zip               string,
584 |     web_country           string,
585 |     web_gmt_offset        double,
586 |     web_tax_percentage    double
587 | )
588 | stored as orc;
589 | 
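
Because every fact table above is partitioned on its *_date_sk surrogate key, a predicate on that column lets Hive prune to the matching partition directories rather than scan the whole ORC table. An illustrative query (the date_sk literal is made up, and assumes insert.sql has populated the table):

select count(*), sum(ss_net_paid)
from tpcds.store_sales
where ss_sold_date_sk = 2451545;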


--------------------------------------------------------------------------------
/week3/hive/tpcds/all-tables.sql:
--------------------------------------------------------------------------------
  1 | drop table if exists call_center;
  2 | 
  3 | create external table call_center(
  4 |       cc_call_center_sk         bigint               
  5 | ,     cc_call_center_id         string              
  6 | ,     cc_rec_start_date        string                         
  7 | ,     cc_rec_end_date          string                         
  8 | ,     cc_closed_date_sk         bigint                       
  9 | ,     cc_open_date_sk           bigint                       
 10 | ,     cc_name                   string                   
 11 | ,     cc_class                  string                   
 12 | ,     cc_employees              int                       
 13 | ,     cc_sq_ft                  int                       
 14 | ,     cc_hours                  string                      
 15 | ,     cc_manager                string                   
 16 | ,     cc_mkt_id                 int                       
 17 | ,     cc_mkt_class              string                      
 18 | ,     cc_mkt_desc               string                  
 19 | ,     cc_market_manager         string                   
 20 | ,     cc_division               int                       
 21 | ,     cc_division_name          string                   
 22 | ,     cc_company                int                       
 23 | ,     cc_company_name           string                      
 24 | ,     cc_street_number          string                      
 25 | ,     cc_street_name            string                   
 26 | ,     cc_street_type            string                      
 27 | ,     cc_suite_number           string                      
 28 | ,     cc_city                   string                   
 29 | ,     cc_county                 string                   
 30 | ,     cc_state                  string                       
 31 | ,     cc_zip                    string                      
 32 | ,     cc_country                string                   
 33 | ,     cc_gmt_offset             double                  
 34 | ,     cc_tax_percentage         double
 35 | )
 36 | row format delimited fields terminated by '|' 
 37 | location '/data/call_center';
 38 | 
 39 | drop table if exists catalog_page;
 40 | 
 41 | create external table catalog_page(
 42 |       cp_catalog_page_sk        bigint               
 43 | ,     cp_catalog_page_id        string              
 44 | ,     cp_start_date_sk          bigint                       
 45 | ,     cp_end_date_sk            bigint                       
 46 | ,     cp_department             string                   
 47 | ,     cp_catalog_number         int                       
 48 | ,     cp_catalog_page_number    int                       
 49 | ,     cp_description            string                  
 50 | ,     cp_type                   string
 51 | )
 52 | row format delimited fields terminated by '|' 
 53 | location '/data/catalog_page';
 54 | 
 55 | drop table if exists catalog_returns;
 56 | 
 57 | create external table catalog_returns
 58 | (
 59 |     cr_returned_date_sk       bigint,
 60 |     cr_returned_time_sk       bigint,
 61 |     cr_item_sk                bigint,
 62 |     cr_refunded_customer_sk   bigint,
 63 |     cr_refunded_cdemo_sk      bigint,
 64 |     cr_refunded_hdemo_sk      bigint,
 65 |     cr_refunded_addr_sk       bigint,
 66 |     cr_returning_customer_sk  bigint,
 67 |     cr_returning_cdemo_sk     bigint,
 68 |     cr_returning_hdemo_sk     bigint,
 69 |     cr_returning_addr_sk      bigint,
 70 |     cr_call_center_sk         bigint,
 71 |     cr_catalog_page_sk        bigint,
 72 |     cr_ship_mode_sk           bigint,
 73 |     cr_warehouse_sk           bigint,
 74 |     cr_reason_sk              bigint,
 75 |     cr_order_number           bigint,
 76 |     cr_return_quantity        int,
 77 |     cr_return_amount          double,
 78 |     cr_return_tax             double,
 79 |     cr_return_amt_inc_tax     double,
 80 |     cr_fee                    double,
 81 |     cr_return_ship_cost       double,
 82 |     cr_refunded_cash          double,
 83 |     cr_reversed_charge        double,
 84 |     cr_store_credit           double,
 85 |     cr_net_loss               double
 86 | )
 87 | row format delimited fields terminated by '|' 
 88 | location '/data/catalog_returns';
 89 | 
 90 | drop table if exists catalog_sales;
 91 | 
 92 | create external table catalog_sales
 93 | (
 94 |     cs_sold_date_sk           bigint,
 95 |     cs_sold_time_sk           bigint,
 96 |     cs_ship_date_sk           bigint,
 97 |     cs_bill_customer_sk       bigint,
 98 |     cs_bill_cdemo_sk          bigint,
 99 |     cs_bill_hdemo_sk          bigint,
100 |     cs_bill_addr_sk           bigint,
101 |     cs_ship_customer_sk       bigint,
102 |     cs_ship_cdemo_sk          bigint,
103 |     cs_ship_hdemo_sk          bigint,
104 |     cs_ship_addr_sk           bigint,
105 |     cs_call_center_sk         bigint,
106 |     cs_catalog_page_sk        bigint,
107 |     cs_ship_mode_sk           bigint,
108 |     cs_warehouse_sk           bigint,
109 |     cs_item_sk                bigint,
110 |     cs_promo_sk               bigint,
111 |     cs_order_number           bigint,
112 |     cs_quantity               int,
113 |     cs_wholesale_cost         double,
114 |     cs_list_price             double,
115 |     cs_sales_price            double,
116 |     cs_ext_discount_amt       double,
117 |     cs_ext_sales_price        double,
118 |     cs_ext_wholesale_cost     double,
119 |     cs_ext_list_price         double,
120 |     cs_ext_tax                double,
121 |     cs_coupon_amt             double,
122 |     cs_ext_ship_cost          double,
123 |     cs_net_paid               double,
124 |     cs_net_paid_inc_tax       double,
125 |     cs_net_paid_inc_ship      double,
126 |     cs_net_paid_inc_ship_tax  double,
127 |     cs_net_profit             double
128 | )
129 | row format delimited fields terminated by '|' 
130 | location '/data/catalog_sales';
131 | 
132 | drop table if exists customer_address;
133 | 
134 | create external table customer_address
135 | (
136 |     ca_address_sk             bigint,
137 |     ca_address_id             string,
138 |     ca_street_number          string,
139 |     ca_street_name            string,
140 |     ca_street_type            string,
141 |     ca_suite_number           string,
142 |     ca_city                   string,
143 |     ca_county                 string,
144 |     ca_state                  string,
145 |     ca_zip                    string,
146 |     ca_country                string,
147 |     ca_gmt_offset             double,
148 |     ca_location_type          string
149 | )
150 | row format delimited fields terminated by '|' 
151 | location '/data/customer_address';
152 | 
153 | drop table if exists customer_demographics;
154 | 
155 | create external table customer_demographics
156 | (
157 |     cd_demo_sk                bigint,
158 |     cd_gender                 string,
159 |     cd_marital_status         string,
160 |     cd_education_status       string,
161 |     cd_purchase_estimate      int,
162 |     cd_credit_rating          string,
163 |     cd_dep_count              int,
164 |     cd_dep_employed_count     int,
165 |     cd_dep_college_count      int 
166 | )
167 | row format delimited fields terminated by '|' 
168 | location '/data/customer_demographics';
169 | 
170 | drop table if exists customer;
171 | 
172 | create external table customer
173 | (
174 |     c_customer_sk             bigint,
175 |     c_customer_id             string,
176 |     c_current_cdemo_sk        bigint,
177 |     c_current_hdemo_sk        bigint,
178 |     c_current_addr_sk         bigint,
179 |     c_first_shipto_date_sk    bigint,
180 |     c_first_sales_date_sk     bigint,
181 |     c_salutation              string,
182 |     c_first_name              string,
183 |     c_last_name               string,
184 |     c_preferred_cust_flag     string,
185 |     c_birth_day               int,
186 |     c_birth_month             int,
187 |     c_birth_year              int,
188 |     c_birth_country           string,
189 |     c_login                   string,
190 |     c_email_address           string,
191 |     c_last_review_date        string
192 | )
193 | row format delimited fields terminated by '|' 
194 | location '/data/customer';
195 | 
196 | drop table if exists date_dim;
197 | 
198 | create external table date_dim
199 | (
200 |     d_date_sk                 bigint,
201 |     d_date_id                 string,
202 |     d_date                    string,
203 |     d_month_seq               int,
204 |     d_week_seq                int,
205 |     d_quarter_seq             int,
206 |     d_year                    int,
207 |     d_dow                     int,
208 |     d_moy                     int,
209 |     d_dom                     int,
210 |     d_qoy                     int,
211 |     d_fy_year                 int,
212 |     d_fy_quarter_seq          int,
213 |     d_fy_week_seq             int,
214 |     d_day_name                string,
215 |     d_quarter_name            string,
216 |     d_holiday                 string,
217 |     d_weekend                 string,
218 |     d_following_holiday       string,
219 |     d_first_dom               int,
220 |     d_last_dom                int,
221 |     d_same_day_ly             int,
222 |     d_same_day_lq             int,
223 |     d_current_day             string,
224 |     d_current_week            string,
225 |     d_current_month           string,
226 |     d_current_quarter         string,
227 |     d_current_year            string 
228 | )
229 | row format delimited fields terminated by '|' 
230 | location '/data/date_dim';
231 | 
232 | drop table if exists household_demographics;
233 | 
234 | create external table household_demographics
235 | (
236 |     hd_demo_sk                bigint,
237 |     hd_income_band_sk         bigint,
238 |     hd_buy_potential          string,
239 |     hd_dep_count              int,
240 |     hd_vehicle_count          int
241 | )
242 | row format delimited fields terminated by '|' 
243 | location '/data/household_demographics';
244 | 
245 | drop table if exists income_band;
246 | 
247 | create external table income_band(
248 |       ib_income_band_sk         bigint               
249 | ,     ib_lower_bound            int                       
250 | ,     ib_upper_bound            int
251 | )
252 | row format delimited fields terminated by '|' 
253 | location '/data/income_band';
254 | 
255 | drop table if exists inventory;
256 | 
257 | create external table inventory
258 | (
 259 |     inv_date_sk               bigint,
 260 |     inv_item_sk               bigint,
 261 |     inv_warehouse_sk          bigint,
 262 |     inv_quantity_on_hand      int
263 | )
264 | row format delimited fields terminated by '|' 
265 | location '/data/inventory';
266 | 
267 | drop table if exists item;
268 | 
269 | create external table item
270 | (
271 |     i_item_sk                 bigint,
272 |     i_item_id                 string,
273 |     i_rec_start_date          string,
274 |     i_rec_end_date            string,
275 |     i_item_desc               string,
276 |     i_current_price           double,
277 |     i_wholesale_cost          double,
278 |     i_brand_id                int,
279 |     i_brand                   string,
280 |     i_class_id                int,
281 |     i_class                   string,
282 |     i_category_id             int,
283 |     i_category                string,
284 |     i_manufact_id             int,
285 |     i_manufact                string,
286 |     i_size                    string,
287 |     i_formulation             string,
288 |     i_color                   string,
289 |     i_units                   string,
290 |     i_container               string,
291 |     i_manager_id              int,
292 |     i_product_name            string
293 | )
294 | row format delimited fields terminated by '|' 
295 | location '/data/item';
296 | 
297 | drop table if exists promotion;
298 | 
299 | create external table promotion
300 | (
301 |     p_promo_sk                bigint,
302 |     p_promo_id                string,
303 |     p_start_date_sk           bigint,
304 |     p_end_date_sk             bigint,
305 |     p_item_sk                 bigint,
306 |     p_cost                    double,
307 |     p_response_target         int,
308 |     p_promo_name              string,
309 |     p_channel_dmail           string,
310 |     p_channel_email           string,
311 |     p_channel_catalog         string,
312 |     p_channel_tv              string,
313 |     p_channel_radio           string,
314 |     p_channel_press           string,
315 |     p_channel_event           string,
316 |     p_channel_demo            string,
317 |     p_channel_details         string,
318 |     p_purpose                 string,
319 |     p_discount_active         string 
320 | )
321 | row format delimited fields terminated by '|' 
322 | location '/data/promotion';
323 | 
324 | drop table if exists reason;
325 | 
326 | create external table reason(
327 |       r_reason_sk               bigint               
328 | ,     r_reason_id               string              
329 | ,     r_reason_desc             string                
330 | )
331 | row format delimited fields terminated by '|' 
332 | location '/data/reason';
333 | 
334 | drop table if exists ship_mode;
335 | 
336 | create external table ship_mode(
337 |       sm_ship_mode_sk           bigint               
338 | ,     sm_ship_mode_id           string              
339 | ,     sm_type                   string                      
340 | ,     sm_code                   string                      
341 | ,     sm_carrier                string                      
342 | ,     sm_contract               string                      
343 | )
344 | row format delimited fields terminated by '|' 
345 | location '/data/ship_mode';
346 | 
347 | drop table if exists store_returns;
348 | 
349 | create external table store_returns
350 | (
351 |     sr_returned_date_sk       bigint,
352 |     sr_return_time_sk         bigint,
353 |     sr_item_sk                bigint,
354 |     sr_customer_sk            bigint,
355 |     sr_cdemo_sk               bigint,
356 |     sr_hdemo_sk               bigint,
357 |     sr_addr_sk                bigint,
358 |     sr_store_sk               bigint,
359 |     sr_reason_sk              bigint,
360 |     sr_ticket_number          bigint,
361 |     sr_return_quantity        int,
362 |     sr_return_amt             double,
363 |     sr_return_tax             double,
364 |     sr_return_amt_inc_tax     double,
365 |     sr_fee                    double,
366 |     sr_return_ship_cost       double,
367 |     sr_refunded_cash          double,
368 |     sr_reversed_charge        double,
369 |     sr_store_credit           double,
370 |     sr_net_loss               double             
371 | )
372 | row format delimited fields terminated by '|' 
373 | location '/data/store_returns';
374 | 
375 | drop table if exists store_sales;
376 | 
377 | create external table store_sales
378 | (
379 |     ss_sold_date_sk           bigint,
380 |     ss_sold_time_sk           bigint,
381 |     ss_item_sk                bigint,
382 |     ss_customer_sk            bigint,
383 |     ss_cdemo_sk               bigint,
384 |     ss_hdemo_sk               bigint,
385 |     ss_addr_sk                bigint,
386 |     ss_store_sk               bigint,
387 |     ss_promo_sk               bigint,
388 |     ss_ticket_number          bigint,
389 |     ss_quantity               int,
390 |     ss_wholesale_cost         double,
391 |     ss_list_price             double,
392 |     ss_sales_price            double,
393 |     ss_ext_discount_amt       double,
394 |     ss_ext_sales_price        double,
395 |     ss_ext_wholesale_cost     double,
396 |     ss_ext_list_price         double,
397 |     ss_ext_tax                double,
398 |     ss_coupon_amt             double,
399 |     ss_net_paid               double,
400 |     ss_net_paid_inc_tax       double,
401 |     ss_net_profit             double                  
402 | )
403 | row format delimited fields terminated by '|' 
404 | location '/data/store_sales';
405 | 
406 | drop table if exists store;
407 | 
408 | create external table store
409 | (
410 |     s_store_sk                bigint,
411 |     s_store_id                string,
412 |     s_rec_start_date          string,
413 |     s_rec_end_date            string,
414 |     s_closed_date_sk          bigint,
415 |     s_store_name              string,
416 |     s_number_employees        int,
417 |     s_floor_space             int,
418 |     s_hours                   string,
419 |     s_manager                 string,
420 |     s_market_id               int,
421 |     s_geography_class         string,
422 |     s_market_desc             string,
423 |     s_market_manager          string,
424 |     s_division_id             int,
425 |     s_division_name           string,
426 |     s_company_id              int,
427 |     s_company_name            string,
428 |     s_street_number           string,
429 |     s_street_name             string,
430 |     s_street_type             string,
431 |     s_suite_number            string,
432 |     s_city                    string,
433 |     s_county                  string,
434 |     s_state                   string,
435 |     s_zip                     string,
436 |     s_country                 string,
437 |     s_gmt_offset              double,
438 |     s_tax_precentage          double                  
439 | )
440 | row format delimited fields terminated by '|' 
441 | location '/data/store';
442 | 
443 | drop table if exists time_dim;
444 | 
445 | create external table time_dim
446 | (
447 |     t_time_sk                 bigint,
448 |     t_time_id                 string,
449 |     t_time                    int,
450 |     t_hour                    int,
451 |     t_minute                  int,
452 |     t_second                  int,
453 |     t_am_pm                   string,
454 |     t_shift                   string,
455 |     t_sub_shift               string,
456 |     t_meal_time               string
457 | )
458 | row format delimited fields terminated by '|' 
459 | location '/data/time_dim';
460 | 
461 | drop table if exists warehouse;
462 | 
463 | create external table warehouse(
464 |       w_warehouse_sk            bigint               
465 | ,     w_warehouse_id            string              
466 | ,     w_warehouse_name          string                   
467 | ,     w_warehouse_sq_ft         int                       
468 | ,     w_street_number           string                      
469 | ,     w_street_name             string                   
470 | ,     w_street_type             string                      
471 | ,     w_suite_number            string                      
472 | ,     w_city                    string                   
473 | ,     w_county                  string                   
474 | ,     w_state                   string                       
475 | ,     w_zip                     string                      
476 | ,     w_country                 string                   
477 | ,     w_gmt_offset              double                  
478 | )
479 | row format delimited fields terminated by '|' 
480 | location '/data/warehouse';
481 | 
482 | drop table if exists web_page;
483 | 
484 | create external table web_page(
485 |       wp_web_page_sk            bigint               
486 | ,     wp_web_page_id            string              
487 | ,     wp_rec_start_date        string                         
488 | ,     wp_rec_end_date          string                         
489 | ,     wp_creation_date_sk       bigint                       
490 | ,     wp_access_date_sk         bigint                       
491 | ,     wp_autogen_flag           string                       
492 | ,     wp_customer_sk            bigint                       
493 | ,     wp_url                    string                  
494 | ,     wp_type                   string                      
495 | ,     wp_char_count             int                       
496 | ,     wp_link_count             int                       
497 | ,     wp_image_count            int                       
498 | ,     wp_max_ad_count           int
499 | )
500 | row format delimited fields terminated by '|' 
501 | location '/data/web_page';
502 | 
503 | drop table if exists web_returns;
504 | 
505 | create external table web_returns
506 | (
507 |     wr_returned_date_sk       bigint,
508 |     wr_returned_time_sk       bigint,
509 |     wr_item_sk                bigint,
510 |     wr_refunded_customer_sk   bigint,
511 |     wr_refunded_cdemo_sk      bigint,
512 |     wr_refunded_hdemo_sk      bigint,
513 |     wr_refunded_addr_sk       bigint,
514 |     wr_returning_customer_sk  bigint,
515 |     wr_returning_cdemo_sk     bigint,
516 |     wr_returning_hdemo_sk     bigint,
517 |     wr_returning_addr_sk      bigint,
518 |     wr_web_page_sk            bigint,
519 |     wr_reason_sk              bigint,
520 |     wr_order_number           bigint,
521 |     wr_return_quantity        int,
522 |     wr_return_amt             double,
523 |     wr_return_tax             double,
524 |     wr_return_amt_inc_tax     double,
525 |     wr_fee                    double,
526 |     wr_return_ship_cost       double,
527 |     wr_refunded_cash          double,
528 |     wr_reversed_charge        double,
529 |     wr_account_credit         double,
530 |     wr_net_loss               double
531 | )
532 | row format delimited fields terminated by '|' 
533 | location '/data/web_returns';
534 | 
535 | drop table if exists web_sales;
536 | 
537 | create external table web_sales
538 | (
539 |     ws_sold_date_sk           bigint,
540 |     ws_sold_time_sk           bigint,
541 |     ws_ship_date_sk           bigint,
542 |     ws_item_sk                bigint,
543 |     ws_bill_customer_sk       bigint,
544 |     ws_bill_cdemo_sk          bigint,
545 |     ws_bill_hdemo_sk          bigint,
546 |     ws_bill_addr_sk           bigint,
547 |     ws_ship_customer_sk       bigint,
548 |     ws_ship_cdemo_sk          bigint,
549 |     ws_ship_hdemo_sk          bigint,
550 |     ws_ship_addr_sk           bigint,
551 |     ws_web_page_sk            bigint,
552 |     ws_web_site_sk            bigint,
553 |     ws_ship_mode_sk           bigint,
554 |     ws_warehouse_sk           bigint,
555 |     ws_promo_sk               bigint,
556 |     ws_order_number           bigint,
557 |     ws_quantity               int,
558 |     ws_wholesale_cost         double,
559 |     ws_list_price             double,
560 |     ws_sales_price            double,
561 |     ws_ext_discount_amt       double,
562 |     ws_ext_sales_price        double,
563 |     ws_ext_wholesale_cost     double,
564 |     ws_ext_list_price         double,
565 |     ws_ext_tax                double,
566 |     ws_coupon_amt             double,
567 |     ws_ext_ship_cost          double,
568 |     ws_net_paid               double,
569 |     ws_net_paid_inc_tax       double,
570 |     ws_net_paid_inc_ship      double,
571 |     ws_net_paid_inc_ship_tax  double,
572 |     ws_net_profit             double
573 | )
574 | row format delimited fields terminated by '|' 
575 | location '/data/web_sales';
576 | 
577 | drop table if exists web_site;
578 | 
579 | create external table web_site
580 | (
581 |     web_site_sk           bigint,
582 |     web_site_id           string,
583 |     web_rec_start_date    string,
584 |     web_rec_end_date      string,
585 |     web_name              string,
586 |     web_open_date_sk      bigint,
587 |     web_close_date_sk     bigint,
588 |     web_class             string,
589 |     web_manager           string,
590 |     web_mkt_id            int,
591 |     web_mkt_class         string,
592 |     web_mkt_desc          string,
593 |     web_market_manager    string,
594 |     web_company_id        int,
595 |     web_company_name      string,
596 |     web_street_number     string,
597 |     web_street_name       string,
598 |     web_street_type       string,
599 |     web_suite_number      string,
600 |     web_city              string,
601 |     web_county            string,
602 |     web_state             string,
603 |     web_zip               string,
604 |     web_country           string,
605 |     web_gmt_offset        double,
606 |     web_tax_percentage    double
607 | )
608 | row format delimited fields terminated by '|' 
609 | location '/data/web_site';
610 | 
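
A hypothetical smoke test for the tables above (not a file from this repo): because these are external tables over '|'-delimited files, a wrong upload path shows up as zero rows rather than an error, so a small fact-to-dimension join is a cheap sanity check. It assumes the database created at the top of this file and data already uploaded under /data:

-- illustrative HiveQL, not part of this script
select d_year, count(*) as sales_cnt
from store_sales
join date_dim on (ss_sold_date_sk = d_date_sk)
group by d_year;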


--------------------------------------------------------------------------------
/week3/hive/tpcds/all-tables-base.sql:
--------------------------------------------------------------------------------
  1 | drop database if exists tpcds_base cascade;
  2 | 
  3 | create database tpcds_base;
  4 | use tpcds_base;
  5 | 
  6 | drop table if exists call_center;
  7 | 
  8 | create external table call_center(
  9 |       cc_call_center_sk         bigint               
 10 | ,     cc_call_center_id         string              
 11 | ,     cc_rec_start_date        string                         
 12 | ,     cc_rec_end_date          string                         
 13 | ,     cc_closed_date_sk         bigint                       
 14 | ,     cc_open_date_sk           bigint                       
 15 | ,     cc_name                   string                   
 16 | ,     cc_class                  string                   
 17 | ,     cc_employees              int                       
 18 | ,     cc_sq_ft                  int                       
 19 | ,     cc_hours                  string                      
 20 | ,     cc_manager                string                   
 21 | ,     cc_mkt_id                 int                       
 22 | ,     cc_mkt_class              string                      
 23 | ,     cc_mkt_desc               string                  
 24 | ,     cc_market_manager         string                   
 25 | ,     cc_division               int                       
 26 | ,     cc_division_name          string                   
 27 | ,     cc_company                int                       
 28 | ,     cc_company_name           string                      
 29 | ,     cc_street_number          string                      
 30 | ,     cc_street_name            string                   
 31 | ,     cc_street_type            string                      
 32 | ,     cc_suite_number           string                      
 33 | ,     cc_city                   string                   
 34 | ,     cc_county                 string                   
 35 | ,     cc_state                  string                       
 36 | ,     cc_zip                    string                      
 37 | ,     cc_country                string                   
 38 | ,     cc_gmt_offset             double                  
 39 | ,     cc_tax_percentage         double
 40 | )
 41 | row format delimited fields terminated by '|' 
 42 | location '/data/call_center';
 43 | 
 44 | drop table if exists catalog_page;
 45 | 
 46 | create external table catalog_page(
 47 |       cp_catalog_page_sk        bigint               
 48 | ,     cp_catalog_page_id        string              
 49 | ,     cp_start_date_sk          bigint                       
 50 | ,     cp_end_date_sk            bigint                       
 51 | ,     cp_department             string                   
 52 | ,     cp_catalog_number         int                       
 53 | ,     cp_catalog_page_number    int                       
 54 | ,     cp_description            string                  
 55 | ,     cp_type                   string
 56 | )
 57 | row format delimited fields terminated by '|' 
 58 | location '/data/catalog_page';
 59 | 
 60 | drop table if exists catalog_returns;
 61 | 
 62 | create external table catalog_returns
 63 | (
 64 |     cr_returned_date_sk       bigint,
 65 |     cr_returned_time_sk       bigint,
 66 |     cr_item_sk                bigint,
 67 |     cr_refunded_customer_sk   bigint,
 68 |     cr_refunded_cdemo_sk      bigint,
 69 |     cr_refunded_hdemo_sk      bigint,
 70 |     cr_refunded_addr_sk       bigint,
 71 |     cr_returning_customer_sk  bigint,
 72 |     cr_returning_cdemo_sk     bigint,
 73 |     cr_returning_hdemo_sk     bigint,
 74 |     cr_returning_addr_sk      bigint,
 75 |     cr_call_center_sk         bigint,
 76 |     cr_catalog_page_sk        bigint,
 77 |     cr_ship_mode_sk           bigint,
 78 |     cr_warehouse_sk           bigint,
 79 |     cr_reason_sk              bigint,
 80 |     cr_order_number           bigint,
 81 |     cr_return_quantity        int,
 82 |     cr_return_amount          double,
 83 |     cr_return_tax             double,
 84 |     cr_return_amt_inc_tax     double,
 85 |     cr_fee                    double,
 86 |     cr_return_ship_cost       double,
 87 |     cr_refunded_cash          double,
 88 |     cr_reversed_charge        double,
 89 |     cr_store_credit           double,
 90 |     cr_net_loss               double
 91 | )
 92 | row format delimited fields terminated by '|' 
 93 | location '/data/catalog_returns';
 94 | 
 95 | drop table if exists catalog_sales;
 96 | 
 97 | create external table catalog_sales
 98 | (
 99 |     cs_sold_date_sk           bigint,
100 |     cs_sold_time_sk           bigint,
101 |     cs_ship_date_sk           bigint,
102 |     cs_bill_customer_sk       bigint,
103 |     cs_bill_cdemo_sk          bigint,
104 |     cs_bill_hdemo_sk          bigint,
105 |     cs_bill_addr_sk           bigint,
106 |     cs_ship_customer_sk       bigint,
107 |     cs_ship_cdemo_sk          bigint,
108 |     cs_ship_hdemo_sk          bigint,
109 |     cs_ship_addr_sk           bigint,
110 |     cs_call_center_sk         bigint,
111 |     cs_catalog_page_sk        bigint,
112 |     cs_ship_mode_sk           bigint,
113 |     cs_warehouse_sk           bigint,
114 |     cs_item_sk                bigint,
115 |     cs_promo_sk               bigint,
116 |     cs_order_number           bigint,
117 |     cs_quantity               int,
118 |     cs_wholesale_cost         double,
119 |     cs_list_price             double,
120 |     cs_sales_price            double,
121 |     cs_ext_discount_amt       double,
122 |     cs_ext_sales_price        double,
123 |     cs_ext_wholesale_cost     double,
124 |     cs_ext_list_price         double,
125 |     cs_ext_tax                double,
126 |     cs_coupon_amt             double,
127 |     cs_ext_ship_cost          double,
128 |     cs_net_paid               double,
129 |     cs_net_paid_inc_tax       double,
130 |     cs_net_paid_inc_ship      double,
131 |     cs_net_paid_inc_ship_tax  double,
132 |     cs_net_profit             double
133 | )
134 | row format delimited fields terminated by '|' 
135 | location '/data/catalog_sales';
136 | 
137 | drop table if exists customer_address;
138 | 
139 | create external table customer_address
140 | (
141 |     ca_address_sk             bigint,
142 |     ca_address_id             string,
143 |     ca_street_number          string,
144 |     ca_street_name            string,
145 |     ca_street_type            string,
146 |     ca_suite_number           string,
147 |     ca_city                   string,
148 |     ca_county                 string,
149 |     ca_state                  string,
150 |     ca_zip                    string,
151 |     ca_country                string,
152 |     ca_gmt_offset             double,
153 |     ca_location_type          string
154 | )
155 | row format delimited fields terminated by '|' 
156 | location '/data/customer_address';
157 | 
158 | drop table if exists customer_demographics;
159 | 
160 | create external table customer_demographics
161 | (
162 |     cd_demo_sk                bigint,
163 |     cd_gender                 string,
164 |     cd_marital_status         string,
165 |     cd_education_status       string,
166 |     cd_purchase_estimate      int,
167 |     cd_credit_rating          string,
168 |     cd_dep_count              int,
169 |     cd_dep_employed_count     int,
170 |     cd_dep_college_count      int 
171 | )
172 | row format delimited fields terminated by '|' 
173 | location '/data/customer_demographics';
174 | 
175 | drop table if exists customer;
176 | 
177 | create external table customer
178 | (
179 |     c_customer_sk             bigint,
180 |     c_customer_id             string,
181 |     c_current_cdemo_sk        bigint,
182 |     c_current_hdemo_sk        bigint,
183 |     c_current_addr_sk         bigint,
184 |     c_first_shipto_date_sk    bigint,
185 |     c_first_sales_date_sk     bigint,
186 |     c_salutation              string,
187 |     c_first_name              string,
188 |     c_last_name               string,
189 |     c_preferred_cust_flag     string,
190 |     c_birth_day               int,
191 |     c_birth_month             int,
192 |     c_birth_year              int,
193 |     c_birth_country           string,
194 |     c_login                   string,
195 |     c_email_address           string,
196 |     c_last_review_date        string
197 | )
198 | row format delimited fields terminated by '|' 
199 | location '/data/customer';
200 | 
201 | drop table if exists date_dim;
202 | 
203 | create external table date_dim
204 | (
205 |     d_date_sk                 bigint,
206 |     d_date_id                 string,
207 |     d_date                    string,
208 |     d_month_seq               int,
209 |     d_week_seq                int,
210 |     d_quarter_seq             int,
211 |     d_year                    int,
212 |     d_dow                     int,
213 |     d_moy                     int,
214 |     d_dom                     int,
215 |     d_qoy                     int,
216 |     d_fy_year                 int,
217 |     d_fy_quarter_seq          int,
218 |     d_fy_week_seq             int,
219 |     d_day_name                string,
220 |     d_quarter_name            string,
221 |     d_holiday                 string,
222 |     d_weekend                 string,
223 |     d_following_holiday       string,
224 |     d_first_dom               int,
225 |     d_last_dom                int,
226 |     d_same_day_ly             int,
227 |     d_same_day_lq             int,
228 |     d_current_day             string,
229 |     d_current_week            string,
230 |     d_current_month           string,
231 |     d_current_quarter         string,
232 |     d_current_year            string 
233 | )
234 | row format delimited fields terminated by '|' 
235 | location '/data/date_dim';
236 | 
237 | drop table if exists household_demographics;
238 | 
239 | create external table household_demographics
240 | (
241 |     hd_demo_sk                bigint,
242 |     hd_income_band_sk         bigint,
243 |     hd_buy_potential          string,
244 |     hd_dep_count              int,
245 |     hd_vehicle_count          int
246 | )
247 | row format delimited fields terminated by '|' 
248 | location '/data/household_demographics';
249 | 
250 | drop table if exists income_band;
251 | 
252 | create external table income_band(
253 |       ib_income_band_sk         bigint               
254 | ,     ib_lower_bound            int                       
255 | ,     ib_upper_bound            int
256 | )
257 | row format delimited fields terminated by '|' 
258 | location '/data/income_band';
259 | 
260 | drop table if exists inventory;
261 | 
262 | create external table inventory
263 | (
264 |     inv_date_sk               bigint,
265 |     inv_item_sk               bigint,
266 |     inv_warehouse_sk          bigint,
267 |     inv_quantity_on_hand      int
268 | )
269 | row format delimited fields terminated by '|' 
270 | location '/data/inventory';
271 | 
272 | drop table if exists item;
273 | 
274 | create external table item
275 | (
276 |     i_item_sk                 bigint,
277 |     i_item_id                 string,
278 |     i_rec_start_date          string,
279 |     i_rec_end_date            string,
280 |     i_item_desc               string,
281 |     i_current_price           double,
282 |     i_wholesale_cost          double,
283 |     i_brand_id                int,
284 |     i_brand                   string,
285 |     i_class_id                int,
286 |     i_class                   string,
287 |     i_category_id             int,
288 |     i_category                string,
289 |     i_manufact_id             int,
290 |     i_manufact                string,
291 |     i_size                    string,
292 |     i_formulation             string,
293 |     i_color                   string,
294 |     i_units                   string,
295 |     i_container               string,
296 |     i_manager_id              int,
297 |     i_product_name            string
298 | )
299 | row format delimited fields terminated by '|' 
300 | location '/data/item';
301 | 
302 | drop table if exists promotion;
303 | 
304 | create external table promotion
305 | (
306 |     p_promo_sk                bigint,
307 |     p_promo_id                string,
308 |     p_start_date_sk           bigint,
309 |     p_end_date_sk             bigint,
310 |     p_item_sk                 bigint,
311 |     p_cost                    double,
312 |     p_response_target         int,
313 |     p_promo_name              string,
314 |     p_channel_dmail           string,
315 |     p_channel_email           string,
316 |     p_channel_catalog         string,
317 |     p_channel_tv              string,
318 |     p_channel_radio           string,
319 |     p_channel_press           string,
320 |     p_channel_event           string,
321 |     p_channel_demo            string,
322 |     p_channel_details         string,
323 |     p_purpose                 string,
324 |     p_discount_active         string 
325 | )
326 | row format delimited fields terminated by '|' 
327 | location '/data/promotion';
328 | 
329 | drop table if exists reason;
330 | 
331 | create external table reason(
332 |       r_reason_sk               bigint               
333 | ,     r_reason_id               string              
334 | ,     r_reason_desc             string                
335 | )
336 | row format delimited fields terminated by '|' 
337 | location '/data/reason';
338 | 
339 | drop table if exists ship_mode;
340 | 
341 | create external table ship_mode(
342 |       sm_ship_mode_sk           bigint               
343 | ,     sm_ship_mode_id           string              
344 | ,     sm_type                   string                      
345 | ,     sm_code                   string                      
346 | ,     sm_carrier                string                      
347 | ,     sm_contract               string                      
348 | )
349 | row format delimited fields terminated by '|' 
350 | location '/data/ship_mode';
351 | 
352 | drop table if exists store_returns;
353 | 
354 | create external table store_returns
355 | (
356 |     sr_returned_date_sk       bigint,
357 |     sr_return_time_sk         bigint,
358 |     sr_item_sk                bigint,
359 |     sr_customer_sk            bigint,
360 |     sr_cdemo_sk               bigint,
361 |     sr_hdemo_sk               bigint,
362 |     sr_addr_sk                bigint,
363 |     sr_store_sk               bigint,
364 |     sr_reason_sk              bigint,
365 |     sr_ticket_number          bigint,
366 |     sr_return_quantity        int,
367 |     sr_return_amt             double,
368 |     sr_return_tax             double,
369 |     sr_return_amt_inc_tax     double,
370 |     sr_fee                    double,
371 |     sr_return_ship_cost       double,
372 |     sr_refunded_cash          double,
373 |     sr_reversed_charge        double,
374 |     sr_store_credit           double,
375 |     sr_net_loss               double             
376 | )
377 | row format delimited fields terminated by '|' 
378 | location '/data/store_returns';
379 | 
380 | drop table if exists store_sales;
381 | 
382 | create external table store_sales
383 | (
384 |     ss_sold_date_sk           bigint,
385 |     ss_sold_time_sk           bigint,
386 |     ss_item_sk                bigint,
387 |     ss_customer_sk            bigint,
388 |     ss_cdemo_sk               bigint,
389 |     ss_hdemo_sk               bigint,
390 |     ss_addr_sk                bigint,
391 |     ss_store_sk               bigint,
392 |     ss_promo_sk               bigint,
393 |     ss_ticket_number          bigint,
394 |     ss_quantity               int,
395 |     ss_wholesale_cost         double,
396 |     ss_list_price             double,
397 |     ss_sales_price            double,
398 |     ss_ext_discount_amt       double,
399 |     ss_ext_sales_price        double,
400 |     ss_ext_wholesale_cost     double,
401 |     ss_ext_list_price         double,
402 |     ss_ext_tax                double,
403 |     ss_coupon_amt             double,
404 |     ss_net_paid               double,
405 |     ss_net_paid_inc_tax       double,
406 |     ss_net_profit             double                  
407 | )
408 | row format delimited fields terminated by '|' 
409 | location '/data/store_sales';
410 | 
411 | drop table if exists store;
412 | 
413 | create external table store
414 | (
415 |     s_store_sk                bigint,
416 |     s_store_id                string,
417 |     s_rec_start_date          string,
418 |     s_rec_end_date            string,
419 |     s_closed_date_sk          bigint,
420 |     s_store_name              string,
421 |     s_number_employees        int,
422 |     s_floor_space             int,
423 |     s_hours                   string,
424 |     s_manager                 string,
425 |     s_market_id               int,
426 |     s_geography_class         string,
427 |     s_market_desc             string,
428 |     s_market_manager          string,
429 |     s_division_id             int,
430 |     s_division_name           string,
431 |     s_company_id              int,
432 |     s_company_name            string,
433 |     s_street_number           string,
434 |     s_street_name             string,
435 |     s_street_type             string,
436 |     s_suite_number            string,
437 |     s_city                    string,
438 |     s_county                  string,
439 |     s_state                   string,
440 |     s_zip                     string,
441 |     s_country                 string,
442 |     s_gmt_offset              double,
443 |     s_tax_precentage          double                  
444 | )
445 | row format delimited fields terminated by '|' 
446 | location '/data/store';
447 | 
448 | drop table if exists time_dim;
449 | 
450 | create external table time_dim
451 | (
452 |     t_time_sk                 bigint,
453 |     t_time_id                 string,
454 |     t_time                    int,
455 |     t_hour                    int,
456 |     t_minute                  int,
457 |     t_second                  int,
458 |     t_am_pm                   string,
459 |     t_shift                   string,
460 |     t_sub_shift               string,
461 |     t_meal_time               string
462 | )
463 | row format delimited fields terminated by '|' 
464 | location '/data/time_dim';
465 | 
466 | drop table if exists warehouse;
467 | 
468 | create external table warehouse(
469 |       w_warehouse_sk            bigint               
470 | ,     w_warehouse_id            string              
471 | ,     w_warehouse_name          string                   
472 | ,     w_warehouse_sq_ft         int                       
473 | ,     w_street_number           string                      
474 | ,     w_street_name             string                   
475 | ,     w_street_type             string                      
476 | ,     w_suite_number            string                      
477 | ,     w_city                    string                   
478 | ,     w_county                  string                   
479 | ,     w_state                   string                       
480 | ,     w_zip                     string                      
481 | ,     w_country                 string                   
482 | ,     w_gmt_offset              double                  
483 | )
484 | row format delimited fields terminated by '|' 
485 | location '/data/warehouse';
486 | 
487 | drop table if exists web_page;
488 | 
489 | create external table web_page(
490 |       wp_web_page_sk            bigint               
491 | ,     wp_web_page_id            string              
492 | ,     wp_rec_start_date        string                         
493 | ,     wp_rec_end_date          string                         
494 | ,     wp_creation_date_sk       bigint                       
495 | ,     wp_access_date_sk         bigint                       
496 | ,     wp_autogen_flag           string                       
497 | ,     wp_customer_sk            bigint                       
498 | ,     wp_url                    string                  
499 | ,     wp_type                   string                      
500 | ,     wp_char_count             int                       
501 | ,     wp_link_count             int                       
502 | ,     wp_image_count            int                       
503 | ,     wp_max_ad_count           int
504 | )
505 | row format delimited fields terminated by '|' 
506 | location '/data/web_page';
507 | 
508 | drop table if exists web_returns;
509 | 
510 | create external table web_returns
511 | (
512 |     wr_returned_date_sk       bigint,
513 |     wr_returned_time_sk       bigint,
514 |     wr_item_sk                bigint,
515 |     wr_refunded_customer_sk   bigint,
516 |     wr_refunded_cdemo_sk      bigint,
517 |     wr_refunded_hdemo_sk      bigint,
518 |     wr_refunded_addr_sk       bigint,
519 |     wr_returning_customer_sk  bigint,
520 |     wr_returning_cdemo_sk     bigint,
521 |     wr_returning_hdemo_sk     bigint,
522 |     wr_returning_addr_sk      bigint,
523 |     wr_web_page_sk            bigint,
524 |     wr_reason_sk              bigint,
525 |     wr_order_number           bigint,
526 |     wr_return_quantity        int,
527 |     wr_return_amt             double,
528 |     wr_return_tax             double,
529 |     wr_return_amt_inc_tax     double,
530 |     wr_fee                    double,
531 |     wr_return_ship_cost       double,
532 |     wr_refunded_cash          double,
533 |     wr_reversed_charge        double,
534 |     wr_account_credit         double,
535 |     wr_net_loss               double
536 | )
537 | row format delimited fields terminated by '|' 
538 | location '/data/web_returns';
539 | 
540 | drop table if exists web_sales;
541 | 
542 | create external table web_sales
543 | (
544 |     ws_sold_date_sk           bigint,
545 |     ws_sold_time_sk           bigint,
546 |     ws_ship_date_sk           bigint,
547 |     ws_item_sk                bigint,
548 |     ws_bill_customer_sk       bigint,
549 |     ws_bill_cdemo_sk          bigint,
550 |     ws_bill_hdemo_sk          bigint,
551 |     ws_bill_addr_sk           bigint,
552 |     ws_ship_customer_sk       bigint,
553 |     ws_ship_cdemo_sk          bigint,
554 |     ws_ship_hdemo_sk          bigint,
555 |     ws_ship_addr_sk           bigint,
556 |     ws_web_page_sk            bigint,
557 |     ws_web_site_sk            bigint,
558 |     ws_ship_mode_sk           bigint,
559 |     ws_warehouse_sk           bigint,
560 |     ws_promo_sk               bigint,
561 |     ws_order_number           bigint,
562 |     ws_quantity               int,
563 |     ws_wholesale_cost         double,
564 |     ws_list_price             double,
565 |     ws_sales_price            double,
566 |     ws_ext_discount_amt       double,
567 |     ws_ext_sales_price        double,
568 |     ws_ext_wholesale_cost     double,
569 |     ws_ext_list_price         double,
570 |     ws_ext_tax                double,
571 |     ws_coupon_amt             double,
572 |     ws_ext_ship_cost          double,
573 |     ws_net_paid               double,
574 |     ws_net_paid_inc_tax       double,
575 |     ws_net_paid_inc_ship      double,
576 |     ws_net_paid_inc_ship_tax  double,
577 |     ws_net_profit             double
578 | )
579 | row format delimited fields terminated by '|' 
580 | location '/data/web_sales';
581 | 
582 | drop table if exists web_site;
583 | 
584 | create external table web_site
585 | (
586 |     web_site_sk           bigint,
587 |     web_site_id           string,
588 |     web_rec_start_date    string,
589 |     web_rec_end_date      string,
590 |     web_name              string,
591 |     web_open_date_sk      bigint,
592 |     web_close_date_sk     bigint,
593 |     web_class             string,
594 |     web_manager           string,
595 |     web_mkt_id            int,
596 |     web_mkt_class         string,
597 |     web_mkt_desc          string,
598 |     web_market_manager    string,
599 |     web_company_id        int,
600 |     web_company_name      string,
601 |     web_street_number     string,
602 |     web_street_name       string,
603 |     web_street_type       string,
604 |     web_suite_number      string,
605 |     web_city              string,
606 |     web_county            string,
607 |     web_state             string,
608 |     web_zip               string,
609 |     web_country           string,
610 |     web_gmt_offset        double,
611 |     web_tax_percentage    double
612 | )
613 | row format delimited fields terminated by '|' 
614 | location '/data/web_site';
615 | 
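
The tpcds_base schema above is a text-format staging copy of the same table layout; the sibling all-tables-orc.sql and insert.sql files suggest the intended flow is to promote these staging tables into a columnar database. A minimal illustrative sketch of that step, assuming an ORC-backed tpcds database already exists (the _orc table name is a placeholder):

-- illustrative HiveQL, not the repo's insert.sql
use tpcds;
create table store_sales_orc stored as orc
as select * from tpcds_base.store_sales;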


--------------------------------------------------------------------------------
/week1/googleplaycrawler/googleplaycrawler.patch:
--------------------------------------------------------------------------------
  1 | commit 13d4f43387e58d2ba0a528c545f9d87dac9f6986
  2 | Author: Daniel Dai 
  3 | Date:   Tue Feb 21 21:22:14 2017 -0800
  4 | 
  5 |     GooglePlayCrawler
  6 | 
  7 | diff --git a/build.xml b/build.xml
  8 | index 5cff1ea..8c6d1f9 100644
  9 | --- a/build.xml
 10 | +++ b/build.xml
 11 | @@ -890,6 +890,8 @@
 12 |    
 13 |    
 14 |      
 15 | +    
 16 | +    
 17 |    
 18 |  
 19 |    
 20 | @@ -1051,6 +1053,7 @@
 21 |          
 22 |          
 23 |          
 24 | +        
 25 |          
 26 |          
 27 |          
 28 | diff --git a/conf/nutch-site.xml.template b/conf/nutch-site.xml.template
 29 | index 970c8fe..8a34a76 100644
 30 | --- a/conf/nutch-site.xml.template
 31 | +++ b/conf/nutch-site.xml.template
 32 | @@ -4,5 +4,44 @@
 33 |  
 34 |  
 35 |  
 36 | -
 37 | +
 38 | + http.agent.name
 39 | + GooglePlayCrawler
 40 | +
 41 | +
 42 | +  plugin.includes
 43 | +  protocol-httpclient|urlfilter-regex|parse-googleplay|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)
 44 | +  Regular expression naming plugin directory names to
 45 | +  include.  Any plugin not matching this expression is excluded.
 46 | +  In any case you need at least include the nutch-extensionpoints plugin. By
 47 | +  default Nutch includes crawling just HTML and plain text via HTTP,
 48 | +  and basic indexing and search plugins. In order to use HTTPS please enable
 49 | +  protocol-httpclient, but be aware of possible intermittent problems with the
 50 | +  underlying commons-httpclient library.
 51 | +  
 52 | +
 53 | +
 54 | +  db.max.outlinks.per.page
 55 | +  1000
 56 | +  The maximum number of outlinks that we'll process for a page.
 57 | +  If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks
 58 | +  will be processed for a page; otherwise, all outlinks will be processed.
 59 | +  
 60 | +
 61 | +
 62 | +  http.content.limit
 63 | +  1048576
 64 | +
 65 | +
 66 | +  parser.timeout
 67 | +  3600
 68 | +
 69 | +
 70 | +  fetcher.threads.fetch
 71 | +  20
 72 | +
 73 | +
 74 | +  mapred.reduce.tasks
 75 | +  10
 76 | +
 77 |  
 78 | diff --git a/conf/parse-plugins.xml b/conf/parse-plugins.xml
 79 | index 20c8724..56f53f8 100644
 80 | --- a/conf/parse-plugins.xml
 81 | +++ b/conf/parse-plugins.xml
 82 | @@ -68,6 +68,10 @@
 83 |  		
 84 |  	
 85 |  
 86 | +        
 87 | +                
 88 | +        
 89 | +
 90 |         
 91 |  
 92 |  	
 93 | @@ -86,6 +90,8 @@
 94 |  		
 95 |  		
 97 | +                
 99 |  		
100 |  		
102 | diff --git a/conf/regex-urlfilter.txt.template b/conf/regex-urlfilter.txt.template
103 | index 78b2b31..5b0eb81 100644
104 | --- a/conf/regex-urlfilter.txt.template
105 | +++ b/conf/regex-urlfilter.txt.template
106 | @@ -30,10 +30,10 @@
107 |  -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
108 |  
109 |  # skip URLs containing certain characters as probable queries, etc.
110 | --[?*!@=]
111 | +#-[?*!@=]
112 |  
113 |  # skip URLs with slash-delimited segment that repeats 3+ times, to break loops
114 | --.*(/[^/]+)/[^/]+\1/[^/]+\1/
115 | +#-.*(/[^/]+)/[^/]+\1/[^/]+\1/
116 |  
117 |  # accept anything else
118 |  +.
119 | diff --git a/src/java/org/apache/nutch/googleplay/GooglePlayCrawler.java b/src/java/org/apache/nutch/googleplay/GooglePlayCrawler.java
120 | new file mode 100644
121 | index 0000000..40bdf8e
122 | --- /dev/null
123 | +++ b/src/java/org/apache/nutch/googleplay/GooglePlayCrawler.java
124 | @@ -0,0 +1,122 @@
125 | +/**
126 | + * Licensed to the Apache Software Foundation (ASF) under one or more
127 | + * contributor license agreements.  See the NOTICE file distributed with
128 | + * this work for additional information regarding copyright ownership.
129 | + * The ASF licenses this file to You under the Apache License, Version 2.0
130 | + * (the "License"); you may not use this file except in compliance with
131 | + * the License.  You may obtain a copy of the License at
132 | + *
133 | + *     http://www.apache.org/licenses/LICENSE-2.0
134 | + *
135 | + * Unless required by applicable law or agreed to in writing, software
136 | + * distributed under the License is distributed on an "AS IS" BASIS,
137 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
138 | + * See the License for the specific language governing permissions and
139 | + * limitations under the License.
140 | + */
141 | +
142 | +package org.apache.nutch.googleplay;
143 | +
144 | +// Commons Logging imports
145 | +import org.slf4j.Logger;
146 | +import org.slf4j.LoggerFactory;
147 | +
148 | +import org.apache.hadoop.fs.*;
149 | +import org.apache.hadoop.conf.*;
150 | +import org.apache.hadoop.mapred.*;
151 | +import org.apache.hadoop.util.Tool;
152 | +import org.apache.hadoop.util.ToolRunner;
153 | +import org.apache.nutch.crawl.*;
154 | +import org.apache.nutch.parse.ParseSegment;
155 | +import org.apache.nutch.util.NutchConfiguration;
156 | +import org.apache.nutch.util.NutchJob;
157 | +
158 | +import org.apache.nutch.fetcher.Fetcher;
159 | +
160 | +public class GooglePlayCrawler extends Configured implements Tool {
161 | +  public static final Logger LOG = LoggerFactory.getLogger(GooglePlayCrawler.class);
162 | +
163 | +  /* Perform complete crawling and indexing (to Solr) given a set of root urls and the -solr
164 | +     parameter respectively. More information and Usage parameters can be found below. */
165 | +  public static void main(String args[]) throws Exception {
166 | +    Configuration conf = NutchConfiguration.create();
167 | +    int res = ToolRunner.run(conf, new GooglePlayCrawler(), args);
168 | +    System.exit(res);
169 | +  }
170 | +  
171 | +  @Override
172 | +  public int run(String[] args) throws Exception {
173 | +    if (args.length < 1) {
174 | +      System.out.println
175 | +      ("Usage: Crawl  [-dir d] [-depth i] [-numFetchers n]");
176 | +      return -1;
177 | +    }
178 | +    Path rootUrlDir = null;
179 | +    Path dir = new Path("nutchdb");
180 | +    Path finalOutput = null;
181 | +    int threads = getConf().getInt("fetcher.threads.fetch", 10);
182 | +    int depth = 2;
183 | +    int numFetchers = 200;
184 | +    
185 | +    for (int i = 0; i < args.length; i++) {
186 | +      if ("-dir".equals(args[i])) {
187 | +        dir = new Path(args[i+1]);
188 | +        i++;
189 | +      } else if ("-depth".equals(args[i])) {
190 | +        depth = Integer.parseInt(args[i+1]);
191 | +        i++;
192 | +      } else if ("-numFetchers".equals(args[i])) {
193 | +        numFetchers = Integer.parseInt(args[i+1]);
194 | +        i++;
195 | +      } else if ("-finalOutput".equals(args[i])) {
196 | +        finalOutput =  new Path(args[i+1]);
197 | +        i++;
198 | +      } else if (args[i] != null) {
199 | +        rootUrlDir = new Path(args[i]);
200 | +      }
201 | +    }
202 | +    
203 | +    JobConf job = new NutchJob(getConf());
204 | +
205 | +    if (LOG.isInfoEnabled()) {
206 | +      LOG.info("crawl started in: " + dir);
207 | +      LOG.info("rootUrlDir = " + rootUrlDir);
208 | +      LOG.info("depth = " + depth);      
209 | +      LOG.info("numFetchers =" + numFetchers);
210 | +    }
211 | +    
212 | +    Path crawlDb = new Path("nutchdb");
213 | +    Path segments = new Path(dir + "/segments");
214 | +
215 | +    Injector injector = new Injector(getConf());
216 | +    Generator generator = new Generator(getConf());
217 | +    Fetcher fetcher = new Fetcher(getConf());
218 | +    ParseSegment parseSegment = new ParseSegment(getConf());
219 | +    CrawlDb crawlDbTool = new CrawlDb(getConf());
220 | +      
221 | +    // initialize crawlDb
222 | +    injector.inject(crawlDb, rootUrlDir);
223 | +    int i;
224 | +    for (i = 0; i < depth; i++) {             // generate new segment
225 | +      Path[] segs = generator.generate(crawlDb, segments, numFetchers, Long.MAX_VALUE, System
226 | +          .currentTimeMillis());
227 | +      if (segs == null) {
228 | +        LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
229 | +        break;
230 | +      }
231 | +      fetcher.fetch(segs[0], threads);  // fetch it
232 | +      if (!Fetcher.isParsing(job)) {
233 | +        parseSegment.parse(segs[0]);    // parse it, if needed
234 | +      }
235 | +      crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
236 | +    }
237 | +    if (i == 0) {
238 | +      LOG.warn("No URLs to fetch - check your seed list and URL filters.");
239 | +    }
240 | +    if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
241 | +    if (finalOutput != null) {
242 | +        FsShell.main(new String[] {"-cp", dir.toString(), finalOutput.toString()});
243 | +    }
244 | +    return 0;
245 | +  }
246 | +}
247 | diff --git a/src/plugin/build.xml b/src/plugin/build.xml
248 | index 75ae2e7..5742403 100755
249 | --- a/src/plugin/build.xml
250 | +++ b/src/plugin/build.xml
251 | @@ -62,6 +62,7 @@
252 |       
253 |       
254 |       
255 | +     
256 |       
257 |       
258 |       
259 | @@ -115,6 +116,7 @@
260 |       
261 |       
262 |       
263 | +     
264 |       
265 |       
266 |       
267 | @@ -181,6 +183,7 @@
268 |      
269 |      
270 |      
271 | +    
272 |      
273 |      
274 |      
275 | diff --git a/src/plugin/parse-googleplay/build.xml b/src/plugin/parse-googleplay/build.xml
276 | new file mode 100644
277 | index 0000000..7e78ea2
278 | --- /dev/null
279 | +++ b/src/plugin/parse-googleplay/build.xml
280 | @@ -0,0 +1,28 @@
281 | +
282 | +
298 | +
299 | +
300 | +  
301 | +
302 | +  
303 | +  
304 | +    
305 | +    
306 | +  
307 | +
308 | +
309 | diff --git a/src/plugin/parse-googleplay/ivy.xml b/src/plugin/parse-googleplay/ivy.xml
310 | new file mode 100644
311 | index 0000000..1a86d68
312 | --- /dev/null
313 | +++ b/src/plugin/parse-googleplay/ivy.xml
314 | @@ -0,0 +1,41 @@
315 | +
316 | +
317 | +
333 | +
334 | +
335 | +  
336 | +    
337 | +    
338 | +    
339 | +        Apache Nutch
340 | +    
341 | +  
342 | +
343 | +  
344 | +    
345 | +  
346 | +
347 | +  
348 | +    
349 | +    
350 | +  
351 | +
352 | +  
353 | +  
354 | +  
355 | +
356 | diff --git a/src/plugin/parse-googleplay/plugin.xml b/src/plugin/parse-googleplay/plugin.xml
357 | new file mode 100644
358 | index 0000000..4b3d354
359 | --- /dev/null
360 | +++ b/src/plugin/parse-googleplay/plugin.xml
361 | @@ -0,0 +1,47 @@
362 | +
363 | +
379 | +
384 | +
385 | +
386 | +   
387 | +      
388 | +         
389 | +      
390 | +   
391 | +
392 | +   
393 | +      
394 | +   
395 | +
396 | +   
399 | +
400 | +      
402 | +        
403 | +        
404 | +      
405 | +      
406 | +   
407 | +
408 | +
409 | diff --git a/src/plugin/parse-googleplay/src/java/com/example/googleplay/GoogleplayParser.java b/src/plugin/parse-googleplay/src/java/com/example/googleplay/GoogleplayParser.java
410 | new file mode 100644
411 | index 0000000..f26aba1
412 | --- /dev/null
413 | +++ b/src/plugin/parse-googleplay/src/java/com/example/googleplay/GoogleplayParser.java
414 | @@ -0,0 +1,190 @@
415 | +package com.example.googleplay;
416 | +
417 | +import java.net.MalformedURLException;
418 | +import java.util.ArrayList;
419 | +import java.util.HashSet;
420 | +import java.util.List;
421 | +import java.util.Set;
422 | +import java.util.regex.Matcher;
423 | +import java.util.regex.Pattern;
424 | +
425 | +import org.apache.hadoop.conf.Configuration;
426 | +import org.apache.nutch.metadata.Metadata;
427 | +import org.apache.nutch.parse.Outlink;
428 | +import org.apache.nutch.parse.ParseData;
429 | +import org.apache.nutch.parse.ParseImpl;
430 | +import org.apache.nutch.parse.ParseResult;
431 | +import org.apache.nutch.parse.ParseStatus;
432 | +import org.apache.nutch.parse.Parser;
433 | +import org.apache.nutch.protocol.Content;
434 | +import org.slf4j.Logger;
435 | +import org.slf4j.LoggerFactory;
436 | +
437 | +public class GoogleplayParser implements Parser {
438 | +    public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.googleplay");
439 | +    static Pattern appUrlPattern = Pattern.compile("https://play.google.com/store/apps/details\\?id=[a-zA-Z0-9\\._]+");
440 | +    static Pattern titlePattern = Pattern.compile("(.*?)");
441 | +    static Pattern appNamePattern= Pattern.compile("(.*?)(.*?)");
444 | +    static Pattern updateTimePattern = Pattern.compile("- (.*?)");
445 | +    static Pattern categoryPattern = Pattern.compile("(.*?)");
446 | +    static Pattern pricePattern = Pattern.compile("");
447 | +    static Pattern reviewPattern = Pattern.compile("(.*?)");
448 | +    static Pattern installPattern = Pattern.compile("(.*?)");
449 | +    static Pattern versionPattern = Pattern.compile("(.*?)");
450 | +    static Pattern ratingPattern = Pattern.compile("(.*?)");
451 | +    static Pattern developerSitePattern = Pattern.compile("(.*?)");
454 | +
455 | +    private Configuration conf;
456 | +
457 | +    @Override
458 | +    public Configuration getConf() {
459 | +        return conf;
460 | +    }
461 | +
462 | +    @Override
463 | +    public void setConf(Configuration conf) {
464 | +        this.conf = conf;
465 | +    }
466 | +
467 | +    @Override
468 | +    public ParseResult getParse(Content content) {
469 | +        String thisId = content.getBaseUrl().substring(content.getBaseUrl().indexOf("=")+1);
470 | +        byte[] contentInOctets = content.getContent();
471 | +        String htmlText = new String(contentInOctets);
472 | +
473 | +        Metadata meta = content.getMetadata();
474 | +
475 | +        String title = null;
476 | +        String appName = null;
477 | +        Set ids = new HashSet();
478 | +        String publisher = null;
479 | +        String updateTime = null;
480 | +        String category = null;
481 | +        String price = null;
482 | +        String reviewScore = null;
483 | +        String reviewCount = null;
484 | +        String install = null;
485 | +        String version = null;
486 | +        String rating = null;
487 | +        String developerSite = null;
488 | +        String developerEmail = null;
489 | +        String description = null;
490 | +
491 | +        Matcher m = titlePattern.matcher(htmlText);
492 | +        if (m.find()) {
493 | +            title = m.group(1);
494 | +        }
495 | +
496 | +        m = linkPattern.matcher(htmlText);
497 | +        while (m.find()) {
498 | +            if (!m.group(1).equals(thisId)) {
499 | +                ids.add(m.group(1));
500 | +            }
501 | +        }
502 | +        List outlinks = new ArrayList();
503 | +        for (String id : ids) {
504 | +            try {
505 | +                outlinks.add(new Outlink("https://play.google.com/store/apps/details?id=" + id, ""));
506 | +            } catch (MalformedURLException mue) {
507 | +                LOG.warn("Invalid url: '" + id + "', skipping.");
508 | +            }
509 | +        }
510 | +
511 | +        m = appUrlPattern.matcher(content.getBaseUrl());
512 | +        if (m.matches()) { // App page
513 | +            m = appNamePattern.matcher(htmlText);
514 | +            if (m.find()) {
515 | +                appName = m.group(1);
516 | +            }
517 | +            meta.set("name", appName);
518 | +
519 | +            m = publisherPattern.matcher(htmlText);
520 | +            if (m.find()) {
521 | +                publisher = m.group(1);
522 | +            }
523 | +            meta.set("publisher", publisher!=null?publisher:"");
524 | +
525 | +            m = updateTimePattern.matcher(htmlText);
526 | +            if (m.find()) {
527 | +                updateTime = m.group(1);
528 | +            }
529 | +            meta.set("updateTime", updateTime!=null?updateTime:"");
530 | +
531 | +            m = categoryPattern.matcher(htmlText);
532 | +            if (m.find()) {
533 | +                category = m.group(1);
534 | +                category = category.replace("&", "and");
535 | +            }
536 | +            meta.set("category", category!=null?category:"");
537 | +
538 | +            m = pricePattern.matcher(htmlText);
539 | +            if (m.find()) {
540 | +                price = m.group(1);
541 | +            }
542 | +            meta.set("price", price!=null?price:"");
543 | +
544 | +            m = reviewPattern.matcher(htmlText);
545 | +            if (m.find()) {
546 | +                reviewScore = m.group(2);
547 | +                reviewCount = m.group(4);
548 | +            }
549 | +            meta.set("reviewScore", reviewScore!=null?reviewScore:"");
550 | +            meta.set("reviewCount", reviewCount!=null?reviewCount:"");
551 | +
552 | +            m = installPattern.matcher(htmlText);
553 | +            if (m.find()) {
554 | +                install = m.group(1)!=null?m.group(1):"";
555 | +                install = install.trim();
556 | +            }
557 | +            meta.set("install", install);
558 | +
559 | +            m = versionPattern.matcher(htmlText);
560 | +            if (m.find()) {
561 | +                version = m.group(1)!=null?m.group(1):"";
562 | +                version = version.trim();
563 | +            }
564 | +            meta.set("version", version);
565 | +
566 | +            m = ratingPattern.matcher(htmlText);
567 | +            if (m.find()) {
568 | +                rating = m.group(1)!=null?m.group(1):"";
569 | +                rating = rating.trim();
570 | +            }
571 | +            meta.set("rating", rating);
572 | +
573 | +            m = developerSitePattern.matcher(htmlText);
574 | +            if (m.find()) {
575 | +                developerSite = m.group(1)!=null?m.group(1):"";
576 | +                developerSite = developerSite.trim();
577 | +            }
578 | +            meta.set("developerSite", developerSite);
579 | +
580 | +            m = developerEmailPattern.matcher(htmlText);
581 | +            if (m.find()) {
582 | +                developerEmail = m.group(1)!=null?m.group(1):"";
583 | +                developerEmail = developerEmail.trim();
584 | +            }
585 | +            meta.set("developerEmail", developerEmail);
586 | +
587 | +            m = descriptionPattern.matcher(htmlText);
588 | +            if (m.find()) {
589 | +                description = m.group(1);
590 | +            }
591 | +            meta.set("description", description!=null?description:"");
592 | +        }
593 | +
594 | +        ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
595 | +            outlinks.toArray(new Outlink[0]), meta);
596 | +        ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
597 | +            new ParseImpl("", parseData));
598 | +        try {
599 | +            Thread.sleep(200);
600 | +        } catch (InterruptedException e) {
601 | +        }
602 | +        return parseResult;
603 | +    }
604 | +}
605 | 
--------------------------------------------------------------------------------
 21 | -alias1 = MAPREDUCE 'mr.jar' STORE alias2 INTO
 22 | +alias1 = NATIVE 'native.jar' STORE alias2 INTO
 23 |  'inputLocation' USING storeFunc LOAD 'outputLocation' USING loadFunc AS schema [`params, ... `];
 24 | 
 30 | -mr.jar
 31 | +native.jar
 32 | 
 34 | -The MapReduce jar file (enclosed in single quotes).
 35 | -You can specify any MapReduce jar file that can be run through the hadoop jar mymr.jar params command.
 36 | +The jar file containing MapReduce/Tez job (enclosed in single quotes).
 37 | +You can specify any MapReduce/Tez jar file that can be run through the yarn jar native.jar params command.
 38 |  The values for inputLocation and outputLocation can be passed in the params.
 39 | 
 44 |  See LOAD
 45 | -After running mr.jar's MapReduce job, load back the data from outputLocation into alias1 using loadFunc as schema.
 46 | +After running native.jar's MapReduce/Tez job, load back the data from outputLocation into alias1 using loadFunc as schema.
 47 | 
 54 | -Extra parameters required for the mapreduce job (enclosed in back tics).
 55 | +Extra parameters required for the mapreduce/Tez job (enclosed in back tics).
 56 | 
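
Taken together, the edits above document Pig's renamed NATIVE operator (formerly MAPREDUCE): store an alias into the job's input location, run the prebuilt MapReduce/Tez jar, then load the job's output back as a new alias. A minimal sketch of the documented syntax; the jar, main class, and paths are illustrative placeholders:

-- illustrative Pig Latin, not from the patch
A = load 'raw_lines' as (line:chararray);
B = native 'wordcount.jar'
        store A into 'nativeInput' using PigStorage()
        load 'nativeOutput' using PigStorage() as (word:chararray, cnt:long)
        `com.example.WordCount nativeInput nativeOutput`;
dump B;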