├── .gitignore ├── 01_loggen ├── README.md ├── generate_apache_logs.py └── generate_apache_logs.sh ├── 02_ingestion └── Flume │ ├── README.md │ ├── client.conf │ ├── collector1.conf │ ├── collector2.conf │ ├── start_client.sh │ ├── start_collector1.sh │ └── start_collector2.sh ├── 03_processing ├── 01_dedup │ └── pig │ │ ├── dedup.pig │ │ └── dedup.sh ├── 02_sessionization │ ├── deprecated-hive │ │ ├── 01-create-raw-log-table.hql │ │ ├── 02-create-parquet-log-table.hql │ │ ├── 03-populate-parquet-log-table.hql │ │ ├── 04-query-parquet-log-table.hql │ │ └── run_all.sh │ ├── mr │ │ ├── MRSessionize.iml │ │ ├── README.md │ │ ├── pom.xml │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── hadooparchitecturebook │ │ │ ├── CompositeKeyComparator.java │ │ │ ├── IpTimestampKey.java │ │ │ ├── MRSessionize.java │ │ │ ├── NaturalKeyComparator.java │ │ │ └── NaturalKeyPartitioner.java │ └── spark │ │ ├── JavaSessionize.iml │ │ ├── README.md │ │ ├── pom.xml │ │ ├── spark_sessionize.sh │ │ └── src │ │ └── main │ │ ├── java │ │ └── com │ │ │ └── hadooparchitecturebook │ │ │ └── clickstream │ │ │ └── JavaSessionize.java │ │ └── resources │ │ └── avro │ │ └── LogLine.avsc ├── 03_parquetize │ └── hive │ │ ├── 01-create-sessionized-log-table.hql │ │ ├── 01_parquetize.hql │ │ ├── 02-create-parquet-log-table.hql │ │ ├── 03-populate-parquet-log-table.hql │ │ └── run_all.sh └── 04_query │ └── query-parquet-log-table.hql ├── 04_orchestration ├── .gitignore ├── coord-app.xml ├── dedup.pig ├── job.properties ├── processing.xml ├── run.sh └── setup.sh ├── LICENSE ├── README.md ├── cleanup.sh └── setup.sh /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/.gitignore -------------------------------------------------------------------------------- /01_loggen/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/01_loggen/README.md -------------------------------------------------------------------------------- /01_loggen/generate_apache_logs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/01_loggen/generate_apache_logs.py -------------------------------------------------------------------------------- /01_loggen/generate_apache_logs.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/01_loggen/generate_apache_logs.sh -------------------------------------------------------------------------------- /02_ingestion/Flume/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/02_ingestion/Flume/README.md -------------------------------------------------------------------------------- /02_ingestion/Flume/client.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/02_ingestion/Flume/client.conf -------------------------------------------------------------------------------- /02_ingestion/Flume/collector1.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/02_ingestion/Flume/collector1.conf -------------------------------------------------------------------------------- /02_ingestion/Flume/collector2.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/02_ingestion/Flume/collector2.conf -------------------------------------------------------------------------------- /02_ingestion/Flume/start_client.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/02_ingestion/Flume/start_client.sh -------------------------------------------------------------------------------- /02_ingestion/Flume/start_collector1.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/02_ingestion/Flume/start_collector1.sh -------------------------------------------------------------------------------- /02_ingestion/Flume/start_collector2.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/02_ingestion/Flume/start_collector2.sh -------------------------------------------------------------------------------- /03_processing/01_dedup/pig/dedup.pig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/01_dedup/pig/dedup.pig -------------------------------------------------------------------------------- /03_processing/01_dedup/pig/dedup.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/01_dedup/pig/dedup.sh -------------------------------------------------------------------------------- /03_processing/02_sessionization/deprecated-hive/01-create-raw-log-table.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/deprecated-hive/01-create-raw-log-table.hql -------------------------------------------------------------------------------- /03_processing/02_sessionization/deprecated-hive/02-create-parquet-log-table.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/deprecated-hive/02-create-parquet-log-table.hql -------------------------------------------------------------------------------- /03_processing/02_sessionization/deprecated-hive/03-populate-parquet-log-table.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/deprecated-hive/03-populate-parquet-log-table.hql -------------------------------------------------------------------------------- /03_processing/02_sessionization/deprecated-hive/04-query-parquet-log-table.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/deprecated-hive/04-query-parquet-log-table.hql -------------------------------------------------------------------------------- /03_processing/02_sessionization/deprecated-hive/run_all.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/deprecated-hive/run_all.sh -------------------------------------------------------------------------------- /03_processing/02_sessionization/mr/MRSessionize.iml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/mr/MRSessionize.iml -------------------------------------------------------------------------------- /03_processing/02_sessionization/mr/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/mr/README.md -------------------------------------------------------------------------------- /03_processing/02_sessionization/mr/pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/mr/pom.xml -------------------------------------------------------------------------------- /03_processing/02_sessionization/mr/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/mr/run.sh -------------------------------------------------------------------------------- /03_processing/02_sessionization/mr/src/main/java/com/hadooparchitecturebook/CompositeKeyComparator.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/mr/src/main/java/com/hadooparchitecturebook/CompositeKeyComparator.java -------------------------------------------------------------------------------- /03_processing/02_sessionization/mr/src/main/java/com/hadooparchitecturebook/IpTimestampKey.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/mr/src/main/java/com/hadooparchitecturebook/IpTimestampKey.java -------------------------------------------------------------------------------- /03_processing/02_sessionization/mr/src/main/java/com/hadooparchitecturebook/MRSessionize.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/mr/src/main/java/com/hadooparchitecturebook/MRSessionize.java -------------------------------------------------------------------------------- /03_processing/02_sessionization/mr/src/main/java/com/hadooparchitecturebook/NaturalKeyComparator.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/mr/src/main/java/com/hadooparchitecturebook/NaturalKeyComparator.java -------------------------------------------------------------------------------- /03_processing/02_sessionization/mr/src/main/java/com/hadooparchitecturebook/NaturalKeyPartitioner.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/mr/src/main/java/com/hadooparchitecturebook/NaturalKeyPartitioner.java -------------------------------------------------------------------------------- /03_processing/02_sessionization/spark/JavaSessionize.iml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/spark/JavaSessionize.iml -------------------------------------------------------------------------------- /03_processing/02_sessionization/spark/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/spark/README.md -------------------------------------------------------------------------------- /03_processing/02_sessionization/spark/pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/spark/pom.xml -------------------------------------------------------------------------------- /03_processing/02_sessionization/spark/spark_sessionize.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/spark/spark_sessionize.sh -------------------------------------------------------------------------------- /03_processing/02_sessionization/spark/src/main/java/com/hadooparchitecturebook/clickstream/JavaSessionize.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/spark/src/main/java/com/hadooparchitecturebook/clickstream/JavaSessionize.java -------------------------------------------------------------------------------- /03_processing/02_sessionization/spark/src/main/resources/avro/LogLine.avsc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/02_sessionization/spark/src/main/resources/avro/LogLine.avsc -------------------------------------------------------------------------------- /03_processing/03_parquetize/hive/01-create-sessionized-log-table.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/03_parquetize/hive/01-create-sessionized-log-table.hql -------------------------------------------------------------------------------- /03_processing/03_parquetize/hive/01_parquetize.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/03_parquetize/hive/01_parquetize.hql -------------------------------------------------------------------------------- /03_processing/03_parquetize/hive/02-create-parquet-log-table.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/03_parquetize/hive/02-create-parquet-log-table.hql -------------------------------------------------------------------------------- /03_processing/03_parquetize/hive/03-populate-parquet-log-table.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/03_parquetize/hive/03-populate-parquet-log-table.hql -------------------------------------------------------------------------------- /03_processing/03_parquetize/hive/run_all.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/03_parquetize/hive/run_all.sh -------------------------------------------------------------------------------- /03_processing/04_query/query-parquet-log-table.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/03_processing/04_query/query-parquet-log-table.hql -------------------------------------------------------------------------------- /04_orchestration/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04_orchestration/coord-app.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/04_orchestration/coord-app.xml -------------------------------------------------------------------------------- /04_orchestration/dedup.pig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/04_orchestration/dedup.pig -------------------------------------------------------------------------------- /04_orchestration/job.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/04_orchestration/job.properties -------------------------------------------------------------------------------- /04_orchestration/processing.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/04_orchestration/processing.xml -------------------------------------------------------------------------------- /04_orchestration/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/04_orchestration/run.sh -------------------------------------------------------------------------------- /04_orchestration/setup.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/04_orchestration/setup.sh -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/README.md -------------------------------------------------------------------------------- /cleanup.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/cleanup.sh -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadooparchitecturebook/clickstream-tutorial/HEAD/setup.sh --------------------------------------------------------------------------------