├── .gitignore ├── chapter4 ├── generator │ ├── requests.txt │ ├── referers.txt │ ├── codes.txt │ ├── ips.txt │ ├── resources.txt │ ├── user_agents.txt │ └── generator.py ├── logstash │ ├── indexer │ │ └── kafka_to_elasticsearch.conf │ ├── processor │ │ └── forwarder_to_kafka.conf │ └── forwarder │ │ └── forwarder.json └── kafka │ ├── server-1.properties │ └── server-2.properties ├── chapter7 └── docker │ ├── generator │ ├── requests.txt │ ├── referers.txt │ ├── codes.txt │ ├── ips.txt │ ├── resources.txt │ ├── user_agents.txt │ └── generator.py │ ├── docker-machine.sh │ ├── logstash │ ├── indexer │ │ └── kafka_to_elasticsearch.conf │ ├── processor │ │ └── forwarder_to_kafka.conf │ ├── forwarder │ │ └── forwarder.json │ └── .logstash-forwarder │ ├── templates │ └── template.json │ ├── kafka │ ├── server1 │ │ ├── zookeeper.properties │ │ └── server.properties │ └── server2 │ │ ├── zookeeper.properties │ │ └── server.properties │ ├── security │ ├── lumberjack.crt │ └── lumberjack.key │ └── docker-compose.yml ├── chapter6 └── spark-enrich-and-ml │ ├── build.sh │ ├── src │ └── main │ │ ├── main.iml │ │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── examples │ │ ├── StreamingExamples.scala │ │ └── SparkEnricher.scala │ └── pom.xml └── chapter5 └── spark-scala-streamer ├── build.sh ├── src └── main │ ├── main.iml │ └── scala │ ├── org │ └── apress │ │ └── examples │ │ └── chapter4 │ │ ├── StreamingExamples.scala │ │ └── KafkaStreamer.scala │ └── clickstream │ ├── PageViewGenerator.scala │ └── PageViewStream.scala ├── pom.xml └── spark-scala-streamer.iml /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /chapter4/generator/requests.txt: -------------------------------------------------------------------------------- 1 | GET 2 | GET 3 | GET 4 | GET 5 | GET 6 | GET 7 | GET 8 | GET 9 | GET 10 | GET 11 | POST 12 | POST -------------------------------------------------------------------------------- /chapter7/docker/generator/requests.txt: -------------------------------------------------------------------------------- 1 | GET 2 | GET 3 | GET 4 | GET 5 | GET 6 | GET 7 | GET 8 | GET 9 | GET 10 | GET 11 | POST 12 | POST -------------------------------------------------------------------------------- /chapter4/generator/referers.txt: -------------------------------------------------------------------------------- 1 | - 2 | http://www.google.com 3 | http://www.yahoo.com 4 | http://www.referrer.com 5 | http://www.bing.com 6 | http://www.amazon.com -------------------------------------------------------------------------------- /chapter7/docker/generator/referers.txt: -------------------------------------------------------------------------------- 1 | - 2 | http://www.google.com 3 | http://www.yahoo.com 4 | http://www.referrer.com 5 | http://www.bing.com 6 | http://www.amazon.com -------------------------------------------------------------------------------- /chapter4/generator/codes.txt: -------------------------------------------------------------------------------- 1 | 200 2 | 200 3 | 200 4 | 200 5 | 200 6 | 200 7 | 200 8 | 200 9 | 200 10 | 200 11 | 404 12 | 404 13 | 404 14 | 401 15 | 401 16 | 403 17 | 500 -------------------------------------------------------------------------------- /chapter7/docker/generator/codes.txt: -------------------------------------------------------------------------------- 1 | 200 2 | 200 3 | 200 4 | 200 5 | 200 6 | 200 7 
| 200 8 | 200 9 | 200 10 | 200 11 | 404 12 | 404 13 | 404 14 | 401 15 | 401 16 | 403 17 | 500 -------------------------------------------------------------------------------- /chapter4/generator/ips.txt: -------------------------------------------------------------------------------- 1 | 10.10.10.10 2 | 10.10.10.11 3 | 10.10.10.12 4 | 10.10.10.13 5 | 10.10.10.14 6 | 10.10.10.15 7 | 10.10.10.16 8 | 10.10.10.17 9 | 10.10.10.18 10 | 10.10.10.19 -------------------------------------------------------------------------------- /chapter7/docker/generator/ips.txt: -------------------------------------------------------------------------------- 1 | 10.10.10.10 2 | 10.10.10.11 3 | 10.10.10.12 4 | 10.10.10.13 5 | 10.10.10.14 6 | 10.10.10.15 7 | 10.10.10.16 8 | 10.10.10.17 9 | 10.10.10.18 10 | 10.10.10.19 -------------------------------------------------------------------------------- /chapter7/docker/docker-machine.sh: -------------------------------------------------------------------------------- 1 | docker-machine -D create \ 2 | --driver amazonec2 \ 3 | --amazonec2-access-key $AWS_ACCESS_KEY \ 4 | --amazonec2-secret-key $AWS_SECRET_KEY \ 5 | --amazonec2-vpc-id $AWS_VPC_ID \ 6 | --amazonec2-zone b \ 7 | baha-lambda-architecture -------------------------------------------------------------------------------- /chapter4/generator/resources.txt: -------------------------------------------------------------------------------- 1 | /products/product1 2 | /products/product2 3 | /products/product3 4 | /products/product4 5 | /products/product5 6 | /products/product6 7 | /store/cart 8 | /store/cart/checkout 9 | /account 10 | /page1 11 | /page2 12 | /page3 13 | /page4 14 | /page5 15 | /page6 16 | /page6 -------------------------------------------------------------------------------- /chapter7/docker/generator/resources.txt: -------------------------------------------------------------------------------- 1 | /products/product1 2 | /products/product2 3 | /products/product3 4 | /products/product4 5 | /products/product5 6 | /products/product6 7 | /store/cart 8 | /store/cart/checkout 9 | /account 10 | /page1 11 | /page2 12 | /page3 13 | /page4 14 | /page5 15 | /page6 16 | /page6 -------------------------------------------------------------------------------- /chapter4/logstash/indexer/kafka_to_elasticsearch.conf: -------------------------------------------------------------------------------- 1 | input { 2 | kafka { 3 | topic_id => "clickstream" 4 | } 5 | } 6 | filter { 7 | } 8 | output { 9 | stdout { codec => rubydebug } 10 | elasticsearch { 11 | index => "clickstream-%{+YYYY.MM.dd}" 12 | manage_template => false 13 | host => localhost 14 | protocol => http 15 | } 16 | } -------------------------------------------------------------------------------- /chapter7/docker/logstash/indexer/kafka_to_elasticsearch.conf: -------------------------------------------------------------------------------- 1 | input { 2 | kafka { 3 | zk_connect => "brokers:2181" 4 | topic_id => "clickstream" 5 | } 6 | } 7 | filter { 8 | mutate {convert => ["bytes", "integer"]} 9 | } 10 | output { 11 | stdout { codec => rubydebug } 12 | elasticsearch { 13 | index => "clickstream-%{+YYYY.MM.dd}" 14 | manage_template => false 15 | host => brokers 16 | protocol => http 17 | } 18 | } -------------------------------------------------------------------------------- /chapter6/spark-enrich-and-ml/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo 'Setting JAVA_HOME to Java7' 4 | export 
JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.7.0_21.jdk/Contents/Home/ 5 | 6 | echo 'Package sources' 7 | mvn clean scala:compile package 8 | 9 | echo 'Running Spark' 10 | spark-submit --class org.apache.spark.examples.SparkEnricher \ 11 | --master local[2] \ 12 | target/spark-enrich-and-ml-1.0.0-jar-with-dependencies.jar \ 13 | 192.168.59.103:9092,192.168.59.103:9093 clickstream 14 | -------------------------------------------------------------------------------- /chapter5/spark-scala-streamer/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo 'Setting JAVA_HOME to Java7' 4 | export JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.7.0_21.jdk/Contents/Home/ 5 | 6 | echo 'Package sources' 7 | mvn clean scala:compile package 8 | 9 | echo 'Running Spark' 10 | spark-submit --class org.apress.examples.chapter4.KafkaStreamer \ 11 | --master local[2] \ 12 | target/spark-scala-streamer-1.0.0-jar-with-dependencies.jar \ 13 | 192.168.59.103:9092,192.168.59.103:9093 clickstream 14 | -------------------------------------------------------------------------------- /chapter7/docker/logstash/processor/forwarder_to_kafka.conf: -------------------------------------------------------------------------------- 1 | input { 2 | lumberjack { 3 | port => "5043" 4 | ssl_certificate => "/etc/logstash/security/lumberjack.crt" 5 | ssl_key => "/etc/logstash/security/lumberjack.key" 6 | } 7 | } 8 | filter { 9 | grok { 10 | match => { 11 | "message" => "%{COMBINEDAPACHELOG}" 12 | } 13 | } 14 | } 15 | output { 16 | stdout { codec => rubydebug } 17 | kafka { 18 | broker_list => "192.168.59.103:9092" 19 | topic_id => "clickstream" 20 | } 21 | } -------------------------------------------------------------------------------- /chapter4/logstash/processor/forwarder_to_kafka.conf: -------------------------------------------------------------------------------- 1 | input { 2 | lumberjack { 3 | port => "5043" 4 | ssl_certificate => "/Users/bahaaldine/Dropbox/apress/demo/chapter4/security/lumberjack.crt" 5 | ssl_key => "/Users/bahaaldine/Dropbox/apress/demo/chapter4/security/lumberjack.key" 6 | } 7 | } 8 | filter { 9 | grok { 10 | match => { 11 | "message" => "%{COMBINEDAPACHELOG}" 12 | } 13 | } 14 | } 15 | output { 16 | stdout { codec => rubydebug } 17 | kafka { 18 | topic_id => "clickstream" 19 | } 20 | } -------------------------------------------------------------------------------- /chapter4/logstash/forwarder/forwarder.json: -------------------------------------------------------------------------------- 1 | { 2 | "network": { 3 | "servers": [ "localhost:5043" ], 4 | "ssl certificate": "../security/lumberjack.crt", 5 | "ssl key": "../security/lumberjack.key", 6 | "ssl ca": "../security/lumberjack.crt", 7 | "timeout": 15 8 | }, 9 | 10 | "files": [ 11 | { 12 | "paths": [ 13 | "/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_*" 14 | ], 15 | "fields": { "type": "access_log" } 16 | }, { 17 | "paths": [ "-" ], 18 | "fields": { "type": "stdin" } 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /chapter5/spark-scala-streamer/src/main/main.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /chapter6/spark-enrich-and-ml/src/main/main.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 
4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /chapter7/docker/logstash/forwarder/forwarder.json: -------------------------------------------------------------------------------- 1 | { 2 | "network": { 3 | "servers": [ "processors:5043" ], 4 | "ssl certificate": "/etc/logstash-forwarder/security/lumberjack.crt", 5 | "ssl key": "/etc/logstash-forwarder/security/lumberjack.key", 6 | "ssl ca": "/etc/logstash-forwarder/security/lumberjack.crt", 7 | "timeout": 15 8 | }, 9 | 10 | "files": [ 11 | { 12 | "paths": [ 13 | "/tmp/source/access_log_*" 14 | ], 15 | "fields": { "type": "access_log" } 16 | }, { 17 | "paths": [ "-" ], 18 | "fields": { "type": "stdin" } 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /chapter4/generator/user_agents.txt: -------------------------------------------------------------------------------- 1 | Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0) 2 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36 3 | Mozilla/5.0 (Linux; U; Android 2.3.5; en-us; HTC Vision Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 4 | Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25 5 | Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201 6 | Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0 7 | Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US)) -------------------------------------------------------------------------------- /chapter7/docker/generator/user_agents.txt: -------------------------------------------------------------------------------- 1 | Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0) 2 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36 3 | Mozilla/5.0 (Linux; U; Android 2.3.5; en-us; HTC Vision Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 4 | Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25 5 | Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201 6 | Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0 7 | Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US)) -------------------------------------------------------------------------------- /chapter7/docker/templates/template.json: -------------------------------------------------------------------------------- 1 | { 2 | "template_1" : { 3 | "template" : "*", 4 | "settings" : { 5 | "index.number_of_shards" : 1, 6 | "index.number_of_replicas" : 0 7 | }, 8 | "mappings" : { 9 | "_default_": { 10 | "dynamic_templates": [ 11 | { 12 | "string_fields": { 13 | "mapping": { 14 | "index": "not_analyzed", 15 | "omit_norms": true, 16 | "type": "string" 17 | }, 18 | "match_mapping_type": "string", 19 | "match": "*" 20 | } 21 | } 22 | ], 23 | "_all": { 24 | "enabled": true 25 | }, 26 | "properties": { 27 | "bytes": { "type": "integer" }, 28 | "response": { "type": "integer" } 29 | } 30 | } 31 | } 32 | } 33 | } -------------------------------------------------------------------------------- /chapter7/docker/kafka/server1/zookeeper.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or 
more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # the directory where the snapshot is stored. 16 | dataDir=/tmp/zookeeper 17 | # the port at which the clients will connect 18 | clientPort=2181 19 | # disable the per-ip limit on the number of connections since this is a non-production config 20 | maxClientCnxns=0 21 | -------------------------------------------------------------------------------- /chapter7/docker/kafka/server2/zookeeper.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # the directory where the snapshot is stored. 
16 | dataDir=/tmp/zookeeper 17 | # the port at which the clients will connect 18 | clientPort=2182 19 | # disable the per-ip limit on the number of connections since this is a non-production config 20 | maxClientCnxns=0 21 | -------------------------------------------------------------------------------- /chapter7/docker/security/lumberjack.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDIzCCAgugAwIBAgIJAPlrM1BCQmOVMA0GCSqGSIb3DQEBBQUAMBUxEzARBgNV 3 | BAMTCnByb2Nlc3NvcnMwHhcNMTUwNzE2MDk1NjU1WhcNMTUwODE1MDk1NjU1WjAV 4 | MRMwEQYDVQQDEwpwcm9jZXNzb3JzMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB 5 | CgKCAQEAnIDqokdntcXHt1ez4bsg2gpqYENF9cpd7Y/cJa393l++/nul24HQQrrr 6 | Q3+gYSw1u+dAb9nh8ThDeUABBItxSAHpPB2ozxkHa3SmVf02XWSl5vGWVxsnYliB 7 | qCEqxtLXUzB8Az1FIZy0HLlCTSDCb5U2JHInIkqqxWTnHRQ4s9vXuai0ysxT3BPQ 8 | h4d9ocpgdn80eQO+XCrlyxDaUsSRwY+MdO2G0vEqgiAgru6bndzlCsyPa19yMrru 9 | qwkErv16Yiws6Qmc4jxYDspR7xOfcrQl2N6SgqHtPL0Qv2rAmZhqcyl5V2lbEd7T 10 | WGqbLntYjtht43EyLB7TVoSi2SVFIwIDAQABo3YwdDAdBgNVHQ4EFgQUYAbYar6K 11 | K2/WhrZiWhDZcWU73Y0wRQYDVR0jBD4wPIAUYAbYar6KK2/WhrZiWhDZcWU73Y2h 12 | GaQXMBUxEzARBgNVBAMTCnByb2Nlc3NvcnOCCQD5azNQQkJjlTAMBgNVHRMEBTAD 13 | AQH/MA0GCSqGSIb3DQEBBQUAA4IBAQBJJdcAnFCxnRz8x1/Nta9J9ZGCVb/HM2LD 14 | AEh2sdbWoOZ4tvYtIxfvIsOQW/UfSUCfZnTXQE4qaCqT+r85QxmPE0UuFcTfbmU2 15 | 0u96m5J+tRGoFHI6/FegALtLpt2xQ5yEE2QWBXfKTjL+9TfKxcllyyGHJwKE1oOL 16 | BBAptNxfWRIZWFL73pdy4xmqAzhAE8zhFLmyy7VJY8fK5eewvGaACwKCMRBdBeAN 17 | IzPZwYG3VycJwRAoNBs3HhuIdgbKI95WpszQbxmo8MRBrwp3dxkwel6AHh4jyJiF 18 | cDYUoKp6V0NmtJ40jKPc1riN4BER5pO8aztXcyfp7GDb12/eY0Ur 19 | -----END CERTIFICATE----- 20 | -------------------------------------------------------------------------------- /chapter4/generator/generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | import time 4 | import datetime 5 | import random 6 | from pytz import timezone 7 | 8 | line_count = int(sys.argv[1]) 9 | timestr = time.strftime("%Y%m%d-%H%M%S") 10 | 11 | f = open('../source/access_log_'+timestr+'.log','w') 12 | 13 | # ips 14 | with open('ips.txt') as ips_file: 15 | ips = ips_file.read().splitlines() 16 | 17 | # referers 18 | with open('referers.txt') as referers_file: 19 | referers = referers_file.read().splitlines() 20 | 21 | # resources 22 | with open('resources.txt') as resources_file: 23 | resources = resources_file.read().splitlines() 24 | 25 | # user agents 26 | with open('user_agents.txt') as user_agents_file: 27 | useragents = user_agents_file.read().splitlines() 28 | 29 | # codes 30 | with open('codes.txt') as codes_file: 31 | codes = codes_file.read().splitlines() 32 | 33 | # requests 34 | with open('requests.txt') as requests_file: 35 | requests = requests_file.read().splitlines() 36 | 37 | event_time = datetime.datetime(2013,10,10).replace(tzinfo=timezone('UTC')) 38 | 39 | for i in xrange(0,line_count): 40 | increment = datetime.timedelta(seconds=random.randint(30,300)) 41 | event_time += increment 42 | uri = random.choice(resources) 43 | if uri.find("Store")>0: 44 | uri += `random.randint(1000,1500)` 45 | ip = random.choice(ips) 46 | useragent = random.choice(useragents) 47 | referer = random.choice(referers) 48 | code = random.choice(codes) 49 | request= random.choice(requests) 50 | f.write('%s - - [%s] "%s %s HTTP/1.0" %s %s "%s" "%s" \n' % (random.choice(ips),event_time.strftime('%d/%b/%Y:%H:%M:%S %z'),request,uri,code,random.randint(2000,5000),referer,useragent)) 
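The generator script above takes a single argument (the number of log lines to produce) and writes an Apache combined-log file to ../source/access_log_<timestamp>.log, matching the access_log_* pattern that forwarder.json tails. A minimal way to run it — a sketch, not part of the repository — assuming Python 2 (the script relies on xrange and backtick repr) with pytz installed:

# hypothetical invocation; paths follow the repository layout shown in the tree above
cd chapter4/generator
pip install pytz                  # only dependency outside the standard library
mkdir -p ../source                # generator.py opens ../source/access_log_<timestamp>.log for writing
python generator.py 1000          # argv[1] = number of access-log lines to generate
ls ../source/access_log_*.log     # these files are what the logstash-forwarder picks up
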
-------------------------------------------------------------------------------- /chapter7/docker/generator/generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | import time 4 | import datetime 5 | import random 6 | from pytz import timezone 7 | 8 | line_count = int(sys.argv[1]) 9 | timestr = time.strftime("%Y%m%d-%H%M%S") 10 | 11 | f = open('../source/access_log_'+timestr+'.log','w') 12 | 13 | # ips 14 | with open('ips.txt') as ips_file: 15 | ips = ips_file.read().splitlines() 16 | 17 | # referers 18 | with open('referers.txt') as referers_file: 19 | referers = referers_file.read().splitlines() 20 | 21 | # resources 22 | with open('resources.txt') as resources_file: 23 | resources = resources_file.read().splitlines() 24 | 25 | # user agents 26 | with open('user_agents.txt') as user_agents_file: 27 | useragents = user_agents_file.read().splitlines() 28 | 29 | # codes 30 | with open('codes.txt') as codes_file: 31 | codes = codes_file.read().splitlines() 32 | 33 | # requests 34 | with open('requests.txt') as requests_file: 35 | requests = requests_file.read().splitlines() 36 | 37 | event_time = datetime.datetime(2013,10,10).replace(tzinfo=timezone('UTC')) 38 | 39 | for i in xrange(0,line_count): 40 | increment = datetime.timedelta(seconds=random.randint(30,300)) 41 | event_time += increment 42 | uri = random.choice(resources) 43 | if uri.find("Store")>0: 44 | uri += `random.randint(1000,1500)` 45 | ip = random.choice(ips) 46 | useragent = random.choice(useragents) 47 | referer = random.choice(referers) 48 | code = random.choice(codes) 49 | request= random.choice(requests) 50 | f.write('%s - - [%s] "%s %s HTTP/1.0" %s %s "%s" "%s" \n' % (random.choice(ips),event_time.strftime('%d/%b/%Y:%H:%M:%S %z'),request,uri,code,random.randint(2000,5000),referer,useragent)) -------------------------------------------------------------------------------- /chapter6/spark-enrich-and-ml/src/main/scala/org/apache/spark/examples/StreamingExamples.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.Logging 21 | import org.apache.log4j.{Level, Logger} 22 | 23 | /** Utility functions for Spark Streaming examples. */ 24 | object StreamingExamples extends Logging { 25 | 26 | /** Set reasonable logging levels for streaming if the user has not configured log4j. 
*/ 27 | def setStreamingLogLevels() { 28 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements 29 | if (!log4jInitialized) { 30 | // We first log something to initialize Spark's default logging, then we override the 31 | // logging level. 32 | logInfo("Setting log level to [WARN] for streaming example." + 33 | " To override add a custom log4j.properties to the classpath.") 34 | Logger.getRootLogger.setLevel(Level.WARN) 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /chapter5/spark-scala-streamer/src/main/scala/org/apress/examples/chapter4/StreamingExamples.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apress.examples.chapter4 19 | 20 | import org.apache.spark.Logging 21 | import org.apache.log4j.{Level, Logger} 22 | 23 | /** Utility functions for Spark Streaming examples. */ 24 | object StreamingExamples extends Logging { 25 | 26 | /** Set reasonable logging levels for streaming if the user has not configured log4j. */ 27 | def setStreamingLogLevels() { 28 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements 29 | if (!log4jInitialized) { 30 | // We first log something to initialize Spark's default logging, then we override the 31 | // logging level. 32 | logInfo("Setting log level to [WARN] for streaming example." 
+ 33 | " To override add a custom log4j.properties to the classpath.") 34 | Logger.getRootLogger.setLevel(Level.WARN) 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /chapter7/docker/security/lumberjack.key: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEogIBAAKCAQEAnIDqokdntcXHt1ez4bsg2gpqYENF9cpd7Y/cJa393l++/nul 3 | 24HQQrrrQ3+gYSw1u+dAb9nh8ThDeUABBItxSAHpPB2ozxkHa3SmVf02XWSl5vGW 4 | VxsnYliBqCEqxtLXUzB8Az1FIZy0HLlCTSDCb5U2JHInIkqqxWTnHRQ4s9vXuai0 5 | ysxT3BPQh4d9ocpgdn80eQO+XCrlyxDaUsSRwY+MdO2G0vEqgiAgru6bndzlCsyP 6 | a19yMrruqwkErv16Yiws6Qmc4jxYDspR7xOfcrQl2N6SgqHtPL0Qv2rAmZhqcyl5 7 | V2lbEd7TWGqbLntYjtht43EyLB7TVoSi2SVFIwIDAQABAoIBAEgL6PgP2ySJCinT 8 | DBJMrRDzXok2Lwy5vofQhE4sTQSEG07X6UCmABPYLNoD9PcG/S2lhbp4GVWMyzMU 9 | TeOjteCqUI7GMKI2KX5EUxae3NodrWolqUosS/MwP3AA5Mpm4Y2AcnajTCHvG9Nl 10 | Tapn/FqV3Djm/DEDIp6AlwyB9KCHSdlietrdqncvIFY7kBv9E+O2wvD+8L5iw/mr 11 | 1IiirsJyb5Q6YeJ2E/x3a1eAL5RN3kLzUQdxbuVqJW1ebQIRVuZV0Y7+va8qrDsm 12 | ZcNtuEXjj25h2c3RHB9pbS9EYum6QrKJBgQNaf3Ol3/HnfB4DHgYH6stgS8L7P6i 13 | ae/Dg8ECgYEAz+jISccOKi7FnYemWyQSIlsuRH1Ew8dFEY3vtLSEqGtmGbW2b6M5 14 | YL+LcoiTaRTfGG7gvyPYVfu1CxIUEw8ZjzWUUL0rD6Rj5HYqPPBQB5M24E68Bm6y 15 | qE/GF+FAyNk4goRfy0Bt8bugXe35YuE3CwUTFvsZ7cbbf84DcKuS8TECgYEAwLQt 16 | Pc+N2nxDoXLwtfOai5ZYxbHoirNqNn6JEpLEbBoGeMhlhVoK7D1RBUBo84lV42kU 17 | w3op8HUxrsoWcL8UQmfDY+lXqhebomb/jOBSxqFLOz0uCN1iOlk9h8mZbbIKpICR 18 | GICt33VLiCXn6ftzvpPUTznSwijFTZJi8yGPppMCgYB4B+B6F52p5M7aH5J/U9fU 19 | VefkSp4EmWras6MMJJvhz/9mIG5vDUD3Bh6vxQG1zQLvzDOcpkkjeuoOtiU4y6Lp 20 | 9vjllBDkOvgg0ceY3vSVKvyni00qOYaTfVrUNot8aUwireHQKiZPRQ9UqysrVvK8 21 | PfXDcryJdiELLBj4V1XCQQKBgDOCde1lw3c5bCKFxM3+FbtmLsh5a71Xg/aZEC7E 22 | yHhU5JH4jxp8HRtUsImE1Aj1Ft44wnIV/4vk56jfH+x3RwURE2trfiFbOiDJA1o2 23 | xCmQB7oH+nwoIQ+TtxzKyJZH1wxtyVsRWmi2w7/a0gj7S88PWNrsi+eWWlcH4Cc5 24 | O1q9AoGANw3EWZ/z+48w5ZqEFcd0ThkY7uujknufRGM4v41tx5c2EWNBzHjXKtX5 25 | yXu11/uXmNCTQtzQHHUbJjYmhGOMrv8ohOeNOyzSpEoVtHeSllzNldLS7uVb3Fni 26 | Ae/BDC0ARoFjHcxAbusEF5vqhSSro22nMQIoJ8h7TILT9Cj0j4g= 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /chapter7/docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | zookeeper1: 2 | image: bahaaldine/docker-zookeeper 3 | volumes: 4 | - "kafka/server1:/etc/kafka" 5 | - "logs/logstash-zookeeper1:/opt/kafka/logs/" 6 | ports: 7 | - "2181:2181" 8 | extra_hosts: 9 | - "brokers:192.168.59.103" 10 | 11 | kafka1: 12 | image: bahaaldine/docker-kafka 13 | volumes: 14 | - "kafka/server1:/etc/kafka" 15 | - "logs/logstash-kafka1:/opt/kafka/logs/" 16 | extra_hosts: 17 | - "brokers:192.168.59.103" 18 | ports: 19 | - "9092:9092" 20 | 21 | kafka2: 22 | image: bahaaldine/docker-kafka 23 | volumes: 24 | - "kafka/server2:/etc/kafka" 25 | - "logs/logstash-kafka2:/opt/kafka/logs/" 26 | extra_hosts: 27 | - "brokers:192.168.59.103" 28 | links: 29 | - "kafka1" 30 | ports: 31 | - "9093:9093" 32 | 33 | logstashProcessor1: 34 | image: bahaaldine/docker-logstash-agent 35 | volumes: 36 | - "logstash/processor:/etc/logstash" 37 | - "security:/etc/logstash/security" 38 | - "logs/logstash-processor1:/var/log/logstash" 39 | links: 40 | - kafka1 41 | ports: 42 | - "5043:5043" 43 | 44 | elasticsearch1: 45 | image: bahaaldine/docker-elasticsearch 46 | ports: 47 | - "9200:9200" 48 | volumes: 49 | - "logs/elasticsearch1:/var/log/elasticsearch" 50 | - 
"templates:/etc/elasticsearch/templates" 51 | extra_hosts: 52 | - "elasticsearch:192.168.59.103" 53 | 54 | logstashIndexer1: 55 | image: bahaaldine/docker-logstash-agent 56 | volumes: 57 | - "logstash/indexer:/etc/logstash" 58 | - "logs/logstash-indexer1:/var/log/logstash" 59 | links: 60 | - elasticsearch1 61 | extra_hosts: 62 | - "brokers:192.168.59.103" 63 | 64 | logstashForwarder: 65 | image: bahaaldine/docker-logstash-forwarder 66 | volumes: 67 | - "logstash/forwarder:/etc/logstash-forwarder" 68 | - "security:/etc/logstash-forwarder/security" 69 | - "logs/logstash-forwarder1:/tmp/logs/" 70 | - "source:/tmp/source" 71 | extra_hosts: 72 | - "processors:192.168.59.103" 73 | kibana1: 74 | image: bahaaldine/docker-kibana 75 | ports: 76 | - "5601:5601" 77 | volumes: 78 | - "logs/kibana:/var/log/kibana" 79 | extra_hosts: 80 | - "elasticsearch:192.168.59.103" -------------------------------------------------------------------------------- /chapter5/spark-scala-streamer/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | org.apress 7 | spark-scala-streamer 8 | jar 9 | Apress Streamer 10 | http://spark.apache.org/ 11 | 1.0.0 12 | 13 | 14 | 1.4.0 15 | 2.10 16 | 2.10.0 17 | 1.7.7 18 | 19 | compile 20 | 21 | 22 | 23 | 24 | org.apache.spark 25 | spark-core_${scala.binary.version} 26 | ${spark.version} 27 | provided 28 | 29 | 30 | org.apache.spark 31 | spark-streaming_${scala.binary.version} 32 | ${spark.version} 33 | provided 34 | 35 | 36 | org.apache.spark 37 | spark-mllib_${scala.binary.version} 38 | ${spark.version} 39 | provided 40 | 41 | 42 | org.apache.spark 43 | spark-streaming-kafka_${scala.binary.version} 44 | ${spark.version} 45 | 46 | 47 | com.google.guava 48 | guava 49 | 14.0.1 50 | 51 | 52 | org.scala-lang 53 | scala-library 54 | ${scala.version} 55 | 56 | 57 | org.elasticsearch 58 | elasticsearch-hadoop 59 | 2.1.0 60 | 61 | 62 | org.apache.avro 63 | avro 64 | ${avro.version} 65 | ${hadoop.deps.scope} 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | net.alchim31.maven 75 | scala-maven-plugin 76 | 3.1.5 77 | 78 | 79 | org.apache.maven.plugins 80 | maven-compiler-plugin 81 | 2.0.2 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | net.alchim31.maven 90 | scala-maven-plugin 91 | 92 | 93 | 94 | 95 | maven-assembly-plugin 96 | 2.4 97 | 98 | 99 | jar-with-dependencies 100 | 101 | 102 | 103 | 104 | make-assembly 105 | package 106 | 107 | single 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /chapter6/spark-enrich-and-ml/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | org.apache.spark.examples 7 | spark-enrich-and-ml 8 | jar 9 | Spark enrichment and machine learning 10 | http://spark.apache.org/ 11 | 1.0.0 12 | 13 | 14 | 1.4.0 15 | 2.10 16 | 2.10.0 17 | 1.7.7 18 | 19 | compile 20 | 21 | 22 | 23 | 24 | org.apache.spark 25 | spark-core_${scala.binary.version} 26 | ${spark.version} 27 | provided 28 | 29 | 30 | org.apache.spark 31 | spark-streaming_${scala.binary.version} 32 | ${spark.version} 33 | provided 34 | 35 | 36 | org.apache.spark 37 | spark-mllib_${scala.binary.version} 38 | ${spark.version} 39 | provided 40 | 41 | 42 | org.apache.spark 43 | spark-streaming-kafka_${scala.binary.version} 44 | ${spark.version} 45 | 46 | 47 | com.google.guava 48 | guava 49 | 14.0.1 50 | 51 | 52 | org.scala-lang 53 | scala-library 54 | ${scala.version} 55 | 56 | 57 | org.elasticsearch 58 | 
elasticsearch-hadoop 59 | 2.1.0 60 | 61 | 62 | org.apache.avro 63 | avro 64 | ${avro.version} 65 | ${hadoop.deps.scope} 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | net.alchim31.maven 75 | scala-maven-plugin 76 | 3.1.5 77 | 78 | 79 | org.apache.maven.plugins 80 | maven-compiler-plugin 81 | 2.0.2 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | net.alchim31.maven 90 | scala-maven-plugin 91 | 92 | 93 | 94 | 95 | maven-assembly-plugin 96 | 2.4 97 | 98 | 99 | jar-with-dependencies 100 | 101 | 102 | 103 | 104 | make-assembly 105 | package 106 | 107 | single 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /chapter5/spark-scala-streamer/src/main/scala/org/apress/examples/chapter4/KafkaStreamer.scala: -------------------------------------------------------------------------------- 1 | package org.apress.examples.chapter4 2 | 3 | import kafka.serializer.StringDecoder 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.kafka._ 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.sql.SQLContext 10 | import scala.util.parsing.json.JSON 11 | import org.elasticsearch.spark._ 12 | import org.elasticsearch.spark.rdd.EsSpark 13 | import scala.collection.mutable.HashMap 14 | import breeze.linalg.Axis._0 15 | import org.apache.spark.rdd.RDD 16 | import scala.collection.mutable.MutableList 17 | 18 | case class PageStatistic ( 19 | verbs:List[Map[String, Integer]] 20 | ) 21 | 22 | case class Clickstream ( 23 | message:String, 24 | version:String, 25 | file:String, 26 | host:String, 27 | offset:String, 28 | eventType:String, 29 | clientip:String, 30 | ident:String, 31 | auth:String, 32 | timestamp:String, 33 | verb:String, 34 | request:String, 35 | httpVersion:String, 36 | response:String, 37 | bytes:Integer, 38 | referrer:String, 39 | agent:String 40 | ) 41 | 42 | object KafkaStreamer { 43 | def main(args: Array[String]) { 44 | if (args.length < 2) { 45 | System.err.println(s""" 46 | |Usage: DirectKafkaWordCount 47 | | is a list of one or more Kafka brokers 48 | | is a list of one or more kafka topics to consume from 49 | | 50 | """.stripMargin) 51 | System.exit(1) 52 | } 53 | 54 | StreamingExamples.setStreamingLogLevels() 55 | 56 | // Create context with 2 second batch interval 57 | val sparkConf = new SparkConf().setAppName("KafkaStreamerToElasticsearch") 58 | sparkConf.set("es.index.auto.create", "true") 59 | sparkConf.set("es.nodes", "localhost:9200") 60 | //sparkConf.set("es.net.http.auth.user", "bahaaldine") 61 | //sparkConf.set("es.net.http.auth.pass", "bazarmi") 62 | val ssc = new StreamingContext(sparkConf, Seconds(2)) 63 | 64 | // Create direct kafka stream with brokers and topics 65 | val Array(brokers, topics) = args 66 | val topicsSet = topics.split(",").toSet 67 | val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers) 68 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( 69 | ssc, kafkaParams, topicsSet) 70 | // Get the lines, split them into words, count the words and print 71 | val lines = messages.map(_._2) 72 | val parsedEvents = lines.map(JSON.parseFull(_)).map(_.get.asInstanceOf[scala.collection.immutable.Map[String,Any]]) 73 | val events = parsedEvents.map(data=>Clickstream( 74 | data("message").toString 75 | ,data("@version").toString 76 | ,data("file").toString 77 | ,data("host").toString 78 | 
,data("offset").toString 79 | ,data("type").toString 80 | ,data("clientip").toString 81 | ,data("ident").toString 82 | ,data("auth").toString 83 | ,data("timestamp").toString 84 | ,data("verb").toString 85 | ,data("request").toString 86 | ,data("httpversion").toString 87 | ,data("response").toString 88 | ,Integer.parseInt(data("bytes").toString) 89 | ,data("referrer").toString 90 | ,data("agent").toString 91 | )) 92 | 93 | val counts = events.map(event => event.verb).countByValue() 94 | counts.print() 95 | 96 | counts.foreachRDD{ rdd => 97 | if (rdd.toLocalIterator.nonEmpty) { 98 | var array:Array[(String, Long)] = rdd.collect() 99 | EsSpark.saveToEs(rdd, "spark/clickstream") 100 | //EsSpark.saveToEs(ssc.sparkContext.makeRDD(Seq(Map("id" -> 123, array(0)._1 -> array(0)._2))), "spark/clickstream", Map("es.mapping.id" -> "id")) 101 | } 102 | } 103 | 104 | 105 | // Start the computation 106 | ssc.start() 107 | ssc.awaitTermination() 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /chapter5/spark-scala-streamer/src/main/scala/clickstream/PageViewGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming.clickstream 19 | 20 | import java.net.ServerSocket 21 | import java.io.PrintWriter 22 | import util.Random 23 | 24 | /** Represents a page view on a website with associated dimension data. */ 25 | class PageView(val url : String, val status : Int, val zipCode : Int, val userID : Int) 26 | extends Serializable { 27 | override def toString() : String = { 28 | "%s\t%s\t%s\t%s\n".format(url, status, zipCode, userID) 29 | } 30 | } 31 | 32 | object PageView extends Serializable { 33 | def fromString(in : String) : PageView = { 34 | val parts = in.split("\t") 35 | new PageView(parts(0), parts(1).toInt, parts(2).toInt, parts(3).toInt) 36 | } 37 | } 38 | 39 | // scalastyle:off 40 | /** Generates streaming events to simulate page views on a website. 41 | * 42 | * This should be used in tandem with PageViewStream.scala. 
Example: 43 | * 44 | * To run the generator 45 | * `$ bin/run-example org.apache.spark.examples.streaming.clickstream.PageViewGenerator 44444 10` 46 | * To process the generated stream 47 | * `$ bin/run-example \ 48 | * org.apache.spark.examples.streaming.clickstream.PageViewStream errorRatePerZipCode localhost 44444` 49 | * 50 | */ 51 | // scalastyle:on 52 | object PageViewGenerator { 53 | val pages = Map("http://foo.com/" -> .7, 54 | "http://foo.com/news" -> 0.2, 55 | "http://foo.com/contact" -> .1) 56 | val httpStatus = Map(200 -> .95, 57 | 404 -> .05) 58 | val userZipCode = Map(94709 -> .5, 59 | 94117 -> .5) 60 | val userID = Map((1 to 100).map(_ -> .01) : _*) 61 | 62 | def pickFromDistribution[T](inputMap : Map[T, Double]) : T = { 63 | val rand = new Random().nextDouble() 64 | var total = 0.0 65 | for ((item, prob) <- inputMap) { 66 | total = total + prob 67 | if (total > rand) { 68 | return item 69 | } 70 | } 71 | inputMap.take(1).head._1 // Shouldn't get here if probabilities add up to 1.0 72 | } 73 | 74 | def getNextClickEvent() : String = { 75 | val id = pickFromDistribution(userID) 76 | val page = pickFromDistribution(pages) 77 | val status = pickFromDistribution(httpStatus) 78 | val zipCode = pickFromDistribution(userZipCode) 79 | new PageView(page, status, zipCode, id).toString() 80 | } 81 | 82 | def main(args : Array[String]) { 83 | if (args.length != 2) { 84 | System.err.println("Usage: PageViewGenerator ") 85 | System.exit(1) 86 | } 87 | val port = args(0).toInt 88 | val viewsPerSecond = args(1).toFloat 89 | val sleepDelayMs = (1000.0 / viewsPerSecond).toInt 90 | val listener = new ServerSocket(port) 91 | println("Listening on port: " + port) 92 | 93 | while (true) { 94 | val socket = listener.accept() 95 | new Thread() { 96 | override def run(): Unit = { 97 | println("Got client connected from: " + socket.getInetAddress) 98 | val out = new PrintWriter(socket.getOutputStream(), true) 99 | 100 | while (true) { 101 | Thread.sleep(sleepDelayMs) 102 | out.write(getNextClickEvent()) 103 | out.flush() 104 | } 105 | socket.close() 106 | } 107 | }.start() 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /chapter5/spark-scala-streamer/src/main/scala/clickstream/PageViewStream.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples.streaming.clickstream 19 | 20 | import org.apache.spark.SparkContext._ 21 | import org.apache.spark.streaming.{Seconds, StreamingContext} 22 | import org.apress.examples.chapter4.StreamingExamples 23 | // scalastyle:off 24 | /** Analyses a streaming dataset of web page views. 
This class demonstrates several types of 25 | * operators available in Spark streaming. 26 | * 27 | * This should be used in tandem with PageViewStream.scala. Example: 28 | * To run the generator 29 | * `$ bin/run-example org.apache.spark.examples.streaming.clickstream.PageViewGenerator 44444 10` 30 | * To process the generated stream 31 | * `$ bin/run-example \ 32 | * org.apache.spark.examples.streaming.clickstream.PageViewStream errorRatePerZipCode localhost 44444` 33 | */ 34 | // scalastyle:on 35 | object PageViewStream { 36 | def main(args: Array[String]) { 37 | if (args.length != 3) { 38 | System.err.println("Usage: PageViewStream ") 39 | System.err.println(" must be one of pageCounts, slidingPageCounts," + 40 | " errorRatePerZipCode, activeUserCount, popularUsersSeen") 41 | System.exit(1) 42 | } 43 | StreamingExamples.setStreamingLogLevels() 44 | val metric = args(0) 45 | val host = args(1) 46 | val port = args(2).toInt 47 | 48 | // Create the context 49 | val ssc = new StreamingContext("local[2]", "PageViewStream", Seconds(1), 50 | System.getenv("SPARK_HOME"), StreamingContext.jarOfClass(this.getClass).toSeq) 51 | 52 | // Create a ReceiverInputDStream on target host:port and convert each line to a PageView 53 | val pageViews = ssc.socketTextStream(host, port) 54 | .flatMap(_.split("\n")) 55 | .map(PageView.fromString(_)) 56 | 57 | // Return a count of views per URL seen in each batch 58 | val pageCounts = pageViews.map(view => view.url).countByValue() 59 | 60 | // Return a sliding window of page views per URL in the last ten seconds 61 | val slidingPageCounts = pageViews.map(view => view.url) 62 | .countByValueAndWindow(Seconds(10), Seconds(2)) 63 | 64 | 65 | // Return the rate of error pages (a non 200 status) in each zip code over the last 30 seconds 66 | val statusesPerZipCode = pageViews.window(Seconds(30), Seconds(2)) 67 | .map(view => ((view.zipCode, view.status))) 68 | .groupByKey() 69 | val errorRatePerZipCode = statusesPerZipCode.map{ 70 | case(zip, statuses) => 71 | val normalCount = statuses.filter(_ == 200).size 72 | val errorCount = statuses.size - normalCount 73 | val errorRatio = errorCount.toFloat / statuses.size 74 | if (errorRatio > 0.05) { 75 | "%s: **%s**".format(zip, errorRatio) 76 | } else { 77 | "%s: %s".format(zip, errorRatio) 78 | } 79 | } 80 | 81 | // Return the number unique users in last 15 seconds 82 | val activeUserCount = pageViews.window(Seconds(15), Seconds(2)) 83 | .map(view => (view.userID, 1)) 84 | .groupByKey() 85 | .count() 86 | .map("Unique active users: " + _) 87 | 88 | // An external dataset we want to join to this stream 89 | val userList = ssc.sparkContext.parallelize( 90 | Map(1 -> "Patrick Wendell", 2->"Reynold Xin", 3->"Matei Zaharia").toSeq) 91 | 92 | metric match { 93 | case "pageCounts" => pageCounts.print() 94 | case "slidingPageCounts" => slidingPageCounts.print() 95 | case "errorRatePerZipCode" => errorRatePerZipCode.print() 96 | case "activeUserCount" => activeUserCount.print() 97 | case "popularUsersSeen" => 98 | // Look for users in our existing dataset and print it out if we have a match 99 | pageViews.map(view => (view.userID, 1)) 100 | .foreachRDD((rdd, time) => rdd.join(userList) 101 | .map(_._2._2) 102 | .take(10) 103 | .foreach(u => println("Saw user %s at time %s".format(u, time)))) 104 | case _ => println("Invalid metric entered: " + metric) 105 | } 106 | 107 | ssc.start() 108 | } 109 | } 110 | -------------------------------------------------------------------------------- 
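PageViewGenerator and PageViewStream above are the stock Spark Streaming clickstream examples and are meant to be run in tandem. The commands below simply restate the invocations given in their scaladoc comments; they assume a Spark distribution where bin/run-example is available:

# terminal 1: generate a stream of page views on port 44444 at 10 views per second
bin/run-example org.apache.spark.examples.streaming.clickstream.PageViewGenerator 44444 10

# terminal 2: consume the stream and print one metric
# (pageCounts, slidingPageCounts, errorRatePerZipCode, activeUserCount, popularUsersSeen)
bin/run-example org.apache.spark.examples.streaming.clickstream.PageViewStream errorRatePerZipCode localhost 44444
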
/chapter7/docker/kafka/server1/server.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # see kafka.server.KafkaConfig for additional details and defaults 16 | 17 | ############################# Server Basics ############################# 18 | 19 | # The id of the broker. This must be set to a unique integer for each broker. 20 | broker.id=1 21 | 22 | ############################# Socket Server Settings ############################# 23 | 24 | # The port the socket server listens on 25 | port=9092 26 | 27 | # Hostname the broker will bind to. If not set, the server will bind to all interfaces 28 | #host.name=brokers 29 | 30 | # Hostname the broker will advertise to producers and consumers. If not set, it uses the 31 | # value for "host.name" if configured. Otherwise, it will use the value returned from 32 | # java.net.InetAddress.getCanonicalHostName(). 33 | advertised.host.name=192.168.59.103 34 | 35 | # The port to publish to ZooKeeper for clients to use. If this is not set, 36 | # it will publish the same port that the broker binds to. 37 | #advertised.port= 38 | 39 | # The number of threads handling network requests 40 | num.network.threads=3 41 | 42 | # The number of threads doing disk I/O 43 | num.io.threads=8 44 | 45 | # The send buffer (SO_SNDBUF) used by the socket server 46 | socket.send.buffer.bytes=102400 47 | 48 | # The receive buffer (SO_RCVBUF) used by the socket server 49 | socket.receive.buffer.bytes=102400 50 | 51 | # The maximum size of a request that the socket server will accept (protection against OOM) 52 | socket.request.max.bytes=104857600 53 | 54 | 55 | ############################# Log Basics ############################# 56 | 57 | # A comma seperated list of directories under which to store log files 58 | log.dirs=/tmp/kafka-logs-1 59 | 60 | # The default number of log partitions per topic. More partitions allow greater 61 | # parallelism for consumption, but this will also result in more files across 62 | # the brokers. 63 | num.partitions=1 64 | 65 | # The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. 66 | # This value is recommended to be increased for installations with data dirs located in RAID array. 67 | num.recovery.threads.per.data.dir=1 68 | 69 | ############################# Log Flush Policy ############################# 70 | 71 | # Messages are immediately written to the filesystem but by default we only fsync() to sync 72 | # the OS cache lazily. The following configurations control the flush of data to disk. 73 | # There are a few important trade-offs here: 74 | # 1. 
Durability: Unflushed data may be lost if you are not using replication. 75 | # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. 76 | # 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. 77 | # The settings below allow one to configure the flush policy to flush data after a period of time or 78 | # every N messages (or both). This can be done globally and overridden on a per-topic basis. 79 | 80 | # The number of messages to accept before forcing a flush of data to disk 81 | #log.flush.interval.messages=10000 82 | 83 | # The maximum amount of time a message can sit in a log before we force a flush 84 | #log.flush.interval.ms=1000 85 | 86 | ############################# Log Retention Policy ############################# 87 | 88 | # The following configurations control the disposal of log segments. The policy can 89 | # be set to delete segments after a period of time, or after a given size has accumulated. 90 | # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens 91 | # from the end of the log. 92 | 93 | # The minimum age of a log file to be eligible for deletion 94 | log.retention.hours=168 95 | 96 | # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining 97 | # segments don't drop below log.retention.bytes. 98 | #log.retention.bytes=1073741824 99 | 100 | # The maximum size of a log segment file. When this size is reached a new log segment will be created. 101 | log.segment.bytes=1073741824 102 | 103 | # The interval at which log segments are checked to see if they can be deleted according 104 | # to the retention policies 105 | log.retention.check.interval.ms=300000 106 | 107 | # By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires. 108 | # If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction. 109 | log.cleaner.enable=false 110 | 111 | ############################# Zookeeper ############################# 112 | 113 | # Zookeeper connection string (see zookeeper docs for details). 114 | # This is a comma separated host:port pairs, each corresponding to a zk 115 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002". 116 | # You can also append an optional chroot string to the urls to specify the 117 | # root directory for all kafka znodes. 118 | zookeeper.connect=brokers:2181 119 | 120 | # Timeout in ms for connecting to zookeeper 121 | zookeeper.connection.timeout.ms=6000 122 | -------------------------------------------------------------------------------- /chapter7/docker/kafka/server2/server.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # see kafka.server.KafkaConfig for additional details and defaults 16 | 17 | ############################# Server Basics ############################# 18 | 19 | # The id of the broker. This must be set to a unique integer for each broker. 20 | broker.id=2 21 | 22 | ############################# Socket Server Settings ############################# 23 | 24 | # The port the socket server listens on 25 | port=9093 26 | 27 | # Hostname the broker will bind to. If not set, the server will bind to all interfaces 28 | #host.name=brokers 29 | 30 | # Hostname the broker will advertise to producers and consumers. If not set, it uses the 31 | # value for "host.name" if configured. Otherwise, it will use the value returned from 32 | # java.net.InetAddress.getCanonicalHostName(). 33 | advertised.host.name=192.168.59.103 34 | 35 | # The port to publish to ZooKeeper for clients to use. If this is not set, 36 | # it will publish the same port that the broker binds to. 37 | #advertised.port= 38 | 39 | # The number of threads handling network requests 40 | num.network.threads=3 41 | 42 | # The number of threads doing disk I/O 43 | num.io.threads=8 44 | 45 | # The send buffer (SO_SNDBUF) used by the socket server 46 | socket.send.buffer.bytes=102400 47 | 48 | # The receive buffer (SO_RCVBUF) used by the socket server 49 | socket.receive.buffer.bytes=102400 50 | 51 | # The maximum size of a request that the socket server will accept (protection against OOM) 52 | socket.request.max.bytes=104857600 53 | 54 | 55 | ############################# Log Basics ############################# 56 | 57 | # A comma seperated list of directories under which to store log files 58 | log.dirs=/tmp/kafka-logs-2 59 | 60 | # The default number of log partitions per topic. More partitions allow greater 61 | # parallelism for consumption, but this will also result in more files across 62 | # the brokers. 63 | num.partitions=1 64 | 65 | # The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. 66 | # This value is recommended to be increased for installations with data dirs located in RAID array. 67 | num.recovery.threads.per.data.dir=1 68 | 69 | ############################# Log Flush Policy ############################# 70 | 71 | # Messages are immediately written to the filesystem but by default we only fsync() to sync 72 | # the OS cache lazily. The following configurations control the flush of data to disk. 73 | # There are a few important trade-offs here: 74 | # 1. Durability: Unflushed data may be lost if you are not using replication. 75 | # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. 76 | # 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. 77 | # The settings below allow one to configure the flush policy to flush data after a period of time or 78 | # every N messages (or both). This can be done globally and overridden on a per-topic basis. 
79 | 80 | # The number of messages to accept before forcing a flush of data to disk 81 | #log.flush.interval.messages=10000 82 | 83 | # The maximum amount of time a message can sit in a log before we force a flush 84 | #log.flush.interval.ms=1000 85 | 86 | ############################# Log Retention Policy ############################# 87 | 88 | # The following configurations control the disposal of log segments. The policy can 89 | # be set to delete segments after a period of time, or after a given size has accumulated. 90 | # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens 91 | # from the end of the log. 92 | 93 | # The minimum age of a log file to be eligible for deletion 94 | log.retention.hours=168 95 | 96 | # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining 97 | # segments don't drop below log.retention.bytes. 98 | #log.retention.bytes=1073741824 99 | 100 | # The maximum size of a log segment file. When this size is reached a new log segment will be created. 101 | log.segment.bytes=1073741824 102 | 103 | # The interval at which log segments are checked to see if they can be deleted according 104 | # to the retention policies 105 | log.retention.check.interval.ms=300000 106 | 107 | # By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires. 108 | # If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction. 109 | log.cleaner.enable=false 110 | 111 | ############################# Zookeeper ############################# 112 | 113 | # Zookeeper connection string (see zookeeper docs for details). 114 | # This is a comma separated host:port pairs, each corresponding to a zk 115 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002". 116 | # You can also append an optional chroot string to the urls to specify the 117 | # root directory for all kafka znodes. 118 | zookeeper.connect=brokers:2181 119 | 120 | # Timeout in ms for connecting to zookeeper 121 | zookeeper.connection.timeout.ms=6000 122 | -------------------------------------------------------------------------------- /chapter4/kafka/server-1.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # see kafka.server.KafkaConfig for additional details and defaults 16 | 17 | ############################# Server Basics ############################# 18 | 19 | # The id of the broker. This must be set to a unique integer for each broker. 
20 | broker.id=1 21 | 22 | ############################# Socket Server Settings ############################# 23 | 24 | # The port the socket server listens on 25 | port=9092 26 | 27 | # Hostname the broker will bind to. If not set, the server will bind to all interfaces 28 | #host.name=localhost 29 | 30 | # Hostname the broker will advertise to producers and consumers. If not set, it uses the 31 | # value for "host.name" if configured. Otherwise, it will use the value returned from 32 | # java.net.InetAddress.getCanonicalHostName(). 33 | #advertised.host.name= 34 | 35 | # The port to publish to ZooKeeper for clients to use. If this is not set, 36 | # it will publish the same port that the broker binds to. 37 | #advertised.port= 38 | 39 | # The number of threads handling network requests 40 | num.network.threads=3 41 | 42 | # The number of threads doing disk I/O 43 | num.io.threads=8 44 | 45 | # The send buffer (SO_SNDBUF) used by the socket server 46 | socket.send.buffer.bytes=102400 47 | 48 | # The receive buffer (SO_RCVBUF) used by the socket server 49 | socket.receive.buffer.bytes=102400 50 | 51 | # The maximum size of a request that the socket server will accept (protection against OOM) 52 | socket.request.max.bytes=104857600 53 | 54 | 55 | ############################# Log Basics ############################# 56 | 57 | # A comma seperated list of directories under which to store log files 58 | log.dirs=/tmp/kafka-logs-1 59 | 60 | # The default number of log partitions per topic. More partitions allow greater 61 | # parallelism for consumption, but this will also result in more files across 62 | # the brokers. 63 | num.partitions=1 64 | 65 | # The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. 66 | # This value is recommended to be increased for installations with data dirs located in RAID array. 67 | num.recovery.threads.per.data.dir=1 68 | 69 | ############################# Log Flush Policy ############################# 70 | 71 | # Messages are immediately written to the filesystem but by default we only fsync() to sync 72 | # the OS cache lazily. The following configurations control the flush of data to disk. 73 | # There are a few important trade-offs here: 74 | # 1. Durability: Unflushed data may be lost if you are not using replication. 75 | # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. 76 | # 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. 77 | # The settings below allow one to configure the flush policy to flush data after a period of time or 78 | # every N messages (or both). This can be done globally and overridden on a per-topic basis. 79 | 80 | # The number of messages to accept before forcing a flush of data to disk 81 | #log.flush.interval.messages=10000 82 | 83 | # The maximum amount of time a message can sit in a log before we force a flush 84 | #log.flush.interval.ms=1000 85 | 86 | ############################# Log Retention Policy ############################# 87 | 88 | # The following configurations control the disposal of log segments. The policy can 89 | # be set to delete segments after a period of time, or after a given size has accumulated. 90 | # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens 91 | # from the end of the log. 
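# Worked example with the values below: segments become eligible for deletion after
# log.retention.hours=168 (7 days), each segment is rolled once it reaches
# log.segment.bytes=1073741824 (1 GiB), and because log.retention.bytes is left
# commented out, only the time-based rule applies in this configuration.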
92 | 93 | # The minimum age of a log file to be eligible for deletion 94 | log.retention.hours=168 95 | 96 | # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining 97 | # segments don't drop below log.retention.bytes. 98 | #log.retention.bytes=1073741824 99 | 100 | # The maximum size of a log segment file. When this size is reached a new log segment will be created. 101 | log.segment.bytes=1073741824 102 | 103 | # The interval at which log segments are checked to see if they can be deleted according 104 | # to the retention policies 105 | log.retention.check.interval.ms=300000 106 | 107 | # By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires. 108 | # If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction. 109 | log.cleaner.enable=false 110 | 111 | ############################# Zookeeper ############################# 112 | 113 | # Zookeeper connection string (see zookeeper docs for details). 114 | # This is a comma separated host:port pairs, each corresponding to a zk 115 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002". 116 | # You can also append an optional chroot string to the urls to specify the 117 | # root directory for all kafka znodes. 118 | zookeeper.connect=localhost:2181 119 | 120 | # Timeout in ms for connecting to zookeeper 121 | zookeeper.connection.timeout.ms=6000 122 | -------------------------------------------------------------------------------- /chapter4/kafka/server-2.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # see kafka.server.KafkaConfig for additional details and defaults 16 | 17 | ############################# Server Basics ############################# 18 | 19 | # The id of the broker. This must be set to a unique integer for each broker. 20 | broker.id=2 21 | 22 | ############################# Socket Server Settings ############################# 23 | 24 | # The port the socket server listens on 25 | port=9093 26 | 27 | # Hostname the broker will bind to. If not set, the server will bind to all interfaces 28 | #host.name=localhost 29 | 30 | # Hostname the broker will advertise to producers and consumers. If not set, it uses the 31 | # value for "host.name" if configured. Otherwise, it will use the value returned from 32 | # java.net.InetAddress.getCanonicalHostName(). 33 | #advertised.host.name= 34 | 35 | # The port to publish to ZooKeeper for clients to use. If this is not set, 36 | # it will publish the same port that the broker binds to. 
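# Hypothetical example (not used in this configuration): if clients had to reach the
# broker through a different port than the one it binds to (for instance behind NAT
# or a container port mapping), the published port would be overridden here, e.g.
# advertised.port=19093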
37 | #advertised.port= 38 | 39 | # The number of threads handling network requests 40 | num.network.threads=3 41 | 42 | # The number of threads doing disk I/O 43 | num.io.threads=8 44 | 45 | # The send buffer (SO_SNDBUF) used by the socket server 46 | socket.send.buffer.bytes=102400 47 | 48 | # The receive buffer (SO_RCVBUF) used by the socket server 49 | socket.receive.buffer.bytes=102400 50 | 51 | # The maximum size of a request that the socket server will accept (protection against OOM) 52 | socket.request.max.bytes=104857600 53 | 54 | 55 | ############################# Log Basics ############################# 56 | 57 | # A comma seperated list of directories under which to store log files 58 | log.dirs=/tmp/kafka-logs-2 59 | 60 | # The default number of log partitions per topic. More partitions allow greater 61 | # parallelism for consumption, but this will also result in more files across 62 | # the brokers. 63 | num.partitions=1 64 | 65 | # The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. 66 | # This value is recommended to be increased for installations with data dirs located in RAID array. 67 | num.recovery.threads.per.data.dir=1 68 | 69 | ############################# Log Flush Policy ############################# 70 | 71 | # Messages are immediately written to the filesystem but by default we only fsync() to sync 72 | # the OS cache lazily. The following configurations control the flush of data to disk. 73 | # There are a few important trade-offs here: 74 | # 1. Durability: Unflushed data may be lost if you are not using replication. 75 | # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. 76 | # 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. 77 | # The settings below allow one to configure the flush policy to flush data after a period of time or 78 | # every N messages (or both). This can be done globally and overridden on a per-topic basis. 79 | 80 | # The number of messages to accept before forcing a flush of data to disk 81 | #log.flush.interval.messages=10000 82 | 83 | # The maximum amount of time a message can sit in a log before we force a flush 84 | #log.flush.interval.ms=1000 85 | 86 | ############################# Log Retention Policy ############################# 87 | 88 | # The following configurations control the disposal of log segments. The policy can 89 | # be set to delete segments after a period of time, or after a given size has accumulated. 90 | # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens 91 | # from the end of the log. 92 | 93 | # The minimum age of a log file to be eligible for deletion 94 | log.retention.hours=168 95 | 96 | # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining 97 | # segments don't drop below log.retention.bytes. 98 | #log.retention.bytes=1073741824 99 | 100 | # The maximum size of a log segment file. When this size is reached a new log segment will be created. 101 | log.segment.bytes=1073741824 102 | 103 | # The interval at which log segments are checked to see if they can be deleted according 104 | # to the retention policies 105 | log.retention.check.interval.ms=300000 106 | 107 | # By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires. 
108 | # If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction. 109 | log.cleaner.enable=false 110 | 111 | ############################# Zookeeper ############################# 112 | 113 | # Zookeeper connection string (see zookeeper docs for details). 114 | # This is a comma separated host:port pairs, each corresponding to a zk 115 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002". 116 | # You can also append an optional chroot string to the urls to specify the 117 | # root directory for all kafka znodes. 118 | zookeeper.connect=localhost:2181 119 | 120 | # Timeout in ms for connecting to zookeeper 121 | zookeeper.connection.timeout.ms=6000 122 | -------------------------------------------------------------------------------- /chapter6/spark-enrich-and-ml/src/main/scala/org/apache/spark/examples/SparkEnricher.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.examples 2 | 3 | import kafka.serializer.StringDecoder 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.kafka._ 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.sql.SQLContext 10 | import scala.util.parsing.json.JSON 11 | import org.elasticsearch.spark._ 12 | import org.elasticsearch.spark.rdd.EsSpark 13 | import scala.collection.mutable.HashMap 14 | import breeze.linalg.Axis._0 15 | import org.apache.spark.rdd.RDD 16 | import scala.collection.mutable.MutableList 17 | import org.apache.spark.mllib.linalg.Vectors 18 | import org.apache.spark.mllib.clustering.KMeans 19 | import org.apache.spark.mllib.clustering.StreamingKMeans 20 | import org.apache.spark.mllib.regression.LabeledPoint 21 | 22 | case class Clickstream ( 23 | message:String, 24 | version:String, 25 | file:String, 26 | host:String, 27 | offset:String, 28 | eventType:String, 29 | clientip:String, 30 | ident:String, 31 | auth:String, 32 | timestamp:String, 33 | verb:String, 34 | request:String, 35 | httpVersion:String, 36 | response:String, 37 | bytes:Integer, 38 | referrer:String, 39 | agent:String 40 | ) 41 | 42 | case class Customer ( 43 | session:String, 44 | request:String, 45 | category:String 46 | ) 47 | 48 | 49 | object SparkEnricher { 50 | def main(args: Array[String]) { 51 | if (args.length < 2) { 52 | System.err.println(s""" 53 | |Usage: SparkEnricher <brokers> <topics> 54 | | <brokers> is a list of one or more Kafka brokers 55 | | <topics> is a list of one or more Kafka topics to consume from 56 | | 57 | """.stripMargin) 58 | System.exit(1) 59 | } 60 | 61 | StreamingExamples.setStreamingLogLevels() 62 | 63 | // Create context with 2 second batch interval 64 | val sparkConf = new SparkConf().setAppName("KafkaStreamerToElasticsearch") 65 | sparkConf.set("es.index.auto.create", "true") 66 | sparkConf.set("es.nodes", "localhost:9200") 67 | //sparkConf.set("es.net.http.auth.user", "bahaaldine") 68 | //sparkConf.set("es.net.http.auth.pass", "bazarmi") 69 | val ssc = new StreamingContext(sparkConf, Seconds(2)) 70 | 71 | val productCategoryMappingFile = ssc.sparkContext.textFile("/Users/bahaaldine/Google Drive/demo/v2/clickstream/generator/mappings.csv") 72 | val productCategoryMapping = productCategoryMappingFile.map(line => line.split(",")).map(x => (x(0),x(1))).collectAsMap() 73 | val categoryLabelMapping:scala.collection.Map[String,Double] = productCategoryMappingFile.map(line =>
line.split(",")).map(x => (x(1),x(2).toDouble)).collectAsMap() 74 | val brodcastProductCategoryMapping = ssc.sparkContext.broadcast(productCategoryMapping) 75 | val brodcastCategoryLabelMapping = ssc.sparkContext.broadcast(categoryLabelMapping) 76 | 77 | val customerMappingFile = ssc.sparkContext.textFile("/Users/bahaaldine/Google Drive/demo/v2/clickstream/generator/ip_mappings.csv") 78 | val ipLabelMapping:scala.collection.Map[String,Double] = customerMappingFile.map(line => line.split(",")).map(x => (x(0),x(1).toDouble)).collectAsMap() 79 | val brodcastIpLabelMapping = ssc.sparkContext.broadcast(ipLabelMapping) 80 | 81 | // Create direct kafka stream with brokers and topics 82 | val Array(brokers, topics) = args 83 | val topicsSet = topics.split(",").toSet 84 | val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers) 85 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( 86 | ssc, kafkaParams, topicsSet) 87 | // Extract the JSON payload of each Kafka (key, message) pair and parse it into a Clickstream event 88 | val lines = messages.map(_._2) 89 | val parsedEvents = lines.map(JSON.parseFull(_)).map(_.get.asInstanceOf[scala.collection.immutable.Map[String,Any]]) 90 | val events = parsedEvents.map(data=>Clickstream( 91 | data("message").toString 92 | ,data("@version").toString 93 | ,data("file").toString 94 | ,data("host").toString 95 | ,data("offset").toString 96 | ,data("type").toString 97 | ,data("clientip").toString 98 | ,data("ident").toString 99 | ,data("auth").toString 100 | ,data("timestamp").toString 101 | ,data("verb").toString 102 | ,data("request").toString 103 | ,data("httpversion").toString 104 | ,data("response").toString 105 | ,Integer.parseInt(data("bytes").toString) 106 | ,data("referrer").toString 107 | ,data("agent").toString 108 | )) 109 | 110 | // Creating and enriching the customer object 111 | val customers = events.map { clickstream => 112 | val lookupMap = brodcastProductCategoryMapping.value 113 | Customer(clickstream.clientip, clickstream.request, lookupMap.getOrElse(clickstream.request, "category not found")) 114 | } 115 | 116 | customers.foreachRDD{ rdd => 117 | if (rdd.toLocalIterator.nonEmpty) { 118 | EsSpark.saveToEs(rdd, "spark/customer") 119 | } 120 | } 121 | 122 | val trainingData = customers.map { customer => 123 | val categoryLookupMap = brodcastCategoryLabelMapping.value 124 | val customerLookupMap = brodcastIpLabelMapping.value 125 | 126 | val categoryLabel = categoryLookupMap.getOrElse(customer.category, 1.0) 127 | val customerLabel = customerLookupMap.getOrElse(customer.session, 1.0) 128 | 129 | Vectors.dense(Array(categoryLabel, customerLabel)) 130 | } 131 | 132 | val testData = customers.map { customer => 133 | val categoryLookupMap = brodcastCategoryLabelMapping.value 134 | val customerLookupMap = brodcastIpLabelMapping.value 135 | 136 | val categoryLabel = categoryLookupMap.getOrElse(customer.category, 1.0) 137 | val customerLabel = customerLookupMap.getOrElse(customer.session, 1.0) 138 | 139 | LabeledPoint(categoryLabel, Vectors.dense(Array(categoryLabel, customerLabel))) 140 | } 141 | 142 | val model = new StreamingKMeans() 143 | .setK(3) 144 | .setDecayFactor(1.0) 145 | .setRandomCenters(2, 0.0) 146 | 147 | model.trainOn(trainingData) 148 | model.predictOnValues(testData.map(lp => (lp.label, lp.features))).foreachRDD{ rdd => 149 | if (rdd.toLocalIterator.nonEmpty) { 150 | EsSpark.saveToEs(rdd, "spark/prediction") 151 | } 152 | } 153 | 154 |
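    // Recap of the pipeline built above: each clickstream event is enriched into a
    // Customer(session, request, category) and indexed into Elasticsearch under
    // "spark/customer"; customers are then mapped to dense (categoryLabel, customerLabel)
    // vectors, a StreamingKMeans model with k=3 clusters is updated on that stream,
    // and the resulting (label, predicted cluster) pairs are written to "spark/prediction"
    // on every micro-batch.
    //
    // Illustrative only (not in the original source): after a few batches the current
    // cluster centers could be printed for debugging with something like
    //   model.latestModel().clusterCenters.foreach(println)
    // assuming the version of Spark MLlib used here exposes StreamingKMeans.latestModel().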
155 | // Start the computation 156 | ssc.start() 157 | ssc.awaitTermination() 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /chapter7/docker/logstash/.logstash-forwarder: -------------------------------------------------------------------------------- 1 | {"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150711-220259.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150711-220259.log","offset":96859,"inode":7479218,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-000447.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-000447.log","offset":972325,"inode":7504456,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-000741.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-000741.log","offset":975794,"inode":7505868,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-001824.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-001824.log","offset":647,"inode":7508694,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-001915.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-001915.log","offset":174,"inode":7508875,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-093649.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-093649.log","offset":222,"inode":7535349,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-140436.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-140436.log","offset":166,"inode":7572409,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-142450.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-142450.log","offset":971867,"inode":7578311,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-225306.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-225306.log","offset":971300,"inode":7728720,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134126.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134126.log","offset":242,"inode":7918600,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134150.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134150.log","offset":241,"inode":7918743,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134238.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134238.log","offset":974,"inode":7918888,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134434.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134434.log","offset":179,"inode":7919307,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-135702.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-135702.log","offset":234,"inode":792213
0,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-135855.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-135855.log","offset":169,"inode":7922725,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-140105.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-140105.log","offset":183,"inode":7923254,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-140440.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-140440.log","offset":243,"inode":7925752,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-163017.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-163017.log","offset":146,"inode":7965083,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-170127.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-170127.log","offset":164,"inode":7975371,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-170639.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-170639.log","offset":169,"inode":7977865,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-170655.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-170655.log","offset":1848,"inode":7977973,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173241.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173241.log","offset":236,"inode":7988866,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173404.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173404.log","offset":252,"inode":7989373,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173422.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173422.log","offset":1720,"inode":7989470,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173621.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173621.log","offset":1925,"inode":7990107,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-175004.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-175004.log","offset":1740,"inode":7993748,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-181903.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-181903.log","offset":2029,"inode":8005282,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-182035.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-182035.log","offset":2090,"inode":8005809,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-182711.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-182711.log","offset":1976,"inode":8009448,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4
/source/access_log_20150714-000234.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150714-000234.log","offset":2032,"inode":8048571,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150714-000309.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150714-000309.log","offset":2059,"inode":8048811,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-133903.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-133903.log","offset":1903,"inode":8288492,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-134905.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-134905.log","offset":2042,"inode":8292378,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-143201.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-143201.log","offset":2038,"inode":8343433,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-145645.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-145645.log","offset":1781,"inode":8420823,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-153450.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-153450.log","offset":2171,"inode":8459287,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-154641.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-154641.log","offset":1960,"inode":8462798,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-154730.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-154730.log","offset":2044,"inode":8463027,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-163018.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-163018.log","offset":1889,"inode":8491010,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-163455.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-163455.log","offset":1749,"inode":8492463,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-164003.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-164003.log","offset":1912,"inode":8495558,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-164023.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-164023.log","offset":193955,"inode":8495798,"device":16777217}} 2 | -------------------------------------------------------------------------------- /chapter5/spark-scala-streamer/spark-scala-streamer.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 
--------------------------------------------------------------------------------