├── .gitignore
├── chapter4
│   ├── generator
│   │   ├── requests.txt
│   │   ├── referers.txt
│   │   ├── codes.txt
│   │   ├── ips.txt
│   │   ├── resources.txt
│   │   ├── user_agents.txt
│   │   └── generator.py
│   ├── logstash
│   │   ├── indexer
│   │   │   └── kafka_to_elasticsearch.conf
│   │   ├── processor
│   │   │   └── forwarder_to_kafka.conf
│   │   └── forwarder
│   │       └── forwarder.json
│   └── kafka
│       ├── server-1.properties
│       └── server-2.properties
├── chapter7
│   └── docker
│       ├── generator
│       │   ├── requests.txt
│       │   ├── referers.txt
│       │   ├── codes.txt
│       │   ├── ips.txt
│       │   ├── resources.txt
│       │   ├── user_agents.txt
│       │   └── generator.py
│       ├── docker-machine.sh
│       ├── logstash
│       │   ├── indexer
│       │   │   └── kafka_to_elasticsearch.conf
│       │   ├── processor
│       │   │   └── forwarder_to_kafka.conf
│       │   ├── forwarder
│       │   │   └── forwarder.json
│       │   └── .logstash-forwarder
│       ├── templates
│       │   └── template.json
│       ├── kafka
│       │   ├── server1
│       │   │   ├── zookeeper.properties
│       │   │   └── server.properties
│       │   └── server2
│       │       ├── zookeeper.properties
│       │       └── server.properties
│       ├── security
│       │   ├── lumberjack.crt
│       │   └── lumberjack.key
│       └── docker-compose.yml
├── chapter6
│   └── spark-enrich-and-ml
│       ├── build.sh
│       ├── src
│       │   └── main
│       │       ├── main.iml
│       │       └── scala
│       │           └── org
│       │               └── apache
│       │                   └── spark
│       │                       └── examples
│       │                           ├── StreamingExamples.scala
│       │                           └── SparkEnricher.scala
│       └── pom.xml
└── chapter5
    └── spark-scala-streamer
        ├── build.sh
        ├── src
        │   └── main
        │       ├── main.iml
        │       └── scala
        │           ├── org
        │           │   └── apress
        │           │       └── examples
        │           │           └── chapter4
        │           │               ├── StreamingExamples.scala
        │           │               └── KafkaStreamer.scala
        │           └── clickstream
        │               ├── PageViewGenerator.scala
        │               └── PageViewStream.scala
        ├── pom.xml
        └── spark-scala-streamer.iml
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
--------------------------------------------------------------------------------
/chapter4/generator/requests.txt:
--------------------------------------------------------------------------------
1 | GET
2 | GET
3 | GET
4 | GET
5 | GET
6 | GET
7 | GET
8 | GET
9 | GET
10 | GET
11 | POST
12 | POST
--------------------------------------------------------------------------------
/chapter7/docker/generator/requests.txt:
--------------------------------------------------------------------------------
1 | GET
2 | GET
3 | GET
4 | GET
5 | GET
6 | GET
7 | GET
8 | GET
9 | GET
10 | GET
11 | POST
12 | POST
--------------------------------------------------------------------------------
/chapter4/generator/referers.txt:
--------------------------------------------------------------------------------
1 | -
2 | http://www.google.com
3 | http://www.yahoo.com
4 | http://www.referrer.com
5 | http://www.bing.com
6 | http://www.amazon.com
--------------------------------------------------------------------------------
/chapter7/docker/generator/referers.txt:
--------------------------------------------------------------------------------
1 | -
2 | http://www.google.com
3 | http://www.yahoo.com
4 | http://www.referrer.com
5 | http://www.bing.com
6 | http://www.amazon.com
--------------------------------------------------------------------------------
/chapter4/generator/codes.txt:
--------------------------------------------------------------------------------
1 | 200
2 | 200
3 | 200
4 | 200
5 | 200
6 | 200
7 | 200
8 | 200
9 | 200
10 | 200
11 | 404
12 | 404
13 | 404
14 | 401
15 | 401
16 | 403
17 | 500
--------------------------------------------------------------------------------
/chapter7/docker/generator/codes.txt:
--------------------------------------------------------------------------------
1 | 200
2 | 200
3 | 200
4 | 200
5 | 200
6 | 200
7 | 200
8 | 200
9 | 200
10 | 200
11 | 404
12 | 404
13 | 404
14 | 401
15 | 401
16 | 403
17 | 500
--------------------------------------------------------------------------------
/chapter4/generator/ips.txt:
--------------------------------------------------------------------------------
1 | 10.10.10.10
2 | 10.10.10.11
3 | 10.10.10.12
4 | 10.10.10.13
5 | 10.10.10.14
6 | 10.10.10.15
7 | 10.10.10.16
8 | 10.10.10.17
9 | 10.10.10.18
10 | 10.10.10.19
--------------------------------------------------------------------------------
/chapter7/docker/generator/ips.txt:
--------------------------------------------------------------------------------
1 | 10.10.10.10
2 | 10.10.10.11
3 | 10.10.10.12
4 | 10.10.10.13
5 | 10.10.10.14
6 | 10.10.10.15
7 | 10.10.10.16
8 | 10.10.10.17
9 | 10.10.10.18
10 | 10.10.10.19
--------------------------------------------------------------------------------
/chapter7/docker/docker-machine.sh:
--------------------------------------------------------------------------------
1 | docker-machine -D create \
2 | --driver amazonec2 \
3 | --amazonec2-access-key $AWS_ACCESS_KEY \
4 | --amazonec2-secret-key $AWS_SECRET_KEY \
5 | --amazonec2-vpc-id $AWS_VPC_ID \
6 | --amazonec2-zone b \
7 | baha-lambda-architecture
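# Note: AWS_ACCESS_KEY, AWS_SECRET_KEY, and AWS_VPC_ID must be exported in the
# calling shell before running this script, for example (placeholder value):
#   export AWS_VPC_ID=vpc-xxxxxxxx
# The remaining amazonec2 driver settings (region, instance type) fall back to
# the docker-machine defaults.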
--------------------------------------------------------------------------------
/chapter4/generator/resources.txt:
--------------------------------------------------------------------------------
1 | /products/product1
2 | /products/product2
3 | /products/product3
4 | /products/product4
5 | /products/product5
6 | /products/product6
7 | /store/cart
8 | /store/cart/checkout
9 | /account
10 | /page1
11 | /page2
12 | /page3
13 | /page4
14 | /page5
15 | /page6
16 | /page6
--------------------------------------------------------------------------------
/chapter7/docker/generator/resources.txt:
--------------------------------------------------------------------------------
1 | /products/product1
2 | /products/product2
3 | /products/product3
4 | /products/product4
5 | /products/product5
6 | /products/product6
7 | /store/cart
8 | /store/cart/checkout
9 | /account
10 | /page1
11 | /page2
12 | /page3
13 | /page4
14 | /page5
15 | /page6
16 | /page6
--------------------------------------------------------------------------------
/chapter4/logstash/indexer/kafka_to_elasticsearch.conf:
--------------------------------------------------------------------------------
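# Consumes the "clickstream" Kafka topic and indexes each event into a daily
# clickstream-YYYY.MM.dd index on a local Elasticsearch node over HTTP.
# The host/protocol options are Logstash 1.x-era elasticsearch output settings.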
1 | input {
2 | kafka {
3 | topic_id => "clickstream"
4 | }
5 | }
6 | filter {
7 | }
8 | output {
9 | stdout { codec => rubydebug }
10 | elasticsearch {
11 | index => "clickstream-%{+YYYY.MM.dd}"
12 | manage_template => false
13 | host => localhost
14 | protocol => http
15 | }
16 | }
--------------------------------------------------------------------------------
/chapter7/docker/logstash/indexer/kafka_to_elasticsearch.conf:
--------------------------------------------------------------------------------
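# Dockerized variant of the chapter4 indexer: consumes "clickstream" through the
# ZooKeeper ensemble at brokers:2181, casts the grok-extracted "bytes" field to an
# integer, and indexes into clickstream-YYYY.MM.dd on the host aliased as "brokers".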
1 | input {
2 | kafka {
3 | zk_connect => "brokers:2181"
4 | topic_id => "clickstream"
5 | }
6 | }
7 | filter {
8 | mutate {convert => ["bytes", "integer"]}
9 | }
10 | output {
11 | stdout { codec => rubydebug }
12 | elasticsearch {
13 | index => "clickstream-%{+YYYY.MM.dd}"
14 | manage_template => false
15 | host => brokers
16 | protocol => http
17 | }
18 | }
--------------------------------------------------------------------------------
/chapter6/spark-enrich-and-ml/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo 'Setting JAVA_HOME to Java7'
4 | export JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.7.0_21.jdk/Contents/Home/
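# Adjust this path to the local JDK installation; the value above is a macOS-specific JDK 7 location.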
5 |
6 | echo 'Package sources'
7 | mvn clean scala:compile package
8 |
9 | echo 'Running Spark'
10 | spark-submit --class org.apache.spark.examples.SparkEnricher \
11 | --master local[2] \
12 | target/spark-enrich-and-ml-1.0.0-jar-with-dependencies.jar \
13 | 192.168.59.103:9092,192.168.59.103:9093 clickstream
14 |
--------------------------------------------------------------------------------
/chapter5/spark-scala-streamer/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo 'Setting JAVA_HOME to Java7'
4 | export JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.7.0_21.jdk/Contents/Home/
5 |
6 | echo 'Package sources'
7 | mvn clean scala:compile package
8 |
9 | echo 'Running Spark'
10 | spark-submit --class org.apress.examples.chapter4.KafkaStreamer \
11 | --master local[2] \
12 | target/spark-scala-streamer-1.0.0-jar-with-dependencies.jar \
13 | 192.168.59.103:9092,192.168.59.103:9093 clickstream
14 |
--------------------------------------------------------------------------------
/chapter7/docker/logstash/processor/forwarder_to_kafka.conf:
--------------------------------------------------------------------------------
1 | input {
2 | lumberjack {
3 | port => "5043"
4 | ssl_certificate => "/etc/logstash/security/lumberjack.crt"
5 | ssl_key => "/etc/logstash/security/lumberjack.key"
6 | }
7 | }
8 | filter {
9 | grok {
10 | match => {
11 | "message" => "%{COMBINEDAPACHELOG}"
12 | }
13 | }
14 | }
15 | output {
16 | stdout { codec => rubydebug }
17 | kafka {
18 | broker_list => "192.168.59.103:9092"
19 | topic_id => "clickstream"
20 | }
21 | }
--------------------------------------------------------------------------------
/chapter4/logstash/processor/forwarder_to_kafka.conf:
--------------------------------------------------------------------------------
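# Accepts events from logstash-forwarder over the Lumberjack protocol on port 5043,
# parses each line with the COMBINEDAPACHELOG grok pattern, and publishes the result
# to the "clickstream" Kafka topic (no broker_list is set here, unlike the chapter7
# variant, so the kafka output plugin's default broker is used).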
1 | input {
2 | lumberjack {
3 | port => "5043"
4 | ssl_certificate => "/Users/bahaaldine/Dropbox/apress/demo/chapter4/security/lumberjack.crt"
5 | ssl_key => "/Users/bahaaldine/Dropbox/apress/demo/chapter4/security/lumberjack.key"
6 | }
7 | }
8 | filter {
9 | grok {
10 | match => {
11 | "message" => "%{COMBINEDAPACHELOG}"
12 | }
13 | }
14 | }
15 | output {
16 | stdout { codec => rubydebug }
17 | kafka {
18 | topic_id => "clickstream"
19 | }
20 | }
--------------------------------------------------------------------------------
/chapter4/logstash/forwarder/forwarder.json:
--------------------------------------------------------------------------------
1 | {
2 | "network": {
3 | "servers": [ "localhost:5043" ],
4 | "ssl certificate": "../security/lumberjack.crt",
5 | "ssl key": "../security/lumberjack.key",
6 | "ssl ca": "../security/lumberjack.crt",
7 | "timeout": 15
8 | },
9 |
10 | "files": [
11 | {
12 | "paths": [
13 | "/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_*"
14 | ],
15 | "fields": { "type": "access_log" }
16 | }, {
17 | "paths": [ "-" ],
18 | "fields": { "type": "stdin" }
19 | }
20 | ]
21 | }
--------------------------------------------------------------------------------
/chapter5/spark-scala-streamer/src/main/main.iml:
--------------------------------------------------------------------------------
(IntelliJ IDEA module file; XML content not preserved)
--------------------------------------------------------------------------------
/chapter6/spark-enrich-and-ml/src/main/main.iml:
--------------------------------------------------------------------------------
(IntelliJ IDEA module file; XML content not preserved)
--------------------------------------------------------------------------------
/chapter7/docker/logstash/forwarder/forwarder.json:
--------------------------------------------------------------------------------
1 | {
2 | "network": {
3 | "servers": [ "processors:5043" ],
4 | "ssl certificate": "/etc/logstash-forwarder/security/lumberjack.crt",
5 | "ssl key": "/etc/logstash-forwarder/security/lumberjack.key",
6 | "ssl ca": "/etc/logstash-forwarder/security/lumberjack.crt",
7 | "timeout": 15
8 | },
9 |
10 | "files": [
11 | {
12 | "paths": [
13 | "/tmp/source/access_log_*"
14 | ],
15 | "fields": { "type": "access_log" }
16 | }, {
17 | "paths": [ "-" ],
18 | "fields": { "type": "stdin" }
19 | }
20 | ]
21 | }
--------------------------------------------------------------------------------
/chapter4/generator/user_agents.txt:
--------------------------------------------------------------------------------
1 | Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)
2 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36
3 | Mozilla/5.0 (Linux; U; Android 2.3.5; en-us; HTC Vision Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1
4 | Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25
5 | Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201
6 | Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0
7 | Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))
--------------------------------------------------------------------------------
/chapter7/docker/generator/user_agents.txt:
--------------------------------------------------------------------------------
1 | Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)
2 | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36
3 | Mozilla/5.0 (Linux; U; Android 2.3.5; en-us; HTC Vision Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1
4 | Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25
5 | Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201
6 | Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0
7 | Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))
--------------------------------------------------------------------------------
/chapter7/docker/templates/template.json:
--------------------------------------------------------------------------------
1 | {
2 | "template_1" : {
3 | "template" : "*",
4 | "settings" : {
5 | "index.number_of_shards" : 1,
6 | "index.number_of_replicas" : 0
7 | },
8 | "mappings" : {
9 | "_default_": {
10 | "dynamic_templates": [
11 | {
12 | "string_fields": {
13 | "mapping": {
14 | "index": "not_analyzed",
15 | "omit_norms": true,
16 | "type": "string"
17 | },
18 | "match_mapping_type": "string",
19 | "match": "*"
20 | }
21 | }
22 | ],
23 | "_all": {
24 | "enabled": true
25 | },
26 | "properties": {
27 | "bytes": { "type": "integer" },
28 | "response": { "type": "integer" }
29 | }
30 | }
31 | }
32 | }
33 | }
--------------------------------------------------------------------------------
/chapter7/docker/kafka/server1/zookeeper.properties:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # the directory where the snapshot is stored.
16 | dataDir=/tmp/zookeeper
17 | # the port at which the clients will connect
18 | clientPort=2181
19 | # disable the per-ip limit on the number of connections since this is a non-production config
20 | maxClientCnxns=0
21 |
--------------------------------------------------------------------------------
/chapter7/docker/kafka/server2/zookeeper.properties:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # the directory where the snapshot is stored.
16 | dataDir=/tmp/zookeeper
17 | # the port at which the clients will connect
18 | clientPort=2182
19 | # disable the per-ip limit on the number of connections since this is a non-production config
20 | maxClientCnxns=0
21 |
--------------------------------------------------------------------------------
/chapter7/docker/security/lumberjack.crt:
--------------------------------------------------------------------------------
1 | -----BEGIN CERTIFICATE-----
2 | MIIDIzCCAgugAwIBAgIJAPlrM1BCQmOVMA0GCSqGSIb3DQEBBQUAMBUxEzARBgNV
3 | BAMTCnByb2Nlc3NvcnMwHhcNMTUwNzE2MDk1NjU1WhcNMTUwODE1MDk1NjU1WjAV
4 | MRMwEQYDVQQDEwpwcm9jZXNzb3JzMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB
5 | CgKCAQEAnIDqokdntcXHt1ez4bsg2gpqYENF9cpd7Y/cJa393l++/nul24HQQrrr
6 | Q3+gYSw1u+dAb9nh8ThDeUABBItxSAHpPB2ozxkHa3SmVf02XWSl5vGWVxsnYliB
7 | qCEqxtLXUzB8Az1FIZy0HLlCTSDCb5U2JHInIkqqxWTnHRQ4s9vXuai0ysxT3BPQ
8 | h4d9ocpgdn80eQO+XCrlyxDaUsSRwY+MdO2G0vEqgiAgru6bndzlCsyPa19yMrru
9 | qwkErv16Yiws6Qmc4jxYDspR7xOfcrQl2N6SgqHtPL0Qv2rAmZhqcyl5V2lbEd7T
10 | WGqbLntYjtht43EyLB7TVoSi2SVFIwIDAQABo3YwdDAdBgNVHQ4EFgQUYAbYar6K
11 | K2/WhrZiWhDZcWU73Y0wRQYDVR0jBD4wPIAUYAbYar6KK2/WhrZiWhDZcWU73Y2h
12 | GaQXMBUxEzARBgNVBAMTCnByb2Nlc3NvcnOCCQD5azNQQkJjlTAMBgNVHRMEBTAD
13 | AQH/MA0GCSqGSIb3DQEBBQUAA4IBAQBJJdcAnFCxnRz8x1/Nta9J9ZGCVb/HM2LD
14 | AEh2sdbWoOZ4tvYtIxfvIsOQW/UfSUCfZnTXQE4qaCqT+r85QxmPE0UuFcTfbmU2
15 | 0u96m5J+tRGoFHI6/FegALtLpt2xQ5yEE2QWBXfKTjL+9TfKxcllyyGHJwKE1oOL
16 | BBAptNxfWRIZWFL73pdy4xmqAzhAE8zhFLmyy7VJY8fK5eewvGaACwKCMRBdBeAN
17 | IzPZwYG3VycJwRAoNBs3HhuIdgbKI95WpszQbxmo8MRBrwp3dxkwel6AHh4jyJiF
18 | cDYUoKp6V0NmtJ40jKPc1riN4BER5pO8aztXcyfp7GDb12/eY0Ur
19 | -----END CERTIFICATE-----
20 |
--------------------------------------------------------------------------------
/chapter4/generator/generator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
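# Assumed invocation: python generator.py <number_of_lines>
# Writes Apache combined-log-format entries into ../source/access_log_<timestamp>.log,
# drawing random values from the *.txt files alongside this script.
# Note: written for Python 2 (xrange and backtick-repr syntax).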
2 | import sys
3 | import time
4 | import datetime
5 | import random
6 | from pytz import timezone
7 |
8 | line_count = int(sys.argv[1])
9 | timestr = time.strftime("%Y%m%d-%H%M%S")
10 |
11 | f = open('../source/access_log_'+timestr+'.log','w')
12 |
13 | # ips
14 | with open('ips.txt') as ips_file:
15 | ips = ips_file.read().splitlines()
16 |
17 | # referers
18 | with open('referers.txt') as referers_file:
19 | referers = referers_file.read().splitlines()
20 |
21 | # resources
22 | with open('resources.txt') as resources_file:
23 | resources = resources_file.read().splitlines()
24 |
25 | # user agents
26 | with open('user_agents.txt') as user_agents_file:
27 | useragents = user_agents_file.read().splitlines()
28 |
29 | # codes
30 | with open('codes.txt') as codes_file:
31 | codes = codes_file.read().splitlines()
32 |
33 | # requests
34 | with open('requests.txt') as requests_file:
35 | requests = requests_file.read().splitlines()
36 |
37 | event_time = datetime.datetime(2013,10,10).replace(tzinfo=timezone('UTC'))
38 |
39 | for i in xrange(0,line_count):
40 | increment = datetime.timedelta(seconds=random.randint(30,300))
41 | event_time += increment
42 | uri = random.choice(resources)
43 | if uri.find("store")>0:  # append a pseudo order id to store URIs
44 | uri += `random.randint(1000,1500)`
45 | ip = random.choice(ips)
46 | useragent = random.choice(useragents)
47 | referer = random.choice(referers)
48 | code = random.choice(codes)
49 | request= random.choice(requests)
50 | f.write('%s - - [%s] "%s %s HTTP/1.0" %s %s "%s" "%s" \n' % (ip,event_time.strftime('%d/%b/%Y:%H:%M:%S %z'),request,uri,code,random.randint(2000,5000),referer,useragent))
--------------------------------------------------------------------------------
/chapter7/docker/generator/generator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import sys
3 | import time
4 | import datetime
5 | import random
6 | from pytz import timezone
7 |
8 | line_count = int(sys.argv[1])
9 | timestr = time.strftime("%Y%m%d-%H%M%S")
10 |
11 | f = open('../source/access_log_'+timestr+'.log','w')
12 |
13 | # ips
14 | with open('ips.txt') as ips_file:
15 | ips = ips_file.read().splitlines()
16 |
17 | # referers
18 | with open('referers.txt') as referers_file:
19 | referers = referers_file.read().splitlines()
20 |
21 | # resources
22 | with open('resources.txt') as resources_file:
23 | resources = resources_file.read().splitlines()
24 |
25 | # user agents
26 | with open('user_agents.txt') as user_agents_file:
27 | useragents = user_agents_file.read().splitlines()
28 |
29 | # codes
30 | with open('codes.txt') as codes_file:
31 | codes = codes_file.read().splitlines()
32 |
33 | # requests
34 | with open('requests.txt') as requests_file:
35 | requests = requests_file.read().splitlines()
36 |
37 | event_time = datetime.datetime(2013,10,10).replace(tzinfo=timezone('UTC'))
38 |
39 | for i in xrange(0,line_count):
40 | increment = datetime.timedelta(seconds=random.randint(30,300))
41 | event_time += increment
42 | uri = random.choice(resources)
43 | if uri.find("store")>0:  # append a pseudo order id to store URIs
44 | uri += `random.randint(1000,1500)`
45 | ip = random.choice(ips)
46 | useragent = random.choice(useragents)
47 | referer = random.choice(referers)
48 | code = random.choice(codes)
49 | request= random.choice(requests)
50 | f.write('%s - - [%s] "%s %s HTTP/1.0" %s %s "%s" "%s" \n' % (ip,event_time.strftime('%d/%b/%Y:%H:%M:%S %z'),request,uri,code,random.randint(2000,5000),referer,useragent))
--------------------------------------------------------------------------------
/chapter6/spark-enrich-and-ml/src/main/scala/org/apache/spark/examples/StreamingExamples.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.spark.Logging
21 | import org.apache.log4j.{Level, Logger}
22 |
23 | /** Utility functions for Spark Streaming examples. */
24 | object StreamingExamples extends Logging {
25 |
26 | /** Set reasonable logging levels for streaming if the user has not configured log4j. */
27 | def setStreamingLogLevels() {
28 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
29 | if (!log4jInitialized) {
30 | // We first log something to initialize Spark's default logging, then we override the
31 | // logging level.
32 | logInfo("Setting log level to [WARN] for streaming example." +
33 | " To override add a custom log4j.properties to the classpath.")
34 | Logger.getRootLogger.setLevel(Level.WARN)
35 | }
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/chapter5/spark-scala-streamer/src/main/scala/org/apress/examples/chapter4/StreamingExamples.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apress.examples.chapter4
19 |
20 | import org.apache.spark.Logging
21 | import org.apache.log4j.{Level, Logger}
22 |
23 | /** Utility functions for Spark Streaming examples. */
24 | object StreamingExamples extends Logging {
25 |
26 | /** Set reasonable logging levels for streaming if the user has not configured log4j. */
27 | def setStreamingLogLevels() {
28 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
29 | if (!log4jInitialized) {
30 | // We first log something to initialize Spark's default logging, then we override the
31 | // logging level.
32 | logInfo("Setting log level to [WARN] for streaming example." +
33 | " To override add a custom log4j.properties to the classpath.")
34 | Logger.getRootLogger.setLevel(Level.WARN)
35 | }
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/chapter7/docker/security/lumberjack.key:
--------------------------------------------------------------------------------
1 | -----BEGIN RSA PRIVATE KEY-----
2 | MIIEogIBAAKCAQEAnIDqokdntcXHt1ez4bsg2gpqYENF9cpd7Y/cJa393l++/nul
3 | 24HQQrrrQ3+gYSw1u+dAb9nh8ThDeUABBItxSAHpPB2ozxkHa3SmVf02XWSl5vGW
4 | VxsnYliBqCEqxtLXUzB8Az1FIZy0HLlCTSDCb5U2JHInIkqqxWTnHRQ4s9vXuai0
5 | ysxT3BPQh4d9ocpgdn80eQO+XCrlyxDaUsSRwY+MdO2G0vEqgiAgru6bndzlCsyP
6 | a19yMrruqwkErv16Yiws6Qmc4jxYDspR7xOfcrQl2N6SgqHtPL0Qv2rAmZhqcyl5
7 | V2lbEd7TWGqbLntYjtht43EyLB7TVoSi2SVFIwIDAQABAoIBAEgL6PgP2ySJCinT
8 | DBJMrRDzXok2Lwy5vofQhE4sTQSEG07X6UCmABPYLNoD9PcG/S2lhbp4GVWMyzMU
9 | TeOjteCqUI7GMKI2KX5EUxae3NodrWolqUosS/MwP3AA5Mpm4Y2AcnajTCHvG9Nl
10 | Tapn/FqV3Djm/DEDIp6AlwyB9KCHSdlietrdqncvIFY7kBv9E+O2wvD+8L5iw/mr
11 | 1IiirsJyb5Q6YeJ2E/x3a1eAL5RN3kLzUQdxbuVqJW1ebQIRVuZV0Y7+va8qrDsm
12 | ZcNtuEXjj25h2c3RHB9pbS9EYum6QrKJBgQNaf3Ol3/HnfB4DHgYH6stgS8L7P6i
13 | ae/Dg8ECgYEAz+jISccOKi7FnYemWyQSIlsuRH1Ew8dFEY3vtLSEqGtmGbW2b6M5
14 | YL+LcoiTaRTfGG7gvyPYVfu1CxIUEw8ZjzWUUL0rD6Rj5HYqPPBQB5M24E68Bm6y
15 | qE/GF+FAyNk4goRfy0Bt8bugXe35YuE3CwUTFvsZ7cbbf84DcKuS8TECgYEAwLQt
16 | Pc+N2nxDoXLwtfOai5ZYxbHoirNqNn6JEpLEbBoGeMhlhVoK7D1RBUBo84lV42kU
17 | w3op8HUxrsoWcL8UQmfDY+lXqhebomb/jOBSxqFLOz0uCN1iOlk9h8mZbbIKpICR
18 | GICt33VLiCXn6ftzvpPUTznSwijFTZJi8yGPppMCgYB4B+B6F52p5M7aH5J/U9fU
19 | VefkSp4EmWras6MMJJvhz/9mIG5vDUD3Bh6vxQG1zQLvzDOcpkkjeuoOtiU4y6Lp
20 | 9vjllBDkOvgg0ceY3vSVKvyni00qOYaTfVrUNot8aUwireHQKiZPRQ9UqysrVvK8
21 | PfXDcryJdiELLBj4V1XCQQKBgDOCde1lw3c5bCKFxM3+FbtmLsh5a71Xg/aZEC7E
22 | yHhU5JH4jxp8HRtUsImE1Aj1Ft44wnIV/4vk56jfH+x3RwURE2trfiFbOiDJA1o2
23 | xCmQB7oH+nwoIQ+TtxzKyJZH1wxtyVsRWmi2w7/a0gj7S88PWNrsi+eWWlcH4Cc5
24 | O1q9AoGANw3EWZ/z+48w5ZqEFcd0ThkY7uujknufRGM4v41tx5c2EWNBzHjXKtX5
25 | yXu11/uXmNCTQtzQHHUbJjYmhGOMrv8ohOeNOyzSpEoVtHeSllzNldLS7uVb3Fni
26 | Ae/BDC0ARoFjHcxAbusEF5vqhSSro22nMQIoJ8h7TILT9Cj0j4g=
27 | -----END RSA PRIVATE KEY-----
28 |
--------------------------------------------------------------------------------
/chapter7/docker/docker-compose.yml:
--------------------------------------------------------------------------------
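# 192.168.59.103 is commonly the default boot2docker / docker-machine VirtualBox VM
# address; adjust the extra_hosts entries below if the Docker host uses a different IP.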
1 | zookeeper1:
2 | image: bahaaldine/docker-zookeeper
3 | volumes:
4 | - "kafka/server1:/etc/kafka"
5 | - "logs/logstash-zookeeper1:/opt/kafka/logs/"
6 | ports:
7 | - "2181:2181"
8 | extra_hosts:
9 | - "brokers:192.168.59.103"
10 |
11 | kafka1:
12 | image: bahaaldine/docker-kafka
13 | volumes:
14 | - "kafka/server1:/etc/kafka"
15 | - "logs/logstash-kafka1:/opt/kafka/logs/"
16 | extra_hosts:
17 | - "brokers:192.168.59.103"
18 | ports:
19 | - "9092:9092"
20 |
21 | kafka2:
22 | image: bahaaldine/docker-kafka
23 | volumes:
24 | - "kafka/server2:/etc/kafka"
25 | - "logs/logstash-kafka2:/opt/kafka/logs/"
26 | extra_hosts:
27 | - "brokers:192.168.59.103"
28 | links:
29 | - "kafka1"
30 | ports:
31 | - "9093:9093"
32 |
33 | logstashProcessor1:
34 | image: bahaaldine/docker-logstash-agent
35 | volumes:
36 | - "logstash/processor:/etc/logstash"
37 | - "security:/etc/logstash/security"
38 | - "logs/logstash-processor1:/var/log/logstash"
39 | links:
40 | - kafka1
41 | ports:
42 | - "5043:5043"
43 |
44 | elasticsearch1:
45 | image: bahaaldine/docker-elasticsearch
46 | ports:
47 | - "9200:9200"
48 | volumes:
49 | - "logs/elasticsearch1:/var/log/elasticsearch"
50 | - "templates:/etc/elasticsearch/templates"
51 | extra_hosts:
52 | - "elasticsearch:192.168.59.103"
53 |
54 | logstashIndexer1:
55 | image: bahaaldine/docker-logstash-agent
56 | volumes:
57 | - "logstash/indexer:/etc/logstash"
58 | - "logs/logstash-indexer1:/var/log/logstash"
59 | links:
60 | - elasticsearch1
61 | extra_hosts:
62 | - "brokers:192.168.59.103"
63 |
64 | logstashForwarder:
65 | image: bahaaldine/docker-logstash-forwarder
66 | volumes:
67 | - "logstash/forwarder:/etc/logstash-forwarder"
68 | - "security:/etc/logstash-forwarder/security"
69 | - "logs/logstash-forwarder1:/tmp/logs/"
70 | - "source:/tmp/source"
71 | extra_hosts:
72 | - "processors:192.168.59.103"
73 | kibana1:
74 | image: bahaaldine/docker-kibana
75 | ports:
76 | - "5601:5601"
77 | volumes:
78 | - "logs/kibana:/var/log/kibana"
79 | extra_hosts:
80 | - "elasticsearch:192.168.59.103"
--------------------------------------------------------------------------------
/chapter5/spark-scala-streamer/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <modelVersion>4.0.0</modelVersion>
6 |   <groupId>org.apress</groupId>
7 |   <artifactId>spark-scala-streamer</artifactId>
8 |   <packaging>jar</packaging>
9 |   <name>Apress Streamer</name>
10 |   <url>http://spark.apache.org/</url>
11 |   <version>1.0.0</version>
12 | 
13 |   <properties>
14 |     <spark.version>1.4.0</spark.version>
15 |     <scala.binary.version>2.10</scala.binary.version>
16 |     <scala.version>2.10.0</scala.version>
17 |     <avro.version>1.7.7</avro.version>
18 |     <hadoop.deps.scope>compile</hadoop.deps.scope>
19 |   </properties>
20 | 
21 |   <dependencies>
22 |     <dependency>
23 |       <groupId>org.apache.spark</groupId>
24 |       <artifactId>spark-core_${scala.binary.version}</artifactId>
25 |       <version>${spark.version}</version>
26 |       <scope>provided</scope>
27 |     </dependency>
28 |     <dependency>
29 |       <groupId>org.apache.spark</groupId>
30 |       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
31 |       <version>${spark.version}</version>
32 |       <scope>provided</scope>
33 |     </dependency>
34 |     <dependency>
35 |       <groupId>org.apache.spark</groupId>
36 |       <artifactId>spark-mllib_${scala.binary.version}</artifactId>
37 |       <version>${spark.version}</version>
38 |       <scope>provided</scope>
39 |     </dependency>
40 |     <dependency>
41 |       <groupId>org.apache.spark</groupId>
42 |       <artifactId>spark-streaming-kafka_${scala.binary.version}</artifactId>
43 |       <version>${spark.version}</version>
44 |     </dependency>
45 |     <dependency>
46 |       <groupId>com.google.guava</groupId>
47 |       <artifactId>guava</artifactId>
48 |       <version>14.0.1</version>
49 |     </dependency>
50 |     <dependency>
51 |       <groupId>org.scala-lang</groupId>
52 |       <artifactId>scala-library</artifactId>
53 |       <version>${scala.version}</version>
54 |     </dependency>
55 |     <dependency>
56 |       <groupId>org.elasticsearch</groupId>
57 |       <artifactId>elasticsearch-hadoop</artifactId>
58 |       <version>2.1.0</version>
59 |     </dependency>
60 |     <dependency>
61 |       <groupId>org.apache.avro</groupId>
62 |       <artifactId>avro</artifactId>
63 |       <version>${avro.version}</version>
64 |       <scope>${hadoop.deps.scope}</scope>
65 |     </dependency>
66 |   </dependencies>
67 | 
68 |   <build>
69 |     <pluginManagement>
70 |       <plugins>
71 |         <plugin>
72 |           <groupId>net.alchim31.maven</groupId>
73 |           <artifactId>scala-maven-plugin</artifactId>
74 |           <version>3.1.5</version>
75 |         </plugin>
76 |         <plugin>
77 |           <groupId>org.apache.maven.plugins</groupId>
78 |           <artifactId>maven-compiler-plugin</artifactId>
79 |           <version>2.0.2</version>
80 |         </plugin>
81 |       </plugins>
82 |     </pluginManagement>
83 |     <plugins>
84 |       <plugin>
85 |         <groupId>net.alchim31.maven</groupId>
86 |         <artifactId>scala-maven-plugin</artifactId>
87 |       </plugin>
88 |       <plugin>
89 |         <artifactId>maven-assembly-plugin</artifactId>
90 |         <version>2.4</version>
91 |         <configuration>
92 |           <descriptorRefs>
93 |             <descriptorRef>jar-with-dependencies</descriptorRef>
94 |           </descriptorRefs>
95 |         </configuration>
96 |         <executions>
97 |           <execution>
98 |             <id>make-assembly</id>
99 |             <phase>package</phase>
100 |             <goals>
101 |               <goal>single</goal>
102 |             </goals>
103 |           </execution>
104 |         </executions>
105 |       </plugin>
106 |     </plugins>
107 |   </build>
108 | </project>
--------------------------------------------------------------------------------
/chapter6/spark-enrich-and-ml/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <modelVersion>4.0.0</modelVersion>
6 |   <groupId>org.apache.spark.examples</groupId>
7 |   <artifactId>spark-enrich-and-ml</artifactId>
8 |   <packaging>jar</packaging>
9 |   <name>Spark enrichment and machine learning</name>
10 |   <url>http://spark.apache.org/</url>
11 |   <version>1.0.0</version>
12 | 
13 |   <properties>
14 |     <spark.version>1.4.0</spark.version>
15 |     <scala.binary.version>2.10</scala.binary.version>
16 |     <scala.version>2.10.0</scala.version>
17 |     <avro.version>1.7.7</avro.version>
18 |     <hadoop.deps.scope>compile</hadoop.deps.scope>
19 |   </properties>
20 | 
21 |   <dependencies>
22 |     <dependency>
23 |       <groupId>org.apache.spark</groupId>
24 |       <artifactId>spark-core_${scala.binary.version}</artifactId>
25 |       <version>${spark.version}</version>
26 |       <scope>provided</scope>
27 |     </dependency>
28 |     <dependency>
29 |       <groupId>org.apache.spark</groupId>
30 |       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
31 |       <version>${spark.version}</version>
32 |       <scope>provided</scope>
33 |     </dependency>
34 |     <dependency>
35 |       <groupId>org.apache.spark</groupId>
36 |       <artifactId>spark-mllib_${scala.binary.version}</artifactId>
37 |       <version>${spark.version}</version>
38 |       <scope>provided</scope>
39 |     </dependency>
40 |     <dependency>
41 |       <groupId>org.apache.spark</groupId>
42 |       <artifactId>spark-streaming-kafka_${scala.binary.version}</artifactId>
43 |       <version>${spark.version}</version>
44 |     </dependency>
45 |     <dependency>
46 |       <groupId>com.google.guava</groupId>
47 |       <artifactId>guava</artifactId>
48 |       <version>14.0.1</version>
49 |     </dependency>
50 |     <dependency>
51 |       <groupId>org.scala-lang</groupId>
52 |       <artifactId>scala-library</artifactId>
53 |       <version>${scala.version}</version>
54 |     </dependency>
55 |     <dependency>
56 |       <groupId>org.elasticsearch</groupId>
57 |       <artifactId>elasticsearch-hadoop</artifactId>
58 |       <version>2.1.0</version>
59 |     </dependency>
60 |     <dependency>
61 |       <groupId>org.apache.avro</groupId>
62 |       <artifactId>avro</artifactId>
63 |       <version>${avro.version}</version>
64 |       <scope>${hadoop.deps.scope}</scope>
65 |     </dependency>
66 |   </dependencies>
67 | 
68 |   <build>
69 |     <pluginManagement>
70 |       <plugins>
71 |         <plugin>
72 |           <groupId>net.alchim31.maven</groupId>
73 |           <artifactId>scala-maven-plugin</artifactId>
74 |           <version>3.1.5</version>
75 |         </plugin>
76 |         <plugin>
77 |           <groupId>org.apache.maven.plugins</groupId>
78 |           <artifactId>maven-compiler-plugin</artifactId>
79 |           <version>2.0.2</version>
80 |         </plugin>
81 |       </plugins>
82 |     </pluginManagement>
83 |     <plugins>
84 |       <plugin>
85 |         <groupId>net.alchim31.maven</groupId>
86 |         <artifactId>scala-maven-plugin</artifactId>
87 |       </plugin>
88 |       <plugin>
89 |         <artifactId>maven-assembly-plugin</artifactId>
90 |         <version>2.4</version>
91 |         <configuration>
92 |           <descriptorRefs>
93 |             <descriptorRef>jar-with-dependencies</descriptorRef>
94 |           </descriptorRefs>
95 |         </configuration>
96 |         <executions>
97 |           <execution>
98 |             <id>make-assembly</id>
99 |             <phase>package</phase>
100 |             <goals>
101 |               <goal>single</goal>
102 |             </goals>
103 |           </execution>
104 |         </executions>
105 |       </plugin>
106 |     </plugins>
107 |   </build>
108 | </project>
--------------------------------------------------------------------------------
/chapter5/spark-scala-streamer/src/main/scala/org/apress/examples/chapter4/KafkaStreamer.scala:
--------------------------------------------------------------------------------
1 | package org.apress.examples.chapter4
2 |
3 | import kafka.serializer.StringDecoder
4 | import org.apache.spark.streaming._
5 | import org.apache.spark.streaming.kafka._
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
8 | import org.apache.spark.SparkContext
9 | import org.apache.spark.sql.SQLContext
10 | import scala.util.parsing.json.JSON
11 | import org.elasticsearch.spark._
12 | import org.elasticsearch.spark.rdd.EsSpark
13 | import scala.collection.mutable.HashMap
14 | import breeze.linalg.Axis._0
15 | import org.apache.spark.rdd.RDD
16 | import scala.collection.mutable.MutableList
17 |
18 | case class PageStatistic (
19 | verbs:List[Map[String, Integer]]
20 | )
21 |
22 | case class Clickstream (
23 | message:String,
24 | version:String,
25 | file:String,
26 | host:String,
27 | offset:String,
28 | eventType:String,
29 | clientip:String,
30 | ident:String,
31 | auth:String,
32 | timestamp:String,
33 | verb:String,
34 | request:String,
35 | httpVersion:String,
36 | response:String,
37 | bytes:Integer,
38 | referrer:String,
39 | agent:String
40 | )
41 |
42 | object KafkaStreamer {
43 | def main(args: Array[String]) {
44 | if (args.length < 2) {
45 | System.err.println(s"""
46 | |Usage: DirectKafkaWordCount <brokers> <topics>
47 | |  <brokers> is a list of one or more Kafka brokers
48 | |  <topics> is a list of one or more kafka topics to consume from
49 | |
50 | """.stripMargin)
51 | System.exit(1)
52 | }
53 |
54 | StreamingExamples.setStreamingLogLevels()
55 |
56 | // Create context with 2 second batch interval
57 | val sparkConf = new SparkConf().setAppName("KafkaStreamerToElasticsearch")
58 | sparkConf.set("es.index.auto.create", "true")
59 | sparkConf.set("es.nodes", "localhost:9200")
60 | //sparkConf.set("es.net.http.auth.user", "bahaaldine")
61 | //sparkConf.set("es.net.http.auth.pass", "bazarmi")
62 | val ssc = new StreamingContext(sparkConf, Seconds(2))
63 |
64 | // Create direct kafka stream with brokers and topics
65 | val Array(brokers, topics) = args
66 | val topicsSet = topics.split(",").toSet
67 | val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
68 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
69 | ssc, kafkaParams, topicsSet)
70 | // Extract each message payload, parse it as JSON, map it to a Clickstream, and count HTTP verbs
71 | val lines = messages.map(_._2)
72 | val parsedEvents = lines.map(JSON.parseFull(_)).map(_.get.asInstanceOf[scala.collection.immutable.Map[String,Any]])
73 | val events = parsedEvents.map(data=>Clickstream(
74 | data("message").toString
75 | ,data("@version").toString
76 | ,data("file").toString
77 | ,data("host").toString
78 | ,data("offset").toString
79 | ,data("type").toString
80 | ,data("clientip").toString
81 | ,data("ident").toString
82 | ,data("auth").toString
83 | ,data("timestamp").toString
84 | ,data("verb").toString
85 | ,data("request").toString
86 | ,data("httpversion").toString
87 | ,data("response").toString
88 | ,Integer.parseInt(data("bytes").toString)
89 | ,data("referrer").toString
90 | ,data("agent").toString
91 | ))
92 |
93 | val counts = events.map(event => event.verb).countByValue()
94 | counts.print()
95 |
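// Persist each non-empty micro-batch of (verb, count) pairs to the
// Elasticsearch "spark/clickstream" index/type via elasticsearch-hadoop.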
96 | counts.foreachRDD{ rdd =>
97 | if (rdd.toLocalIterator.nonEmpty) {
98 | var array:Array[(String, Long)] = rdd.collect()
99 | EsSpark.saveToEs(rdd, "spark/clickstream")
100 | //EsSpark.saveToEs(ssc.sparkContext.makeRDD(Seq(Map("id" -> 123, array(0)._1 -> array(0)._2))), "spark/clickstream", Map("es.mapping.id" -> "id"))
101 | }
102 | }
103 |
104 |
105 | // Start the computation
106 | ssc.start()
107 | ssc.awaitTermination()
108 | }
109 | }
110 |
--------------------------------------------------------------------------------
/chapter5/spark-scala-streamer/src/main/scala/clickstream/PageViewGenerator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples.streaming.clickstream
19 |
20 | import java.net.ServerSocket
21 | import java.io.PrintWriter
22 | import util.Random
23 |
24 | /** Represents a page view on a website with associated dimension data. */
25 | class PageView(val url : String, val status : Int, val zipCode : Int, val userID : Int)
26 | extends Serializable {
27 | override def toString() : String = {
28 | "%s\t%s\t%s\t%s\n".format(url, status, zipCode, userID)
29 | }
30 | }
31 |
32 | object PageView extends Serializable {
33 | def fromString(in : String) : PageView = {
34 | val parts = in.split("\t")
35 | new PageView(parts(0), parts(1).toInt, parts(2).toInt, parts(3).toInt)
36 | }
37 | }
38 |
39 | // scalastyle:off
40 | /** Generates streaming events to simulate page views on a website.
41 | *
42 | * This should be used in tandem with PageViewStream.scala. Example:
43 | *
44 | * To run the generator
45 | * `$ bin/run-example org.apache.spark.examples.streaming.clickstream.PageViewGenerator 44444 10`
46 | * To process the generated stream
47 | * `$ bin/run-example \
48 | * org.apache.spark.examples.streaming.clickstream.PageViewStream errorRatePerZipCode localhost 44444`
49 | *
50 | */
51 | // scalastyle:on
52 | object PageViewGenerator {
53 | val pages = Map("http://foo.com/" -> .7,
54 | "http://foo.com/news" -> 0.2,
55 | "http://foo.com/contact" -> .1)
56 | val httpStatus = Map(200 -> .95,
57 | 404 -> .05)
58 | val userZipCode = Map(94709 -> .5,
59 | 94117 -> .5)
60 | val userID = Map((1 to 100).map(_ -> .01) : _*)
61 |
62 | def pickFromDistribution[T](inputMap : Map[T, Double]) : T = {
63 | val rand = new Random().nextDouble()
64 | var total = 0.0
65 | for ((item, prob) <- inputMap) {
66 | total = total + prob
67 | if (total > rand) {
68 | return item
69 | }
70 | }
71 | inputMap.take(1).head._1 // Shouldn't get here if probabilities add up to 1.0
72 | }
73 |
74 | def getNextClickEvent() : String = {
75 | val id = pickFromDistribution(userID)
76 | val page = pickFromDistribution(pages)
77 | val status = pickFromDistribution(httpStatus)
78 | val zipCode = pickFromDistribution(userZipCode)
79 | new PageView(page, status, zipCode, id).toString()
80 | }
81 |
82 | def main(args : Array[String]) {
83 | if (args.length != 2) {
84 | System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>")
85 | System.exit(1)
86 | }
87 | val port = args(0).toInt
88 | val viewsPerSecond = args(1).toFloat
89 | val sleepDelayMs = (1000.0 / viewsPerSecond).toInt
90 | val listener = new ServerSocket(port)
91 | println("Listening on port: " + port)
92 |
93 | while (true) {
94 | val socket = listener.accept()
95 | new Thread() {
96 | override def run(): Unit = {
97 | println("Got client connected from: " + socket.getInetAddress)
98 | val out = new PrintWriter(socket.getOutputStream(), true)
99 |
100 | while (true) {
101 | Thread.sleep(sleepDelayMs)
102 | out.write(getNextClickEvent())
103 | out.flush()
104 | }
105 | socket.close()
106 | }
107 | }.start()
108 | }
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/chapter5/spark-scala-streamer/src/main/scala/clickstream/PageViewStream.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples.streaming.clickstream
19 |
20 | import org.apache.spark.SparkContext._
21 | import org.apache.spark.streaming.{Seconds, StreamingContext}
22 | import org.apress.examples.chapter4.StreamingExamples
23 | // scalastyle:off
24 | /** Analyses a streaming dataset of web page views. This class demonstrates several types of
25 | * operators available in Spark streaming.
26 | *
27 | * This should be used in tandem with PageViewGenerator.scala. Example:
28 | * To run the generator
29 | * `$ bin/run-example org.apache.spark.examples.streaming.clickstream.PageViewGenerator 44444 10`
30 | * To process the generated stream
31 | * `$ bin/run-example \
32 | * org.apache.spark.examples.streaming.clickstream.PageViewStream errorRatePerZipCode localhost 44444`
33 | */
34 | // scalastyle:on
35 | object PageViewStream {
36 | def main(args: Array[String]) {
37 | if (args.length != 3) {
38 | System.err.println("Usage: PageViewStream <metric> <host> <port>")
39 | System.err.println("  <metric> must be one of pageCounts, slidingPageCounts," +
40 | " errorRatePerZipCode, activeUserCount, popularUsersSeen")
41 | System.exit(1)
42 | }
43 | StreamingExamples.setStreamingLogLevels()
44 | val metric = args(0)
45 | val host = args(1)
46 | val port = args(2).toInt
47 |
48 | // Create the context
49 | val ssc = new StreamingContext("local[2]", "PageViewStream", Seconds(1),
50 | System.getenv("SPARK_HOME"), StreamingContext.jarOfClass(this.getClass).toSeq)
51 |
52 | // Create a ReceiverInputDStream on target host:port and convert each line to a PageView
53 | val pageViews = ssc.socketTextStream(host, port)
54 | .flatMap(_.split("\n"))
55 | .map(PageView.fromString(_))
56 |
57 | // Return a count of views per URL seen in each batch
58 | val pageCounts = pageViews.map(view => view.url).countByValue()
59 |
60 | // Return a sliding window of page views per URL in the last ten seconds
61 | val slidingPageCounts = pageViews.map(view => view.url)
62 | .countByValueAndWindow(Seconds(10), Seconds(2))
63 |
64 |
65 | // Return the rate of error pages (a non 200 status) in each zip code over the last 30 seconds
66 | val statusesPerZipCode = pageViews.window(Seconds(30), Seconds(2))
67 | .map(view => ((view.zipCode, view.status)))
68 | .groupByKey()
69 | val errorRatePerZipCode = statusesPerZipCode.map{
70 | case(zip, statuses) =>
71 | val normalCount = statuses.filter(_ == 200).size
72 | val errorCount = statuses.size - normalCount
73 | val errorRatio = errorCount.toFloat / statuses.size
74 | if (errorRatio > 0.05) {
75 | "%s: **%s**".format(zip, errorRatio)
76 | } else {
77 | "%s: %s".format(zip, errorRatio)
78 | }
79 | }
80 |
81 | // Return the number of unique users in the last 15 seconds
82 | val activeUserCount = pageViews.window(Seconds(15), Seconds(2))
83 | .map(view => (view.userID, 1))
84 | .groupByKey()
85 | .count()
86 | .map("Unique active users: " + _)
87 |
88 | // An external dataset we want to join to this stream
89 | val userList = ssc.sparkContext.parallelize(
90 | Map(1 -> "Patrick Wendell", 2->"Reynold Xin", 3->"Matei Zaharia").toSeq)
91 |
92 | metric match {
93 | case "pageCounts" => pageCounts.print()
94 | case "slidingPageCounts" => slidingPageCounts.print()
95 | case "errorRatePerZipCode" => errorRatePerZipCode.print()
96 | case "activeUserCount" => activeUserCount.print()
97 | case "popularUsersSeen" =>
98 | // Look for users in our existing dataset and print it out if we have a match
99 | pageViews.map(view => (view.userID, 1))
100 | .foreachRDD((rdd, time) => rdd.join(userList)
101 | .map(_._2._2)
102 | .take(10)
103 | .foreach(u => println("Saw user %s at time %s".format(u, time))))
104 | case _ => println("Invalid metric entered: " + metric)
105 | }
106 |
107 | ssc.start()
108 | }
109 | }
110 |
--------------------------------------------------------------------------------
/chapter7/docker/kafka/server1/server.properties:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # see kafka.server.KafkaConfig for additional details and defaults
16 |
17 | ############################# Server Basics #############################
18 |
19 | # The id of the broker. This must be set to a unique integer for each broker.
20 | broker.id=1
21 |
22 | ############################# Socket Server Settings #############################
23 |
24 | # The port the socket server listens on
25 | port=9092
26 |
27 | # Hostname the broker will bind to. If not set, the server will bind to all interfaces
28 | #host.name=brokers
29 |
30 | # Hostname the broker will advertise to producers and consumers. If not set, it uses the
31 | # value for "host.name" if configured. Otherwise, it will use the value returned from
32 | # java.net.InetAddress.getCanonicalHostName().
33 | advertised.host.name=192.168.59.103
34 |
35 | # The port to publish to ZooKeeper for clients to use. If this is not set,
36 | # it will publish the same port that the broker binds to.
37 | #advertised.port=
38 |
39 | # The number of threads handling network requests
40 | num.network.threads=3
41 |
42 | # The number of threads doing disk I/O
43 | num.io.threads=8
44 |
45 | # The send buffer (SO_SNDBUF) used by the socket server
46 | socket.send.buffer.bytes=102400
47 |
48 | # The receive buffer (SO_RCVBUF) used by the socket server
49 | socket.receive.buffer.bytes=102400
50 |
51 | # The maximum size of a request that the socket server will accept (protection against OOM)
52 | socket.request.max.bytes=104857600
53 |
54 |
55 | ############################# Log Basics #############################
56 |
57 | # A comma separated list of directories under which to store log files
58 | log.dirs=/tmp/kafka-logs-1
59 |
60 | # The default number of log partitions per topic. More partitions allow greater
61 | # parallelism for consumption, but this will also result in more files across
62 | # the brokers.
63 | num.partitions=1
64 |
65 | # The number of threads per data directory to be used for log recovery at startup and flushing at shutdown.
66 | # This value is recommended to be increased for installations with data dirs located in a RAID array.
67 | num.recovery.threads.per.data.dir=1
68 |
69 | ############################# Log Flush Policy #############################
70 |
71 | # Messages are immediately written to the filesystem but by default we only fsync() to sync
72 | # the OS cache lazily. The following configurations control the flush of data to disk.
73 | # There are a few important trade-offs here:
74 | # 1. Durability: Unflushed data may be lost if you are not using replication.
75 | # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush.
76 | # 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks.
77 | # The settings below allow one to configure the flush policy to flush data after a period of time or
78 | # every N messages (or both). This can be done globally and overridden on a per-topic basis.
79 |
80 | # The number of messages to accept before forcing a flush of data to disk
81 | #log.flush.interval.messages=10000
82 |
83 | # The maximum amount of time a message can sit in a log before we force a flush
84 | #log.flush.interval.ms=1000
85 |
86 | ############################# Log Retention Policy #############################
87 |
88 | # The following configurations control the disposal of log segments. The policy can
89 | # be set to delete segments after a period of time, or after a given size has accumulated.
90 | # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens
91 | # from the end of the log.
92 |
93 | # The minimum age of a log file to be eligible for deletion
94 | log.retention.hours=168
95 |
96 | # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining
97 | # segments don't drop below log.retention.bytes.
98 | #log.retention.bytes=1073741824
99 |
100 | # The maximum size of a log segment file. When this size is reached a new log segment will be created.
101 | log.segment.bytes=1073741824
102 |
103 | # The interval at which log segments are checked to see if they can be deleted according
104 | # to the retention policies
105 | log.retention.check.interval.ms=300000
106 |
107 | # By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires.
108 | # If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction.
109 | log.cleaner.enable=false
110 |
111 | ############################# Zookeeper #############################
112 |
113 | # Zookeeper connection string (see zookeeper docs for details).
114 | # This is a comma separated host:port pairs, each corresponding to a zk
115 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002".
116 | # You can also append an optional chroot string to the urls to specify the
117 | # root directory for all kafka znodes.
118 | zookeeper.connect=brokers:2181
119 |
120 | # Timeout in ms for connecting to zookeeper
121 | zookeeper.connection.timeout.ms=6000
122 |
--------------------------------------------------------------------------------
/chapter7/docker/kafka/server2/server.properties:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # see kafka.server.KafkaConfig for additional details and defaults
16 |
17 | ############################# Server Basics #############################
18 |
19 | # The id of the broker. This must be set to a unique integer for each broker.
20 | broker.id=2
21 |
22 | ############################# Socket Server Settings #############################
23 |
24 | # The port the socket server listens on
25 | port=9093
26 |
27 | # Hostname the broker will bind to. If not set, the server will bind to all interfaces
28 | #host.name=brokers
29 |
30 | # Hostname the broker will advertise to producers and consumers. If not set, it uses the
31 | # value for "host.name" if configured. Otherwise, it will use the value returned from
32 | # java.net.InetAddress.getCanonicalHostName().
33 | advertised.host.name=192.168.59.103
34 |
35 | # The port to publish to ZooKeeper for clients to use. If this is not set,
36 | # it will publish the same port that the broker binds to.
37 | #advertised.port=
38 |
39 | # The number of threads handling network requests
40 | num.network.threads=3
41 |
42 | # The number of threads doing disk I/O
43 | num.io.threads=8
44 |
45 | # The send buffer (SO_SNDBUF) used by the socket server
46 | socket.send.buffer.bytes=102400
47 |
48 | # The receive buffer (SO_RCVBUF) used by the socket server
49 | socket.receive.buffer.bytes=102400
50 |
51 | # The maximum size of a request that the socket server will accept (protection against OOM)
52 | socket.request.max.bytes=104857600
53 |
54 |
55 | ############################# Log Basics #############################
56 |
57 | # A comma separated list of directories under which to store log files
58 | log.dirs=/tmp/kafka-logs-2
59 |
60 | # The default number of log partitions per topic. More partitions allow greater
61 | # parallelism for consumption, but this will also result in more files across
62 | # the brokers.
63 | num.partitions=1
64 |
65 | # The number of threads per data directory to be used for log recovery at startup and flushing at shutdown.
66 | # This value is recommended to be increased for installations with data dirs located in a RAID array.
67 | num.recovery.threads.per.data.dir=1
68 |
69 | ############################# Log Flush Policy #############################
70 |
71 | # Messages are immediately written to the filesystem but by default we only fsync() to sync
72 | # the OS cache lazily. The following configurations control the flush of data to disk.
73 | # There are a few important trade-offs here:
74 | # 1. Durability: Unflushed data may be lost if you are not using replication.
75 | # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush.
76 | # 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks.
77 | # The settings below allow one to configure the flush policy to flush data after a period of time or
78 | # every N messages (or both). This can be done globally and overridden on a per-topic basis.
79 |
80 | # The number of messages to accept before forcing a flush of data to disk
81 | #log.flush.interval.messages=10000
82 |
83 | # The maximum amount of time a message can sit in a log before we force a flush
84 | #log.flush.interval.ms=1000
85 |
86 | ############################# Log Retention Policy #############################
87 |
88 | # The following configurations control the disposal of log segments. The policy can
89 | # be set to delete segments after a period of time, or after a given size has accumulated.
90 | # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens
91 | # from the end of the log.
92 |
93 | # The minimum age of a log file to be eligible for deletion
94 | log.retention.hours=168
95 |
96 | # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining
97 | # segments don't drop below log.retention.bytes.
98 | #log.retention.bytes=1073741824
99 |
100 | # The maximum size of a log segment file. When this size is reached a new log segment will be created.
101 | log.segment.bytes=1073741824
102 |
103 | # The interval at which log segments are checked to see if they can be deleted according
104 | # to the retention policies
105 | log.retention.check.interval.ms=300000
106 |
107 | # By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires.
108 | # If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction.
109 | log.cleaner.enable=false
110 |
111 | ############################# Zookeeper #############################
112 |
113 | # Zookeeper connection string (see zookeeper docs for details).
114 | # This is a comma-separated list of host:port pairs, each corresponding to a zk
115 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002".
116 | # You can also append an optional chroot string to the urls to specify the
117 | # root directory for all kafka znodes.
118 | zookeeper.connect=brokers:2181
119 |
120 | # Timeout in ms for connecting to zookeeper
121 | zookeeper.connection.timeout.ms=6000
122 |
--------------------------------------------------------------------------------
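
A note on the broker definition above: advertised.host.name=192.168.59.103 is the address this broker publishes to ZooKeeper, so clients running outside the Docker VM must reach it through that IP rather than through the container hostname (brokers). Below is a minimal producer sketch illustrating this, assuming the org.apache.kafka.clients producer is on the classpath; the topic name "clickstream" is a placeholder, not necessarily the topic used by the Logstash processor.

    import java.util.Properties
    import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

    object DockerBrokerSmokeTest {
      def main(args: Array[String]): Unit = {
        val props = new Properties()
        // Reach the broker through the advertised Docker VM address, not the container hostname
        props.put("bootstrap.servers", "192.168.59.103:9093")
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

        val producer = new KafkaProducer[String, String](props)
        // "clickstream" is an illustrative topic name; replace it with the topic your pipeline uses
        producer.send(new ProducerRecord[String, String]("clickstream", """{"message":"smoke test"}"""))
        producer.close()
      }
    }

If the send hangs or times out, the advertised address is usually the first thing to check.
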
/chapter4/kafka/server-1.properties:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # see kafka.server.KafkaConfig for additional details and defaults
16 |
17 | ############################# Server Basics #############################
18 |
19 | # The id of the broker. This must be set to a unique integer for each broker.
20 | broker.id=1
21 |
22 | ############################# Socket Server Settings #############################
23 |
24 | # The port the socket server listens on
25 | port=9092
26 |
27 | # Hostname the broker will bind to. If not set, the server will bind to all interfaces
28 | #host.name=localhost
29 |
30 | # Hostname the broker will advertise to producers and consumers. If not set, it uses the
31 | # value for "host.name" if configured. Otherwise, it will use the value returned from
32 | # java.net.InetAddress.getCanonicalHostName().
33 | #advertised.host.name=
34 |
35 | # The port to publish to ZooKeeper for clients to use. If this is not set,
36 | # it will publish the same port that the broker binds to.
37 | #advertised.port=
38 |
39 | # The number of threads handling network requests
40 | num.network.threads=3
41 |
42 | # The number of threads doing disk I/O
43 | num.io.threads=8
44 |
45 | # The send buffer (SO_SNDBUF) used by the socket server
46 | socket.send.buffer.bytes=102400
47 |
48 | # The receive buffer (SO_RCVBUF) used by the socket server
49 | socket.receive.buffer.bytes=102400
50 |
51 | # The maximum size of a request that the socket server will accept (protection against OOM)
52 | socket.request.max.bytes=104857600
53 |
54 |
55 | ############################# Log Basics #############################
56 |
57 | # A comma separated list of directories under which to store log files
58 | log.dirs=/tmp/kafka-logs-1
59 |
60 | # The default number of log partitions per topic. More partitions allow greater
61 | # parallelism for consumption, but this will also result in more files across
62 | # the brokers.
63 | num.partitions=1
64 |
65 | # The number of threads per data directory to be used for log recovery at startup and flushing at shutdown.
66 | # This value is recommended to be increased for installations with data dirs located in RAID array.
67 | num.recovery.threads.per.data.dir=1
68 |
69 | ############################# Log Flush Policy #############################
70 |
71 | # Messages are immediately written to the filesystem but by default we only fsync() to sync
72 | # the OS cache lazily. The following configurations control the flush of data to disk.
73 | # There are a few important trade-offs here:
74 | # 1. Durability: Unflushed data may be lost if you are not using replication.
75 | # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush.
76 | # 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks.
77 | # The settings below allow one to configure the flush policy to flush data after a period of time or
78 | # every N messages (or both). This can be done globally and overridden on a per-topic basis.
79 |
80 | # The number of messages to accept before forcing a flush of data to disk
81 | #log.flush.interval.messages=10000
82 |
83 | # The maximum amount of time a message can sit in a log before we force a flush
84 | #log.flush.interval.ms=1000
85 |
86 | ############################# Log Retention Policy #############################
87 |
88 | # The following configurations control the disposal of log segments. The policy can
89 | # be set to delete segments after a period of time, or after a given size has accumulated.
90 | # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens
91 | # from the end of the log.
92 |
93 | # The minimum age of a log file to be eligible for deletion
94 | log.retention.hours=168
95 |
96 | # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining
97 | # segments don't drop below log.retention.bytes.
98 | #log.retention.bytes=1073741824
99 |
100 | # The maximum size of a log segment file. When this size is reached a new log segment will be created.
101 | log.segment.bytes=1073741824
102 |
103 | # The interval at which log segments are checked to see if they can be deleted according
104 | # to the retention policies
105 | log.retention.check.interval.ms=300000
106 |
107 | # By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires.
108 | # If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction.
109 | log.cleaner.enable=false
110 |
111 | ############################# Zookeeper #############################
112 |
113 | # Zookeeper connection string (see zookeeper docs for details).
114 | # This is a comma-separated list of host:port pairs, each corresponding to a zk
115 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002".
116 | # You can also append an optional chroot string to the urls to specify the
117 | # root directory for all kafka znodes.
118 | zookeeper.connect=localhost:2181
119 |
120 | # Timeout in ms for connecting to zookeeper
121 | zookeeper.connection.timeout.ms=6000
122 |
--------------------------------------------------------------------------------
/chapter4/kafka/server-2.properties:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # see kafka.server.KafkaConfig for additional details and defaults
16 |
17 | ############################# Server Basics #############################
18 |
19 | # The id of the broker. This must be set to a unique integer for each broker.
20 | broker.id=2
21 |
22 | ############################# Socket Server Settings #############################
23 |
24 | # The port the socket server listens on
25 | port=9093
26 |
27 | # Hostname the broker will bind to. If not set, the server will bind to all interfaces
28 | #host.name=localhost
29 |
30 | # Hostname the broker will advertise to producers and consumers. If not set, it uses the
31 | # value for "host.name" if configured. Otherwise, it will use the value returned from
32 | # java.net.InetAddress.getCanonicalHostName().
33 | #advertised.host.name=
34 |
35 | # The port to publish to ZooKeeper for clients to use. If this is not set,
36 | # it will publish the same port that the broker binds to.
37 | #advertised.port=
38 |
39 | # The number of threads handling network requests
40 | num.network.threads=3
41 |
42 | # The number of threads doing disk I/O
43 | num.io.threads=8
44 |
45 | # The send buffer (SO_SNDBUF) used by the socket server
46 | socket.send.buffer.bytes=102400
47 |
48 | # The receive buffer (SO_RCVBUF) used by the socket server
49 | socket.receive.buffer.bytes=102400
50 |
51 | # The maximum size of a request that the socket server will accept (protection against OOM)
52 | socket.request.max.bytes=104857600
53 |
54 |
55 | ############################# Log Basics #############################
56 |
57 | # A comma separated list of directories under which to store log files
58 | log.dirs=/tmp/kafka-logs-2
59 |
60 | # The default number of log partitions per topic. More partitions allow greater
61 | # parallelism for consumption, but this will also result in more files across
62 | # the brokers.
63 | num.partitions=1
64 |
65 | # The number of threads per data directory to be used for log recovery at startup and flushing at shutdown.
66 | # This value is recommended to be increased for installations with data dirs located in RAID array.
67 | num.recovery.threads.per.data.dir=1
68 |
69 | ############################# Log Flush Policy #############################
70 |
71 | # Messages are immediately written to the filesystem but by default we only fsync() to sync
72 | # the OS cache lazily. The following configurations control the flush of data to disk.
73 | # There are a few important trade-offs here:
74 | # 1. Durability: Unflushed data may be lost if you are not using replication.
75 | # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush.
76 | # 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks.
77 | # The settings below allow one to configure the flush policy to flush data after a period of time or
78 | # every N messages (or both). This can be done globally and overridden on a per-topic basis.
79 |
80 | # The number of messages to accept before forcing a flush of data to disk
81 | #log.flush.interval.messages=10000
82 |
83 | # The maximum amount of time a message can sit in a log before we force a flush
84 | #log.flush.interval.ms=1000
85 |
86 | ############################# Log Retention Policy #############################
87 |
88 | # The following configurations control the disposal of log segments. The policy can
89 | # be set to delete segments after a period of time, or after a given size has accumulated.
90 | # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens
91 | # from the end of the log.
92 |
93 | # The minimum age of a log file to be eligible for deletion
94 | log.retention.hours=168
95 |
96 | # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining
97 | # segments don't drop below log.retention.bytes.
98 | #log.retention.bytes=1073741824
99 |
100 | # The maximum size of a log segment file. When this size is reached a new log segment will be created.
101 | log.segment.bytes=1073741824
102 |
103 | # The interval at which log segments are checked to see if they can be deleted according
104 | # to the retention policies
105 | log.retention.check.interval.ms=300000
106 |
107 | # By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires.
108 | # If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction.
109 | log.cleaner.enable=false
110 |
111 | ############################# Zookeeper #############################
112 |
113 | # Zookeeper connection string (see zookeeper docs for details).
114 | # This is a comma-separated list of host:port pairs, each corresponding to a zk
115 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002".
116 | # You can also append an optional chroot string to the urls to specify the
117 | # root directory for all kafka znodes.
118 | zookeeper.connect=localhost:2181
119 |
120 | # Timeout in ms for connecting to zookeeper
121 | zookeeper.connection.timeout.ms=6000
122 |
--------------------------------------------------------------------------------
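
server-1.properties (broker.id=1, port 9092) and server-2.properties (broker.id=2, port 9093) together define a local two-broker cluster that shares the ZooKeeper instance at localhost:2181. The sketch below is one way to read back whatever lands on that cluster; it assumes a Kafka client that ships the newer consumer API (0.9 or later), and the topic and group names are placeholders.

    import java.util.{Arrays, Properties}
    import org.apache.kafka.clients.consumer.KafkaConsumer
    import scala.collection.JavaConverters._

    object LocalClusterConsumer {
      def main(args: Array[String]): Unit = {
        val props = new Properties()
        // Both brokers defined by server-1.properties (9092) and server-2.properties (9093)
        props.put("bootstrap.servers", "localhost:9092,localhost:9093")
        props.put("group.id", "smoke-test")          // illustrative consumer group
        props.put("auto.offset.reset", "earliest")
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")

        val consumer = new KafkaConsumer[String, String](props)
        consumer.subscribe(Arrays.asList("clickstream")) // illustrative topic name

        // Poll a few times and print whatever arrives
        for (_ <- 1 to 5) {
          consumer.poll(1000L).asScala.foreach(record => println(record.value()))
        }
        consumer.close()
      }
    }
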
/chapter6/spark-enrich-and-ml/src/main/scala/org/apache/spark/examples/SparkEnricher.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.examples
2 |
3 | import kafka.serializer.StringDecoder
4 | import org.apache.spark.streaming._
5 | import org.apache.spark.streaming.kafka._
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
8 | import org.apache.spark.SparkContext
9 | import org.apache.spark.sql.SQLContext
10 | import scala.util.parsing.json.JSON
11 | import org.elasticsearch.spark._
12 | import org.elasticsearch.spark.rdd.EsSpark
13 | import scala.collection.mutable.HashMap
14 | import breeze.linalg.Axis._0
15 | import org.apache.spark.rdd.RDD
16 | import scala.collection.mutable.MutableList
17 | import org.apache.spark.mllib.linalg.Vectors
18 | import org.apache.spark.mllib.clustering.KMeans
19 | import org.apache.spark.mllib.clustering.StreamingKMeans
20 | import org.apache.spark.mllib.regression.LabeledPoint
21 |
22 | case class Clickstream (
23 | message:String,
24 | version:String,
25 | file:String,
26 | host:String,
27 | offset:String,
28 | eventType:String,
29 | clientip:String,
30 | ident:String,
31 | auth:String,
32 | timestamp:String,
33 | verb:String,
34 | request:String,
35 | httpVersion:String,
36 | response:String,
37 | bytes:Integer,
38 | referrer:String,
39 | agent:String
40 | )
41 |
42 | case class Customer (
43 | session:String,
44 | request:String,
45 | category:String
46 | )
47 |
48 |
49 | object SparkEnricher {
50 | def main(args: Array[String]) {
51 | if (args.length < 2) {
52 | System.err.println(s"""
53 | |Usage: SparkEnricher <brokers> <topics>
54 | |  <brokers> is a list of one or more Kafka brokers
55 | |  <topics> is a list of one or more Kafka topics to consume from
56 | |
57 | """.stripMargin)
58 | System.exit(1)
59 | }
60 |
61 | StreamingExamples.setStreamingLogLevels()
62 |
63 | // Create context with 2 second batch interval
64 | val sparkConf = new SparkConf().setAppName("KafkaStreamerToElasticsearch")
65 | sparkConf.set("es.index.auto.create", "true")
66 | sparkConf.set("es.nodes", "localhost:9200")
67 | //sparkConf.set("es.net.http.auth.user", "bahaaldine")
68 | //sparkConf.set("es.net.http.auth.pass", "bazarmi")
69 | val ssc = new StreamingContext(sparkConf, Seconds(2))
70 |
71 | val productCategoryMappingFile = ssc.sparkContext.textFile("/Users/bahaaldine/Google Drive/demo/v2/clickstream/generator/mappings.csv")
72 | val productCategoryMapping = productCategoryMappingFile.map(line => line.split(",")).map(x => (x(0),x(1))).collectAsMap()
73 | val categoryLabelMapping:scala.collection.Map[String,Double] = productCategoryMappingFile.map(line => line.split(",")).map(x => (x(1),x(2).toDouble)).collectAsMap()
74 | val brodcastProductCategoryMapping = ssc.sparkContext.broadcast(productCategoryMapping)
75 | val brodcastCategoryLabelMapping = ssc.sparkContext.broadcast(categoryLabelMapping)
76 |
77 | val customerMappingFile = ssc.sparkContext.textFile("/Users/bahaaldine/Google Drive/demo/v2/clickstream/generator/ip_mappings.csv")
78 | val ipLabelMapping:scala.collection.Map[String,Double] = customerMappingFile.map(line => line.split(",")).map(x => (x(0),x(1).toDouble)).collectAsMap()
79 | val brodcastIpLabelMapping = ssc.sparkContext.broadcast(ipLabelMapping)
80 |
81 | // Create direct kafka stream with brokers and topics
82 | val Array(brokers, topics) = args
83 | val topicsSet = topics.split(",").toSet
84 | val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
85 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
86 | ssc, kafkaParams, topicsSet)
87 | // Extract the JSON payload of each Kafka message and map it to a Clickstream event
88 | val lines = messages.map(_._2)
89 | val parsedEvents = lines.map(JSON.parseFull(_)).map(_.get.asInstanceOf[scala.collection.immutable.Map[String,Any]])
90 | val events = parsedEvents.map(data=>Clickstream(
91 | data("message").toString
92 | ,data("@version").toString
93 | ,data("file").toString
94 | ,data("host").toString
95 | ,data("offset").toString
96 | ,data("type").toString
97 | ,data("clientip").toString
98 | ,data("ident").toString
99 | ,data("auth").toString
100 | ,data("timestamp").toString
101 | ,data("verb").toString
102 | ,data("request").toString
103 | ,data("httpversion").toString
104 | ,data("response").toString
105 | ,Integer.parseInt(data("bytes").toString)
106 | ,data("referrer").toString
107 | ,data("agent").toString
108 | ))
109 |
110 | // Creating and enriching the customer object
111 | val customers = events.map { clickstream =>
112 | val lookupMap = brodcastProductCategoryMapping.value
113 | Customer(clickstream.clientip, clickstream.request, lookupMap.getOrElse(clickstream.request, "category not found"))
114 | }
115 |
116 | customers.foreachRDD{ rdd =>
117 | if (rdd.toLocalIterator.nonEmpty) {
118 | EsSpark.saveToEs(rdd, "spark/customer")
119 | }
120 | }
121 |
122 | val trainingData = customers.map { customer =>
123 | val categoryLookupMap = brodcastCategoryLabelMapping.value
124 | val customerLookupMap = brodcastIpLabelMapping.value
125 |
126 | val categoryLabel = categoryLookupMap.getOrElse(customer.category, 1).asInstanceOf[Double]
127 | val customerLabel = customerLookupMap.getOrElse(customer.session, 1).asInstanceOf[Double]
128 |
129 | Vectors.dense(Array(categoryLabel, customerLabel))
130 | }
131 |
132 | val testData = customers.map { customer =>
133 | val categoryLookupMap = brodcastCategoryLabelMapping.value
134 | val customerLookupMap = brodcastIpLabelMapping.value
135 |
136 | val categoryLabel = categoryLookupMap.getOrElse(customer.category, 1).asInstanceOf[Double]
137 | val customerLabel = customerLookupMap.getOrElse(customer.session, 1).asInstanceOf[Double]
138 |
139 | LabeledPoint(categoryLabel, Vectors.dense(Array(categoryLabel, customerLabel)))
140 | }
141 |
142 | val model = new StreamingKMeans()
143 | .setK(3)
144 | .setDecayFactor(1.0)
145 | .setRandomCenters(2, 0.0)
146 |
147 | model.trainOn(trainingData)
148 | model.predictOnValues(testData.map(lp => (lp.label, lp.features))).foreachRDD{ rdd =>
149 | if (rdd.toLocalIterator.nonEmpty) {
150 | EsSpark.saveToEs(rdd, "spark/prediction")
151 | }
152 | }
153 |
154 |
155 | // Start the computation
156 | ssc.start()
157 | ssc.awaitTermination()
158 | }
159 | }
160 |
--------------------------------------------------------------------------------
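
The JSON-to-Clickstream mapping above can be exercised without a running Spark cluster, which helps when the Logstash field names change. The following standalone sketch assumes the chapter6 module (and therefore the Clickstream case class and scala.util.parsing.json) is on the classpath; the sample event values are invented for the test.

    import scala.util.parsing.json.JSON
    import org.apache.spark.examples.Clickstream // case class defined in SparkEnricher.scala

    object ClickstreamParseCheck {
      // One event shaped like the Logstash output consumed by the streaming job; values are made up
      val sample =
        """{"message":"raw access line","@version":"1","file":"access.log","host":"localhost",
          |"offset":"123","type":"access_log","clientip":"10.10.10.10","ident":"-","auth":"-",
          |"timestamp":"12/Jul/2015:00:04:47 +0200","verb":"GET","request":"/product/123",
          |"httpversion":"1.1","response":"200","bytes":"1024",
          |"referrer":"http://www.google.com","agent":"Mozilla/5.0"}""".stripMargin

      def main(args: Array[String]): Unit = {
        val data = JSON.parseFull(sample).get.asInstanceOf[Map[String, Any]]

        // Same field extraction as the streaming job, applied to a single record
        val event = Clickstream(
          data("message").toString, data("@version").toString, data("file").toString,
          data("host").toString, data("offset").toString, data("type").toString,
          data("clientip").toString, data("ident").toString, data("auth").toString,
          data("timestamp").toString, data("verb").toString, data("request").toString,
          data("httpversion").toString, data("response").toString,
          Integer.parseInt(data("bytes").toString),
          data("referrer").toString, data("agent").toString)

        println(event)
      }
    }
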
/chapter7/docker/logstash/.logstash-forwarder:
--------------------------------------------------------------------------------
1 | {"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150711-220259.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150711-220259.log","offset":96859,"inode":7479218,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-000447.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-000447.log","offset":972325,"inode":7504456,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-000741.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-000741.log","offset":975794,"inode":7505868,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-001824.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-001824.log","offset":647,"inode":7508694,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-001915.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-001915.log","offset":174,"inode":7508875,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-093649.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-093649.log","offset":222,"inode":7535349,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-140436.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-140436.log","offset":166,"inode":7572409,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-142450.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-142450.log","offset":971867,"inode":7578311,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-225306.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150712-225306.log","offset":971300,"inode":7728720,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134126.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134126.log","offset":242,"inode":7918600,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134150.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134150.log","offset":241,"inode":7918743,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134238.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134238.log","offset":974,"inode":7918888,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134434.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-134434.log","offset":179,"inode":7919307,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-135702.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-135702.log","offset":234,"inode":7922130,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-135855.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-135855.log","offset":169,"inode":7922725,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/a
ccess_log_20150713-140105.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-140105.log","offset":183,"inode":7923254,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-140440.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-140440.log","offset":243,"inode":7925752,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-163017.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-163017.log","offset":146,"inode":7965083,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-170127.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-170127.log","offset":164,"inode":7975371,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-170639.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-170639.log","offset":169,"inode":7977865,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-170655.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-170655.log","offset":1848,"inode":7977973,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173241.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173241.log","offset":236,"inode":7988866,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173404.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173404.log","offset":252,"inode":7989373,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173422.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173422.log","offset":1720,"inode":7989470,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173621.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-173621.log","offset":1925,"inode":7990107,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-175004.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-175004.log","offset":1740,"inode":7993748,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-181903.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-181903.log","offset":2029,"inode":8005282,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-182035.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-182035.log","offset":2090,"inode":8005809,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-182711.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150713-182711.log","offset":1976,"inode":8009448,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150714-000234.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150714-000234.log","offset":2032,"inode":8048571,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150714-000309.log":{"source":"/Users/bahaaldine/Dropbox
/apress/demo/chapter4/source/access_log_20150714-000309.log","offset":2059,"inode":8048811,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-133903.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-133903.log","offset":1903,"inode":8288492,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-134905.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-134905.log","offset":2042,"inode":8292378,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-143201.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-143201.log","offset":2038,"inode":8343433,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-145645.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-145645.log","offset":1781,"inode":8420823,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-153450.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-153450.log","offset":2171,"inode":8459287,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-154641.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-154641.log","offset":1960,"inode":8462798,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-154730.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-154730.log","offset":2044,"inode":8463027,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-163018.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-163018.log","offset":1889,"inode":8491010,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-163455.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-163455.log","offset":1749,"inode":8492463,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-164003.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-164003.log","offset":1912,"inode":8495558,"device":16777217},"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-164023.log":{"source":"/Users/bahaaldine/Dropbox/apress/demo/chapter4/source/access_log_20150715-164023.log","offset":193955,"inode":8495798,"device":16777217}}
2 |
--------------------------------------------------------------------------------
/chapter5/spark-scala-streamer/spark-scala-streamer.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------