├── .gitignore ├── README.md ├── Vagrantfile ├── Vagrantfile.32 ├── Vagrantfile.dockerhost ├── Vagrantfile.dockerhost.src ├── ch03 └── properties │ ├── properties │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── basic.py │ │ ├── easy.py │ │ └── manual.py │ └── scrapy.cfg ├── ch04 └── properties │ ├── properties │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── tomobile.py │ └── scrapy.cfg ├── ch05 ├── generic │ ├── generic │ │ ├── __init__.py │ │ ├── items.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── fromcsv.py │ ├── scrapy.cfg │ └── todo.csv └── properties │ ├── properties │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── api.py │ │ ├── fast.py │ │ ├── login.py │ │ └── noncelogin.py │ └── scrapy.cfg ├── ch06 └── properties │ ├── properties │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── tomobile.py │ └── scrapy.cfg ├── ch07 └── properties │ ├── properties │ ├── __init__.py │ ├── hi.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── fast.py │ └── scrapy.cfg ├── ch08 ├── deferreds.py ├── hooksasync │ ├── hooksasync │ │ ├── __init__.py │ │ ├── extensions.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── test.py │ └── scrapy.cfg └── properties │ ├── properties │ ├── __init__.py │ ├── hi.py │ ├── items.py │ ├── latencies.py │ ├── pipelines │ │ ├── __init__.py │ │ └── tidyup.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── easy.py │ └── scrapy.cfg ├── ch09 └── properties │ ├── properties │ ├── __init__.py │ ├── items.py │ ├── latencies.py │ ├── pipelines │ │ ├── __init__.py │ │ ├── computation.py │ │ ├── es.py │ │ ├── geo.py │ │ ├── geo2.py │ │ ├── legacy.py │ │ ├── legacy.sh │ │ ├── mysql.py │ │ ├── redis.py │ │ └── tidyup.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── easy.py │ └── scrapy.cfg ├── ch10 └── speed │ ├── scrapy.cfg │ └── speed │ ├── __init__.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── speed.py ├── ch11 ├── boostwords.py └── properties │ ├── properties │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── monitor.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── distr.py │ │ └── easy.py │ ├── scrapy.cfg │ ├── scrapyd1 │ ├── scrapyd2 │ └── scrapyd3 ├── docker-compose.yml ├── insecure_key └── lint /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | .DS_Store 3 | build 4 | project.egg-info 5 | setup.py 6 | *.swp 7 | *.cache 8 | /generate/properties/* 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Packages 14 | *.egg 15 | *.egg-info 16 | dist 17 | build 18 | eggs 19 | parts 20 | bin 21 | var 22 | sdist 23 | develop-eggs 24 | .installed.cfg 25 | lib 26 | lib64 27 | 28 | # Installer logs 29 | pip-log.txt 30 | 31 | # Unit test / coverage reports 32 | .coverage 33 | .tox 34 | nosetests.xml 35 | 36 | # Translations 37 | *.mo 38 | 39 | # Mr Developer 40 | .mr.developer.cfg 41 | .project 42 | .pydevproject 43 | 44 | # This is usually scrapy cache folders 45 | .scrapy 46 | .vagrant 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Learning Scrapy Book 2 | 
========== 3 | 4 | This book covers the long-awaited Scrapy 1.0, which empowers you to extract useful data from virtually any source with very little effort. It starts off by explaining the fundamentals of the Scrapy framework, followed by a thorough description of how to extract data from any source, clean it up, and shape it to your requirements using Python and third-party APIs. Next, you will learn how to store the scraped data in databases and search engines, and how to perform real-time analytics on it with Spark Streaming. By the end of this book, you will have perfected the art of scraping data for your applications with ease. 5 | 6 | This book is now available on [Amazon](http://amzn.to/1PeQ5O0) and [Packt](https://www.packtpub.com/big-data-and-business-intelligence/learning-scrapy). 7 | 8 | ## What you will learn 9 | 10 | - Understand HTML pages and write XPath expressions to extract the data you need 11 | - Write Scrapy spiders with simple Python and run web crawls (a minimal example follows this list) 12 | - Push your data into any database, search engine, or analytics system 13 | - Configure your spider to download files and images, and to use proxies 14 | - Create efficient pipelines that shape data in precisely the form you want 15 | - Use Twisted's asynchronous API to process hundreds of items concurrently 16 | - Make your crawler super-fast by learning how to tune Scrapy's performance 17 | - Perform large-scale distributed crawls with scrapyd and scrapinghub 18 | 
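For a taste of what the spiders in this repository look like, here is a minimal sketch in the spirit of Chapter 3. It is an illustration rather than code taken from the book: it assumes the book's demo site served by the `web` container at `http://web:9312`, and it only uses plain spider and selector calls.

```python
import scrapy


class MiniSpider(scrapy.Spider):
    """Bare-bones spider: fetch one property page and extract two fields."""
    name = "mini"
    start_urls = ['http://web:9312/properties/property_000000.html']

    def parse(self, response):
        # XPath expressions mirroring the ones used in ch03's basic.py
        yield {
            'title': response.xpath('//*[@itemprop="name"][1]/text()').extract_first(),
            'price': response.xpath('//*[@itemprop="price"][1]/text()').extract_first(),
        }
```

Saved as `mini.py` inside the dev machine, it runs without a project: `scrapy runspider mini.py -o items.json`.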
19 | ## Tutorials 20 | 21 | * How to Set Up the Software and Run the Examples on a Windows Machine 22 | 23 | [![image](https://cloud.githubusercontent.com/assets/789359/24506332/0c016008-1555-11e7-86e3-c736e953a199.PNG)](https://www.youtube.com/watch?v=r84-dsIRFI8) 24 | 25 | * Chapter 4 - Create an Appery.io mobile application (updated process) 26 | 27 | [![image](https://cloud.githubusercontent.com/assets/789359/24486821/e6c99072-1503-11e7-9d45-7eed9c13c7b6.png)](https://www.youtube.com/watch?v=FEbPyQJc3NE) 28 | 29 | * Chapters 3 & 9 on a 32-bit VM (for computers with limited memory/processing power) 30 | 31 | [![image](https://cloud.githubusercontent.com/assets/789359/24482446/26a8eae6-14e9-11e7-9244-d5117954ccea.png)](https://www.youtube.com/watch?v=w9ditoIQ7sU) 32 | 33 | ## To use Docker directly without installing Vagrant 34 | 35 | A `docker-compose.yml` file is included, mainly for those who already have Docker installed. For completeness, here are the official links for installing Docker. 36 | 37 | * For OS X El Capitan 10.11 and later, get [Docker for Mac](https://docs.docker.com/docker-for-mac/). 38 | * For earlier versions of OS X, get [Docker Toolbox for Mac](https://docs.docker.com/toolbox/toolbox_install_mac/). 39 | * For Windows 10 Pro, Enterprise, and Education (1511 November update, Build 10586 or later), get [Docker for Windows](https://docs.docker.com/docker-for-windows/). 40 | * For Windows 7, 8.1, or other editions of Windows 10, get [Docker Toolbox for Windows](https://docs.docker.com/toolbox/toolbox_install_windows/). 41 | * For Ubuntu and other Linux distributions, install 42 | [docker](https://docs.docker.com/engine/installation/linux/ubuntu/) and 43 | [docker-compose](https://docs.docker.com/compose/install/). 44 | To [avoid having to use sudo when you use the docker command](https://docs.docker.com/engine/installation/linux/linux-postinstall/), 45 | create a Unix group called docker and add users to it: 46 | 1. `sudo groupadd docker` 47 | 2. `sudo usermod -aG docker $USER` 48 | 49 | Once you have Docker installed and started, change to the project directory and run: 50 | 51 | 1. `docker-compose pull` - Checks for updated images 52 | 2. `docker-compose up` - Starts the containers and streams their log messages. To stop the containers, press Ctrl-C in this window, or run `docker-compose down` in another shell window. 53 | 54 | If you want to recover disk space, `docker system prune` will delete any Docker images, containers, and volumes that are no longer in use. 55 | 56 | See also [the book's official website](http://scrapybook.com). 57 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | ENV['VAGRANT_DEFAULT_PROVIDER'] = "docker" 2 | 3 | host_vagrantfile = "./Vagrantfile.dockerhost" 4 | force_host_vm = TRUE 5 | 6 | Vagrant.configure("2") do |config| 7 | 8 | # -------------- Web server -------------- 9 | 10 | config.vm.define "web" do |web| 11 | 12 | web.vm.provider "docker" do |d| 13 | d.image = "scrapybook/web" 14 | #d.build_dir = "../scrapybook-docker-web" 15 | d.name = "web" 16 | 17 | d.vagrant_machine = "docker-provider" 18 | d.vagrant_vagrantfile = host_vagrantfile 19 | d.force_host_vm = force_host_vm 20 | end 21 | 22 | web.vm.synced_folder ".", "/vagrant", disabled: true 23 | 24 | web.vm.network "forwarded_port", guest: 9312, host: 9312 25 | web.vm.hostname = "web" 26 | end 27 | 28 | # -------------- Spark server -------------- 29 | 30 | config.vm.define "spark" do |spark| 31 | 32 | spark.vm.provider "docker" do |d| 33 | d.image = "scrapybook/spark" 34 | #d.build_dir = "../scrapybook-docker-spark" 35 | d.name = "spark" 36 | 37 | d.vagrant_machine = "docker-provider" 38 | d.vagrant_vagrantfile = host_vagrantfile 39 | d.force_host_vm = force_host_vm 40 | end 41 | 42 | spark.vm.synced_folder ".", "/root/book" 43 | 44 | spark.vm.network "forwarded_port", guest: 21, host: 21 45 | (30000..30009).each do |port| 46 | spark.vm.network "forwarded_port", guest: port, host: port 47 | end 48 | spark.vm.hostname = "spark" 49 | end 50 | 51 | 52 | # -------------- ES server -------------- 53 | 54 | config.vm.define "es" do |es| 55 | 56 | es.vm.provider "docker" do |d| 57 | d.image = "scrapybook/es" 58 | #d.build_dir = "../scrapybook-docker-es" 59 | d.name = "es" 60 | 61 | d.vagrant_machine = "docker-provider" 62 | d.vagrant_vagrantfile = host_vagrantfile 63 | d.force_host_vm = force_host_vm 64 | end 65 | 66 | es.vm.synced_folder ".", "/vagrant", disabled: true 67 | 68 | es.vm.network "forwarded_port", guest: 9200, host: 9200 69 | es.vm.hostname = "es" 70 | end 71 | 72 | 73 | # -------------- Redis server -------------- 74 | 75 | config.vm.define "redis" do |redis| 76 | 77 | redis.vm.provider "docker" do |d| 78 | d.image = "scrapybook/redis" 79 | #d.build_dir = "../scrapybook-docker-redis" 80 | d.name = "redis" 81 | 82 | d.vagrant_machine = "docker-provider" 83 | d.vagrant_vagrantfile = host_vagrantfile 84 | d.force_host_vm = force_host_vm 85 | end 86 | 87 | redis.vm.synced_folder ".", "/vagrant", disabled: true 88 | 89 | redis.vm.network "forwarded_port", guest: 6379, host: 6379 90 | redis.vm.hostname = "redis" 91 | end 92 | 93 | 94 | # -------------- MySQL server -------------- 95 | 96 | config.vm.define "mysql" do |mysql| 97 | 98 | mysql.vm.provider "docker" do |d| 99 | d.image = "scrapybook/mysql" 100 | #d.build_dir = "../scrapybook-docker-mysql" 101 | d.name = "mysql" 102 | 103 | d.vagrant_machine = "docker-provider" 104 | d.vagrant_vagrantfile = host_vagrantfile 105 | d.force_host_vm = force_host_vm 106 | end 107 | 108 | 
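    # Like the other service containers above, mysql does not mount the book
    # folder; it only forwards MySQL's standard port 3306 to the host below.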
mysql.vm.synced_folder ".", "/vagrant", disabled: true 109 | 110 | mysql.vm.network "forwarded_port", guest: 3306, host: 3306 111 | mysql.vm.hostname = "mysql" 112 | end 113 | # -------------- 3 Scrapyd servers -------------- 114 | 115 | { 116 | "scrapyd1" => 6801, 117 | "scrapyd2" => 6802, 118 | "scrapyd3" => 6803, 119 | }.each do |host, port| 120 | 121 | config.vm.define host do |scp| 122 | 123 | scp.vm.provider "docker" do |d| 124 | d.image = "scrapybook/dev" 125 | #d.build_dir = "../scrapybook-docker-dev" 126 | d.name = host 127 | 128 | d.link("spark:spark") 129 | d.link("web:web") 130 | 131 | d.vagrant_machine = "docker-provider" 132 | d.vagrant_vagrantfile = host_vagrantfile 133 | d.force_host_vm = force_host_vm 134 | end 135 | 136 | scp.vm.synced_folder ".", "/vagrant", disabled: true 137 | 138 | scp.vm.network "forwarded_port", guest: 6800, host: port 139 | scp.vm.hostname = host 140 | end 141 | end 142 | 143 | # -------------- Dev machine -------------- 144 | 145 | config.vm.define "dev", primary: true do |dev| 146 | 147 | dev.vm.provider "docker" do |d| 148 | d.image = "scrapybook/dev" 149 | #d.build_dir = "../scrapybook-docker-dev" 150 | d.name = "dev" 151 | 152 | d.link("web:web") 153 | d.link("spark:spark") 154 | d.link("scrapyd1:scrapyd1") 155 | d.link("scrapyd2:scrapyd2") 156 | d.link("scrapyd3:scrapyd3") 157 | d.link("mysql:mysql") 158 | d.link("redis:redis") 159 | d.link("es:es") 160 | 161 | d.vagrant_machine = "docker-provider" 162 | d.vagrant_vagrantfile = host_vagrantfile 163 | d.force_host_vm = force_host_vm 164 | end 165 | 166 | dev.vm.synced_folder ".", "/root/book" 167 | 168 | dev.vm.network "forwarded_port", guest: 6800, host: 6800 169 | dev.vm.hostname = "dev" 170 | end 171 | 172 | config.ssh.username = 'root' 173 | config.ssh.private_key_path = 'insecure_key' 174 | end 175 | -------------------------------------------------------------------------------- /Vagrantfile.32: -------------------------------------------------------------------------------- 1 | Vagrant.configure("2") do |config| 2 | 3 | # Box based on ubuntu/trusty32. Limited but it works without any 4 | # virtualization extensions. 5 | 6 | config.vm.define "scrapy-vm-32" 7 | config.vm.box = "lookfwd/scrapybook32" 8 | 9 | config.vm.hostname = "dev" 10 | config.vm.synced_folder ".", "/home/vagrant/book" 11 | 12 | # Setting up ports 13 | ( 14 | [9200] + # ES 15 | [6379] + # Redis 16 | [3306] + # MySQL 17 | [9312] + # Web 18 | [6800] + # Scrapyd 19 | []).each do |port| 20 | config.vm.network "forwarded_port", guest: port, host: port, host_ip: "localhost", auto_correct: true 21 | end 22 | 23 | # Set the mem/cpu requirements 24 | config.vm.provider :virtualbox do |vb| 25 | vb.name = "scrapy-vm-32" 26 | vb.check_guest_additions = false 27 | vb.cpus = 1 28 | # If you get: cloud-init-nonet[4.54]: waiting 10 seconds for network device 29 | # cloud-init-nonet[14.57]: waiting 120 seconds for network device 30 | # cloud-init-nonet[134.57]: gave up waiting for a network device. 31 | # Your netowrk card is not supported with the default adapter. 
32 | # See: https://github.com/mitchellh/vagrant/issues/3860#issuecomment-167664778 33 | # Try to uncomment the following two lines: 34 | # 35 | # vb.customize ["modifyvm", :id, "--nictype1", "Am79C973"] 36 | # vb.customize ["modifyvm", :id, "--nictype2", "Am79C973"] 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /Vagrantfile.dockerhost: -------------------------------------------------------------------------------- 1 | # Set if you are behind proxy 2 | #proxy_url = "http://user:pass@proxy.com:8080/" 3 | 4 | Vagrant.configure("2") do |config| 5 | 6 | if !defined?(proxy_url) || proxy_url.nil? || proxy_url.empty? 7 | 8 | # No proxy setup necessary. 9 | 10 | else 11 | # Setup proxies 12 | 13 | ENV['http_proxy'] = proxy_url 14 | ENV['https_proxy'] = proxy_url 15 | end 16 | 17 | config.vm.define "docker-provider" 18 | config.vm.box = "lookfwd/scrapybook" 19 | 20 | # Setting up ports 21 | ( 22 | [9200] + # ES 23 | [6379] + # Redis 24 | [3306] + # MySQL 25 | [9312] + # Web 26 | (6800..6803).to_a + # Scrapyd 27 | [21] + (30000..30009).to_a + # Spark 28 | []).each do |port| 29 | config.vm.network "forwarded_port", guest: port, host: port, host_ip: "localhost", auto_correct: true 30 | end 31 | 32 | # Set the mem/cpu requirements 33 | config.vm.provider :virtualbox do |vb| 34 | vb.memory = 2048 35 | vb.cpus = 4 36 | vb.name = "docker-provider" 37 | vb.check_guest_additions = false 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /Vagrantfile.dockerhost.src: -------------------------------------------------------------------------------- 1 | # Set if you are behind proxy 2 | #proxy_url = "http://user:pass@proxy.com:8080/" 3 | 4 | # Set if you are behind a TLS interception proxy 5 | #crt_filename = "my-ca-certificate.crt" 6 | 7 | Vagrant.configure("2") do |config| 8 | 9 | if !defined?(proxy_url) || proxy_url.nil? || proxy_url.empty? 10 | 11 | # No proxy setup necessary. Just provision docker. 12 | config.vm.provision "docker" 13 | 14 | else 15 | # Setup proxies 16 | 17 | ENV['http_proxy'] = proxy_url 18 | ENV['https_proxy'] = proxy_url 19 | 20 | config.vm.provision "shell", inline: "echo 'Using #{proxy_url} as a proxy'" 21 | 22 | config.vm.provision "shell", inline: "echo 'http_proxy=#{proxy_url}' >> /etc/environment" 23 | config.vm.provision "shell", inline: "echo 'https_proxy=#{proxy_url}' >> /etc/environment" 24 | 25 | config.vm.provision "shell", inline: "echo 'Acquire::http::Proxy \"#{proxy_url}\";' >> /etc/apt/apt.conf" 26 | 27 | unless !defined?(crt_filename) || crt_filename.nil? || crt_filename.empty? 28 | # Vagrant should use the certificate to download boxes 29 | config.vm.box_download_ca_cert = crt_filename 30 | # Add crt file and update CA certificates 31 | config.vm.provision "shell", inline: "echo 'Using #{crt_filename} as a certificate'" 32 | config.vm.provision "shell", inline: "sudo cp /vagrant/#{crt_filename} /usr/local/share/ca-certificates/#{crt_filename}" 33 | config.vm.provision "shell", inline: "sudo update-ca-certificates" 34 | end 35 | 36 | # Restart ssh so that /etc/environment settings get picked 37 | config.vm.provision "shell", inline: "ps aux | grep 'sshd:' | awk '{print $2}' | xargs kill" 38 | 39 | # Provision docker. 
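    # (The proxy settings are also exported to /etc/default/docker further
    #  below and the daemon restarted, so that docker itself can pull images
    #  through the proxy.)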
40 | config.vm.provision "docker" 41 | 42 | config.vm.provision "shell", inline: "echo 'export http_proxy=#{proxy_url}' >> /etc/default/docker" 43 | config.vm.provision "shell", inline: "echo 'export https_proxy=#{proxy_url}' >> /etc/default/docker" 44 | config.vm.provision "shell", inline: "service docker restart" 45 | end 46 | 47 | # The following line terminates all ssh connections. Therefore 48 | # Vagrant will be forced to reconnect. That's a workaround to have 49 | # the docker command in the PATH 50 | config.vm.provision "shell", inline: "ps aux | grep 'sshd:' | awk '{print $2}' | xargs kill" 51 | 52 | config.vm.define "docker-provider" 53 | config.vm.box = "ubuntu/trusty64" 54 | 55 | # Setting up ports 56 | ( 57 | [9200] + # ES 58 | [6379] + # Redis 59 | [3306] + # MySQL 60 | [9312] + # Web 61 | (6800..6803).to_a + # Scrapyd 62 | [21] + (30000..30009).to_a + # Spark 63 | []).each do |port| 64 | config.vm.network "forwarded_port", guest: port, host: port, host_ip: "localhost", auto_correct: true 65 | end 66 | 67 | # Set the mem/cpu requirements 68 | config.vm.provider :virtualbox do |vb| 69 | vb.memory = 2048 70 | vb.cpus = 4 71 | vb.name = "docker-provider" 72 | vb.check_guest_additions = false 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /ch03/properties/properties/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch03/properties/properties/__init__.py -------------------------------------------------------------------------------- /ch03/properties/properties/items.py: -------------------------------------------------------------------------------- 1 | from scrapy.item import Item, Field 2 | 3 | 4 | class PropertiesItem(Item): 5 | # Primary fields 6 | title = Field() 7 | price = Field() 8 | description = Field() 9 | address = Field() 10 | image_urls = Field() 11 | 12 | # Calculated fields 13 | images = Field() 14 | location = Field() 15 | 16 | # Housekeeping fields 17 | url = Field() 18 | project = Field() 19 | spider = Field() 20 | server = Field() 21 | date = Field() 22 | -------------------------------------------------------------------------------- /ch03/properties/properties/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | class PropertiesPipeline(object): 8 | def process_item(self, item, spider): 9 | return item 10 | -------------------------------------------------------------------------------- /ch03/properties/properties/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for properties project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'properties' 10 | 11 | SPIDER_MODULES = ['properties.spiders'] 12 | NEWSPIDER_MODULE = 'properties.spiders' 13 | 14 | # Crawl responsibly by identifying yourself (and your website) on 15 | # the user-agent 16 | #USER_AGENT = 'properties (+http://www.yourdomain.com)' 17 | 18 | # Disable S3 19 | AWS_ACCESS_KEY_ID = "" 20 | AWS_SECRET_ACCESS_KEY = "" 21 | -------------------------------------------------------------------------------- /ch03/properties/properties/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ch03/properties/properties/spiders/basic.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | import scrapy 5 | 6 | from scrapy.loader.processors import MapCompose, Join 7 | from scrapy.loader import ItemLoader 8 | 9 | from properties.items import PropertiesItem 10 | 11 | 12 | class BasicSpider(scrapy.Spider): 13 | name = "basic" 14 | allowed_domains = ["web"] 15 | 16 | # Start on a property page 17 | start_urls = ( 18 | 'http://web:9312/properties/property_000000.html', 19 | ) 20 | 21 | def parse(self, response): 22 | """ This function parses a property page. 23 | 24 | @url http://web:9312/properties/property_000000.html 25 | @returns items 1 26 | @scrapes title price description address image_urls 27 | @scrapes url project spider server date 28 | """ 29 | 30 | # Create the loader using the response 31 | l = ItemLoader(item=PropertiesItem(), response=response) 32 | 33 | # Load fields using XPath expressions 34 | l.add_xpath('title', '//*[@itemprop="name"][1]/text()', 35 | MapCompose(unicode.strip, unicode.title)) 36 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 37 | MapCompose(lambda i: i.replace(',', ''), float), 38 | re='[,.0-9]+') 39 | l.add_xpath('description', '//*[@itemprop="description"][1]/text()', 40 | MapCompose(unicode.strip), Join()) 41 | l.add_xpath('address', 42 | '//*[@itemtype="http://schema.org/Place"][1]/text()', 43 | MapCompose(unicode.strip)) 44 | l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 45 | MapCompose(lambda i: urlparse.urljoin(response.url, i))) 46 | 47 | # Housekeeping fields 48 | l.add_value('url', response.url) 49 | l.add_value('project', self.settings.get('BOT_NAME')) 50 | l.add_value('spider', self.name) 51 | l.add_value('server', socket.gethostname()) 52 | l.add_value('date', datetime.datetime.now()) 53 | 54 | return l.load_item() 55 | -------------------------------------------------------------------------------- /ch03/properties/properties/spiders/easy.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | 5 | from scrapy.loader.processors import MapCompose, Join 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from scrapy.loader import ItemLoader 9 | 10 | from properties.items import PropertiesItem 11 | 12 | 13 | class EasySpider(CrawlSpider): 14 | name = 'easy' 15 | allowed_domains = ["web"] 16 | 17 | # Start on the first index page 18 | 
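    # (The rules below then crawl index pages horizontally and property pages
    # vertically; CrawlSpider uses parse() internally, which is why the item
    # callback is named parse_item rather than parse.)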
start_urls = ( 19 | 'http://web:9312/properties/index_00000.html', 20 | ) 21 | 22 | # Rules for horizontal and vertical crawling 23 | rules = ( 24 | Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')), 25 | Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'), 26 | callback='parse_item') 27 | ) 28 | 29 | def parse_item(self, response): 30 | """ This function parses a property page. 31 | 32 | @url http://web:9312/properties/property_000000.html 33 | @returns items 1 34 | @scrapes title price description address image_urls 35 | @scrapes url project spider server date 36 | """ 37 | 38 | # Create the loader using the response 39 | l = ItemLoader(item=PropertiesItem(), response=response) 40 | 41 | # Load fields using XPath expressions 42 | l.add_xpath('title', '//*[@itemprop="name"][1]/text()', 43 | MapCompose(unicode.strip, unicode.title)) 44 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 45 | MapCompose(lambda i: i.replace(',', ''), float), 46 | re='[,.0-9]+') 47 | l.add_xpath('description', '//*[@itemprop="description"][1]/text()', 48 | MapCompose(unicode.strip), Join()) 49 | l.add_xpath('address', 50 | '//*[@itemtype="http://schema.org/Place"][1]/text()', 51 | MapCompose(unicode.strip)) 52 | l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 53 | MapCompose(lambda i: urlparse.urljoin(response.url, i))) 54 | 55 | # Housekeeping fields 56 | l.add_value('url', response.url) 57 | l.add_value('project', self.settings.get('BOT_NAME')) 58 | l.add_value('spider', self.name) 59 | l.add_value('server', socket.gethostname()) 60 | l.add_value('date', datetime.datetime.now()) 61 | 62 | return l.load_item() 63 | -------------------------------------------------------------------------------- /ch03/properties/properties/spiders/manual.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | import scrapy 5 | 6 | from scrapy.loader.processors import MapCompose, Join 7 | from scrapy.loader import ItemLoader 8 | from scrapy.http import Request 9 | 10 | from properties.items import PropertiesItem 11 | 12 | 13 | class BasicSpider(scrapy.Spider): 14 | name = "manual" 15 | allowed_domains = ["web"] 16 | 17 | # Start on the first index page 18 | start_urls = ( 19 | 'http://web:9312/properties/index_00000.html', 20 | ) 21 | 22 | def parse(self, response): 23 | # Get the next index URLs and yield Requests 24 | next_selector = response.xpath('//*[contains(@class,"next")]//@href') 25 | for url in next_selector.extract(): 26 | yield Request(urlparse.urljoin(response.url, url)) 27 | 28 | # Get item URLs and yield Requests 29 | item_selector = response.xpath('//*[@itemprop="url"]/@href') 30 | for url in item_selector.extract(): 31 | yield Request(urlparse.urljoin(response.url, url), 32 | callback=self.parse_item) 33 | 34 | def parse_item(self, response): 35 | """ This function parses a property page. 
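        The annotations below are Scrapy contracts: running `scrapy check manual`
        fetches the @url, calls this method, and verifies that it returns at
        least one item containing every field listed under @scrapes.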
36 | 37 | @url http://web:9312/properties/property_000000.html 38 | @returns items 1 39 | @scrapes title price description address image_urls 40 | @scrapes url project spider server date 41 | """ 42 | 43 | # Create the loader using the response 44 | l = ItemLoader(item=PropertiesItem(), response=response) 45 | 46 | # Load fields using XPath expressions 47 | l.add_xpath('title', '//*[@itemprop="name"][1]/text()', 48 | MapCompose(unicode.strip, unicode.title)) 49 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 50 | MapCompose(lambda i: i.replace(',', ''), float), 51 | re='[,.0-9]+') 52 | l.add_xpath('description', '//*[@itemprop="description"][1]/text()', 53 | MapCompose(unicode.strip), Join()) 54 | l.add_xpath('address', 55 | '//*[@itemtype="http://schema.org/Place"][1]/text()', 56 | MapCompose(unicode.strip)) 57 | l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 58 | MapCompose(lambda i: urlparse.urljoin(response.url, i))) 59 | 60 | # Housekeeping fields 61 | l.add_value('url', response.url) 62 | l.add_value('project', self.settings.get('BOT_NAME')) 63 | l.add_value('spider', self.name) 64 | l.add_value('server', socket.gethostname()) 65 | l.add_value('date', datetime.datetime.now()) 66 | 67 | return l.load_item() 68 | -------------------------------------------------------------------------------- /ch03/properties/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = properties.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = properties 12 | -------------------------------------------------------------------------------- /ch04/properties/properties/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch04/properties/properties/__init__.py -------------------------------------------------------------------------------- /ch04/properties/properties/items.py: -------------------------------------------------------------------------------- 1 | from scrapy.item import Item, Field 2 | 3 | 4 | class PropertiesItem(Item): 5 | # Primary fields 6 | title = Field() 7 | price = Field() 8 | description = Field() 9 | address = Field() 10 | image_urls = Field() 11 | 12 | # Calculated fields 13 | images = Field() 14 | location = Field() 15 | 16 | # Housekeeping fields 17 | url = Field() 18 | project = Field() 19 | spider = Field() 20 | server = Field() 21 | date = Field() 22 | -------------------------------------------------------------------------------- /ch04/properties/properties/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | class PropertiesPipeline(object): 8 | def process_item(self, item, spider): 9 | return item 10 | -------------------------------------------------------------------------------- /ch04/properties/properties/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for properties project 2 | # 3 | # For simplicity, this file contains only the most 
important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'properties' 10 | 11 | SPIDER_MODULES = ['properties.spiders'] 12 | NEWSPIDER_MODULE = 'properties.spiders' 13 | 14 | # Crawl responsibly by identifying yourself (and your website) on 15 | # the user-agent 16 | #USER_AGENT = 'properties (+http://www.yourdomain.com)' 17 | 18 | ITEM_PIPELINES = {'scrapyapperyio.ApperyIoPipeline': 300} 19 | 20 | APPERYIO_DB_ID = '1234abcdef1234abcdef1234' 21 | APPERYIO_USERNAME = 'root' 22 | APPERYIO_PASSWORD = 'pass' 23 | APPERYIO_COLLECTION_NAME = 'properties' 24 | 25 | # Disable S3 26 | AWS_ACCESS_KEY_ID = "" 27 | AWS_SECRET_ACCESS_KEY = "" 28 | -------------------------------------------------------------------------------- /ch04/properties/properties/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ch04/properties/properties/spiders/tomobile.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | 5 | from scrapy.loader.processors import MapCompose, Join 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from scrapy.loader import ItemLoader 9 | 10 | from properties.items import PropertiesItem 11 | 12 | 13 | class ToMobileSpider(CrawlSpider): 14 | name = 'tomobile' 15 | allowed_domains = ["scrapybook.s3.amazonaws.com"] 16 | 17 | # Start on the first index page 18 | start_urls = ( 19 | 'http://scrapybook.s3.amazonaws.com/properties/index_00000.html', 20 | ) 21 | 22 | # Rules for horizontal and vertical crawling 23 | rules = ( 24 | Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')), 25 | Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'), 26 | callback='parse_item') 27 | ) 28 | 29 | def parse_item(self, response): 30 | """ This function parses a property page. 
31 | 32 | @url http://scrapybook.s3.amazonaws.com/properties/property_000000.html 33 | @returns items 1 34 | @scrapes title price description address image_urls 35 | @scrapes url project spider server date 36 | """ 37 | 38 | # Create the loader using the response 39 | l = ItemLoader(item=PropertiesItem(), response=response) 40 | 41 | # Load fields using XPath expressions 42 | l.add_xpath('title', '//*[@itemprop="name"][1]/text()', 43 | MapCompose(unicode.strip, unicode.title)) 44 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 45 | MapCompose(lambda i: i.replace(',', ''), float), 46 | re='[,.0-9]+') 47 | l.add_xpath('description', '//*[@itemprop="description"][1]/text()', 48 | MapCompose(unicode.strip), Join()) 49 | l.add_xpath('address', 50 | '//*[@itemtype="http://schema.org/Place"][1]/text()', 51 | MapCompose(unicode.strip)) 52 | l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 53 | MapCompose(lambda i: urlparse.urljoin(response.url, i))) 54 | 55 | # Housekeeping fields 56 | l.add_value('url', response.url) 57 | l.add_value('project', self.settings.get('BOT_NAME')) 58 | l.add_value('spider', self.name) 59 | l.add_value('server', socket.gethostname()) 60 | l.add_value('date', datetime.datetime.now()) 61 | 62 | return l.load_item() 63 | -------------------------------------------------------------------------------- /ch04/properties/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = properties.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = properties 12 | -------------------------------------------------------------------------------- /ch05/generic/generic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch05/generic/generic/__init__.py -------------------------------------------------------------------------------- /ch05/generic/generic/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class GenericItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /ch05/generic/generic/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class GenericPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /ch05/generic/generic/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for generic project 4 | # 5 | # For simplicity, this file contains only the most important settings by 6 | # default. 
All the other settings are documented here: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # 10 | 11 | BOT_NAME = 'generic' 12 | 13 | SPIDER_MODULES = ['generic.spiders'] 14 | NEWSPIDER_MODULE = 'generic.spiders' 15 | 16 | # Crawl responsibly by identifying yourself (and your website) 17 | # on the user-agent 18 | #USER_AGENT = 'generic (+http://www.yourdomain.com)' 19 | 20 | # Disable S3 21 | AWS_ACCESS_KEY_ID = "" 22 | AWS_SECRET_ACCESS_KEY = "" 23 | -------------------------------------------------------------------------------- /ch05/generic/generic/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ch05/generic/generic/spiders/fromcsv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | import scrapy 4 | from scrapy.http import Request 5 | from scrapy.loader import ItemLoader 6 | from scrapy.item import Item, Field 7 | 8 | 9 | class FromcsvSpider(scrapy.Spider): 10 | name = "fromcsv" 11 | 12 | def start_requests(self): 13 | with open(getattr(self, "file", "todo.csv"), "rU") as f: 14 | reader = csv.DictReader(f) 15 | for line in reader: 16 | request = Request(line.pop('url')) 17 | request.meta['fields'] = line 18 | yield request 19 | 20 | def parse(self, response): 21 | item = Item() 22 | l = ItemLoader(item=item, response=response) 23 | for name, xpath in response.meta['fields'].iteritems(): 24 | if xpath: 25 | item.fields[name] = Field() 26 | l.add_xpath(name, xpath) 27 | 28 | return l.load_item() 29 | -------------------------------------------------------------------------------- /ch05/generic/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = generic.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = generic 12 | -------------------------------------------------------------------------------- /ch05/generic/todo.csv: -------------------------------------------------------------------------------- 1 | url,name,price http://web:9312/static/a.html,"//*[@id=""itemTitle""]/text()","//*[@id=""prcIsum""]/text()" http://web:9312/static/b.html,//h1/text(),//span/strong/text() http://web:9312/static/c.html,"//*[@id=""product-desc""]/span/text()", -------------------------------------------------------------------------------- /ch05/properties/properties/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch05/properties/properties/__init__.py -------------------------------------------------------------------------------- /ch05/properties/properties/items.py: -------------------------------------------------------------------------------- 1 | from scrapy.item import Item, Field 2 | 3 | 4 | class PropertiesItem(Item): 5 | # Primary fields 6 | title = Field() 7 | price = Field() 8 | description = Field() 9 | address = Field() 10 | image_urls = Field() 11 | 12 | # Calculated fields 13 | images = 
Field() 14 | location = Field() 15 | 16 | # Housekeeping fields 17 | url = Field() 18 | project = Field() 19 | spider = Field() 20 | server = Field() 21 | date = Field() 22 | -------------------------------------------------------------------------------- /ch05/properties/properties/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | class PropertiesPipeline(object): 8 | def process_item(self, item, spider): 9 | return item 10 | -------------------------------------------------------------------------------- /ch05/properties/properties/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for properties project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'properties' 10 | 11 | SPIDER_MODULES = ['properties.spiders'] 12 | NEWSPIDER_MODULE = 'properties.spiders' 13 | 14 | # Crawl responsibly by identifying yourself (and your website) on 15 | # the user-agent 16 | #USER_AGENT = 'properties (+http://www.yourdomain.com)' 17 | 18 | # Disable S3 19 | AWS_ACCESS_KEY_ID = "" 20 | AWS_SECRET_ACCESS_KEY = "" 21 | -------------------------------------------------------------------------------- /ch05/properties/properties/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ch05/properties/properties/spiders/api.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | import scrapy 5 | import json 6 | 7 | from scrapy.loader.processors import MapCompose, Join 8 | from scrapy.loader import ItemLoader 9 | from scrapy.http import Request 10 | 11 | from properties.items import PropertiesItem 12 | 13 | 14 | class ApiSpider(scrapy.Spider): 15 | name = 'api' 16 | allowed_domains = ["web"] 17 | 18 | # Start on the first index page 19 | start_urls = ( 20 | 'http://web:9312/properties/api.json', 21 | ) 22 | 23 | # Format the URLs based on the API call response 24 | def parse(self, response): 25 | base_url = "http://web:9312/properties/" 26 | js = json.loads(response.body) 27 | for item in js: 28 | id = item["id"] 29 | title = item["title"] 30 | url = base_url + "property_%06d.html" % id 31 | yield Request(url, meta={"title": title}, callback=self.parse_item) 32 | 33 | def parse_item(self, response): 34 | """ This function parses a property page. 
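        (Note that the title is not re-extracted from the page; parse() stored
        it in response.meta['title'] when scheduling this request, which is the
        usual way to pass data between callbacks.)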
35 | 36 | @url http://web:9312/properties/property_000000.html 37 | @returns items 1 38 | @scrapes title price description address image_urls 39 | @scrapes url project spider server date 40 | """ 41 | 42 | # Create the loader using the response 43 | l = ItemLoader(item=PropertiesItem(), response=response) 44 | 45 | # Load fields using XPath expressions 46 | l.add_value('title', response.meta['title'], 47 | MapCompose(unicode.strip, unicode.title)) 48 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 49 | MapCompose(lambda i: i.replace(',', ''), float), 50 | re='[,.0-9]+') 51 | l.add_xpath('description', '//*[@itemprop="description"][1]/text()', 52 | MapCompose(unicode.strip), Join()) 53 | l.add_xpath('address', 54 | '//*[@itemtype="http://schema.org/Place"][1]/text()', 55 | MapCompose(unicode.strip)) 56 | l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 57 | MapCompose(lambda i: urlparse.urljoin(response.url, i))) 58 | 59 | # Housekeeping fields 60 | l.add_value('url', response.url) 61 | l.add_value('project', self.settings.get('BOT_NAME')) 62 | l.add_value('spider', self.name) 63 | l.add_value('server', socket.gethostname()) 64 | l.add_value('date', datetime.datetime.now()) 65 | 66 | return l.load_item() 67 | -------------------------------------------------------------------------------- /ch05/properties/properties/spiders/fast.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | import scrapy 5 | 6 | from scrapy.loader.processors import MapCompose, Join 7 | from scrapy.loader import ItemLoader 8 | from scrapy.http import Request 9 | 10 | from properties.items import PropertiesItem 11 | 12 | 13 | class FastSpider(scrapy.Spider): 14 | name = 'fast' 15 | allowed_domains = ["web"] 16 | 17 | # Start on the first index page 18 | start_urls = ( 19 | 'http://web:9312/properties/index_00000.html', 20 | ) 21 | 22 | def parse(self, response): 23 | # Get the next index URLs and yield Requests 24 | next_selector = response.xpath('//*[contains(@class,"next")]//@href') 25 | for url in next_selector.extract(): 26 | yield Request(urlparse.urljoin(response.url, url)) 27 | 28 | # Iterate through products and create PropertiesItems 29 | selectors = response.xpath( 30 | '//*[@itemtype="http://schema.org/Product"]') 31 | for selector in selectors: 32 | yield self.parse_item(selector, response) 33 | 34 | def parse_item(self, selector, response): 35 | # Create the loader using the selector 36 | l = ItemLoader(item=PropertiesItem(), selector=selector) 37 | 38 | # Load fields using XPath expressions 39 | l.add_xpath('title', './/*[@itemprop="name"][1]/text()', 40 | MapCompose(unicode.strip, unicode.title)) 41 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 42 | MapCompose(lambda i: i.replace(',', ''), float), 43 | re='[,.0-9]+') 44 | l.add_xpath('description', 45 | './/*[@itemprop="description"][1]/text()', 46 | MapCompose(unicode.strip), Join()) 47 | l.add_xpath('address', 48 | './/*[@itemtype="http://schema.org/Place"]' 49 | '[1]/*/text()', 50 | MapCompose(unicode.strip)) 51 | make_url = lambda i: urlparse.urljoin(response.url, i) 52 | l.add_xpath('image_urls', './/*[@itemprop="image"][1]/@src', 53 | MapCompose(make_url)) 54 | 55 | # Housekeeping fields 56 | l.add_xpath('url', './/*[@itemprop="url"][1]/@href', 57 | MapCompose(make_url)) 58 | l.add_value('project', self.settings.get('BOT_NAME')) 59 | l.add_value('spider', self.name) 60 | l.add_value('server', socket.gethostname()) 61 | 
l.add_value('date', datetime.datetime.now()) 62 | 63 | return l.load_item() 64 | -------------------------------------------------------------------------------- /ch05/properties/properties/spiders/login.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | 5 | from scrapy.loader.processors import MapCompose, Join 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from scrapy.loader import ItemLoader 9 | from scrapy.http import FormRequest 10 | 11 | from properties.items import PropertiesItem 12 | 13 | 14 | class LoginSpider(CrawlSpider): 15 | name = 'login' 16 | allowed_domains = ["web"] 17 | 18 | # Start with a login request 19 | def start_requests(self): 20 | return [ 21 | FormRequest( 22 | "http://web:9312/dynamic/login", 23 | formdata={"user": "user", "pass": "pass"} 24 | )] 25 | 26 | # Rules for horizontal and vertical crawling 27 | rules = ( 28 | Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')), 29 | Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'), 30 | callback='parse_item') 31 | ) 32 | 33 | def parse_item(self, response): 34 | """ This function parses a property page. 35 | 36 | @url http://web:9312/properties/property_000000.html 37 | @returns items 1 38 | @scrapes title price description address image_urls 39 | @scrapes url project spider server date 40 | """ 41 | 42 | # Create the loader using the response 43 | l = ItemLoader(item=PropertiesItem(), response=response) 44 | 45 | # Load fields using XPath expressions 46 | l.add_xpath('title', '//*[@itemprop="name"][1]/text()', 47 | MapCompose(unicode.strip, unicode.title)) 48 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 49 | MapCompose(lambda i: i.replace(',', ''), float), 50 | re='[,.0-9]+') 51 | l.add_xpath('description', '//*[@itemprop="description"][1]/text()', 52 | MapCompose(unicode.strip), Join()) 53 | l.add_xpath('address', 54 | '//*[@itemtype="http://schema.org/Place"][1]/text()', 55 | MapCompose(unicode.strip)) 56 | l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 57 | MapCompose(lambda i: urlparse.urljoin(response.url, i))) 58 | 59 | # Housekeeping fields 60 | l.add_value('url', response.url) 61 | l.add_value('project', self.settings.get('BOT_NAME')) 62 | l.add_value('spider', self.name) 63 | l.add_value('server', socket.gethostname()) 64 | l.add_value('date', datetime.datetime.now()) 65 | 66 | return l.load_item() 67 | -------------------------------------------------------------------------------- /ch05/properties/properties/spiders/noncelogin.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | 5 | from scrapy.loader.processors import MapCompose, Join 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from scrapy.loader import ItemLoader 9 | from scrapy.http import Request, FormRequest 10 | 11 | from properties.items import PropertiesItem 12 | 13 | 14 | class NonceLoginSpider(CrawlSpider): 15 | name = 'noncelogin' 16 | allowed_domains = ["web"] 17 | 18 | # Start on the welcome page 19 | def start_requests(self): 20 | return [ 21 | Request( 22 | "http://web:9312/dynamic/nonce", 23 | callback=self.parse_welcome) 24 | ] 25 | 26 | # Post welcome page's first form with the given user/pass 27 | def parse_welcome(self, response): 28 | return FormRequest.from_response( 29 | 
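            # from_response() pre-populates the form fields found on the page
            # (including the hidden nonce) and merges in the formdata below.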
response, 30 | formdata={"user": "user", "pass": "pass"} 31 | ) 32 | 33 | # Rules for horizontal and vertical crawling 34 | rules = ( 35 | Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')), 36 | Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'), 37 | callback='parse_item') 38 | ) 39 | 40 | def parse_item(self, response): 41 | """ This function parses a property page. 42 | 43 | @url http://web:9312/properties/property_000000.html 44 | @returns items 1 45 | @scrapes title price description address image_urls 46 | @scrapes url project spider server date 47 | """ 48 | 49 | # Create the loader using the response 50 | l = ItemLoader(item=PropertiesItem(), response=response) 51 | 52 | # Load fields using XPath expressions 53 | l.add_xpath('title', '//*[@itemprop="name"][1]/text()', 54 | MapCompose(unicode.strip, unicode.title)) 55 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 56 | MapCompose(lambda i: i.replace(',', ''), float), 57 | re='[,.0-9]+') 58 | l.add_xpath('description', '//*[@itemprop="description"][1]/text()', 59 | MapCompose(unicode.strip), Join()) 60 | l.add_xpath('address', 61 | '//*[@itemtype="http://schema.org/Place"][1]/text()', 62 | MapCompose(unicode.strip)) 63 | l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 64 | MapCompose(lambda i: urlparse.urljoin(response.url, i))) 65 | 66 | # Housekeeping fields 67 | l.add_value('url', response.url) 68 | l.add_value('project', self.settings.get('BOT_NAME')) 69 | l.add_value('spider', self.name) 70 | l.add_value('server', socket.gethostname()) 71 | l.add_value('date', datetime.datetime.now()) 72 | 73 | return l.load_item() 74 | -------------------------------------------------------------------------------- /ch05/properties/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = properties.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = properties 12 | -------------------------------------------------------------------------------- /ch06/properties/properties/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch06/properties/properties/__init__.py -------------------------------------------------------------------------------- /ch06/properties/properties/items.py: -------------------------------------------------------------------------------- 1 | from scrapy.item import Item, Field 2 | 3 | 4 | class PropertiesItem(Item): 5 | # Primary fields 6 | title = Field() 7 | price = Field() 8 | description = Field() 9 | address = Field() 10 | image_urls = Field() 11 | 12 | # Calculated fields 13 | images = Field() 14 | location = Field() 15 | 16 | # Housekeeping fields 17 | url = Field() 18 | project = Field() 19 | spider = Field() 20 | server = Field() 21 | date = Field() 22 | -------------------------------------------------------------------------------- /ch06/properties/properties/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | class 
PropertiesPipeline(object): 8 | def process_item(self, item, spider): 9 | return item 10 | -------------------------------------------------------------------------------- /ch06/properties/properties/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for properties project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'properties' 10 | 11 | SPIDER_MODULES = ['properties.spiders'] 12 | NEWSPIDER_MODULE = 'properties.spiders' 13 | 14 | # Crawl responsibly by identifying yourself (and your website) on 15 | # the user-agent 16 | #USER_AGENT = 'properties (+http://www.yourdomain.com)' 17 | 18 | # Disable S3 19 | AWS_ACCESS_KEY_ID = "" 20 | AWS_SECRET_ACCESS_KEY = "" 21 | -------------------------------------------------------------------------------- /ch06/properties/properties/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ch06/properties/properties/spiders/tomobile.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | 5 | from scrapy.loader.processors import MapCompose, Join 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from scrapy.loader import ItemLoader 9 | 10 | from properties.items import PropertiesItem 11 | 12 | 13 | class ToMobileSpider(CrawlSpider): 14 | name = 'tomobile' 15 | allowed_domains = ["scrapybook.s3.amazonaws.com"] 16 | 17 | # Start on the first index page 18 | start_urls = ( 19 | 'http://scrapybook.s3.amazonaws.com/properties/index_00000.html', 20 | ) 21 | 22 | # Rules for horizontal and vertical crawling 23 | rules = ( 24 | Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')), 25 | Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'), 26 | callback='parse_item') 27 | ) 28 | 29 | def parse_item(self, response): 30 | """ This function parses a property page. 
31 | 32 | @url http://scrapybook.s3.amazonaws.com/properties/property_000000.html 33 | @returns items 1 34 | @scrapes title price description address image_urls 35 | @scrapes url project spider server date 36 | """ 37 | 38 | # Create the loader using the response 39 | l = ItemLoader(item=PropertiesItem(), response=response) 40 | 41 | # Load fields using XPath expressions 42 | l.add_xpath('title', '//*[@itemprop="name"][1]/text()', 43 | MapCompose(unicode.strip, unicode.title)) 44 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 45 | MapCompose(lambda i: i.replace(',', ''), float), 46 | re='[,.0-9]+') 47 | l.add_xpath('description', '//*[@itemprop="description"][1]/text()', 48 | MapCompose(unicode.strip), Join()) 49 | l.add_xpath('address', 50 | '//*[@itemtype="http://schema.org/Place"][1]/text()', 51 | MapCompose(unicode.strip)) 52 | l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 53 | MapCompose(lambda i: urlparse.urljoin(response.url, i))) 54 | 55 | # Housekeeping fields 56 | l.add_value('url', response.url) 57 | l.add_value('project', self.settings.get('BOT_NAME')) 58 | l.add_value('spider', self.name) 59 | l.add_value('server', socket.gethostname()) 60 | l.add_value('date', datetime.datetime.now()) 61 | 62 | return l.load_item() 63 | -------------------------------------------------------------------------------- /ch06/properties/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = properties.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = properties 12 | -------------------------------------------------------------------------------- /ch07/properties/properties/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch07/properties/properties/__init__.py -------------------------------------------------------------------------------- /ch07/properties/properties/hi.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | 4 | class Command(ScrapyCommand): 5 | default_settings = {'LOG_ENABLED': False} 6 | 7 | def run(self, args, opts): 8 | print("hello") 9 | -------------------------------------------------------------------------------- /ch07/properties/properties/items.py: -------------------------------------------------------------------------------- 1 | from scrapy.item import Item, Field 2 | 3 | 4 | class PropertiesItem(Item): 5 | # Primary fields 6 | title = Field() 7 | price = Field() 8 | description = Field() 9 | address = Field() 10 | image_urls = Field() 11 | 12 | # Calculated fields 13 | images = Field() 14 | location = Field() 15 | 16 | # Housekeeping fields 17 | url = Field() 18 | project = Field() 19 | spider = Field() 20 | server = Field() 21 | date = Field() 22 | -------------------------------------------------------------------------------- /ch07/properties/properties/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | 
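# Image downloading in this chapter is handled by the built-in ImagesPipeline
# enabled in settings.py; the pipeline below just passes items through.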
class PropertiesPipeline(object): 8 | def process_item(self, item, spider): 9 | return item 10 | -------------------------------------------------------------------------------- /ch07/properties/properties/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for properties project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'properties' 10 | 11 | SPIDER_MODULES = ['properties.spiders'] 12 | NEWSPIDER_MODULE = 'properties.spiders' 13 | 14 | # Crawl responsibly by identifying yourself (and your website) on 15 | # the user-agent 16 | #USER_AGENT = 'properties (+http://www.yourdomain.com)' 17 | 18 | ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1} 19 | IMAGES_STORE = 'images' 20 | IMAGES_THUMBS = {'small': (30, 30)} 21 | 22 | COMMANDS_MODULE = 'properties.hi' 23 | 24 | # Disable S3 25 | AWS_ACCESS_KEY_ID = "" 26 | AWS_SECRET_ACCESS_KEY = "" 27 | -------------------------------------------------------------------------------- /ch07/properties/properties/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ch07/properties/properties/spiders/fast.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | import scrapy 5 | 6 | from scrapy.loader.processors import MapCompose, Join 7 | from scrapy.loader import ItemLoader 8 | from scrapy.http import Request 9 | 10 | from properties.items import PropertiesItem 11 | 12 | 13 | class BasicSpider(scrapy.Spider): 14 | name = 'fast' 15 | allowed_domains = ["web"] 16 | 17 | # Start on the first index page 18 | start_urls = ( 19 | 'http://web:9312/properties/index_00000.html', 20 | ) 21 | 22 | def parse(self, response): 23 | # Get the next index URLs and yield Requests 24 | next_selector = response.xpath('//*[contains(@class,"next")]//@href') 25 | for url in next_selector.extract(): 26 | yield Request(urlparse.urljoin(response.url, url)) 27 | 28 | # Iterate through products and create PropertiesItems 29 | selectors = response.xpath( 30 | '//*[@itemtype="http://schema.org/Product"]') 31 | for selector in selectors: 32 | yield self.parse_item(selector, response) 33 | 34 | def parse_item(self, selector, response): 35 | # Create the loader using the selector 36 | l = ItemLoader(item=PropertiesItem(), selector=selector) 37 | 38 | # Load fields using XPath expressions 39 | l.add_xpath('title', './/*[@itemprop="name"][1]/text()', 40 | MapCompose(unicode.strip, unicode.title)) 41 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 42 | MapCompose(lambda i: i.replace(',', ''), float), 43 | re='[,.0-9]+') 44 | l.add_xpath('description', 45 | './/*[@itemprop="description"][1]/text()', 46 | MapCompose(unicode.strip), Join()) 47 | l.add_xpath('address', 48 | './/*[@itemtype="http://schema.org/Place"]' 49 | '[1]/*/text()', 50 | MapCompose(unicode.strip)) 51 | make_url = lambda i: urlparse.urljoin(response.url, i) 52 | l.add_xpath('image_urls', './/*[@itemprop="image"][1]/@src', 53 | MapCompose(make_url)) 54 | 55 
| # Housekeeping fields 56 | l.add_xpath('url', './/*[@itemprop="url"][1]/@href', 57 | MapCompose(make_url)) 58 | l.add_value('project', self.settings.get('BOT_NAME')) 59 | l.add_value('spider', self.name) 60 | l.add_value('server', socket.gethostname()) 61 | l.add_value('date', datetime.datetime.now()) 62 | 63 | return l.load_item() 64 | -------------------------------------------------------------------------------- /ch07/properties/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = properties.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = properties 12 | -------------------------------------------------------------------------------- /ch08/deferreds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ~*~ Understanding Twisted deferreds ~*~ 4 | 5 | 6 | def example0(): 7 | from twisted.internet import defer 8 | 9 | # Experiment 1 10 | d = defer.Deferred() 11 | d.called 12 | d.callback(3) 13 | d.called 14 | d.result 15 | 16 | # Experiment 2 17 | d = defer.Deferred() 18 | 19 | def foo(v): 20 | print "foo called" 21 | return v+1 22 | 23 | d.addCallback(foo) 24 | d.called 25 | d.callback(3) 26 | d.called 27 | d.result 28 | 29 | # Experiment 3 30 | def status(*ds): 31 | return [(getattr(d, 'result', "N/A"), len(d.callbacks)) for d in ds] 32 | 33 | def b_callback(arg): 34 | print "b_callback called with arg =", arg 35 | return b 36 | 37 | def on_done(arg): 38 | print "on_done called with arg =", arg 39 | return arg 40 | 41 | # Experiment 3.a 42 | a = defer.Deferred() 43 | b = defer.Deferred() 44 | 45 | a.addCallback(b_callback).addCallback(on_done) 46 | 47 | status(a, b) 48 | 49 | a.callback(3) 50 | 51 | status(a, b) 52 | 53 | b.callback(4) 54 | 55 | status(a, b) 56 | 57 | # Experiment 3.b 58 | a = defer.Deferred() 59 | b = defer.Deferred() 60 | 61 | a.addCallback(b_callback).addCallback(on_done) 62 | 63 | b.callback(4) 64 | 65 | status(a, b) 66 | 67 | a.callback(3) 68 | 69 | status(a, b) 70 | 71 | # Experiment 4 72 | deferreds = [defer.Deferred() for i in xrange(5)] 73 | join = defer.DeferredList(deferreds) 74 | join.addCallback(on_done) 75 | for i in xrange(4): 76 | deferreds[i].callback(i) 77 | 78 | deferreds[4].callback(4) 79 | 80 | from time import sleep 81 | 82 | 83 | # ~*~ Twisted - A Python tale ~*~ 84 | 85 | 86 | # Hello, I'm a developer and I mainly setup Wordpress. 87 | def install_wordpress(customer): 88 | # Our hosting company Threads Ltd. sucks. I start installation and... 89 | print "Start installation for", customer 90 | 91 | # ...then wait till the installation finishes successfully. It is boring 92 | # and I'm spending most of my time waiting while consuming resources 93 | # (memory and some CPU cycles). It's because the process is *blocking*. 94 | sleep(3) 95 | 96 | print "All done for", customer 97 | 98 | 99 | def example1(): 100 | # I do this all day long for our customers 101 | def developer_day(customers): 102 | for customer in customers: 103 | install_wordpress(customer) 104 | 105 | developer_day(["Bill", "Elon", "Steve", "Mark"]) 106 | 107 | 108 | def example2(): 109 | import threading 110 | 111 | # The company grew. We now have many customers and I can't handle the 112 | # workload. 
We are now 5 developers doing exactly the same thing. 113 | def developers_day(customers): 114 | # But we now have to synchronize... a.k.a. bureaucracy 115 | lock = threading.Lock() 116 | 117 | def dev_day(id): 118 | print "Goodmorning from developer", id 119 | # Yuck - I hate locks... 120 | lock.acquire() 121 | while customers: 122 | customer = customers.pop(0) 123 | lock.release() 124 | # My Python is less readable 125 | install_wordpress(customer) 126 | lock.acquire() 127 | lock.release() 128 | print "Bye from developer", id 129 | 130 | # We go to work in the morning 131 | devs = [threading.Thread(target=dev_day, args=(i,)) for i in range(5)] 132 | [dev.start() for dev in devs] 133 | # We leave for the evening 134 | [dev.join() for dev in devs] 135 | 136 | # We now get more done in the same time but our dev process got more 137 | # complex. As we grew we spend more time managing queues than doing dev 138 | # work. We even had occasional deadlocks when processes got extremely 139 | # complex. The fact is that we are still mostly pressing buttons and 140 | # waiting but now we also spend some time in meetings. 141 | developers_day(["Customer %d" % i for i in xrange(15)]) 142 | 143 | # For years we thought this was all there was... We kept hiring more 144 | # developers, more managers and buying servers. We were trying harder 145 | # optimising processes and fire-fighting while getting mediocre performance in 146 | # return. Till luckily one day our hosting company decided to increase their 147 | # fees and we decided to switch to Twisted Ltd.! 148 | 149 | 150 | def example3(): 151 | from twisted.internet import reactor 152 | from twisted.internet import defer 153 | from twisted.internet import task 154 | 155 | # Twisted has a slightly different approach 156 | def schedule_install(customer): 157 | # They are calling us back when a Wordpress installation completes. 158 | # They connected the caller recognition system with our CRM and 159 | # we know exactly what a call is about and what has to be done next. 160 | 161 | # We now design processes of what has to happen on certain events. 162 | def schedule_install_wordpress(): 163 | def on_done(): 164 | print "Callback: Finished installation for", customer 165 | print "Scheduling: Installation for", customer 166 | return task.deferLater(reactor, 3, on_done) 167 | 168 | def all_done(_): 169 | print "All done for", customer 170 | 171 | # For each customer, we schedule these processes on the CRM and that 172 | # is all our chief-Twisted developer has to do 173 | d = schedule_install_wordpress() 174 | d.addCallback(all_done) 175 | 176 | return d 177 | 178 | # Yes, we don't need many developers anymore nor any synchronization. 179 | # ~~ Super-powered Twisted developer ~~ 180 | def twisted_developer_day(customers): 181 | print "Goodmorning from Twisted developer" 182 | 183 | # Here's what has to be done today 184 | work = [schedule_install(customer) for customer in customers] 185 | # Turn off the lights when done 186 | join = defer.DeferredList(work) 187 | join.addCallback(lambda _: reactor.stop()) 188 | 189 | print "Bye from Twisted developer!" 190 | 191 | # Even his day is particularly short! 192 | twisted_developer_day(["Customer %d" % i for i in xrange(15)]) 193 | 194 | # Reactor, our secretary uses the CRM and follows-up on events! 
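    # reactor.run() below blocks until reactor.stop() is called, which
    # happens in the DeferredList callback above once all 15 scheduled
    # installations have completed.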
195 | reactor.run() 196 | 197 | 198 | def example4(): 199 | from twisted.internet import reactor 200 | from twisted.internet import defer 201 | from twisted.internet import task 202 | 203 | # Twisted gave us utilities that make our code way more readable! 204 | @defer.inlineCallbacks 205 | def inline_install(customer): 206 | print "Scheduling: Installation for", customer 207 | yield task.deferLater(reactor, 3, lambda: None) 208 | print "Callback: Finished installation for", customer 209 | 210 | print "All done for", customer 211 | 212 | # Still no developers or synchronization. 213 | # ~~ Super-powered Twisted developer ~~ 214 | def twisted_developer_day(customers): 215 | print "Goodmorning from Twisted developer" 216 | work = [inline_install(customer) for customer in customers] 217 | join = defer.DeferredList(work) 218 | join.addCallback(lambda _: reactor.stop()) 219 | print "Bye from Twisted developer!" 220 | 221 | twisted_developer_day(["Customer %d" % i for i in xrange(15)]) 222 | reactor.run() 223 | 224 | 225 | def example5(): 226 | from twisted.internet import reactor 227 | from twisted.internet import defer 228 | from twisted.internet import task 229 | 230 | @defer.inlineCallbacks 231 | def inline_install(customer): 232 | print "Scheduling: Installation for", customer 233 | yield task.deferLater(reactor, 3, lambda: None) 234 | print "Callback: Finished installation for", customer 235 | print "All done for", customer 236 | 237 | # The new "problem" is that we have to manage all this concurrency to 238 | # avoid causing problems to others, but this is a nice problem to have. 239 | def twisted_developer_day(customers): 240 | print "Goodmorning from Twisted developer" 241 | work = (inline_install(customer) for customer in customers) 242 | 243 | # We use the Cooperator mechanism to make the secretary not service 244 | # more than 5 customers simultaneously. 245 | coop = task.Cooperator() 246 | join = defer.DeferredList([coop.coiterate(work) for i in xrange(5)]) 247 | 248 | join.addCallback(lambda _: reactor.stop()) 249 | print "Bye from Twisted developer!" 250 | 251 | twisted_developer_day(["Customer %d" % i for i in xrange(15)]) 252 | 253 | reactor.run() 254 | 255 | # We are now more lean than ever, our customers happy, our hosting bills 256 | # ridiculously low and our performance stellar. 
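# A minimal extra sketch (hypothetical; not one of the numbered examples
# above and not reachable from the argument parser below): the Cooperator
# pattern from example5 in isolation, with a made-up fake_job() standing
# in for inline_install(). Call it manually to replay just that part.
def cooperator_sketch():
    from twisted.internet import reactor, defer, task

    def fake_job(n):
        print "Scheduling fake job", n
        return task.deferLater(reactor, 1, lambda: None)

    # A generator, so a job is only created when a Cooperator slot frees up
    work = (fake_job(n) for n in xrange(15))

    # At most 5 jobs are in flight at any given time
    coop = task.Cooperator()
    join = defer.DeferredList([coop.coiterate(work) for _ in xrange(5)])
    join.addCallback(lambda _: reactor.stop())

    reactor.run()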
257 | 258 | # ~*~ THE END ~*~ 259 | 260 | if __name__ == "__main__": 261 | # Parsing arguments 262 | import argparse 263 | parser = argparse.ArgumentParser() 264 | parser.add_argument('example', metavar='example', type=int, nargs='?', 265 | choices=xrange(6), help='example to run') 266 | args = parser.parse_args() 267 | 268 | if args.example is not None: 269 | import time 270 | # I know which example to run 271 | print "------ Running example", args.example, "------" 272 | start = time.time() 273 | # Run the appropriate local function 274 | locals()["example%d" % args.example]() 275 | end = time.time() 276 | print "* Elapsed time: %3.2f seconds" % (end - start) 277 | else: 278 | # I don't have arguments, so I run experiments 1-5 one after 279 | # the other 280 | import sys 281 | import subprocess 282 | [subprocess.call([sys.argv[0], str(i)]) for i in xrange(1, 6)] 283 | -------------------------------------------------------------------------------- /ch08/hooksasync/hooksasync/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch08/hooksasync/hooksasync/__init__.py -------------------------------------------------------------------------------- /ch08/hooksasync/hooksasync/extensions.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from scrapy import signals 4 | from scrapy.exceptions import DropItem 5 | 6 | 7 | class HooksasyncExtension(object): 8 | @classmethod 9 | def from_crawler(cls, crawler): 10 | logging.info("HooksasyncExtension from_crawler") 11 | return cls(crawler) 12 | 13 | def __init__(self, crawler): 14 | logging.info("HooksasyncExtension Constructor called") 15 | 16 | # connect the extension object to signals 17 | cs = crawler.signals.connect 18 | cs(self.engine_started, signal=signals.engine_started) 19 | cs(self.engine_stopped, signal=signals.engine_stopped) 20 | cs(self.spider_opened, signal=signals.spider_opened) 21 | cs(self.spider_idle, signal=signals.spider_idle) 22 | cs(self.spider_closed, signal=signals.spider_closed) 23 | cs(self.spider_error, signal=signals.spider_error) 24 | cs(self.request_scheduled, signal=signals.request_scheduled) 25 | cs(self.response_received, signal=signals.response_received) 26 | cs(self.response_downloaded, signal=signals.response_downloaded) 27 | cs(self.item_scraped, signal=signals.item_scraped) 28 | cs(self.item_dropped, signal=signals.item_dropped) 29 | 30 | def engine_started(self): 31 | logging.info("HooksasyncExtension, signals.engine_started fired") 32 | 33 | def engine_stopped(self): 34 | logging.info("HooksasyncExtension, signals.engine_stopped fired") 35 | 36 | def spider_opened(self, spider): 37 | logging.info("HooksasyncExtension, signals.spider_opened fired") 38 | 39 | def spider_idle(self, spider): 40 | logging.info("HooksasyncExtension, signals.spider_idle fired") 41 | 42 | def spider_closed(self, spider, reason): 43 | logging.info("HooksasyncExtension, signals.spider_closed fired") 44 | 45 | def spider_error(self, failure, response, spider): 46 | logging.info("HooksasyncExtension, signals.spider_error fired") 47 | 48 | def request_scheduled(self, request, spider): 49 | logging.info("HooksasyncExtension, signals.request_scheduled fired") 50 | 51 | def response_received(self, response, request, spider): 52 | logging.info("HooksasyncExtension, signals.response_received fired") 53 | 54 | def response_downloaded(self, 
response, request, spider): 55 | logging.info("HooksasyncExtension, signals.response_downloaded fired") 56 | 57 | def item_scraped(self, item, response, spider): 58 | logging.info("HooksasyncExtension, signals.item_scraped fired") 59 | 60 | def item_dropped(self, item, spider, exception): 61 | logging.info("HooksasyncExtension, signals.item_dropped fired") 62 | 63 | @classmethod 64 | def from_settings(cls, settings): 65 | logging.info("HooksasyncExtension from_settings") 66 | # This is never called - but would be called if from_crawler() 67 | # didn't exist. from_crawler() can access the settings via 68 | # crawler.settings but also has access to everything that 69 | # crawler object provides like signals, stats and the ability 70 | # to schedule new requests with crawler.engine.download() 71 | 72 | 73 | class HooksasyncDownloaderMiddleware(object): 74 | """ Downloader middlewares *are* middlewares and as a result can do 75 | everything any middleware can do (see HooksasyncExtension). 76 | The main thing that makes them different are the process_*() methods""" 77 | 78 | @classmethod 79 | def from_crawler(cls, crawler): 80 | logging.info("HooksasyncDownloaderMiddleware from_crawler") 81 | # Here the constructor is actually called and the class returned 82 | return cls(crawler) 83 | 84 | def __init__(self, crawler): 85 | logging.info("HooksasyncDownloaderMiddleware Constructor called") 86 | 87 | def process_request(self, request, spider): 88 | logging.info(("HooksasyncDownloaderMiddleware process_request " 89 | "called for %s") % request.url) 90 | 91 | def process_response(self, request, response, spider): 92 | logging.info(("HooksasyncDownloaderMiddleware process_response " 93 | "called for %s") % request.url) 94 | return response 95 | 96 | def process_exception(self, request, exception, spider): 97 | logging.info(("HooksasyncDownloaderMiddleware process_exception " 98 | "called for %s") % request.url) 99 | 100 | 101 | class HooksasyncSpiderMiddleware(object): 102 | """ Spider middlewares *are* middlewares and as a result can do 103 | everything any middleware can do (see HooksasyncExtension). 104 | The main thing that makes them different are the process_*() methods""" 105 | 106 | @classmethod 107 | def from_crawler(cls, crawler): 108 | logging.info("HooksasyncSpiderMiddleware from_crawler") 109 | # Here the constructor is actually called and the class returned 110 | return cls(crawler) 111 | 112 | def __init__(self, crawler): 113 | logging.info("HooksasyncSpiderMiddleware Constructor called") 114 | 115 | def process_spider_input(self, response, spider): 116 | logging.info(("HooksasyncSpiderMiddleware process_spider_input " 117 | "called for %s") % response.url) 118 | 119 | def process_spider_output(self, response, result, spider): 120 | logging.info(("HooksasyncSpiderMiddleware process_spider_output " 121 | "called for %s") % response.url) 122 | return result 123 | 124 | def process_spider_exception(self, response, exception, spider): 125 | logging.info(("HooksasyncSpiderMiddleware process_spider_exception " 126 | "called for %s") % response.url) 127 | 128 | def process_start_requests(self, start_requests, spider): 129 | logging.info("HooksasyncSpiderMiddleware process_start_requests" 130 | " called") 131 | return start_requests 132 | 133 | 134 | class HooksasyncPipeline(object): 135 | """ Pipelines *are* middlewares and as a result can do 136 | everything any middlewares can do (see HooksasyncExtension). 
137 | The main thing that makes them different is that they have 138 | the process_item() method""" 139 | 140 | @classmethod 141 | def from_crawler(cls, crawler): 142 | logging.info("HooksasyncPipeline from_crawler") 143 | # Here the constructor is actually called and the class returned 144 | return cls(crawler) 145 | 146 | def __init__(self, crawler): 147 | logging.info("HooksasyncPipeline Constructor called") 148 | 149 | def process_item(self, item, spider): 150 | if item['name'] == "Hello 1": 151 | raise DropItem("Not good") 152 | logging.info(("HooksasyncPipeline process_item() called for " 153 | "item: %s") % item['name']) 154 | return item 155 | 156 | # This function overrides the default for Item Pipelines 157 | def open_spider(self, spider): 158 | logging.info("HooksasyncPipeline spider_opened") 159 | 160 | # This function overrides the default for Item Pipelines 161 | def close_spider(self, spider): 162 | logging.info("HooksasyncPipeline spider_closed") 163 | -------------------------------------------------------------------------------- /ch08/hooksasync/hooksasync/settings.py: -------------------------------------------------------------------------------- 1 | BOT_NAME = 'hooksasync' 2 | 3 | SPIDER_MODULES = ['hooksasync.spiders'] 4 | NEWSPIDER_MODULE = 'hooksasync.spiders' 5 | 6 | 7 | EXTENSIONS = {'hooksasync.extensions.HooksasyncExtension': 100} 8 | DOWNLOADER_MIDDLEWARES = { 9 | 'hooksasync.extensions.HooksasyncDownloaderMiddleware': 100 10 | } 11 | 12 | SPIDER_MIDDLEWARES = {'hooksasync.extensions.HooksasyncSpiderMiddleware': 100} 13 | ITEM_PIPELINES = {'hooksasync.extensions.HooksasyncPipeline': 100} 14 | 15 | # Disable S3 16 | AWS_ACCESS_KEY_ID = "" 17 | AWS_SECRET_ACCESS_KEY = "" 18 | -------------------------------------------------------------------------------- /ch08/hooksasync/hooksasync/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
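#
# Usage note (an assumption about how this sample project is meant to be
# exercised, based on the TestSpider defined in test.py below): running
#
#   scrapy crawl test
#
# from the ch08/hooksasync directory should emit a log line for every
# signal handler and process_*() hook registered in extensions.py.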
5 | -------------------------------------------------------------------------------- /ch08/hooksasync/hooksasync/spiders/test.py: -------------------------------------------------------------------------------- 1 | from scrapy.spider import Spider 2 | from scrapy.item import Item, Field 3 | 4 | 5 | class HooksasyncItem(Item): 6 | name = Field() 7 | 8 | 9 | class TestSpider(Spider): 10 | name = "test" 11 | allowed_domains = ["example.com"] 12 | start_urls = ('http://www.example.com',) 13 | 14 | def parse(self, response): 15 | for i in range(2): 16 | item = HooksasyncItem() 17 | item['name'] = "Hello %d" % i 18 | yield item 19 | raise Exception("dead") 20 | -------------------------------------------------------------------------------- /ch08/hooksasync/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = hooksasync.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = hooksasync 12 | -------------------------------------------------------------------------------- /ch08/properties/properties/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /ch08/properties/properties/hi.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | 4 | class Command(ScrapyCommand): 5 | default_settings = {'LOG_ENABLED': False} 6 | 7 | def short_desc(self): 8 | return "Says hi" 9 | 10 | def run(self, args, opts): 11 | print("hi") 12 | -------------------------------------------------------------------------------- /ch08/properties/properties/items.py: -------------------------------------------------------------------------------- 1 | from scrapy.item import Item, Field 2 | 3 | 4 | class PropertiesItem(Item): 5 | # Primary fields 6 | title = Field() 7 | price = Field() 8 | description = Field() 9 | address = Field() 10 | image_urls = Field() 11 | 12 | # Calculated fields 13 | images = Field() 14 | location = Field() 15 | 16 | # Housekeeping fields 17 | url = Field() 18 | project = Field() 19 | spider = Field() 20 | server = Field() 21 | date = Field() 22 | -------------------------------------------------------------------------------- /ch08/properties/properties/latencies.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | 3 | from scrapy.exceptions import NotConfigured 4 | from twisted.internet import task 5 | from scrapy import signals 6 | 7 | 8 | class Latencies(object): 9 | """ 10 | An extension that measures throughput and latencies. 
11 | """ 12 | @classmethod 13 | def from_crawler(cls, crawler): 14 | return cls(crawler) 15 | 16 | def __init__(self, crawler): 17 | self.crawler = crawler 18 | self.interval = crawler.settings.getfloat('LATENCIES_INTERVAL') 19 | 20 | if not self.interval: 21 | raise NotConfigured 22 | 23 | cs = crawler.signals 24 | cs.connect(self._spider_opened, signal=signals.spider_opened) 25 | cs.connect(self._spider_closed, signal=signals.spider_closed) 26 | cs.connect(self._request_scheduled, signal=signals.request_scheduled) 27 | cs.connect(self._response_received, signal=signals.response_received) 28 | cs.connect(self._item_scraped, signal=signals.item_scraped) 29 | 30 | self.latency, self.proc_latency, self.items = 0, 0, 0 31 | 32 | def _spider_opened(self, spider): 33 | self.task = task.LoopingCall(self._log, spider) 34 | self.task.start(self.interval) 35 | 36 | def _spider_closed(self, spider, reason): 37 | if self.task.running: 38 | self.task.stop() 39 | 40 | def _request_scheduled(self, request, spider): 41 | request.meta['schedule_time'] = time() 42 | 43 | def _response_received(self, response, request, spider): 44 | request.meta['received_time'] = time() 45 | 46 | def _item_scraped(self, item, response, spider): 47 | self.latency += time() - response.meta['schedule_time'] 48 | self.proc_latency += time() - response.meta['received_time'] 49 | self.items += 1 50 | 51 | def _log(self, spider): 52 | irate = float(self.items) / self.interval 53 | latency = self.latency / self.items if self.items else 0 54 | proc_latency = self.proc_latency / self.items if self.items else 0 55 | 56 | spider.logger.info(("Scraped %d items at %.1f items/s, avg latency: " 57 | "%.2f s and avg time in pipelines: %.2f s") % 58 | (self.items, irate, latency, proc_latency)) 59 | 60 | self.latency, self.proc_latency, self.items = 0, 0, 0 61 | -------------------------------------------------------------------------------- /ch08/properties/properties/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch08/properties/properties/pipelines/__init__.py -------------------------------------------------------------------------------- /ch08/properties/properties/pipelines/tidyup.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | 4 | class TidyUp(object): 5 | """A pipeline that does some basic post-processing""" 6 | 7 | def process_item(self, item, spider): 8 | """ 9 | Pipeline's main method. Formats the date as a string. 10 | """ 11 | 12 | item['date'] = map(datetime.isoformat, item['date']) 13 | 14 | return item 15 | -------------------------------------------------------------------------------- /ch08/properties/properties/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for properties project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'properties' 10 | 11 | SPIDER_MODULES = ['properties.spiders'] 12 | NEWSPIDER_MODULE = 'properties.spiders' 13 | 14 | # Crawl responsibly by identifying yourself (and your website) on 15 | # the user-agent 16 | #USER_AGENT = 'properties (+http://www.yourdomain.com)' 17 | 18 | ITEM_PIPELINES = { 19 | 'properties.pipelines.tidyup.TidyUp': 100, 20 | } 21 | 22 | EXTENSIONS = {'properties.latencies.Latencies': 500} 23 | LATENCIES_INTERVAL = 5 24 | 25 | COMMANDS_MODULE = 'properties.hi' 26 | 27 | # Disable S3 28 | AWS_ACCESS_KEY_ID = "" 29 | AWS_SECRET_ACCESS_KEY = "" 30 | -------------------------------------------------------------------------------- /ch08/properties/properties/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ch08/properties/properties/spiders/easy.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | 5 | from scrapy.loader.processors import MapCompose, Join 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from scrapy.loader import ItemLoader 9 | 10 | from properties.items import PropertiesItem 11 | 12 | 13 | class EasySpider(CrawlSpider): 14 | name = 'easy' 15 | allowed_domains = ["web"] 16 | 17 | # Start on the first index page 18 | start_urls = ( 19 | 'http://web:9312/properties/index_00000.html', 20 | ) 21 | 22 | # Rules for horizontal and vertical crawling 23 | rules = ( 24 | Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')), 25 | Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'), 26 | callback='parse_item') 27 | ) 28 | 29 | def parse_item(self, response): 30 | """ This function parses a property page. 
31 | 32 | @url http://web:9312/properties/property_000000.html 33 | @returns items 1 34 | @scrapes title price description address image_urls 35 | @scrapes url project spider server date 36 | """ 37 | 38 | # Create the loader using the response 39 | l = ItemLoader(item=PropertiesItem(), response=response) 40 | 41 | # Load fields using XPath expressions 42 | l.add_xpath('title', '//*[@itemprop="name"][1]/text()', 43 | MapCompose(unicode.strip, unicode.title)) 44 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 45 | MapCompose(lambda i: i.replace(',', ''), float), 46 | re='[,.0-9]+') 47 | l.add_xpath('description', '//*[@itemprop="description"][1]/text()', 48 | MapCompose(unicode.strip), Join()) 49 | l.add_xpath('address', 50 | '//*[@itemtype="http://schema.org/Place"][1]/text()', 51 | MapCompose(unicode.strip)) 52 | l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 53 | MapCompose(lambda i: urlparse.urljoin(response.url, i))) 54 | 55 | # Housekeeping fields 56 | l.add_value('url', response.url) 57 | l.add_value('project', self.settings.get('BOT_NAME')) 58 | l.add_value('spider', self.name) 59 | l.add_value('server', socket.gethostname()) 60 | l.add_value('date', datetime.datetime.now()) 61 | 62 | return l.load_item() 63 | -------------------------------------------------------------------------------- /ch08/properties/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = properties.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = properties 12 | -------------------------------------------------------------------------------- /ch09/properties/properties/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch09/properties/properties/__init__.py -------------------------------------------------------------------------------- /ch09/properties/properties/items.py: -------------------------------------------------------------------------------- 1 | from scrapy.item import Item, Field 2 | 3 | 4 | class PropertiesItem(Item): 5 | # Primary fields 6 | title = Field() 7 | price = Field() 8 | description = Field() 9 | address = Field() 10 | image_urls = Field() 11 | 12 | # Calculated fields 13 | images = Field() 14 | location = Field() 15 | 16 | # Housekeeping fields 17 | url = Field() 18 | project = Field() 19 | spider = Field() 20 | server = Field() 21 | date = Field() 22 | -------------------------------------------------------------------------------- /ch09/properties/properties/latencies.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | 3 | from scrapy.exceptions import NotConfigured 4 | from twisted.internet import task 5 | from scrapy import signals 6 | 7 | 8 | class Latencies(object): 9 | """ 10 | An extension that measures throughput and latencies. 
11 | """ 12 | @classmethod 13 | def from_crawler(cls, crawler): 14 | return cls(crawler) 15 | 16 | def __init__(self, crawler): 17 | self.crawler = crawler 18 | self.interval = crawler.settings.getfloat('LATENCIES_INTERVAL') 19 | 20 | if not self.interval: 21 | raise NotConfigured 22 | 23 | cs = crawler.signals 24 | cs.connect(self._spider_opened, signal=signals.spider_opened) 25 | cs.connect(self._spider_closed, signal=signals.spider_closed) 26 | cs.connect(self._request_scheduled, signal=signals.request_scheduled) 27 | cs.connect(self._response_received, signal=signals.response_received) 28 | cs.connect(self._item_scraped, signal=signals.item_scraped) 29 | 30 | self.latency, self.proc_latency, self.items = 0, 0, 0 31 | 32 | def _spider_opened(self, spider): 33 | self.task = task.LoopingCall(self._log, spider) 34 | self.task.start(self.interval) 35 | 36 | def _spider_closed(self, spider, reason): 37 | if self.task.running: 38 | self.task.stop() 39 | 40 | def _request_scheduled(self, request, spider): 41 | request.meta['schedule_time'] = time() 42 | 43 | def _response_received(self, response, request, spider): 44 | request.meta['received_time'] = time() 45 | 46 | def _item_scraped(self, item, response, spider): 47 | self.latency += time() - response.meta['schedule_time'] 48 | self.proc_latency += time() - response.meta['received_time'] 49 | self.items += 1 50 | 51 | def _log(self, spider): 52 | irate = float(self.items) / self.interval 53 | latency = self.latency / self.items if self.items else 0 54 | proc_latency = self.proc_latency / self.items if self.items else 0 55 | 56 | spider.logger.info(("Scraped %d items at %.1f items/s, avg latency: " 57 | "%.2f s and avg time in pipelines: %.2f s") % 58 | (self.items, irate, latency, proc_latency)) 59 | 60 | self.latency, self.proc_latency, self.items = 0, 0, 0 61 | -------------------------------------------------------------------------------- /ch09/properties/properties/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch09/properties/properties/pipelines/__init__.py -------------------------------------------------------------------------------- /ch09/properties/properties/pipelines/computation.py: -------------------------------------------------------------------------------- 1 | import time 2 | import threading 3 | 4 | from twisted.internet import reactor, defer 5 | 6 | 7 | class UsingBlocking(object): 8 | """A pipeline that fakes some computation or blocking calls""" 9 | 10 | def __init__(self): 11 | """ 12 | This function doesn't need any settings so init just initializes a few 13 | fields 14 | """ 15 | 16 | self.beta, self.delta = 0, 0 17 | self.lock = threading.RLock() 18 | 19 | @defer.inlineCallbacks 20 | def process_item(self, item, spider): 21 | """We defer a function to Twisted rector's thread pool""" 22 | 23 | # Get the price 24 | price = item["price"][0] 25 | 26 | # Call a complex/blocking function in a thread pool 27 | # Note that while this will give you some performance boost 28 | # it's still subject to GIL and likely won't make the most 29 | # out of systems with multiple CPUs/cores. 30 | # Consider Twisted's spawnProcess() (example in legacy.py) 31 | # or crafting a custom solution around Python's 32 | # multiprocessing.Process to make the most out of your 33 | # cores for CPU intensive tasks. 
Also consider doing this 34 | # processing as a batch post-processing step as shown in Chapter 11. 35 | out = defer.Deferred() 36 | reactor.callInThread(self._do_calculation, price, out) 37 | 38 | # Yield out to get the result and replace the price with it 39 | item["price"][0] = yield out 40 | 41 | # Return the item to the next stage 42 | defer.returnValue(item) 43 | 44 | def _do_calculation(self, price, out): 45 | """ 46 | This is a slow calculation. Notice that it uses locks to protect a 47 | global state. If you don't use locks and you have global state, your 48 | will end up with corrupted data 49 | """ 50 | 51 | # Use a lock to protect the critical section 52 | with self.lock: 53 | # Faking a complex calculation that uses global state 54 | self.beta += 1 55 | # Hold the lock for as little time as possible. Here by sleeping 56 | # for 1ms we make data corruption in case you don't hold the lock 57 | # more likely 58 | time.sleep(0.001) 59 | self.delta += 1 60 | new_price = price + self.beta - self.delta + 1 61 | 62 | # Using our "complex" calculation, the end-value must remain the same 63 | assert abs(new_price - price - 1) < 0.01, "%f!=%f" % (new_price, price) 64 | 65 | # Do some calculations that don't require global state... 66 | time.sleep(0.10) 67 | 68 | # We enqueue processing the value to the main (reactor's) thread 69 | reactor.callFromThread(out.callback, new_price) 70 | -------------------------------------------------------------------------------- /ch09/properties/properties/pipelines/es.py: -------------------------------------------------------------------------------- 1 | import json 2 | import traceback 3 | 4 | import treq 5 | 6 | from urllib import quote 7 | from twisted.internet import defer 8 | from scrapy.exceptions import NotConfigured 9 | from twisted.internet.error import ConnectError 10 | from twisted.internet.error import ConnectingCancelledError 11 | 12 | 13 | class EsWriter(object): 14 | """A pipeline that writes to Elastic Search""" 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | """Create a new instance and pass it ES's url""" 19 | 20 | # Get Elastic Search URL 21 | es_url = crawler.settings.get('ES_PIPELINE_URL', None) 22 | 23 | # If doesn't exist, disable 24 | if not es_url: 25 | raise NotConfigured 26 | 27 | return cls(es_url) 28 | 29 | def __init__(self, es_url): 30 | """Store url and initialize error reporting""" 31 | 32 | # Store the url for future reference 33 | self.es_url = es_url 34 | 35 | @defer.inlineCallbacks 36 | def process_item(self, item, spider): 37 | """ 38 | Pipeline's main method. 
Uses inlineCallbacks to do 39 | asynchronous REST requests 40 | """ 41 | try: 42 | # Create a json representation of this item 43 | data = json.dumps(dict(item), ensure_ascii=False).encode("utf-8") 44 | yield treq.post(self.es_url, data, timeout=5) 45 | finally: 46 | # In any case, return the dict for the next stage 47 | defer.returnValue(item) 48 | -------------------------------------------------------------------------------- /ch09/properties/properties/pipelines/geo.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | 3 | import treq 4 | 5 | from twisted.internet import defer 6 | 7 | 8 | class GeoPipeline(object): 9 | """A pipeline that geocodes addresses using Google's API""" 10 | 11 | @classmethod 12 | def from_crawler(cls, crawler): 13 | """Create a new instance and pass it crawler's stats object""" 14 | return cls(crawler.stats) 15 | 16 | def __init__(self, stats): 17 | """Initialize empty cache and stats object""" 18 | self.stats = stats 19 | 20 | @defer.inlineCallbacks 21 | def geocode(self, address): 22 | """ 23 | This method makes a call to Google's geocoding API. You shouldn't 24 | call this more than 5 times per second 25 | """ 26 | 27 | # The url for this API 28 | #endpoint = 'https://maps.googleapis.com/maps/api/geocode/json' 29 | endpoint = 'http://web:9312/maps/api/geocode/json' 30 | 31 | # Do the call 32 | parms = [('address', address), ('sensor', 'false')] 33 | response = yield treq.get(endpoint, params=parms) 34 | 35 | # Decode the response as json 36 | content = yield response.json() 37 | 38 | # If the status isn't ok, return it as a string 39 | if content['status'] != 'OK': 40 | raise Exception('Unexpected status="%s" for address="%s"' % 41 | (content['status'], address)) 42 | 43 | # Extract the address and geo-point and set item's fields 44 | geo = content['results'][0]["geometry"]["location"] 45 | 46 | # Return the final value 47 | defer.returnValue({"lat": geo["lat"], "lon": geo["lng"]}) 48 | 49 | @defer.inlineCallbacks 50 | def process_item(self, item, spider): 51 | """ 52 | Pipeline's main method. Uses inlineCallbacks to do 53 | asynchronous REST requests 54 | """ 55 | 56 | if "location" in item: 57 | # Set by previous step (spider or pipeline). Don't do anything 58 | # apart from increasing stats 59 | self.stats.inc_value('geo_pipeline/already_set') 60 | defer.returnValue(item) 61 | return 62 | 63 | # The item has to have the address field set 64 | assert ("address" in item) and (len(item["address"]) > 0) 65 | 66 | # Extract the address from the item. 
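        # Note that a geocoding failure is deliberately non-fatal: it only
        # increments the geo_pipeline/errors stat and the item moves on to
        # the next stage without a location.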
67 | try: 68 | item["location"] = yield self.geocode(item["address"][0]) 69 | except: 70 | self.stats.inc_value('geo_pipeline/errors') 71 | print traceback.format_exc() 72 | 73 | # Return the item for the next stage 74 | defer.returnValue(item) 75 | -------------------------------------------------------------------------------- /ch09/properties/properties/pipelines/geo2.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | 3 | import treq 4 | 5 | from twisted.internet import defer 6 | from twisted.internet import task 7 | from twisted.internet import reactor 8 | 9 | 10 | class Throttler(object): 11 | """ 12 | A simple throttler helps you limit the number of requests you make 13 | to a limited resource 14 | """ 15 | 16 | def __init__(self, rate): 17 | """It will callback at most ```rate``` enqueued things per second""" 18 | self.queue = [] 19 | self.looping_call = task.LoopingCall(self._allow_one) 20 | self.looping_call.start(1. / float(rate)) 21 | 22 | def stop(self): 23 | """Stop the throttler""" 24 | self.looping_call.stop() 25 | 26 | def throttle(self): 27 | """ 28 | Call this function to get a deferred that will become available 29 | in some point in the future in accordance with the throttling rate 30 | """ 31 | d = defer.Deferred() 32 | self.queue.append(d) 33 | return d 34 | 35 | def _allow_one(self): 36 | """Makes deferred callbacks periodically""" 37 | if self.queue: 38 | self.queue.pop(0).callback(None) 39 | 40 | 41 | class DeferredCache(object): 42 | """ 43 | A cache that always returns a value, an error or a deferred 44 | """ 45 | 46 | def __init__(self, key_not_found_callback): 47 | """Takes as an argument """ 48 | self.records = {} 49 | self.deferreds_waiting = {} 50 | self.key_not_found_callback = key_not_found_callback 51 | 52 | @defer.inlineCallbacks 53 | def find(self, key): 54 | """ 55 | This function either returns something directly from the cache or it 56 | calls ```key_not_found_callback``` to evaluate a value and return it. 57 | Uses deferreds to do this is a non-blocking manner. 58 | """ 59 | # This is the deferred for this call 60 | rv = defer.Deferred() 61 | 62 | if key in self.deferreds_waiting: 63 | # We have other instances waiting for this key. Queue 64 | self.deferreds_waiting[key].append(rv) 65 | else: 66 | # We are the only guy waiting for this key right now. 67 | self.deferreds_waiting[key] = [rv] 68 | 69 | if not key in self.records: 70 | # If we don't have a value for this key we will evaluate it 71 | # using key_not_found_callback. 72 | try: 73 | value = yield self.key_not_found_callback(key) 74 | 75 | # If the evaluation succeeds then the action for this key 76 | # is to call deferred's callback with value as an argument 77 | # (using Python closures) 78 | self.records[key] = lambda d: d.callback(value) 79 | except Exception as e: 80 | # If the evaluation fails with an exception then the 81 | # action for this key is to call deferred's errback with 82 | # the exception as an argument (Python closures again) 83 | self.records[key] = lambda d: d.errback(e) 84 | 85 | # At this point we have an action for this key in self.records 86 | action = self.records[key] 87 | 88 | # Note that due to ```yield key_not_found_callback```, many 89 | # deferreds might have been added in deferreds_waiting[key] in 90 | # the meanwhile 91 | # For each of the deferreds waiting for this key.... 
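        # Note that pop() also removes the waiting list for this key, so any
        # later find() for the same key skips straight to self.records and
        # gets its callback/errback fired immediately.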
92 | for d in self.deferreds_waiting.pop(key): 93 | # ...perform the action later from the reactor thread 94 | reactor.callFromThread(action, d) 95 | 96 | value = yield rv 97 | defer.returnValue(value) 98 | 99 | 100 | class GeoPipeline(object): 101 | """A pipeline that geocodes addresses using Google's API""" 102 | 103 | @classmethod 104 | def from_crawler(cls, crawler): 105 | """Create a new instance and pass it crawler's stats object""" 106 | return cls(crawler.stats) 107 | 108 | def __init__(self, stats): 109 | """Initialize empty cache and stats object""" 110 | self.stats = stats 111 | self.cache = DeferredCache(self.cache_key_not_found_callback) 112 | self.throttler = Throttler(5) # 5 Requests per second 113 | 114 | def close_spider(self, spider): 115 | """Stop the throttler""" 116 | self.throttler.stop() 117 | 118 | @defer.inlineCallbacks 119 | def geocode(self, address): 120 | """ 121 | This method makes a call to Google's geocoding API. You shouldn't 122 | call this more than 5 times per second 123 | """ 124 | 125 | # The url for this API 126 | #endpoint = 'https://maps.googleapis.com/maps/api/geocode/json' 127 | endpoint = 'http://web:9312/maps/api/geocode/json' 128 | 129 | # Do the call 130 | parms = [('address', address), ('sensor', 'false')] 131 | response = yield treq.get(endpoint, params=parms) 132 | 133 | # Decode the response as json 134 | content = yield response.json() 135 | 136 | # If the status isn't ok, return it as a string 137 | if content['status'] != 'OK': 138 | raise Exception('Unexpected status="%s" for address="%s"' % 139 | (content['status'], address)) 140 | 141 | # Extract the address and geo-point and set item's fields 142 | geo = content['results'][0]["geometry"]["location"] 143 | 144 | # Return the final value 145 | defer.returnValue({"lat": geo["lat"], "lon": geo["lng"]}) 146 | 147 | @defer.inlineCallbacks 148 | def cache_key_not_found_callback(self, address): 149 | """ 150 | This method makes an API call while respecting throttling limits. 151 | It also retries attempts that fail due to limits. 152 | """ 153 | self.stats.inc_value('geo_pipeline/misses') 154 | 155 | while True: 156 | # Wait enough to adhere to throttling policies 157 | yield self.throttler.throttle() 158 | 159 | # Do the API call 160 | try: 161 | value = yield self.geocode(address) 162 | defer.returnValue(value) 163 | 164 | # Success 165 | break 166 | except Exception, e: 167 | if 'status="OVER_QUERY_LIMIT"' in str(e): 168 | # Retry in this case 169 | self.stats.inc_value('geo_pipeline/retries') 170 | continue 171 | # Propagate the rest 172 | raise 173 | 174 | @defer.inlineCallbacks 175 | def process_item(self, item, spider): 176 | """ 177 | Pipeline's main method. Uses inlineCallbacks to do 178 | asynchronous REST requests 179 | """ 180 | 181 | if "location" in item: 182 | # Set by previous step (spider or pipeline). Don't do anything 183 | # apart from increasing stats 184 | self.stats.inc_value('geo_pipeline/already_set') 185 | defer.returnValue(item) 186 | return 187 | 188 | # The item has to have the address field set 189 | assert ("address" in item) and (len(item["address"]) > 0) 190 | 191 | # Extract the address from the item. 
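        # Unlike geo.py, the lookup goes through DeferredCache, so the
        # geocoding endpoint is called (via the Throttler) only once per
        # distinct address; repeats are answered from the in-memory cache.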
192 | try: 193 | item["location"] = yield self.cache.find(item["address"][0]) 194 | except: 195 | self.stats.inc_value('geo_pipeline/errors') 196 | print traceback.format_exc() 197 | 198 | # Return the item for the next stage 199 | defer.returnValue(item) 200 | -------------------------------------------------------------------------------- /ch09/properties/properties/pipelines/legacy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from twisted.internet import defer 4 | from twisted.internet import protocol 5 | from twisted.internet import reactor 6 | 7 | 8 | class CommandSlot(protocol.ProcessProtocol): 9 | """A ProcessProtocol that sends prices through a binary""" 10 | 11 | def __init__(self, args): 12 | """Initalizing members and starting a new process""" 13 | 14 | self._current_deferred = None 15 | self._queue = [] 16 | reactor.spawnProcess(self, args[0], args) 17 | 18 | self.logger = logging.getLogger('pricing-pipeline') 19 | 20 | def legacy_calculate(self, price): 21 | """Enqueue a price to be calculated""" 22 | 23 | d = defer.Deferred() 24 | d.addBoth(self._process_done) 25 | self._queue.append((price, d)) 26 | self._try_dispatch_top() 27 | return d 28 | 29 | def _process_done(self, result): 30 | """Called when a calculation completes. It returns the value""" 31 | 32 | self._current_deferred = None 33 | self._try_dispatch_top() 34 | return result 35 | 36 | def _try_dispatch_top(self): 37 | """Starts a new computation by sending a price to the process""" 38 | 39 | if not self._current_deferred and self._queue: 40 | price, d = self._queue.pop(0) 41 | self._current_deferred = d 42 | self.transport.write("%f\n" % price) 43 | 44 | # Overriding from protocol.ProcessProtocol 45 | def outReceived(self, data): 46 | """Called when new output is received""" 47 | self._current_deferred.callback(float(data)) 48 | 49 | def errReceived(self, data): 50 | """Called in case of an error""" 51 | self.logger.error('PID[%r]: %s' % (self.transport.pid, data.rstrip())) 52 | 53 | 54 | class Pricing(object): 55 | """A pipeline that accesses legacy functionality""" 56 | 57 | @classmethod 58 | def from_crawler(cls, crawler): 59 | """Create a new instance from settings""" 60 | 61 | concurrency = crawler.settings.get('LEGACY_CONCURENCY', 16) 62 | default_args = ['properties/pipelines/legacy.sh'] 63 | args = crawler.settings.get('LEGACY_ARGS', default_args) 64 | 65 | return cls(concurrency, args) 66 | 67 | def __init__(self, concurrency, args): 68 | """Init this instance by the settings""" 69 | self.args = args 70 | self.concurrency = concurrency 71 | self.slots = [CommandSlot(self.args) for i in xrange(self.concurrency)] 72 | self.rr = 0 73 | 74 | @defer.inlineCallbacks 75 | def process_item(self, item, spider): 76 | slot = self.slots[self.rr] 77 | 78 | self.rr = (self.rr + 1) % self.concurrency 79 | 80 | item["price"][0] = yield slot.legacy_calculate(item["price"][0]) 81 | 82 | defer.returnValue(item) 83 | -------------------------------------------------------------------------------- /ch09/properties/properties/pipelines/legacy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "" SIGINT 4 | 5 | sleep 3 6 | 7 | while read line 8 | do 9 | # 4 per second 10 | sleep 0.25 11 | awk "BEGIN {print 1.20 * $line}" 12 | done 13 | -------------------------------------------------------------------------------- /ch09/properties/properties/pipelines/mysql.py: 
-------------------------------------------------------------------------------- 1 | import traceback 2 | 3 | import dj_database_url 4 | import MySQLdb 5 | 6 | from twisted.internet import defer 7 | from twisted.enterprise import adbapi 8 | from scrapy.exceptions import NotConfigured 9 | 10 | 11 | class MysqlWriter(object): 12 | """ 13 | A spider that writes to MySQL databases 14 | """ 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | """Retrieves scrapy crawler and accesses pipeline's settings""" 19 | 20 | # Get MySQL URL from settings 21 | mysql_url = crawler.settings.get('MYSQL_PIPELINE_URL', None) 22 | 23 | # If doesn't exist, disable the pipeline 24 | if not mysql_url: 25 | raise NotConfigured 26 | 27 | # Create the class 28 | return cls(mysql_url) 29 | 30 | def __init__(self, mysql_url): 31 | """Opens a MySQL connection pool""" 32 | 33 | # Store the url for future reference 34 | self.mysql_url = mysql_url 35 | # Report connection error only once 36 | self.report_connection_error = True 37 | 38 | # Parse MySQL URL and try to initialize a connection 39 | conn_kwargs = MysqlWriter.parse_mysql_url(mysql_url) 40 | self.dbpool = adbapi.ConnectionPool('MySQLdb', 41 | charset='utf8', 42 | use_unicode=True, 43 | connect_timeout=5, 44 | **conn_kwargs) 45 | 46 | def close_spider(self, spider): 47 | """Discard the database pool on spider close""" 48 | self.dbpool.close() 49 | 50 | @defer.inlineCallbacks 51 | def process_item(self, item, spider): 52 | """Processes the item. Does insert into MySQL""" 53 | 54 | logger = spider.logger 55 | 56 | try: 57 | yield self.dbpool.runInteraction(self.do_replace, item) 58 | except MySQLdb.OperationalError: 59 | if self.report_connection_error: 60 | logger.error("Can't connect to MySQL: %s" % self.mysql_url) 61 | self.report_connection_error = False 62 | except: 63 | print traceback.format_exc() 64 | 65 | # Return the item for the next stage 66 | defer.returnValue(item) 67 | 68 | @staticmethod 69 | def do_replace(tx, item): 70 | """Does the actual REPLACE INTO""" 71 | 72 | sql = """REPLACE INTO properties (url, title, price, description) 73 | VALUES (%s,%s,%s,%s)""" 74 | 75 | args = ( 76 | item["url"][0][:100], 77 | item["title"][0][:30], 78 | item["price"][0], 79 | item["description"][0].replace("\r\n", " ")[:30] 80 | ) 81 | 82 | tx.execute(sql, args) 83 | 84 | @staticmethod 85 | def parse_mysql_url(mysql_url): 86 | """ 87 | Parses mysql url and prepares arguments for 88 | adbapi.ConnectionPool() 89 | """ 90 | 91 | params = dj_database_url.parse(mysql_url) 92 | 93 | conn_kwargs = {} 94 | conn_kwargs['host'] = params['HOST'] 95 | conn_kwargs['user'] = params['USER'] 96 | conn_kwargs['passwd'] = params['PASSWORD'] 97 | conn_kwargs['db'] = params['NAME'] 98 | conn_kwargs['port'] = params['PORT'] 99 | 100 | # Remove items with empty values 101 | conn_kwargs = dict((k, v) for k, v in conn_kwargs.iteritems() if v) 102 | 103 | return conn_kwargs 104 | -------------------------------------------------------------------------------- /ch09/properties/properties/pipelines/redis.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import dj_redis_url 4 | import txredisapi 5 | 6 | from scrapy.exceptions import NotConfigured 7 | from twisted.internet import defer 8 | from scrapy import signals 9 | 10 | 11 | class RedisCache(object): 12 | """A pipeline that uses a Redis server to cache values""" 13 | 14 | @classmethod 15 | def from_crawler(cls, crawler): 16 | """Create a new instance and pass it Redis' 
url and namespace""" 17 | 18 | # Get redis URL 19 | redis_url = crawler.settings.get('REDIS_PIPELINE_URL', None) 20 | 21 | # If doesn't exist, disable 22 | if not redis_url: 23 | raise NotConfigured 24 | 25 | redis_nm = crawler.settings.get('REDIS_PIPELINE_NS', 'ADDRESS_CACHE') 26 | 27 | return cls(crawler, redis_url, redis_nm) 28 | 29 | def __init__(self, crawler, redis_url, redis_nm): 30 | """Store configuration, open connection and register callback""" 31 | 32 | # Store the url and the namespace for future reference 33 | self.redis_url = redis_url 34 | self.redis_nm = redis_nm 35 | 36 | # Report connection error only once 37 | self.report_connection_error = True 38 | 39 | # Parse redis URL and try to initialize a connection 40 | args = RedisCache.parse_redis_url(redis_url) 41 | self.connection = txredisapi.lazyConnectionPool(connectTimeout=5, 42 | replyTimeout=5, 43 | **args) 44 | 45 | # Connect the item_scraped signal 46 | crawler.signals.connect(self.item_scraped, signal=signals.item_scraped) 47 | 48 | @defer.inlineCallbacks 49 | def process_item(self, item, spider): 50 | """Looks up address in redis""" 51 | 52 | logger = spider.logger 53 | 54 | if "location" in item: 55 | # Set by previous step (spider or pipeline). Don't do anything 56 | defer.returnValue(item) 57 | return 58 | 59 | # The item has to have the address field set 60 | assert ("address" in item) and (len(item["address"]) > 0) 61 | 62 | # Extract the address from the item. 63 | address = item["address"][0] 64 | 65 | try: 66 | # Check Redis 67 | key = self.redis_nm + ":" + address 68 | 69 | value = yield self.connection.get(key) 70 | 71 | if value: 72 | # Set the value for this item 73 | item["location"] = json.loads(value) 74 | 75 | except txredisapi.ConnectionError: 76 | if self.report_connection_error: 77 | logger.error("Can't connect to Redis: %s" % self.redis_url) 78 | self.report_connection_error = False 79 | 80 | defer.returnValue(item) 81 | 82 | def item_scraped(self, item, spider): 83 | """ 84 | This function inspects the item after it has gone through every 85 | pipeline stage and if there is some cache value to add it does so. 86 | """ 87 | # Capture and encode the location and the address 88 | try: 89 | location = item["location"] 90 | value = json.dumps(location, ensure_ascii=False) 91 | except KeyError: 92 | return 93 | 94 | # Extract the address from the item. 
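        # This handler runs on the item_scraped signal, i.e. after every
        # pipeline stage has finished, so the location computed further down
        # the pipeline (e.g. by GeoPipeline) is what gets cached in Redis.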
95 | address = item["address"][0] 96 | 97 | key = self.redis_nm + ":" + address 98 | 99 | quiet = lambda failure: failure.trap(txredisapi.ConnectionError) 100 | 101 | # Store it in Redis asynchronously 102 | return self.connection.set(key, value).addErrback(quiet) 103 | 104 | @staticmethod 105 | def parse_redis_url(redis_url): 106 | """ 107 | Parses redis url and prepares arguments for 108 | txredisapi.lazyConnectionPool() 109 | """ 110 | 111 | params = dj_redis_url.parse(redis_url) 112 | 113 | conn_kwargs = {} 114 | conn_kwargs['host'] = params['HOST'] 115 | conn_kwargs['password'] = params['PASSWORD'] 116 | conn_kwargs['dbid'] = params['DB'] 117 | conn_kwargs['port'] = params['PORT'] 118 | 119 | # Remove items with empty values 120 | conn_kwargs = dict((k, v) for k, v in conn_kwargs.iteritems() if v) 121 | 122 | return conn_kwargs 123 | -------------------------------------------------------------------------------- /ch09/properties/properties/pipelines/tidyup.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | 4 | class TidyUp(object): 5 | """A pipeline that does some basic post-processing""" 6 | 7 | def process_item(self, item, spider): 8 | """ 9 | Pipeline's main method. Formats the date as a string. 10 | """ 11 | 12 | item['date'] = map(datetime.isoformat, item['date']) 13 | 14 | return item 15 | -------------------------------------------------------------------------------- /ch09/properties/properties/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for properties project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'properties' 10 | 11 | SPIDER_MODULES = ['properties.spiders'] 12 | NEWSPIDER_MODULE = 'properties.spiders' 13 | 14 | # Crawl responsibly by identifying yourself (and your website) on 15 | # the user-agent 16 | #USER_AGENT = 'properties (+http://www.yourdomain.com)' 17 | 18 | ITEM_PIPELINES = { 19 | 'properties.pipelines.tidyup.TidyUp': 100, 20 | 'properties.pipelines.es.EsWriter': 800, 21 | #'properties.pipelines.geo.GeoPipeline': 400, 22 | 'properties.pipelines.geo2.GeoPipeline': 400, 23 | 'properties.pipelines.mysql.MysqlWriter': 700, 24 | 'properties.pipelines.redis.RedisCache': 300, 25 | 'properties.pipelines.computation.UsingBlocking': 500, 26 | 'properties.pipelines.legacy.Pricing': 600, 27 | } 28 | 29 | EXTENSIONS = {'properties.latencies.Latencies': 500, } 30 | LATENCIES_INTERVAL = 5 31 | 32 | ES_PIPELINE_URL = 'http://es:9200/properties/property/' 33 | 34 | MYSQL_PIPELINE_URL = 'mysql://root:pass@mysql/properties' 35 | 36 | REDIS_PIPELINE_URL = 'redis://redis:6379' 37 | 38 | LOG_LEVEL = "INFO" 39 | 40 | # Disable S3 41 | AWS_ACCESS_KEY_ID = "" 42 | AWS_SECRET_ACCESS_KEY = "" 43 | -------------------------------------------------------------------------------- /ch09/properties/properties/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
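A note on the two `*_PIPELINE_URL` settings above: they are plain connection URLs, and `MysqlWriter.parse_mysql_url()` / `RedisCache.parse_redis_url()` turn them into the keyword arguments that `adbapi.ConnectionPool()` and `txredisapi.lazyConnectionPool()` expect, using `dj_database_url.parse()` and `dj_redis_url.parse()`. The sketch below shows the same idea with only the standard library; `url_to_conn_kwargs` is an illustrative name and is not part of the book's code.

```
# Illustrative only -- the ch09 pipelines use dj_database_url/dj_redis_url.
import urlparse  # Python 2 standard library; urllib.parse on Python 3


def url_to_conn_kwargs(url):
    """Turn mysql://user:pass@host:port/dbname into driver keyword args."""
    parts = urlparse.urlparse(url)
    conn_kwargs = {
        'host': parts.hostname,
        'user': parts.username,
        'passwd': parts.password,
        'db': parts.path.lstrip('/'),
        'port': parts.port,
    }
    # Mirror the pipelines above: drop keys that have no value
    return dict((k, v) for k, v in conn_kwargs.items() if v)


if __name__ == '__main__':
    print(url_to_conn_kwargs('mysql://root:pass@mysql/properties'))
    # {'passwd': 'pass', 'db': 'properties', 'host': 'mysql', 'user': 'root'}
```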
5 | -------------------------------------------------------------------------------- /ch09/properties/properties/spiders/easy.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | 5 | from scrapy.loader.processors import MapCompose, Join 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from scrapy.loader import ItemLoader 9 | 10 | from properties.items import PropertiesItem 11 | 12 | 13 | class EasySpider(CrawlSpider): 14 | name = 'easy' 15 | allowed_domains = ["web"] 16 | 17 | # Start on the first index page 18 | start_urls = ( 19 | 'http://web:9312/properties/index_00000.html', 20 | ) 21 | 22 | # Rules for horizontal and vertical crawling 23 | rules = ( 24 | Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')), 25 | Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'), 26 | callback='parse_item') 27 | ) 28 | 29 | def parse_item(self, response): 30 | """ This function parses a property page. 31 | 32 | @url http://web:9312/properties/property_000000.html 33 | @returns items 1 34 | @scrapes title price description address image_urls 35 | @scrapes url project spider server date 36 | """ 37 | 38 | # Create the loader using the response 39 | l = ItemLoader(item=PropertiesItem(), response=response) 40 | 41 | # Load fields using XPath expressions 42 | l.add_xpath('title', '//*[@itemprop="name"][1]/text()', 43 | MapCompose(unicode.strip, unicode.title)) 44 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 45 | MapCompose(lambda i: i.replace(',', ''), float), 46 | re='[,.0-9]+') 47 | l.add_xpath('description', '//*[@itemprop="description"][1]/text()', 48 | MapCompose(unicode.strip), Join()) 49 | l.add_xpath('address', 50 | '//*[@itemtype="http://schema.org/Place"][1]/text()', 51 | MapCompose(unicode.strip)) 52 | l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 53 | MapCompose(lambda i: urlparse.urljoin(response.url, i))) 54 | 55 | # Housekeeping fields 56 | l.add_value('url', response.url) 57 | l.add_value('project', self.settings.get('BOT_NAME')) 58 | l.add_value('spider', self.name) 59 | l.add_value('server', socket.gethostname()) 60 | l.add_value('date', datetime.datetime.now()) 61 | 62 | return l.load_item() 63 | -------------------------------------------------------------------------------- /ch09/properties/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = properties.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = properties 12 | -------------------------------------------------------------------------------- /ch10/speed/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = speed.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = client 12 | -------------------------------------------------------------------------------- /ch10/speed/speed/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch10/speed/speed/__init__.py -------------------------------------------------------------------------------- /ch10/speed/speed/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'speed' 4 | 5 | SPIDER_MODULES = ['speed.spiders'] 6 | NEWSPIDER_MODULE = 'speed.spiders' 7 | 8 | 9 | #SPEED_PORT = 9312 10 | 11 | ITEM_PIPELINES = {'speed.spiders.speed.DummyPipeline': 100} 12 | EXTENSIONS = {'speed.spiders.speed.PrintCoreMetrics': 500} 13 | 14 | # Disable S3 15 | AWS_ACCESS_KEY_ID = "" 16 | AWS_SECRET_ACCESS_KEY = "" 17 | 18 | # Defaults for high performance 19 | # See http://doc.scrapy.org/en/latest/topics/broad-crawls.html 20 | LOG_LEVEL = "INFO" 21 | COOKIES_ENABLED = False 22 | RETRY_ENABLED = False 23 | DOWNLOAD_TIMEOUT = 15 24 | REDIRECT_ENABLED = False 25 | REACTOR_THREADPOOL_MAXSIZE = 20 26 | AJAXCRAWL_ENABLED = True 27 | DEPTH_PRIORITY = 0 28 | # We simplify the model by essentially 29 | # disabling the per-IP limits. 30 | CONCURRENT_REQUESTS_PER_DOMAIN = 1000000 31 | # As long as you have one item per crawl 32 | # it's better to have this set to 1 33 | CONCURRENT_ITEMS = 1 34 | 35 | # The most key Scrapy setting: 36 | CONCURRENT_REQUESTS = 16 37 | 38 | # Main simulation factors: 39 | SPEED_TOTAL_ITEMS = 5000 40 | SPEED_T_RESPONSE = 0.125 41 | 42 | # *** Spider simulation settings *** 43 | #SPEED_SPIDER_BLOCKING_DELAY = 0 44 | 45 | # *** Pipeline control settings *** 46 | #CONCURRENT_ITEMS = 100 47 | 48 | # *** Pipeline simulation settings *** 49 | #SPEED_PIPELINE_BLOCKING_DELAY = 0.2 50 | #SPEED_PIPELINE_ASYNC_DELAY = 0.2 51 | #SPEED_PIPELINE_API_VIA_DOWNLOADER = 0 52 | #SPEED_PIPELINE_API_VIA_TREQ = 0 53 | 54 | # *** Adjusting crawling style *** 55 | #SPEED_INDEX_POINTAHEAD=1 56 | #SPEED_INDEX_HIGHER_PRIORITY = False 57 | #SPEED_START_REQUESTS_STYLE = 'Force' # or 'UseIndex' or 'Iterate' 58 | #SPEED_DETAILS_PER_INDEX_PAGE = SPEED_TOTAL_ITEMS 59 | #SPEED_ITEMS_PER_DETAIL = 100 60 | #SPEED_DETAIL_EXTRA_SIZE = 0 61 | 62 | # *** Adjusting individual response times *** 63 | #SPEED_API_T_RESPONSE= 0.5 64 | #SPEED_INDEX_T_RESPONSE = 0 65 | #SPEED_DETAIL_T_RESPONSE = 0 66 | 67 | # *** Enable broad search *** 68 | # DEPTH_PRIORITY = 1 69 | # SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue' 70 | # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue' 71 | -------------------------------------------------------------------------------- /ch10/speed/speed/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
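The `SPEED_*` settings above are turned into a URL by `get_base_url()` in `speed/spiders/speed.py` (the next file): every setting that is set becomes a `/key:value` segment of the benchmark URL served by the `web` container on port 9312. Below is a small, self-contained sketch of that encoding; `base_url_for` is an illustrative name, and the `sorted()` only makes the printout deterministic, whereas the real function iterates a dict, so segment order may differ.

```
# Illustrative sketch mirroring get_base_url() in speed/spiders/speed.py.
settings_to_url = {
    'SPEED_TOTAL_ITEMS': 'ti',
    'SPEED_T_RESPONSE': 'rr',
}


def base_url_for(settings, port=9312):
    """Encode every non-empty SPEED_* setting as a /key:value URL segment."""
    args = ''.join('/%s:%s' % (arg, settings[name])
                   for name, arg in sorted(settings_to_url.items())
                   if settings.get(name))
    return 'http://web:%d%s/benchmark/' % (port, args)


if __name__ == '__main__':
    print(base_url_for({'SPEED_TOTAL_ITEMS': 5000, 'SPEED_T_RESPONSE': 0.125}))
    # http://web:9312/ti:5000/rr:0.125/benchmark/
```

Any of these settings can also be overridden per run with Scrapy's `-s` flag, for example `scrapy crawl speed -s SPEED_TOTAL_ITEMS=1000 -s CONCURRENT_REQUESTS=64`.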
5 | -------------------------------------------------------------------------------- /ch10/speed/speed/spiders/speed.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import time 5 | 6 | from treq import post 7 | 8 | from twisted.internet.task import deferLater 9 | from twisted.internet import defer, reactor, task 10 | 11 | import scrapy 12 | 13 | from scrapy import FormRequest 14 | from scrapy.http import Request 15 | from scrapy.linkextractors import LinkExtractor 16 | from scrapy.spiders import CrawlSpider, Rule 17 | from scrapy.exceptions import NotConfigured 18 | from scrapy import signals 19 | 20 | 21 | settings_to_url = { 22 | # Website structure 23 | "SPEED_TOTAL_ITEMS": "ti", 24 | "SPEED_DETAILS_PER_INDEX_PAGE": "dp", 25 | "SPEED_ITEMS_PER_DETAIL": "id", 26 | "SPEED_DETAIL_EXTRA_SIZE": "ds", 27 | "SPEED_INDEX_POINTAHEAD": "ip", 28 | # Response times 29 | "SPEED_T_RESPONSE": "rr", 30 | "SPEED_API_T_RESPONSE": "ar", 31 | "SPEED_DETAIL_T_RESPONSE": "dr", 32 | "SPEED_INDEX_T_RESPONSE": "ir", 33 | } 34 | 35 | 36 | def get_base_url(settings): 37 | port = settings.getint('SPEED_PORT', 9312) 38 | args = [] 39 | for (setting, arg_name) in settings_to_url.iteritems(): 40 | arg_value = settings.get(setting) 41 | # If a setting is set, then reflect that to the URL 42 | if arg_value: 43 | args.append("/%s:%s" % (arg_name, arg_value)) 44 | 45 | return "http://web:%d%s/benchmark/" % (port, "".join(args)) 46 | 47 | 48 | class DummyItem(scrapy.Item): 49 | id = scrapy.Field() 50 | info = scrapy.Field() 51 | translation = scrapy.Field() 52 | 53 | 54 | class SpeedSpider(CrawlSpider): 55 | name = 'speed' 56 | 57 | @classmethod 58 | def from_crawler(cls, crawler, *args, **kwargs): 59 | if crawler.settings.getbool('SPEED_INDEX_RULE_LAST', False): 60 | cls.rules = (cls.rules[1], cls.rules[0]) 61 | 62 | spider = super(SpeedSpider, cls).from_crawler(crawler, *args, **kwargs) 63 | 64 | spider.blocking_delay = crawler.settings.getfloat( 65 | 'SPEED_SPIDER_BLOCKING_DELAY', 0.0) 66 | spider.base = get_base_url(crawler.settings) 67 | 68 | return spider 69 | 70 | def get_detail_requests(self): 71 | items_per_page = self.settings.getint('SPEED_ITEMS_PER_DETAIL', 1) 72 | total_items = self.settings.getint('SPEED_TOTAL_ITEMS', 1000) 73 | 74 | for i in xrange(1, total_items+1, items_per_page): 75 | yield Request(self.base + "detail?id0=%d" % i, 76 | callback=self.parse_item) 77 | 78 | def start_requests(self): 79 | start_requests_style = self.settings.get('SPEED_START_REQUESTS_STYLE', 80 | 'Force') 81 | 82 | if start_requests_style == 'UseIndex': 83 | # The requests out of the index page get processed in the same 84 | # parallel(... CONCURRENT_ITEMS) among regular Items. 85 | index_shards = self.settings.getint('SPEED_INDEX_SHARDS', 1) 86 | 87 | index_pages_count = self.get_index_pages_count() 88 | 89 | # Round up 90 | shard_length = (index_pages_count+index_shards-1) / index_shards 91 | 92 | for i in xrange(1, index_pages_count, shard_length): 93 | url = self.base + "index?p=%d" % i 94 | yield self.make_requests_from_url(url) 95 | 96 | elif start_requests_style == 'Force': 97 | # This is feeding those requests directly into the scheduler's 98 | # queue. 
99 | for request in self.get_detail_requests(): 100 | self.crawler.engine.crawl(request=request, spider=self) 101 | elif start_requests_style == 'Iterate': 102 | # start_requests are consumed "on demand" through yield 103 | for request in self.get_detail_requests(): 104 | yield request 105 | else: 106 | print "No start_requests." 107 | 108 | def get_index_pages_count(self): 109 | details_per_index = self.settings.getint( 110 | 'SPEED_DETAILS_PER_INDEX_PAGE', 20) 111 | items_per_page = self.settings.getint('SPEED_ITEMS_PER_DETAIL', 1) 112 | 113 | page_worth = details_per_index * items_per_page 114 | 115 | total_items = self.settings.getint('SPEED_TOTAL_ITEMS', 1000) 116 | 117 | # Round up 118 | index_pages_count = (total_items + page_worth - 1) / page_worth 119 | 120 | return index_pages_count 121 | 122 | def my_process_request(self, r): 123 | if self.settings.getbool('SPEED_INDEX_HIGHER_PRIORITY', False): 124 | r.priority = 1 125 | return r 126 | 127 | rules = ( 128 | Rule(LinkExtractor(restrict_xpaths='//*[@class="nav"]'), 129 | process_request="my_process_request"), 130 | Rule(LinkExtractor(restrict_xpaths='//*[@class="item"]'), 131 | callback='parse_item') 132 | ) 133 | 134 | def parse_item(self, response): 135 | if self.blocking_delay > 0.001: 136 | # This is a bad bad thing 137 | time.sleep(self.blocking_delay) 138 | 139 | for li in response.xpath('//li'): 140 | i = DummyItem() 141 | id_phrase = li.xpath('.//h3/text()').extract()[0] 142 | i['id'] = int(id_phrase.split()[1]) 143 | i['info'] = li.xpath('.//div[@class="info"]/text()').extract() 144 | yield i 145 | 146 | 147 | class DummyPipeline(object): 148 | 149 | def __init__(self, crawler): 150 | self.crawler = crawler 151 | self.blocking_delay = crawler.settings.getfloat( 152 | 'SPEED_PIPELINE_BLOCKING_DELAY', 0.0) 153 | self.async_delay = crawler.settings.getfloat( 154 | 'SPEED_PIPELINE_ASYNC_DELAY', 0.0) 155 | self.downloader_api = crawler.settings.getbool( 156 | 'SPEED_PIPELINE_API_VIA_DOWNLOADER', False) 157 | self.treq_api = crawler.settings.getbool( 158 | 'SPEED_PIPELINE_API_VIA_TREQ', False) 159 | self.base = get_base_url(crawler.settings) 160 | 161 | @classmethod 162 | def from_crawler(cls, crawler): 163 | return cls(crawler) 164 | 165 | @defer.inlineCallbacks 166 | def process_item(self, item, spider): 167 | # If no processing is made, translation will 168 | # be N/A 169 | item['translation'] = "N/A" 170 | 171 | if self.blocking_delay > 0.001: 172 | # This is a bad bad thing 173 | time.sleep(self.blocking_delay) 174 | 175 | if self.async_delay > 0.001: 176 | # Emulate an asynchronous call to a translation function 177 | delay = self.async_delay 178 | translate = lambda: "calculated-%s" % item['info'] 179 | translation = yield deferLater(reactor, delay, translate) 180 | item['translation'] = translation 181 | 182 | if self.downloader_api: 183 | # Do an API call using Scrapy's downloader 184 | formdata = dict(text=item['info']) 185 | request = FormRequest(self.base + "api", formdata=formdata) 186 | response = yield self.crawler.engine.download(request, spider) 187 | item['translation'] = json.loads(response.body)['translation'] 188 | 189 | if self.treq_api: 190 | # Do an API call using treq 191 | response = yield post(self.base + "api", {"text": item['info']}) 192 | json_response = yield response.json() 193 | item['translation'] = json_response['translation'] 194 | 195 | defer.returnValue(item) 196 | 197 | 198 | class PrintCoreMetrics(object): 199 | """ 200 | An extension that prints "core metrics" 201 | """ 202 | 
@classmethod 203 | def from_crawler(cls, crawler): 204 | return cls(crawler) 205 | 206 | def __init__(self, crawler): 207 | self.crawler = crawler 208 | self.interval = crawler.settings.getfloat('CORE_METRICS_INTERVAL', 1.0) 209 | self.first = True 210 | 211 | if not self.interval: 212 | raise NotConfigured 213 | 214 | cs = crawler.signals 215 | cs.connect(self._spider_opened, signal=signals.spider_opened) 216 | cs.connect(self._spider_closed, signal=signals.spider_closed) 217 | 218 | def _spider_opened(self, spider): 219 | self.task = task.LoopingCall(self._log, spider) 220 | self.task.start(self.interval) 221 | 222 | def _spider_closed(self, spider, reason): 223 | if self.task.running: 224 | self.task.stop() 225 | 226 | def _log(self, spider): 227 | engine = self.crawler.engine 228 | stats = self.crawler.stats 229 | 230 | if self.first: 231 | self.first = False 232 | spider.logger.info(("%8s"*5+"%10s") % ( 233 | "s/edule", 234 | "d/load", 235 | "scrape", 236 | "p/line", 237 | "done", 238 | "mem", 239 | )) 240 | 241 | spider.logger.info(("%8d"*5+"%10d") % ( 242 | len(engine.slot.scheduler.mqs), 243 | len(engine.downloader.active), 244 | len(engine.scraper.slot.active), 245 | engine.scraper.slot.itemproc_size, 246 | stats.get_value('item_scraped_count') or 0, 247 | engine.scraper.slot.active_size 248 | )) 249 | -------------------------------------------------------------------------------- /ch11/boostwords.py: -------------------------------------------------------------------------------- 1 | from pyspark.context import SparkConf, SparkContext, RDD 2 | from pyspark.streaming import StreamingContext 3 | from operator import add 4 | from pprint import pformat 5 | 6 | import unittest 7 | import json 8 | import sys 9 | import time 10 | 11 | 12 | def add_tuples(acc, i): 13 | return tuple(map(add, acc, i)) 14 | 15 | def preprocess(raw_data): 16 | def item_to_keywords_list(item): 17 | words = set(item['title'][0].split()) 18 | return [(k.lower(), (item['price'][0], 1)) for k in words] 19 | 20 | return ( 21 | raw_data.map(lambda line: json.loads(line)) 22 | .flatMap(item_to_keywords_list) 23 | .reduceByKey(add_tuples) 24 | ) 25 | 26 | 27 | def to_shifts(word_prices): 28 | if word_prices.isEmpty(): 29 | return word_prices 30 | 31 | (sum0, cnt0) = word_prices.values().reduce(add_tuples) 32 | avg0 = sum0 / cnt0 33 | 34 | def calculate_shift((isum, icnt)): 35 | if cnt0 == icnt: 36 | return 1.0 37 | else: 38 | avg_with = isum / icnt 39 | avg_without = (sum0 - isum) / (cnt0 - icnt) 40 | return (avg_with - avg_without) / avg0 41 | 42 | return word_prices.mapValues(calculate_shift) 43 | 44 | 45 | class NonStreamTestCase(unittest.TestCase): 46 | 47 | def test_one_line_preprocess(self): 48 | lines = [ 49 | '{"title": ["Split Business Split"], "price": [1.0]}', 50 | ] 51 | 52 | word_prices_rdd = preprocess(sc.parallelize(lines, 1)) 53 | 54 | word_prices = dict(word_prices_rdd.collect()) 55 | 56 | self.assertEqual(2, len(word_prices)) 57 | self.assertAlmostEqual(1, word_prices['business'][0]) 58 | self.assertAlmostEqual(1, word_prices['split'][0]) 59 | self.assertAlmostEqual(1, word_prices['business'][1]) 60 | self.assertAlmostEqual(1, word_prices['split'][1]) 61 | 62 | def test_two_line_preprocess(self): 63 | lines = [ 64 | '{"title": ["Split Business Split"], "price": [1.0]}', 65 | '{"title": ["Need business"], "price": [2.0]}', 66 | ] 67 | 68 | word_prices_rdd = preprocess(sc.parallelize(lines, 1)) 69 | 70 | word_prices = dict(word_prices_rdd.collect()) 71 | 72 | self.assertEqual(3, len(word_prices)) 73 
| self.assertAlmostEqual(2, word_prices['need'][0]) 74 | self.assertAlmostEqual(3, word_prices['business'][0]) 75 | self.assertAlmostEqual(1, word_prices['split'][0]) 76 | self.assertAlmostEqual(1, word_prices['need'][1]) 77 | self.assertAlmostEqual(2, word_prices['business'][1]) 78 | self.assertAlmostEqual(1, word_prices['split'][1]) 79 | 80 | def test_one_line_shifts(self): 81 | lines = [ 82 | '{"title": ["Split Business Split"], "price": [1.0]}', 83 | ] 84 | 85 | word_prices = preprocess(sc.parallelize(lines, 1)) 86 | 87 | shiftsRdd = to_shifts(word_prices) 88 | 89 | shifts = dict(shiftsRdd.collect()) 90 | 91 | self.assertEqual(2, len(shifts)) 92 | self.assertAlmostEqual(0.0, shifts['business']) 93 | self.assertAlmostEqual(0.0, shifts['split']) 94 | 95 | def test_two_line_shifts(self): 96 | lines = [ 97 | '{"title": ["Split Business Split"], "price": [1.0]}', 98 | '{"title": ["Need business"], "price": [2.0]}', 99 | ] 100 | 101 | word_prices = preprocess(sc.parallelize(lines, 1)) 102 | 103 | shiftsRdd = to_shifts(word_prices) 104 | 105 | shifts = dict(shiftsRdd.collect()) 106 | 107 | self.assertEqual(3, len(shifts)) 108 | self.assertAlmostEqual(0.44444444, shifts['need']) 109 | self.assertAlmostEqual(0.0, shifts['business']) 110 | self.assertAlmostEqual(-0.44444444, shifts['split']) 111 | 112 | 113 | class BaseStreamingTestCase(unittest.TestCase): 114 | """ From https://github.com/apache/spark/blob/ 115 | master/python/pyspark/streaming/tests.py """ 116 | 117 | timeout = 10 # seconds 118 | duration = .5 119 | 120 | def setUp(self): 121 | self.ssc = StreamingContext(sc, self.duration) 122 | 123 | def tearDown(self): 124 | self.ssc.stop(False) 125 | 126 | def wait_for(self, result, n): 127 | start_time = time.time() 128 | while len(result) < n and time.time() - start_time < self.timeout: 129 | time.sleep(0.01) 130 | if len(result) < n: 131 | print("timeout after", self.timeout) 132 | 133 | def _collect(self, dstream, n): 134 | result = [] 135 | 136 | def get_output(_, rdd): 137 | if rdd and len(result) < n: 138 | r = rdd.collect() 139 | if r: 140 | result.append(r) 141 | 142 | dstream.foreachRDD(get_output) 143 | 144 | self.ssc.start() 145 | self.wait_for(result, n) 146 | return result 147 | 148 | 149 | def update_state_function(new_values, prev): 150 | return reduce(add_tuples, new_values, prev or (0, 0)) 151 | 152 | 153 | class SmokeStreaming(BaseStreamingTestCase): 154 | 155 | def test_map(self): 156 | """Test streaming operation for the use case above""" 157 | 158 | input = [ 159 | ['{"title": ["Split Business Split"], "price": [1.0]}'], 160 | ['{"title": ["Need business"], "price": [2.0]}'], 161 | ] 162 | 163 | input = [sc.parallelize(d, 1) for d in input] 164 | 165 | raw_data = self.ssc.queueStream(input) 166 | 167 | word_prices = preprocess(raw_data) 168 | 169 | running_word_prices = word_prices.updateStateByKey(update_state_function) 170 | 171 | shifts = running_word_prices.transform(to_shifts) 172 | 173 | output = self._collect(shifts, 2) 174 | 175 | # The first RDD is trivial 176 | shifts = dict(output.pop(0)) 177 | 178 | self.assertEqual(2, len(shifts)) 179 | self.assertAlmostEqual(0.0, shifts['business']) 180 | self.assertAlmostEqual(0.0, shifts['split']) 181 | 182 | # The second RDD includes the values from the first because of 183 | # updateStateByKey(). 
184 | shifts = dict(output.pop(0)) 185 | 186 | self.assertEqual(3, len(shifts)) 187 | self.assertAlmostEqual(0.44444444, shifts['need']) 188 | self.assertAlmostEqual(0.0, shifts['business']) 189 | self.assertAlmostEqual(-0.44444444, shifts['split']) 190 | 191 | def print_shifts(shifts): 192 | print("\033c" + 193 | pformat( 194 | shifts.takeOrdered(5, lambda (k, v): -v) + 195 | ['...'] + 196 | list(reversed(shifts.takeOrdered(5, lambda (k, v): v))) 197 | ) 198 | ) 199 | 200 | def main(ssc, args): 201 | if len(args) < 2: 202 | print "usage: spark-submit book/ch11/boostwords.py file:///root/items" 203 | sys.exit() 204 | 205 | # Monitor the files and give us a DStream of term-price pairs 206 | raw_data = ssc.textFileStream(args[1]) 207 | word_prices = preprocess(raw_data) 208 | 209 | # Update the counters using Spark's updateStateByKey 210 | running_word_prices = word_prices.updateStateByKey(update_state_function) 211 | 212 | # Calculate shifts out of the counters 213 | shifts = running_word_prices.transform(to_shifts) 214 | 215 | # Print the results 216 | shifts.foreachRDD(print_shifts) 217 | 218 | if __name__ == "__main__": 219 | 220 | if len(sys.argv) >= 2 and sys.argv[1] == "test": 221 | # Run the tests 222 | del sys.argv[1] 223 | 224 | conf = SparkConf().set("spark.default.parallelism", 1) 225 | 226 | sc = SparkContext(appName='unit_test', conf=conf) 227 | 228 | sc.setLogLevel("WARN") 229 | 230 | sc.setCheckpointDir("/tmp") 231 | 232 | unittest.main() 233 | 234 | sc.stop() 235 | 236 | else: 237 | # Run the main() 238 | sc = SparkContext(appName="BoostWords") 239 | 240 | sc.setLogLevel("WARN") 241 | 242 | ssc = StreamingContext(sc, 5) 243 | 244 | ssc.checkpoint("checkpoint") 245 | 246 | main(ssc, sys.argv) 247 | 248 | # Start the engine 249 | ssc.start() 250 | 251 | ssc.awaitTermination() 252 | -------------------------------------------------------------------------------- /ch11/properties/properties/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch11/properties/properties/__init__.py -------------------------------------------------------------------------------- /ch11/properties/properties/items.py: -------------------------------------------------------------------------------- 1 | from scrapy.item import Item, Field 2 | 3 | 4 | class PropertiesItem(Item): 5 | # Primary fields 6 | title = Field() 7 | price = Field() 8 | description = Field() 9 | address = Field() 10 | image_urls = Field() 11 | 12 | # Calculated fields 13 | images = Field() 14 | location = Field() 15 | 16 | # Housekeeping fields 17 | url = Field() 18 | project = Field() 19 | spider = Field() 20 | server = Field() 21 | date = Field() 22 | -------------------------------------------------------------------------------- /ch11/properties/properties/middlewares.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import treq 4 | 5 | 6 | from scrapy import signals 7 | from scrapy.http import Request 8 | from twisted.internet import defer 9 | from scrapy.spiders import CrawlSpider 10 | from scrapy.exceptions import NotConfigured 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class Distributed(object): 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | """Passes the crawler to the constructor""" 20 | return cls(crawler) 21 | 22 | def __init__(self, crawler): 23 | """Initializes 
this spider middleware""" 24 | 25 | settings = crawler.settings 26 | 27 | # You can also use spider's custom_settings to customize target 28 | # rule for each spider 29 | # 30 | # custom_settings = { 31 | # 'DISTRIBUTED_TARGET_RULE': 2 32 | # } 33 | # 34 | self._target = settings.getint('DISTRIBUTED_TARGET_RULE', -1) 35 | if self._target < 0: 36 | raise NotConfigured 37 | 38 | # If this is set, it's a worker instance and will start by using 39 | # those URLs instead of spider's start_requests(). 40 | self._start_urls = settings.get('DISTRIBUTED_START_URLS', None) 41 | self.is_worker = self._start_urls is not None 42 | 43 | # The URLs to be batched 44 | self._urls = [] 45 | 46 | # Indicates the target scrapyd to dispatch the next batch to 47 | self._batch = 1 48 | 49 | # The size of a batch. Defaults to 1000. 50 | self._batch_size = settings.getint('DISTRIBUTED_BATCH_SIZE', 1000) 51 | 52 | # The feed uri 53 | self._feed_uri = settings.get('DISTRIBUTED_TARGET_FEED_URL', None) 54 | 55 | # Target scrapyd hosts 56 | self._targets = settings.get("DISTRIBUTED_TARGET_HOSTS") 57 | 58 | # Can't do much as a master without these 59 | if not self.is_worker: 60 | if not self._feed_uri or not self._targets: 61 | raise NotConfigured 62 | 63 | # Connecting close signal 64 | crawler.signals.connect(self._closed, signal=signals.spider_closed) 65 | 66 | # A list to wait for before you terminate 67 | self._scrapyd_submits_to_wait = [] 68 | 69 | # A de-duplicator 70 | self._seen = set() 71 | 72 | # The project 73 | self._project = settings.get('BOT_NAME') 74 | 75 | def process_start_requests(self, start_requests, spider): 76 | """ 77 | If it's a worker instance, it uses urls from DISTRIBUTED_START_URLS 78 | setting instead of spider's start_requests. 79 | """ 80 | if (not isinstance(spider, CrawlSpider) or not self.is_worker): 81 | # Case master or inactive. Do default behaviour. 82 | for x in start_requests: 83 | yield x 84 | 85 | else: 86 | 87 | # Case worker: 88 | for url in json.loads(self._start_urls): 89 | # class scrapy.http.Request(url[, callback, method='GET', 90 | # headers, body, cookies, meta, encoding='utf-8', 91 | # priority=0, dont_filter=False, errback]) 92 | # Note: This doesn't take into account headers, cookies, 93 | # non-GET methods etc. 94 | yield Request(url, spider._response_downloaded, 95 | meta={'rule': self._target}) 96 | 97 | def process_spider_output(self, response, result, spider): 98 | """ 99 | If a request is for a target rule, it gets batched. It passes through 100 | otherwise. 101 | """ 102 | if not isinstance(spider, CrawlSpider) or self.is_worker: 103 | for x in result: 104 | yield x 105 | 106 | else: 107 | 108 | for x in result: 109 | if not isinstance(x, Request): 110 | yield x 111 | else: 112 | rule = x.meta.get('rule') 113 | 114 | if rule == self._target: 115 | self._add_to_batch(spider, x) 116 | else: 117 | yield x 118 | 119 | @defer.inlineCallbacks 120 | def _closed(self, spider, reason, signal, sender): 121 | """ 122 | On close, we flush all remaining URLs and if it's a worker instance, 123 | it posts all the results to the streaming engine. 124 | """ 125 | 126 | # Submit any remaining URLs 127 | self._flush_urls(spider) 128 | 129 | r = yield defer.DeferredList(self._scrapyd_submits_to_wait) 130 | 131 | for (success, (debug_data, resp)) in r: 132 | if not success: 133 | logger.error("%s: treq request not sent" % debug_data) 134 | continue 135 | if resp.code != 200: 136 | body = yield resp.body() 137 | logger.error("%s: scrapyd request failed: %d.
Body: %s" % 138 | (debug_data, resp.code, body)) 139 | continue 140 | ob = yield resp.json() 141 | if ob["status"] != "ok": 142 | logger.error("%s: scrapyd operation %s: %s" % 143 | (debug_data, ob["status"], ob)) 144 | 145 | def _add_to_batch(self, spider, request): 146 | """ 147 | Adds a Request (URL) to the batch. If we reach DISTRIBUTED_BATCH_SIZE 148 | we flush the batch. 149 | """ 150 | url = request.url 151 | if not url in self._seen: 152 | self._seen.add(url) 153 | self._urls.append(url) 154 | if len(self._urls) >= self._batch_size: 155 | self._flush_urls(spider) 156 | 157 | def _flush_urls(self, spider): 158 | """ 159 | Flushes the URLs. 160 | """ 161 | if not self._urls: 162 | return 163 | 164 | target = self._targets[(self._batch-1) % len(self._targets)] 165 | 166 | logger.info("Posting batch %d with %d URLs to %s", 167 | self._batch, len(self._urls), target) 168 | 169 | data = [ 170 | ("project", self._project), 171 | ("spider", spider.name), 172 | ("setting", "FEED_URI=%s" % self._feed_uri), 173 | ("batch", str(self._batch)), 174 | ] 175 | 176 | debug_data = "target (%d): %s" % (len(self._urls), data) 177 | 178 | json_urls = json.dumps(self._urls) 179 | data.append(("setting", "DISTRIBUTED_START_URLS=%s" % json_urls)) 180 | 181 | d = treq.post("http://%s/schedule.json" % target, 182 | data=data, timeout=5, persistent=False) 183 | 184 | d.addBoth(lambda resp: (debug_data, resp)) 185 | 186 | self._scrapyd_submits_to_wait.append(d) 187 | 188 | self._urls = [] 189 | self._batch += 1 190 | -------------------------------------------------------------------------------- /ch11/properties/properties/monitor.py: -------------------------------------------------------------------------------- 1 | import treq 2 | 3 | from twisted.internet import reactor, task, defer 4 | from twisted.python.failure import Failure 5 | 6 | from scrapy.commands import ScrapyCommand 7 | from scrapy.utils.conf import get_config 8 | from scrapy.exceptions import UsageError 9 | 10 | 11 | class Command(ScrapyCommand): 12 | requires_project = True 13 | 14 | def run(self, args, opts): 15 | self._to_monitor = {} 16 | for name, target in self._get_targets().iteritems(): 17 | if name in args: 18 | project = self.settings.get('BOT_NAME') 19 | url = target['url'] + "listjobs.json?project=" + project 20 | self._to_monitor[name] = url 21 | 22 | if not self._to_monitor: 23 | raise UsageError("Nothing to monitor") 24 | 25 | l = task.LoopingCall(self._monitor) 26 | l.start(5) # call every 5 seconds 27 | 28 | reactor.run() 29 | 30 | @defer.inlineCallbacks 31 | def _monitor(self): 32 | all_deferreds = [] 33 | for name, url in self._to_monitor.iteritems(): 34 | d = treq.get(url, timeout=5, persistent=False) 35 | d.addBoth(lambda resp, name: (name, resp), name) 36 | all_deferreds.append(d) 37 | 38 | all_resp = yield defer.DeferredList(all_deferreds) 39 | 40 | status = {} 41 | for (success, (name, resp)) in all_resp: 42 | if not success: 43 | print "deferred error" 44 | elif isinstance(resp, Failure): 45 | print "got failure: %r" % resp 46 | elif resp.code == 200: 47 | json_resp = yield resp.json() 48 | status[name] = ( 49 | len(json_resp.get('running', [])), 50 | len(json_resp.get('finished', [])), 51 | len(json_resp.get('pending', [])), 52 | ) 53 | 54 | to_print = [] 55 | for name in sorted(status.keys()): 56 | to_print.append("%-20s running: %d, finished: %d, pending: %d" % 57 | ((name,) + status[name])) 58 | print "\033c" + "\n".join(to_print) 59 | 60 | def _get_targets(self): 61 | cfg = get_config() 62 | baset = 
dict(cfg.items('deploy')) if cfg.has_section('deploy') else {} 63 | targets = {} 64 | if 'url' in baset: 65 | targets['default'] = baset 66 | for x in cfg.sections(): 67 | if x.startswith('deploy:'): 68 | t = baset.copy() 69 | t.update(cfg.items(x)) 70 | targets[x[7:]] = t 71 | 72 | return targets 73 | -------------------------------------------------------------------------------- /ch11/properties/properties/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | class PropertiesPipeline(object): 8 | def process_item(self, item, spider): 9 | return item 10 | -------------------------------------------------------------------------------- /ch11/properties/properties/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for properties project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'properties' 10 | 11 | SPIDER_MODULES = ['properties.spiders'] 12 | NEWSPIDER_MODULE = 'properties.spiders' 13 | 14 | # Crawl responsibly by identifying yourself (and your website) on 15 | # the user-agent 16 | #USER_AGENT = 'properties (+http://www.yourdomain.com)' 17 | 18 | SPIDER_MIDDLEWARES = { 19 | 'properties.middlewares.Distributed': 100, 20 | } 21 | 22 | # Disable S3 23 | AWS_ACCESS_KEY_ID = "" 24 | AWS_SECRET_ACCESS_KEY = "" 25 | 26 | COMMANDS_MODULE = 'properties.monitor' 27 | 28 | LOG_LEVEL = "INFO" 29 | 30 | CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | CONCURRENT_REQUESTS = 16 32 | 33 | DISTRIBUTED_TARGET_RULE = 1 34 | DISTRIBUTED_BATCH_SIZE = 2000 35 | DISTRIBUTED_TARGET_FEED_URL = ("ftp://anonymous@spark/" 36 | "%(batch)s_%(name)s_%(time)s.jl") 37 | DISTRIBUTED_TARGET_HOSTS = [ 38 | "scrapyd1:6800", 39 | "scrapyd2:6800", 40 | "scrapyd3:6800", 41 | ] 42 | -------------------------------------------------------------------------------- /ch11/properties/properties/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
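To make the distributed settings above concrete: the master instance of the `Distributed` middleware collects URLs for the target rule into batches of up to `DISTRIBUTED_BATCH_SIZE`, picks a scrapyd host round-robin (`self._targets[(self._batch-1) % len(self._targets)]` in `_flush_urls()`), and POSTs a `schedule.json` request whose `DISTRIBUTED_START_URLS` setting carries the batch as JSON. The standalone sketch below replays just that dispatch arithmetic; the batch contents are made-up example URLs, not output from the project.

```
# Standalone sketch of the master-side dispatch in middlewares.py.
import json

targets = ['scrapyd1:6800', 'scrapyd2:6800', 'scrapyd3:6800']

# Four hypothetical batches of detail-page URLs
batches = [['http://web:9312/properties/property_%06d.html' % (b * 10 + i)
            for i in range(3)] for b in range(4)]

for batch_no, urls in enumerate(batches, start=1):
    host = targets[(batch_no - 1) % len(targets)]
    print('batch %d -> http://%s/schedule.json' % (batch_no, host))
    print('  DISTRIBUTED_START_URLS=%s' % json.dumps(urls))
```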
5 | -------------------------------------------------------------------------------- /ch11/properties/properties/spiders/distr.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | 5 | from scrapy.loader.processors import MapCompose, Join 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from scrapy.loader import ItemLoader 9 | 10 | from properties.items import PropertiesItem 11 | 12 | 13 | class EasySpider(CrawlSpider): 14 | name = 'distr' 15 | allowed_domains = ["web"] 16 | 17 | # Start on the first index page 18 | start_urls = ['http://web:9312/properties/index_%05d.html' % id 19 | for id in map(lambda x: 1667 * x / 20, range(20))] 20 | 21 | # Rules for horizontal and vertical crawling 22 | rules = ( 23 | Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')), 24 | Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'), 25 | callback='parse_item') 26 | ) 27 | 28 | def parse_item(self, response): 29 | """ This function parses a property page. 30 | 31 | @url http://web:9312/properties/property_000000.html 32 | @returns items 1 33 | @scrapes title price description address image_urls 34 | @scrapes url project spider server date 35 | """ 36 | 37 | # Create the loader using the response 38 | l = ItemLoader(item=PropertiesItem(), response=response) 39 | 40 | # Load fields using XPath expressions 41 | l.add_xpath('title', '//*[@itemprop="name"][1]/text()', 42 | MapCompose(unicode.strip, unicode.title)) 43 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 44 | MapCompose(lambda i: i.replace(',', ''), float), 45 | re='[,.0-9]+') 46 | l.add_xpath('description', '//*[@itemprop="description"][1]/text()', 47 | MapCompose(unicode.strip), Join()) 48 | l.add_xpath('address', 49 | '//*[@itemtype="http://schema.org/Place"][1]/text()', 50 | MapCompose(unicode.strip)) 51 | l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 52 | MapCompose(lambda i: urlparse.urljoin(response.url, i))) 53 | 54 | # Housekeeping fields 55 | l.add_value('url', response.url) 56 | l.add_value('project', self.settings.get('BOT_NAME')) 57 | l.add_value('spider', self.name) 58 | l.add_value('server', socket.gethostname()) 59 | l.add_value('date', datetime.datetime.now()) 60 | 61 | return l.load_item() 62 | -------------------------------------------------------------------------------- /ch11/properties/properties/spiders/easy.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urlparse 3 | import socket 4 | 5 | from scrapy.loader.processors import MapCompose, Join 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from scrapy.loader import ItemLoader 9 | 10 | from properties.items import PropertiesItem 11 | 12 | 13 | class EasySpider(CrawlSpider): 14 | name = 'easy' 15 | allowed_domains = ["web"] 16 | 17 | # Start on the first index page 18 | start_urls = ['http://web:9312/properties/index_%05d.html' % id 19 | for id in map(lambda x: 1667 * x / 20, range(20))] 20 | 21 | # Rules for horizontal and vertical crawling 22 | rules = ( 23 | Rule(LinkExtractor(restrict_xpaths='//*[contains(@class,"next")]')), 24 | #Rule(LinkExtractor(restrict_xpaths='//*[@itemprop="url"]'), 25 | #callback='parse_item') 26 | ) 27 | 28 | def parse_item(self, response): 29 | """ This function parses a property page. 
30 | 31 | @url http://web:9312/properties/property_000000.html 32 | @returns items 1 33 | @scrapes title price description address image_urls 34 | @scrapes url project spider server date 35 | """ 36 | 37 | # Create the loader using the response 38 | l = ItemLoader(item=PropertiesItem(), response=response) 39 | 40 | # Load fields using XPath expressions 41 | l.add_xpath('title', '//*[@itemprop="name"][1]/text()', 42 | MapCompose(unicode.strip, unicode.title)) 43 | l.add_xpath('price', './/*[@itemprop="price"][1]/text()', 44 | MapCompose(lambda i: i.replace(',', ''), float), 45 | re='[,.0-9]+') 46 | l.add_xpath('description', '//*[@itemprop="description"][1]/text()', 47 | MapCompose(unicode.strip), Join()) 48 | l.add_xpath('address', 49 | '//*[@itemtype="http://schema.org/Place"][1]/text()', 50 | MapCompose(unicode.strip)) 51 | l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', 52 | MapCompose(lambda i: urlparse.urljoin(response.url, i))) 53 | 54 | # Housekeeping fields 55 | l.add_value('url', response.url) 56 | l.add_value('project', self.settings.get('BOT_NAME')) 57 | l.add_value('spider', self.name) 58 | l.add_value('server', socket.gethostname()) 59 | l.add_value('date', datetime.datetime.now()) 60 | 61 | return l.load_item() 62 | -------------------------------------------------------------------------------- /ch11/properties/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = properties.settings 8 | 9 | [deploy] 10 | url = http://localhost:6800/ 11 | project = properties 12 | 13 | [deploy:scrapyd1] 14 | url = http://scrapyd1:6800/ 15 | 16 | [deploy:scrapyd2] 17 | url = http://scrapyd2:6800/ 18 | 19 | [deploy:scrapyd3] 20 | url = http://scrapyd3:6800/ 21 | -------------------------------------------------------------------------------- /ch11/properties/scrapyd1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch11/properties/scrapyd1 -------------------------------------------------------------------------------- /ch11/properties/scrapyd2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch11/properties/scrapyd2 -------------------------------------------------------------------------------- /ch11/properties/scrapyd3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scalingexcellence/scrapybook/4a051e8ca25326084699900979b6a705e38a1235/ch11/properties/scrapyd3 -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | web: 4 | image: scrapybook/web 5 | ports: 6 | - "9312:9312" 7 | 8 | spark: 9 | image: scrapybook/spark 10 | ports: 11 | - "21:21" 12 | - "30000:30000" 13 | - "30001:30001" 14 | - "30002:30002" 15 | - "30003:30003" 16 | - "30004:30004" 17 | - "30005:30005" 18 | - "30006:30006" 19 | - "30007:30007" 20 | - "30008:30008" 21 | - "30009:30009" 22 | volumes: 23 | - .:/root/book 24 | 25 | es: 26 | image: scrapybook/es 27 | ports: 28 | - 
"9200:9200" 29 | 30 | redis: 31 | image: scrapybook/redis 32 | ports: 33 | - "6379:6379" 34 | 35 | mysql: 36 | image: scrapybook/mysql 37 | ports: 38 | - "3306:3306" 39 | 40 | scrapyd1: 41 | image: scrapybook/dev 42 | ports: 43 | - "6801:6800" 44 | 45 | scrapyd2: 46 | image: scrapybook/dev 47 | ports: 48 | - "6802:6800" 49 | 50 | scrapyd3: 51 | image: scrapybook/dev 52 | ports: 53 | - "6803:6800" 54 | 55 | dev: 56 | image: scrapybook/dev 57 | ports: 58 | - "6800:6800" 59 | volumes: 60 | - .:/root/book 61 | -------------------------------------------------------------------------------- /insecure_key: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEpQIBAAKCAQEA1ZswRub+3DvSEnBiyM5YRpRzRYV88vO1X2j867u6pyCHUNXv 3 | RRCr7ahMLPIVYsZwlHb4sF+Zb3DJOBH+E265o93chdMxbWG44k0spf10JRevA0JX 4 | NrEwHR8vesCR74e5MuddbSic88lsEqnnn+Fo3lStvE6nBp6tbqdEu7GhTtHSYejn 5 | wwINnA5ocsHkd1YE9L2Scqw1e4bXveTAQnSvhqe33QshGXFpt0tQwRWngah887f2 6 | P54wFSm2C/UyFT7pvIjINKzIi4vUoXz/nU+V7neTmt3XDdjloYg3ycOaX4RSVneO 7 | HCf7hkcEKbzbPzzSrGAAYYC5UzFB+ImsIbtV2wIDAQABAoIBAQCjROxgtX2Gft7y 8 | Ix8Ol9IXmK6HLCI2XZt7ovb3hFWGGzHy0qMBql2P2Tzoed1o038Hq+woe9n+uTnE 9 | dtQ6rD6PByzgyW2VSsWTjCOdeJ5HH9Qw7ItXDZZWHBkhfYHOkXI4e2oI3qshGAtY 10 | NLALn7KVhioJriCyyaSM2KOLx5khcY+EJ1inQfwQJKqPGsdKc72liz07T8ifRj+m 11 | NLKtwrxlK3IXYfIdgLp/1pCKdrC80DhprMsD4xvNgq4pCR9jd4FoqM9t/Up5ppTm 12 | +p6A/bDwdIPh6cFFeyMP+G3+bTlW1Gg7RLoNCc6qh53WWVgEOQqdLHcQ8Ge4RLmb 13 | wLUmnRuRAoGBAPfXYfjpPZi8rPIQpux13Bs7xaS1/Fa9WqrEfrPptFdUVHeFCGY8 14 | qOUVewPviHdbs0nB71Ynk9/e96agFYijQdqTQzVnpYI4i8GiGk5gPMiB2UYeJ/HZ 15 | mIB3jtWyf6Z/GO0hJ1a6mX0XD3zJGNqFaiwqaYgdO1Fwh9gcH3O2lHyjAoGBANyj 16 | TGDBYHpxPu6uKcGreLd0SgO61PEj7aOSNfrBB2PK83A+zjZCFZRIWqjfrkxGG6+a 17 | 2WuHbEHuCGvu2V5juHYxbAD/38iV/lQl/2xyvN1eR/baE3US06qn6idxjnmeNZDy 18 | DelAx1RGuEvLX1TNAzDTxBwYyzH3W2RpKAUAD11pAoGAN38YJhd8Pn5JL68A4cQG 19 | dGau/BHwHjAqZEC5qmmzgzaT72tvlQ0SOLHVqOzzHt7+x45QnHciSqfvxnTkPYNp 20 | FJuTGhtKWV12FfbJczFjivZgg63u/d3eoy2iY0GkCdE98KNS3r3L7tHCGwwgr5Xe 21 | T2Nz3BHHnZXYJVEuzcddeocCgYEAnhDjPAHtw2p0Inxlb9kPb6aBC/ECcwtBSUkL 22 | IOy/BZA1HPnxs89eNFAtmwQ8k2o6lXDDSJTJSuZj5CdGVKfuU8aOUJz/Tm2eudxL 23 | A/+jLJhJyCBthhcJyx3m04E4CAr+5ytyKeP9qXPMvoghcNg66/UabuKYV+CU+feX 24 | 8xUa7NkCgYEAlX8HGvWMmiG+ZRFB//3Loy87bBxGlN0pUtCEScabZxdB2HkI9Vp7 25 | Yr67QIZ3y7T88Mhkwam54JCjiV+3TZbSyRMOjkqf7UhTCZC6hHNqdUnlpv4bJWeW 26 | i5Eun8ltYxBnemNc2QGxA4r+KCspi+pRvWNGzL3PFVBGXiLsmOMul78= 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /lint: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in `find . -type f -name "*.py"` 4 | do 5 | pep8 $i 6 | pyflakes $i 7 | #pylint $i 8 | done 9 | 10 | --------------------------------------------------------------------------------