├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── benchmark ├── __init__.py └── test_octopus.py ├── octopus ├── __init__.py ├── cache.py ├── core.py ├── limiter │ ├── __init__.py │ ├── in_memory │ │ ├── __init__.py │ │ └── per_domain.py │ └── redis │ │ ├── __init__.py │ │ └── per_domain.py ├── model.py └── tornado_core.py ├── redis.conf ├── setup.py └── tests ├── __init__.py ├── limiter ├── __init__.py ├── in_memory │ ├── __init__.py │ └── test_per_domain.py ├── redis │ ├── __init__.py │ └── test_per_domain.py └── test_base_limiter.py ├── test_cache.py ├── test_model.py ├── test_octopus.py ├── test_octopus_limited.py ├── test_tornado_octopus.py └── test_tornado_octopus_limited.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | *tests.py 4 | branch = True 5 | source = 6 | octopus 7 | 8 | [report] 9 | exclude_lines = 10 | pragma: no cover 11 | def __repr__ 12 | raise NotImplementedError 13 | if __name__ == .__main__.: 14 | from urllib.parse import parse_qs 15 | except ImportError: 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | cover 38 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 
"2.7" 4 | - "3.3" 5 | - "pypy" 6 | 7 | matrix: 8 | allow_failures: 9 | - python: "3.3" 10 | - python: "pypy" 11 | 12 | install: 13 | # install python requirements 14 | - pip install coveralls 15 | - pip install -e .[tests] 16 | 17 | script: 18 | - make test 19 | 20 | after_success: 21 | - coveralls 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Bernardo Heynemann 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: redis 2 | @coverage run --branch `which nosetests` -vv --with-yanc -s tests/ 3 | @coverage report -m --fail-under=90 4 | 5 | coverage-html: test 6 | @coverage html -d cover 7 | @open cover/index.html 8 | 9 | setup: 10 | @pip install -U -e .\[tests\] 11 | 12 | kill_redis: 13 | -redis-cli -p 7575 shutdown 14 | 15 | redis: kill_redis 16 | redis-server ./redis.conf; sleep 1 17 | redis-cli -p 7575 info > /dev/null 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | octopus 2 | ======= 3 | 4 | [![Build Status](https://travis-ci.org/heynemann/octopus.png?branch=master)](https://travis-ci.org/heynemann/octopus) 5 | [![PyPi version](https://pypip.in/v/octopus-http/badge.png)](https://crate.io/packages/octopus-http/) 6 | [![PyPi downloads](https://pypip.in/d/octopus-http/badge.png)](https://crate.io/packages/octopus-http/) 7 | [![Coverage Status](https://coveralls.io/repos/heynemann/octopus/badge.png?branch=master)](https://coveralls.io/r/heynemann/octopus?branch=master) 8 | 9 | `octopus` is a library to concurrently retrieve and report on the completion of http requests. 10 | 11 | You can either use threads or the tornado IOLoop to asynchronously get them. 12 | 13 | Installing 14 | ========== 15 | 16 | Installing `octopus` is really easy: 17 | 18 | $ pip install octopus-http 19 | 20 | The reason for the name of the package is that a package called `octopus` was already registered at the Python Package Index. 
21 | 22 | Using 23 | ===== 24 | 25 | Using `octopus` with threads: 26 | 27 | from octopus import Octopus 28 | 29 | # this Octopus instance we'll run 4 threads, 30 | # automatically start listening to the queue and 31 | # we'll in-memory cache responses for 10 seconds. 32 | otto = Octopus( 33 | concurrency=4, auto_start=True, cache=True, 34 | expiration_in_seconds=10 35 | ) 36 | 37 | def handle_url_response(url, response): 38 | # do something with response 39 | 40 | otto.enqueue('http://www.google.com', handle_url_response) 41 | otto.enqueue('http://www.facebook.com', handle_url_response) 42 | otto.enqueue('http://www.yahoo.com', handle_url_response) 43 | 44 | # this request will come from the cache 45 | otto.enqueue('http://www.google.com', handle_url_response) 46 | 47 | otto.wait() # waits until queue is empty or timeout is ellapsed 48 | 49 | The analogous version with Tornado's IOLoop: 50 | 51 | from octopus import TornadoOctopus 52 | 53 | # this Octopus instance we'll run 4 concurrent requests max, 54 | # automatically start listening to the queue and 55 | # we'll in-memory cache responses for 10 seconds. 56 | otto = TornadoOctopus( 57 | concurrency=4, auto_start=True, cache=True, 58 | expiration_in_seconds=10 59 | ) 60 | 61 | def handle_url_response(url, response): 62 | # do something with response 63 | 64 | otto.enqueue('http://www.google.com', handle_url_response) 65 | otto.enqueue('http://www.facebook.com', handle_url_response) 66 | otto.enqueue('http://www.yahoo.com', handle_url_response) 67 | 68 | # this request will come from the cache 69 | otto.enqueue('http://www.google.com', handle_url_response) 70 | 71 | otto.wait() # waits until queue is empty or timeout is ellapsed 72 | 73 | API Reference 74 | ============= 75 | 76 | Response Class 77 | -------------- 78 | 79 | The `Response` class is the result of all requests made with `Octopus` or `TornadoOctopus`. 
80 | 81 | It has the following information: 82 | 83 | * `url` - the url that started the request; 84 | * `status_code` - the status code for the request; 85 | * `cookies` - dictionary with request cookie values; 86 | * `headers` - dictionary with response headers; 87 | * `text` - the body of the response; 88 | * `effective_url` - in the case of redirects, this url might be different than url; 89 | * `error` - if an error has occurred this is where the error message will be; 90 | * `request_time` - the time ellapsed between the start and the end of the request in seconds. 91 | 92 | Octopus Class 93 | ------------- 94 | 95 | This is the main unit of work in `octopus` if you want to use threads. To enqueue new urls you need to have an `Octopus` instance: 96 | 97 | from octopus import Octopus 98 | 99 | otto = Octopus() 100 | 101 | The constructor for `Octopus` takes several configuration options: 102 | 103 | * `concurrency`: number of threads to use to retrieve URLs (defaults to 10 threads); 104 | * `auto_start`: Indicates whether threads should be started automatically (defaults to False); 105 | * `cache`: If set to `True`, responses will be cached for the number of seconds specified in `expiration_in_seconds` (defaults to False); 106 | * `expiration_in_seconds`: The number of seconds to keep url responses in the local cache (defaults to 30 seconds); 107 | * `request_timeout_in_seconds`: The number of seconds that each request can take (defaults to 5 seconds). 108 | * `limiter`: The instance of a limiter class to use to acquire limits (more on limits below). 109 | 110 | Octopus.start() 111 | --------------- 112 | 113 | If `auto_start` is set to `False`, this method must be called to start retrieving URLs. This is a **non-blocking** method. 114 | 115 | Octopus.enqueue 116 | --------------- 117 | 118 | Takes as arguments (url, handler, method="GET", **kwargs). 119 | 120 | This is the main method in the `Octopus` class. This method is used to enqueue new URLs. 
The handler argument specifies the method to be called when the response is available. 121 | 122 | The handler takes the form `handler(url, response)`. The response argument is a Octopus.Response instance. 123 | 124 | You can specify a different method using the `method` argument (`POST`, `HEAD`, etc) and you can pass extra keyword arguments to the `requests.request` method using the keyword arguments for this method. 125 | 126 | This is a **non-blocking** method. 127 | 128 | Octopus.queue_size 129 | ------------------ 130 | 131 | This property returns the approximate number of URLs still in the queue (not retrieved yet). 132 | 133 | Octopus.is_empty 134 | ---------------- 135 | 136 | This property returns if the URL queue is empty. 137 | 138 | Octopus.wait(timeout=10) 139 | ------------------------ 140 | 141 | If you want to wait for all the URLs in the queue to finish loading, just call this method. 142 | 143 | If you specify a `timeout` of `0`, `octopus` will wait until the queue is empty, no matter how long it takes. 144 | 145 | This is a **blocking** method. 146 | 147 | TornadoOctopus Class 148 | -------------------- 149 | 150 | This is the main unit of work in `octopus` if you want to use Tornado's IOLoop. To enqueue new urls you need to have an `TornadoOctopus` instance: 151 | 152 | from octopus import TornadoOctopus 153 | 154 | otto = TornadoOctopus() 155 | 156 | A **very important** thing that differs from the threaded version of Octopus is that you **MUST** call wait to get the responses, since Tornado IOLoop needs to be run in order to get the requests. 
157 | 158 | The constructor for `TornadoOctopus` takes several configuration options: 159 | 160 | * `concurrency`: number of maximum async http requests to use to retrieve URLs (defaults to 10 requests); 161 | * `auto_start`: Indicates whether the ioloop should be created automatically (defaults to False); 162 | * `cache`: If set to `True`, responses will be cached for the number of seconds specified in `expiration_in_seconds` (defaults to False); 163 | * `expiration_in_seconds`: The number of seconds to keep url responses in the local cache (defaults to 30 seconds); 164 | * `request_timeout_in_seconds`: The number of seconds that each request can take (defaults to 10 seconds). 165 | * `connect_timeout_in_seconds`: The number of seconds that each connection can take (defaults to 5 seconds). 166 | * `limiter`: The instance of a limiter class to use to acquire limits (more on limits below). 167 | 168 | TornadoOctopus.start() 169 | --------------- 170 | 171 | If `auto_start` is set to `False`, this method must be called to create the IOLoop instance. This is a **non-blocking** method. 172 | 173 | TornadoOctopus.enqueue 174 | ---------------------- 175 | 176 | Takes as arguments (url, handler, method="GET", **kwargs). 177 | 178 | This is the main method in the `TornadoOctopus` class. This method is used to enqueue new URLs. The handler argument specifies the method to be called when the response is available. 179 | 180 | The handler takes the form `handler(url, response)`. The response argument is a Octopus.Response instance. 181 | 182 | You can specify a different method using the `method` argument (`POST`, `HEAD`, etc) and you can pass extra keyword arguments to the `AsyncHTTPClient.fetch` method using the keyword arguments for this method. 183 | 184 | This is a **non-blocking** method. 185 | 186 | TornadoOctopus.queue_size 187 | ------------------------- 188 | 189 | This property returns the number of URLs still in the queue (not retrieved yet). 
190 | 191 | TornadoOctopus.is_empty 192 | ----------------------- 193 | 194 | This property returns if the URL queue is empty. 195 | 196 | TornadoOctopus.wait(timeout=10) 197 | ------------------------------- 198 | 199 | In order for the IOLoop to handle callbacks, you **MUST** call wait. This is the method that gets the IOLoop to run. 200 | 201 | If you specify a `timeout` of `0`, `octopus` will wait until the queue is empty, no matter how long it takes. 202 | 203 | This is a **blocking** method. 204 | 205 | Limiting Simultaneous Connections 206 | ================================= 207 | 208 | A very common problem that can happen when using octopus is overwhelming the server you are going to. In order to make sure this 209 | does not happen, Octopus allows users to specify a limiter class. 210 | 211 | Each limiter class has to provide two methods `acquire` and `release`, both taking an URL as argument. 212 | 213 | Octopus comes bundled with an in-memory limiter and a redis limiter (courtesy of the [retools project](https://github.com/bbangert/retools)). Using limiters is as simple as passing it to octopus constructor: 214 | 215 | from octopus import TornadoOctopus 216 | from octopus.limiter.in_memory.per_domain import Limiter 217 | 218 | # using in-memory limiter. Domains not specified here have no limit. 219 | limiter = Limiter( 220 | {'http://globo.com': 10}, # only 10 concurrent requests to this domain 221 | {'http://g1.globo.com': 20}, # only 20 concurrent requests to this domain 222 | ) 223 | 224 | otto = TornadoOctopus( 225 | concurrency=4, auto_start=True, cache=True, 226 | expiration_in_seconds=10, 227 | limiter=limiter 228 | ) 229 | 230 | The available built-in limiters are: 231 | 232 | * `octopus.limiter.in_memory.per_domain.Limiter` 233 | * `octopus.limiter.redis.per_domain.Limiter` 234 | 235 | Both take a list of dictionaries with the key being the beginning of the URL and value being the allowed concurrent connections. 
236 | 237 | The reason this is a list is that urls defined first take precedence. This allows users to single out a path in a domain that needs less connections than the rest of the domain, like this: 238 | 239 | # using in-memory limiter. Domains not specified here have no limit. 240 | limiter = Limiter( 241 | {'http://g1.globo.com/economia': 5}, # only 5 concurrent requests to urls that begin with this key 242 | {'http://g1.globo.com': 20}, # only 20 concurrent requests to the rest of the domain 243 | ) 244 | 245 | The redis limiter takes two additional keyword arguments: 246 | `redis` (a [redis.py](https://github.com/andymccurdy/redis-py) connection to redis) 247 | and `expiration_in_seconds` (the expiration for locks in the limiter). 248 | 249 | **WARNING**: The in-memory limiter **IS NOT** thread-safe, so if you are using Threaded Octopus, do not use this limiter. 250 | 251 | If you'd like to do something when the limiter misses a lock (i.e.: no more connections allowed), just subscribe to it in the limiter using: 252 | 253 | # using in-memory limiter. Domains not specified here have no limit. 254 | limiter = Limiter( 255 | {'http://g1.globo.com/economia': 5}, # only 5 concurrent requests to urls that begin with this key 256 | {'http://g1.globo.com': 20}, # only 20 concurrent requests to the rest of the domain 257 | ) 258 | 259 | def handle_lock_miss(url): 260 | # do something with the miss 261 | pass 262 | 263 | limiter.subscribe_to_lock_miss(handle_lock_miss) 264 | 265 | 266 | Benchmark 267 | ========= 268 | 269 | In order to decide whether `octopus` really was worth using, it features a benchmark test in it's codebase. 270 | 271 | If you want to run it yourself (which is highly encouraged), just clone `octopus` repository and run this command: 272 | 273 | $ python benchmark/test_octopus.py 200 100 274 | 275 | The first argument is the number of URLs to retrieve. The seconds argument means how many threads will be used by `octopus` to get the urls. 
276 | 277 | The test is pretty simple. Time how long it takes for requests to get the URLs sequentially and for `octopus` to get them concurrently. 278 | 279 | The results for retrieving `2000` urls with `200` threads is as follows: 280 | 281 | ======= 282 | RESULTS 283 | ======= 284 | 285 | [requests] Retrieving 2000 urls took 2692.66 seconds meaning 0.74 urls/second. 286 | 287 | [octopus] Retrieving 2000 urls took 31.14 seconds meaning 64.22 urls/second. 288 | 289 | [octopus] Retrieving 2000 urls with local in-memory caching took 6.61 seconds 290 | meaning 302.50 urls/second. 291 | 292 | [octopus-tornado] Retrieving 2000 urls took 167.99 seconds 293 | meaning 11.91 urls/second. 294 | 295 | [octopus-tornado-pycurl] Retrieving 2000 urls took 171.40 seconds 296 | meaning 11.67 urls/second. 297 | 298 | Overall, threaded octopus was more than 86 times faster than sequential requests 299 | and tornado octopus was more than 15 times faster than sequential requests. 300 | -------------------------------------------------------------------------------- /benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/heynemann/octopus/c7bc93e60530368b137a4e9df26b5b34dacecedf/benchmark/__init__.py -------------------------------------------------------------------------------- /benchmark/test_octopus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | from random import choice 6 | from time import time 7 | 8 | import requests 9 | 10 | from octopus import Octopus, TornadoOctopus 11 | 12 | 13 | def main(repetitions, concurrency): 14 | 15 | # alexa top sites 16 | urls = [ 17 | 'http://facebook.com', 18 | 'http://youtube.com', 19 | 'http://yahoo.com', 20 | 'http://wikipedia.org', 21 | 'http://linkedin.com', 22 | 'http://live.com', 23 | 'http://twitter.com', 24 | 'http://amazon.com', 25 | 
'http://blogspot.com', 26 | 'http://wordpress.com', 27 | 'http://bing.com', 28 | 'http://ebay.com', 29 | 'http://tumblr.com', 30 | ] 31 | 32 | urls_to_retrieve = [choice(urls) for i in range(repetitions)] 33 | 34 | #requests_total_time = sequential_requests(repetitions, urls_to_retrieve) 35 | requests_total_time = 2692.66 # did it once... takes too long to get 2000 urls sequentially. 36 | otto_total_time = otto_requests(repetitions, concurrency, urls_to_retrieve) 37 | otto_cached_total_time = otto_cached_requests(repetitions, concurrency, urls_to_retrieve) 38 | tornado_pycurl_total_time = tornado_requests(repetitions, concurrency, urls_to_retrieve) 39 | tornado_total_time = tornado_requests(repetitions, concurrency, urls_to_retrieve, ignore_pycurl=True) 40 | 41 | message = "RESULTS" 42 | print 43 | print("=" * len(message)) 44 | print(message) 45 | print("=" * len(message)) 46 | print 47 | 48 | print "[requests] Retrieving %d urls took %.2f seconds meaning %.2f urls/second." % ( 49 | repetitions, 50 | requests_total_time, 51 | repetitions / requests_total_time 52 | ) 53 | print 54 | 55 | print "[octopus] Retrieving %d urls took %.2f seconds meaning %.2f urls/second." % ( 56 | repetitions, 57 | otto_total_time, 58 | repetitions / otto_total_time 59 | ) 60 | print 61 | 62 | print "[octopus] Retrieving %d urls with local in-memory caching took %.2f seconds meaning %.2f urls/second." % ( 63 | repetitions, 64 | otto_cached_total_time, 65 | repetitions / otto_cached_total_time 66 | ) 67 | print 68 | 69 | print "[octopus-tornado] Retrieving %d urls took %.2f seconds meaning %.2f urls/second." % ( 70 | repetitions, 71 | tornado_total_time, 72 | repetitions / tornado_total_time 73 | ) 74 | print 75 | 76 | print "[octopus-tornado-pycurl] Retrieving %d urls took %.2f seconds meaning %.2f urls/second." 
% ( 77 | repetitions, 78 | tornado_pycurl_total_time, 79 | repetitions / tornado_pycurl_total_time 80 | ) 81 | print 82 | 83 | print "Overall, threaded octopus was more than %d times faster than sequential requests and tornado octopus was more than %d times faster than sequential requests." % ( 84 | int(requests_total_time / otto_total_time), 85 | int(requests_total_time / tornado_pycurl_total_time) 86 | ) 87 | 88 | print 89 | 90 | 91 | def sequential_requests(repetitions, urls_to_retrieve): 92 | message = "Retrieving URLs sequentially with Requests..." 93 | print 94 | print("=" * len(message)) 95 | print(message) 96 | print("=" * len(message)) 97 | print 98 | 99 | start_time = time() 100 | 101 | for url_index, url in enumerate(urls_to_retrieve): 102 | print "%.2f%% - getting %s..." % ( 103 | float(url_index) / float(repetitions) * 100, 104 | url 105 | ) 106 | assert requests.get(url).status_code == 200 107 | 108 | return time() - start_time 109 | 110 | 111 | def otto_requests(repetitions, concurrency, urls_to_retrieve): 112 | message = "Retrieving URLs concurrently with Octopus..." 113 | print 114 | print("=" * len(message)) 115 | print(message) 116 | print("=" * len(message)) 117 | print 118 | 119 | otto = Octopus(concurrency=concurrency) 120 | 121 | for url in urls_to_retrieve: 122 | otto.enqueue(url, handle_url_response) 123 | 124 | start_time = time() 125 | otto.start() 126 | otto.wait(0) 127 | 128 | return time() - start_time 129 | 130 | 131 | def otto_cached_requests(repetitions, concurrency, urls_to_retrieve): 132 | message = "Retrieving URLs concurrently with Octopus with caching enabled..." 
133 | print 134 | print("=" * len(message)) 135 | print(message) 136 | print("=" * len(message)) 137 | print 138 | 139 | otto = Octopus(concurrency=concurrency, cache=True, auto_start=True) 140 | 141 | for url in urls_to_retrieve: 142 | otto.enqueue(url, handle_url_response) 143 | 144 | start_time = time() 145 | otto.wait(0) 146 | 147 | return time() - start_time 148 | 149 | 150 | def tornado_requests(repetitions, concurrency, urls_to_retrieve, ignore_pycurl=False): 151 | message = "Retrieving URLs concurrently with TornadoOctopus (%s)..." % ( 152 | ignore_pycurl and "using SimpleHTTPClient" or "using pycurl" 153 | ) 154 | print 155 | print("=" * len(message)) 156 | print(message) 157 | print("=" * len(message)) 158 | print 159 | 160 | otto = TornadoOctopus(concurrency=concurrency, cache=False, auto_start=True, ignore_pycurl=ignore_pycurl) 161 | 162 | for url in urls_to_retrieve: 163 | otto.enqueue(url, handle_url_response) 164 | 165 | start_time = time() 166 | otto.wait(0) 167 | 168 | return time() - start_time 169 | 170 | 171 | def handle_url_response(url, response): 172 | print "Got %s!" 
% url 173 | assert response.status_code == 200, "Expected status code for %s to be 200, got %s" % (url, response.status_code) 174 | 175 | 176 | if __name__ == '__main__': 177 | main(int(sys.argv[1]), int(sys.argv[2])) 178 | -------------------------------------------------------------------------------- /octopus/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | __version__ = '0.6.4' 5 | 6 | from octopus.core import Octopus, TimeoutError, ResponseError # NOQA 7 | from octopus.tornado_core import TornadoOctopus # NOQA 8 | -------------------------------------------------------------------------------- /octopus/cache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | from datetime import datetime, timedelta 6 | 7 | 8 | class Cache(object): 9 | def __init__(self, expiration_in_seconds): 10 | self.responses = {} 11 | self.expiration_in_seconds = expiration_in_seconds 12 | 13 | def put(self, url, response): 14 | self.responses[url] = { 15 | 'response': response, 16 | 'expires': datetime.now() + timedelta(seconds=self.expiration_in_seconds) 17 | } 18 | 19 | def get(self, url): 20 | if url not in self.responses: 21 | return None 22 | 23 | data = self.responses[url] 24 | 25 | if data['expires'] <= datetime.now(): 26 | del self.responses[url] 27 | return None 28 | 29 | return data['response'] 30 | -------------------------------------------------------------------------------- /octopus/core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | import sys 6 | import time 7 | from datetime import timedelta 8 | from threading import Thread 9 | 10 | try: 11 | import requests 12 | import requests.exceptions 13 | except ImportError: 14 | print("Can't import requests. 
Probably setup.py installing package.") 15 | 16 | from octopus.cache import Cache 17 | from octopus.model import Response 18 | 19 | try: 20 | 21 | from six.moves import queue 22 | 23 | class OctopusQueue(queue.Queue): 24 | # from http://stackoverflow.com/questions/1564501/add-timeout-argument-to-pythons-queue-join 25 | def join_with_timeout(self, timeout): 26 | self.all_tasks_done.acquire() 27 | try: 28 | endtime = time.time() + timeout 29 | while self.unfinished_tasks: 30 | remaining = endtime - time.time() 31 | if remaining <= 0.0: 32 | raise TimeoutError 33 | self.all_tasks_done.wait(remaining) 34 | finally: 35 | self.all_tasks_done.release() 36 | 37 | except ImportError: 38 | print("Can't import six. Probably setup.py installing package.") 39 | 40 | 41 | class TimeoutError(RuntimeError): 42 | pass 43 | 44 | 45 | class ResponseError(object): 46 | def __init__(self, url, status_code, text, error=None, elapsed=None): 47 | self.url = url 48 | self.status_code = status_code 49 | self.text = text 50 | self.error = error 51 | self.headers = {} 52 | self.cookies = {} 53 | self.effective_url = url 54 | self.elapsed = elapsed 55 | 56 | def close(self): 57 | pass 58 | 59 | 60 | class Octopus(object): 61 | def __init__( 62 | self, concurrency=10, auto_start=False, cache=False, 63 | expiration_in_seconds=30, request_timeout_in_seconds=5, limiter=None 64 | ): 65 | 66 | self.concurrency = concurrency 67 | self.auto_start = auto_start 68 | 69 | self.cache = cache 70 | self.response_cache = Cache(expiration_in_seconds=expiration_in_seconds) 71 | self.request_timeout_in_seconds = request_timeout_in_seconds 72 | 73 | self.url_queue = OctopusQueue() 74 | self.limiter = limiter 75 | 76 | if auto_start: 77 | self.start() 78 | 79 | def from_requests_response(self, url, response): 80 | return Response( 81 | url=url, status_code=response.status_code, 82 | headers=dict([(key, value) for key, value in response.headers.items()]), 83 | cookies=dict([(key, value) for key, value in 
response.cookies.items()]), 84 | text=response.text, effective_url=response.url, 85 | error=response.status_code > 399 and response.text or None, 86 | request_time=response.elapsed and response.elapsed.total_seconds or 0 87 | ) 88 | 89 | def enqueue(self, url, handler, method='GET', **kw): 90 | if self.cache: 91 | response = self.response_cache.get(url) 92 | if response is not None: 93 | handler(url, response) 94 | return 95 | 96 | self.url_queue.put_nowait((url, handler, method, kw)) 97 | 98 | @property 99 | def queue_size(self): 100 | return self.url_queue.qsize() 101 | 102 | @property 103 | def is_empty(self): 104 | return self.url_queue.empty() 105 | 106 | def start(self): 107 | for i in range(self.concurrency): 108 | t = Thread(target=self.do_work) 109 | t.daemon = True 110 | t.start() 111 | 112 | def do_work(self): 113 | while True: 114 | url, handler, method, kwargs = self.url_queue.get() 115 | 116 | response = None 117 | if self.cache: 118 | response = self.response_cache.get(url) 119 | 120 | if response is None: 121 | if self.limiter and not self.limiter.acquire(url): 122 | logging.info('Could not acquire limit for url "%s".' 
% url) 123 | self.url_queue.task_done() 124 | self.url_queue.put_nowait((url, handler, method, kwargs)) 125 | self.limiter.publish_lock_miss(url) 126 | time.sleep(0.1) 127 | continue 128 | 129 | try: 130 | response = requests.request(method, url, timeout=self.request_timeout_in_seconds, **kwargs) 131 | except requests.exceptions.Timeout: 132 | err = sys.exc_info()[1] 133 | response = ResponseError( 134 | url=url, 135 | status_code=500, 136 | text=str(err), 137 | error=err, 138 | elapsed=timedelta(seconds=self.request_timeout_in_seconds) 139 | ) 140 | except Exception: 141 | err = sys.exc_info()[1] 142 | response = ResponseError( 143 | url=url, 144 | status_code=599, 145 | text=str(err), 146 | error=err 147 | ) 148 | finally: 149 | if self.limiter: 150 | self.limiter.release(url) 151 | 152 | original_response = response 153 | 154 | response = self.from_requests_response(url, response) 155 | 156 | original_response.close() 157 | 158 | if self.cache: 159 | self.response_cache.put(url, response) 160 | 161 | handler(url, response) 162 | 163 | self.url_queue.task_done() 164 | 165 | def wait(self, timeout=10): 166 | if timeout > 0: 167 | self.url_queue.join_with_timeout(timeout=timeout) 168 | else: 169 | self.url_queue.join() 170 | -------------------------------------------------------------------------------- /octopus/limiter/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from cyrusbus import Bus 5 | 6 | 7 | class Limiter(object): 8 | def __init__(self, limiter_miss_timeout_ms=None): 9 | self.bus = Bus() 10 | self.limiter_miss_timeout_ms = limiter_miss_timeout_ms 11 | if self.limiter_miss_timeout_ms is None: 12 | self.limiter_miss_timeout_ms = 500 13 | 14 | def handle_callbacks(self, callback): 15 | def handle(bus, *args, **kw): 16 | callback(*args, **kw) 17 | return handle 18 | 19 | def subscribe_to_lock_miss(self, callback): 20 | 
self.bus.subscribe('limiter.miss', self.handle_callbacks(callback)) 21 | 22 | def publish_lock_miss(self, url): 23 | self.bus.publish('limiter.miss', url) 24 | -------------------------------------------------------------------------------- /octopus/limiter/in_memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/heynemann/octopus/c7bc93e60530368b137a4e9df26b5b34dacecedf/octopus/limiter/in_memory/__init__.py -------------------------------------------------------------------------------- /octopus/limiter/in_memory/per_domain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | from collections import defaultdict 6 | 7 | from octopus.limiter import Limiter as BaseLimiter 8 | 9 | 10 | class Limiter(BaseLimiter): 11 | def __init__(self, *domains, **kw): 12 | limiter_miss_timeout_ms = None 13 | if 'limiter_miss_timeout_ms' in kw: 14 | limiter_miss_timeout_ms = kw['limiter_miss_timeout_ms'] 15 | 16 | super(Limiter, self).__init__(limiter_miss_timeout_ms=limiter_miss_timeout_ms) 17 | self.update_domain_definitions(*domains) 18 | 19 | def update_domain_definitions(self, *domains): 20 | self.domains = domains 21 | self.domain_count = defaultdict(int) 22 | 23 | def get_domain_from_url(self, url): 24 | for domain in self.domains: 25 | for key in domain.keys(): 26 | if url.startswith(key): 27 | return key 28 | return None 29 | 30 | def get_domain_limit(self, url): 31 | for domain in self.domains: 32 | for key in domain.keys(): 33 | if url.startswith(key): 34 | return domain[key] 35 | return 0 36 | 37 | def acquire(self, url): 38 | domain = self.get_domain_from_url(url) 39 | if domain is None: 40 | logging.info('Tried to acquire lock to a domain that was not specified in the limiter (%s).' 
% url) 41 | return True 42 | 43 | limit = self.get_domain_limit(url) 44 | 45 | if self.domain_count[domain] < limit: 46 | self.domain_count[domain] += 1 47 | return True 48 | 49 | return False 50 | 51 | def release(self, url): 52 | domain = self.get_domain_from_url(url) 53 | if domain is None: 54 | logging.info('Tried to release lock to a domain that was not specified in the limiter (%s).' % url) 55 | return 56 | 57 | self.domain_count[domain] -= 1 58 | -------------------------------------------------------------------------------- /octopus/limiter/redis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/heynemann/octopus/c7bc93e60530368b137a4e9df26b5b34dacecedf/octopus/limiter/redis/__init__.py -------------------------------------------------------------------------------- /octopus/limiter/redis/per_domain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | 6 | from retools.limiter import Limiter as ReToolsLimiter 7 | 8 | from octopus.limiter.in_memory.per_domain import Limiter as InMemoryPerDomainLimiter 9 | 10 | 11 | class Limiter(InMemoryPerDomainLimiter): 12 | def __init__(self, *domains, **kw): 13 | limiter_miss_timeout_ms = None 14 | if 'limiter_miss_timeout_ms' in kw: 15 | limiter_miss_timeout_ms = kw['limiter_miss_timeout_ms'] 16 | 17 | # Skips InMemoryPerDomainLimiter constructor 18 | super(InMemoryPerDomainLimiter, self).__init__(limiter_miss_timeout_ms=limiter_miss_timeout_ms) 19 | 20 | if not 'redis' in kw: 21 | raise RuntimeError('You must specify a connection to redis in order to use Redis Limiter.') 22 | 23 | self.redis = kw['redis'] 24 | self.expiration_in_seconds = float(kw.get('expiration_in_seconds', 10)) 25 | 26 | self.update_domain_definitions(*domains) 27 | 28 | def update_domain_definitions(self, *domains): 29 | self.domains = domains 30 | 
class Response(object):
    """Value object describing the outcome of a single HTTP request.

    :param url: the URL that was originally requested.
    :param status_code: HTTP status code of the response (octopus uses 599
        for connection-level errors).
    :param headers: dict of response headers.
    :param cookies: dict of cookies.
    :param text: response body.
    :param effective_url: final URL after any redirects.
    :param error: error message when the request failed, otherwise None.
    :param request_time: total request duration in seconds.
    """

    def __init__(
        self, url, status_code,
        headers, cookies, text, effective_url,
        error, request_time
    ):
        self.url = url
        self.status_code = status_code
        self.cookies = cookies
        self.headers = headers
        self.text = text
        self.effective_url = effective_url
        self.error = error
        self.request_time = request_time

    def __repr__(self):
        # Short but useful in logs/debugger; avoid dumping the whole body.
        return '%s(url=%r, status_code=%r)' % (
            type(self).__name__, self.url, self.status_code
        )
    def __init__(
        self, concurrency=10, auto_start=False, cache=False,
        expiration_in_seconds=30, request_timeout_in_seconds=10,
        connect_timeout_in_seconds=5, ignore_pycurl=False,
        limiter=None, allow_connection_reuse=True):
        """Set up a tornado-based concurrent HTTP fetcher.

        :param concurrency: max number of simultaneous in-flight requests.
        :param auto_start: when True, create the IOLoop/http client now.
        :param cache: when True, successful responses are cached by URL.
        :param expiration_in_seconds: TTL for cached responses.
        :param request_timeout_in_seconds: total per-request timeout.
        :param connect_timeout_in_seconds: connection-phase timeout.
        :param ignore_pycurl: force tornado's simple client even if pycurl
            is importable.
        :param limiter: optional rate limiter providing acquire/release per URL.
        :param allow_connection_reuse: only honored when the curl client is used.
        """

        self.concurrency = concurrency
        self.auto_start = auto_start
        self.last_timeout = None

        self.cache = cache
        self.response_cache = Cache(expiration_in_seconds=expiration_in_seconds)
        self.request_timeout_in_seconds = request_timeout_in_seconds
        self.connect_timeout_in_seconds = connect_timeout_in_seconds

        self.ignore_pycurl = ignore_pycurl

        # Count of requests currently in flight, and the overflow queue of
        # (url, handler, method, kwargs) tuples waiting for a free slot.
        self.running_urls = 0
        self.url_queue = []

        if PYCURL_AVAILABLE and not self.ignore_pycurl:
            logging.debug('pycurl is available, thus Octopus will be using it instead of tornado\'s simple http client.')
            # NOTE: AsyncHTTPClient.configure is process-wide, not per-instance.
            AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
            self.allow_connection_reuse = allow_connection_reuse
        else:
            # The simple client has no fresh-connection option, so reuse is forced on.
            self.allow_connection_reuse = True

        if auto_start:
            logging.debug('Auto starting...')
            self.start()

        # NOTE(review): limiter is assigned after the auto start() call above;
        # start() does not appear to read it, but confirm before reordering.
        self.limiter = limiter
response.request.headers.get('Cookie', '') 73 | if cookies: 74 | cookies = dict([cookie.split('=') for cookie in cookies.split(';')]) 75 | 76 | return Response( 77 | url=url, status_code=response.code, 78 | headers=dict([(key, value) for key, value in response.headers.items()]), 79 | cookies=cookies, 80 | text=response.body, effective_url=response.effective_url, 81 | error=response.error and str(response.error) or None, 82 | request_time=response.request_time 83 | ) 84 | 85 | def enqueue(self, url, handler, method='GET', **kw): 86 | logging.debug('Enqueueing %s...' % url) 87 | 88 | if self.cache: 89 | response = self.response_cache.get(url) 90 | 91 | if response is not None: 92 | logging.debug('Cache hit on %s.' % url) 93 | handler(url, response) 94 | return 95 | 96 | if self.running_urls < self.concurrency: 97 | logging.debug('Queue has space available for fetching %s.' % url) 98 | self.get_next_url(url, handler, method, **kw) 99 | else: 100 | logging.debug('Queue is full. Enqueueing %s for future fetch.' % url) 101 | self.url_queue.append((url, handler, method, kw)) 102 | 103 | def fetch(self, url, handler, method, **kw): 104 | self.running_urls += 1 105 | 106 | if self.cache: 107 | response = self.response_cache.get(url) 108 | 109 | if response is not None: 110 | logging.debug('Cache hit on %s.' % url) 111 | self.running_urls -= 1 112 | handler(url, response) 113 | return 114 | 115 | logging.info('Fetching %s...' 
% url) 116 | 117 | request = HTTPRequest( 118 | url=url, 119 | method=method, 120 | connect_timeout=self.connect_timeout_in_seconds, 121 | request_timeout=self.request_timeout_in_seconds, 122 | prepare_curl_callback=self.handle_curl_callback, 123 | **kw 124 | ) 125 | 126 | self.http_client.fetch(request, self.handle_request(url, handler)) 127 | 128 | def handle_curl_callback(self, curl): 129 | if not self.allow_connection_reuse: 130 | curl.setopt(pycurl.FRESH_CONNECT, 1) 131 | 132 | def get_next_url(self, request_url=None, handler=None, method=None, **kw): 133 | if request_url is None: 134 | if not self.url_queue: 135 | return 136 | 137 | request_url, handler, method, kw = self.url_queue.pop() 138 | 139 | self.fetch_next_url(request_url, handler, method, **kw) 140 | 141 | def fetch_next_url(self, request_url, handler, method, **kw): 142 | if self.limiter and not self.limiter.acquire(request_url): 143 | logging.info('Could not acquire limit for url "%s".' % request_url) 144 | 145 | self.url_queue.append((request_url, handler, method, kw)) 146 | deadline = timedelta(seconds=self.limiter.limiter_miss_timeout_ms / 1000.0) 147 | self.ioloop.add_timeout(deadline, self.get_next_url) 148 | self.limiter.publish_lock_miss(request_url) 149 | return False 150 | 151 | logging.debug('Queue has space available for fetching %s.' % request_url) 152 | self.fetch(request_url, handler, method, **kw) 153 | return True 154 | 155 | def handle_request(self, url, callback): 156 | def handle(response): 157 | logging.debug('Handler called for url %s...' % url) 158 | self.running_urls -= 1 159 | 160 | response = self.from_tornado_response(url, response) 161 | logging.info('Got response(%s) from %s.' % (response.status_code, url)) 162 | 163 | if self.cache and response and response.status_code < 399: 164 | logging.debug('Putting %s into cache.' 
    def wait(self, timeout=10):
        """Run the IOLoop until every enqueued URL has been processed.

        :param timeout: seconds after which the loop is force-stopped via a
            signal callback; falsy values (0/None) mean wait indefinitely.
        """
        self.last_timeout = timeout
        if not self.url_queue and not self.running_urls:
            logging.debug('No urls to wait for. Returning immediately.')
            return

        if timeout:
            logging.debug('Waiting for urls to be retrieved for %s seconds.' % timeout)
            # NOTE(review): set_blocking_signal_threshold relies on SIGALRM and
            # was removed in Tornado 5 — confirm the tornado version supported.
            self.ioloop.set_blocking_signal_threshold(timeout, self.handle_wait_timeout)
        else:
            logging.debug('Waiting for urls to be retrieved.')

        logging.info('Starting IOLoop with %d URLs still left to process.' % self.remaining_requests)
        self.ioloop.start()
% self.remaining_requests) 210 | self.ioloop.stop() 211 | -------------------------------------------------------------------------------- /redis.conf: -------------------------------------------------------------------------------- 1 | daemonize yes 2 | pidfile /tmp/redis-octopus.pid 3 | port 7575 4 | dbfilename redis-octopus.rdb 5 | dir /tmp 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup, find_packages 5 | from octopus import __version__ 6 | 7 | tests_require = [ 8 | 'mock', 9 | 'nose', 10 | 'coverage', 11 | 'yanc', 12 | 'preggy', 13 | 'tox', 14 | 'ipdb', 15 | 'coveralls', 16 | ] 17 | 18 | setup( 19 | name='octopus-http', 20 | version=__version__, 21 | description='Octopus is a library to use threads to concurrently retrieve and report on the completion of http requests', 22 | long_description=''' 23 | Octopus is a library to use threads to concurrently retrieve and report on the completion of http requests 24 | ''', 25 | keywords='http concurrency threading', 26 | author='Bernardo Heynemann', 27 | author_email='heynemann@gmail.com', 28 | url='https://heynemann.github.io/octopus', 29 | license='MIT', 30 | classifiers=[ 31 | 'Development Status :: 4 - Beta', 32 | 'Intended Audience :: Developers', 33 | 'License :: OSI Approved :: MIT License', 34 | 'Natural Language :: English', 35 | 'Operating System :: MacOS', 36 | 'Operating System :: POSIX', 37 | 'Operating System :: Unix', 38 | 'Operating System :: OS Independent', 39 | 'Programming Language :: Python :: 2.7', 40 | ], 41 | packages=find_packages(), 42 | include_package_data=True, 43 | install_requires=[ 44 | 'six', 45 | 'requests', 46 | 'tornado', 47 | 'retools', 48 | 'cyrusbus' 49 | ], 50 | extras_require={ 51 | 'tests': tests_require, 52 | }, 53 | ) 54 | 
class TestCase(PythonTestCase):
    """Base test case: connects to the test redis (port 7575) and wipes it."""

    def setUp(self):
        # A fresh connection per test; flushall guarantees test isolation.
        connection = redis.Redis(host='localhost', port=7575, db=0)
        connection.flushall()
        self.redis = connection
expect(self.limiter.domains[0]['http://g1.globo.com']).to_equal(10) 25 | 26 | def test_can_acquire_limit(self): 27 | expect(self.limiter.acquire('http://g1.globo.com')).to_be_true() 28 | expect(self.limiter.domain_count).to_include('http://g1.globo.com') 29 | expect(self.limiter.domain_count['http://g1.globo.com']).to_equal(1) 30 | 31 | def test_acquiring_internal_url_gets_proper_domain(self): 32 | expect(self.limiter.acquire('http://g1.globo.com/economia/')).to_be_true() 33 | expect(self.limiter.domain_count).to_include('http://g1.globo.com') 34 | expect(self.limiter.domain_count['http://g1.globo.com']).to_equal(1) 35 | 36 | @patch.object(logging, 'info') 37 | def test_can_acquire_from_unknown_domain_url(self, logging_mock): 38 | limiter = PerDomainInMemoryLimiter( 39 | {'http://globoesporte.globo.com': 10} 40 | ) 41 | 42 | expect(limiter.acquire('http://g1.globo.com/economia/')).to_be_true() 43 | expect(limiter.domain_count).to_be_empty() 44 | logging_mock.assert_called_once_with('Tried to acquire lock to a domain that was not specified in the limiter (http://g1.globo.com/economia/).') 45 | 46 | def test_can_release(self): 47 | url = 'http://g1.globo.com/economia/' 48 | self.limiter.acquire(url) 49 | self.limiter.release(url) 50 | 51 | expect(self.limiter.domain_count['http://g1.globo.com']).to_equal(0) 52 | 53 | def test_can_get_domain_from_url(self): 54 | expect(self.limiter.get_domain_from_url('http://g1.globo.com/economia/')).to_equal('http://g1.globo.com') 55 | 56 | def test_can_get_domain_limit(self): 57 | url = 'http://g1.globo.com/economia/' 58 | expect(self.limiter.get_domain_limit(url)).to_equal(10) 59 | 60 | self.limiter.acquire(url) 61 | expect(self.limiter.get_domain_limit(url)).to_equal(10) 62 | 63 | expect(self.limiter.get_domain_limit('http://www.google.com')).to_equal(0) 64 | 65 | @patch.object(logging, 'info') 66 | def test_can_release_unknown_url(self, logging_mock): 67 | self.limiter.release('http://www.google.com') 68 | 69 | 
expect(self.limiter.domain_count).to_be_empty() 70 | logging_mock.assert_called_once_with('Tried to release lock to a domain that was not specified in the limiter (http://www.google.com).') 71 | -------------------------------------------------------------------------------- /tests/limiter/redis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/heynemann/octopus/c7bc93e60530368b137a4e9df26b5b34dacecedf/tests/limiter/redis/__init__.py -------------------------------------------------------------------------------- /tests/limiter/redis/test_per_domain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import logging 6 | 7 | from preggy import expect 8 | from mock import patch 9 | 10 | from octopus.limiter.redis.per_domain import Limiter as PerDomainRedisLimiter 11 | from tests import TestCase 12 | 13 | 14 | class TestPerDomain(TestCase): 15 | def setUp(self): 16 | super(TestPerDomain, self).setUp() 17 | self.limiter = PerDomainRedisLimiter( 18 | {'http://g1.globo.com': 10}, 19 | {'http://globoesporte.globo.com': 10}, 20 | redis=self.redis, 21 | expiration_in_seconds=12 22 | ) 23 | 24 | def test_can_create_limiter(self): 25 | expect(self.limiter.redis).to_equal(self.redis) 26 | expect(self.limiter.expiration_in_seconds).to_equal(12) 27 | expect(self.limiter.domains[0]).to_include('http://g1.globo.com') 28 | expect(self.limiter.domains).not_to_be_null() 29 | expect(self.limiter.domains[0]['http://g1.globo.com']).to_equal(10) 30 | 31 | def test_cant_create_limiter_without_redis(self): 32 | try: 33 | PerDomainRedisLimiter() 34 | except RuntimeError: 35 | err = sys.exc_info()[1] 36 | expect(err).to_have_an_error_message_of('You must specify a connection to redis in order to use Redis Limiter.') 37 | else: 38 | assert False, "Should not have gotten this far" 39 | 40 | def 
test_can_acquire_limit(self): 41 | expect(self.limiter.acquire('http://g1.globo.com')).to_be_true() 42 | 43 | try: 44 | expect(self.redis.zcard('limit-for-http://g1.globo.com')).to_equal(1) 45 | finally: 46 | self.limiter.release('http://g1.globo.com') 47 | 48 | def test_acquiring_internal_url_gets_proper_domain(self): 49 | url = 'http://g1.globo.com/economia/' 50 | expect(self.limiter.acquire(url)).to_be_true() 51 | 52 | try: 53 | expect(self.redis.zcard('limit-for-http://g1.globo.com')).to_equal(1) 54 | finally: 55 | self.limiter.release(url) 56 | 57 | def test_can_acquire_from_unknown_domain_url(self): 58 | limiter = PerDomainRedisLimiter( 59 | {'http://globoesporte.globo.com': 10}, 60 | redis=self.redis 61 | ) 62 | 63 | url = 'http://g1.globo.com/economia/' 64 | expect(limiter.acquire(url)).to_be_true() 65 | expect(self.redis.zcard('limit-for-http://g1.globo.com')).to_equal(0) 66 | 67 | def test_can_release(self): 68 | url = 'http://g1.globo.com/economia/' 69 | self.limiter.acquire(url) 70 | self.limiter.release(url) 71 | 72 | expect(self.redis.zcard('limit-for-http://g1.globo.com')).to_equal(0) 73 | 74 | def test_can_get_domain_from_url(self): 75 | expect(self.limiter.get_domain_from_url('http://g1.globo.com/economia/')).to_equal('http://g1.globo.com') 76 | 77 | def test_can_get_domain_limit(self): 78 | url = 'http://g1.globo.com/economia/' 79 | expect(self.limiter.get_domain_limit(url)).to_equal(10) 80 | 81 | expect(self.limiter.get_domain_limit('http://www.google.com')).to_equal(0) 82 | 83 | @patch.object(logging, 'info') 84 | def test_can_release_unknown_url(self, logging_mock): 85 | self.limiter.release('http://www.google.com') 86 | 87 | expect(self.redis.zcard('limit-for-http://www.google.com')).to_equal(0) 88 | 89 | logging_mock.assert_called_once_with( 90 | 'Tried to release lock to a domain that was not specified ' 91 | 'in the limiter (http://www.google.com).' 
class TestBaseLimiter(TestCase):
    """Covers the pub/sub plumbing of the base Limiter."""

    def setUp(self):
        super(TestBaseLimiter, self).setUp()
        self.limiter = Limiter()
        self.handled_url = None

    def test_has_bus(self):
        expect(self.limiter.bus).not_to_be_null()

    def test_can_subscribe(self):
        def on_miss(url):
            pass

        self.limiter.subscribe_to_lock_miss(on_miss)

        expect(self.limiter.bus.has_any_subscriptions('limiter.miss')).to_be_true()

    def test_can_get_lock_miss(self):
        def on_miss(url):
            self.handled_url = url

        self.limiter.subscribe_to_lock_miss(on_miss)
        self.limiter.publish_lock_miss('some-url')

        expect(self.handled_url).to_equal('some-url')
class TestResponseModel(TestCase):
    """Ensures Response stores every constructor argument untouched."""

    def test_can_create_response(self):
        headers = {'Accept': 'image/webp; */*'}
        cookies = {'whatever': 'some-value'}

        response = Response(
            url="http://www.google.com",
            status_code=200,
            headers=headers,
            cookies=cookies,
            text='some request body',
            effective_url='http://www.google.com/',
            error="some error message",
            request_time=10.24
        )

        expect(response.url).to_equal('http://www.google.com')
        expect(response.status_code).to_equal(200)
        expect(response.headers).to_be_like(headers)
        expect(response.cookies).to_be_like(cookies)
        expect(response.text).to_equal('some request body')
        expect(response.effective_url).to_equal('http://www.google.com/')
        expect(response.error).to_equal('some error message')
        expect(response.request_time).to_equal(10.24)
import expect 7 | from mock import Mock 8 | 9 | from octopus import Octopus, TimeoutError 10 | from tests import TestCase 11 | 12 | 13 | class TestOctopus(TestCase): 14 | def setUp(self): 15 | self.response = None 16 | self.responses = {} 17 | 18 | def test_can_create_octopus(self): 19 | otto = Octopus(concurrency=20) 20 | expect(otto.concurrency).to_equal(20) 21 | expect(otto.auto_start).to_be_false() 22 | expect(otto.cache).to_be_false() 23 | 24 | def test_has_default_concurrency(self): 25 | otto = Octopus() 26 | expect(otto.concurrency).to_equal(10) 27 | 28 | def test_queue_is_empty(self): 29 | otto = Octopus() 30 | expect(otto.is_empty).to_be_true() 31 | 32 | def test_can_enqueue_url(self): 33 | otto = Octopus() 34 | 35 | otto.enqueue('http://www.google.com', None) 36 | 37 | expect(otto.queue_size).to_equal(1) 38 | 39 | def test_can_get_after_started(self): 40 | otto = Octopus(concurrency=1) 41 | 42 | def handle_url_response(url, response): 43 | self.response = response 44 | 45 | otto.enqueue('http://www.twitter.com', handle_url_response) 46 | otto.start() 47 | 48 | otto.wait(5) 49 | 50 | expect(self.response).not_to_be_null() 51 | expect(self.response.status_code).to_equal(200) 52 | 53 | def test_can_get_with_auto_start(self): 54 | otto = Octopus(concurrency=1, auto_start=True) 55 | 56 | def handle_url_response(url, response): 57 | self.response = response 58 | 59 | otto.enqueue('http://www.twitter.com', handle_url_response) 60 | 61 | otto.wait(5) 62 | 63 | expect(self.response).not_to_be_null() 64 | expect(self.response.status_code).to_equal(200) 65 | 66 | def test_can_wait(self): 67 | otto = Octopus(concurrency=1) 68 | 69 | def handle_url_response(url, response): 70 | self.response = response 71 | 72 | otto.enqueue('http://www.twitter.com', handle_url_response) 73 | otto.start() 74 | 75 | otto.wait(0) 76 | 77 | expect(self.response).not_to_be_null() 78 | expect(self.response.status_code).to_equal(200) 79 | 80 | def 
test_wait_returns_automatically_when_empty(self): 81 | otto = Octopus(concurrency=1) 82 | otto.start() 83 | 84 | otto.wait(5) 85 | 86 | expect(otto.is_empty).to_be_true() 87 | 88 | def test_times_out_on_wait(self): 89 | otto = Octopus(concurrency=1) 90 | 91 | def handle_url_response(url, response): 92 | self.response = response 93 | 94 | otto.enqueue('http://www.google.com', handle_url_response) 95 | 96 | try: 97 | otto.wait(0.1) 98 | except TimeoutError: 99 | err = sys.exc_info()[1] 100 | expect(err).to_have_an_error_message_of("") 101 | else: 102 | assert False, "Should not have gotten this far" 103 | 104 | def test_can_handle_more_urls_concurrently(self): 105 | urls = [ 106 | 'http://www.twitter.com', 107 | 'http://www.cnn.com', 108 | 'http://www.bbc.com', 109 | 'http://www.facebook.com' 110 | ] 111 | otto = Octopus(concurrency=4) 112 | 113 | def handle_url_response(url, response): 114 | self.responses[url] = response 115 | 116 | for url in urls: 117 | otto.enqueue(url, handle_url_response) 118 | 119 | otto.start() 120 | 121 | otto.wait(10) 122 | 123 | expect(self.responses).to_length(4) 124 | 125 | for url in urls: 126 | expect(self.responses).to_include(url) 127 | expect(self.responses[url].status_code).to_equal(200) 128 | 129 | def test_can_handle_cached_responses(self): 130 | response = Mock(status_code=200, body="whatever") 131 | 132 | url = 'http://www.google.com' 133 | otto = Octopus(concurrency=1, cache=True) 134 | otto.response_cache.put(url, response) 135 | 136 | def handle_url_response(url, response): 137 | self.response = response 138 | 139 | otto.enqueue(url, handle_url_response) 140 | 141 | expect(self.response).not_to_be_null() 142 | expect(self.response.status_code).to_equal(200) 143 | expect(self.response.body).to_equal("whatever") 144 | 145 | def test_can_handle_cached_responses_when_not_cached(self): 146 | url = 'http://www.twitter.com' 147 | otto = Octopus(concurrency=1, cache=True) 148 | 149 | def handle_url_response(url, response): 150 | 
self.response = response 151 | 152 | otto.enqueue(url, handle_url_response) 153 | otto.enqueue(url, handle_url_response) 154 | otto.enqueue(url, handle_url_response) 155 | otto.enqueue(url, handle_url_response) 156 | 157 | otto.start() 158 | 159 | otto.wait(5) 160 | 161 | expect(self.response).not_to_be_null() 162 | expect(self.response.status_code).to_equal(200) 163 | 164 | def test_can_handle_invalid_urls(self): 165 | url = 'http://kagdjdkjgka.fk' 166 | otto = Octopus(concurrency=1) 167 | 168 | def handle_url_response(url, response): 169 | self.response = response 170 | 171 | otto.enqueue(url, handle_url_response) 172 | 173 | otto.start() 174 | 175 | otto.wait(5) 176 | 177 | expect(self.response).not_to_be_null() 178 | expect(self.response.status_code).to_equal(599) 179 | expect(self.response.text).to_include("HTTPConnectionPool(host='kagdjdkjgka.fk', port=80)") 180 | expect(self.response.text).to_include('Max retries exceeded with url: /') 181 | expect(self.response.error).to_equal(self.response.text) 182 | 183 | def test_can_handle_timeouts(self): 184 | url = 'http://baidu.com' 185 | otto = Octopus(concurrency=1, request_timeout_in_seconds=0.1) 186 | 187 | def handle_url_response(url, response): 188 | self.response = response 189 | 190 | otto.enqueue(url, handle_url_response) 191 | 192 | otto.start() 193 | 194 | otto.wait(5) 195 | 196 | expect(self.response.text).to_include('Connection to baidu.com timed out') 197 | expect(self.response.error).to_include('Connection to baidu.com timed out. 
(connect timeout=0.1)') 198 | -------------------------------------------------------------------------------- /tests/test_octopus_limited.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from preggy import expect 5 | 6 | from octopus import Octopus 7 | from octopus.limiter.redis.per_domain import Limiter as PerDomainRedisLimiter 8 | from octopus.limiter.in_memory.per_domain import Limiter as PerDomainInMemoryLimiter 9 | from tests import TestCase 10 | 11 | 12 | class TestThreadedOctopusAgainstLimiter(TestCase): 13 | def setUp(self): 14 | super(TestThreadedOctopusAgainstLimiter, self).setUp() 15 | 16 | self.response = None 17 | self.url = None 18 | self.responses = {} 19 | self.cache_miss = set() 20 | self.redis.flushall() 21 | 22 | def handle_url_response(self, url, response): 23 | self.responses[url] = response 24 | 25 | def handle_limiter_miss(self, url): 26 | self.cache_miss.add(url) 27 | 28 | def test_should_not_get_more_than_one_url_for_same_domain_concurrently(self): 29 | limiter = PerDomainRedisLimiter( 30 | {'http://g1.globo.com': 1}, 31 | {'http://globoesporte.globo.com': 1}, 32 | redis=self.redis 33 | ) 34 | otto = Octopus(concurrency=10, auto_start=True, limiter=limiter) 35 | 36 | otto.enqueue('http://globoesporte.globo.com', self.handle_url_response) 37 | otto.enqueue('http://globoesporte.globo.com/futebol/times/flamengo/', self.handle_url_response) 38 | otto.enqueue('http://g1.globo.com', self.handle_url_response) 39 | otto.enqueue('http://g1.globo.com/economia', self.handle_url_response) 40 | 41 | otto.wait(10) 42 | 43 | expect(self.responses).to_length(4) 44 | expect(self.redis.zcard('limit-for-http://g1.globo.com')).to_equal(0) 45 | expect(self.redis.zcard('limit-for-http://globoesporte.globo.com')).to_equal(0) 46 | 47 | def test_should_call_limiter_miss_twice(self): 48 | limiter = PerDomainInMemoryLimiter( 49 | {'http://g1.globo.com': 1}, 50 | 
{'http://globoesporte.globo.com': 1}, 51 | ) 52 | limiter.subscribe_to_lock_miss(self.handle_limiter_miss) 53 | otto = Octopus(concurrency=10, auto_start=True, limiter=limiter) 54 | 55 | otto.enqueue('http://globoesporte.globo.com/', self.handle_url_response) 56 | otto.enqueue('http://globoesporte.globo.com/futebol/times/flamengo/', self.handle_url_response) 57 | otto.enqueue('http://g1.globo.com/', self.handle_url_response) 58 | otto.enqueue('http://g1.globo.com/economia/', self.handle_url_response) 59 | 60 | otto.wait() 61 | 62 | expect(self.cache_miss).to_length(2) 63 | -------------------------------------------------------------------------------- /tests/test_tornado_octopus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | 6 | from preggy import expect 7 | from mock import Mock, patch 8 | 9 | from octopus import TornadoOctopus 10 | from octopus.cache import Cache 11 | from tests import TestCase 12 | 13 | 14 | class TestTornadoOctopus(TestCase): 15 | def setUp(self): 16 | self.response = None 17 | self.url = None 18 | self.responses = {} 19 | 20 | def get_response(self, request=None): 21 | if request is None: 22 | request = Mock( 23 | headers={ 24 | 'Cookie': 'foo=bar' 25 | }, 26 | ) 27 | 28 | return Mock( 29 | request=request, 30 | headers={ 31 | 'baz': 'foo' 32 | }, 33 | code=200, 34 | body='body', 35 | effective_url='http://www.google.com/', 36 | error='error', 37 | request_time=2.1 38 | ) 39 | 40 | def test_can_create_tornado_otto(self): 41 | otto = TornadoOctopus() 42 | 43 | expect(otto.concurrency).to_equal(10) 44 | expect(otto.auto_start).to_be_false() 45 | expect(otto.cache).to_be_false() 46 | 47 | expect(otto.response_cache).not_to_be_null() 48 | expect(otto.response_cache).to_be_instance_of(Cache) 49 | expect(otto.response_cache.expiration_in_seconds).to_equal(30) 50 | 51 | expect(otto.request_timeout_in_seconds).to_equal(10) 52 | 
expect(otto.connect_timeout_in_seconds).to_equal(5) 53 | expect(otto.ignore_pycurl).to_be_false() 54 | 55 | expect(otto.running_urls).to_equal(0) 56 | expect(otto.url_queue).to_be_empty() 57 | 58 | def test_can_create_tornado_otto_with_custom_values(self): 59 | otto = TornadoOctopus( 60 | concurrency=20, auto_start=True, cache=True, 61 | expiration_in_seconds=60, request_timeout_in_seconds=20, 62 | connect_timeout_in_seconds=10, ignore_pycurl=True 63 | 64 | ) 65 | 66 | expect(otto.concurrency).to_equal(20) 67 | expect(otto.auto_start).to_be_true() 68 | expect(otto.cache).to_be_true() 69 | 70 | expect(otto.response_cache).not_to_be_null() 71 | expect(otto.response_cache).to_be_instance_of(Cache) 72 | expect(otto.response_cache.expiration_in_seconds).to_equal(60) 73 | 74 | expect(otto.request_timeout_in_seconds).to_equal(20) 75 | expect(otto.connect_timeout_in_seconds).to_equal(10) 76 | expect(otto.ignore_pycurl).to_be_true() 77 | 78 | expect(otto.running_urls).to_equal(0) 79 | expect(otto.url_queue).to_be_empty() 80 | 81 | def test_can_get_queue_info(self): 82 | otto = TornadoOctopus() 83 | 84 | expect(otto.queue_size).to_equal(0) 85 | expect(otto.is_empty).to_be_true() 86 | 87 | def test_can_get_response_from_tornado_response(self): 88 | response = self.get_response() 89 | 90 | otto_response = TornadoOctopus.from_tornado_response('http://www.google.com', response) 91 | 92 | expect(otto_response.url).to_equal('http://www.google.com') 93 | expect(otto_response.headers).to_be_like(response.headers) 94 | expect(otto_response.cookies).to_be_like({ 95 | 'foo': 'bar' 96 | }) 97 | expect(otto_response.text).to_equal('body') 98 | expect(otto_response.error).to_equal('error') 99 | expect(otto_response.request_time).to_equal(2.1) 100 | 101 | def test_can_get_response_from_tornado_response_when_no_cookies(self): 102 | response = self.get_response(request=Mock(headers={})) 103 | 104 | otto_response = TornadoOctopus.from_tornado_response('http://www.google.com', response) 105 | 
106 | expect(otto_response.url).to_equal('http://www.google.com') 107 | expect(otto_response.headers).to_be_like(response.headers) 108 | expect(otto_response.cookies).to_be_empty() 109 | expect(otto_response.text).to_equal('body') 110 | expect(otto_response.error).to_equal('error') 111 | expect(otto_response.request_time).to_equal(2.1) 112 | 113 | def test_can_enqueue_url(self): 114 | otto = TornadoOctopus(cache=False, concurrency=0) 115 | 116 | otto.enqueue('http://www.google.com', None, method='GET', something="else") 117 | 118 | expect(otto.url_queue).to_length(1) 119 | 120 | @patch.object(TornadoOctopus, 'fetch') 121 | def test_can_enqueue_url_and_fetch(self, fetch_mock): 122 | otto = TornadoOctopus(cache=True) 123 | 124 | otto.enqueue('http://www.google.com', None, method='GET', something="else") 125 | 126 | expect(otto.url_queue).to_be_empty() 127 | fetch_mock.assert_called_once_with('http://www.google.com', None, 'GET', something='else') 128 | 129 | def test_can_enqueue_and_get_from_cache(self): 130 | mock_response = Mock() 131 | otto = TornadoOctopus(cache=True) 132 | otto.response_cache.put('http://www.google.com', mock_response) 133 | 134 | def response(url, response): 135 | self.url = url 136 | self.response = response 137 | 138 | otto.enqueue('http://www.google.com', response, method='GET') 139 | 140 | expect(otto.url_queue).to_be_empty() 141 | expect(self.response).not_to_be_null() 142 | expect(self.response).to_equal(mock_response) 143 | 144 | def test_can_enqueue_and_get_when_cache_miss(self): 145 | otto = TornadoOctopus(cache=True, auto_start=True) 146 | 147 | def response(url, response): 148 | self.url = url 149 | self.response = response 150 | 151 | otto.enqueue('http://www.google.com', response, method='GET') 152 | otto.wait(2) 153 | 154 | expect(otto.url_queue).to_be_empty() 155 | expect(self.response).not_to_be_null() 156 | 157 | def test_can_fetch(self): 158 | otto = TornadoOctopus(cache=False, auto_start=True) 159 | 
otto.response_cache.put('http://www.google.com', Mock()) 160 | 161 | http_client_mock = Mock() 162 | otto.http_client = http_client_mock 163 | 164 | otto.fetch('http://www.google.com', None, 'GET') 165 | 166 | expect(otto.running_urls).to_equal(1) 167 | expect(http_client_mock.fetch.called).to_be_true() 168 | 169 | def test_fetch_gets_the_response_from_cache_if_available(self): 170 | otto = TornadoOctopus(cache=True, auto_start=True) 171 | response_mock = Mock() 172 | otto.response_cache.put('http://www.google.com', response_mock) 173 | 174 | http_client_mock = Mock() 175 | otto.http_client = http_client_mock 176 | 177 | callback = Mock() 178 | 179 | otto.fetch('http://www.google.com', callback, 'GET') 180 | 181 | expect(otto.running_urls).to_equal(0) 182 | expect(http_client_mock.fetch.called).to_be_false() 183 | callback.assert_called_once_with('http://www.google.com', response_mock) 184 | 185 | @patch.object(TornadoOctopus, 'stop') 186 | def test_handle_request(self, stop_mock): 187 | otto = TornadoOctopus(cache=False, auto_start=True) 188 | 189 | response = self.get_response() 190 | 191 | callback = Mock() 192 | 193 | handle_request = otto.handle_request('some url', callback) 194 | 195 | handle_request(response) 196 | 197 | expect(otto.running_urls).to_equal(-1) 198 | expect(callback.called).to_be_true() 199 | expect(stop_mock.called).to_be_true() 200 | 201 | @patch.object(TornadoOctopus, 'stop') 202 | def test_handle_request_when_queue_has_no_items(self, stop_mock): 203 | otto = TornadoOctopus(cache=True, auto_start=True) 204 | otto.response_cache = Mock() 205 | 206 | response = self.get_response() 207 | 208 | callback = Mock() 209 | 210 | handle_request = otto.handle_request('some url', callback) 211 | 212 | handle_request(response) 213 | 214 | expect(otto.running_urls).to_equal(-1) 215 | expect(callback.called).to_be_true() 216 | expect(stop_mock.called).to_be_true() 217 | expect(otto.response_cache.put.called).to_be_true() 218 | 219 | def 
test_handle_request_when_queue_has_no_items_but_running_urls(self): 220 | otto = TornadoOctopus(cache=True, auto_start=True) 221 | otto.response_cache = Mock() 222 | otto.running_urls = 10 223 | 224 | response = self.get_response() 225 | 226 | callback = Mock() 227 | 228 | handle_request = otto.handle_request('some url', callback) 229 | 230 | handle_request(response) 231 | 232 | expect(otto.running_urls).to_equal(9) 233 | expect(callback.called).to_be_true() 234 | expect(otto.response_cache.put.called).to_be_true() 235 | 236 | @patch.object(TornadoOctopus, 'fetch') 237 | def test_handle_request_when_queue_has_items(self, fetch_mock): 238 | otto = TornadoOctopus(cache=False, auto_start=True) 239 | 240 | handler_mock = Mock() 241 | 242 | otto.url_queue.append( 243 | ('other url', handler_mock, 'POST', {'foo': 'bar'}) 244 | ) 245 | 246 | response = self.get_response() 247 | callback = Mock() 248 | 249 | handle_request = otto.handle_request('some url', callback) 250 | handle_request(response) 251 | 252 | expect(otto.running_urls).to_equal(-1) 253 | expect(otto.url_queue).to_be_empty() 254 | expect(callback.called).to_be_true() 255 | fetch_mock.assert_called_once_with('other url', handler_mock, 'POST', foo='bar') 256 | 257 | def test_can_handle_wait_timeout(self): 258 | otto = TornadoOctopus(cache=False, auto_start=True) 259 | otto.ioloop = Mock() 260 | 261 | otto.handle_wait_timeout(1, None) 262 | 263 | expect(otto.ioloop.stop.called).to_be_true() 264 | 265 | def test_can_stop(self): 266 | otto = TornadoOctopus(cache=False, auto_start=True) 267 | otto.ioloop = Mock() 268 | 269 | otto.stop() 270 | 271 | expect(otto.ioloop.stop.called).to_be_true() 272 | 273 | @patch.object(logging, 'debug') 274 | def test_can_wait_when_no_urls(self, logging_mock): 275 | otto = TornadoOctopus(cache=False, auto_start=True) 276 | 277 | otto.wait() 278 | 279 | logging_mock.assert_calls('No urls to wait for. 
Returning immediately.') 280 | 281 | def test_can_wait_when_urls_and_timeout(self): 282 | otto = TornadoOctopus(cache=False, auto_start=True) 283 | otto.ioloop = Mock() 284 | otto.running_urls = 10 285 | 286 | otto.wait() 287 | 288 | expect(otto.ioloop.set_blocking_signal_threshold.called) 289 | 290 | @patch.object(logging, 'debug') 291 | def test_can_wait_when_urls_and_no_timeout(self, logging_mock): 292 | otto = TornadoOctopus(cache=False, auto_start=True) 293 | otto.ioloop = Mock() 294 | otto.running_urls = 10 295 | 296 | otto.wait(0) 297 | 298 | logging_mock.assert_calls('Waiting for urls to be retrieved.') 299 | 300 | def test_can_get_many_urls(self): 301 | urls = [ 302 | 'http://www.globo.com', 303 | 'http://www.twitter.com', 304 | 'http://www.facebook.com' 305 | ] 306 | otto = TornadoOctopus(concurrency=4, auto_start=True) 307 | 308 | def handle_url_response(url, response): 309 | self.responses[url] = response 310 | 311 | for url in urls: 312 | otto.enqueue(url, handle_url_response) 313 | 314 | otto.wait(2) 315 | 316 | expect(self.responses).to_length(3) 317 | 318 | for url in urls: 319 | expect(self.responses).to_include(url) 320 | expect(self.responses[url].status_code).to_equal(200) 321 | 322 | def test_can_handle_invalid_urls(self): 323 | url = 'http://kagdjdkjgka.fk' 324 | otto = TornadoOctopus(concurrency=1, auto_start=True) 325 | 326 | def handle_url_response(url, response): 327 | self.response = response 328 | 329 | otto.enqueue(url, handle_url_response) 330 | 331 | otto.wait(5) 332 | 333 | expect(self.response).not_to_be_null() 334 | expect(self.response.status_code).to_equal(599) 335 | expect(self.response.text).to_be_null() 336 | expect(self.response.error).not_to_be_null() 337 | 338 | def test_can_handle_timeouts(self): 339 | url = 'http://baidu.com' 340 | otto = TornadoOctopus(concurrency=1, request_timeout_in_seconds=0.1, auto_start=True) 341 | 342 | def handle_url_response(url, response): 343 | self.response = response 344 | 345 | 
otto.enqueue(url, handle_url_response) 346 | 347 | otto.wait(5) 348 | 349 | expect(self.response.status_code).to_equal(599) 350 | expect(self.response.text).to_be_null() 351 | expect(self.response.error).not_to_be_null() 352 | 353 | @patch.object(logging, 'exception') 354 | def test_can_handle_exception(self, log_mock): 355 | url = 'http://www.globo.com' 356 | 357 | otto = TornadoOctopus(concurrency=4, auto_start=True) 358 | 359 | def handle_url_response(url, response): 360 | raise RuntimeError(url) 361 | 362 | otto.enqueue(url, handle_url_response) 363 | 364 | otto.wait(2) 365 | 366 | log_mock.assert_called_once_with('Error calling callback for http://www.globo.com.') 367 | -------------------------------------------------------------------------------- /tests/test_tornado_octopus_limited.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from preggy import expect 5 | 6 | from octopus import TornadoOctopus 7 | from octopus.limiter.redis.per_domain import Limiter as PerDomainRedisLimiter 8 | from octopus.limiter.in_memory.per_domain import Limiter as PerDomainInMemoryLimiter 9 | from tests import TestCase 10 | 11 | 12 | class TestTornadoCoreLimited(TestCase): 13 | def setUp(self): 14 | super(TestTornadoCoreLimited, self).setUp() 15 | 16 | self.response = None 17 | self.url = None 18 | self.responses = {} 19 | self.cache_miss = set() 20 | self.redis.flushall() 21 | 22 | def handle_url_response(self, url, response): 23 | self.responses[url] = response 24 | 25 | def handle_limiter_miss(self, url): 26 | self.cache_miss.add(url) 27 | 28 | def test_should_not_get_more_than_one_url_for_same_domain_concurrently(self): 29 | limiter = PerDomainInMemoryLimiter( 30 | {'http://g1.globo.com': 1}, 31 | {'http://globoesporte.globo.com': 1} 32 | ) 33 | otto = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter) 34 | 35 | otto.enqueue('http://globoesporte.globo.com', 
self.handle_url_response) 36 | otto.enqueue('http://globoesporte.globo.com/futebol/times/flamengo/', self.handle_url_response) 37 | otto.enqueue('http://g1.globo.com', self.handle_url_response) 38 | otto.enqueue('http://g1.globo.com/economia', self.handle_url_response) 39 | 40 | otto.wait(2) 41 | 42 | expect(self.responses).to_length(4) 43 | expect(list(limiter.domain_count.keys())).to_be_like(['http://g1.globo.com', 'http://globoesporte.globo.com']) 44 | 45 | def test_should_call_limiter_miss_twice(self): 46 | limiter = PerDomainRedisLimiter( 47 | {'http://g1.globo.com': 1}, 48 | {'http://globoesporte.globo.com': 1}, 49 | redis=self.redis 50 | ) 51 | limiter.subscribe_to_lock_miss(self.handle_limiter_miss) 52 | otto = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter) 53 | 54 | otto.enqueue('http://globoesporte.globo.com/', self.handle_url_response) 55 | otto.enqueue('http://globoesporte.globo.com/futebol/times/flamengo/', self.handle_url_response) 56 | otto.enqueue('http://g1.globo.com/', self.handle_url_response) 57 | otto.enqueue('http://g1.globo.com/economia/', self.handle_url_response) 58 | 59 | otto.wait() 60 | 61 | expect(self.cache_miss).to_length(2) 62 | --------------------------------------------------------------------------------