├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── benchmark ├── __init__.py └── test_octopus.py ├── octopus ├── __init__.py ├── cache.py ├── core.py ├── limiter │ ├── __init__.py │ ├── in_memory │ │ ├── __init__.py │ │ └── per_domain.py │ └── redis │ │ ├── __init__.py │ │ └── per_domain.py ├── model.py └── tornado_core.py ├── redis.conf ├── setup.py └── tests ├── __init__.py ├── limiter ├── __init__.py ├── in_memory │ ├── __init__.py │ └── test_per_domain.py ├── redis │ ├── __init__.py │ └── test_per_domain.py └── test_base_limiter.py ├── test_cache.py ├── test_model.py ├── test_octopus.py ├── test_octopus_limited.py ├── test_tornado_octopus.py └── test_tornado_octopus_limited.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | *tests.py 4 | branch = True 5 | source = 6 | octopus 7 | 8 | [report] 9 | exclude_lines = 10 | pragma: no cover 11 | def __repr__ 12 | raise NotImplementedError 13 | if __name__ == .__main__.: 14 | from urllib.parse import parse_qs 15 | except ImportError: 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | cover 38 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 
"2.7" 4 | - "3.3" 5 | - "pypy" 6 | 7 | matrix: 8 | allow_failures: 9 | - python: "3.3" 10 | - python: "pypy" 11 | 12 | install: 13 | # install python requirements 14 | - pip install coveralls 15 | - pip install -e .[tests] 16 | 17 | script: 18 | - make test 19 | 20 | after_success: 21 | - coveralls 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Bernardo Heynemann 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: redis 2 | @coverage run --branch `which nosetests` -vv --with-yanc -s tests/ 3 | @coverage report -m --fail-under=90 4 | 5 | coverage-html: test 6 | @coverage html -d cover 7 | @open cover/index.html 8 | 9 | setup: 10 | @pip install -U -e .\[tests\] 11 | 12 | kill_redis: 13 | -redis-cli -p 7575 shutdown 14 | 15 | redis: kill_redis 16 | redis-server ./redis.conf; sleep 1 17 | redis-cli -p 7575 info > /dev/null 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | octopus 2 | ======= 3 | 4 | [![Build Status](https://travis-ci.org/heynemann/octopus.png?branch=master)](https://travis-ci.org/heynemann/octopus) 5 | [![PyPi version](https://pypip.in/v/octopus-http/badge.png)](https://crate.io/packages/octopus-http/) 6 | [![PyPi downloads](https://pypip.in/d/octopus-http/badge.png)](https://crate.io/packages/octopus-http/) 7 | [![Coverage Status](https://coveralls.io/repos/heynemann/octopus/badge.png?branch=master)](https://coveralls.io/r/heynemann/octopus?branch=master) 8 | 9 | `octopus` is a library to concurrently retrieve and report on the completion of http requests. 10 | 11 | You can either use threads or the tornado IOLoop to asynchronously get them. 12 | 13 | Installing 14 | ========== 15 | 16 | Installing `octopus` is really easy: 17 | 18 | $ pip install octopus-http 19 | 20 | The reason for the name of the package is that a package called `octopus` was already registered at the Python Package Index. 
21 | 22 | Using 23 | ===== 24 | 25 | Using `octopus` with threads: 26 | 27 | from octopus import Octopus 28 | 29 | # this Octopus instance we'll run 4 threads, 30 | # automatically start listening to the queue and 31 | # we'll in-memory cache responses for 10 seconds. 32 | otto = Octopus( 33 | concurrency=4, auto_start=True, cache=True, 34 | expiration_in_seconds=10 35 | ) 36 | 37 | def handle_url_response(url, response): 38 | # do something with response 39 | 40 | otto.enqueue('http://www.google.com', handle_url_response) 41 | otto.enqueue('http://www.facebook.com', handle_url_response) 42 | otto.enqueue('http://www.yahoo.com', handle_url_response) 43 | 44 | # this request will come from the cache 45 | otto.enqueue('http://www.google.com', handle_url_response) 46 | 47 | otto.wait() # waits until queue is empty or timeout is ellapsed 48 | 49 | The analogous version with Tornado's IOLoop: 50 | 51 | from octopus import TornadoOctopus 52 | 53 | # this Octopus instance we'll run 4 concurrent requests max, 54 | # automatically start listening to the queue and 55 | # we'll in-memory cache responses for 10 seconds. 56 | otto = TornadoOctopus( 57 | concurrency=4, auto_start=True, cache=True, 58 | expiration_in_seconds=10 59 | ) 60 | 61 | def handle_url_response(url, response): 62 | # do something with response 63 | 64 | otto.enqueue('http://www.google.com', handle_url_response) 65 | otto.enqueue('http://www.facebook.com', handle_url_response) 66 | otto.enqueue('http://www.yahoo.com', handle_url_response) 67 | 68 | # this request will come from the cache 69 | otto.enqueue('http://www.google.com', handle_url_response) 70 | 71 | otto.wait() # waits until queue is empty or timeout is ellapsed 72 | 73 | API Reference 74 | ============= 75 | 76 | Response Class 77 | -------------- 78 | 79 | The `Response` class is the result of all requests made with `Octopus` or `TornadoOctopus`. 
80 | 81 | It has the following information: 82 | 83 | * `url` - the url that started the request; 84 | * `status_code` - the status code for the request; 85 | * `cookies` - dictionary with request cookie values; 86 | * `headers` - dictionary with response headers; 87 | * `text` - the body of the response; 88 | * `effective_url` - in the case of redirects, this url might be different than url; 89 | * `error` - if an error has occurred this is where the error message will be; 90 | * `request_time` - the time ellapsed between the start and the end of the request in seconds. 91 | 92 | Octopus Class 93 | ------------- 94 | 95 | This is the main unit of work in `octopus` if you want to use threads. To enqueue new urls you need to have an `Octopus` instance: 96 | 97 | from octopus import Octopus 98 | 99 | otto = Octopus() 100 | 101 | The constructor for `Octopus` takes several configuration options: 102 | 103 | * `concurrency`: number of threads to use to retrieve URLs (defaults to 10 threads); 104 | * `auto_start`: Indicates whether threads should be started automatically (defaults to False); 105 | * `cache`: If set to `True`, responses will be cached for the number of seconds specified in `expiration_in_seconds` (defaults to False); 106 | * `expiration_in_seconds`: The number of seconds to keep url responses in the local cache (defaults to 30 seconds); 107 | * `request_timeout_in_seconds`: The number of seconds that each request can take (defaults to 5 seconds). 108 | * `limiter`: The instance of a limiter class to use to acquire limits (more on limits below). 109 | 110 | Octopus.start() 111 | --------------- 112 | 113 | If `auto_start` is set to `False`, this method must be called to start retrieving URLs. This is a **non-blocking** method. 114 | 115 | Octopus.enqueue 116 | --------------- 117 | 118 | Takes as arguments (url, handler, method="GET", **kwargs). 119 | 120 | This is the main method in the `Octopus` class. This method is used to enqueue new URLs. 
The handler argument specifies the method to be called when the response is available. 121 | 122 | The handler takes the form `handler(url, response)`. The response argument is a Octopus.Response instance. 123 | 124 | You can specify a different method using the `method` argument (`POST`, `HEAD`, etc) and you can pass extra keyword arguments to the `requests.request` method using the keyword arguments for this method. 125 | 126 | This is a **non-blocking** method. 127 | 128 | Octopus.queue_size 129 | ------------------ 130 | 131 | This property returns the approximate number of URLs still in the queue (not retrieved yet). 132 | 133 | Octopus.is_empty 134 | ---------------- 135 | 136 | This property returns if the URL queue is empty. 137 | 138 | Octopus.wait(timeout=10) 139 | ------------------------ 140 | 141 | If you want to wait for all the URLs in the queue to finish loading, just call this method. 142 | 143 | If you specify a `timeout` of `0`, `octopus` will wait until the queue is empty, no matter how long it takes. 144 | 145 | This is a **blocking** method. 146 | 147 | TornadoOctopus Class 148 | -------------------- 149 | 150 | This is the main unit of work in `octopus` if you want to use Tornado's IOLoop. To enqueue new urls you need to have an `TornadoOctopus` instance: 151 | 152 | from octopus import TornadoOctopus 153 | 154 | otto = TornadoOctopus() 155 | 156 | A **very important** thing that differs from the threaded version of Octopus is that you **MUST** call wait to get the responses, since Tornado IOLoop needs to be run in order to get the requests. 
157 | 158 | The constructor for `TornadoOctopus` takes several configuration options: 159 | 160 | * `concurrency`: number of maximum async http requests to use to retrieve URLs (defaults to 10 requests); 161 | * `auto_start`: Indicates whether the ioloop should be created automatically (defaults to False); 162 | * `cache`: If set to `True`, responses will be cached for the number of seconds specified in `expiration_in_seconds` (defaults to False); 163 | * `expiration_in_seconds`: The number of seconds to keep url responses in the local cache (defaults to 30 seconds); 164 | * `request_timeout_in_seconds`: The number of seconds that each request can take (defaults to 10 seconds). 165 | * `connect_timeout_in_seconds`: The number of seconds that each connection can take (defaults to 5 seconds). 166 | * `limiter`: The instance of a limiter class to use to acquire limits (more on limits below). 167 | 168 | TornadoOctopus.start() 169 | --------------- 170 | 171 | If `auto_start` is set to `False`, this method must be called to create the IOLoop instance. This is a **non-blocking** method. 172 | 173 | TornadoOctopus.enqueue 174 | ---------------------- 175 | 176 | Takes as arguments (url, handler, method="GET", **kwargs). 177 | 178 | This is the main method in the `TornadoOctopus` class. This method is used to enqueue new URLs. The handler argument specifies the method to be called when the response is available. 179 | 180 | The handler takes the form `handler(url, response)`. The response argument is a Octopus.Response instance. 181 | 182 | You can specify a different method using the `method` argument (`POST`, `HEAD`, etc) and you can pass extra keyword arguments to the `AsyncHTTPClient.fetch` method using the keyword arguments for this method. 183 | 184 | This is a **non-blocking** method. 185 | 186 | TornadoOctopus.queue_size 187 | ------------------------- 188 | 189 | This property returns the number of URLs still in the queue (not retrieved yet). 
190 | 191 | TornadoOctopus.is_empty 192 | ----------------------- 193 | 194 | This property returns if the URL queue is empty. 195 | 196 | TornadoOctopus.wait(timeout=10) 197 | ------------------------------- 198 | 199 | In order for the IOLoop to handle callbacks, you **MUST** call wait. This is the method that gets the IOLoop to run. 200 | 201 | If you specify a `timeout` of `0`, `octopus` will wait until the queue is empty, no matter how long it takes. 202 | 203 | This is a **blocking** method. 204 | 205 | Limiting Simultaneous Connections 206 | ================================= 207 | 208 | A very common problem that can happen when using octopus is overwhelming the server you are going to. In order to make sure this 209 | does not happen, Octopus allows users to specify a limiter class. 210 | 211 | Each limiter class has to provide two methods `acquire` and `release`, both taking an URL as argument. 212 | 213 | Octopus comes bundled with an in-memory limiter and a redis limiter (courtesy of the [retools project](https://github.com/bbangert/retools)). Using limiters is as simple as passing it to octopus constructor: 214 | 215 | from octopus import TornadoOctopus 216 | from octopus.limiter.in_memory.per_domain import Limiter 217 | 218 | # using in-memory limiter. Domains not specified here have no limit. 219 | limiter = Limiter( 220 | {'http://globo.com': 10}, # only 10 concurrent requests to this domain 221 | {'http://g1.globo.com': 20}, # only 20 concurrent requests to this domain 222 | ) 223 | 224 | otto = TornadoOctopus( 225 | concurrency=4, auto_start=True, cache=True, 226 | expiration_in_seconds=10, 227 | limiter=limiter 228 | ) 229 | 230 | The available built-in limiters are: 231 | 232 | * `octopus.limiter.in_memory.per_domain.Limiter` 233 | * `octopus.limiter.redis.per_domain.Limiter` 234 | 235 | Both take a list of dictionaries with the key being the beginning of the URL and value being the allowed concurrent connections. 
236 | 237 | The reason this is a list is that urls defined first take precedence. This allows users to single out a path in a domain that needs less connections than the rest of the domain, like this: 238 | 239 | # using in-memory limiter. Domains not specified here have no limit. 240 | limiter = Limiter( 241 | {'http://g1.globo.com/economia': 5}, # only 5 concurrent requests to urls that begin with this key 242 | {'http://g1.globo.com': 20}, # only 20 concurrent requests to the rest of the domain 243 | ) 244 | 245 | The redis limiter takes two additional keyword arguments: 246 | `redis` (a [redis.py](https://github.com/andymccurdy/redis-py) connection to redis) 247 | and `expiration_in_seconds` (the expiration for locks in the limiter). 248 | 249 | **WARNING**: The in-memory limiter **IS NOT** thread-safe, so if you are using Threaded Octopus, do not use this limiter. 250 | 251 | If you'd like to do something when the limiter misses a lock (i.e.: no more connections allowed), just subscribe to it in the limiter using: 252 | 253 | # using in-memory limiter. Domains not specified here have no limit. 254 | limiter = Limiter( 255 | {'http://g1.globo.com/economia': 5}, # only 5 concurrent requests to urls that begin with this key 256 | {'http://g1.globo.com': 20}, # only 20 concurrent requests to the rest of the domain 257 | ) 258 | 259 | def handle_lock_miss(url): 260 | # do something with the miss 261 | pass 262 | 263 | limiter.subscribe_to_lock_miss(handle_lock_miss) 264 | 265 | 266 | Benchmark 267 | ========= 268 | 269 | In order to decide whether `octopus` really was worth using, it features a benchmark test in it's codebase. 270 | 271 | If you want to run it yourself (which is highly encouraged), just clone `octopus` repository and run this command: 272 | 273 | $ python benchmark/test_octopus.py 200 100 274 | 275 | The first argument is the number of URLs to retrieve. The seconds argument means how many threads will be used by `octopus` to get the urls. 
276 | 277 | The test is pretty simple. Time how long it takes for requests to get the URLs sequentially and for `octopus` to get them concurrently. 278 | 279 | The results for retrieving `2000` urls with `200` threads is as follows: 280 | 281 | ======= 282 | RESULTS 283 | ======= 284 | 285 | [requests] Retrieving 2000 urls took 2692.66 seconds meaning 0.74 urls/second. 286 | 287 | [octopus] Retrieving 2000 urls took 31.14 seconds meaning 64.22 urls/second. 288 | 289 | [octopus] Retrieving 2000 urls with local in-memory caching took 6.61 seconds 290 | meaning 302.50 urls/second. 291 | 292 | [octopus-tornado] Retrieving 2000 urls took 167.99 seconds 293 | meaning 11.91 urls/second. 294 | 295 | [octopus-tornado-pycurl] Retrieving 2000 urls took 171.40 seconds 296 | meaning 11.67 urls/second. 297 | 298 | Overall, threaded octopus was more than 86 times faster than sequential requests 299 | and tornado octopus was more than 15 times faster than sequential requests. 300 | -------------------------------------------------------------------------------- /benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/heynemann/octopus/c7bc93e60530368b137a4e9df26b5b34dacecedf/benchmark/__init__.py -------------------------------------------------------------------------------- /benchmark/test_octopus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | from random import choice 6 | from time import time 7 | 8 | import requests 9 | 10 | from octopus import Octopus, TornadoOctopus 11 | 12 | 13 | def main(repetitions, concurrency): 14 | 15 | # alexa top sites 16 | urls = [ 17 | 'http://facebook.com', 18 | 'http://youtube.com', 19 | 'http://yahoo.com', 20 | 'http://wikipedia.org', 21 | 'http://linkedin.com', 22 | 'http://live.com', 23 | 'http://twitter.com', 24 | 'http://amazon.com', 25 | 
'http://blogspot.com', 26 | 'http://wordpress.com', 27 | 'http://bing.com', 28 | 'http://ebay.com', 29 | 'http://tumblr.com', 30 | ] 31 | 32 | urls_to_retrieve = [choice(urls) for i in range(repetitions)] 33 | 34 | #requests_total_time = sequential_requests(repetitions, urls_to_retrieve) 35 | requests_total_time = 2692.66 # did it once... takes too long to get 2000 urls sequentially. 36 | otto_total_time = otto_requests(repetitions, concurrency, urls_to_retrieve) 37 | otto_cached_total_time = otto_cached_requests(repetitions, concurrency, urls_to_retrieve) 38 | tornado_pycurl_total_time = tornado_requests(repetitions, concurrency, urls_to_retrieve) 39 | tornado_total_time = tornado_requests(repetitions, concurrency, urls_to_retrieve, ignore_pycurl=True) 40 | 41 | message = "RESULTS" 42 | print 43 | print("=" * len(message)) 44 | print(message) 45 | print("=" * len(message)) 46 | print 47 | 48 | print "[requests] Retrieving %d urls took %.2f seconds meaning %.2f urls/second." % ( 49 | repetitions, 50 | requests_total_time, 51 | repetitions / requests_total_time 52 | ) 53 | print 54 | 55 | print "[octopus] Retrieving %d urls took %.2f seconds meaning %.2f urls/second." % ( 56 | repetitions, 57 | otto_total_time, 58 | repetitions / otto_total_time 59 | ) 60 | print 61 | 62 | print "[octopus] Retrieving %d urls with local in-memory caching took %.2f seconds meaning %.2f urls/second." % ( 63 | repetitions, 64 | otto_cached_total_time, 65 | repetitions / otto_cached_total_time 66 | ) 67 | print 68 | 69 | print "[octopus-tornado] Retrieving %d urls took %.2f seconds meaning %.2f urls/second." % ( 70 | repetitions, 71 | tornado_total_time, 72 | repetitions / tornado_total_time 73 | ) 74 | print 75 | 76 | print "[octopus-tornado-pycurl] Retrieving %d urls took %.2f seconds meaning %.2f urls/second." 
% ( 77 | repetitions, 78 | tornado_pycurl_total_time, 79 | repetitions / tornado_pycurl_total_time 80 | ) 81 | print 82 | 83 | print "Overall, threaded octopus was more than %d times faster than sequential requests and tornado octopus was more than %d times faster than sequential requests." % ( 84 | int(requests_total_time / otto_total_time), 85 | int(requests_total_time / tornado_pycurl_total_time) 86 | ) 87 | 88 | print 89 | 90 | 91 | def sequential_requests(repetitions, urls_to_retrieve): 92 | message = "Retrieving URLs sequentially with Requests..." 93 | print 94 | print("=" * len(message)) 95 | print(message) 96 | print("=" * len(message)) 97 | print 98 | 99 | start_time = time() 100 | 101 | for url_index, url in enumerate(urls_to_retrieve): 102 | print "%.2f%% - getting %s..." % ( 103 | float(url_index) / float(repetitions) * 100, 104 | url 105 | ) 106 | assert requests.get(url).status_code == 200 107 | 108 | return time() - start_time 109 | 110 | 111 | def otto_requests(repetitions, concurrency, urls_to_retrieve): 112 | message = "Retrieving URLs concurrently with Octopus..." 113 | print 114 | print("=" * len(message)) 115 | print(message) 116 | print("=" * len(message)) 117 | print 118 | 119 | otto = Octopus(concurrency=concurrency) 120 | 121 | for url in urls_to_retrieve: 122 | otto.enqueue(url, handle_url_response) 123 | 124 | start_time = time() 125 | otto.start() 126 | otto.wait(0) 127 | 128 | return time() - start_time 129 | 130 | 131 | def otto_cached_requests(repetitions, concurrency, urls_to_retrieve): 132 | message = "Retrieving URLs concurrently with Octopus with caching enabled..." 
133 | print 134 | print("=" * len(message)) 135 | print(message) 136 | print("=" * len(message)) 137 | print 138 | 139 | otto = Octopus(concurrency=concurrency, cache=True, auto_start=True) 140 | 141 | for url in urls_to_retrieve: 142 | otto.enqueue(url, handle_url_response) 143 | 144 | start_time = time() 145 | otto.wait(0) 146 | 147 | return time() - start_time 148 | 149 | 150 | def tornado_requests(repetitions, concurrency, urls_to_retrieve, ignore_pycurl=False): 151 | message = "Retrieving URLs concurrently with TornadoOctopus (%s)..." % ( 152 | ignore_pycurl and "using SimpleHTTPClient" or "using pycurl" 153 | ) 154 | print 155 | print("=" * len(message)) 156 | print(message) 157 | print("=" * len(message)) 158 | print 159 | 160 | otto = TornadoOctopus(concurrency=concurrency, cache=False, auto_start=True, ignore_pycurl=ignore_pycurl) 161 | 162 | for url in urls_to_retrieve: 163 | otto.enqueue(url, handle_url_response) 164 | 165 | start_time = time() 166 | otto.wait(0) 167 | 168 | return time() - start_time 169 | 170 | 171 | def handle_url_response(url, response): 172 | print "Got %s!" 
% url 173 | assert response.status_code == 200, "Expected status code for %s to be 200, got %s" % (url, response.status_code) 174 | 175 | 176 | if __name__ == '__main__': 177 | main(int(sys.argv[1]), int(sys.argv[2])) 178 | -------------------------------------------------------------------------------- /octopus/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | __version__ = '0.6.4' 5 | 6 | from octopus.core import Octopus, TimeoutError, ResponseError # NOQA 7 | from octopus.tornado_core import TornadoOctopus # NOQA 8 | -------------------------------------------------------------------------------- /octopus/cache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | from datetime import datetime, timedelta 6 | 7 | 8 | class Cache(object): 9 | def __init__(self, expiration_in_seconds): 10 | self.responses = {} 11 | self.expiration_in_seconds = expiration_in_seconds 12 | 13 | def put(self, url, response): 14 | self.responses[url] = { 15 | 'response': response, 16 | 'expires': datetime.now() + timedelta(seconds=self.expiration_in_seconds) 17 | } 18 | 19 | def get(self, url): 20 | if url not in self.responses: 21 | return None 22 | 23 | data = self.responses[url] 24 | 25 | if data['expires'] <= datetime.now(): 26 | del self.responses[url] 27 | return None 28 | 29 | return data['response'] 30 | -------------------------------------------------------------------------------- /octopus/core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | import sys 6 | import time 7 | from datetime import timedelta 8 | from threading import Thread 9 | 10 | try: 11 | import requests 12 | import requests.exceptions 13 | except ImportError: 14 | print("Can't import requests. 
Probably setup.py installing package.") 15 | 16 | from octopus.cache import Cache 17 | from octopus.model import Response 18 | 19 | try: 20 | 21 | from six.moves import queue 22 | 23 | class OctopusQueue(queue.Queue): 24 | # from http://stackoverflow.com/questions/1564501/add-timeout-argument-to-pythons-queue-join 25 | def join_with_timeout(self, timeout): 26 | self.all_tasks_done.acquire() 27 | try: 28 | endtime = time.time() + timeout 29 | while self.unfinished_tasks: 30 | remaining = endtime - time.time() 31 | if remaining <= 0.0: 32 | raise TimeoutError 33 | self.all_tasks_done.wait(remaining) 34 | finally: 35 | self.all_tasks_done.release() 36 | 37 | except ImportError: 38 | print("Can't import six. Probably setup.py installing package.") 39 | 40 | 41 | class TimeoutError(RuntimeError): 42 | pass 43 | 44 | 45 | class ResponseError(object): 46 | def __init__(self, url, status_code, text, error=None, elapsed=None): 47 | self.url = url 48 | self.status_code = status_code 49 | self.text = text 50 | self.error = error 51 | self.headers = {} 52 | self.cookies = {} 53 | self.effective_url = url 54 | self.elapsed = elapsed 55 | 56 | def close(self): 57 | pass 58 | 59 | 60 | class Octopus(object): 61 | def __init__( 62 | self, concurrency=10, auto_start=False, cache=False, 63 | expiration_in_seconds=30, request_timeout_in_seconds=5, limiter=None 64 | ): 65 | 66 | self.concurrency = concurrency 67 | self.auto_start = auto_start 68 | 69 | self.cache = cache 70 | self.response_cache = Cache(expiration_in_seconds=expiration_in_seconds) 71 | self.request_timeout_in_seconds = request_timeout_in_seconds 72 | 73 | self.url_queue = OctopusQueue() 74 | self.limiter = limiter 75 | 76 | if auto_start: 77 | self.start() 78 | 79 | def from_requests_response(self, url, response): 80 | return Response( 81 | url=url, status_code=response.status_code, 82 | headers=dict([(key, value) for key, value in response.headers.items()]), 83 | cookies=dict([(key, value) for key, value in 
response.cookies.items()]), 84 | text=response.text, effective_url=response.url, 85 | error=response.status_code > 399 and response.text or None, 86 | request_time=response.elapsed and response.elapsed.total_seconds or 0 87 | ) 88 | 89 | def enqueue(self, url, handler, method='GET', **kw): 90 | if self.cache: 91 | response = self.response_cache.get(url) 92 | if response is not None: 93 | handler(url, response) 94 | return 95 | 96 | self.url_queue.put_nowait((url, handler, method, kw)) 97 | 98 | @property 99 | def queue_size(self): 100 | return self.url_queue.qsize() 101 | 102 | @property 103 | def is_empty(self): 104 | return self.url_queue.empty() 105 | 106 | def start(self): 107 | for i in range(self.concurrency): 108 | t = Thread(target=self.do_work) 109 | t.daemon = True 110 | t.start() 111 | 112 | def do_work(self): 113 | while True: 114 | url, handler, method, kwargs = self.url_queue.get() 115 | 116 | response = None 117 | if self.cache: 118 | response = self.response_cache.get(url) 119 | 120 | if response is None: 121 | if self.limiter and not self.limiter.acquire(url): 122 | logging.info('Could not acquire limit for url "%s".' 
% url) 123 | self.url_queue.task_done() 124 | self.url_queue.put_nowait((url, handler, method, kwargs)) 125 | self.limiter.publish_lock_miss(url) 126 | time.sleep(0.1) 127 | continue 128 | 129 | try: 130 | response = requests.request(method, url, timeout=self.request_timeout_in_seconds, **kwargs) 131 | except requests.exceptions.Timeout: 132 | err = sys.exc_info()[1] 133 | response = ResponseError( 134 | url=url, 135 | status_code=500, 136 | text=str(err), 137 | error=err, 138 | elapsed=timedelta(seconds=self.request_timeout_in_seconds) 139 | ) 140 | except Exception: 141 | err = sys.exc_info()[1] 142 | response = ResponseError( 143 | url=url, 144 | status_code=599, 145 | text=str(err), 146 | error=err 147 | ) 148 | finally: 149 | if self.limiter: 150 | self.limiter.release(url) 151 | 152 | original_response = response 153 | 154 | response = self.from_requests_response(url, response) 155 | 156 | original_response.close() 157 | 158 | if self.cache: 159 | self.response_cache.put(url, response) 160 | 161 | handler(url, response) 162 | 163 | self.url_queue.task_done() 164 | 165 | def wait(self, timeout=10): 166 | if timeout > 0: 167 | self.url_queue.join_with_timeout(timeout=timeout) 168 | else: 169 | self.url_queue.join() 170 | -------------------------------------------------------------------------------- /octopus/limiter/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from cyrusbus import Bus 5 | 6 | 7 | class Limiter(object): 8 | def __init__(self, limiter_miss_timeout_ms=None): 9 | self.bus = Bus() 10 | self.limiter_miss_timeout_ms = limiter_miss_timeout_ms 11 | if self.limiter_miss_timeout_ms is None: 12 | self.limiter_miss_timeout_ms = 500 13 | 14 | def handle_callbacks(self, callback): 15 | def handle(bus, *args, **kw): 16 | callback(*args, **kw) 17 | return handle 18 | 19 | def subscribe_to_lock_miss(self, callback): 20 | 
self.bus.subscribe('limiter.miss', self.handle_callbacks(callback)) 21 | 22 | def publish_lock_miss(self, url): 23 | self.bus.publish('limiter.miss', url) 24 | -------------------------------------------------------------------------------- /octopus/limiter/in_memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/heynemann/octopus/c7bc93e60530368b137a4e9df26b5b34dacecedf/octopus/limiter/in_memory/__init__.py -------------------------------------------------------------------------------- /octopus/limiter/in_memory/per_domain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | from collections import defaultdict 6 | 7 | from octopus.limiter import Limiter as BaseLimiter 8 | 9 | 10 | class Limiter(BaseLimiter): 11 | def __init__(self, *domains, **kw): 12 | limiter_miss_timeout_ms = None 13 | if 'limiter_miss_timeout_ms' in kw: 14 | limiter_miss_timeout_ms = kw['limiter_miss_timeout_ms'] 15 | 16 | super(Limiter, self).__init__(limiter_miss_timeout_ms=limiter_miss_timeout_ms) 17 | self.update_domain_definitions(*domains) 18 | 19 | def update_domain_definitions(self, *domains): 20 | self.domains = domains 21 | self.domain_count = defaultdict(int) 22 | 23 | def get_domain_from_url(self, url): 24 | for domain in self.domains: 25 | for key in domain.keys(): 26 | if url.startswith(key): 27 | return key 28 | return None 29 | 30 | def get_domain_limit(self, url): 31 | for domain in self.domains: 32 | for key in domain.keys(): 33 | if url.startswith(key): 34 | return domain[key] 35 | return 0 36 | 37 | def acquire(self, url): 38 | domain = self.get_domain_from_url(url) 39 | if domain is None: 40 | logging.info('Tried to acquire lock to a domain that was not specified in the limiter (%s).' 
% url) 41 | return True 42 | 43 | limit = self.get_domain_limit(url) 44 | 45 | if self.domain_count[domain] < limit: 46 | self.domain_count[domain] += 1 47 | return True 48 | 49 | return False 50 | 51 | def release(self, url): 52 | domain = self.get_domain_from_url(url) 53 | if domain is None: 54 | logging.info('Tried to release lock to a domain that was not specified in the limiter (%s).' % url) 55 | return 56 | 57 | self.domain_count[domain] -= 1 58 | -------------------------------------------------------------------------------- /octopus/limiter/redis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/heynemann/octopus/c7bc93e60530368b137a4e9df26b5b34dacecedf/octopus/limiter/redis/__init__.py -------------------------------------------------------------------------------- /octopus/limiter/redis/per_domain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | 6 | from retools.limiter import Limiter as ReToolsLimiter 7 | 8 | from octopus.limiter.in_memory.per_domain import Limiter as InMemoryPerDomainLimiter 9 | 10 | 11 | class Limiter(InMemoryPerDomainLimiter): 12 | def __init__(self, *domains, **kw): 13 | limiter_miss_timeout_ms = None 14 | if 'limiter_miss_timeout_ms' in kw: 15 | limiter_miss_timeout_ms = kw['limiter_miss_timeout_ms'] 16 | 17 | # Skips InMemoryPerDomainLimiter constructor 18 | super(InMemoryPerDomainLimiter, self).__init__(limiter_miss_timeout_ms=limiter_miss_timeout_ms) 19 | 20 | if not 'redis' in kw: 21 | raise RuntimeError('You must specify a connection to redis in order to use Redis Limiter.') 22 | 23 | self.redis = kw['redis'] 24 | self.expiration_in_seconds = float(kw.get('expiration_in_seconds', 10)) 25 | 26 | self.update_domain_definitions(*domains) 27 | 28 | def update_domain_definitions(self, *domains): 29 | self.domains = domains 30 | 
class Response(object):
    """Value object describing the outcome of a single HTTP request.

    :param url: the URL that was originally requested.
    :param status_code: HTTP status code of the response (octopus uses 599
        for connection-level errors).
    :param headers: dict of response headers.
    :param cookies: dict of cookies.
    :param text: response body.
    :param effective_url: final URL after any redirects.
    :param error: error message when the request failed, otherwise None.
    :param request_time: total request duration in seconds.
    """

    def __init__(
        self, url, status_code,
        headers, cookies, text, effective_url,
        error, request_time
    ):
        self.url = url
        self.status_code = status_code
        self.cookies = cookies
        self.headers = headers
        self.text = text
        self.effective_url = effective_url
        self.error = error
        self.request_time = request_time

    def __repr__(self):
        # Short but useful in logs/debugger; avoid dumping the whole body.
        return '%s(url=%r, status_code=%r)' % (
            type(self).__name__, self.url, self.status_code
        )
    def __init__(
        self, concurrency=10, auto_start=False, cache=False,
        expiration_in_seconds=30, request_timeout_in_seconds=10,
        connect_timeout_in_seconds=5, ignore_pycurl=False,
        limiter=None, allow_connection_reuse=True):
        """Set up a tornado-based concurrent HTTP fetcher.

        :param concurrency: max number of simultaneous in-flight requests.
        :param auto_start: when True, create the IOLoop/http client now.
        :param cache: when True, successful responses are cached by URL.
        :param expiration_in_seconds: TTL for cached responses.
        :param request_timeout_in_seconds: total per-request timeout.
        :param connect_timeout_in_seconds: connection-phase timeout.
        :param ignore_pycurl: force tornado's simple client even if pycurl
            is importable.
        :param limiter: optional rate limiter providing acquire/release per URL.
        :param allow_connection_reuse: only honored when the curl client is used.
        """

        self.concurrency = concurrency
        self.auto_start = auto_start
        self.last_timeout = None

        self.cache = cache
        self.response_cache = Cache(expiration_in_seconds=expiration_in_seconds)
        self.request_timeout_in_seconds = request_timeout_in_seconds
        self.connect_timeout_in_seconds = connect_timeout_in_seconds

        self.ignore_pycurl = ignore_pycurl

        # Count of requests currently in flight, and the overflow queue of
        # (url, handler, method, kwargs) tuples waiting for a free slot.
        self.running_urls = 0
        self.url_queue = []

        if PYCURL_AVAILABLE and not self.ignore_pycurl:
            logging.debug('pycurl is available, thus Octopus will be using it instead of tornado\'s simple http client.')
            # NOTE: AsyncHTTPClient.configure is process-wide, not per-instance.
            AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
            self.allow_connection_reuse = allow_connection_reuse
        else:
            # The simple client has no fresh-connection option, so reuse is forced on.
            self.allow_connection_reuse = True

        if auto_start:
            logging.debug('Auto starting...')
            self.start()

        # NOTE(review): limiter is assigned after the auto start() call above;
        # start() does not appear to read it, but confirm before reordering.
        self.limiter = limiter
response.request.headers.get('Cookie', '') 73 | if cookies: 74 | cookies = dict([cookie.split('=') for cookie in cookies.split(';')]) 75 | 76 | return Response( 77 | url=url, status_code=response.code, 78 | headers=dict([(key, value) for key, value in response.headers.items()]), 79 | cookies=cookies, 80 | text=response.body, effective_url=response.effective_url, 81 | error=response.error and str(response.error) or None, 82 | request_time=response.request_time 83 | ) 84 | 85 | def enqueue(self, url, handler, method='GET', **kw): 86 | logging.debug('Enqueueing %s...' % url) 87 | 88 | if self.cache: 89 | response = self.response_cache.get(url) 90 | 91 | if response is not None: 92 | logging.debug('Cache hit on %s.' % url) 93 | handler(url, response) 94 | return 95 | 96 | if self.running_urls < self.concurrency: 97 | logging.debug('Queue has space available for fetching %s.' % url) 98 | self.get_next_url(url, handler, method, **kw) 99 | else: 100 | logging.debug('Queue is full. Enqueueing %s for future fetch.' % url) 101 | self.url_queue.append((url, handler, method, kw)) 102 | 103 | def fetch(self, url, handler, method, **kw): 104 | self.running_urls += 1 105 | 106 | if self.cache: 107 | response = self.response_cache.get(url) 108 | 109 | if response is not None: 110 | logging.debug('Cache hit on %s.' % url) 111 | self.running_urls -= 1 112 | handler(url, response) 113 | return 114 | 115 | logging.info('Fetching %s...' 
% url) 116 | 117 | request = HTTPRequest( 118 | url=url, 119 | method=method, 120 | connect_timeout=self.connect_timeout_in_seconds, 121 | request_timeout=self.request_timeout_in_seconds, 122 | prepare_curl_callback=self.handle_curl_callback, 123 | **kw 124 | ) 125 | 126 | self.http_client.fetch(request, self.handle_request(url, handler)) 127 | 128 | def handle_curl_callback(self, curl): 129 | if not self.allow_connection_reuse: 130 | curl.setopt(pycurl.FRESH_CONNECT, 1) 131 | 132 | def get_next_url(self, request_url=None, handler=None, method=None, **kw): 133 | if request_url is None: 134 | if not self.url_queue: 135 | return 136 | 137 | request_url, handler, method, kw = self.url_queue.pop() 138 | 139 | self.fetch_next_url(request_url, handler, method, **kw) 140 | 141 | def fetch_next_url(self, request_url, handler, method, **kw): 142 | if self.limiter and not self.limiter.acquire(request_url): 143 | logging.info('Could not acquire limit for url "%s".' % request_url) 144 | 145 | self.url_queue.append((request_url, handler, method, kw)) 146 | deadline = timedelta(seconds=self.limiter.limiter_miss_timeout_ms / 1000.0) 147 | self.ioloop.add_timeout(deadline, self.get_next_url) 148 | self.limiter.publish_lock_miss(request_url) 149 | return False 150 | 151 | logging.debug('Queue has space available for fetching %s.' % request_url) 152 | self.fetch(request_url, handler, method, **kw) 153 | return True 154 | 155 | def handle_request(self, url, callback): 156 | def handle(response): 157 | logging.debug('Handler called for url %s...' % url) 158 | self.running_urls -= 1 159 | 160 | response = self.from_tornado_response(url, response) 161 | logging.info('Got response(%s) from %s.' % (response.status_code, url)) 162 | 163 | if self.cache and response and response.status_code < 399: 164 | logging.debug('Putting %s into cache.' 
    def wait(self, timeout=10):
        """Run the IOLoop until every enqueued URL has been processed.

        :param timeout: seconds after which the loop is force-stopped via a
            signal callback; falsy values (0/None) mean wait indefinitely.
        """
        self.last_timeout = timeout
        if not self.url_queue and not self.running_urls:
            logging.debug('No urls to wait for. Returning immediately.')
            return

        if timeout:
            logging.debug('Waiting for urls to be retrieved for %s seconds.' % timeout)
            # NOTE(review): set_blocking_signal_threshold relies on SIGALRM and
            # was removed in Tornado 5 — confirm the tornado version supported.
            self.ioloop.set_blocking_signal_threshold(timeout, self.handle_wait_timeout)
        else:
            logging.debug('Waiting for urls to be retrieved.')

        logging.info('Starting IOLoop with %d URLs still left to process.' % self.remaining_requests)
        self.ioloop.start()
% self.remaining_requests) 210 | self.ioloop.stop() 211 | -------------------------------------------------------------------------------- /redis.conf: -------------------------------------------------------------------------------- 1 | daemonize yes 2 | pidfile /tmp/redis-octopus.pid 3 | port 7575 4 | dbfilename redis-octopus.rdb 5 | dir /tmp 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup, find_packages 5 | from octopus import __version__ 6 | 7 | tests_require = [ 8 | 'mock', 9 | 'nose', 10 | 'coverage', 11 | 'yanc', 12 | 'preggy', 13 | 'tox', 14 | 'ipdb', 15 | 'coveralls', 16 | ] 17 | 18 | setup( 19 | name='octopus-http', 20 | version=__version__, 21 | description='Octopus is a library to use threads to concurrently retrieve and report on the completion of http requests', 22 | long_description=''' 23 | Octopus is a library to use threads to concurrently retrieve and report on the completion of http requests 24 | ''', 25 | keywords='http concurrency threading', 26 | author='Bernardo Heynemann', 27 | author_email='heynemann@gmail.com', 28 | url='https://heynemann.github.io/octopus', 29 | license='MIT', 30 | classifiers=[ 31 | 'Development Status :: 4 - Beta', 32 | 'Intended Audience :: Developers', 33 | 'License :: OSI Approved :: MIT License', 34 | 'Natural Language :: English', 35 | 'Operating System :: MacOS', 36 | 'Operating System :: POSIX', 37 | 'Operating System :: Unix', 38 | 'Operating System :: OS Independent', 39 | 'Programming Language :: Python :: 2.7', 40 | ], 41 | packages=find_packages(), 42 | include_package_data=True, 43 | install_requires=[ 44 | 'six', 45 | 'requests', 46 | 'tornado', 47 | 'retools', 48 | 'cyrusbus' 49 | ], 50 | extras_require={ 51 | 'tests': tests_require, 52 | }, 53 | ) 54 | 
class TestCase(PythonTestCase):
    """Base test case: connects to the test redis (port 7575) and wipes it."""

    def setUp(self):
        # A fresh connection per test; flushall guarantees test isolation.
        connection = redis.Redis(host='localhost', port=7575, db=0)
        connection.flushall()
        self.redis = connection
expect(self.limiter.domains[0]['http://g1.globo.com']).to_equal(10) 25 | 26 | def test_can_acquire_limit(self): 27 | expect(self.limiter.acquire('http://g1.globo.com')).to_be_true() 28 | expect(self.limiter.domain_count).to_include('http://g1.globo.com') 29 | expect(self.limiter.domain_count['http://g1.globo.com']).to_equal(1) 30 | 31 | def test_acquiring_internal_url_gets_proper_domain(self): 32 | expect(self.limiter.acquire('http://g1.globo.com/economia/')).to_be_true() 33 | expect(self.limiter.domain_count).to_include('http://g1.globo.com') 34 | expect(self.limiter.domain_count['http://g1.globo.com']).to_equal(1) 35 | 36 | @patch.object(logging, 'info') 37 | def test_can_acquire_from_unknown_domain_url(self, logging_mock): 38 | limiter = PerDomainInMemoryLimiter( 39 | {'http://globoesporte.globo.com': 10} 40 | ) 41 | 42 | expect(limiter.acquire('http://g1.globo.com/economia/')).to_be_true() 43 | expect(limiter.domain_count).to_be_empty() 44 | logging_mock.assert_called_once_with('Tried to acquire lock to a domain that was not specified in the limiter (http://g1.globo.com/economia/).') 45 | 46 | def test_can_release(self): 47 | url = 'http://g1.globo.com/economia/' 48 | self.limiter.acquire(url) 49 | self.limiter.release(url) 50 | 51 | expect(self.limiter.domain_count['http://g1.globo.com']).to_equal(0) 52 | 53 | def test_can_get_domain_from_url(self): 54 | expect(self.limiter.get_domain_from_url('http://g1.globo.com/economia/')).to_equal('http://g1.globo.com') 55 | 56 | def test_can_get_domain_limit(self): 57 | url = 'http://g1.globo.com/economia/' 58 | expect(self.limiter.get_domain_limit(url)).to_equal(10) 59 | 60 | self.limiter.acquire(url) 61 | expect(self.limiter.get_domain_limit(url)).to_equal(10) 62 | 63 | expect(self.limiter.get_domain_limit('http://www.google.com')).to_equal(0) 64 | 65 | @patch.object(logging, 'info') 66 | def test_can_release_unknown_url(self, logging_mock): 67 | self.limiter.release('http://www.google.com') 68 | 69 | 
expect(self.limiter.domain_count).to_be_empty() 70 | logging_mock.assert_called_once_with('Tried to release lock to a domain that was not specified in the limiter (http://www.google.com).') 71 | -------------------------------------------------------------------------------- /tests/limiter/redis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/heynemann/octopus/c7bc93e60530368b137a4e9df26b5b34dacecedf/tests/limiter/redis/__init__.py -------------------------------------------------------------------------------- /tests/limiter/redis/test_per_domain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import logging 6 | 7 | from preggy import expect 8 | from mock import patch 9 | 10 | from octopus.limiter.redis.per_domain import Limiter as PerDomainRedisLimiter 11 | from tests import TestCase 12 | 13 | 14 | class TestPerDomain(TestCase): 15 | def setUp(self): 16 | super(TestPerDomain, self).setUp() 17 | self.limiter = PerDomainRedisLimiter( 18 | {'http://g1.globo.com': 10}, 19 | {'http://globoesporte.globo.com': 10}, 20 | redis=self.redis, 21 | expiration_in_seconds=12 22 | ) 23 | 24 | def test_can_create_limiter(self): 25 | expect(self.limiter.redis).to_equal(self.redis) 26 | expect(self.limiter.expiration_in_seconds).to_equal(12) 27 | expect(self.limiter.domains[0]).to_include('http://g1.globo.com') 28 | expect(self.limiter.domains).not_to_be_null() 29 | expect(self.limiter.domains[0]['http://g1.globo.com']).to_equal(10) 30 | 31 | def test_cant_create_limiter_without_redis(self): 32 | try: 33 | PerDomainRedisLimiter() 34 | except RuntimeError: 35 | err = sys.exc_info()[1] 36 | expect(err).to_have_an_error_message_of('You must specify a connection to redis in order to use Redis Limiter.') 37 | else: 38 | assert False, "Should not have gotten this far" 39 | 40 | def 
test_can_acquire_limit(self): 41 | expect(self.limiter.acquire('http://g1.globo.com')).to_be_true() 42 | 43 | try: 44 | expect(self.redis.zcard('limit-for-http://g1.globo.com')).to_equal(1) 45 | finally: 46 | self.limiter.release('http://g1.globo.com') 47 | 48 | def test_acquiring_internal_url_gets_proper_domain(self): 49 | url = 'http://g1.globo.com/economia/' 50 | expect(self.limiter.acquire(url)).to_be_true() 51 | 52 | try: 53 | expect(self.redis.zcard('limit-for-http://g1.globo.com')).to_equal(1) 54 | finally: 55 | self.limiter.release(url) 56 | 57 | def test_can_acquire_from_unknown_domain_url(self): 58 | limiter = PerDomainRedisLimiter( 59 | {'http://globoesporte.globo.com': 10}, 60 | redis=self.redis 61 | ) 62 | 63 | url = 'http://g1.globo.com/economia/' 64 | expect(limiter.acquire(url)).to_be_true() 65 | expect(self.redis.zcard('limit-for-http://g1.globo.com')).to_equal(0) 66 | 67 | def test_can_release(self): 68 | url = 'http://g1.globo.com/economia/' 69 | self.limiter.acquire(url) 70 | self.limiter.release(url) 71 | 72 | expect(self.redis.zcard('limit-for-http://g1.globo.com')).to_equal(0) 73 | 74 | def test_can_get_domain_from_url(self): 75 | expect(self.limiter.get_domain_from_url('http://g1.globo.com/economia/')).to_equal('http://g1.globo.com') 76 | 77 | def test_can_get_domain_limit(self): 78 | url = 'http://g1.globo.com/economia/' 79 | expect(self.limiter.get_domain_limit(url)).to_equal(10) 80 | 81 | expect(self.limiter.get_domain_limit('http://www.google.com')).to_equal(0) 82 | 83 | @patch.object(logging, 'info') 84 | def test_can_release_unknown_url(self, logging_mock): 85 | self.limiter.release('http://www.google.com') 86 | 87 | expect(self.redis.zcard('limit-for-http://www.google.com')).to_equal(0) 88 | 89 | logging_mock.assert_called_once_with( 90 | 'Tried to release lock to a domain that was not specified ' 91 | 'in the limiter (http://www.google.com).' 
class TestBaseLimiter(TestCase):
    """Covers the pub/sub plumbing of the base Limiter."""

    def setUp(self):
        super(TestBaseLimiter, self).setUp()
        self.limiter = Limiter()
        self.handled_url = None

    def test_has_bus(self):
        expect(self.limiter.bus).not_to_be_null()

    def test_can_subscribe(self):
        def on_miss(url):
            pass

        self.limiter.subscribe_to_lock_miss(on_miss)

        expect(self.limiter.bus.has_any_subscriptions('limiter.miss')).to_be_true()

    def test_can_get_lock_miss(self):
        def on_miss(url):
            self.handled_url = url

        self.limiter.subscribe_to_lock_miss(on_miss)
        self.limiter.publish_lock_miss('some-url')

        expect(self.handled_url).to_equal('some-url')
class TestResponseModel(TestCase):
    """Ensures Response stores every constructor argument untouched."""

    def test_can_create_response(self):
        headers = {'Accept': 'image/webp; */*'}
        cookies = {'whatever': 'some-value'}

        response = Response(
            url="http://www.google.com",
            status_code=200,
            headers=headers,
            cookies=cookies,
            text='some request body',
            effective_url='http://www.google.com/',
            error="some error message",
            request_time=10.24
        )

        expect(response.url).to_equal('http://www.google.com')
        expect(response.status_code).to_equal(200)
        expect(response.headers).to_be_like(headers)
        expect(response.cookies).to_be_like(cookies)
        expect(response.text).to_equal('some request body')
        expect(response.effective_url).to_equal('http://www.google.com/')
        expect(response.error).to_equal('some error message')
        expect(response.request_time).to_equal(10.24)
import expect 7 | from mock import Mock 8 | 9 | from octopus import Octopus, TimeoutError 10 | from tests import TestCase 11 | 12 | 13 | class TestOctopus(TestCase): 14 | def setUp(self): 15 | self.response = None 16 | self.responses = {} 17 | 18 | def test_can_create_octopus(self): 19 | otto = Octopus(concurrency=20) 20 | expect(otto.concurrency).to_equal(20) 21 | expect(otto.auto_start).to_be_false() 22 | expect(otto.cache).to_be_false() 23 | 24 | def test_has_default_concurrency(self): 25 | otto = Octopus() 26 | expect(otto.concurrency).to_equal(10) 27 | 28 | def test_queue_is_empty(self): 29 | otto = Octopus() 30 | expect(otto.is_empty).to_be_true() 31 | 32 | def test_can_enqueue_url(self): 33 | otto = Octopus() 34 | 35 | otto.enqueue('http://www.google.com', None) 36 | 37 | expect(otto.queue_size).to_equal(1) 38 | 39 | def test_can_get_after_started(self): 40 | otto = Octopus(concurrency=1) 41 | 42 | def handle_url_response(url, response): 43 | self.response = response 44 | 45 | otto.enqueue('http://www.twitter.com', handle_url_response) 46 | otto.start() 47 | 48 | otto.wait(5) 49 | 50 | expect(self.response).not_to_be_null() 51 | expect(self.response.status_code).to_equal(200) 52 | 53 | def test_can_get_with_auto_start(self): 54 | otto = Octopus(concurrency=1, auto_start=True) 55 | 56 | def handle_url_response(url, response): 57 | self.response = response 58 | 59 | otto.enqueue('http://www.twitter.com', handle_url_response) 60 | 61 | otto.wait(5) 62 | 63 | expect(self.response).not_to_be_null() 64 | expect(self.response.status_code).to_equal(200) 65 | 66 | def test_can_wait(self): 67 | otto = Octopus(concurrency=1) 68 | 69 | def handle_url_response(url, response): 70 | self.response = response 71 | 72 | otto.enqueue('http://www.twitter.com', handle_url_response) 73 | otto.start() 74 | 75 | otto.wait(0) 76 | 77 | expect(self.response).not_to_be_null() 78 | expect(self.response.status_code).to_equal(200) 79 | 80 | def 
test_wait_returns_automatically_when_empty(self): 81 | otto = Octopus(concurrency=1) 82 | otto.start() 83 | 84 | otto.wait(5) 85 | 86 | expect(otto.is_empty).to_be_true() 87 | 88 | def test_times_out_on_wait(self): 89 | otto = Octopus(concurrency=1) 90 | 91 | def handle_url_response(url, response): 92 | self.response = response 93 | 94 | otto.enqueue('http://www.google.com', handle_url_response) 95 | 96 | try: 97 | otto.wait(0.1) 98 | except TimeoutError: 99 | err = sys.exc_info()[1] 100 | expect(err).to_have_an_error_message_of("") 101 | else: 102 | assert False, "Should not have gotten this far" 103 | 104 | def test_can_handle_more_urls_concurrently(self): 105 | urls = [ 106 | 'http://www.twitter.com', 107 | 'http://www.cnn.com', 108 | 'http://www.bbc.com', 109 | 'http://www.facebook.com' 110 | ] 111 | otto = Octopus(concurrency=4) 112 | 113 | def handle_url_response(url, response): 114 | self.responses[url] = response 115 | 116 | for url in urls: 117 | otto.enqueue(url, handle_url_response) 118 | 119 | otto.start() 120 | 121 | otto.wait(10) 122 | 123 | expect(self.responses).to_length(4) 124 | 125 | for url in urls: 126 | expect(self.responses).to_include(url) 127 | expect(self.responses[url].status_code).to_equal(200) 128 | 129 | def test_can_handle_cached_responses(self): 130 | response = Mock(status_code=200, body="whatever") 131 | 132 | url = 'http://www.google.com' 133 | otto = Octopus(concurrency=1, cache=True) 134 | otto.response_cache.put(url, response) 135 | 136 | def handle_url_response(url, response): 137 | self.response = response 138 | 139 | otto.enqueue(url, handle_url_response) 140 | 141 | expect(self.response).not_to_be_null() 142 | expect(self.response.status_code).to_equal(200) 143 | expect(self.response.body).to_equal("whatever") 144 | 145 | def test_can_handle_cached_responses_when_not_cached(self): 146 | url = 'http://www.twitter.com' 147 | otto = Octopus(concurrency=1, cache=True) 148 | 149 | def handle_url_response(url, response): 150 | 
self.response = response 151 | 152 | otto.enqueue(url, handle_url_response) 153 | otto.enqueue(url, handle_url_response) 154 | otto.enqueue(url, handle_url_response) 155 | otto.enqueue(url, handle_url_response) 156 | 157 | otto.start() 158 | 159 | otto.wait(5) 160 | 161 | expect(self.response).not_to_be_null() 162 | expect(self.response.status_code).to_equal(200) 163 | 164 | def test_can_handle_invalid_urls(self): 165 | url = 'http://kagdjdkjgka.fk' 166 | otto = Octopus(concurrency=1) 167 | 168 | def handle_url_response(url, response): 169 | self.response = response 170 | 171 | otto.enqueue(url, handle_url_response) 172 | 173 | otto.start() 174 | 175 | otto.wait(5) 176 | 177 | expect(self.response).not_to_be_null() 178 | expect(self.response.status_code).to_equal(599) 179 | expect(self.response.text).to_include("HTTPConnectionPool(host='kagdjdkjgka.fk', port=80)") 180 | expect(self.response.text).to_include('Max retries exceeded with url: /') 181 | expect(self.response.error).to_equal(self.response.text) 182 | 183 | def test_can_handle_timeouts(self): 184 | url = 'http://baidu.com' 185 | otto = Octopus(concurrency=1, request_timeout_in_seconds=0.1) 186 | 187 | def handle_url_response(url, response): 188 | self.response = response 189 | 190 | otto.enqueue(url, handle_url_response) 191 | 192 | otto.start() 193 | 194 | otto.wait(5) 195 | 196 | expect(self.response.text).to_include('Connection to baidu.com timed out') 197 | expect(self.response.error).to_include('Connection to baidu.com timed out. 
(connect timeout=0.1)') 198 | -------------------------------------------------------------------------------- /tests/test_octopus_limited.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from preggy import expect 5 | 6 | from octopus import Octopus 7 | from octopus.limiter.redis.per_domain import Limiter as PerDomainRedisLimiter 8 | from octopus.limiter.in_memory.per_domain import Limiter as PerDomainInMemoryLimiter 9 | from tests import TestCase 10 | 11 | 12 | class TestThreadedOctopusAgainstLimiter(TestCase): 13 | def setUp(self): 14 | super(TestThreadedOctopusAgainstLimiter, self).setUp() 15 | 16 | self.response = None 17 | self.url = None 18 | self.responses = {} 19 | self.cache_miss = set() 20 | self.redis.flushall() 21 | 22 | def handle_url_response(self, url, response): 23 | self.responses[url] = response 24 | 25 | def handle_limiter_miss(self, url): 26 | self.cache_miss.add(url) 27 | 28 | def test_should_not_get_more_than_one_url_for_same_domain_concurrently(self): 29 | limiter = PerDomainRedisLimiter( 30 | {'http://g1.globo.com': 1}, 31 | {'http://globoesporte.globo.com': 1}, 32 | redis=self.redis 33 | ) 34 | otto = Octopus(concurrency=10, auto_start=True, limiter=limiter) 35 | 36 | otto.enqueue('http://globoesporte.globo.com', self.handle_url_response) 37 | otto.enqueue('http://globoesporte.globo.com/futebol/times/flamengo/', self.handle_url_response) 38 | otto.enqueue('http://g1.globo.com', self.handle_url_response) 39 | otto.enqueue('http://g1.globo.com/economia', self.handle_url_response) 40 | 41 | otto.wait(10) 42 | 43 | expect(self.responses).to_length(4) 44 | expect(self.redis.zcard('limit-for-http://g1.globo.com')).to_equal(0) 45 | expect(self.redis.zcard('limit-for-http://globoesporte.globo.com')).to_equal(0) 46 | 47 | def test_should_call_limiter_miss_twice(self): 48 | limiter = PerDomainInMemoryLimiter( 49 | {'http://g1.globo.com': 1}, 50 | 
{'http://globoesporte.globo.com': 1}, 51 | ) 52 | limiter.subscribe_to_lock_miss(self.handle_limiter_miss) 53 | otto = Octopus(concurrency=10, auto_start=True, limiter=limiter) 54 | 55 | otto.enqueue('http://globoesporte.globo.com/', self.handle_url_response) 56 | otto.enqueue('http://globoesporte.globo.com/futebol/times/flamengo/', self.handle_url_response) 57 | otto.enqueue('http://g1.globo.com/', self.handle_url_response) 58 | otto.enqueue('http://g1.globo.com/economia/', self.handle_url_response) 59 | 60 | otto.wait() 61 | 62 | expect(self.cache_miss).to_length(2) 63 | -------------------------------------------------------------------------------- /tests/test_tornado_octopus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | 6 | from preggy import expect 7 | from mock import Mock, patch 8 | 9 | from octopus import TornadoOctopus 10 | from octopus.cache import Cache 11 | from tests import TestCase 12 | 13 | 14 | class TestTornadoOctopus(TestCase): 15 | def setUp(self): 16 | self.response = None 17 | self.url = None 18 | self.responses = {} 19 | 20 | def get_response(self, request=None): 21 | if request is None: 22 | request = Mock( 23 | headers={ 24 | 'Cookie': 'foo=bar' 25 | }, 26 | ) 27 | 28 | return Mock( 29 | request=request, 30 | headers={ 31 | 'baz': 'foo' 32 | }, 33 | code=200, 34 | body='body', 35 | effective_url='http://www.google.com/', 36 | error='error', 37 | request_time=2.1 38 | ) 39 | 40 | def test_can_create_tornado_otto(self): 41 | otto = TornadoOctopus() 42 | 43 | expect(otto.concurrency).to_equal(10) 44 | expect(otto.auto_start).to_be_false() 45 | expect(otto.cache).to_be_false() 46 | 47 | expect(otto.response_cache).not_to_be_null() 48 | expect(otto.response_cache).to_be_instance_of(Cache) 49 | expect(otto.response_cache.expiration_in_seconds).to_equal(30) 50 | 51 | expect(otto.request_timeout_in_seconds).to_equal(10) 52 | 
expect(otto.connect_timeout_in_seconds).to_equal(5) 53 | expect(otto.ignore_pycurl).to_be_false() 54 | 55 | expect(otto.running_urls).to_equal(0) 56 | expect(otto.url_queue).to_be_empty() 57 | 58 | def test_can_create_tornado_otto_with_custom_values(self): 59 | otto = TornadoOctopus( 60 | concurrency=20, auto_start=True, cache=True, 61 | expiration_in_seconds=60, request_timeout_in_seconds=20, 62 | connect_timeout_in_seconds=10, ignore_pycurl=True 63 | 64 | ) 65 | 66 | expect(otto.concurrency).to_equal(20) 67 | expect(otto.auto_start).to_be_true() 68 | expect(otto.cache).to_be_true() 69 | 70 | expect(otto.response_cache).not_to_be_null() 71 | expect(otto.response_cache).to_be_instance_of(Cache) 72 | expect(otto.response_cache.expiration_in_seconds).to_equal(60) 73 | 74 | expect(otto.request_timeout_in_seconds).to_equal(20) 75 | expect(otto.connect_timeout_in_seconds).to_equal(10) 76 | expect(otto.ignore_pycurl).to_be_true() 77 | 78 | expect(otto.running_urls).to_equal(0) 79 | expect(otto.url_queue).to_be_empty() 80 | 81 | def test_can_get_queue_info(self): 82 | otto = TornadoOctopus() 83 | 84 | expect(otto.queue_size).to_equal(0) 85 | expect(otto.is_empty).to_be_true() 86 | 87 | def test_can_get_response_from_tornado_response(self): 88 | response = self.get_response() 89 | 90 | otto_response = TornadoOctopus.from_tornado_response('http://www.google.com', response) 91 | 92 | expect(otto_response.url).to_equal('http://www.google.com') 93 | expect(otto_response.headers).to_be_like(response.headers) 94 | expect(otto_response.cookies).to_be_like({ 95 | 'foo': 'bar' 96 | }) 97 | expect(otto_response.text).to_equal('body') 98 | expect(otto_response.error).to_equal('error') 99 | expect(otto_response.request_time).to_equal(2.1) 100 | 101 | def test_can_get_response_from_tornado_response_when_no_cookies(self): 102 | response = self.get_response(request=Mock(headers={})) 103 | 104 | otto_response = TornadoOctopus.from_tornado_response('http://www.google.com', response) 105 | 
106 | expect(otto_response.url).to_equal('http://www.google.com') 107 | expect(otto_response.headers).to_be_like(response.headers) 108 | expect(otto_response.cookies).to_be_empty() 109 | expect(otto_response.text).to_equal('body') 110 | expect(otto_response.error).to_equal('error') 111 | expect(otto_response.request_time).to_equal(2.1) 112 | 113 | def test_can_enqueue_url(self): 114 | otto = TornadoOctopus(cache=False, concurrency=0) 115 | 116 | otto.enqueue('http://www.google.com', None, method='GET', something="else") 117 | 118 | expect(otto.url_queue).to_length(1) 119 | 120 | @patch.object(TornadoOctopus, 'fetch') 121 | def test_can_enqueue_url_and_fetch(self, fetch_mock): 122 | otto = TornadoOctopus(cache=True) 123 | 124 | otto.enqueue('http://www.google.com', None, method='GET', something="else") 125 | 126 | expect(otto.url_queue).to_be_empty() 127 | fetch_mock.assert_called_once_with('http://www.google.com', None, 'GET', something='else') 128 | 129 | def test_can_enqueue_and_get_from_cache(self): 130 | mock_response = Mock() 131 | otto = TornadoOctopus(cache=True) 132 | otto.response_cache.put('http://www.google.com', mock_response) 133 | 134 | def response(url, response): 135 | self.url = url 136 | self.response = response 137 | 138 | otto.enqueue('http://www.google.com', response, method='GET') 139 | 140 | expect(otto.url_queue).to_be_empty() 141 | expect(self.response).not_to_be_null() 142 | expect(self.response).to_equal(mock_response) 143 | 144 | def test_can_enqueue_and_get_when_cache_miss(self): 145 | otto = TornadoOctopus(cache=True, auto_start=True) 146 | 147 | def response(url, response): 148 | self.url = url 149 | self.response = response 150 | 151 | otto.enqueue('http://www.google.com', response, method='GET') 152 | otto.wait(2) 153 | 154 | expect(otto.url_queue).to_be_empty() 155 | expect(self.response).not_to_be_null() 156 | 157 | def test_can_fetch(self): 158 | otto = TornadoOctopus(cache=False, auto_start=True) 159 | 
otto.response_cache.put('http://www.google.com', Mock()) 160 | 161 | http_client_mock = Mock() 162 | otto.http_client = http_client_mock 163 | 164 | otto.fetch('http://www.google.com', None, 'GET') 165 | 166 | expect(otto.running_urls).to_equal(1) 167 | expect(http_client_mock.fetch.called).to_be_true() 168 | 169 | def test_fetch_gets_the_response_from_cache_if_available(self): 170 | otto = TornadoOctopus(cache=True, auto_start=True) 171 | response_mock = Mock() 172 | otto.response_cache.put('http://www.google.com', response_mock) 173 | 174 | http_client_mock = Mock() 175 | otto.http_client = http_client_mock 176 | 177 | callback = Mock() 178 | 179 | otto.fetch('http://www.google.com', callback, 'GET') 180 | 181 | expect(otto.running_urls).to_equal(0) 182 | expect(http_client_mock.fetch.called).to_be_false() 183 | callback.assert_called_once_with('http://www.google.com', response_mock) 184 | 185 | @patch.object(TornadoOctopus, 'stop') 186 | def test_handle_request(self, stop_mock): 187 | otto = TornadoOctopus(cache=False, auto_start=True) 188 | 189 | response = self.get_response() 190 | 191 | callback = Mock() 192 | 193 | handle_request = otto.handle_request('some url', callback) 194 | 195 | handle_request(response) 196 | 197 | expect(otto.running_urls).to_equal(-1) 198 | expect(callback.called).to_be_true() 199 | expect(stop_mock.called).to_be_true() 200 | 201 | @patch.object(TornadoOctopus, 'stop') 202 | def test_handle_request_when_queue_has_no_items(self, stop_mock): 203 | otto = TornadoOctopus(cache=True, auto_start=True) 204 | otto.response_cache = Mock() 205 | 206 | response = self.get_response() 207 | 208 | callback = Mock() 209 | 210 | handle_request = otto.handle_request('some url', callback) 211 | 212 | handle_request(response) 213 | 214 | expect(otto.running_urls).to_equal(-1) 215 | expect(callback.called).to_be_true() 216 | expect(stop_mock.called).to_be_true() 217 | expect(otto.response_cache.put.called).to_be_true() 218 | 219 | def 
test_handle_request_when_queue_has_no_items_but_running_urls(self): 220 | otto = TornadoOctopus(cache=True, auto_start=True) 221 | otto.response_cache = Mock() 222 | otto.running_urls = 10 223 | 224 | response = self.get_response() 225 | 226 | callback = Mock() 227 | 228 | handle_request = otto.handle_request('some url', callback) 229 | 230 | handle_request(response) 231 | 232 | expect(otto.running_urls).to_equal(9) 233 | expect(callback.called).to_be_true() 234 | expect(otto.response_cache.put.called).to_be_true() 235 | 236 | @patch.object(TornadoOctopus, 'fetch') 237 | def test_handle_request_when_queue_has_items(self, fetch_mock): 238 | otto = TornadoOctopus(cache=False, auto_start=True) 239 | 240 | handler_mock = Mock() 241 | 242 | otto.url_queue.append( 243 | ('other url', handler_mock, 'POST', {'foo': 'bar'}) 244 | ) 245 | 246 | response = self.get_response() 247 | callback = Mock() 248 | 249 | handle_request = otto.handle_request('some url', callback) 250 | handle_request(response) 251 | 252 | expect(otto.running_urls).to_equal(-1) 253 | expect(otto.url_queue).to_be_empty() 254 | expect(callback.called).to_be_true() 255 | fetch_mock.assert_called_once_with('other url', handler_mock, 'POST', foo='bar') 256 | 257 | def test_can_handle_wait_timeout(self): 258 | otto = TornadoOctopus(cache=False, auto_start=True) 259 | otto.ioloop = Mock() 260 | 261 | otto.handle_wait_timeout(1, None) 262 | 263 | expect(otto.ioloop.stop.called).to_be_true() 264 | 265 | def test_can_stop(self): 266 | otto = TornadoOctopus(cache=False, auto_start=True) 267 | otto.ioloop = Mock() 268 | 269 | otto.stop() 270 | 271 | expect(otto.ioloop.stop.called).to_be_true() 272 | 273 | @patch.object(logging, 'debug') 274 | def test_can_wait_when_no_urls(self, logging_mock): 275 | otto = TornadoOctopus(cache=False, auto_start=True) 276 | 277 | otto.wait() 278 | 279 | logging_mock.assert_calls('No urls to wait for. 
Returning immediately.') 280 | 281 | def test_can_wait_when_urls_and_timeout(self): 282 | otto = TornadoOctopus(cache=False, auto_start=True) 283 | otto.ioloop = Mock() 284 | otto.running_urls = 10 285 | 286 | otto.wait() 287 | 288 | expect(otto.ioloop.set_blocking_signal_threshold.called) 289 | 290 | @patch.object(logging, 'debug') 291 | def test_can_wait_when_urls_and_no_timeout(self, logging_mock): 292 | otto = TornadoOctopus(cache=False, auto_start=True) 293 | otto.ioloop = Mock() 294 | otto.running_urls = 10 295 | 296 | otto.wait(0) 297 | 298 | logging_mock.assert_calls('Waiting for urls to be retrieved.') 299 | 300 | def test_can_get_many_urls(self): 301 | urls = [ 302 | 'http://www.globo.com', 303 | 'http://www.twitter.com', 304 | 'http://www.facebook.com' 305 | ] 306 | otto = TornadoOctopus(concurrency=4, auto_start=True) 307 | 308 | def handle_url_response(url, response): 309 | self.responses[url] = response 310 | 311 | for url in urls: 312 | otto.enqueue(url, handle_url_response) 313 | 314 | otto.wait(2) 315 | 316 | expect(self.responses).to_length(3) 317 | 318 | for url in urls: 319 | expect(self.responses).to_include(url) 320 | expect(self.responses[url].status_code).to_equal(200) 321 | 322 | def test_can_handle_invalid_urls(self): 323 | url = 'http://kagdjdkjgka.fk' 324 | otto = TornadoOctopus(concurrency=1, auto_start=True) 325 | 326 | def handle_url_response(url, response): 327 | self.response = response 328 | 329 | otto.enqueue(url, handle_url_response) 330 | 331 | otto.wait(5) 332 | 333 | expect(self.response).not_to_be_null() 334 | expect(self.response.status_code).to_equal(599) 335 | expect(self.response.text).to_be_null() 336 | expect(self.response.error).not_to_be_null() 337 | 338 | def test_can_handle_timeouts(self): 339 | url = 'http://baidu.com' 340 | otto = TornadoOctopus(concurrency=1, request_timeout_in_seconds=0.1, auto_start=True) 341 | 342 | def handle_url_response(url, response): 343 | self.response = response 344 | 345 | 
otto.enqueue(url, handle_url_response) 346 | 347 | otto.wait(5) 348 | 349 | expect(self.response.status_code).to_equal(599) 350 | expect(self.response.text).to_be_null() 351 | expect(self.response.error).not_to_be_null() 352 | 353 | @patch.object(logging, 'exception') 354 | def test_can_handle_exception(self, log_mock): 355 | url = 'http://www.globo.com' 356 | 357 | otto = TornadoOctopus(concurrency=4, auto_start=True) 358 | 359 | def handle_url_response(url, response): 360 | raise RuntimeError(url) 361 | 362 | otto.enqueue(url, handle_url_response) 363 | 364 | otto.wait(2) 365 | 366 | log_mock.assert_called_once_with('Error calling callback for http://www.globo.com.') 367 | -------------------------------------------------------------------------------- /tests/test_tornado_octopus_limited.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from preggy import expect 5 | 6 | from octopus import TornadoOctopus 7 | from octopus.limiter.redis.per_domain import Limiter as PerDomainRedisLimiter 8 | from octopus.limiter.in_memory.per_domain import Limiter as PerDomainInMemoryLimiter 9 | from tests import TestCase 10 | 11 | 12 | class TestTornadoCoreLimited(TestCase): 13 | def setUp(self): 14 | super(TestTornadoCoreLimited, self).setUp() 15 | 16 | self.response = None 17 | self.url = None 18 | self.responses = {} 19 | self.cache_miss = set() 20 | self.redis.flushall() 21 | 22 | def handle_url_response(self, url, response): 23 | self.responses[url] = response 24 | 25 | def handle_limiter_miss(self, url): 26 | self.cache_miss.add(url) 27 | 28 | def test_should_not_get_more_than_one_url_for_same_domain_concurrently(self): 29 | limiter = PerDomainInMemoryLimiter( 30 | {'http://g1.globo.com': 1}, 31 | {'http://globoesporte.globo.com': 1} 32 | ) 33 | otto = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter) 34 | 35 | otto.enqueue('http://globoesporte.globo.com', 
self.handle_url_response) 36 | otto.enqueue('http://globoesporte.globo.com/futebol/times/flamengo/', self.handle_url_response) 37 | otto.enqueue('http://g1.globo.com', self.handle_url_response) 38 | otto.enqueue('http://g1.globo.com/economia', self.handle_url_response) 39 | 40 | otto.wait(2) 41 | 42 | expect(self.responses).to_length(4) 43 | expect(list(limiter.domain_count.keys())).to_be_like(['http://g1.globo.com', 'http://globoesporte.globo.com']) 44 | 45 | def test_should_call_limiter_miss_twice(self): 46 | limiter = PerDomainRedisLimiter( 47 | {'http://g1.globo.com': 1}, 48 | {'http://globoesporte.globo.com': 1}, 49 | redis=self.redis 50 | ) 51 | limiter.subscribe_to_lock_miss(self.handle_limiter_miss) 52 | otto = TornadoOctopus(concurrency=10, auto_start=True, limiter=limiter) 53 | 54 | otto.enqueue('http://globoesporte.globo.com/', self.handle_url_response) 55 | otto.enqueue('http://globoesporte.globo.com/futebol/times/flamengo/', self.handle_url_response) 56 | otto.enqueue('http://g1.globo.com/', self.handle_url_response) 57 | otto.enqueue('http://g1.globo.com/economia/', self.handle_url_response) 58 | 59 | otto.wait() 60 | 61 | expect(self.cache_miss).to_length(2) 62 | --------------------------------------------------------------------------------