├── .gitignore ├── .pylintrc ├── LICENSE ├── MANIFEST.in ├── README.rst ├── bootstrap.py ├── buildout.cfg ├── crawluri.thrift ├── docs-source ├── api │ ├── entrypoints.rst │ ├── extractor.rst │ ├── fetcher.rst │ ├── frontier.rst │ ├── masterprocess.rst │ ├── queues.rst │ ├── scoper.rst │ ├── sink.rst │ ├── spyderapi.rst │ └── workerprocess.rst ├── conf.py ├── crawler-design.rst ├── getting-started.rst ├── globals.rst ├── index.rst ├── libraries.rst ├── release-notes.rst └── roadmap.rst ├── local.cfg.template ├── setup.py ├── src └── spyder │ ├── __init__.py │ ├── core │ ├── __init__.py │ ├── constants.py │ ├── dnscache.py │ ├── frontier.py │ ├── log.py │ ├── master.py │ ├── messages.py │ ├── mgmt.py │ ├── prioritizer.py │ ├── queueassignment.py │ ├── queueselector.py │ ├── settings.py │ ├── sink.py │ ├── sqlitequeues.py │ ├── uri_uniq.py │ └── worker.py │ ├── defaultsettings.py │ ├── encoding.py │ ├── import_util.py │ ├── logsink.py │ ├── masterprocess.py │ ├── processor │ ├── __init__.py │ ├── cleanupquery.py │ ├── fetcher.py │ ├── htmllinkextractor.py │ ├── httpextractor.py │ ├── limiter.py │ ├── scoper.py │ └── stripsessions.py │ ├── spyder_template │ ├── log │ │ └── .keep │ ├── logging.conf │ ├── master.py │ ├── settings.py │ ├── sink.py │ └── spyder-ctrl.py │ ├── thrift │ ├── __init__.py │ └── gen │ │ ├── __init__.py │ │ ├── constants.py │ │ └── ttypes.py │ ├── time.py │ └── workerprocess.py ├── test ├── static │ └── robots.txt ├── test_async_worker.py ├── test_cleanup_qs.py ├── test_default_html_link_extractor.py ├── test_dns_cache.py ├── test_fetch_processor.py ├── test_fetch_processor_last_modified_works.py ├── test_fetch_processor_with_etag.py ├── test_frontier.py ├── test_http_extractor.py ├── test_limiter.py ├── test_masterprocess.py ├── test_messages.py ├── test_mgmt.py ├── test_multiple_frontier.py ├── test_queue_assignment.py ├── test_queue_selector.py ├── test_regex_scoper.py ├── test_settings.py ├── test_settings_settings.py ├── test_sqlite_multiple_queues.py ├── test_sqlite_queues.py ├── test_strip_session_ids.py ├── test_uri_unique_filter.py ├── test_worker.py ├── test_workerprocess_extractor.py ├── test_workerprocess_fetcher.py ├── test_workerprocess_mgmtintegration.py ├── test_workerprocess_processing.py └── test_workerprocess_unspec.py └── versions.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.pyo 4 | .installed.cfg 5 | bin 6 | develop-eggs 7 | dist 8 | downloads 9 | eggs 10 | parts 11 | src/*.egg-info 12 | docs 13 | pylint* 14 | .coverage 15 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Profiled execution. 11 | profile=no 12 | 13 | # Add to the black list. It should be a base name, not a 14 | # path. You may set this option multiple times. 15 | ignore=CVS 16 | ignore=gen 17 | 18 | # Pickle collected data for later comparisons. 19 | persistent=yes 20 | 21 | # List of plugins (as comma separated values of python modules names) to load, 22 | # usually to register additional checkers. 23 | load-plugins= 24 | 25 | 26 | [MESSAGES CONTROL] 27 | 28 | # Enable the message, report, category or checker with the given id(s). 
You can 29 | # either give multiple identifier separated by comma (,) or put this option 30 | # multiple time. 31 | #enable= 32 | 33 | # Disable the message, report, category or checker with the given id(s). You 34 | # can either give multiple identifier separated by comma (,) or put this option 35 | # multiple time (only on the command line, not in the configuration file where 36 | # it should appear only once). 37 | disable=R0903 38 | 39 | 40 | [REPORTS] 41 | 42 | # Set the output format. Available formats are text, parseable, colorized, msvs 43 | # (visual studio) and html 44 | output-format=html 45 | 46 | # Include message's id in output 47 | include-ids=yes 48 | 49 | # Put messages in a separate file for each module / package specified on the 50 | # command line instead of printing them on stdout. Reports (if any) will be 51 | # written in a file name "pylint_global.[txt|html]". 52 | files-output=no 53 | 54 | # Tells whether to display a full report or only the messages 55 | reports=yes 56 | 57 | # Python expression which should return a note less than 10 (10 is the highest 58 | # note). You have access to the variables errors warning, statement which 59 | # respectively contain the number of errors / warnings messages and the total 60 | # number of statements analyzed. This is used by the global evaluation report 61 | # (RP0004). 62 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 63 | 64 | # Add a comment according to your evaluation note. This is used by the global 65 | # evaluation report (RP0004). 66 | comment=no 67 | 68 | 69 | [BASIC] 70 | 71 | # Required attributes for module, separated by a comma 72 | required-attributes= 73 | 74 | # List of builtins function names that should not be used, separated by a comma 75 | bad-functions=map,filter,apply,input 76 | 77 | # Regular expression which should only match correct module names 78 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 79 | 80 | # Regular expression which should only match correct module level names 81 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 82 | 83 | # Regular expression which should only match correct class names 84 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 85 | 86 | # Regular expression which should only match correct function names 87 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 88 | 89 | # Regular expression which should only match correct method names 90 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 91 | 92 | # Regular expression which should only match correct instance attribute names 93 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 94 | 95 | # Regular expression which should only match correct argument names 96 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 97 | 98 | # Regular expression which should only match correct variable names 99 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 100 | 101 | # Regular expression which should only match correct list comprehension / 102 | # generator expression variable names 103 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 104 | 105 | # Good variable names which should always be accepted, separated by a comma 106 | good-names=i,j,k,ex,Run,_ 107 | 108 | # Bad variable names which should always be refused, separated by a comma 109 | bad-names=foo,bar,baz,toto,tutu,tata 110 | 111 | # Regular expression which should only match functions or classes name which do 112 | # not require a docstring 113 | no-docstring-rgx=__.*__ 114 | 115 | 116 | [FORMAT] 117 | 118 | # Maximum number of characters on a single line. 
119 | max-line-length=80 120 | 121 | # Maximum number of lines in a module 122 | max-module-lines=1000 123 | 124 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 125 | # tab). 126 | indent-string=' ' 127 | 128 | 129 | [MISCELLANEOUS] 130 | 131 | # List of note tags to take in consideration, separated by a comma. 132 | notes=FIXME,XXX,TODO 133 | 134 | 135 | [SIMILARITIES] 136 | 137 | # Minimum lines number of a similarity. 138 | min-similarity-lines=4 139 | 140 | # Ignore comments when computing similarities. 141 | ignore-comments=yes 142 | 143 | # Ignore docstrings when computing similarities. 144 | ignore-docstrings=yes 145 | 146 | 147 | [TYPECHECK] 148 | 149 | # Tells whether missing members accessed in mixin class should be ignored. A 150 | # mixin class is detected if its name ends with "mixin" (case insensitive). 151 | ignore-mixin-members=yes 152 | 153 | # List of classes names for which member attributes should not be checked 154 | # (useful for classes with attributes dynamically set). 155 | ignored-classes=SQLObject 156 | 157 | # When zope mode is activated, add a predefined set of Zope acquired attributes 158 | # to generated-members. 159 | zope=no 160 | 161 | # List of members which are set dynamically and missed by pylint inference 162 | # system, and so shouldn't trigger E0201 when accessed. 163 | generated-members=REQUEST,acl_users,aq_parent 164 | 165 | 166 | [VARIABLES] 167 | 168 | # Tells whether we should check for unused import in __init__ files. 169 | init-import=yes 170 | 171 | # A regular expression matching the beginning of the name of dummy variables 172 | # (i.e. not used). 173 | dummy-variables-rgx=_|dummy 174 | 175 | # List of additional names supposed to be defined in builtins. Remember that 176 | # you should avoid to define new builtins when possible. 177 | additional-builtins= 178 | 179 | 180 | [CLASSES] 181 | 182 | # List of interface methods to ignore, separated by a comma. This is used for 183 | # instance to not check methods defines in Zope's Interface base class. 184 | ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by 185 | 186 | # List of method names used to declare (i.e. assign) instance attributes. 187 | defining-attr-methods=__init__,__new__,setUp 188 | 189 | 190 | [DESIGN] 191 | 192 | # Maximum number of arguments for function / method 193 | max-args=5 194 | 195 | # Argument names that match this expression will be ignored. Default to name 196 | # with leading underscore 197 | ignored-argument-names=_.* 198 | 199 | # Maximum number of locals for function / method body 200 | max-locals=15 201 | 202 | # Maximum number of return / yield for function / method body 203 | max-returns=6 204 | 205 | # Maximum number of branch for function / method body 206 | max-branchs=12 207 | 208 | # Maximum number of statements in function / method body 209 | max-statements=50 210 | 211 | # Maximum number of parents for a class (see R0901). 212 | max-parents=7 213 | 214 | # Maximum number of attributes for a class (see R0902). 215 | max-attributes=7 216 | 217 | # Minimum number of public methods for a class (see R0903). 218 | min-public-methods=2 219 | 220 | # Maximum number of public methods for a class (see R0904). 
221 | max-public-methods=20
222 | 
223 | 
224 | [IMPORTS]
225 | 
226 | # Deprecated modules which should not be used, separated by a comma
227 | deprecated-modules=regsub,string,TERMIOS,Bastion,rexec
228 | 
229 | # Create a graph of every (i.e. internal and external) dependencies in the
230 | # given file (report RP0402 must not be disabled)
231 | import-graph=pylint_spyder.dot
232 | 
233 | # Create a graph of external dependencies in the given file (report RP0402 must
234 | # not be disabled)
235 | ext-import-graph=pylint_spyder_external.dot
236 | 
237 | # Create a graph of internal dependencies in the given file (report RP0402 must
238 | # not be disabled)
239 | int-import-graph=pylint_spyder_internal.dot
240 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include LICENSE
3 | include src/spyder/spyder_template/logging.conf
4 | include src/spyder/spyder_template/log/.keep
5 | recursive-include docs-source *.rst
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Spyder
2 | ======
3 | 
4 | `ALONG CAME A SPIDER`
5 | 
6 | 
7 | *Spyder* is a scalable web-spider written in Python using the non-blocking
8 | *tornado* library and *ZeroMQ* as messaging layer. The messages are serialized
9 | using *Thrift*.
10 | 
11 | The architecture is very basic: a **Master** process contains the crawl
12 | **Frontier** that organises the *urls* that need to be crawled; several
13 | **Worker** processes actually download the content and extract new *urls* that
14 | should be crawled in the future. For storing the content you may attach a
15 | **Sink** to the **Master** and be informed about the interesting events for an
16 | *url*.
17 | 
18 | 
19 | Getting Started
20 | ===============
21 | 
22 | *Spyder* is just a library for creating web crawlers. In order to really crawl
23 | content, you first have to create a *Spyder* skeleton:
24 | 
25 | .. code-block:: bash
26 | 
27 |     $ mkdir my-crawler && cd my-crawler
28 |     $ spyder start
29 |     $ ls
30 |     log logging.conf master.py settings.py sink.py spyder-ctrl.py
31 | 
32 | This will copy the skeleton into `my-crawler`. The main file is `settings.py`.
33 | In it, you can configure the logging level for **Masters** and **Workers** and
34 | define the **crawl scope**. In `master.py` you should manipulate the starting
35 | URLs and add your specific `sink.py` into the **Frontier** (a sketch of a sink
36 | follows at the end of this section). `spyder-ctrl.py` is just a small control
37 | script that helps you start the **Log Sink**, **Master** and **Worker**.
38 | 
39 | In the skeleton everything is set up as if you wanted to crawl sailing-related
40 | pages from **DMOZ**. That should give you a starting point for your own
41 | crawler.
42 | 
43 | So, once you have written your sink and configured everything correctly, it's
44 | time to start crawling. First, on one of your nodes you start the logsink:
45 | 
46 | .. code-block:: bash
47 | 
48 |     $ spyder-ctrl.py logsink &
49 | 
50 | Then, on one node (e.g. the same one as the logsink), you start the **Master**:
51 | 
52 |     $ spyder-ctrl.py master &
53 | 
54 | Finally you can start as many **Workers** as you want:
55 | 
56 |     $ spyder-ctrl.py worker &
57 |     $ spyder-ctrl.py worker &
58 |     $ spyder-ctrl.py worker &
59 | 
60 | Here we start three workers since this is a powerful node with a quad core CPU.
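
For illustration, here is roughly what a minimal `sink.py` could look like. This
is only a sketch: the class layout and the hook method name below are
assumptions made for this example, so check ``spyder.core.sink`` and the
generated skeleton for the actual interface:

.. code-block:: python

    # sink.py -- illustrative sketch only; the hook name ``process`` is
    # hypothetical, the real callbacks live in spyder.core.sink
    class StdoutSink(object):
        """A sink that simply prints every finished CrawlUri."""

        def process(self, curi):
            # ``curi`` is the Thrift ``CrawlUri`` struct from crawluri.thrift
            print curi.url, curi.status_code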
61 | 
62 | 
63 | Scaling the Crawl
64 | =================
65 | 
66 | With the default settings it is not possible to start workers on different
67 | nodes. Most of the time one node is powerful enough to crawl quite an amount of
68 | data. But there are times when you simply want to crawl using *many* nodes. This
69 | can be done by configuring the **ZeroMQ** transports to something like
70 | 
71 | 
72 |     ZEROMQ_MASTER_PUSH = "tcp://NodeA:5005"
73 |     ZEROMQ_MASTER_SUB = "tcp://NodeA:5007"
74 | 
75 |     ZEROMQ_MGMT_MASTER = "tcp://NodeA:5008"
76 |     ZEROMQ_MGMT_WORKER = "tcp://NodeA:5009"
77 | 
78 |     ZEROMQ_LOGGING = "tcp://NodeA:5010"
79 | 
80 | Basically we have set up a 2-node crawl cluster. **NodeA** acts as logging sink
81 | and controls the crawl via the **Master**. **NodeB** is a pure **Worker** node.
82 | Only the **Master** actually *binds* **ZeroMQ** sockets; the **Workers** always
83 | *connect* to them, so the **Master** does not have to know where the
84 | **Workers** are really running.
85 | 
86 | 
87 | From here
88 | =========
89 | 
90 | There is plenty of room for improvement and development ahead. Everything will
91 | be handled by Github tickets from now on and, if there is interest, we may set
92 | up a Google Group.
93 | 
--------------------------------------------------------------------------------
/buildout.cfg:
--------------------------------------------------------------------------------
1 | [buildout]
2 | parts =
3 |     ${local:parts}
4 |     ${codeq:parts}
5 |     test
6 |     python
7 |     sphinxbuilder
8 |     coverage
9 | develop = .
10 | eggs = spyder
11 | versions = versions
12 | extends =
13 |     https://github.com/retresco/buildout-recipes/raw/master/zmq.cfg
14 |     https://github.com/retresco/buildout-recipes/raw/master/testing.cfg
15 |     https://github.com/retresco/buildout-recipes/raw/master/codeq.cfg
16 |     local.cfg
17 |     versions.cfg
18 | extensions = buildout.dumppickedversions
19 | 
20 | [python]
21 | recipe = zc.recipe.egg
22 | interpreter = python
23 | eggs = ${buildout:eggs}
24 | 
25 | [test]
26 | <= test-template
27 | eggs = spyder [test]
28 | defaults =
29 |     --verbosity=3
30 |     --with-doctest
31 |     --doctest-extension=txt
32 |     --where=${buildout:directory}/test
33 |     --with-xunit
34 |     --with-coverage
35 |     --cover-package=spyder
36 | 
37 | [sphinxbuilder]
38 | recipe = collective.recipe.sphinxbuilder
39 | source = ${buildout:directory}/docs-source
40 | build = ${buildout:directory}/docs
41 | interpreter = ${buildout:directory}/bin/python
42 | 
43 | [coverage]
44 | recipe = zc.recipe.egg
45 | eggs = coverage
46 | 
--------------------------------------------------------------------------------
/crawluri.thrift:
--------------------------------------------------------------------------------
1 | # Description of the CrawlUri thrift structure
2 | 
3 | namespace py spyder.thrift.gen
4 | 
5 | 
6 | /**
7 |  * Some typedefs in order to make the code more readable.
8 |  */
9 | typedef i64 timestamp
10 | 
11 | typedef map<string,string> header
12 | 
13 | typedef map<string,string> key_value
14 | 
15 | /**
16 |  * The main struct for CrawlUris.
17 |  *
18 |  * This contains some metadata and, if possible, the saved web page.
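 * A CrawlUri is serialized with Thrift and travels between the master and
 * the workers over the ZeroMQ sockets.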
19 | */ 20 | struct CrawlUri { 21 | // readable version of the url to crawl 22 | 1: string url, 23 | 24 | // the effective url used for downloading the content (i.e.: IP instead of hostname) 25 | 2: string effective_url, 26 | 27 | // the host identifier used for queue selection 28 | 3: i16 current_priority, 29 | 30 | // when processing has been started 31 | 4: timestamp begin_processing, 32 | 33 | // when processing is finished 34 | 5: timestamp end_processing, 35 | 36 | // the http request headers 37 | 6: header req_header, 38 | 39 | // the http response headers 40 | 7: header rep_header 41 | 42 | // the saved content body 43 | 8: string content_body, 44 | 45 | // the servers status code 46 | 9: i16 status_code, 47 | 48 | // request time 49 | 10: double req_time, 50 | 51 | // queue time 52 | 11: double queue_time, 53 | 54 | // additional values from other processors 55 | 12: key_value optional_vars 56 | } 57 | -------------------------------------------------------------------------------- /docs-source/api/entrypoints.rst: -------------------------------------------------------------------------------- 1 | .. vim: set fileencoding=UTF-8 : 2 | .. vim: set tw=80 : 3 | .. include:: ../globals.rst 4 | 5 | Entrypoints 6 | =========== 7 | 8 | .. automodule:: spyder 9 | :members: 10 | -------------------------------------------------------------------------------- /docs-source/api/extractor.rst: -------------------------------------------------------------------------------- 1 | .. vim: set fileencoding=UTF-8 : 2 | .. vim: set tw=80 : 3 | .. include:: ../globals.rst 4 | 5 | Link Extractors 6 | =============== 7 | 8 | .. automodule:: spyder.processor.htmllinkextractor 9 | :members: 10 | -------------------------------------------------------------------------------- /docs-source/api/fetcher.rst: -------------------------------------------------------------------------------- 1 | .. vim: set fileencoding=UTF-8 : 2 | .. vim: set tw=80 : 3 | .. include:: ../globals.rst 4 | 5 | Content Fetcher 6 | =============== 7 | 8 | .. automodule:: spyder.processor.fetcher 9 | :members: 10 | -------------------------------------------------------------------------------- /docs-source/api/frontier.rst: -------------------------------------------------------------------------------- 1 | .. vim: set fileencoding=UTF-8 : 2 | .. vim: set tw=80 : 3 | .. include:: ../globals.rst 4 | 5 | Frontier 6 | ======== 7 | 8 | .. automodule:: spyder.core.frontier 9 | :members: 10 | -------------------------------------------------------------------------------- /docs-source/api/masterprocess.rst: -------------------------------------------------------------------------------- 1 | .. vim: set fileencoding=UTF-8 : 2 | .. vim: set tw=80 : 3 | .. include:: ../globals.rst 4 | 5 | Masterprocess 6 | ============= 7 | 8 | .. automodule:: spyder.masterprocess 9 | :members: 10 | 11 | ZeroMQ Master 12 | ============= 13 | 14 | .. automodule:: spyder.core.master 15 | :members: 16 | -------------------------------------------------------------------------------- /docs-source/api/queues.rst: -------------------------------------------------------------------------------- 1 | .. vim: set fileencoding=UTF-8 : 2 | .. vim: set tw=80 : 3 | .. include:: ../globals.rst 4 | 5 | Queue Management 6 | ================ 7 | 8 | .. 
9 |    :members:
10 | 
--------------------------------------------------------------------------------
/docs-source/api/scoper.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: ../globals.rst
4 | 
5 | Crawl Scoper
6 | ============
7 | 
8 | .. automodule:: spyder.processor.scoper
9 |    :members:
10 | 
--------------------------------------------------------------------------------
/docs-source/api/sink.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: ../globals.rst
4 | 
5 | Sink
6 | ====
7 | 
8 | .. automodule:: spyder.core.sink
9 |    :members:
10 | 
--------------------------------------------------------------------------------
/docs-source/api/spyderapi.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: ../globals.rst
4 | 
5 | .. _secapi:
6 | 
7 | Spyder API
8 | ==========
9 | 
10 | This is the main documentation for the |spyder| API. This will hopefully provide
11 | you with enough information to get started coding new features or to help
12 | with bugfixing.
13 | 
14 | .. toctree::
15 |    :maxdepth: 2
16 | 
17 |    entrypoints
18 |    masterprocess
19 |    frontier
20 |    queues
21 |    workerprocess
22 |    fetcher
23 |    extractor
24 |    scoper
25 |    sink
26 | 
--------------------------------------------------------------------------------
/docs-source/api/workerprocess.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: ../globals.rst
4 | 
5 | Workerprocess
6 | =============
7 | 
8 | .. automodule:: spyder.workerprocess
9 |    :members:
10 | 
11 | ZeroMQ Worker
12 | =============
13 | 
14 | .. automodule:: spyder.core.worker
15 |    :members:
16 | 
--------------------------------------------------------------------------------
/docs-source/crawler-design.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: globals.rst
4 | .. _seccrawlerdesign:
5 | 
6 | Crawler Design
7 | ==============
8 | 
9 | The basic crawler design is simple and straightforward. You have a *Master*
10 | that collects the |urls| that should be crawled and a number of *Worker* threads
11 | (or processes) that download the content and extract new links from it. In
12 | practice though there are a number of pitfalls you have to keep an eye on. Just
13 | to give one example: you really don't want to excessively crawl **one** host as
14 | you might be doing a *Denial of Service* attack given enough workers. And even
15 | if the host survives, the site owner might not like you from now on.
16 | 
17 | Some Science
18 | ------------
19 | 
20 | Ok, really only a little bit. Basically there are two papers describing
21 | effective crawler designs. The *Mercator* paper (`Mercator: A Scalable,
22 | Extensible Web Crawler (1999)
23 | `_)
24 | describes the architecture of the *Mercator* crawler. The crawler is split into
25 | several parts:
26 | 
27 | * *Frontier* for keeping track of |urls|
28 | * *Scheduler* for scheduling the |urls| to be crawled
29 | * *Downloader* for really downloading the content
30 | * *Link Extractors* for extracting new links from different kinds of content
31 | * *Unique Filter* for filtering known |urls| from the extracted ones
32 | * *Host Splitter* for working with multiple *Frontiers*
33 | 
34 | The second important paper on crawler design is the *Ubi Crawler* (`UbiCrawler:
35 | a scalable fully distributed Web crawler (2003)
36 | `_). In
37 | this paper the authors use a *Consistent Hashing* algorithm for splitting the
38 | hosts among several *Frontiers*.
39 | 
40 | The |spyder| is designed on the basis of these two papers.
41 | 
42 | References
43 | ==========
44 | 
45 | The |spyder| is not only inspired by these two papers but also by `Heritrix
46 | `_, the *Internet Archive's* open source crawler.
47 | *Heritrix* is designed just like *Mercator* except it lacks something like a
48 | *Host Splitter* that allows one to crawl using more than one *Frontier*.
49 | Additionally *Heritrix* does not provide any kind of *monitoring* or
50 | *revisiting* strategy, although this might be possible in Version *H3*.
51 | 
--------------------------------------------------------------------------------
/docs-source/getting-started.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: globals.rst
4 | 
5 | .. _secgettingstarted:
6 | 
7 | Getting Started
8 | ===============
9 | 
10 | *Spyder* is just a library for creating web crawlers. In order to really crawl
11 | content, you first have to create a *Spyder* skeleton:
12 | 
13 | .. code-block:: bash
14 | 
15 |     $ mkdir my-crawler && cd my-crawler
16 |     $ spyder start
17 |     $ ls
18 |     log logging.conf master.py settings.py sink.py spyder-ctrl.py
19 | 
20 | This will copy the skeleton into `my-crawler`. The main file is `settings.py`.
21 | In it, you can configure the logging level for **Masters** and **Workers** and
22 | define the **crawl scope**. In `master.py` you should manipulate the starting
23 | URLs and add your specific `sink.py` into the **Frontier**. `spyder-ctrl.py` is
24 | just a small control script that helps you start the **Log Sink**, **Master** and
25 | **Worker**.
26 | 
27 | In the skeleton everything is set up as if you wanted to crawl sailing-related
28 | pages from **DMOZ**. That should give you a starting point for your own
29 | crawler.
30 | 
31 | So, once you have written your sink and configured everything correctly, it's
32 | time to start crawling. First, on one of your nodes you start the logsink:
33 | 
34 | .. code-block:: bash
35 | 
36 |     $ spyder-ctrl.py logsink &
37 | 
38 | Then, on one node (e.g. the same one as the logsink), you start the **Master**:
39 | 
40 | .. code-block:: bash
41 | 
42 |     $ spyder-ctrl.py master &
43 | 
44 | Finally you can start as many **Workers** as you want:
45 | 
46 | .. code-block:: bash
47 | 
48 |     $ spyder-ctrl.py worker &
49 |     $ spyder-ctrl.py worker &
50 |     $ spyder-ctrl.py worker &
51 | 
52 | Here we start three workers since this is a powerful node with a quad core CPU.
53 | 
54 | 
55 | Scaling the Crawl
56 | -----------------
57 | 
58 | With the default settings it is not possible to start workers on different
59 | nodes. Most of the time one node is powerful enough to crawl quite an amount of
60 | data. But there are times when you simply want to crawl using *many* nodes. This
61 | can be done by configuring the **ZeroMQ** transports to something like
62 | 
63 | 
64 | .. code-block:: python
65 | 
66 |     ZEROMQ_MASTER_PUSH = "tcp://NodeA:5005"
67 |     ZEROMQ_MASTER_SUB = "tcp://NodeA:5007"
68 | 
69 |     ZEROMQ_MGMT_MASTER = "tcp://NodeA:5008"
70 |     ZEROMQ_MGMT_WORKER = "tcp://NodeA:5009"
71 | 
72 |     ZEROMQ_LOGGING = "tcp://NodeA:5010"
73 | 
74 | Basically we have set up a 2-node crawl cluster. **NodeA** acts as logging sink
75 | and controls the crawl via the **Master**. **NodeB** is a pure **Worker** node.
76 | Only the **Master** actually *binds* **ZeroMQ** sockets; the **Workers** always
77 | *connect* to them, so the **Master** does not have to know where the
78 | **Workers** are really running.
79 | 
80 | 
81 | From here
82 | ---------
83 | 
84 | There is plenty of room for improvement and development ahead. Everything will
85 | be handled by Github tickets from now on and, if there is interest, we may set
86 | up a Google Group.
87 | 
--------------------------------------------------------------------------------
/docs-source/globals.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. |zmq| replace:: *ZeroMQ*
4 | .. |spyder| replace:: **Spyder**
5 | .. |pushpull| replace:: *PUSH/PULL*
6 | .. |pubsub| replace:: *PUB/SUB*
7 | .. |url| replace:: *URL*
8 | .. |urls| replace:: *URLs*
9 | .. |tornado| replace:: *Tornado*
--------------------------------------------------------------------------------
/docs-source/index.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: globals.rst
4 | 
5 | Welcome to |spyder|
6 | ===================
7 | 
8 | |spyder| is a scalable web-spider written in Python using the non-blocking
9 | |tornado| library and |zmq| as messaging layer. The messages are serialized
10 | using *Thrift*.
11 | 
12 | The architecture is very basic: a **Master** process contains the crawl
13 | **Frontier** that organises the |urls| that need to be crawled; several
14 | **Worker** processes actually download the content and extract new |urls| that
15 | should be crawled in the future. For storing the content you may attach a
16 | **Sink** to the **Master** and be informed about the interesting events for an
17 | |url|.
18 | 
19 | Table of Contents
20 | =================
21 | 
22 | .. toctree::
23 |    :maxdepth: 2
24 | 
25 |    release-notes
26 |    getting-started
27 |    crawler-design
28 |    libraries
29 |    api/spyderapi
30 |    roadmap
31 | 
32 | Indices and tables
33 | ==================
34 | 
35 | * :ref:`genindex`
36 | * :ref:`modindex`
37 | * :ref:`search`
38 | 
39 | 
--------------------------------------------------------------------------------
/docs-source/libraries.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: globals.rst
4 | 
5 | Libraries used in |spyder|
6 | ==========================
7 | 
8 | .. _seczmq:
9 | 
10 | ZeroMQ
11 | ------
12 | 
13 | With the emergence of multicore systems, Python's `Global Interpreter
14 | Lock `_ has become a major issue for scaling
15 | across cores. Libraries like `multiprocess `_ try to
16 | circumvent the `GIL` by forking child processes and establishing a messaging
17 | layer between them. This enables Python programmers to scale with the number of
18 | available cores but scaling across node boundaries is not possible using plain
19 | `multiprocess`.
20 | 
21 | At this point `ZeroMQ `_ comes to the rescue. As the name
22 | suggests, |zmq| is a message queue. But, unlike other more famous queues like
23 | `AMQP` or more lightweight ones like `STOMP` or `XMPP`, |zmq| does not need a
24 | global broker (that might act as *single point of failure*). It is instead a
25 | little bit of code around the plain *socket* interface that adds simple
26 | messaging patterns to them (it's like *sockets on steroids*).
27 | 
28 | The beauty of |zmq| lies in its simplicity. The programmer basically defines
29 | a *socket* to which one side **binds** and the other **connects** and a
30 | messaging pattern with which both sides communicate with each other. Once this
31 | is established, scaling across cores/nodes/data centers is as simple as pie.
32 | Four types of *sockets* are supported by |zmq|:
33 | 
34 | 1. `inproc` sockets can be used for **intra-process** communication (between
35 |    threads, e.g.)
36 | 
37 | 2. `ipc` sockets can be used for **inter-process** communication between
38 |    different processes *on the same node*.
39 | 
40 | 3. `tcp` sockets can be used for **inter-process** communication between
41 |    different processes *on different nodes*.
42 | 
43 | 4. `pgm` sockets can be used for **inter-process** communication between one and
44 |    many other processes *on many other nodes*.
45 | 
46 | So by simply changing the socket type from `ipc` to `tcp` the application can
47 | scale across node boundaries transparently for the programmer, i.e. by **not
48 | changing a single line of code**. Awesome!
49 | 
50 | This leaves us with the different messaging patterns. |zmq| supports all well
51 | known (at least to me) messaging patterns. The first one that comes to mind is
52 | of course the `PUB/SUB` pattern that allows one publisher to send messages to
53 | many subscribers. The `PUSH/PULL` pattern allows one master to send messages to
54 | only one of the available clients (the common producer/consumer pattern). With
55 | `REQ/REP` a simple request and response pattern is possible. Most of the
56 | patterns have a `non-blocking` equivalent.
57 | 
58 | 
59 | Messaging Patterns used in |spyder|
60 | +++++++++++++++++++++++++++++++++++
61 | 
62 | |zmq| is used as messaging layer to distribute the workload to an arbitrary
63 | number of worker processes which in turn send the result back to the master.
64 | In the context of |spyder| the master process controls the |urls| that should be
65 | crawled and sends them to the worker processes when they are due. One of the
66 | worker processes then downloads the content and possibly extracts new links from
67 | it. When finished it sends the result back to the master.
68 | 
69 | We do not use the `REQ/REP` pattern as it does not scale as easily as we need
70 | since we have to keep track of whom we sent the |url| to and we would have to do
71 | the load balancing ourselves.
72 | 
73 | Instead with the |pushpull| pattern we get the load balancing as a nice little
74 | gift. It comes with a *fair distribution policy* that simply distributes the
75 | messages to all workers in a *round-robin* way. In order to send the results
76 | back to the master we will use the |pubsub| pattern where the *publisher* is the
77 | worker process and the *subscriber* is the master process.
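
To make this concrete, here is a minimal standalone sketch of the pattern in
plain *pyzmq*. It is not |spyder| code: the ports are borrowed from the example
in :ref:`secgettingstarted`, the identity string is made up, and in a real
deployment the two halves would of course run in separate processes:

.. code-block:: python

    import zmq

    ctx = zmq.Context()

    # master: bind a PUSH socket for work and a SUB socket for results,
    # filtered on the master's own identity
    master_push = ctx.socket(zmq.PUSH)
    master_push.bind("tcp://*:5005")
    master_sub = ctx.socket(zmq.SUB)
    master_sub.bind("tcp://*:5007")
    master_sub.setsockopt(zmq.SUBSCRIBE, "master-1")

    # worker: connect a PULL socket for work and a PUB socket for results
    worker_pull = ctx.socket(zmq.PULL)
    worker_pull.connect("tcp://localhost:5005")
    worker_pub = ctx.socket(zmq.PUB)
    worker_pub.connect("tcp://localhost:5007")

    # the master pushes (identity, work) to exactly one worker ...
    master_push.send_multipart(["master-1", "http://www.dmoz.org/"])
    identity, url = worker_pull.recv_multipart()

    # ... and the worker publishes the result under the master's identity,
    # so only the master that sent the work receives the answer
    worker_pub.send_multipart([identity, "result for " + url])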
78 | 
79 | 
80 | 
81 | Users familiar with |zmq| might already have noted that this messaging setup is
82 | shamelessly *adapted* from `Mongrel2 `_. In the case of
83 | a *Web Server* as well as for a crawler this is a perfect fit as it helps you to
84 | scale **very** easily.
85 | 
86 | .. note:: There is another way to do this type of message pattern using
87 |    *XREQ/XREP*. Transition to this pattern is planned for the near future.
88 | 
89 | For a crawler there are two parts that we possibly want to scale: the worker
90 | *and* the master. While scaling the worker across several processes is somewhat
91 | obvious, scaling the master first seems to be of no relevance. But if you want
92 | to crawl large portions of the web (all German Internet pages, e.g.), you might
93 | experience difficulties as these are not only **many** |urls| but also **many**
94 | hosts you possibly want to connect to. While the number of |urls| might not be
95 | the limiting part, the number of hosts can be as they require a lot of queue
96 | switching.
97 | 
98 | .. note:: For more info on this, see the :ref:`seccrawlerdesign` document.
99 | 
100 | 
101 | What does all that mean in practice
102 | +++++++++++++++++++++++++++++++++++
103 | 
104 | The master process binds to one socket with a `PUSH` type and to another socket
105 | using the `SUB` type. On the `SUB` socket the master registers a |zmq| filter to
106 | only receive messages with a certain *topic*: its identity.
107 | 
108 | The worker connects to the `PUSH` socket using a `PULL` type socket and
109 | receives the |urls| from the master containing the master's identity. When the
110 | |url| has been processed it sends the result back to the master using the `PUB`
111 | socket that it has connected to the master's `SUB` socket. By setting the
112 | message's topic to the identity of the sending master, it is ensured that only
113 | the master process that sent this |url| receives the answer.
114 | 
115 | Future versions of |spyder| will thus be able to work with **n** master and **m**
116 | worker processes.
117 | 
118 | 
119 | .. _sectornado:
120 | 
121 | |tornado|
122 | ---------
123 | 
124 | `Tornado `_ is a *non-blocking* or *evented
125 | IO* library developed at FriendFeed (now Facebook) to run their Python front-end
126 | servers. Basically this is a
127 | 
128 | .. code-block:: python
129 | 
130 |     while True:
131 |         callback_for_event(event)
132 | 
133 | loop. The events are any *read* or *write* event on a number of sockets or files
134 | that are registered with the loop. So instead of starting one thread for each
135 | socket connection everything runs in one thread or even process. Although this
136 | might feel strange it has been shown to be **a lot** faster for network-intensive
137 | applications that potentially serve a large number of clients.
138 | 
139 | .. note:: For more info see the `C10k Problem `_
140 | 
141 | 
142 | An additional reason for choosing |tornado| was the nice integration with |zmq|.
143 | This not only makes programming with |zmq| easier but also makes it possible to
144 | easily write *non-blocking, evented* IO programs with Python and |zmq|.
--------------------------------------------------------------------------------
/docs-source/release-notes.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: globals.rst
4 | .. _secrelnotes:
5 | 
6 | Release Notes
7 | =============
8 | 
9 | Version 0.1
10 | -----------
11 | 
12 | This is the first release of the |spyder|, so I will only cover the known
13 | issues here.
14 | 
15 | Changes
16 | +++++++
17 | 
18 | * Initial Release with a working *master* and *worker* implementation
19 | 
20 | Known Issues
21 | ++++++++++++
22 | 
23 | * If a *worker* crashes or is being stopped, the URLs it is currently processing
24 |   might get lost in the *master* and never be crawled. There are several
25 |   precautions in place to track this problem in the future, but right now it is
26 |   a bug that might also result in a memory leak.
27 | 
--------------------------------------------------------------------------------
/docs-source/roadmap.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: globals.rst
4 | 
5 | Roadmap
6 | =======
7 | 
8 | Version 0.3
9 | +++++++++++
10 | 
11 | - Integration with `Supervisord`
12 | 
13 |   The current way of starting |spyder| is quite painful. Using `supervisord`
14 |   I want to start the master and worker processes automatically and, in case
15 |   of failures, have them restarted automatically.
16 | 
--------------------------------------------------------------------------------
/local.cfg.template:
--------------------------------------------------------------------------------
1 | [local]
2 | parts =
3 |     ${zmq:sharedzmq}
4 | #    ${zmq:localzmq}
5 | 
6 | #[environment]
7 | #ZMQ_DIR = /usr/local
8 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2008 Daniel Truemper truemped@googlemail.com
3 | #
4 | # setup.py 04-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | #
19 | #
20 | 
21 | from setuptools import setup, find_packages
22 | import re
23 | 
24 | __version__ = re.search( "__version__\s*=\s*'(.*)'", open('src/spyder/__init__.py').read(), re.M).group(1)
25 | assert __version__
26 | 
27 | long_description = open("README.rst").read()
28 | assert long_description
29 | 
30 | tests_require = ['coverage>=3.4', 'nose==1.1.2']
31 | 
32 | setup(
33 |     name = "spyder",
34 |     version = __version__,
35 |     description = "A python spider",
36 |     long_description = long_description,
37 |     author = "Daniel Truemper",
38 |     author_email = "truemped@googlemail.com",
39 |     url = "",
40 |     license = "Apache 2.0",
41 |     package_dir = { '' : 'src' },
42 |     packages = find_packages('src'),
43 |     include_package_data = True,
44 |     test_suite = 'nose.collector',
45 |     install_requires = [
46 |         'pyzmq>=2.0.10',
47 |         'tornado>=1.1',
48 |         'thrift>=0.5.0',
49 |         'pycurl>=7.19.0',
50 |         'pytz>=2010o',
51 |         'brownie>=0.4.1',
52 |     ],
53 |     tests_require = tests_require,
54 |     extras_require = {'test': tests_require},
55 |     entry_points = {
56 |         'console_scripts' : [
57 |             'spyder = spyder:spyder_admin_main',
58 |         ]
59 |     },
60 |     classifiers = [
61 |         'Intended Audience :: Developers',
62 |         'Development Status :: 3 - Alpha',
63 |         'Intended Audience :: Information Technology',
64 |         'License :: OSI Approved :: Apache Software License',
65 |         'Operating System :: POSIX :: Linux',
66 |         'Programming Language :: Python :: 2.6',
67 |         'Topic :: Internet :: WWW/HTTP',
68 |         'Topic :: Internet :: WWW/HTTP :: Indexing/Search',
69 |     ]
70 | )
71 | 
--------------------------------------------------------------------------------
/src/spyder/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # __init__.py 07-Jan-2011
3 | #
4 | """
5 | The Spyder.
6 | """
7 | 
8 | import os
9 | import shutil
10 | import stat
11 | import sys
12 | 
13 | from spyder.core.settings import Settings
14 | from spyder.thrift.gen.ttypes import CrawlUri
15 | 
16 | 
17 | __version__ = '0.2.0-dev'
18 | 
19 | 
20 | def copy_skeleton_dir(destination):
21 |     """
22 |     Copy the skeleton directory (spyder_template) to a new directory.
23 |     """
24 |     if not os.path.exists(destination):
25 |         os.makedirs(destination)
26 |     template_dir = os.path.join(__path__[0], 'spyder_template')
27 |     wanted_files = [".keep", "logging.conf"]
28 | 
29 |     for root, subdirs, files in os.walk(template_dir):
30 |         relative = root[len(template_dir) + 1:]
31 |         if relative:
32 |             os.mkdir(os.path.join(destination, relative))
33 | 
34 |         # prune hidden directories in place so os.walk does not descend into
35 |         # them (removing items while iterating the list would skip entries)
36 |         subdirs[:] = [d for d in subdirs if not d.startswith('.')]
37 | 
38 |         for filename in files:
39 |             if (not filename.endswith('.py') and \
40 |                 filename not in wanted_files) or \
41 |                 filename == "__init__.py":
42 | 
43 |                 continue
44 | 
45 |             path_old = os.path.join(root, filename)
46 |             path_new = os.path.join(destination, relative, filename)
47 |             fp_old = open(path_old, 'r')
48 |             fp_new = open(path_new, 'w')
49 |             fp_new.write(fp_old.read())
50 |             fp_old.close()
51 |             fp_new.close()
52 | 
53 |             try:
54 |                 shutil.copymode(path_old, path_new)
55 |                 if sys.platform.startswith('java'):
56 |                     # On Jython there is no os.access(); skip the fixup
57 |                     continue
58 |                 if not os.access(path_new, os.W_OK):
59 |                     st_new = os.stat(path_new)
60 |                     new_perm = stat.S_IMODE(st_new.st_mode) | stat.S_IWUSR
61 |                     os.chmod(path_new, new_perm)
62 |             except OSError:
63 |                 sys.stderr.write("Could not set permission bits on %s\n" %
64 |                     path_new)
65 | 
66 | 
67 | def spyder_admin_main():
68 |     """
69 |     Method for creating new environments for Spyders.
70 |     """
71 |     if len(sys.argv) != 2 or "start" != sys.argv[1]:
72 |         sys.stderr.write(
73 |             """Usage: 'spyder start'
74 | to start a new spyder in the current directory\n""")
75 |         sys.exit(1)
76 | 
77 |     copy_skeleton_dir(os.getcwd())
78 | 
79 | 
80 | def spyder_management(settings):
81 |     """
82 |     Start new master/worker/logsink processes.
83 |     """
84 | 
85 |     from spyder import logsink
86 |     import spyder.workerprocess as worker
87 |     import spyder.masterprocess as master
88 | 
89 |     effective_settings = Settings(settings)
90 | 
91 |     args = [a.lower() for a in sys.argv]
92 | 
93 |     if "master" in args:
94 |         args.remove("master")
95 |         master.main(effective_settings)
96 |     elif "worker" in args:
97 |         worker.main(effective_settings)
98 |     elif "logsink" in args:
99 |         logsink.main(effective_settings)
100 |     else:
101 |         print >> sys.stderr, """Usage: spyder-ctrl [master|worker|logsink]
102 | 
103 | 'master'\t\tstart a master process.
104 | 'worker'\t\tstart a worker process.
105 | 'logsink'\t\tstart a sink for logmessages.
106 | """
107 |         sys.exit(1)
108 | 
--------------------------------------------------------------------------------
/src/spyder/core/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # __init__.py 10-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | """
19 | Core modules used in the spyder.
20 | """
21 | 
--------------------------------------------------------------------------------
/src/spyder/core/constants.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # constants.py 10-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | """
19 | Several constants mainly for ZeroMQ topics and messages.
20 | """
21 | 
22 | # general topic for spyder related management tasks
23 | ZMQ_SPYDER_MGMT = 'spyder.'
24 | 
25 | ZMQ_SPYDER_MGMT_WORKER = ZMQ_SPYDER_MGMT + 'worker.'
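# payloads exchanged on the worker management topic while workers join the
# crawl (AVAIL), are told to shut down (QUIT) and confirm the shutdown
# (QUIT_ACK)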
26 | ZMQ_SPYDER_MGMT_WORKER_AVAIL = 'be here now'.encode()
27 | ZMQ_SPYDER_MGMT_WORKER_QUIT = 'quit'.encode()
28 | ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK = 'quit.ack'.encode()
29 | 
30 | # constants used in the optional_vars map of CrawlUris
31 | CURI_OPTIONAL_TRUE = "1".encode()
32 | CURI_OPTIONAL_FALSE = "0".encode()
33 | 
34 | # username and password fields
35 | CURI_SITE_USERNAME = "username".encode()
36 | CURI_SITE_PASSWORD = "password".encode()
37 | 
38 | # extraction finished field
39 | CURI_EXTRACTION_FINISHED = "extraction_finished".encode()
40 | 
41 | # extracted urls field
42 | CURI_EXTRACTED_URLS = "extracted_urls".encode()
43 | 
44 | # Some internal error states
45 | CURI_EUNCAUGHT_EXCEPTION = 710
46 | 
--------------------------------------------------------------------------------
/src/spyder/core/dnscache.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # dnscache.py 24-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | """
19 | A very simple dns cache.
20 | 
21 | Currently DNS resolution is blocking, but this should eventually get a
22 | non-blocking version.
23 | """
24 | 
25 | import socket
26 | 
27 | from brownie.caching import LRUCache as LRUDict
28 | 
29 | 
30 | class DnsCache(object):
31 |     """
32 |     This is a least recently used cache for hostname to IP address mappings.
33 |     If the cache has reached its maximum size, the least recently used key is
34 |     removed and a new DNS lookup is made.
35 | 
36 |     In addition you may add static mappings via the
37 |     ``settings.STATIC_DNS_MAPPINGS`` dict.
38 |     """
39 | 
40 |     def __init__(self, settings):
41 |         """
42 |         Initialize the lru cache and the static mappings.
43 |         """
44 |         self._cache = LRUDict(maxsize=settings.SIZE_DNS_CACHE)
45 |         self._static_cache = dict()
46 |         self._static_cache.update(settings.STATIC_DNS_MAPPINGS)
47 | 
48 |     def __getitem__(self, host_port_string):
49 |         """
50 |         Retrieve the item from the cache or resolve the hostname and store the
51 |         result in the cache.
52 | 
53 |         Returns a tuple of `(ip, port)`. At the moment we only support IPv4 but
54 |         this will probably change in the future.
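
        A short usage sketch (hostname and settings are illustrative)::

            cache = DnsCache(settings)
            (ip, port) = cache["www.example.com:80"]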
55 | """ 56 | if host_port_string in self._static_cache.keys(): 57 | return self._static_cache[host_port_string] 58 | 59 | if host_port_string not in self._cache: 60 | (hostname, port) = host_port_string.split(":") 61 | infos = socket.getaddrinfo(hostname, port, 0, 0, socket.SOL_TCP) 62 | for (_family, _socktype, _proto, _canoname, sockaddr) in infos: 63 | if len(sockaddr) == 2: 64 | # IPv4 (which we prefer) 65 | self._cache[host_port_string] = sockaddr 66 | 67 | return self._cache[host_port_string] 68 | -------------------------------------------------------------------------------- /src/spyder/core/log.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # logging.py 04-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A simple pyzmq logging mixin. 20 | """ 21 | 22 | import logging 23 | 24 | 25 | class LoggingMixin: 26 | """ 27 | Simple mixin for adding logging methods to a class. 28 | """ 29 | 30 | def __init__(self, pub_handler, log_level): 31 | """ 32 | Initialize the logger. 33 | """ 34 | self._logger = logging.getLogger() 35 | self._logger.addHandler(pub_handler) 36 | self._logger.setLevel(log_level) 37 | -------------------------------------------------------------------------------- /src/spyder/core/messages.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # messages.py 14-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Definitions of messages that are being sent via ZeroMQ Sockets. 20 | 21 | Plus some (de-)serialization helpers. 22 | """ 23 | from thrift import TSerialization 24 | 25 | from spyder.thrift.gen.ttypes import CrawlUri 26 | 27 | 28 | class DataMessage(object): 29 | """ 30 | Envelope class describing `data` messages. 31 | """ 32 | 33 | def __init__(self, message=None, identity=None, curi=None): 34 | """ 35 | Construct a new message. 
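
        In the first form, `message` is a raw multipart message as read from a
        ZeroMQ stream: `message[0]` carries the identity of the sending master
        and `message[1]` the Thrift-serialized `CrawlUri`.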
36 |         """
37 |         if message is not None:
38 |             self.identity = message[0]
39 |             self.serialized_curi = message[1]
40 |             self.curi = deserialize_crawl_uri(message[1])
41 |         elif identity is not None or curi is not None:
42 |             self.identity = identity
43 |             self.curi = curi
44 |         else:
45 |             self.identity = self.curi = None
46 | 
47 |     def serialize(self):
48 |         """
49 |         Return a new message envelope from the class members.
50 |         """
51 |         return [self.identity, serialize_crawl_uri(self.curi)]
52 | 
53 |     def __eq__(self, other):
54 |         return (self.identity == other.identity
55 |             and self.curi == other.curi)
56 | 
57 | 
58 | class MgmtMessage(object):
59 |     """
60 |     Envelope class describing `management` messages.
61 |     """
62 | 
63 |     def __init__(self, message=None, topic=None, identity=None, data=None):
64 |         """
65 |         Construct a new message and if given parse the serialized message.
66 |         """
67 |         if message is not None:
68 |             self.topic = message[0]
69 |             self.identity = message[1]
70 |             self.data = message[2]
71 |         elif topic is not None or identity is not None or data is not None:
72 |             self.topic = topic
73 |             self.identity = identity
74 |             self.data = data
75 |         else:
76 |             self.topic = self.identity = self.data = None
77 | 
78 |     def serialize(self):
79 |         """
80 |         Return a new message envelope from the class members.
81 |         """
82 |         return [self.topic, self.identity, self.data]
83 | 
84 |     def __eq__(self, other):
85 |         return (self.topic == other.topic
86 |             and self.identity == other.identity
87 |             and self.data == other.data)
88 | 
89 | 
90 | def deserialize_crawl_uri(serialized):
91 |     """
92 |     Deserialize a `CrawlUri` that has been serialized using Thrift.
93 |     """
94 |     return TSerialization.deserialize(CrawlUri(), serialized)
95 | 
96 | 
97 | def serialize_crawl_uri(crawl_uri):
98 |     """
99 |     Serialize a `CrawlUri` using Thrift.
100 |     """
101 |     return TSerialization.serialize(crawl_uri)
102 | 
--------------------------------------------------------------------------------
/src/spyder/core/mgmt.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # mgmt.py 10-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | """
19 | A management module for managing components via ZeroMQ.
20 | """
21 | 
22 | from zmq.eventloop.ioloop import IOLoop
23 | from zmq.eventloop.zmqstream import ZMQStream
24 | 
25 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER
26 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT
27 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK
28 | from spyder.core.messages import MgmtMessage
29 | 
30 | 
31 | class ZmqMgmt(object):
32 |     """
33 |     A class handling the management sockets via :class:`ZMQStream` objects.
34 |     """
35 | 
36 |     def __init__(self, subscriber, publisher, **kwargs):
37 |         """
38 |         Initialize the management interface.
39 | 
40 |         The `subscriber` socket is the socket used by the Master to send
41 |         commands to the workers. The `publisher` socket is used to send
42 |         answers (e.g. the quit acknowledgement) back to the Master.
43 | 
44 |         You have to set the `zmq.SUBSCRIBE` socket option yourself!
45 |         """
46 |         self._io_loop = kwargs.get('io_loop', IOLoop.instance())
47 | 
48 |         self._subscriber = subscriber
49 |         self._in_stream = ZMQStream(self._subscriber, self._io_loop)
50 | 
51 |         self._publisher = publisher
52 |         self._out_stream = ZMQStream(self._publisher, self._io_loop)
53 | 
54 |         self._callbacks = dict()
55 | 
56 |     def _receive(self, raw_msg):
57 |         """
58 |         Main method for receiving management messages.
59 | 
60 |         `raw_msg` is a multipart message where `raw_msg[0]` contains the
61 |         topic, `raw_msg[1]` the sender's identity and `raw_msg[2]` the actual
62 |         message data.
63 |         """
64 |         msg = MgmtMessage(raw_msg)
65 | 
66 |         if msg.topic in self._callbacks:
67 |             for callback in self._callbacks[msg.topic]:
68 |                 if callable(callback):
69 |                     callback(msg)
70 | 
71 |         if ZMQ_SPYDER_MGMT_WORKER_QUIT == msg.data:
72 |             self.stop()
73 | 
74 |     def start(self):
75 |         """
76 |         Start the MGMT interface.
77 |         """
78 |         self._in_stream.on_recv(self._receive)
79 | 
80 |     def stop(self):
81 |         """
82 |         Stop the MGMT interface.
83 |         """
84 |         self._in_stream.stop_on_recv()
85 |         self.publish(topic=ZMQ_SPYDER_MGMT_WORKER, identity=None,
86 |             data=ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK)
87 | 
88 |     def close(self):
89 |         """
90 |         Close all open sockets.
91 |         """
92 |         self._in_stream.close()
93 |         self._subscriber.close()
94 |         self._out_stream.close()
95 |         self._publisher.close()
96 | 
97 |     def add_callback(self, topic, callback):
98 |         """
99 |         Add a callback to the specified topic.
100 |         """
101 |         if not callable(callback):
102 |             raise ValueError('callback must be callable')
103 | 
104 |         if topic not in self._callbacks:
105 |             self._callbacks[topic] = []
106 | 
107 |         self._callbacks[topic].append(callback)
108 | 
109 |     def remove_callback(self, topic, callback):
110 |         """
111 |         Remove a callback from the specified topic.
112 |         """
113 |         if topic in self._callbacks and callback in self._callbacks[topic]:
114 |             self._callbacks[topic].remove(callback)
115 | 
116 |     def publish(self, topic=None, identity=None, data=None):
117 |         """
118 |         Publish a message to the intended audience.
119 |         """
120 |         assert topic is not None
121 |         assert data is not None
122 |         msg = MgmtMessage(topic=topic, identity=identity, data=data)
123 |         self._out_stream.send_multipart(msg.serialize())
124 | 
--------------------------------------------------------------------------------
/src/spyder/core/prioritizer.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # prioritizer.py 01-Feb-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | """
19 | URL prioritizers will calculate priorities of new URLs and the recrawling
20 | priority.
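
For example, assuming the crawl delta is configured as one day, a page that
keeps answering `304 Not Modified` is rescheduled after one day, then two days,
and so on, up to the configured maximum priority level.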
21 | """ 22 | 23 | 24 | class SimpleTimestampPrioritizer(object): 25 | """ 26 | A simple prioritizer where the priority is based on the timestamp of the 27 | next scheduled crawl of the URL. 28 | """ 29 | 30 | def __init__(self, settings): 31 | """ 32 | Initialize the number of available priorities and the priority delta 33 | between the priorities. 34 | """ 35 | self._priorities = settings.PRIORITIZER_NUM_PRIORITIES 36 | self._default_priority = settings.PRIORITIZER_DEFAULT_PRIORITY 37 | self._delta = settings.PRIORITIZER_CRAWL_DELTA 38 | 39 | def calculate_priority(self, curi): 40 | """ 41 | Calculate the new priority based on the :class:`CrawlUri`'s current 42 | priority and HTTP status code. 43 | 44 | This should return a tuple of ``(prio_level, prio)``. 45 | """ 46 | if curi.current_priority and curi.status_code == 304: 47 | prio_level = min(curi.current_priority + 1, self._priorities) 48 | else: 49 | prio_level = 1 50 | prio = self._delta * prio_level 51 | return (prio_level, prio) 52 | -------------------------------------------------------------------------------- /src/spyder/core/queueassignment.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # queueassignment.py 14-Mar-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A collection of queue assignment classes. 20 | """ 21 | from urlparse import urlparse 22 | 23 | from spyder.core.frontier import PROTOCOLS_DEFAULT_PORT 24 | 25 | 26 | class HostBasedQueueAssignment(object): 27 | """ 28 | This class will assign URLs to queues based on the hostnames. 29 | """ 30 | 31 | def __init__(self, dnscache): 32 | """ 33 | Initialize the assignment class. 34 | """ 35 | self._dns_cache = dnscache 36 | 37 | def get_identifier(self, url): 38 | """ 39 | Get the identifier for this url. 40 | """ 41 | parsed_url = urlparse(url) 42 | return parsed_url.hostname 43 | 44 | 45 | class IpBasedQueueAssignment(HostBasedQueueAssignment): 46 | """ 47 | This class will assign urls to queues based on the server's IP address. 48 | """ 49 | 50 | def __init__(self, dnscache): 51 | """ 52 | Call the parent only. 53 | """ 54 | HostBasedQueueAssignment.__init__(self, dnscache) 55 | 56 | def get_identifier(self, url): 57 | """ 58 | Get the identifier for this url.
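For example, ``http://example.org/foo`` maps to the cached IP address of ``example.org`` (port 80 is assumed for plain ``http``), so all hosts served from the same machine end up sharing one queue.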
59 | """ 60 | parsed_url = urlparse(url) 61 | 62 | # dns resolution and caching 63 | port = parsed_url.port 64 | if not port: 65 | port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme] 66 | 67 | (ip, port) = self._dns_cache["%s:%s" % (parsed_url.hostname, port)] 68 | 69 | return "%s" % (ip,) 70 | -------------------------------------------------------------------------------- /src/spyder/core/queueselector.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # queueselector.py 25-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A random queue selector. 20 | 21 | Based on the number of queues (i.e. `FrontEnd queues`) return the number of a 22 | queue, with a bias towards lower numbered queues. 23 | """ 24 | 25 | import random 26 | 27 | 28 | class BiasedQueueSelector(object): 29 | """ 30 | The default queue selector based on random selection with a bias towards 31 | lower numbered queues. 32 | """ 33 | 34 | def __init__(self, number_of_queues): 35 | """ 36 | Initialize the queue selector with the number of available queues. 37 | """ 38 | self._weights = [] 39 | self._sum_weights = 0 40 | self._enumerate_weights = [] 41 | self.reset_queues(number_of_queues) 42 | 43 | def reset_queues(self, number_of_queues): 44 | """ 45 | Recompute the queue weights for `number_of_queues` queues. 46 | """ 47 | self._weights = [1 / (float(i) * number_of_queues) 48 | for i in range(1, number_of_queues + 1)] 49 | self._sum_weights = sum(self._weights) 50 | self._enumerate_weights = [(i, w) for i, w in enumerate(self._weights)] 51 | 52 | def get_queue(self): 53 | """ 54 | Return the next queue to use. 55 | """ 56 | random_weight = random.random() * self._sum_weights 57 | for (i, weight) in self._enumerate_weights: 58 | random_weight -= weight 59 | if random_weight < 0: 60 | return i 61 | -------------------------------------------------------------------------------- /src/spyder/core/settings.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # settings.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Simple class for working with settings. 20 | 21 | Adapted from the Django based settings system. 22 | """ 23 | 24 | from spyder import defaultsettings 25 | 26 | 27 | class Settings(object): 28 | """ 29 | Class for handling spyder settings.
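A usage sketch (``mysettings`` is a hypothetical module; every attribute written in UPPERCASE overrides the corresponding default)::

    from spyder.core.settings import Settings
    import mysettings

    settings = Settings(settings=mysettings)
    print settings.USER_AGENT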
30 | """ 31 | 32 | def __init__(self, settings=None): 33 | """ 34 | Initialize the settings. 35 | """ 36 | 37 | # load the default settings 38 | for setting in dir(defaultsettings): 39 | if setting == setting.upper(): 40 | setattr(self, setting, getattr(defaultsettings, setting)) 41 | 42 | # now override with user settings 43 | if settings is not None: 44 | for setting in dir(settings): 45 | if setting == setting.upper(): 46 | setattr(self, setting, getattr(settings, setting)) 47 | -------------------------------------------------------------------------------- /src/spyder/core/sink.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # sink.py 02-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A sink of :class:`CrawlUri`. 20 | """ 21 | 22 | 23 | class AbstractCrawlUriSink(object): 24 | """ 25 | Abstract sink. Only overwrite the methods you are interested in. 26 | """ 27 | 28 | def process_successful_crawl(self, curi): 29 | """ 30 | We have crawled a uri successfully. If there are newly extracted links, 31 | add them alongside the original uri to the frontier. 32 | """ 33 | pass 34 | 35 | def process_not_found(self, curi): 36 | """ 37 | The uri we should have crawled was not found, i.e. HTTP Error 404. Do 38 | something with that. 39 | """ 40 | pass 41 | 42 | def process_redirect(self, curi): 43 | """ 44 | There have been too many redirects, i.e. in the default config there 45 | have been more than 3 redirects. 46 | """ 47 | pass 48 | 49 | def process_server_error(self, curi): 50 | """ 51 | There has been a server error, i.e. HTTP Error 50x. Maybe we should try 52 | to crawl this uri again a little bit later. 53 | """ 54 | pass 55 | -------------------------------------------------------------------------------- /src/spyder/core/uri_uniq.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # uri_uniq.py 31-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A simple filter for unique uris. 20 | """ 21 | 22 | import hashlib 23 | 24 | 25 | class UniqueUriFilter(object): 26 | """ 27 | A simple filter for unique uris. This is used to keep the frontier clean. 
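A short usage example (``sha1`` being one of the algorithms :mod:`hashlib` knows about)::

    unique = UniqueUriFilter('sha1')
    unique.is_known('http://example.org/', add_if_unknown=True)  # -> False
    unique.is_known('http://example.org/')  # -> True, seen above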
28 | """ 29 | 30 | def __init__(self, hash_method, depth=3): 31 | """ 32 | Create a new unique uri filter using the specified `hash_method`. 33 | 34 | `depth` is used to determine the number of nested dictionaries to use. 35 | Example: using `depth=2` the dictionary storing all hash values uses the 36 | first 2 hex characters as keys, i.e. if the hash value is `abc` then 37 | 38 | hashes[a][b] = [c,] 39 | 40 | This should reduce the number of lookups within a dictionary. 41 | """ 42 | self._hash = hash_method 43 | self._depth = depth 44 | self._hashes = dict() 45 | 46 | def is_known(self, url, add_if_unknown=False): 47 | """ 48 | Test whether the given `url` is known. If not, store it from now on. 49 | """ 50 | hash_method = hashlib.new(self._hash) 51 | hash_method.update(url) 52 | hash_value = hash_method.hexdigest() 53 | 54 | dictionary = self._hashes 55 | for i in range(0, self._depth): 56 | if hash_value[i] in dictionary: 57 | dictionary = dictionary[hash_value[i]] 58 | else: 59 | # unknown dict, add it now 60 | if i == self._depth - 1: 61 | dictionary[hash_value[i]] = [] 62 | else: 63 | dictionary[hash_value[i]] = dict() 64 | dictionary = dictionary[hash_value[i]] 65 | 66 | # now dictionary is the list at the deepest level 67 | if hash_value[self._depth:] in dictionary: 68 | return True 69 | else: 70 | # since we are still here, the nested list does not 71 | # contain the given rest, so remember it now 72 | if add_if_unknown: 73 | dictionary.append(hash_value[self._depth:]) 74 | return False 75 | -------------------------------------------------------------------------------- /src/spyder/core/worker.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # worker.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | This module contains a ZeroMQ based Worker abstraction. 20 | 21 | The `ZmqWorker` class expects one incoming and one outgoing `zmq.socket` as 22 | well as an instance of the `spyder.core.mgmt.ZmqMgmt` class. 23 | """ 24 | import traceback 25 | 26 | from zmq.eventloop.ioloop import IOLoop 27 | from zmq.eventloop.zmqstream import ZMQStream 28 | 29 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER 30 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT 31 | from spyder.core.log import LoggingMixin 32 | from spyder.core.messages import DataMessage 33 | 34 | 35 | class ZmqWorker(object, LoggingMixin): 36 | """ 37 | This is the ZMQ worker implementation. 38 | 39 | The worker will register a :class:`ZMQStream` with the configured 40 | :class:`zmq.Socket` and :class:`zmq.eventloop.ioloop.IOLoop` instance. 41 | 42 | Upon `ZMQStream.on_recv` the configured `processors` will be executed 43 | with the deserialized context and the result will be published through the 44 | configured `zmq.socket`.
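Any callable that accepts and returns a :class:`CrawlUri` can act as the ``processing`` argument; a do-nothing sketch::

    def noop_processing(curi):
        # inspect or modify the CrawlUri here
        return curi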
45 | """ 46 | 47 | def __init__(self, insocket, outsocket, mgmt, processing, log_handler, 48 | log_level, io_loop=None): 49 | """ 50 | Initialize the `ZMQStream` with the `insocket` and `io_loop` and store 51 | the `outsocket`. 52 | 53 | `insocket` should be of the type `zmq.socket.PULL` `outsocket` should 54 | be of the type `zmq.socket.PUB` 55 | 56 | `mgmt` is an instance of `spyder.core.mgmt.ZmqMgmt` that handles 57 | communication between master and worker processes. 58 | """ 59 | LoggingMixin.__init__(self, log_handler, log_level) 60 | 61 | self._insocket = insocket 62 | self._io_loop = io_loop or IOLoop.instance() 63 | self._outsocket = outsocket 64 | 65 | self._processing = processing 66 | self._mgmt = mgmt 67 | self._in_stream = ZMQStream(self._insocket, self._io_loop) 68 | self._out_stream = ZMQStream(self._outsocket, self._io_loop) 69 | 70 | def _quit(self, msg): 71 | """ 72 | The worker is quitting, stop receiving messages. 73 | """ 74 | if ZMQ_SPYDER_MGMT_WORKER_QUIT == msg.data: 75 | self.stop() 76 | 77 | def _receive(self, msg): 78 | """ 79 | We have a message! 80 | 81 | `msg` is a serialized version of a `DataMessage`. 82 | """ 83 | message = DataMessage(msg) 84 | 85 | try: 86 | # this is the real work we want to do 87 | curi = self._processing(message.curi) 88 | message.curi = curi 89 | except: 90 | # catch any uncaught exception and only log it as CRITICAL 91 | self._logger.critical( 92 | "worker::Uncaught exception executing the worker for URL %s!" % 93 | (message.curi.url,)) 94 | self._logger.critical("worker::%s" % (traceback.format_exc(),)) 95 | 96 | # finished, now send the result back to the master 97 | self._out_stream.send_multipart(message.serialize()) 98 | 99 | def start(self): 100 | """ 101 | Start the worker. 102 | """ 103 | self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self._quit) 104 | self._in_stream.on_recv(self._receive) 105 | 106 | def stop(self): 107 | """ 108 | Stop the worker. 109 | """ 110 | # stop receiving 111 | self._in_stream.stop_on_recv() 112 | self._mgmt.remove_callback(ZMQ_SPYDER_MGMT_WORKER, self._quit) 113 | # but work on anything we might already have 114 | self._in_stream.flush() 115 | self._out_stream.flush() 116 | 117 | def close(self): 118 | """ 119 | Close all open sockets. 120 | """ 121 | self._in_stream.close() 122 | self._insocket.close() 123 | self._out_stream.close() 124 | self._outsocket.close() 125 | 126 | 127 | class AsyncZmqWorker(ZmqWorker): 128 | """ 129 | Asynchronous version of the `ZmqWorker`. 130 | 131 | This worker differs in that the `self._processing` method should have two 132 | arguments: the message and the socket where the result should be sent to! 133 | """ 134 | 135 | def _receive(self, msg): 136 | """ 137 | We have a message! 138 | 139 | Instead of the synchronous version we do not handle serializing and 140 | sending the result to the `self._outsocket`. This has to be handled by 141 | the `self._processing` method. 
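A matching asynchronous processing sketch (``do_work`` is a hypothetical helper; the callable itself serializes and sends the result)::

    def async_processing(message, out_stream):
        message.curi = do_work(message.curi)
        out_stream.send_multipart(message.serialize())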
142 | """ 143 | message = DataMessage(msg) 144 | 145 | try: 146 | self._processing(message, self._out_stream) 147 | except: 148 | # catch any uncaught exception and only log it as CRITICAL 149 | self._logger.critical("Uncaught exception executing the worker!") 150 | self._logger.critical(traceback.format_exc()) 151 | -------------------------------------------------------------------------------- /src/spyder/defaultsettings.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # settings.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Module for the default spyder settings. 20 | """ 21 | import logging 22 | 23 | import pytz 24 | from datetime import timedelta 25 | 26 | 27 | # simple settings 28 | LOG_LEVEL_MASTER = logging.DEBUG 29 | LOG_LEVEL_WORKER = logging.DEBUG 30 | 31 | 32 | # my local timezone 33 | LOCAL_TIMEZONE = pytz.timezone('Europe/Berlin') 34 | 35 | 36 | # Fetch Processor 37 | USER_AGENT = "Mozilla/5.0 (compatible; spyder/0.1; " + \ 38 | "+http://github.com/retresco/spyder)" 39 | MAX_CLIENTS = 10 40 | MAX_SIMULTANEOUS_CONNECTIONS = 1 41 | FOLLOW_REDIRECTS = False 42 | MAX_REDIRECTS = 3 43 | USE_GZIP = True 44 | 45 | # Proxy configuration. Both PROXY_HOST and PROXY_PORT must be set! 46 | # PROXY_USERNAME and PROXY_PASSWORD are optional 47 | PROXY_HOST = None 48 | PROXY_PORT = None 49 | PROXY_USERNAME = '' 50 | PROXY_PASSWORD = '' 51 | 52 | # Timeout settings for requests. See tornado HTTPRequest class for explanation 53 | # defaults to 20.0 (float) 54 | REQUEST_TIMEOUT = 20.0 55 | CONNECT_TIMEOUT = REQUEST_TIMEOUT 56 | 57 | VALIDATE_CERTIFICATES = True 58 | 59 | # 60 | # static dns mappings. Mapping has to be like this: 61 | # "hostname:port" => ("xxx.xxx.xxx.xxx", port) 62 | # 63 | STATIC_DNS_MAPPINGS = dict() 64 | # Size of the DNS Cache. 65 | SIZE_DNS_CACHE = 1000 66 | 67 | 68 | # Callback for Master processes. 
69 | MASTER_CALLBACK = None 70 | # Interval for the periodic updater (surviving times where nothing is to be 71 | # crawled) 72 | MASTER_PERIODIC_UPDATE_INTERVAL = 60 * 1000 73 | 74 | 75 | # Frontier implementation to use 76 | FRONTIER_CLASS = 'spyder.core.frontier.SingleHostFrontier' 77 | # Filename storing the frontier state 78 | FRONTIER_STATE_FILE = "./state.db" 79 | # checkpointing interval (uris added/changed) 80 | FRONTIER_CHECKPOINTING = 1000 81 | # The number of URIs to keep inside the HEAP 82 | FRONTIER_HEAP_SIZE = 500 83 | # Minimum number of URIs in the HEAP 84 | FRONTIER_HEAP_MIN = 100 85 | # Download duration times this factor throttles the spyder 86 | FRONTIER_CRAWL_DELAY_FACTOR = 4 87 | # Minimum delay to wait before connecting the host again (s) 88 | FRONTIER_MIN_DELAY = 5 89 | 90 | # Number of simultaneously active queues 91 | FRONTIER_ACTIVE_QUEUES = 100 92 | # Number of URLs to be processed in one queue before it is put on hold 93 | FRONTIER_QUEUE_BUDGET = 50 94 | # Punishment of server errors with the queue 95 | FRONTIER_QUEUE_BUDGET_PUNISH = 5 96 | 97 | 98 | # Name of the prioritizer class to use 99 | PRIORITIZER_CLASS = 'spyder.core.prioritizer.SimpleTimestampPrioritizer' 100 | # The number of priority levels where URIs are being assigned to (lowest means 101 | # highest priority) 102 | PRIORITIZER_NUM_PRIORITIES = 10 103 | # default priority for new urls 104 | PRIORITIZER_DEFAULT_PRIORITY = 1 105 | # Default crawl delta for known urls 106 | PRIORITIZER_CRAWL_DELTA = timedelta(days=1) 107 | 108 | 109 | # Name of the queue selector to use 110 | QUEUE_SELECTOR_CLASS = 'spyder.core.queueselector.BiasedQueueSelector' 111 | 112 | 113 | # Name of the queue assignment class to use 114 | QUEUE_ASSIGNMENT_CLASS = 'spyder.core.queueassignment.HostBasedQueueAssignment' 115 | 116 | 117 | # The pipeline of link extractors 118 | SPYDER_EXTRACTOR_PIPELINE = [ 119 | 'spyder.processor.limiter.DefaultLimiter', 120 | 'spyder.processor.htmllinkextractor.DefaultHtmlLinkExtractor', 121 | 'spyder.processor.httpextractor.HttpExtractor', 122 | ] 123 | 124 | 125 | # Default HTML Extractor settings 126 | # maximum number of chars an element name may have 127 | REGEX_LINK_XTRACTOR_MAX_ELEMENT_LENGTH = 64 128 | 129 | 130 | # The pipeline of scope processors 131 | SPYDER_SCOPER_PIPELINE = [ 132 | 'spyder.processor.scoper.RegexScoper', 133 | 'spyder.processor.stripsessions.StripSessionIds', 134 | 'spyder.processor.cleanupquery.CleanupQueryString', 135 | ] 136 | 137 | # List of positive regular expressions for the crawl scope 138 | REGEX_SCOPE_POSITIVE = [ 139 | ] 140 | 141 | # List of negative regular expressions for the crawl scope 142 | REGEX_SCOPE_NEGATIVE = [ 143 | ] 144 | 145 | 146 | # List of 404 redirects 147 | HTTP_EXTRACTOR_404_REDIRECT = [ 148 | ] 149 | 150 | 151 | # Whether to remove anchors from extracted urls. 
152 | REMOVE_ANCHORS_FROM_LINKS = True 153 | 154 | 155 | # define a parent directory for unix sockets that will be created 156 | PARENT_SOCKET_DIRECTORY = "/tmp" 157 | 158 | # 159 | # improved settings 160 | # only edit if you are usually working behind a nuclear power plant's control 161 | # panel 162 | 163 | # ZeroMQ Master Push 164 | ZEROMQ_MASTER_PUSH = "ipc://%s/spyder-zmq-master-push.sock" % \ 165 | PARENT_SOCKET_DIRECTORY 166 | ZEROMQ_MASTER_PUSH_HWM = 10 167 | 168 | # ZeroMQ Fetcher 169 | ZEROMQ_WORKER_PROC_FETCHER_PULL = ZEROMQ_MASTER_PUSH 170 | ZEROMQ_WORKER_PROC_FETCHER_PUSH = "inproc://processing/fetcher/push" 171 | ZEROMQ_WORKER_PROC_FETCHER_PUSH_HWM = 10 172 | 173 | # ZeroMQ Extractor 174 | ZEROMQ_WORKER_PROC_EXTRACTOR_PULL = ZEROMQ_WORKER_PROC_FETCHER_PUSH 175 | ZEROMQ_WORKER_PROC_EXTRACTOR_PUB = "ipc://%s/spyder-zmq-master-sub.sock" % \ 176 | PARENT_SOCKET_DIRECTORY 177 | ZEROMQ_WORKER_PROC_EXTRACTOR_PUB_HWM = 10 178 | 179 | # ZeroMQ Master Sub 180 | ZEROMQ_MASTER_SUB = ZEROMQ_WORKER_PROC_EXTRACTOR_PUB 181 | 182 | # ZeroMQ Management Sockets 183 | ZEROMQ_MGMT_MASTER = "ipc://%s/spyder-zmq-mgmt-master.sock" % \ 184 | (PARENT_SOCKET_DIRECTORY,) 185 | ZEROMQ_MGMT_WORKER = "ipc://%s/spyder-zmq-mgmt-worker.sock" % \ 186 | (PARENT_SOCKET_DIRECTORY,) 187 | 188 | # ZeroMQ logging socket 189 | ZEROMQ_LOGGING = "ipc://%s/spyder-logging.sock" % (PARENT_SOCKET_DIRECTORY,) 190 | -------------------------------------------------------------------------------- /src/spyder/encoding.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # encoding.py 09-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | 20 | def get_content_type_encoding(curi): 21 | """ 22 | Determine the content encoding based on the `Content-Type` Header. 23 | 24 | `curi` is the :class:`CrawlUri`. 25 | """ 26 | content_type = "text/plain" 27 | charset = "" 28 | 29 | if curi.rep_header and "Content-Type" in curi.rep_header: 30 | (content_type, charset) = extract_content_type_encoding( 31 | curi.rep_header["Content-Type"]) 32 | 33 | if charset == "" and curi.content_body and len(curi.content_body) >= 512: 34 | # no charset information in the http header 35 | first_bytes = curi.content_body[:512].lower() 36 | ctypestart = first_bytes.find("content-type") 37 | if ctypestart != -1: 38 | # there is a html header 39 | ctypestart = first_bytes.find("content=\"", ctypestart) 40 | ctypeend = first_bytes.find("\"", ctypestart + 9) 41 | return extract_content_type_encoding( 42 | first_bytes[ctypestart + 9:ctypeend]) 43 | 44 | return (content_type, charset) 45 | 46 | 47 | def extract_content_type_encoding(content_type_string): 48 | """ 49 | Extract the content type and encoding information. 
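For example::

    >>> extract_content_type_encoding("text/html; charset=UTF-8")
    ('text/html', 'utf_8')

Note that dashes in the charset name are normalized to underscores.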
50 | """ 51 | charset = "" 52 | content_type = "" 53 | for part in content_type_string.split(";"): 54 | part = part.strip().lower() 55 | if part.startswith("charset"): 56 | charset = part.split("=")[1] 57 | charset = charset.replace("-", "_") 58 | else: 59 | content_type = part 60 | 61 | return (content_type, charset) 62 | -------------------------------------------------------------------------------- /src/spyder/import_util.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # import_util.py 07-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A custom import method for importing modules or classes from a string. 20 | """ 21 | 22 | 23 | def custom_import(module): 24 | """ 25 | A custom import method to import a module. 26 | see: stackoverflow.com: 547829/how-to-dynamically-load-a-python-class 27 | """ 28 | mod = __import__(module) 29 | components = module.split('.') 30 | for comp in components[1:]: 31 | mod = getattr(mod, comp) 32 | return mod 33 | 34 | 35 | def import_class(classstring): 36 | """ 37 | Import a class using a `classstring`. This string is split by `.` and the 38 | last part is interpreted as class name. 39 | """ 40 | (module_name, _sep, class_name) = classstring.rpartition('.') 41 | module = custom_import(module_name) 42 | return getattr(module, class_name) 43 | -------------------------------------------------------------------------------- /src/spyder/logsink.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # logsink.py 03-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Module for aggregating spyder logs.
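For example, a message ``["spyder.master.INFO", "some text"]`` is routed to the ``masterlog`` logger at level ``info``; topics with an unknown process part fall back to the root logger.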
20 | """ 21 | import logging 22 | import logging.config 23 | import signal 24 | import os.path 25 | import traceback 26 | 27 | import zmq 28 | from zmq.core.error import ZMQError 29 | from zmq.eventloop.ioloop import IOLoop 30 | from zmq.eventloop.zmqstream import ZMQStream 31 | 32 | 33 | LOGGERS = {"default": logging.getLogger()} 34 | 35 | LOGGERS['master'] = logging.getLogger('masterlog') 36 | LOGGERS['worker'] = logging.getLogger('workerlog') 37 | 38 | 39 | def log_zmq_message(msg): 40 | """ 41 | Log a specific message. 42 | 43 | The message has the format:: 44 | 45 | message = [topic, msg] 46 | 47 | `topic` is a string of the form:: 48 | 49 | topic = "spyder.process.LEVEL.subtopic" 50 | """ 51 | topic = msg[0].split(".") 52 | if len(topic) == 3: 53 | topic.append("SUBTOPIC") 54 | if topic[1] in LOGGERS: 55 | log = getattr(LOGGERS[topic[1]], topic[2].lower()) 56 | log("%s - %s" % (topic[3], msg[1].strip())) 57 | else: 58 | log = getattr(LOGGERS['default'], topic[2].lower()) 59 | log("%s - %s" % (topic[3], msg[1].strip())) 60 | 61 | 62 | def main(settings): 63 | """ 64 | Initialize the logger sink. 65 | """ 66 | 67 | if os.path.isfile('logging.conf'): 68 | logging.config.fileConfig('logging.conf') 69 | 70 | ctx = zmq.Context() 71 | io_loop = IOLoop.instance() 72 | 73 | log_sub = ctx.socket(zmq.SUB) 74 | log_sub.setsockopt(zmq.SUBSCRIBE, "") 75 | log_sub.bind(settings.ZEROMQ_LOGGING) 76 | 77 | log_stream = ZMQStream(log_sub, io_loop) 78 | 79 | log_stream.on_recv(log_zmq_message) 80 | 81 | def handle_shutdown_signal(_sig, _frame): 82 | """ 83 | Called from the os when a shutdown signal is fired. 84 | """ 85 | log_stream.stop_on_recv() 86 | log_stream.flush() 87 | io_loop.stop() 88 | 89 | # handle kill signals 90 | signal.signal(signal.SIGINT, handle_shutdown_signal) 91 | signal.signal(signal.SIGTERM, handle_shutdown_signal) 92 | 93 | try: 94 | io_loop.start() 95 | except ZMQError: 96 | LOGGERS['master'].debug("Caught a ZMQError. Hopefully during shutdown") 97 | LOGGERS['master'].debug(traceback.format_exc()) 98 | 99 | log_stream.close() 100 | ctx.term() 101 | -------------------------------------------------------------------------------- /src/spyder/masterprocess.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # masterprocess.py 31-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | This module contains the default architecture for master processes. 20 | 21 | The main task for master processes is to create and run the **Frontier**. 22 | Starting a master involves the following steps: 23 | 24 | 1. Bind to the configured |zmq| sockets 25 | 2. Start the management interface 26 | 3. Create the frontier 27 | 4. Start the master 28 | 29 | Once the master is up and you have configured a ``settings.MASTER_CALLBACK``, 30 | the callback will be called before the master is really started, i.e.
before the 31 | ``IOLoop.start()`` is called. This will allow you to insert *Seed* |urls|, e.g. 32 | """ 33 | 34 | import logging 35 | import os 36 | import signal 37 | import socket 38 | import traceback 39 | 40 | import zmq 41 | from zmq.core.error import ZMQError 42 | from zmq.eventloop.ioloop import IOLoop 43 | from zmq.log.handlers import PUBHandler 44 | 45 | from spyder.import_util import import_class 46 | from spyder.core.master import ZmqMaster 47 | from spyder.core.mgmt import ZmqMgmt 48 | 49 | 50 | def create_master_management(settings, zmq_context, io_loop): 51 | """ 52 | Create the management interface for master processes. 53 | """ 54 | listening_socket = zmq_context.socket(zmq.SUB) 55 | listening_socket.setsockopt(zmq.SUBSCRIBE, "") 56 | listening_socket.bind(settings.ZEROMQ_MGMT_WORKER) 57 | 58 | publishing_socket = zmq_context.socket(zmq.PUB) 59 | publishing_socket.bind(settings.ZEROMQ_MGMT_MASTER) 60 | 61 | return ZmqMgmt(listening_socket, publishing_socket, io_loop=io_loop) 62 | 63 | 64 | def create_frontier(settings, log_handler): 65 | """ 66 | Create the frontier to use. 67 | """ 68 | frontier = import_class(settings.FRONTIER_CLASS) 69 | return frontier(settings, log_handler) 70 | 71 | 72 | def main(settings): 73 | """ 74 | Main method for master processes. 75 | """ 76 | # create my own identity 77 | identity = "master:%s:%s" % (socket.gethostname(), os.getpid()) 78 | 79 | ctx = zmq.Context() 80 | io_loop = IOLoop.instance() 81 | 82 | # initialize the logging subsystem 83 | log_pub = ctx.socket(zmq.PUB) 84 | log_pub.connect(settings.ZEROMQ_LOGGING) 85 | zmq_logging_handler = PUBHandler(log_pub) 86 | zmq_logging_handler.root_topic = "spyder.master" 87 | logger = logging.getLogger() 88 | logger.addHandler(zmq_logging_handler) 89 | logger.setLevel(settings.LOG_LEVEL_MASTER) 90 | 91 | logger.info("process::Starting up the master") 92 | 93 | mgmt = create_master_management(settings, ctx, io_loop) 94 | frontier = create_frontier(settings, zmq_logging_handler) 95 | 96 | publishing_socket = ctx.socket(zmq.PUSH) 97 | publishing_socket.setsockopt(zmq.HWM, settings.ZEROMQ_MASTER_PUSH_HWM) 98 | publishing_socket.bind(settings.ZEROMQ_MASTER_PUSH) 99 | 100 | receiving_socket = ctx.socket(zmq.SUB) 101 | receiving_socket.setsockopt(zmq.SUBSCRIBE, "") 102 | receiving_socket.bind(settings.ZEROMQ_MASTER_SUB) 103 | 104 | master = ZmqMaster(settings, identity, receiving_socket, 105 | publishing_socket, mgmt, frontier, zmq_logging_handler, 106 | settings.LOG_LEVEL_MASTER, io_loop) 107 | 108 | def handle_shutdown_signal(_sig, _frame): 109 | """ 110 | Called from the os when a shutdown signal is fired. 111 | """ 112 | master.shutdown() 113 | # zmq 2.1 stops blocking calls, restart the ioloop 114 | io_loop.start() 115 | 116 | # handle kill signals 117 | signal.signal(signal.SIGINT, handle_shutdown_signal) 118 | signal.signal(signal.SIGTERM, handle_shutdown_signal) 119 | 120 | if settings.MASTER_CALLBACK: 121 | callback = import_class(settings.MASTER_CALLBACK) 122 | callback(settings, ctx, io_loop, frontier) 123 | 124 | mgmt.start() 125 | master.start() 126 | 127 | # this will block until the master stops 128 | try: 129 | io_loop.start() 130 | except ZMQError: 131 | logger.debug("Caught a ZMQError. 
Hopefully during shutdown") 132 | logger.debug(traceback.format_exc()) 133 | 134 | master.close() 135 | mgmt.close() 136 | 137 | logger.info("process::Master is down.") 138 | log_pub.close() 139 | 140 | ctx.term() 141 | -------------------------------------------------------------------------------- /src/spyder/processor/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # __init__.py 14-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Package for the default processors. 20 | """ 21 | -------------------------------------------------------------------------------- /src/spyder/processor/cleanupquery.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # cleanupquery.py 14-Apr-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # 19 | """ 20 | Processor to clean up the query string. At this point we want to strip any 21 | trailing '?' or '&' and optionally remove any anchors from it. 22 | """ 23 | from spyder.core.constants import CURI_EXTRACTED_URLS 24 | 25 | 26 | class CleanupQueryString(object): 27 | """ 28 | The processor for cleaning up the query string. 29 | """ 30 | 31 | def __init__(self, settings): 32 | """ 33 | Initialize me. 34 | """ 35 | self._remove_anchors = settings.REMOVE_ANCHORS_FROM_LINKS 36 | 37 | def __call__(self, curi): 38 | """ 39 | Remove any obsolete stuff from the query string. 40 | """ 41 | if CURI_EXTRACTED_URLS not in curi.optional_vars: 42 | return curi 43 | 44 | urls = [] 45 | for raw_url in curi.optional_vars[CURI_EXTRACTED_URLS].split('\n'): 46 | urls.append(self._cleanup_query_string(raw_url)) 47 | 48 | curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join(urls) 49 | return curi 50 | 51 | def _cleanup_query_string(self, raw_url): 52 | """ 53 | Strip trailing '?' and '&' characters (and anchors, if configured). 54 | """ 55 | url = raw_url 56 | if self._remove_anchors: 57 | begin = raw_url.find("#") 58 | if begin > -1: 59 | url = raw_url[:begin] 60 | 61 | if len(url) == 0: 62 | return raw_url 63 | 64 | while url and (url[-1] == '?' or url[-1] == '&'): 65 | url = url[:-1] 66 | 67 | return url 68 | -------------------------------------------------------------------------------- /src/spyder/processor/fetcher.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # fetcher.py 14-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Module for downloading content from the web. 20 | 21 | TODO: document pycurl's features, i.e. what it can download. 22 | """ 23 | 24 | import logging 25 | 26 | from urlparse import urlsplit 27 | 28 | from tornado.httpclient import AsyncHTTPClient, HTTPRequest 29 | from tornado.httputil import HTTPHeaders 30 | 31 | from zmq.eventloop.ioloop import IOLoop 32 | 33 | from spyder.core.constants import CURI_SITE_USERNAME 34 | from spyder.core.constants import CURI_SITE_PASSWORD 35 | from spyder.time import deserialize_date_time 36 | 37 | LOG = logging.getLogger('fetcher') 38 | 39 | 40 | class FetchProcessor(object): 41 | """ 42 | A processing class for downloading all kinds of stuff from the web. 43 | """ 44 | 45 | def __init__(self, settings, io_loop=None): 46 | """ 47 | Initialize the members. 48 | """ 49 | self._user_agent = settings.USER_AGENT 50 | assert self._user_agent 51 | 52 | self._io_loop = io_loop or IOLoop.instance() 53 | 54 | self._follow_redirects = settings.FOLLOW_REDIRECTS 55 | self._max_redirects = settings.MAX_REDIRECTS 56 | self._gzip = settings.USE_GZIP 57 | 58 | if settings.PROXY_HOST: 59 | proxy_port = settings.PROXY_PORT 60 | assert proxy_port 61 | assert isinstance(proxy_port, int) 62 | 63 | self._proxy_configuration = dict( 64 | host = settings.PROXY_HOST, 65 | port = settings.PROXY_PORT, 66 | user = settings.PROXY_USERNAME, 67 | password = settings.PROXY_PASSWORD 68 | ) 69 | 70 | self._validate_cert = settings.VALIDATE_CERTIFICATES 71 | self._request_timeout = settings.REQUEST_TIMEOUT 72 | self._connect_timeout = settings.CONNECT_TIMEOUT 73 | 74 | max_clients = settings.MAX_CLIENTS 75 | max_simultaneous_connections = settings.MAX_SIMULTANEOUS_CONNECTIONS 76 | 77 | self._client = AsyncHTTPClient(self._io_loop, 78 | max_clients=max_clients, 79 | max_simultaneous_connections=max_simultaneous_connections) 80 | 81 | def __call__(self, msg, out_stream): 82 | """ 83 | Work on the current `DataMessage` and send the result to `out_stream`.
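Since fetching happens asynchronously, this processor is meant to be driven by an :class:`AsyncZmqWorker` (a sketch; socket and mgmt setup omitted)::

    fetcher = FetchProcessor(settings, io_loop)
    worker = AsyncZmqWorker(insocket, outsocket, mgmt, fetcher,
                            log_handler, settings.LOG_LEVEL_WORKER, io_loop)
    worker.start()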
84 | """ 85 | # prepare the HTTPHeaders 86 | headers = prepare_headers(msg) 87 | 88 | last_modified = None 89 | if msg.curi.req_header: 90 | # check if we have a date when the page was last crawled 91 | if "Last-Modified" in msg.curi.req_header: 92 | last_modified = deserialize_date_time( 93 | msg.curi.req_header["Last-Modified"]) 94 | 95 | # check if we have username and password present 96 | auth_username = None 97 | auth_password = None 98 | if msg.curi.optional_vars and \ 99 | CURI_SITE_USERNAME in msg.curi.optional_vars and \ 100 | CURI_SITE_PASSWORD in msg.curi.optional_vars: 101 | 102 | auth_username = msg.curi.optional_vars[CURI_SITE_USERNAME] 103 | auth_password = msg.curi.optional_vars[CURI_SITE_PASSWORD] 104 | 105 | # create the request 106 | request = HTTPRequest(msg.curi.effective_url, 107 | method="GET", 108 | headers=headers, 109 | auth_username=auth_username, 110 | auth_password=auth_password, 111 | if_modified_since=last_modified, 112 | follow_redirects=self._follow_redirects, 113 | max_redirects=self._max_redirects, 114 | user_agent=self._user_agent, 115 | request_timeout = self._request_timeout, 116 | connect_timeout = self._connect_timeout, 117 | validate_cert = self._validate_cert) 118 | 119 | if hasattr(self, '_proxy_configuration'): 120 | request.proxy_host = self._proxy_configuration['host'] 121 | request.proxy_port = self._proxy_configuration['port'] 122 | request.proxy_username = \ 123 | self._proxy_configuration.get('user', None) 124 | request.proxy_password = \ 125 | self._proxy_configuration.get('password', None) 126 | 127 | LOG.info("proc.fetch::request for %s" % msg.curi.url) 128 | self._client.fetch(request, handle_response(msg, out_stream)) 129 | 130 | 131 | def prepare_headers(msg): 132 | """ 133 | Construct the :class:`HTTPHeaders` with all the necessary information for 134 | the request. 135 | """ 136 | # construct the headers 137 | headers = HTTPHeaders() 138 | 139 | if msg.curi.req_header: 140 | # check if we have a previous Etag 141 | if "Etag" in msg.curi.req_header: 142 | headers["If-None-Match"] = \ 143 | msg.curi.req_header["Etag"] 144 | 145 | # manually set the Host header since we are requesting using an IP 146 | host = urlsplit(msg.curi.url).hostname 147 | if host is None: 148 | LOG.error("proc.fetch::cannot extract hostname from url '%s'" % 149 | msg.curi.url) 150 | else: 151 | headers["Host"] = host 152 | 153 | return headers 154 | 155 | 156 | def handle_response(msg, out_stream): 157 | """ 158 | Decorator for the actual callback function that will extract interesting 159 | info and forward the response. 160 | """ 161 | def handle_server_response(response): 162 | """ 163 | The actual callback function. 164 | 165 | Extract interesting info from the response using 166 | :meth:`extract_info_from_response` and forward the result to the 167 | `out_stream`. 168 | """ 169 | extract_info_from_response(response, msg) 170 | LOG.info("proc.fetch::response for %s (took '%s'ms)" % 171 | (msg.curi.url, response.request_time)) 172 | if response.code >= 400: 173 | LOG.error("proc.fetch::response error: %s", response) 174 | out_stream.send_multipart(msg.serialize()) 175 | 176 | return handle_server_response 177 | 178 | 179 | def extract_info_from_response(response, msg): 180 | """ 181 | Extract the interesting information from a HTTPResponse. 
182 | """ 183 | msg.curi.status_code = response.code 184 | msg.curi.req_header = response.request.headers 185 | msg.curi.rep_header = response.headers 186 | msg.curi.req_time = response.request_time 187 | msg.curi.queue_time = response.time_info["queue"] 188 | msg.curi.content_body = response.body 189 | 190 | return msg 191 | -------------------------------------------------------------------------------- /src/spyder/processor/httpextractor.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Daniel Truemper truemped@googlemail.com 3 | # 4 | # httpextractor.py 17-Mar-2011 5 | # 6 | # Licensed to the Apache Software Foundation (ASF) under one 7 | # or more contributor license agreements. See the NOTICE file 8 | # distributed with this work for additional information 9 | # regarding copyright ownership. The ASF licenses this file 10 | # to you under the Apache License, Version 2.0 (the 11 | # "License"); you may not use this file except in compliance 12 | # with the License. You may obtain a copy of the License at 13 | # 14 | # http://www.apache.org/licenses/LICENSE-2.0 15 | # 16 | # Unless required by applicable law or agreed to in writing, 17 | # software distributed under the License is distributed on an 18 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | # KIND, either express or implied. See the License for the 20 | # specific language governing permissions and limitations 21 | # under the License. 22 | # 23 | # 24 | """ 25 | Link extractor for detecting links in HTTP codes. 26 | 27 | The main use case for this are HTTP redirects, e.g. In the case of a redirect 28 | the HTTP status code ``30X`` is present and the ``Location`` header indicates 29 | the new location. 30 | """ 31 | import urlparse 32 | 33 | from spyder.core.constants import CURI_EXTRACTED_URLS 34 | 35 | 36 | class HttpExtractor(object): 37 | """ 38 | The processor for extracting links from ``HTTP`` headers. 39 | """ 40 | 41 | def __init__(self, settings): 42 | """ 43 | Initialize the extractor. 44 | """ 45 | self._not_found_redirects = settings.HTTP_EXTRACTOR_404_REDIRECT 46 | 47 | def __call__(self, curi): 48 | """ 49 | Perform the URL extraction in case of a redirect code. 50 | 51 | I.e. if ``300 <= curi.status_code < 400``, then search for any 52 | HTTP ``Location`` header and append the given URL to the list of 53 | extracted URLs. 54 | """ 55 | 56 | if 300 <= curi.status_code < 400 and curi.rep_header and \ 57 | "Location" in curi.rep_header: 58 | 59 | link = curi.rep_header["Location"] 60 | 61 | if link.find("://") == -1: 62 | # a relative link. this is bad behaviour, but yeah, you know... 63 | link = urlparse.urljoin(curi.url, link) 64 | 65 | if link not in self._not_found_redirects: 66 | if not hasattr(curi, "optional_vars"): 67 | curi.optional_vars = dict() 68 | 69 | if not CURI_EXTRACTED_URLS in curi.optional_vars: 70 | curi.optional_vars[CURI_EXTRACTED_URLS] = link 71 | else: 72 | curi.optional_vars[CURI_EXTRACTED_URLS] += "\n" + link 73 | 74 | return curi 75 | -------------------------------------------------------------------------------- /src/spyder/processor/limiter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # limiter.py 18-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A processor used for limiting the extraction and scoping processing steps. 20 | 21 | Basically this is used to keep `robots.txt` files from being processed. 22 | """ 23 | 24 | from spyder.core.constants import CURI_OPTIONAL_TRUE, CURI_EXTRACTION_FINISHED 25 | 26 | 27 | class DefaultLimiter(object): 28 | """ 29 | The default crawl limiter. 30 | """ 31 | 32 | def __init__(self, settings): 33 | """ 34 | Initialize the limiter with the given settings. 35 | """ 36 | pass 37 | 38 | def __call__(self, curi): 39 | """ 40 | Do the actual limiting. 41 | """ 42 | return self._do_not_process_robots(curi) 43 | 44 | def _do_not_process_robots(self, curi): 45 | """ 46 | Do not process `CrawlUris` if they are **robots.txt** files. 47 | """ 48 | if CURI_EXTRACTION_FINISHED not in curi.optional_vars and \ 49 | curi.effective_url.endswith("robots.txt"): 50 | curi.optional_vars[CURI_EXTRACTION_FINISHED] = CURI_OPTIONAL_TRUE 51 | 52 | return curi 53 | -------------------------------------------------------------------------------- /src/spyder/processor/scoper.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # scoper.py 24-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | The *Crawl Scope* defines which *URLs* the *Spyder* should process. The main 20 | use cases for it are: 21 | 22 | - only spider content from the *Seed* Hosts 23 | - do not spider images, css, videos 24 | 25 | and there are probably a lot of other reasons you want to have at least one 26 | scoper configured, otherwise you might end up downloading the internet. 27 | 28 | So each scoper should iterate over the 29 | ``curi.optional_vars[CURI_EXTRACTED_URLS]`` and determine if it should be 30 | downloaded or not. 31 | 32 | The :class:`RegexScoper` maintains a list of regular expressions that define 33 | the crawl scope. Two classes of expressions exist: positive and negative. 34 | The initial decision of the scoper is to not download its content. If a regex 35 | from the positive list matches, and no regex from the negative list matches, 36 | the *URL* is marked for downloading. In any other case, the *URL* will be 37 | abandoned. An example configuration follows the note below. 38 | 39 | .. note:: We should really split up the regex scoper and allow the user to 40 | configure more than just one scoper.
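For example, a scope that stays on one host and skips common image and stylesheet suffixes could look like this (hypothetical patterns)::

    REGEX_SCOPE_POSITIVE = [
        "^http://www\.example\.org/.*",
    ]
    REGEX_SCOPE_NEGATIVE = [
        ".*\.(jpg|jpeg|png|gif|css)$",
    ]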
41 | """ 42 | 43 | import re 44 | 45 | from spyder.core.constants import CURI_EXTRACTED_URLS 46 | 47 | 48 | class RegexScoper(object): 49 | """ 50 | The scoper based on regular expressions. 51 | 52 | There are two settings that influence this scoper: 53 | 54 | 1. ``settings.REGEX_SCOPE_POSITIVE`` 55 | 2. ``settings.REGEX_SCOPE_NEGATIVE`` 56 | 57 | Both have to be of type ``list``. The scoper is executed in the 58 | :meth:`__call__` method. 59 | """ 60 | 61 | def __init__(self, settings): 62 | """ 63 | Compile the regular expressions. 64 | """ 65 | self._positive_regex = [] 66 | for regex in settings.REGEX_SCOPE_POSITIVE: 67 | self._positive_regex.append(re.compile(regex)) 68 | 69 | self._negative_regex = [] 70 | for regex in settings.REGEX_SCOPE_NEGATIVE: 71 | self._negative_regex.append(re.compile(regex)) 72 | 73 | def __call__(self, curi): 74 | """ 75 | Filter all newly extracted URLs for those we want in this crawl. 76 | """ 77 | if CURI_EXTRACTED_URLS not in curi.optional_vars: 78 | return curi 79 | 80 | urls = [] 81 | for url in curi.optional_vars[CURI_EXTRACTED_URLS].split("\n"): 82 | add_url = False 83 | for regex in self._positive_regex: 84 | if regex.match(url): 85 | add_url = True 86 | 87 | for regex in self._negative_regex: 88 | if regex.match(url): 89 | add_url = False 90 | 91 | if add_url: 92 | urls.append(url) 93 | 94 | curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join(urls) 95 | return curi 96 | -------------------------------------------------------------------------------- /src/spyder/processor/stripsessions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # stripsessions.py 14-Apr-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # 19 | """ 20 | Processor to strip all session ids from the extracted URLs. It should be placed 21 | at the very end of the scoper chain in order to process only those URLs that 22 | are relevant for the crawl. 23 | 24 | It basically searches for 25 | 26 | sid= 27 | jsessionid= 28 | phpsessid= 29 | aspsessionid= 30 | """ 31 | from spyder.core.constants import CURI_EXTRACTED_URLS 32 | 33 | 34 | class StripSessionIds(object): 35 | """ 36 | The processor for removing session information from the query string. 37 | """ 38 | 39 | def __init__(self, settings): 40 | """ 41 | Initialize me. 42 | """ 43 | self._session_params = ['jsessionid=', 'phpsessid=', 44 | 'aspsessionid=', 'sid='] 45 | 46 | def __call__(self, curi): 47 | """ 48 | Main method stripping the session stuff from the query string.
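For example, ``http://example.org/p?sid=abc`` becomes ``http://example.org/p?`` here; the now dangling ``?`` is removed by the ``CleanupQueryString`` processor that follows in the default scoper pipeline.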
49 | """ 50 | if CURI_EXTRACTED_URLS not in curi.optional_vars: 51 | return curi 52 | 53 | urls = [] 54 | for raw_url in curi.optional_vars[CURI_EXTRACTED_URLS].split('\n'): 55 | urls.append(self._remove_session_ids(raw_url)) 56 | 57 | curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join(urls) 58 | return curi 59 | 60 | def _remove_session_ids(self, raw_url): 61 | """ 62 | Remove the session information. 63 | """ 64 | for session in self._session_params: 65 | url = raw_url.lower() 66 | begin = url.find(session) 67 | while begin > -1: 68 | end = url.find('&', begin) 69 | if end == -1: 70 | raw_url = raw_url[:begin] 71 | else: 72 | raw_url = "%s%s" % (raw_url[:begin], raw_url[end:]) 73 | url = raw_url.lower() 74 | begin = url.find(session) 75 | 76 | return raw_url 77 | -------------------------------------------------------------------------------- /src/spyder/spyder_template/log/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/9a2de6ec4c25d4dc85802305d5675a52c3ebb750/src/spyder/spyder_template/log/.keep -------------------------------------------------------------------------------- /src/spyder/spyder_template/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys = root, master, worker 3 | 4 | [handlers] 5 | keys = master, worker 6 | 7 | [formatters] 8 | keys = default 9 | 10 | [logger_root] 11 | level = NOTSET 12 | handlers = 13 | 14 | [logger_master] 15 | level = DEBUG 16 | handlers = master 17 | qualname = masterlog 18 | 19 | [handler_master] 20 | class = handlers.TimedRotatingFileHandler 21 | formatter = default 22 | args = ('log/master.log', 'D', 1, 10) 23 | 24 | [logger_worker] 25 | level = DEBUG 26 | handlers = worker 27 | qualname = workerlog 28 | 29 | [handler_worker] 30 | class = handlers.TimedRotatingFileHandler 31 | formatter = default 32 | args = ('log/worker.log', 'D', 1, 10) 33 | 34 | [formatter_default] 35 | format = [%(asctime)s] - %(levelname)s - %(message)s 36 | class = logging.Formatter 37 | -------------------------------------------------------------------------------- /src/spyder/spyder_template/master.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # master.py 21-Apr-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # 19 | """ 20 | Master module starting a crawl. 21 | """ 22 | from spyder import CrawlUri 23 | 24 | from sink import MySink 25 | 26 | 27 | def initialize(settings, zmq_ctx, io_loop, frontier): 28 | """ 29 | Initialize the **Master**. 30 | 31 | You may access and manipulate the `settings`, the process global `zmq_ctx`, 32 | *pyzmq's* `io_loop` and the `frontier`. 
33 | """ 34 | frontier.add_uri(CrawlUri("http://www.dmoz.org/Recreation/Boating/Sailing/"))) 35 | frontier.add_sink(MySink(settings)) 36 | -------------------------------------------------------------------------------- /src/spyder/spyder_template/settings.py: -------------------------------------------------------------------------------- 1 | # 2 | # settings.py 3 | # 4 | """ 5 | Your crawler specific settings. 6 | """ 7 | import logging 8 | 9 | LOG_LEVEL_MASTER = logging.INFO 10 | LOG_LEVEL_WORKER = logging.INFO 11 | 12 | USER_AGENT = "Mozilla/5.0 (compatible; spyder/0.1; " + \ 13 | "+http://github.com/retresco/spyder)" 14 | 15 | # callback for initializing the periodic crawling of the sitemap 16 | MASTER_CALLBACK = 'master.initialize' 17 | 18 | # List of positive regular expressions for the crawl scope 19 | REGEX_SCOPE_POSITIVE = [ 20 | "^http://www\.dmoz\.org/Recreation/Boating/Sailing/.*", 21 | ] 22 | 23 | # List of negative regular expressions for the crawl scope 24 | REGEX_SCOPE_NEGATIVE = [ 25 | "^http://www\.dmoz\.org/Recreation/Boating/Sailing/Racing/.*", 26 | ] 27 | -------------------------------------------------------------------------------- /src/spyder/spyder_template/sink.py: -------------------------------------------------------------------------------- 1 | # 2 | # sink.py 21-Apr-2011 3 | # 4 | """ 5 | Put your storage code here. 6 | """ 7 | from spyder.core.sink import AbstractCrawlUriSink 8 | 9 | 10 | class MySink(AbstractCrawlUriSink): 11 | """ 12 | This is my sink. 13 | """ 14 | 15 | def __init__(self, settings): 16 | """ 17 | Initialize me with some settings. 18 | """ 19 | pass 20 | 21 | def process_successful_crawl(self, curi): 22 | """ 23 | We have crawled a uri successfully. If there are newly extracted links, 24 | add them alongside the original uri to the frontier. 25 | """ 26 | pass 27 | 28 | def process_not_found(self, curi): 29 | """ 30 | The uri we should have crawled was not found, i.e. HTTP Error 404. Do 31 | something with that. 32 | """ 33 | pass 34 | 35 | def process_redirect(self, curi): 36 | """ 37 | There have been too many redirects, i.e. in the default config there 38 | have been more than 3 redirects. 39 | """ 40 | pass 41 | 42 | def process_server_error(self, curi): 43 | """ 44 | There has been a server error, i.e. HTTP Error 50x. Maybe we should try 45 | to crawl this uri again a little bit later. 46 | """ 47 | pass 48 | -------------------------------------------------------------------------------- /src/spyder/spyder_template/spyder-ctrl.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # spyder.py 02-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | import sys 20 | 21 | import spyder 22 | 23 | try: 24 | import settings 25 | except ImportError: 26 | print >> sys.stderr, \ 27 | """Cannot find settings.py in the directory containing %s""" % __file__ 28 | sys.exit(1) 29 | 30 | 31 | if __name__ == "__main__": 32 | spyder.spyder_management(settings) 33 | -------------------------------------------------------------------------------- /src/spyder/thrift/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # __init__.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Modules for working with the thrift generated code. 20 | """ 21 | -------------------------------------------------------------------------------- /src/spyder/thrift/gen/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # __init__.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Generated modules from thrift. 20 | """ 21 | 22 | __all__ = ['ttypes', 'constants'] 23 | -------------------------------------------------------------------------------- /src/spyder/thrift/gen/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | 7 | from thrift.Thrift import * 8 | from ttypes import * 9 | 10 | -------------------------------------------------------------------------------- /src/spyder/time.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # time.py 15-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | #
19 | """
20 | Time related utilities.
21 | """
22 | from datetime import datetime
23 | 
24 | import pytz
25 | 
26 | SERVER_TIME_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"
27 | GMT = pytz.timezone('GMT')
28 | 
29 | 
30 | def serialize_date_time(date_time):
31 |     """
32 |     Create a string of the datetime.
33 |     """
34 |     return GMT.localize(date_time).strftime(SERVER_TIME_FORMAT)
35 | 
36 | 
37 | def deserialize_date_time(date_string):
38 |     """
39 |     Read a string as a datetime.
40 |     """
41 |     return datetime.strptime(date_string, SERVER_TIME_FORMAT)
42 | 
-------------------------------------------------------------------------------- /test/static/robots.txt: --------------------------------------------------------------------------------
1 | User-agent: *
2 | Disallow: /RealMedia/
3 | Disallow: /archiv/
4 | Disallow: /forum/
5 | Disallow: /cgi-bin/
6 | Disallow: /werbung/
7 | Disallow: /artikelversand/
8 | Disallow: /grossbild/
9 | Disallow: /druckbild/
10 | Disallow: /druckrezension/
11 | Disallow: /druckversion/
12 | Disallow: /active/
13 | Disallow: /staticgen/mobil/
14 | 
15 | #User-agent: Firefly/1.0
16 | #Disallow: /
17 | 
18 | User-agent: WebReaper
19 | Disallow: /
20 | 
21 | User-agent: Slurp
22 | Crawl-delay: 18
23 | 
-------------------------------------------------------------------------------- /test/test_async_worker.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # test_async_worker.py 14-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | import logging
19 | from logging import StreamHandler
20 | import sys
21 | import unittest
22 | 
23 | import zmq
24 | from zmq.eventloop.ioloop import IOLoop
25 | from zmq.eventloop.zmqstream import ZMQStream
26 | 
27 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER
28 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT
29 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK
30 | from spyder.core.mgmt import ZmqMgmt
31 | from spyder.core.worker import AsyncZmqWorker
32 | from spyder.core.messages import DataMessage, MgmtMessage
33 | from spyder.thrift.gen.ttypes import CrawlUri
34 | 
35 | 
36 | class ZmqTornadoIntegrationTest(unittest.TestCase):
37 | 
38 |     def setUp(self):
39 | 
40 |         # create the io_loop
41 |         self._io_loop = IOLoop.instance()
42 | 
43 |         # and the context
44 |         self._ctx = zmq.Context(1)
45 | 
46 |         # setup the mgmt sockets
47 |         self._setup_mgmt_sockets()
48 | 
49 |         # setup the data sockets
50 |         self._setup_data_sockets()
51 | 
52 |         # setup the management interface
53 |         self._mgmt = ZmqMgmt( self._mgmt_sockets['worker_sub'],
54 |             self._mgmt_sockets['worker_pub'], io_loop=self._io_loop)
55 |         self._mgmt.start()
56 |         self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end)
57 | 
58 |     def tearDown(self):
59 |         # stop the mgmt
60 |         self._mgmt.stop()
61 | 
62 |         # close all sockets
63 |         for socket in self._mgmt_sockets.itervalues():
64 |             socket.close()
65 |         for socket in self._worker_sockets.itervalues():
66 |             socket.close()
67 | 
68 |         # terminate the context
69 |         self._ctx.term()
70 | 
71 |     def _setup_mgmt_sockets(self):
72 | 
73 |         self._mgmt_sockets = dict()
74 | 
75 |         # address for the communication from master to worker(s)
76 |         mgmt_master_worker = 'inproc://master/worker/coordination/'
77 | 
78 |         # connect the master with the worker
79 |         # the master is a ZMQStream because we are sending msgs from the test
80 |         sock = self._ctx.socket(zmq.PUB)
81 |         sock.bind(mgmt_master_worker)
82 |         self._mgmt_sockets['tmp1'] = sock
83 |         self._mgmt_sockets['master_pub'] = ZMQStream(sock, self._io_loop)
84 |         # the worker stream is created inside the ZmqMgmt class
85 |         self._mgmt_sockets['worker_sub'] = self._ctx.socket(zmq.SUB)
86 |         self._mgmt_sockets['worker_sub'].setsockopt(zmq.SUBSCRIBE, "")
87 |         self._mgmt_sockets['worker_sub'].connect(mgmt_master_worker)
88 | 
89 |         # address for the communication from worker(s) to master
90 |         mgmt_worker_master = 'inproc://worker/master/coordination/'
91 | 
92 |         # connect the worker with the master
93 |         self._mgmt_sockets['worker_pub'] = self._ctx.socket(zmq.PUB)
94 |         self._mgmt_sockets['worker_pub'].bind(mgmt_worker_master)
95 |         sock = self._ctx.socket(zmq.SUB)
96 |         sock.setsockopt(zmq.SUBSCRIBE, "")
97 |         sock.connect(mgmt_worker_master)
98 |         self._mgmt_sockets['tmp2'] = sock
99 |         self._mgmt_sockets['master_sub'] = ZMQStream(sock, self._io_loop)
100 | 
101 |     def _setup_data_sockets(self):
102 | 
103 |         self._worker_sockets = dict()
104 | 
105 |         # address for master -> worker communication
106 |         data_master_worker = 'inproc://master/worker/pipeline/'
107 | 
108 |         sock = self._ctx.socket(zmq.PUSH)
109 |         sock.bind(data_master_worker)
110 |         self._worker_sockets['tmp3'] = sock
111 |         self._worker_sockets['master_push'] = ZMQStream(sock, self._io_loop)
112 |         self._worker_sockets['worker_pull'] = self._ctx.socket(zmq.PULL)
113 |         self._worker_sockets['worker_pull'].connect(data_master_worker)
114 | 
115 |         # address for worker -> master communication
116 |         data_worker_master = 'inproc://worker/master/pipeline/'
117 | 
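        # (added note) the worker publishes its results on a PUB socket;
        # the test subscribes with a SUB socket wrapped in a ZMQStream so
        # the assertions below run as callbacks on the shared io_loop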
118 |         self._worker_sockets['worker_pub'] = self._ctx.socket(zmq.PUB)
119 |         self._worker_sockets['worker_pub'].bind(data_worker_master)
120 |         sock = self._ctx.socket(zmq.SUB)
121 |         sock.setsockopt(zmq.SUBSCRIBE, "")
122 |         sock.connect(data_worker_master)
123 |         self._worker_sockets['tmp4'] = sock
124 |         self._worker_sockets['master_sub'] = ZMQStream(sock, self._io_loop)
125 | 
126 |     def on_mgmt_end(self, _msg):
127 |         self._io_loop.stop()
128 | 
129 | 
130 | class AsyncZmqWorkerIntegrationTest(ZmqTornadoIntegrationTest):
131 | 
132 |     def echo_processing(self, data_message, out_socket):
133 |         msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
134 |             data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
135 |         self._mgmt_sockets['master_pub'].send_multipart(msg.serialize())
136 |         out_socket.send_multipart(data_message.serialize())
137 | 
138 |     def test_that_async_worker_works(self):
139 |         worker = AsyncZmqWorker( self._worker_sockets['worker_pull'],
140 |             self._worker_sockets['worker_pub'],
141 |             self._mgmt,
142 |             self.echo_processing,
143 |             StreamHandler(sys.stdout),
144 |             logging.DEBUG,
145 |             self._io_loop)
146 | 
147 |         worker.start()
148 | 
149 |         curi = CrawlUri(url="http://localhost")
150 |         msg = DataMessage()
151 |         msg.identity = "me"
152 |         msg.curi = curi
153 | 
154 |         def assert_correct_data(msg2):
155 |             msg3 = DataMessage(msg2)
156 |             self.assertEqual(msg, msg3)
157 | 
158 |         self._worker_sockets['master_sub'].on_recv(assert_correct_data)
159 | 
160 |         def assert_correct_mgmt(raw_msg):
161 |             self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, MgmtMessage(raw_msg).data)
162 | 
163 |         self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt)
164 | 
165 |         self._worker_sockets['master_push'].send_multipart(msg.serialize())
166 | 
167 |         self._io_loop.start()
168 |         worker._in_stream.flush()
169 | 
170 | 
171 | if __name__ == '__main__':
172 |     unittest.main()
173 | 
-------------------------------------------------------------------------------- /test/test_cleanup_qs.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # test_cleanup_qs.py 14-Apr-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | #
19 | import unittest
20 | 
21 | from spyder.core.settings import Settings
22 | from spyder.processor.cleanupquery import CleanupQueryString
23 | 
24 | 
25 | class CleanupQueryStringTest(unittest.TestCase):
26 | 
27 |     def test_that_cleaning_qs_works(self):
28 |         s = Settings()
29 |         c = CleanupQueryString(s)
30 | 
31 |         self.assertEqual("http://tesT.com/t.html?p=a",
32 |             c._cleanup_query_string("http://tesT.com/t.html?p=a#top"))
33 | 
34 |         self.assertEqual("http://test.com/t.html",
35 |             c._cleanup_query_string("http://test.com/t.html?#top"))
36 | 
37 |         self.assertEqual("http://test.com/t.html?test=a",
38 |             c._cleanup_query_string("http://test.com/t.html?test=a&"))
39 | 
-------------------------------------------------------------------------------- /test/test_default_html_link_extractor.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # test_default_html_link_extractor.py 21-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | 
19 | import unittest
20 | 
21 | from spyder.core.constants import CURI_EXTRACTED_URLS
22 | from spyder.core.settings import Settings
23 | from spyder.processor.htmllinkextractor import DefaultHtmlLinkExtractor
24 | from spyder.thrift.gen.ttypes import CrawlUri
25 | 
26 | 
27 | class HtmlLinkExtractorTest(unittest.TestCase):
28 | 
29 |     def test_that_content_type_restriction_works(self):
30 |         xtor = DefaultHtmlLinkExtractor(Settings())
31 | 
32 |         curi = CrawlUri()
33 |         curi.rep_header = dict()
34 |         curi.rep_header["Content-Type"] = "text/html"
35 |         self.assertTrue(xtor._restrict_content_type(curi))
36 |         curi.rep_header["Content-Type"] = "pille/palle"
37 |         self.assertFalse(xtor._restrict_content_type(curi))
38 | 
39 |     def test_link_extraction_works(self):
40 | 
41 |         src = "<a href='http://www.google.de'> viel text</a>" + \
42 |             "und <a href='/relative.html'>" + \
43 |             "noch mehr!</a>" + \
44 |             "<a href='evenmorerelative.html'></a>"
45 | 
46 |         curi = CrawlUri()
47 |         curi.rep_header = dict()
48 |         curi.rep_header["Content-Type"] = "text/html; charset=utf-8"
49 |         curi.url = "http://www.bmg.bund.de/test/"
50 |         curi.content_body = src
51 |         curi.optional_vars = dict()
52 | 
53 |         xtor = DefaultHtmlLinkExtractor(Settings())
54 |         curi = xtor(curi)
55 | 
56 |         links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
57 |         self.assertEqual("http://www.google.de", links[0])
58 |         self.assertEqual("http://www.bmg.bund.de/relative.html", links[1])
59 |         self.assertEqual("http://www.bmg.bund.de/test/evenmorerelative.html",
60 |             links[2])
61 | 
62 |     def test_link_extraction_with_base_works(self):
63 | 
64 |         src = "<html><head><base href='http://www.bing.com/'></head>" + \
65 |             "<a href='http://www.google.de'> viel text</a>" + \
66 |             "und <a href='/relative.html'>" + \
67 |             "noch mehr!</a><a href='evenmorerelative.html'></a>"
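        # (added note) the <base href> above should make the extractor
        # resolve relative links against http://www.bing.com/ instead of
        # curi.url; the assertions below verify this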
68 | 
69 |         curi = CrawlUri()
70 |         curi.rep_header = dict()
71 |         curi.rep_header["Content-Type"] = "text/html; charset=utf-8"
72 |         curi.url = "http://www.bmg.bund.de/test/"
73 |         curi.content_body = src
74 |         curi.optional_vars = dict()
75 | 
76 |         xtor = DefaultHtmlLinkExtractor(Settings())
77 |         curi = xtor(curi)
78 | 
79 |         links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
80 |         self.assertEqual("http://www.google.de", links[0])
81 |         self.assertEqual("http://www.bing.com/relative.html", links[1])
82 |         self.assertEqual("http://www.bing.com/evenmorerelative.html",
83 |             links[2])
84 | 
85 |     def test_missing_encoding_works(self):
86 |         src = "<a href='http://www.google.de'> viel text</a>" + \
87 |             "und <a href='/relative.html'>" + \
88 |             "noch mehr!</a><a href='evenmorerelative.html'></a>"
89 | 
90 |         curi = CrawlUri()
91 |         curi.rep_header = dict()
92 |         curi.rep_header["Content-Type"] = "text/html"
93 |         curi.url = "http://www.bmg.bund.de/test/"
94 |         curi.content_body = src
95 |         curi.optional_vars = dict()
96 | 
97 |         xtor = DefaultHtmlLinkExtractor(Settings())
98 |         curi = xtor(curi)
99 | 
100 |         links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
101 |         self.assertEqual("http://www.google.de", links[0])
102 |         self.assertEqual("http://www.bmg.bund.de/relative.html", links[1])
103 |         self.assertEqual("http://www.bmg.bund.de/test/evenmorerelative.html",
104 |             links[2])
105 | 
106 | 
107 | if __name__ == '__main__':
108 |     unittest.main()
109 | 
-------------------------------------------------------------------------------- /test/test_dns_cache.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # test_dns_cache.py 25-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | 
19 | import unittest
20 | 
21 | from spyder.core.dnscache import DnsCache
22 | from spyder.core.settings import Settings
23 | 
24 | 
25 | class DnsCacheTest(unittest.TestCase):
26 | 
27 |     def test_dns_cache(self):
28 |         s = Settings()
29 |         s.SIZE_DNS_CACHE = 1
30 |         dns = DnsCache(s)
31 |         self.assertEqual(('127.0.0.1', 80), dns["localhost:80"])
32 |         self.assertEqual(('127.0.0.1', 81), dns["localhost:81"])
33 |         self.assertEqual(1, len(dns._cache))
34 | 
35 |     def test_static_dns_mapping(self):
36 |         s = Settings()
37 |         s.STATIC_DNS_MAPPINGS = {"localhost:123": ("-1.-1.-1.-1", 123)}
38 |         dns = DnsCache(s)
39 |         self.assertEqual(("-1.-1.-1.-1", 123), dns["localhost:123"])
40 |         self.assertEqual(('127.0.0.1', 80), dns["localhost:80"])
41 |         self.assertEqual(1, len(dns._cache))
42 | 
43 | 
44 | if __name__ == '__main__':
45 |     unittest.main()
46 | 
-------------------------------------------------------------------------------- /test/test_fetch_processor.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # test_fetch_processor.py 17-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | import logging
19 | from logging import StreamHandler
20 | import sys
21 | 
22 | import os.path
23 | import time
24 | import random
25 | 
26 | import unittest
27 | 
28 | import tornado
29 | import tornado.httpserver
30 | import tornado.web
31 | 
32 | import zmq
33 | from zmq.eventloop.ioloop import IOLoop
34 | from zmq.eventloop.zmqstream import ZMQStream
35 | 
36 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER
37 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT
38 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK
39 | from spyder.core.messages import DataMessage, MgmtMessage
40 | from spyder.core.mgmt import ZmqMgmt
41 | from spyder.core.worker import AsyncZmqWorker
42 | from spyder.core.settings import Settings
43 | from spyder.processor.fetcher import FetchProcessor
44 | from spyder.encoding import extract_content_type_encoding
45 | from spyder.thrift.gen.ttypes import CrawlUri
46 | 
47 | 
48 | class ZmqTornadoIntegrationTest(unittest.TestCase):
49 | 
50 |     def setUp(self):
51 | 
52 |         # create the io_loop
53 |         self._io_loop = IOLoop.instance()
54 | 
55 |         # and the context
56 |         self._ctx = zmq.Context(1)
57 | 
58 |         # setup the mgmt sockets
59 |         self._setup_mgmt_sockets()
60 | 
61 |         # setup the data sockets
62 |         self._setup_data_sockets()
63 | 
64 |         # setup the management interface
65 |         self._mgmt = ZmqMgmt( self._mgmt_sockets['worker_sub'],
66 |             self._mgmt_sockets['worker_pub'], io_loop=self._io_loop)
67 |         self._mgmt.start()
68 |         self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end)
69 | 
70 |     def tearDown(self):
71 |         # stop the mgmt
72 |         self._mgmt.stop()
73 | 
74 |         # close all sockets
75 |         for socket in self._mgmt_sockets.itervalues():
76 |             socket.close()
77 |         for socket in self._worker_sockets.itervalues():
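            # (added note) every socket has to be closed before ctx.term(),
            # since zmq.Context.term() blocks until all sockets created from
            # the context are closed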
78 |             socket.close()
79 | 
80 |         # terminate the context
81 |         self._ctx.term()
82 | 
83 |     def _setup_mgmt_sockets(self):
84 | 
85 |         self._mgmt_sockets = dict()
86 | 
87 |         # address for the communication from master to worker(s)
88 |         mgmt_master_worker = 'inproc://master/worker/coordination/'
89 | 
90 |         # connect the master with the worker
91 |         # the master is a ZMQStream because we are sending msgs from the test
92 |         sock = self._ctx.socket(zmq.PUB)
93 |         sock.bind(mgmt_master_worker)
94 |         self._mgmt_sockets['tmp1'] = sock
95 |         self._mgmt_sockets['master_pub'] = ZMQStream(sock, self._io_loop)
96 |         # the worker stream is created inside the ZmqMgmt class
97 |         self._mgmt_sockets['worker_sub'] = self._ctx.socket(zmq.SUB)
98 |         self._mgmt_sockets['worker_sub'].setsockopt(zmq.SUBSCRIBE, "")
99 |         self._mgmt_sockets['worker_sub'].connect(mgmt_master_worker)
100 | 
101 |         # address for the communication from worker(s) to master
102 |         mgmt_worker_master = 'inproc://worker/master/coordination/'
103 | 
104 |         # connect the worker with the master
105 |         self._mgmt_sockets['worker_pub'] = self._ctx.socket(zmq.PUB)
106 |         self._mgmt_sockets['worker_pub'].bind(mgmt_worker_master)
107 |         sock = self._ctx.socket(zmq.SUB)
108 |         sock.setsockopt(zmq.SUBSCRIBE, "")
109 |         sock.connect(mgmt_worker_master)
110 |         self._mgmt_sockets['tmp2'] = sock
111 |         self._mgmt_sockets['master_sub'] = ZMQStream(sock, self._io_loop)
112 | 
113 |     def _setup_data_sockets(self):
114 | 
115 |         self._worker_sockets = dict()
116 | 
117 |         # address for master -> worker communication
118 |         data_master_worker = 'inproc://master/worker/pipeline/'
119 | 
120 |         sock = self._ctx.socket(zmq.PUSH)
121 |         sock.bind(data_master_worker)
122 |         self._worker_sockets['tmp3'] = sock
123 |         self._worker_sockets['master_push'] = ZMQStream(sock, self._io_loop)
124 |         self._worker_sockets['worker_pull'] = self._ctx.socket(zmq.PULL)
125 |         self._worker_sockets['worker_pull'].connect(data_master_worker)
126 | 
127 |         # address for worker -> master communication
128 |         data_worker_master = 'inproc://worker/master/pipeline/'
129 | 
130 |         self._worker_sockets['worker_pub'] = self._ctx.socket(zmq.PUB)
131 |         self._worker_sockets['worker_pub'].bind(data_worker_master)
132 |         sock = self._ctx.socket(zmq.SUB)
133 |         sock.setsockopt(zmq.SUBSCRIBE, "")
134 |         sock.connect(data_worker_master)
135 |         self._worker_sockets['tmp4'] = sock
136 |         self._worker_sockets['master_sub'] = ZMQStream(sock, self._io_loop)
137 | 
138 |     def on_mgmt_end(self, _msg):
139 |         self._io_loop.stop()
140 | 
141 | 
142 | class SimpleFetcherTestCase(ZmqTornadoIntegrationTest):
143 | 
144 |     port = 8085
145 | 
146 |     def setUp(self):
147 |         ZmqTornadoIntegrationTest.setUp(self)
148 | 
149 |         path = os.path.join(os.path.dirname(__file__), "static")
150 |         application = tornado.web.Application([
151 |             (r"/(.*)", tornado.web.StaticFileHandler, {"path": path}),
152 |         ])
153 |         self._server = tornado.httpserver.HTTPServer(application, io_loop =
154 |             self._io_loop)
155 |         self._server.listen(self.port)
156 | 
157 |     def tearDown(self):
158 |         ZmqTornadoIntegrationTest.tearDown(self)
159 |         self._server.stop()
160 | 
161 |     def test_content_type_encoding(self):
162 |         rep_header = dict()
163 |         rep_header["Content-Type"] = "text/html; charset=ISO-8859-1"
164 |         (ct, encoding) = extract_content_type_encoding(rep_header["Content-Type"])
165 |         self.assertEqual("text/html", ct)
166 |         self.assertEqual("iso_8859_1", encoding)
167 | 
168 |     def test_fetching_works(self):
169 | 
170 |         settings = Settings()
171 |         fetcher = FetchProcessor(settings, io_loop=self._io_loop)
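        # (added note) the FetchProcessor is handed to the worker as its
        # processing callable: each DataMessage pulled from 'worker_pull' is
        # fetched and the result is published on 'worker_pub'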
172 | 173 | worker = AsyncZmqWorker( self._worker_sockets['worker_pull'], 174 | self._worker_sockets['worker_pub'], 175 | self._mgmt, 176 | fetcher, 177 | StreamHandler(sys.stdout), 178 | logging.DEBUG, 179 | self._io_loop) 180 | worker.start() 181 | 182 | curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port, 183 | effective_url="http://127.0.0.1:%s/robots.txt" % self.port, 184 | ) 185 | msg = DataMessage() 186 | msg.identity = "me" 187 | msg.curi = curi 188 | 189 | self._worker_sockets['master_push'].send_multipart(msg.serialize()) 190 | 191 | def assert_expected_result_and_stop(raw_msg): 192 | msg = DataMessage(raw_msg) 193 | robots = open(os.path.join(os.path.dirname(__file__), 194 | "static/robots.txt")).read() 195 | self.assertEqual(robots, msg.curi.content_body) 196 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 197 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 198 | self._mgmt_sockets['master_pub'].send_multipart(death.serialize()) 199 | 200 | self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop) 201 | 202 | self._io_loop.start() 203 | 204 | 205 | if __name__ == '__main__': 206 | unittest.main() 207 | -------------------------------------------------------------------------------- /test/test_fetch_processor_last_modified_works.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_fetch_processor_last_modified_works.py 17-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | #
18 | import logging
19 | from logging import StreamHandler
20 | import sys
21 | 
22 | import os
23 | import os.path
24 | import time
25 | from datetime import datetime
26 | import random
27 | 
28 | import unittest
29 | 
30 | import tornado
31 | import tornado.httpserver
32 | import tornado.web
33 | 
34 | import zmq
35 | from zmq.eventloop.ioloop import IOLoop
36 | from zmq.eventloop.zmqstream import ZMQStream
37 | 
38 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER
39 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT
40 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK
41 | from spyder.core.messages import DataMessage, MgmtMessage
42 | from spyder.time import serialize_date_time
43 | from spyder.core.mgmt import ZmqMgmt
44 | from spyder.core.worker import AsyncZmqWorker
45 | from spyder.core.settings import Settings
46 | from spyder.processor.fetcher import FetchProcessor
47 | from spyder.thrift.gen.ttypes import CrawlUri
48 | 
49 | 
50 | class ZmqTornadoIntegrationTest(unittest.TestCase):
51 | 
52 |     def setUp(self):
53 | 
54 |         # create the io_loop
55 |         self._io_loop = IOLoop.instance()
56 | 
57 |         # and the context
58 |         self._ctx = zmq.Context(1)
59 | 
60 |         # setup the mgmt sockets
61 |         self._setup_mgmt_sockets()
62 | 
63 |         # setup the data sockets
64 |         self._setup_data_sockets()
65 | 
66 |         # setup the management interface
67 |         self._mgmt = ZmqMgmt( self._mgmt_sockets['worker_sub'],
68 |             self._mgmt_sockets['worker_pub'], io_loop=self._io_loop)
69 |         self._mgmt.start()
70 |         self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end)
71 | 
72 |     def tearDown(self):
73 |         # stop the mgmt
74 |         self._mgmt.stop()
75 | 
76 |         # close all sockets
77 |         for socket in self._mgmt_sockets.itervalues():
78 |             socket.close()
79 |         for socket in self._worker_sockets.itervalues():
80 |             socket.close()
81 | 
82 |         # terminate the context
83 |         self._ctx.term()
84 | 
85 |     def _setup_mgmt_sockets(self):
86 | 
87 |         self._mgmt_sockets = dict()
88 | 
89 |         # address for the communication from master to worker(s)
90 |         mgmt_master_worker = 'inproc://master/worker/coordination/'
91 | 
92 |         # connect the master with the worker
93 |         # the master is a ZMQStream because we are sending msgs from the test
94 |         sock = self._ctx.socket(zmq.PUB)
95 |         sock.bind(mgmt_master_worker)
96 |         self._mgmt_sockets['tmp1'] = sock
97 |         self._mgmt_sockets['master_pub'] = ZMQStream(sock, self._io_loop)
98 |         # the worker stream is created inside the ZmqMgmt class
99 |         self._mgmt_sockets['worker_sub'] = self._ctx.socket(zmq.SUB)
100 |         self._mgmt_sockets['worker_sub'].setsockopt(zmq.SUBSCRIBE, "")
101 |         self._mgmt_sockets['worker_sub'].connect(mgmt_master_worker)
102 | 
103 |         # address for the communication from worker(s) to master
104 |         mgmt_worker_master = 'inproc://worker/master/coordination/'
105 | 
106 |         # connect the worker with the master
107 |         self._mgmt_sockets['worker_pub'] = self._ctx.socket(zmq.PUB)
108 |         self._mgmt_sockets['worker_pub'].bind(mgmt_worker_master)
109 |         sock = self._ctx.socket(zmq.SUB)
110 |         sock.setsockopt(zmq.SUBSCRIBE, "")
111 |         sock.connect(mgmt_worker_master)
112 |         self._mgmt_sockets['tmp2'] = sock
113 |         self._mgmt_sockets['master_sub'] = ZMQStream(sock, self._io_loop)
114 | 
115 |     def _setup_data_sockets(self):
116 | 
117 |         self._worker_sockets = dict()
118 | 
119 |         # address for master -> worker communication
120 |         data_master_worker = 'inproc://master/worker/pipeline/'
121 | 
122 |         sock = self._ctx.socket(zmq.PUSH)
123 |         sock.bind(data_master_worker)
124 |         self._worker_sockets['tmp3'] =
sock 125 | self._worker_sockets['master_push'] = ZMQStream(sock, self._io_loop) 126 | self._worker_sockets['worker_pull'] = self._ctx.socket(zmq.PULL) 127 | self._worker_sockets['worker_pull'].connect(data_master_worker) 128 | 129 | # address for worker -> master communication 130 | data_worker_master = 'inproc://worker/master/pipeline/' 131 | 132 | self._worker_sockets['worker_pub'] = self._ctx.socket(zmq.PUB) 133 | self._worker_sockets['worker_pub'].bind(data_worker_master) 134 | sock = self._ctx.socket(zmq.SUB) 135 | sock.setsockopt(zmq.SUBSCRIBE, "") 136 | sock.connect(data_worker_master) 137 | self._worker_sockets['tmp4'] = sock 138 | self._worker_sockets['master_sub'] = ZMQStream(sock, self._io_loop) 139 | 140 | def on_mgmt_end(self, _msg): 141 | self._io_loop.stop() 142 | 143 | 144 | class SimpleFetcherTestCase(ZmqTornadoIntegrationTest): 145 | 146 | port = 8085 147 | 148 | def setUp(self): 149 | ZmqTornadoIntegrationTest.setUp(self) 150 | 151 | self._path = os.path.join(os.path.dirname(__file__), "static") 152 | application = tornado.web.Application([ 153 | (r"/(.*)", tornado.web.StaticFileHandler, {"path": self._path}), 154 | ]) 155 | self._server = tornado.httpserver.HTTPServer(application, io_loop = 156 | self._io_loop) 157 | self._server.listen(self.port) 158 | 159 | def tearDown(self): 160 | ZmqTornadoIntegrationTest.tearDown(self) 161 | self._server.stop() 162 | 163 | def test_fetching_last_modified_works(self): 164 | 165 | settings = Settings() 166 | fetcher = FetchProcessor(settings, io_loop=self._io_loop) 167 | 168 | worker = AsyncZmqWorker( self._worker_sockets['worker_pull'], 169 | self._worker_sockets['worker_pub'], 170 | self._mgmt, 171 | fetcher, 172 | StreamHandler(sys.stdout), 173 | logging.DEBUG, 174 | self._io_loop) 175 | worker.start() 176 | 177 | mtimestamp = datetime.fromtimestamp(os.stat(os.path.join(self._path, 178 | "robots.txt")).st_mtime) 179 | mtime = serialize_date_time(mtimestamp) 180 | curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port, 181 | effective_url="http://127.0.0.1:%s/robots.txt" % self.port, 182 | req_header = { "Last-Modified" : 183 | mtime } 184 | ) 185 | 186 | msg = DataMessage() 187 | msg.identity = "me" 188 | msg.curi = curi 189 | 190 | def assert_expected_result_and_stop(raw_msg): 191 | msg = DataMessage(raw_msg) 192 | self.assertEqual(304, msg.curi.status_code) 193 | self.assertEqual("", msg.curi.content_body) 194 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 195 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 196 | self._mgmt_sockets['master_pub'].send_multipart(death.serialize()) 197 | 198 | self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop) 199 | 200 | self._worker_sockets['master_push'].send_multipart(msg.serialize()) 201 | 202 | self._io_loop.start() 203 | 204 | 205 | if __name__ == '__main__': 206 | unittest.main() 207 | -------------------------------------------------------------------------------- /test/test_fetch_processor_with_etag.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_fetch_processor_with_etag.py 17-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | import logging
19 | from logging import StreamHandler
20 | import sys
21 | 
22 | import os.path
23 | import time
24 | import random
25 | 
26 | import unittest
27 | 
28 | import tornado
29 | import tornado.httpserver
30 | import tornado.web
31 | 
32 | import zmq
33 | from zmq.eventloop.ioloop import IOLoop
34 | from zmq.eventloop.zmqstream import ZMQStream
35 | 
36 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER
37 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT
38 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK
39 | from spyder.core.messages import DataMessage, MgmtMessage
40 | from spyder.core.mgmt import ZmqMgmt
41 | from spyder.core.worker import AsyncZmqWorker
42 | from spyder.core.settings import Settings
43 | from spyder.processor.fetcher import FetchProcessor
44 | from spyder.thrift.gen.ttypes import CrawlUri
45 | 
46 | 
47 | class ZmqTornadoIntegrationTest(unittest.TestCase):
48 | 
49 |     def setUp(self):
50 | 
51 |         # create the io_loop
52 |         self._io_loop = IOLoop.instance()
53 | 
54 |         # and the context
55 |         self._ctx = zmq.Context(1)
56 | 
57 |         # setup the mgmt sockets
58 |         self._setup_mgmt_sockets()
59 | 
60 |         # setup the data sockets
61 |         self._setup_data_sockets()
62 | 
63 |         # setup the management interface
64 |         self._mgmt = ZmqMgmt( self._mgmt_sockets['worker_sub'],
65 |             self._mgmt_sockets['worker_pub'], io_loop=self._io_loop)
66 |         self._mgmt.start()
67 |         self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end)
68 | 
69 |     def tearDown(self):
70 |         # stop the mgmt
71 |         self._mgmt.stop()
72 | 
73 |         # close all sockets
74 |         for socket in self._mgmt_sockets.itervalues():
75 |             socket.close()
76 |         for socket in self._worker_sockets.itervalues():
77 |             socket.close()
78 | 
79 |         # terminate the context
80 |         self._ctx.term()
81 | 
82 |     def _setup_mgmt_sockets(self):
83 | 
84 |         self._mgmt_sockets = dict()
85 | 
86 |         # address for the communication from master to worker(s)
87 |         mgmt_master_worker = 'inproc://master/worker/coordination/'
88 | 
89 |         # connect the master with the worker
90 |         # the master is a ZMQStream because we are sending msgs from the test
91 |         sock = self._ctx.socket(zmq.PUB)
92 |         sock.bind(mgmt_master_worker)
93 |         self._mgmt_sockets['tmp1'] = sock
94 |         self._mgmt_sockets['master_pub'] = ZMQStream(sock, self._io_loop)
95 |         # the worker stream is created inside the ZmqMgmt class
96 |         self._mgmt_sockets['worker_sub'] = self._ctx.socket(zmq.SUB)
97 |         self._mgmt_sockets['worker_sub'].setsockopt(zmq.SUBSCRIBE, "")
98 |         self._mgmt_sockets['worker_sub'].connect(mgmt_master_worker)
99 | 
100 |         # address for the communication from worker(s) to master
101 |         mgmt_worker_master = 'inproc://worker/master/coordination/'
102 | 
103 |         # connect the worker with the master
104 |         self._mgmt_sockets['worker_pub'] = self._ctx.socket(zmq.PUB)
105 |         self._mgmt_sockets['worker_pub'].bind(mgmt_worker_master)
106 |         sock = self._ctx.socket(zmq.SUB)
107 |         sock.setsockopt(zmq.SUBSCRIBE, "")
108 |         sock.connect(mgmt_worker_master)
109 |         self._mgmt_sockets['tmp2'] = sock
110 |         self._mgmt_sockets['master_sub'] =
ZMQStream(sock, self._io_loop) 111 | 112 | def _setup_data_sockets(self): 113 | 114 | self._worker_sockets = dict() 115 | 116 | # address for master -> worker communication 117 | data_master_worker = 'inproc://master/worker/pipeline/' 118 | 119 | sock = self._ctx.socket(zmq.PUSH) 120 | sock.bind(data_master_worker) 121 | self._worker_sockets['tmp3'] = sock 122 | self._worker_sockets['master_push'] = ZMQStream(sock, self._io_loop) 123 | self._worker_sockets['worker_pull'] = self._ctx.socket(zmq.PULL) 124 | self._worker_sockets['worker_pull'].connect(data_master_worker) 125 | 126 | # address for worker -> master communication 127 | data_worker_master = 'inproc://worker/master/pipeline/' 128 | 129 | self._worker_sockets['worker_pub'] = self._ctx.socket(zmq.PUB) 130 | self._worker_sockets['worker_pub'].bind(data_worker_master) 131 | sock = self._ctx.socket(zmq.SUB) 132 | sock.setsockopt(zmq.SUBSCRIBE, "") 133 | sock.connect(data_worker_master) 134 | self._worker_sockets['tmp4'] = sock 135 | self._worker_sockets['master_sub'] = ZMQStream(sock, self._io_loop) 136 | 137 | def on_mgmt_end(self, _msg): 138 | self._io_loop.stop() 139 | 140 | 141 | class SimpleFetcherTestCase(ZmqTornadoIntegrationTest): 142 | 143 | port = 8085 144 | 145 | def setUp(self): 146 | ZmqTornadoIntegrationTest.setUp(self) 147 | 148 | path = os.path.join(os.path.dirname(__file__), "static") 149 | application = tornado.web.Application([ 150 | (r"/(.*)", tornado.web.StaticFileHandler, {"path": path}), 151 | ]) 152 | self._server = tornado.httpserver.HTTPServer(application, io_loop = 153 | self._io_loop) 154 | self._server.listen(self.port) 155 | 156 | def tearDown(self): 157 | ZmqTornadoIntegrationTest.tearDown(self) 158 | self._server.stop() 159 | 160 | def test_fetching_etag_works(self): 161 | 162 | settings = Settings() 163 | fetcher = FetchProcessor(settings, io_loop=self._io_loop) 164 | 165 | worker = AsyncZmqWorker( self._worker_sockets['worker_pull'], 166 | self._worker_sockets['worker_pub'], 167 | self._mgmt, 168 | fetcher, 169 | StreamHandler(sys.stdout), 170 | logging.DEBUG, 171 | self._io_loop) 172 | worker.start() 173 | 174 | curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port, 175 | effective_url="http://127.0.0.1:%s/robots.txt" % self.port, 176 | req_header = { "Etag" : 177 | "\"3926227169c58185234888b60000c6eb1169577d\"" } 178 | ) 179 | 180 | msg = DataMessage() 181 | msg.identity = "me" 182 | msg.curi = curi 183 | 184 | def assert_expected_result_and_stop(raw_msg): 185 | msg = DataMessage(raw_msg) 186 | self.assertEqual(304, msg.curi.status_code) 187 | self.assertEqual("", msg.curi.content_body) 188 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 189 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 190 | self._mgmt_sockets['master_pub'].send_multipart(death.serialize()) 191 | 192 | self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop) 193 | 194 | self._worker_sockets['master_push'].send_multipart(msg.serialize()) 195 | 196 | self._io_loop.start() 197 | 198 | 199 | if __name__ == '__main__': 200 | unittest.main() 201 | -------------------------------------------------------------------------------- /test/test_http_extractor.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_http_extractor.py 17-Mar-2011 5 | # 6 | # Licensed to the Apache Software Foundation (ASF) under one 7 | # or more contributor license agreements. 
See the NOTICE file 8 | # distributed with this work for additional information 9 | # regarding copyright ownership. The ASF licenses this file 10 | # to you under the Apache License, Version 2.0 (the 11 | # "License"); you may not use this file except in compliance 12 | # with the License. You may obtain a copy of the License at 13 | # 14 | # http://www.apache.org/licenses/LICENSE-2.0 15 | # 16 | # Unless required by applicable law or agreed to in writing, 17 | # software distributed under the License is distributed on an 18 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | # KIND, either express or implied. See the License for the 20 | # specific language governing permissions and limitations 21 | # under the License. 22 | # 23 | # 24 | import unittest 25 | 26 | from spyder.core.constants import CURI_EXTRACTED_URLS 27 | from spyder.core.settings import Settings 28 | from spyder.processor.httpextractor import HttpExtractor 29 | from spyder.thrift.gen.ttypes import CrawlUri 30 | 31 | 32 | class HttpExtractorTest(unittest.TestCase): 33 | 34 | def test_correct_extraction(self): 35 | 36 | s = Settings() 37 | 38 | curi = CrawlUri("http://localhost") 39 | curi.status_code = 302 40 | curi.rep_header = {"Location": "http://localhost/index.html"} 41 | curi.optional_vars = dict() 42 | 43 | xtor = HttpExtractor(s) 44 | curi = xtor(curi) 45 | 46 | self.assertTrue(CURI_EXTRACTED_URLS in curi.optional_vars) 47 | self.assertEquals("http://localhost/index.html", 48 | curi.optional_vars[CURI_EXTRACTED_URLS]) 49 | 50 | def test_only_on_redirect(self): 51 | 52 | s = Settings() 53 | 54 | curi = CrawlUri("http://localhost") 55 | curi.status_code = 200 56 | curi.rep_header = {"Location": "http://localhost/index.html"} 57 | curi.optional_vars = dict() 58 | 59 | xtor = HttpExtractor(s) 60 | curi = xtor(curi) 61 | 62 | self.assertFalse(CURI_EXTRACTED_URLS in curi.optional_vars) 63 | 64 | def test_relative_links(self): 65 | 66 | s = Settings() 67 | 68 | curi = CrawlUri("http://localhost") 69 | curi.status_code = 303 70 | curi.rep_header = {"Location": "/index.html"} 71 | curi.optional_vars = dict() 72 | 73 | xtor = HttpExtractor(s) 74 | curi = xtor(curi) 75 | 76 | self.assertTrue(CURI_EXTRACTED_URLS in curi.optional_vars) 77 | self.assertEquals("http://localhost/index.html", 78 | curi.optional_vars[CURI_EXTRACTED_URLS]) 79 | -------------------------------------------------------------------------------- /test/test_limiter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_limiter.py 18-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | import unittest 20 | 21 | from spyder.core.constants import CURI_EXTRACTION_FINISHED, CURI_OPTIONAL_TRUE 22 | from spyder.processor import limiter 23 | from spyder.thrift.gen.ttypes import CrawlUri 24 | 25 | 26 | class LimiterTestCase(unittest.TestCase): 27 | 28 | def test_do_not_process_robots_works(self): 29 | 30 | curi = CrawlUri() 31 | curi.effective_url = "http://127.0.0.1/robots.txt" 32 | curi.optional_vars = dict() 33 | 34 | l = limiter.DefaultLimiter(None) 35 | 36 | for i in range(2): 37 | l._do_not_process_robots(curi) 38 | self.assertEqual(CURI_OPTIONAL_TRUE, 39 | curi.optional_vars[CURI_EXTRACTION_FINISHED]) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /test/test_masterprocess.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_masterprocess.py 07-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | import logging 19 | import unittest 20 | 21 | import sys 22 | 23 | from spyder.core.settings import Settings 24 | from spyder import masterprocess 25 | 26 | 27 | class MasterProcessTest(unittest.TestCase): 28 | 29 | def test_create_frontier_works(self): 30 | 31 | handler = logging.StreamHandler(sys.stdout) 32 | s = Settings() 33 | s.FRONTIER_STATE_FILE = ":memory:" 34 | 35 | frontier = masterprocess.create_frontier(s, handler) 36 | 37 | self.assertTrue(frontier is not None) 38 | -------------------------------------------------------------------------------- /test/test_messages.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_messages.py 14-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | import unittest 20 | 21 | from spyder.core.messages import DataMessage, MgmtMessage 22 | from spyder.core.messages import serialize_crawl_uri, deserialize_crawl_uri 23 | from spyder.thrift.gen.ttypes import CrawlUri 24 | 25 | class TestMessages(unittest.TestCase): 26 | 27 | def test_that_serialization_works(self): 28 | 29 | curi = CrawlUri(url="http://localhost") 30 | 31 | serialized = serialize_crawl_uri(curi) 32 | deserialized = deserialize_crawl_uri(serialized) 33 | 34 | self.assertEqual(curi, deserialized) 35 | 36 | def test_that_data_messages_work(self): 37 | identity = "me myself and i" 38 | curi = CrawlUri(url="http://localhost") 39 | serialized = serialize_crawl_uri(curi) 40 | 41 | msg = DataMessage([identity, serialized]) 42 | 43 | self.assertEqual(identity, msg.identity) 44 | self.assertEqual(curi, msg.curi) 45 | self.assertEqual([identity, serialized], msg.serialize()) 46 | self.assertEqual(msg, DataMessage(msg.serialize())) 47 | 48 | def test_that_mgmt_messages_work(self): 49 | topic = "me" 50 | identity = "myself" 51 | data = "and i" 52 | 53 | msg = MgmtMessage([topic, identity, data]) 54 | 55 | self.assertEqual(topic, msg.topic) 56 | self.assertEqual(identity, msg.identity) 57 | self.assertEqual(data, msg.data) 58 | self.assertEqual([topic, identity, data], msg.serialize()) 59 | self.assertEqual(msg, MgmtMessage(msg.serialize())) 60 | 61 | def test_that_construction_works(self): 62 | msg = DataMessage(identity="me") 63 | self.assertEqual("me", msg.identity) 64 | self.assertEqual(None, msg.curi) 65 | 66 | msg = DataMessage(curi="bla") 67 | self.assertEqual("bla", msg.curi) 68 | self.assertEqual(None, msg.identity) 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /test/test_mgmt.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_mgmt.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | import unittest 20 | 21 | import time 22 | 23 | import zmq 24 | from zmq.eventloop.ioloop import IOLoop 25 | from zmq.eventloop.zmqstream import ZMQStream 26 | 27 | from spyder.core.messages import MgmtMessage 28 | from spyder.core.mgmt import ZmqMgmt 29 | from spyder.core.constants import * 30 | 31 | 32 | class ManagementIntegrationTest(unittest.TestCase): 33 | 34 | 35 | def setUp(self): 36 | self._io_loop = IOLoop.instance() 37 | self._ctx = zmq.Context(1) 38 | 39 | sock = self._ctx.socket(zmq.PUB) 40 | sock.bind('inproc://master/worker/coordination') 41 | self._master_pub_sock = sock 42 | self._master_pub = ZMQStream(sock, self._io_loop) 43 | 44 | self._worker_sub = self._ctx.socket(zmq.SUB) 45 | self._worker_sub.setsockopt(zmq.SUBSCRIBE, "") 46 | self._worker_sub.connect('inproc://master/worker/coordination') 47 | 48 | self._worker_pub = self._ctx.socket(zmq.PUB) 49 | self._worker_pub.bind( 'inproc://worker/master/coordination' ) 50 | 51 | sock = self._ctx.socket(zmq.SUB) 52 | sock.setsockopt(zmq.SUBSCRIBE, "") 53 | sock.connect( 'inproc://worker/master/coordination' ) 54 | self._master_sub_sock = sock 55 | self._master_sub = ZMQStream(sock, self._io_loop) 56 | 57 | self._topic = ZMQ_SPYDER_MGMT_WORKER + 'testtopic' 58 | 59 | def tearDown(self): 60 | self._master_pub.close() 61 | self._master_pub_sock.close() 62 | self._worker_sub.close() 63 | self._worker_pub.close() 64 | self._master_sub.close() 65 | self._master_sub_sock.close() 66 | self._ctx.term() 67 | 68 | def call_me(self, msg): 69 | self.assertEqual(self._topic, msg.topic) 70 | self.assertEqual('test'.encode(), msg.data) 71 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 72 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 73 | self._master_pub.send_multipart(death.serialize()) 74 | 75 | def on_end(self, msg): 76 | self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT, msg.data) 77 | self._io_loop.stop() 78 | 79 | 80 | def test_simple_mgmt_session(self): 81 | 82 | mgmt = ZmqMgmt(self._worker_sub, self._worker_pub, io_loop=self._io_loop) 83 | mgmt.start() 84 | 85 | self.assertRaises(ValueError, mgmt.add_callback, "test", "test") 86 | 87 | mgmt.add_callback(self._topic, self.call_me) 88 | mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_end) 89 | 90 | test_msg = MgmtMessage(topic=self._topic, data='test'.encode()) 91 | self._master_pub.send_multipart(test_msg.serialize()) 92 | 93 | def assert_correct_mgmt_answer(raw_msg): 94 | msg = MgmtMessage(raw_msg) 95 | self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg.data) 96 | mgmt.remove_callback(self._topic, self.call_me) 97 | mgmt.remove_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_end) 98 | self.assertEqual({}, mgmt._callbacks) 99 | 100 | self._master_sub.on_recv(assert_correct_mgmt_answer) 101 | 102 | self._io_loop.start() 103 | 104 | 105 | if __name__ == '__main__': 106 | unittest.main() 107 | -------------------------------------------------------------------------------- /test/test_multiple_frontier.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_multiple_frontier.py 31-Mar-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | import logging
19 | from logging import StreamHandler
20 | 
21 | from datetime import datetime
22 | from datetime import timedelta
23 | import time
24 | import unittest
25 | import sys
26 | 
27 | from spyder.core.frontier import MultipleHostFrontier
28 | from spyder.core.settings import Settings
29 | from spyder.time import serialize_date_time, deserialize_date_time
30 | from spyder.thrift.gen.ttypes import CrawlUri
31 | 
32 | 
33 | class MultipleHostFrontierTest(unittest.TestCase):
34 | 
35 |     def test_that_adding_uris_works(self):
36 | 
37 |         s = Settings()
38 |         s.FRONTIER_STATE_FILE = ":memory:"
39 | 
40 |         frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))
41 | 
42 |         now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
43 |         next_crawl_date = now + timedelta(days=1)
44 |         curi = CrawlUri("http://localhost")
45 |         curi.rep_header = { "Etag" : "123", "Date" : serialize_date_time(now) }
46 |         curi.current_priority = 2
47 | 
48 |         frontier.add_uri(curi)
49 | 
50 |         cur = frontier._front_end_queues._cursor
51 | 
52 |         curi = CrawlUri("http://foreignhost")
53 |         curi.rep_header = { "Etag" : "123", "Date" : serialize_date_time(now) }
54 |         curi.current_priority = 1
55 | 
56 |         frontier.add_uri(curi)
57 | 
58 |         idents = {"localhost": -1, "foreignhost": -1}
59 |         cur.execute("SELECT * FROM queue_identifiers")
60 |         for row in cur:
61 |             self.assertTrue(row['identifier'] in idents.keys())
62 |             idents["http://%s" % row['identifier']] = row['queue']
63 | 
64 |         cur.execute("SELECT * FROM queues")
65 |         for row in cur:
66 |             self.assertEqual(idents[row['url']], row['queue'])
67 | 
68 |         self.assertEqual(2, frontier._front_end_queues.get_queue_count())
69 | 
70 |     def test_queues_work(self):
71 | 
72 |         s = Settings()
73 |         s.FRONTIER_STATE_FILE = ":memory:"
74 |         s.FRONTIER_ACTIVE_QUEUES = 1
75 |         s.FRONTIER_QUEUE_BUDGET = 4
76 |         s.FRONTIER_QUEUE_BUDGET_PUNISH = 5
77 | 
78 |         frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))
79 | 
80 |         now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
81 |         curi1 = CrawlUri("http://localhost")
82 |         curi1.current_priority = 2
83 |         curi1.req_time = 0.4
84 | 
85 |         frontier.add_uri(curi1)
86 | 
87 |         cur = frontier._front_end_queues._cursor
88 | 
89 |         curi2 = CrawlUri("http://foreignhost")
90 |         curi2.current_priority = 1
91 |         curi2.req_time = 1.4
92 | 
93 |         frontier.add_uri(curi2)
94 | 
95 |         self.assertEqual(0, len(frontier._current_queues))
96 |         frontier._maybe_add_queues()
97 | 
98 |         self.assertEqual(1, len(frontier._current_queues))
99 |         for q1 in frontier._current_queues.keys():
100 |             pass
101 | 
102 |         self.assertEquals(4, frontier._budget_politeness[q1])
103 |         frontier._cleanup_budget_politeness()
104 |         self.assertEquals(4, frontier._budget_politeness[q1])
105 | 
106 |         frontier._update_heap()
107 |         self.assertEqual(1, len(frontier._current_queues))
108 | 
109 |         if q1 == 1:
110 |             curi1.status_code = 500
111 |             frontier.process_server_error(curi1)
112 |         else:
113 |             curi2.status_code = 500
114 |             frontier.process_server_error(curi2)
115 | 
116 |         self.assertEquals(-1, frontier._budget_politeness[q1])
117 | 
118 | 
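        # (added note) the politeness budget is now below zero, so the next
        # cleanup run is expected to retire this queue and make room for the
        # other host's queue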
frontier._cleanup_budget_politeness() 119 | 120 | self.assertEqual(1, len(frontier._current_queues)) 121 | for q2 in frontier._current_queues.keys(): 122 | pass 123 | 124 | self.assertEqual(4, frontier._budget_politeness[q2]) 125 | frontier._cleanup_budget_politeness() 126 | self.assertEqual(4, frontier._budget_politeness[q2]) 127 | 128 | frontier._update_heap() 129 | self.assertEqual(1, len(frontier._current_queues)) 130 | 131 | if q2 == 1: 132 | curi1.status_code = 200 133 | frontier.process_successful_crawl(curi1) 134 | else: 135 | curi2.status_code = 200 136 | frontier.process_successful_crawl(curi2) 137 | 138 | self.assertEqual(3, frontier._budget_politeness[q2]) 139 | 140 | frontier._cleanup_budget_politeness() 141 | 142 | def test_with_multiple_active_queues(self): 143 | 144 | s = Settings() 145 | s.FRONTIER_STATE_FILE = ":memory:" 146 | s.FRONTIER_ACTIVE_QUEUES = 2 147 | s.FRONTIER_QUEUE_BUDGET = 4 148 | s.FRONTIER_QUEUE_BUDGET_PUNISH = 5 149 | 150 | frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout)) 151 | 152 | now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6]) 153 | curi1 = CrawlUri("http://localhost") 154 | curi1.current_priority = 2 155 | curi1.req_time = 0.4 156 | 157 | frontier.add_uri(curi1) 158 | 159 | cur = frontier._front_end_queues._cursor 160 | 161 | curi2 = CrawlUri("http://www.google.de") 162 | curi2.current_priority = 1 163 | curi2.req_time = 1.4 164 | 165 | frontier.add_uri(curi2) 166 | 167 | self.assertEqual(0, len(frontier._current_queues)) 168 | frontier._maybe_add_queues() 169 | 170 | self.assertEqual(2, len(frontier._current_queues)) 171 | 172 | next_url = frontier.get_next() 173 | 174 | 175 | if __name__ == '__main__': 176 | unittest.main() 177 | -------------------------------------------------------------------------------- /test/test_queue_assignment.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_queue_assignment.py 31-Mar-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
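# Both strategies below answer the same question (which politeness queue does
# a URI belong to?) but key the queues differently: the host-based variant
# uses the hostname as-is, while the IP-based variant resolves the host
# through the DnsCache first, so virtual hosts sharing an IP share a queue.
# A quick sketch, taken from the tests themselves:
#
#   assign = HostBasedQueueAssignment(DnsCache(Settings()))
#   assign.get_identifier("http://www.google.com/pille/palle")  # 'www.google.com'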
17 | # 18 | import unittest 19 | 20 | from spyder.core.settings import Settings 21 | from spyder.core.dnscache import DnsCache 22 | from spyder.core.queueassignment import HostBasedQueueAssignment 23 | from spyder.core.queueassignment import IpBasedQueueAssignment 24 | 25 | class HostBasedQueueAssignmentTest(unittest.TestCase): 26 | 27 | def test_host_based_assignment(self): 28 | 29 | s = Settings() 30 | dns = DnsCache(s) 31 | assign = HostBasedQueueAssignment(dns) 32 | 33 | url = "http://www.google.com/pille/palle" 34 | self.assertEqual("www.google.com", assign.get_identifier(url)) 35 | 36 | 37 | 38 | class IpBasedQueueAssignmentTest(unittest.TestCase): 39 | 40 | def test_ip_based_assignment(self): 41 | 42 | s = Settings() 43 | dns = DnsCache(s) 44 | assign = IpBasedQueueAssignment(dns) 45 | 46 | url = "http://localhost:12345/this" 47 | self.assertEqual("127.0.0.1", assign.get_identifier(url)) 48 | 49 | if __name__ == '__main__': 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /test/test_queue_selector.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_queue_selector.py 25-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import unittest 20 | 21 | from collections import defaultdict 22 | 23 | from spyder.core.queueselector import BiasedQueueSelector 24 | 25 | 26 | class BiasedQueueSelectorTest(unittest.TestCase): 27 | 28 | def test_histogram(self): 29 | 30 | # create a selector with 10 queues 31 | selector = BiasedQueueSelector(10) 32 | 33 | histogram = defaultdict(int) 34 | 35 | for i in xrange(100000): 36 | histogram[selector.get_queue()] += 1 37 | 38 | for i in range(1,9): 39 | self.assertTrue(histogram[i] > histogram[i+1]) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /test/test_regex_scoper.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_regex_scoper.py 24-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
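# The RegexScoper rewrites CURI_EXTRACTED_URLS in place: a URL is kept only
# if it matches one of the REGEX_SCOPE_POSITIVE patterns and none of the
# REGEX_SCOPE_NEGATIVE ones, which is exactly what the two assertions in the
# test below verify.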
17 | # 18 | 19 | import unittest 20 | 21 | from spyder.core.constants import CURI_EXTRACTED_URLS 22 | from spyder.core.settings import Settings 23 | from spyder.thrift.gen.ttypes import CrawlUri 24 | 25 | from spyder.processor.scoper import RegexScoper 26 | 27 | class RegexScoperTest(unittest.TestCase): 28 | 29 | def test_regex_scoper(self): 30 | 31 | curi = CrawlUri() 32 | curi.optional_vars = dict() 33 | curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join([ 34 | "http://www.google.de/index.html", 35 | "ftp://www.google.de/pillepalle.avi", 36 | ]) 37 | 38 | settings = Settings() 39 | settings.REGEX_SCOPE_POSITIVE = [r'^.*\.html'] 40 | settings.REGEX_SCOPE_NEGATIVE = [r'^.*\.avi'] 41 | scoper = RegexScoper(settings) 42 | 43 | curi = scoper(curi) 44 | 45 | 46 | self.assertTrue("http://www.google.de/index.html" in 47 | curi.optional_vars[CURI_EXTRACTED_URLS]) 48 | self.assertFalse("ftp://www.google.de/pillepalle.avi" in 49 | curi.optional_vars[CURI_EXTRACTED_URLS]) 50 | 51 | 52 | if __name__ == '__main__': 53 | unittest.main() 54 | -------------------------------------------------------------------------------- /test/test_settings.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_settings.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import unittest 20 | 21 | 22 | class SettingsTest(unittest.TestCase): 23 | 24 | def test_loading_default_settings_works(self): 25 | 26 | from spyder import defaultsettings 27 | from spyder.core.settings import Settings 28 | 29 | settings = Settings() 30 | self.assertEqual(defaultsettings.ZEROMQ_MGMT_MASTER, 31 | settings.ZEROMQ_MGMT_MASTER) 32 | 33 | 34 | def test_loading_custom_settings_works(self): 35 | 36 | from spyder import defaultsettings 37 | from spyder.core.settings import Settings 38 | 39 | import test_settings_settings 40 | settings = Settings(test_settings_settings) 41 | 42 | self.assertEqual(test_settings_settings.ZEROMQ_MGMT_WORKER, 43 | settings.ZEROMQ_MGMT_WORKER) 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /test/test_settings_settings.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_settings_settings.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ZEROMQ_MGMT_WORKER = "test" 20 | -------------------------------------------------------------------------------- /test/test_sqlite_queues.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_sqlite_queues.py 25-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import unittest 20 | 21 | import time 22 | 23 | from spyder.core.sqlitequeues import SQLiteSingleHostUriQueue, UriNotFound 24 | 25 | 26 | class SqliteQueuesTest(unittest.TestCase): 27 | 28 | def test_adding_works(self): 29 | 30 | uri = ("http://localhost", "etag", int(time.time()*1000), 31 | int(time.time() * 1000), 1) 32 | 33 | q = SQLiteSingleHostUriQueue(":memory:") 34 | q.add_uri(uri) 35 | 36 | self.assertEqual(1, len(q)) 37 | 38 | cursor = q._connection.execute("SELECT * FROM queue") 39 | uri_res = cursor.fetchone() 40 | (url, etag, mod_date, next_date, prio) = uri 41 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 42 | self.assertEqual(url, url_res) 43 | self.assertEqual(etag, etag_res) 44 | self.assertEqual(mod_date, mod_date_res) 45 | self.assertEqual(prio, prio_res) 46 | self.assertEqual(next_date, next_date_res) 47 | 48 | q.close() 49 | 50 | def test_updating_works(self): 51 | 52 | uri = ("http://localhost", "etag", int(time.time()*1000), 53 | int(time.time() * 1000), 1) 54 | 55 | q = SQLiteSingleHostUriQueue(":memory:") 56 | q.add_uri(uri) 57 | 58 | uri = ("http://localhost", "etag", int(time.time()*1000), 59 | int(time.time() * 1000), 2) 60 | 61 | q.update_uri(uri) 62 | 63 | cursor = q._connection.execute("SELECT * FROM queue") 64 | uri_res = cursor.fetchone() 65 | (url, etag, mod_date, next_date, prio) = uri 66 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 67 | self.assertEqual(url, url_res) 68 | self.assertEqual(etag, etag_res) 69 | self.assertEqual(mod_date, mod_date_res) 70 | self.assertEqual(prio, prio_res) 71 | self.assertEqual(next_date, next_date_res) 72 | 73 | def test_adding_lists_works(self): 74 | 75 | uris = [("http://localhost", "etag", int(time.time()*1000), 76 | int(time.time() * 1010), 1), 77 | ] 78 | 79 | q = SQLiteSingleHostUriQueue(":memory:") 80 | q.add_uris(uris) 81 | 82 | cursor = q._connection.execute("SELECT * FROM queue") 83 | uri_res = cursor.fetchone() 84 | (url, etag, mod_date, next_date, prio) = uris[0] 85 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 86 | self.assertEqual(url, url_res) 87 | self.assertEqual(etag, etag_res) 88 | self.assertEqual(mod_date, mod_date_res) 89 | self.assertEqual(prio, prio_res) 90 | self.assertEqual(next_date, next_date_res) 91 | 92 | def test_updating_lists_works(self): 93 | 94 | uris = [("http://localhost", "etag", int(time.time()*1000), 95 | int(time.time() * 1000), 1), 96 | ] 97 | 
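# add the URI once, then update it: update_uris() must overwrite the stored
# row in place (same URL, new priority) instead of appending a duplicate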
98 | q = SQLiteSingleHostUriQueue(":memory:") 99 | q.add_uris(uris) 100 | 101 | uris = [("http://localhost", "etag", int(time.time()*1000), 102 | int(time.time() * 1000), 2), 103 | ] 104 | 105 | q.update_uris(uris) 106 | 107 | cursor = q._connection.execute("SELECT * FROM queue") 108 | uri_res = cursor.fetchone() 109 | (url, etag, mod_date, next_date, prio) = uris[0] 110 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 111 | self.assertEqual(url, url_res) 112 | self.assertEqual(etag, etag_res) 113 | self.assertEqual(mod_date, mod_date_res) 114 | self.assertEqual(prio, prio_res) 115 | self.assertEqual(next_date, next_date_res) 116 | 117 | def test_removing_lists_works(self): 118 | 119 | uris = [("http://localhost", "etag", int(time.time()*1000), 120 | int(time.time() * 1000), 1), 121 | ("http://foreignhost", "ETAG", int(time.time()*1000), 122 | int(time.time() * 1000), 2), 123 | ] 124 | 125 | q = SQLiteSingleHostUriQueue(":memory:") 126 | q.add_uris(uris) 127 | 128 | q.remove_uris(uris) 129 | 130 | cursor = q._connection.execute("SELECT * FROM queue") 131 | self.assertTrue(cursor.fetchone() is None) 132 | 133 | def test_iterating_over_all_uris_works(self): 134 | 135 | uris = [("http://localhost", "etag", int(time.time()*1000), 136 | int(time.time() * 1000), 1), 137 | ("http://foreignhost", "ETAG", int(time.time()*1000), 138 | int(time.time() * 1000), 2), 139 | ] 140 | urls = ["http://localhost", "http://foreignhost"] 141 | 142 | q = SQLiteSingleHostUriQueue(":memory:") 143 | q.add_uris(uris) 144 | 145 | uri = q.get_uri("http://foreignhost") 146 | self.assertEqual(uris[1], uri) 147 | 148 | self.assertRaises(UriNotFound, q.get_uri, "http://gibtsnuesch") 149 | 150 | for url in q.all_uris(): 151 | self.assertTrue(url in urls) 152 | 153 | def test_queue_head_works(self): 154 | 155 | uris = [("http://localhost", "etag", int(time.time()*1000), 156 | int(time.time() * 1000), 1), 157 | ("http://foreignhost", "ETAG", int(time.time()*1000), 158 | int(time.time() * 1001), 2), 159 | ] 160 | 161 | q = SQLiteSingleHostUriQueue(":memory:") 162 | q.add_uris(uris) 163 | 164 | (url1, etag1, mod_date1, next_date1, prio1) = uris[0] 165 | (url2, etag2, mod_date2, next_date2, prio2) = uris[1] 166 | 167 | for uri_res in q.queue_head(n=1, offset=0): 168 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 169 | self.assertEqual(url1, url_res) 170 | self.assertEqual(etag1, etag_res) 171 | self.assertEqual(mod_date1, mod_date_res) 172 | self.assertEqual(prio1, prio_res) 173 | self.assertEqual(next_date1, next_date_res) 174 | 175 | for uri_res in q.queue_head(n=1, offset=1): 176 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 177 | self.assertEqual(url2, url_res) 178 | self.assertEqual(etag2, etag_res) 179 | self.assertEqual(mod_date2, mod_date_res) 180 | self.assertEqual(prio2, prio_res) 181 | self.assertEqual(next_date2, next_date_res) 182 | 183 | uris.append(("http://localhost/1", "eTag", int(time.time()*1000), 184 | int(time.time()*1002), 1)) 185 | (url3, etag3, mod_date3, next_date3, prio3) = uris[2] 186 | q.add_uri(uris[2]) 187 | 188 | q.ignore_uri("http://localhost", 404) 189 | 190 | for uri_res in q.queue_head(n=1, offset=1): 191 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 192 | self.assertEqual(url3, url_res) 193 | self.assertEqual(etag3, etag_res) 194 | self.assertEqual(mod_date3, mod_date_res) 195 | self.assertEqual(prio3, prio_res) 196 | self.assertEqual(next_date3, next_date_res) 197 | 198 | 199 | if __name__ == 
'__main__': 200 | unittest.main() 201 | -------------------------------------------------------------------------------- /test/test_strip_session_ids.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_strip_session_ids.py 14-Apr-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # 19 | import unittest 20 | 21 | from spyder.core.constants import CURI_EXTRACTED_URLS 22 | from spyder.core.settings import Settings 23 | from spyder.processor.stripsessions import StripSessionIds 24 | from spyder.thrift.gen.ttypes import CrawlUri 25 | 26 | 27 | class StripSessionIdsTest(unittest.TestCase): 28 | 29 | def test_that_stripping_session_stuff_works(self): 30 | 31 | s = StripSessionIds(Settings()) 32 | 33 | url = "http://pREis.de/traeger/index.php?sid=8429fb3ae210a2a0e28800b7f48d90f2" 34 | 35 | self.assertEqual("http://pREis.de/traeger/index.php?", 36 | s._remove_session_ids(url)) 37 | 38 | url = "http://preis.de/traeger/index.php?jsessionid=8429fb3ae210a2a0e28800b7f48d90f2" 39 | 40 | self.assertEqual("http://preis.de/traeger/index.php?", 41 | s._remove_session_ids(url)) 42 | 43 | url = "http://preis.de/traeger/index.php?phpsessid=8429fb3ae210a2a0e28800b7f48d90f2" 44 | 45 | self.assertEqual("http://preis.de/traeger/index.php?", 46 | s._remove_session_ids(url)) 47 | 48 | url = "http://preis.de/traeger/index.php?aspsessionid=8429fb3ae210a2a0e28800b7f48d90f2" 49 | 50 | self.assertEqual("http://preis.de/traeger/index.php?", 51 | s._remove_session_ids(url)) 52 | 53 | def test_that_with_uri_works(self): 54 | 55 | s = StripSessionIds(Settings()) 56 | 57 | urls = ["http://preis.de/traeger/index.php?sid=8429fb3ae210a2a0e28800b7f48d90f2", 58 | "http://preis.de/traeger/index.php?jsessionid=8429fb3ae210a2a0e28800b7f48d90f2", 59 | "http://preis.de/traeger/index.php?phpsessid=8429fb3ae210a2a0e28800b7f48d90f2", 60 | "http://preis.de/traeger/index.php?aspsessionid=8429fb3ae210a2a0e28800b7f48d90f2", 61 | ] 62 | 63 | curi = CrawlUri() 64 | curi.optional_vars = { CURI_EXTRACTED_URLS: "\n".join(urls) } 65 | 66 | curi = s(curi) 67 | clean_urls = curi.optional_vars[CURI_EXTRACTED_URLS].split('\n') 68 | 69 | for u in clean_urls: 70 | self.assertEqual("http://preis.de/traeger/index.php?", u) 71 | -------------------------------------------------------------------------------- /test/test_uri_unique_filter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_uri_unique_filter.py 31-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import unittest 20 | 21 | from spyder.core.uri_uniq import UniqueUriFilter 22 | 23 | class UniqueUriFilterTest(unittest.TestCase): 24 | 25 | def test_unknown_uris(self): 26 | 27 | unique_filter = UniqueUriFilter('sha1') 28 | 29 | self.assertFalse(unique_filter.is_known("http://www.google.de", 30 | add_if_unknown=True)) 31 | self.assertFalse(unique_filter.is_known("http://www.yahoo.com", 32 | add_if_unknown=True)) 33 | self.assertTrue(unique_filter.is_known("http://www.google.de")) 34 | self.assertTrue(unique_filter.is_known("http://www.yahoo.com")) 35 | 36 | 37 | if __name__ == '__main__': 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /test/test_worker.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_worker.py 11-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
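# The base class below builds a complete inproc master/worker harness: the
# test case plays the master (PUB for mgmt, PUSH for data) while the worker
# under test owns the matching SUB/PULL ends. With every transport inproc,
# the whole round trip runs inside a single process on one io_loop.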
17 | # 18 | 19 | import logging 20 | from logging import StreamHandler 21 | import sys 22 | 23 | import unittest 24 | 25 | import time 26 | 27 | import zmq 28 | from zmq import Socket 29 | from zmq.eventloop.ioloop import IOLoop 30 | from zmq.eventloop.zmqstream import ZMQStream 31 | 32 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER 33 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT 34 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK 35 | from spyder.core.mgmt import ZmqMgmt 36 | from spyder.core.worker import ZmqWorker, AsyncZmqWorker 37 | from spyder.core.messages import DataMessage, MgmtMessage 38 | from spyder.thrift.gen.ttypes import CrawlUri 39 | 40 | 41 | class ZmqTornadoIntegrationTest(unittest.TestCase): 42 | 43 | def setUp(self): 44 | 45 | # create the io_loop 46 | self._io_loop = IOLoop.instance() 47 | 48 | # and the context 49 | self._ctx = zmq.Context(1) 50 | 51 | # setup the mgmt sockets 52 | self._setup_mgmt_sockets() 53 | 54 | # setup the data sockets 55 | self._setup_data_sockets() 56 | 57 | # setup the management interface 58 | self._mgmt = ZmqMgmt( self._mgmt_sockets['worker_sub'], 59 | self._mgmt_sockets['worker_pub'], io_loop=self._io_loop) 60 | self._mgmt.start() 61 | self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end) 62 | 63 | def tearDown(self): 64 | # stop the mgmt 65 | self._mgmt.stop() 66 | 67 | # close all sockets 68 | for socket in self._mgmt_sockets.itervalues(): 69 | socket.close() 70 | for socket in self._worker_sockets.itervalues(): 71 | socket.close() 72 | 73 | # terminate the context 74 | self._ctx.term() 75 | 76 | def _setup_mgmt_sockets(self): 77 | 78 | self._mgmt_sockets = dict() 79 | 80 | # adress for the communication from master to worker(s) 81 | mgmt_master_worker = 'inproc://master/worker/coordination/' 82 | 83 | # connect the master with the worker 84 | # the master is a ZMQStream because we are sending msgs from the test 85 | sock = self._ctx.socket(zmq.PUB) 86 | sock.bind(mgmt_master_worker) 87 | self._mgmt_sockets['tmp1'] = sock 88 | self._mgmt_sockets['master_pub'] = ZMQStream(sock, self._io_loop) 89 | # the worker stream is created inside the ZmqMgmt class 90 | self._mgmt_sockets['worker_sub'] = self._ctx.socket(zmq.SUB) 91 | self._mgmt_sockets['worker_sub'].setsockopt(zmq.SUBSCRIBE, "") 92 | self._mgmt_sockets['worker_sub'].connect(mgmt_master_worker) 93 | 94 | # adress for the communication from worker(s) to master 95 | mgmt_worker_master = 'inproc://worker/master/coordination/' 96 | 97 | # connect the worker with the master 98 | self._mgmt_sockets['worker_pub'] = self._ctx.socket(zmq.PUB) 99 | self._mgmt_sockets['worker_pub'].bind(mgmt_worker_master) 100 | sock = self._ctx.socket(zmq.SUB) 101 | sock.setsockopt(zmq.SUBSCRIBE, "") 102 | sock.connect(mgmt_worker_master) 103 | self._mgmt_sockets['tmp2'] = sock 104 | self._mgmt_sockets['master_sub'] = ZMQStream(sock, self._io_loop) 105 | 106 | def _setup_data_sockets(self): 107 | 108 | self._worker_sockets = dict() 109 | 110 | # address for master -> worker communication 111 | data_master_worker = 'inproc://master/worker/pipeline/' 112 | 113 | sock = self._ctx.socket(zmq.PUSH) 114 | sock.bind(data_master_worker) 115 | self._worker_sockets['tmp3'] = sock 116 | self._worker_sockets['master_push'] = ZMQStream(sock, self._io_loop) 117 | self._worker_sockets['worker_pull'] = self._ctx.socket(zmq.PULL) 118 | self._worker_sockets['worker_pull'].connect(data_master_worker) 119 | 120 | # address for worker -> master communication 121 | 
data_worker_master = 'inproc://worker/master/pipeline/' 122 | 123 | self._worker_sockets['worker_pub'] = self._ctx.socket(zmq.PUB) 124 | self._worker_sockets['worker_pub'].bind(data_worker_master) 125 | sock = self._ctx.socket(zmq.SUB) 126 | sock.setsockopt(zmq.SUBSCRIBE, "") 127 | sock.connect(data_worker_master) 128 | self._worker_sockets['tmp4'] = sock 129 | self._worker_sockets['master_sub'] = ZMQStream(sock, self._io_loop) 130 | 131 | def on_mgmt_end(self, _msg): 132 | self._io_loop.stop() 133 | 134 | 135 | class ZmqWorkerIntegrationTest(ZmqTornadoIntegrationTest): 136 | 137 | def echo_processing(self, crawl_uri): 138 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 139 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 140 | self._mgmt_sockets['master_pub'].send_multipart(death.serialize()) 141 | return crawl_uri 142 | 143 | def test_that_stopping_worker_via_mgmt_works(self): 144 | 145 | worker = ZmqWorker(self._worker_sockets['worker_pull'], 146 | self._worker_sockets['worker_pub'], 147 | self._mgmt, 148 | self.echo_processing, 149 | StreamHandler(sys.stdout), 150 | logging.DEBUG, 151 | self._io_loop) 152 | 153 | worker.start() 154 | 155 | curi = CrawlUri(url="http://localhost") 156 | msg = DataMessage() 157 | msg.identity = "me" 158 | msg.curi = curi 159 | 160 | def assert_correct_data_answer(raw_msg): 161 | self.assertEqual(msg, DataMessage(raw_msg)) 162 | 163 | self._worker_sockets['master_sub'].on_recv(assert_correct_data_answer) 164 | 165 | def assert_correct_mgmt_answer(raw_msg): 166 | self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, MgmtMessage(raw_msg).data) 167 | 168 | self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_answer) 169 | 170 | self._worker_sockets['master_push'].send_multipart(msg.serialize()) 171 | 172 | self._io_loop.start() 173 | 174 | 175 | if __name__ == '__main__': 176 | unittest.main() 177 | -------------------------------------------------------------------------------- /test/test_workerprocess_extractor.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_workerprocess_extractor.py 19-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
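# Same harness idea as in test_worker.py, except the endpoints come from the
# Settings object: setUp() remaps the ZEROMQ_* addresses to inproc so that
# workerprocess.create_worker_extractor() talks to sockets owned by the test
# instead of real network endpoints.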
17 | # 18 | import sys 19 | import logging 20 | from logging import StreamHandler 21 | 22 | import unittest 23 | 24 | import zmq 25 | from zmq.eventloop.ioloop import IOLoop 26 | from zmq.eventloop.zmqstream import ZMQStream 27 | 28 | from spyder.core.constants import CURI_OPTIONAL_TRUE 29 | from spyder.core.constants import CURI_EXTRACTION_FINISHED 30 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER 31 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT 32 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK 33 | from spyder.core.messages import DataMessage, MgmtMessage 34 | from spyder.core.mgmt import ZmqMgmt 35 | from spyder.core.settings import Settings 36 | from spyder.processor import limiter 37 | from spyder.thrift.gen.ttypes import CrawlUri 38 | from spyder import workerprocess 39 | 40 | 41 | class ZmqTornadoIntegrationTest(unittest.TestCase): 42 | 43 | def setUp(self): 44 | 45 | # create the io_loop 46 | self._io_loop = IOLoop.instance() 47 | 48 | # and the context 49 | self._ctx = zmq.Context(1) 50 | 51 | self._settings = Settings() 52 | self._settings.ZEROMQ_MASTER_PUSH = 'inproc://spyder-zmq-master-push' 53 | self._settings.ZEROMQ_WORKER_PROC_FETCHER_PULL = \ 54 | self._settings.ZEROMQ_MASTER_PUSH 55 | self._settings.ZEROMQ_MASTER_SUB = 'inproc://spyder-zmq-master-sub' 56 | self._settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PUB = \ 57 | self._settings.ZEROMQ_MASTER_SUB 58 | 59 | self._settings.ZEROMQ_MGMT_MASTER = 'inproc://spyder-zmq-mgmt-master' 60 | self._settings.ZEROMQ_MGMT_WORKER = 'inproc://spyder-zmq-mgmt-worker' 61 | 62 | # setup the mgmt sockets 63 | self._setup_mgmt_sockets() 64 | 65 | # setup the data sockets 66 | self._setup_data_servers() 67 | 68 | # setup the management interface 69 | self._mgmt = ZmqMgmt( self._mgmt_sockets['worker_sub'], 70 | self._mgmt_sockets['worker_pub'], io_loop=self._io_loop) 71 | self._mgmt.start() 72 | self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end) 73 | 74 | def tearDown(self): 75 | # stop the mgmt 76 | self._mgmt.stop() 77 | 78 | # close all sockets 79 | for socket in self._mgmt_sockets.itervalues(): 80 | socket.close() 81 | for socket in self._worker_sockets.itervalues(): 82 | socket.close() 83 | 84 | # terminate the context 85 | self._ctx.term() 86 | 87 | def _setup_mgmt_sockets(self): 88 | 89 | self._mgmt_sockets = dict() 90 | 91 | # adress for the communication from master to worker(s) 92 | mgmt_master_worker = self._settings.ZEROMQ_MGMT_MASTER 93 | 94 | # connect the master with the worker 95 | # the master is a ZMQStream because we are sending msgs from the test 96 | sock = self._ctx.socket(zmq.PUB) 97 | sock.bind(mgmt_master_worker) 98 | self._mgmt_sockets['tmp1'] = sock 99 | self._mgmt_sockets['master_pub'] = ZMQStream(sock, self._io_loop) 100 | # the worker stream is created inside the ZmqMgmt class 101 | self._mgmt_sockets['worker_sub'] = self._ctx.socket(zmq.SUB) 102 | self._mgmt_sockets['worker_sub'].setsockopt(zmq.SUBSCRIBE, "") 103 | self._mgmt_sockets['worker_sub'].connect(mgmt_master_worker) 104 | 105 | # adress for the communication from worker(s) to master 106 | mgmt_worker_master = self._settings.ZEROMQ_MGMT_WORKER 107 | 108 | # connect the worker with the master 109 | self._mgmt_sockets['worker_pub'] = self._ctx.socket(zmq.PUB) 110 | self._mgmt_sockets['worker_pub'].bind(mgmt_worker_master) 111 | sock = self._ctx.socket(zmq.SUB) 112 | sock.setsockopt(zmq.SUBSCRIBE, "") 113 | sock.connect(mgmt_worker_master) 114 | self._mgmt_sockets['tmp2'] = sock 115 | 
self._mgmt_sockets['master_sub'] = ZMQStream(sock, self._io_loop) 116 | 117 | def _setup_data_servers(self): 118 | 119 | self._worker_sockets = dict() 120 | 121 | # address for master -> worker communication 122 | data_master_worker = self._settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PULL 123 | 124 | sock = self._ctx.socket(zmq.PUSH) 125 | sock.bind(data_master_worker) 126 | self._worker_sockets['tmp3'] = sock 127 | self._worker_sockets['master_push'] = ZMQStream(sock, self._io_loop) 128 | 129 | # address for worker -> master communication 130 | data_worker_master = self._settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PUB 131 | 132 | sock = self._ctx.socket(zmq.SUB) 133 | sock.setsockopt(zmq.SUBSCRIBE, "") 134 | sock.bind(data_worker_master) 135 | self._worker_sockets['tmp4'] = sock 136 | self._worker_sockets['master_sub'] = ZMQStream(sock, self._io_loop) 137 | 138 | def on_mgmt_end(self, _msg): 139 | self._io_loop.stop() 140 | 141 | 142 | class WorkerExtractorTestCase(ZmqTornadoIntegrationTest): 143 | 144 | def test_that_creating_extractor_works(self): 145 | 146 | self._settings.SPYDER_EXTRACTOR_PIPELINE = ['spyder.processor.limiter.DefaultLimiter',] 147 | 148 | extractor = workerprocess.create_worker_extractor(self._settings, 149 | self._mgmt, self._ctx, StreamHandler(sys.stdout), self._io_loop) 150 | extractor.start() 151 | 152 | curi = CrawlUri(url="http://localhost:80/robots.txt", 153 | effective_url="http://127.0.0.1:%s/robots.txt", 154 | optional_vars=dict(), 155 | ) 156 | msg = DataMessage() 157 | msg.identity = "me" 158 | msg.curi = curi 159 | 160 | def assert_expected_result_and_stop(raw_msg): 161 | msg2 = DataMessage(raw_msg) 162 | self.assertEqual(CURI_OPTIONAL_TRUE, 163 | msg2.curi.optional_vars[CURI_EXTRACTION_FINISHED]) 164 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 165 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 166 | self._mgmt_sockets['master_pub'].send_multipart(death.serialize()) 167 | 168 | self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop) 169 | 170 | def assert_correct_mgmt_message(raw_msg): 171 | self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, MgmtMessage(raw_msg).data) 172 | 173 | self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_message) 174 | 175 | self._worker_sockets['master_push'].send_multipart(msg.serialize()) 176 | 177 | self._io_loop.start() 178 | 179 | extractor._out_stream.close() 180 | extractor._outsocket.close() 181 | extractor._in_stream.close() 182 | extractor._insocket.close() 183 | 184 | 185 | if __name__ == '__main__': 186 | unittest.main() 187 | -------------------------------------------------------------------------------- /test/test_workerprocess_fetcher.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_workerprocess_fetcher.py 19-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
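# A pure wiring test: create_worker_fetcher() must return an AsyncZmqWorker
# whose processing callable is a FetchProcessor. Nothing is actually fetched,
# so the sockets are closed right after the assertions.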
17 | # 18 | import logging 19 | from logging import StreamHandler 20 | import sys 21 | 22 | import unittest 23 | import time 24 | 25 | import zmq 26 | from zmq.eventloop.ioloop import IOLoop 27 | from zmq.eventloop.zmqstream import ZMQStream 28 | 29 | from spyder.core.constants import CURI_OPTIONAL_TRUE 30 | from spyder.core.constants import CURI_EXTRACTION_FINISHED 31 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER 32 | from spyder.core.settings import Settings 33 | from spyder.core.worker import AsyncZmqWorker 34 | from spyder import workerprocess 35 | 36 | from spyder.processor.fetcher import FetchProcessor 37 | 38 | class WorkerFetcherTestCase(unittest.TestCase): 39 | 40 | def test_that_creating_fetcher_works(self): 41 | ctx = zmq.Context() 42 | io_loop = IOLoop.instance() 43 | 44 | def stop_looping(_msg): 45 | io_loop.stop() 46 | 47 | settings = Settings() 48 | 49 | master_push = ctx.socket(zmq.PUSH) 50 | master_push.bind(settings.ZEROMQ_MASTER_PUSH) 51 | 52 | fetcher = workerprocess.create_worker_fetcher(settings, {}, ctx, 53 | StreamHandler(sys.stdout), io_loop) 54 | 55 | self.assertTrue(isinstance(fetcher._processing, FetchProcessor)) 56 | self.assertTrue(isinstance(fetcher, AsyncZmqWorker)) 57 | 58 | fetcher._insocket.close() 59 | fetcher._outsocket.close() 60 | master_push.close() 61 | ctx.term() 62 | 63 | 64 | if __name__ == '__main__': 65 | unittest.main() 66 | -------------------------------------------------------------------------------- /test/test_workerprocess_mgmtintegration.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_workerprocess_mgmtintegration.py 18-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
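# End-to-end mgmt handshake: the test publishes QUIT on the master's PUB
# socket; the ZmqMgmt built by create_worker_management() must answer with
# QUIT_ACK on its own PUB socket and fire the registered callback that stops
# the io_loop.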
17 | # 18 | 19 | import unittest 20 | import time 21 | 22 | import zmq 23 | from zmq.eventloop.ioloop import IOLoop 24 | from zmq.eventloop.zmqstream import ZMQStream 25 | 26 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER 27 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT 28 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK 29 | from spyder.core.messages import MgmtMessage 30 | from spyder.core.settings import Settings 31 | from spyder.processor import limiter 32 | from spyder import workerprocess 33 | 34 | 35 | class WorkerProcessTestCase(unittest.TestCase): 36 | 37 | def test_that_creating_mgmt_works(self): 38 | 39 | ctx = zmq.Context() 40 | io_loop = IOLoop.instance() 41 | 42 | def stop_looping(_msg): 43 | io_loop.stop() 44 | 45 | settings = Settings() 46 | settings.ZEROMQ_MASTER_PUSH = 'inproc://spyder-zmq-master-push' 47 | settings.ZEROMQ_WORKER_PROC_FETCHER_PULL = \ 48 | settings.ZEROMQ_MASTER_PUSH 49 | settings.ZEROMQ_MASTER_SUB = 'inproc://spyder-zmq-master-sub' 50 | settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PUB = \ 51 | settings.ZEROMQ_MASTER_SUB 52 | 53 | settings.ZEROMQ_MGMT_MASTER = 'inproc://spyder-zmq-mgmt-master' 54 | settings.ZEROMQ_MGMT_WORKER = 'inproc://spyder-zmq-mgmt-worker' 55 | 56 | pubsocket = ctx.socket(zmq.PUB) 57 | pubsocket.bind(settings.ZEROMQ_MGMT_MASTER) 58 | pub_stream = ZMQStream(pubsocket, io_loop) 59 | 60 | subsocket = ctx.socket(zmq.SUB) 61 | subsocket.setsockopt(zmq.SUBSCRIBE, "") 62 | subsocket.bind(settings.ZEROMQ_MGMT_WORKER) 63 | sub_stream = ZMQStream(subsocket, io_loop) 64 | 65 | mgmt = workerprocess.create_worker_management(settings, ctx, io_loop) 66 | mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, stop_looping) 67 | mgmt.start() 68 | 69 | def assert_quit_message(msg): 70 | self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg.data) 71 | 72 | sub_stream.on_recv(assert_quit_message) 73 | 74 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 75 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 76 | pub_stream.send_multipart(death.serialize()) 77 | 78 | io_loop.start() 79 | 80 | mgmt._out_stream.close() 81 | mgmt._in_stream.close() 82 | mgmt._publisher.close() 83 | mgmt._subscriber.close() 84 | pub_stream.close() 85 | pubsocket.close() 86 | sub_stream.close() 87 | subsocket.close() 88 | ctx.term() 89 | 90 | 91 | if __name__ == '__main__': 92 | unittest.main() 93 | -------------------------------------------------------------------------------- /test/test_workerprocess_processing.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_workerprocess_processing.py 18-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
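# create_processing_function() composes the extractor and scoper pipelines
# into a single callable; modules that cannot be imported or that lack the
# expected plugin interface make it raise ValueError. The contract, as a
# rough sketch (names as used in the test below):
#
#   processing = workerprocess.create_processing_function(settings, processors)
#   curi = processing(curi)  # every processor sees and returns the CrawlUri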
17 | # 18 | 19 | import unittest 20 | 21 | from spyder.core.constants import CURI_OPTIONAL_TRUE 22 | from spyder.core.constants import CURI_EXTRACTION_FINISHED 23 | from spyder.core.settings import Settings 24 | from spyder.processor import limiter 25 | from spyder.thrift.gen.ttypes import CrawlUri 26 | from spyder import workerprocess 27 | 28 | 29 | class WorkerProcessingUnittest(unittest.TestCase): 30 | 31 | def test_that_creating_processing_function_works(self): 32 | settings = Settings() 33 | processors = settings.SPYDER_EXTRACTOR_PIPELINE 34 | processors.extend(settings.SPYDER_SCOPER_PIPELINE) 35 | processors.append('test_workerprocess') 36 | self.assertRaises(ValueError, workerprocess.create_processing_function, 37 | settings, processors) 38 | 39 | processors.pop() 40 | processors.append('test_workerprocess_unspec') 41 | self.assertRaises(ValueError, workerprocess.create_processing_function, 42 | settings, processors) 43 | 44 | processors.pop() 45 | processing = workerprocess.create_processing_function(settings, 46 | processors) 47 | 48 | curi = CrawlUri(optional_vars=dict()) 49 | curi.effective_url = "http://127.0.0.1/robots.txt" 50 | curi2 = processing(curi) 51 | 52 | self.assertEqual(CURI_OPTIONAL_TRUE, 53 | curi2.optional_vars[CURI_EXTRACTION_FINISHED]) 54 | -------------------------------------------------------------------------------- /test/test_workerprocess_unspec.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_workerprocess_unspec.py 26-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | def a_plugin_with_no_create_processor_method(): 20 | pass 21 | -------------------------------------------------------------------------------- /versions.cfg: -------------------------------------------------------------------------------- 1 | [versions] 2 | zeromq = 2.1.9 3 | tornado = 1.2 4 | Brownie = 0.5.1 5 | collective.recipe.sphinxbuilder = 0.7.0 6 | coverage = 3.5.1 7 | pbp.recipe.noserunner = 0.2.6 8 | pep8 = 0.6.1 9 | pycurl = 7.19.0 10 | pyflakes = 0.5.0 11 | pytz = 2011j 12 | pyzmq = 2.1.9 13 | thrift = 0.7.0 14 | 15 | #Required by: 16 | #pbp.recipe.noserunner 0.2.6 17 | nose = 1.1.2 18 | 19 | #Required by: 20 | #collective.recipe.sphinxbuilder 0.7.0 21 | zc.buildout = 1.5.2 22 | 23 | #Required by: 24 | #collective.recipe.sphinxbuilder 0.7.0 25 | zc.recipe.egg = 1.3.2 26 | --------------------------------------------------------------------------------