├── .gitignore ├── .pylintrc ├── LICENSE ├── MANIFEST.in ├── README.rst ├── bootstrap.py ├── buildout.cfg ├── crawluri.thrift ├── docs-source ├── api │ ├── entrypoints.rst │ ├── extractor.rst │ ├── fetcher.rst │ ├── frontier.rst │ ├── masterprocess.rst │ ├── queues.rst │ ├── scoper.rst │ ├── sink.rst │ ├── spyderapi.rst │ └── workerprocess.rst ├── conf.py ├── crawler-design.rst ├── getting-started.rst ├── globals.rst ├── index.rst ├── libraries.rst ├── release-notes.rst └── roadmap.rst ├── local.cfg.template ├── setup.py ├── src └── spyder │ ├── __init__.py │ ├── core │ ├── __init__.py │ ├── constants.py │ ├── dnscache.py │ ├── frontier.py │ ├── log.py │ ├── master.py │ ├── messages.py │ ├── mgmt.py │ ├── prioritizer.py │ ├── queueassignment.py │ ├── queueselector.py │ ├── settings.py │ ├── sink.py │ ├── sqlitequeues.py │ ├── uri_uniq.py │ └── worker.py │ ├── defaultsettings.py │ ├── encoding.py │ ├── import_util.py │ ├── logsink.py │ ├── masterprocess.py │ ├── processor │ ├── __init__.py │ ├── cleanupquery.py │ ├── fetcher.py │ ├── htmllinkextractor.py │ ├── httpextractor.py │ ├── limiter.py │ ├── scoper.py │ └── stripsessions.py │ ├── spyder_template │ ├── log │ │ └── .keep │ ├── logging.conf │ ├── master.py │ ├── settings.py │ ├── sink.py │ └── spyder-ctrl.py │ ├── thrift │ ├── __init__.py │ └── gen │ │ ├── __init__.py │ │ ├── constants.py │ │ └── ttypes.py │ ├── time.py │ └── workerprocess.py ├── test ├── static │ └── robots.txt ├── test_async_worker.py ├── test_cleanup_qs.py ├── test_default_html_link_extractor.py ├── test_dns_cache.py ├── test_fetch_processor.py ├── test_fetch_processor_last_modified_works.py ├── test_fetch_processor_with_etag.py ├── test_frontier.py ├── test_http_extractor.py ├── test_limiter.py ├── test_masterprocess.py ├── test_messages.py ├── test_mgmt.py ├── test_multiple_frontier.py ├── test_queue_assignment.py ├── test_queue_selector.py ├── test_regex_scoper.py ├── test_settings.py ├── test_settings_settings.py ├── test_sqlite_multiple_queues.py ├── test_sqlite_queues.py ├── test_strip_session_ids.py ├── test_uri_unique_filter.py ├── test_worker.py ├── test_workerprocess_extractor.py ├── test_workerprocess_fetcher.py ├── test_workerprocess_mgmtintegration.py ├── test_workerprocess_processing.py └── test_workerprocess_unspec.py └── versions.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.pyo 4 | .installed.cfg 5 | bin 6 | develop-eggs 7 | dist 8 | downloads 9 | eggs 10 | parts 11 | src/*.egg-info 12 | docs 13 | pylint* 14 | .coverage 15 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Profiled execution. 11 | profile=no 12 | 13 | # Add to the black list. It should be a base name, not a 14 | # path. You may set this option multiple times. 15 | ignore=CVS 16 | ignore=gen 17 | 18 | # Pickle collected data for later comparisons. 19 | persistent=yes 20 | 21 | # List of plugins (as comma separated values of python modules names) to load, 22 | # usually to register additional checkers. 23 | load-plugins= 24 | 25 | 26 | [MESSAGES CONTROL] 27 | 28 | # Enable the message, report, category or checker with the given id(s). 
You can 29 | # either give multiple identifier separated by comma (,) or put this option 30 | # multiple time. 31 | #enable= 32 | 33 | # Disable the message, report, category or checker with the given id(s). You 34 | # can either give multiple identifier separated by comma (,) or put this option 35 | # multiple time (only on the command line, not in the configuration file where 36 | # it should appear only once). 37 | disable=R0903 38 | 39 | 40 | [REPORTS] 41 | 42 | # Set the output format. Available formats are text, parseable, colorized, msvs 43 | # (visual studio) and html 44 | output-format=html 45 | 46 | # Include message's id in output 47 | include-ids=yes 48 | 49 | # Put messages in a separate file for each module / package specified on the 50 | # command line instead of printing them on stdout. Reports (if any) will be 51 | # written in a file name "pylint_global.[txt|html]". 52 | files-output=no 53 | 54 | # Tells whether to display a full report or only the messages 55 | reports=yes 56 | 57 | # Python expression which should return a note less than 10 (10 is the highest 58 | # note). You have access to the variables errors warning, statement which 59 | # respectively contain the number of errors / warnings messages and the total 60 | # number of statements analyzed. This is used by the global evaluation report 61 | # (RP0004). 62 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 63 | 64 | # Add a comment according to your evaluation note. This is used by the global 65 | # evaluation report (RP0004). 66 | comment=no 67 | 68 | 69 | [BASIC] 70 | 71 | # Required attributes for module, separated by a comma 72 | required-attributes= 73 | 74 | # List of builtins function names that should not be used, separated by a comma 75 | bad-functions=map,filter,apply,input 76 | 77 | # Regular expression which should only match correct module names 78 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 79 | 80 | # Regular expression which should only match correct module level names 81 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 82 | 83 | # Regular expression which should only match correct class names 84 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 85 | 86 | # Regular expression which should only match correct function names 87 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 88 | 89 | # Regular expression which should only match correct method names 90 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 91 | 92 | # Regular expression which should only match correct instance attribute names 93 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 94 | 95 | # Regular expression which should only match correct argument names 96 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 97 | 98 | # Regular expression which should only match correct variable names 99 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 100 | 101 | # Regular expression which should only match correct list comprehension / 102 | # generator expression variable names 103 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 104 | 105 | # Good variable names which should always be accepted, separated by a comma 106 | good-names=i,j,k,ex,Run,_ 107 | 108 | # Bad variable names which should always be refused, separated by a comma 109 | bad-names=foo,bar,baz,toto,tutu,tata 110 | 111 | # Regular expression which should only match functions or classes name which do 112 | # not require a docstring 113 | no-docstring-rgx=__.*__ 114 | 115 | 116 | [FORMAT] 117 | 118 | # Maximum number of characters on a single line. 
119 | max-line-length=80 120 | 121 | # Maximum number of lines in a module 122 | max-module-lines=1000 123 | 124 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 125 | # tab). 126 | indent-string=' ' 127 | 128 | 129 | [MISCELLANEOUS] 130 | 131 | # List of note tags to take in consideration, separated by a comma. 132 | notes=FIXME,XXX,TODO 133 | 134 | 135 | [SIMILARITIES] 136 | 137 | # Minimum lines number of a similarity. 138 | min-similarity-lines=4 139 | 140 | # Ignore comments when computing similarities. 141 | ignore-comments=yes 142 | 143 | # Ignore docstrings when computing similarities. 144 | ignore-docstrings=yes 145 | 146 | 147 | [TYPECHECK] 148 | 149 | # Tells whether missing members accessed in mixin class should be ignored. A 150 | # mixin class is detected if its name ends with "mixin" (case insensitive). 151 | ignore-mixin-members=yes 152 | 153 | # List of classes names for which member attributes should not be checked 154 | # (useful for classes with attributes dynamically set). 155 | ignored-classes=SQLObject 156 | 157 | # When zope mode is activated, add a predefined set of Zope acquired attributes 158 | # to generated-members. 159 | zope=no 160 | 161 | # List of members which are set dynamically and missed by pylint inference 162 | # system, and so shouldn't trigger E0201 when accessed. 163 | generated-members=REQUEST,acl_users,aq_parent 164 | 165 | 166 | [VARIABLES] 167 | 168 | # Tells whether we should check for unused import in __init__ files. 169 | init-import=yes 170 | 171 | # A regular expression matching the beginning of the name of dummy variables 172 | # (i.e. not used). 173 | dummy-variables-rgx=_|dummy 174 | 175 | # List of additional names supposed to be defined in builtins. Remember that 176 | # you should avoid to define new builtins when possible. 177 | additional-builtins= 178 | 179 | 180 | [CLASSES] 181 | 182 | # List of interface methods to ignore, separated by a comma. This is used for 183 | # instance to not check methods defines in Zope's Interface base class. 184 | ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by 185 | 186 | # List of method names used to declare (i.e. assign) instance attributes. 187 | defining-attr-methods=__init__,__new__,setUp 188 | 189 | 190 | [DESIGN] 191 | 192 | # Maximum number of arguments for function / method 193 | max-args=5 194 | 195 | # Argument names that match this expression will be ignored. Default to name 196 | # with leading underscore 197 | ignored-argument-names=_.* 198 | 199 | # Maximum number of locals for function / method body 200 | max-locals=15 201 | 202 | # Maximum number of return / yield for function / method body 203 | max-returns=6 204 | 205 | # Maximum number of branch for function / method body 206 | max-branchs=12 207 | 208 | # Maximum number of statements in function / method body 209 | max-statements=50 210 | 211 | # Maximum number of parents for a class (see R0901). 212 | max-parents=7 213 | 214 | # Maximum number of attributes for a class (see R0902). 215 | max-attributes=7 216 | 217 | # Minimum number of public methods for a class (see R0903). 218 | min-public-methods=2 219 | 220 | # Maximum number of public methods for a class (see R0904). 
221 | max-public-methods=20
222 | 
223 | 
224 | [IMPORTS]
225 | 
226 | # Deprecated modules which should not be used, separated by a comma
227 | deprecated-modules=regsub,string,TERMIOS,Bastion,rexec
228 | 
229 | # Create a graph of every (i.e. internal and external) dependencies in the
230 | # given file (report RP0402 must not be disabled)
231 | import-graph=pylint_spyder.dot
232 | 
233 | # Create a graph of external dependencies in the given file (report RP0402 must
234 | # not be disabled)
235 | ext-import-graph=pylint_spyder_external.dot
236 | 
237 | # Create a graph of internal dependencies in the given file (report RP0402 must
238 | # not be disabled)
239 | int-import-graph=pylint_spyder_internal.dot
240 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include LICENSE
3 | include src/spyder/spyder_template/logging.conf
4 | include src/spyder/spyder_template/log/.keep
5 | recursive-include docs-source *.rst
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Spyder
2 | ======
3 | 
4 | `ALONG CAME A SPIDER`
5 | 
6 | 
7 | *Spyder* is a scalable web-spider written in Python using the non-blocking
8 | *tornado* library and *ZeroMQ* as messaging layer. The messages are serialized
9 | using *Thrift*.
10 | 
11 | The architecture is very basic: a **Master** process contains the crawl
12 | **Frontier** that organises the *urls* that need to be crawled; several
13 | **Worker** processes actually download the content and extract new *urls* that
14 | should be crawled in the future. For storing the content you may attach a
15 | **Sink** to the **Master** and be informed about the interesting events for an
16 | *url*.
17 | 
18 | 
19 | Getting Started
20 | ===============
21 | 
22 | *Spyder* is just a library for creating web crawlers. In order to really crawl
23 | content, you first have to create a *Spyder* skeleton:
24 | 
25 | .. code-block:: bash
26 | 
27 |     $ mkdir my-crawler && cd my-crawler
28 |     $ spyder start
29 |     $ ls
30 |     log logging.conf master.py settings.py sink.py spyder-ctrl.py
31 | 
32 | This will copy the skeleton into `my-crawler`. The main file is `settings.py`.
33 | In it, you can configure the logging level for **Masters** and **Workers** and
34 | define the **crawl scope**. In `master.py` you should manipulate the starting
35 | URLs and add your specific `sink.py` into the **Frontier** (a sketch of a sink
36 | follows at the end of this section). `spyder-ctrl.py` is just a small control
37 | script that helps you start the **Log Sink**, **Master** and **Worker**.
38 | 
39 | In the skeleton everything is set up as if you wanted to crawl sailing-related
40 | pages from **DMOZ**. That should give you a starting point for your own
41 | crawler.
42 | 
43 | So, once you have written your sink and configured everything correctly, it's
44 | time to start crawling. First, on one of your nodes you start the logsink:
45 | 
46 | .. code-block:: bash
47 | 
48 |     $ spyder-ctrl.py logsink &
49 | 
50 | Then, on one node (e.g. the same one as the logsink), you start the **Master**:
51 | 
52 |     $ spyder-ctrl.py master &
53 | 
54 | Finally you can start as many **Workers** as you want:
55 | 
56 |     $ spyder-ctrl.py worker &
57 |     $ spyder-ctrl.py worker &
58 |     $ spyder-ctrl.py worker &
59 | 
60 | Here we start three workers since this is a powerful node with a quad core CPU.
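
For illustration, here is roughly what a minimal `sink.py` could look like. This
is only a sketch: the class layout and the hook method name below are
assumptions made for this example, so check ``spyder.core.sink`` and the
generated skeleton for the actual interface:

.. code-block:: python

    # sink.py -- illustrative sketch only; the hook name ``process`` is
    # hypothetical, the real callbacks live in spyder.core.sink
    class StdoutSink(object):
        """A sink that simply prints every finished CrawlUri."""

        def process(self, curi):
            # ``curi`` is the Thrift ``CrawlUri`` struct from crawluri.thrift
            print curi.url, curi.status_code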
61 | 
62 | 
63 | Scaling the Crawl
64 | =================
65 | 
66 | With the default settings it is not possible to start workers on different
67 | nodes. Most of the time one node is powerful enough to crawl quite an amount of
68 | data. But there are times when you simply want to crawl using *many* nodes. This
69 | can be done by configuring the **ZeroMQ** transports to something like
70 | 
71 | 
72 |     ZEROMQ_MASTER_PUSH = "tcp://NodeA:5005"
73 |     ZEROMQ_MASTER_SUB = "tcp://NodeA:5007"
74 | 
75 |     ZEROMQ_MGMT_MASTER = "tcp://NodeA:5008"
76 |     ZEROMQ_MGMT_WORKER = "tcp://NodeA:5009"
77 | 
78 |     ZEROMQ_LOGGING = "tcp://NodeA:5010"
79 | 
80 | Basically we have set up a 2-node crawl cluster. **NodeA** acts as logging sink
81 | and controls the crawl via the **Master**. **NodeB** is a pure **Worker** node.
82 | Only the **Master** actually *binds* **ZeroMQ** sockets; the **Workers** always
83 | *connect* to them, so the **Master** does not have to know where the
84 | **Workers** are really running.
85 | 
86 | 
87 | From here
88 | =========
89 | 
90 | There is plenty of room for improvement and development ahead. Everything will
91 | be handled by Github tickets from now on and, if there is interest, we may set
92 | up a Google Group.
93 | 
--------------------------------------------------------------------------------
/buildout.cfg:
--------------------------------------------------------------------------------
1 | [buildout]
2 | parts =
3 |     ${local:parts}
4 |     ${codeq:parts}
5 |     test
6 |     python
7 |     sphinxbuilder
8 |     coverage
9 | develop = .
10 | eggs = spyder
11 | versions = versions
12 | extends =
13 |     https://github.com/retresco/buildout-recipes/raw/master/zmq.cfg
14 |     https://github.com/retresco/buildout-recipes/raw/master/testing.cfg
15 |     https://github.com/retresco/buildout-recipes/raw/master/codeq.cfg
16 |     local.cfg
17 |     versions.cfg
18 | extensions = buildout.dumppickedversions
19 | 
20 | [python]
21 | recipe = zc.recipe.egg
22 | interpreter = python
23 | eggs = ${buildout:eggs}
24 | 
25 | [test]
26 | <= test-template
27 | eggs = spyder [test]
28 | defaults =
29 |     --verbosity=3
30 |     --with-doctest
31 |     --doctest-extension=txt
32 |     --where=${buildout:directory}/test
33 |     --with-xunit
34 |     --with-coverage
35 |     --cover-package=spyder
36 | 
37 | [sphinxbuilder]
38 | recipe = collective.recipe.sphinxbuilder
39 | source = ${buildout:directory}/docs-source
40 | build = ${buildout:directory}/docs
41 | interpreter = ${buildout:directory}/bin/python
42 | 
43 | [coverage]
44 | recipe = zc.recipe.egg
45 | eggs = coverage
46 | 
--------------------------------------------------------------------------------
/crawluri.thrift:
--------------------------------------------------------------------------------
1 | # Description of the CrawlUri thrift structure
2 | 
3 | namespace py spyder.thrift.gen
4 | 
5 | 
6 | /**
7 |  * Some typedefs in order to make the code more readable.
8 |  */
9 | typedef i64 timestamp
10 | 
11 | typedef map<string,string> header
12 | 
13 | typedef map<string,string> key_value
14 | 
15 | /**
16 |  * The main struct for CrawlUris.
17 |  *
18 |  * This contains some metadata and, if possible, the saved web page.
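 * A CrawlUri is serialized with Thrift and travels between the master and
 * the workers over the ZeroMQ sockets.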
19 | */ 20 | struct CrawlUri { 21 | // readable version of the url to crawl 22 | 1: string url, 23 | 24 | // the effective url used for downloading the content (i.e.: IP instead of hostname) 25 | 2: string effective_url, 26 | 27 | // the host identifier used for queue selection 28 | 3: i16 current_priority, 29 | 30 | // when processing has been started 31 | 4: timestamp begin_processing, 32 | 33 | // when processing is finished 34 | 5: timestamp end_processing, 35 | 36 | // the http request headers 37 | 6: header req_header, 38 | 39 | // the http response headers 40 | 7: header rep_header 41 | 42 | // the saved content body 43 | 8: string content_body, 44 | 45 | // the servers status code 46 | 9: i16 status_code, 47 | 48 | // request time 49 | 10: double req_time, 50 | 51 | // queue time 52 | 11: double queue_time, 53 | 54 | // additional values from other processors 55 | 12: key_value optional_vars 56 | } 57 | -------------------------------------------------------------------------------- /docs-source/api/entrypoints.rst: -------------------------------------------------------------------------------- 1 | .. vim: set fileencoding=UTF-8 : 2 | .. vim: set tw=80 : 3 | .. include:: ../globals.rst 4 | 5 | Entrypoints 6 | =========== 7 | 8 | .. automodule:: spyder 9 | :members: 10 | -------------------------------------------------------------------------------- /docs-source/api/extractor.rst: -------------------------------------------------------------------------------- 1 | .. vim: set fileencoding=UTF-8 : 2 | .. vim: set tw=80 : 3 | .. include:: ../globals.rst 4 | 5 | Link Extractors 6 | =============== 7 | 8 | .. automodule:: spyder.processor.htmllinkextractor 9 | :members: 10 | -------------------------------------------------------------------------------- /docs-source/api/fetcher.rst: -------------------------------------------------------------------------------- 1 | .. vim: set fileencoding=UTF-8 : 2 | .. vim: set tw=80 : 3 | .. include:: ../globals.rst 4 | 5 | Content Fetcher 6 | =============== 7 | 8 | .. automodule:: spyder.processor.fetcher 9 | :members: 10 | -------------------------------------------------------------------------------- /docs-source/api/frontier.rst: -------------------------------------------------------------------------------- 1 | .. vim: set fileencoding=UTF-8 : 2 | .. vim: set tw=80 : 3 | .. include:: ../globals.rst 4 | 5 | Frontier 6 | ======== 7 | 8 | .. automodule:: spyder.core.frontier 9 | :members: 10 | -------------------------------------------------------------------------------- /docs-source/api/masterprocess.rst: -------------------------------------------------------------------------------- 1 | .. vim: set fileencoding=UTF-8 : 2 | .. vim: set tw=80 : 3 | .. include:: ../globals.rst 4 | 5 | Masterprocess 6 | ============= 7 | 8 | .. automodule:: spyder.masterprocess 9 | :members: 10 | 11 | ZeroMQ Master 12 | ============= 13 | 14 | .. automodule:: spyder.core.master 15 | :members: 16 | -------------------------------------------------------------------------------- /docs-source/api/queues.rst: -------------------------------------------------------------------------------- 1 | .. vim: set fileencoding=UTF-8 : 2 | .. vim: set tw=80 : 3 | .. include:: ../globals.rst 4 | 5 | Queue Management 6 | ================ 7 | 8 | .. 
9 |    :members:
10 | 
--------------------------------------------------------------------------------
/docs-source/api/scoper.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: ../globals.rst
4 | 
5 | Crawl Scoper
6 | ============
7 | 
8 | .. automodule:: spyder.processor.scoper
9 |    :members:
10 | 
--------------------------------------------------------------------------------
/docs-source/api/sink.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: ../globals.rst
4 | 
5 | Sink
6 | ====
7 | 
8 | .. automodule:: spyder.core.sink
9 |    :members:
10 | 
--------------------------------------------------------------------------------
/docs-source/api/spyderapi.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: ../globals.rst
4 | 
5 | .. _secapi:
6 | 
7 | Spyder API
8 | ==========
9 | 
10 | This is the main documentation for the |spyder| API. This will hopefully provide
11 | you with enough information to get started coding new features or to help
12 | with bugfixing.
13 | 
14 | .. toctree::
15 |    :maxdepth: 2
16 | 
17 |    entrypoints
18 |    masterprocess
19 |    frontier
20 |    queues
21 |    workerprocess
22 |    fetcher
23 |    extractor
24 |    scoper
25 |    sink
26 | 
--------------------------------------------------------------------------------
/docs-source/api/workerprocess.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: ../globals.rst
4 | 
5 | Workerprocess
6 | =============
7 | 
8 | .. automodule:: spyder.workerprocess
9 |    :members:
10 | 
11 | ZeroMQ Worker
12 | =============
13 | 
14 | .. automodule:: spyder.core.worker
15 |    :members:
16 | 
--------------------------------------------------------------------------------
/docs-source/crawler-design.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: globals.rst
4 | .. _seccrawlerdesign:
5 | 
6 | Crawler Design
7 | ==============
8 | 
9 | The basic crawler design is simple and straightforward. You have a *Master*
10 | that collects the |urls| that should be crawled and a number of *Worker* threads
11 | (or processes) that download the content and extract new links from it. In
12 | practice though there are a number of pitfalls you have to keep an eye on. Just
13 | to give one example: you really don't want to excessively crawl **one** host as
14 | you might be doing a *Denial of Service* attack given enough workers. And even
15 | if the host survives, the site owner might not like you from now on.
16 | 
17 | Some Science
18 | ------------
19 | 
20 | Ok, really only a little bit. Basically there are two papers describing
21 | effective crawler designs. The *Mercator* paper (`Mercator: A Scalable,
22 | Extensible Web Crawler (1999)
23 | `_)
24 | describes the architecture of the *Mercator* crawler. The crawler is split into
25 | several parts:
26 | 
27 | * *Frontier* for keeping track of |urls|
28 | * *Scheduler* for scheduling the |urls| to be crawled
29 | * *Downloader* for really downloading the content
30 | * *Link Extractors* for extracting new links from different kinds of content
31 | * *Unique Filter* for filtering known |urls| from the extracted ones
32 | * *Host Splitter* for working with multiple *Frontiers*
33 | 
34 | The second important paper on crawler design is the *Ubi Crawler* (`UbiCrawler:
35 | a scalable fully distributed Web crawler (2003)
36 | `_). In
37 | this paper the authors use a *Consistent Hashing* algorithm for splitting the
38 | hosts among several *Frontiers*.
39 | 
40 | The |spyder| is designed on the basis of these two papers.
41 | 
42 | References
43 | ==========
44 | 
45 | The |spyder| is not only inspired by these two papers but also by `Heritrix
46 | `_, the *Internet Archive's* open source crawler.
47 | *Heritrix* is designed just like *Mercator* except it lacks something like a
48 | *Host Splitter* that allows one to crawl using more than one *Frontier*.
49 | Additionally *Heritrix* does not provide any kind of *monitoring* or
50 | *revisiting* strategy, although this might be possible in Version *H3*.
51 | 
--------------------------------------------------------------------------------
/docs-source/getting-started.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: globals.rst
4 | 
5 | .. _secgettingstarted:
6 | 
7 | Getting Started
8 | ===============
9 | 
10 | *Spyder* is just a library for creating web crawlers. In order to really crawl
11 | content, you first have to create a *Spyder* skeleton:
12 | 
13 | .. code-block:: bash
14 | 
15 |     $ mkdir my-crawler && cd my-crawler
16 |     $ spyder start
17 |     $ ls
18 |     log logging.conf master.py settings.py sink.py spyder-ctrl.py
19 | 
20 | This will copy the skeleton into `my-crawler`. The main file is `settings.py`.
21 | In it, you can configure the logging level for **Masters** and **Workers** and
22 | define the **crawl scope**. In `master.py` you should manipulate the starting
23 | URLs and add your specific `sink.py` into the **Frontier**. `spyder-ctrl.py` is
24 | just a small control script that helps you start the **Log Sink**, **Master** and
25 | **Worker**.
26 | 
27 | In the skeleton everything is set up as if you wanted to crawl sailing-related
28 | pages from **DMOZ**. That should give you a starting point for your own
29 | crawler.
30 | 
31 | So, once you have written your sink and configured everything correctly, it's
32 | time to start crawling. First, on one of your nodes you start the logsink:
33 | 
34 | .. code-block:: bash
35 | 
36 |     $ spyder-ctrl.py logsink &
37 | 
38 | Then, on one node (e.g. the same one as the logsink), you start the **Master**:
39 | 
40 | .. code-block:: bash
41 | 
42 |     $ spyder-ctrl.py master &
43 | 
44 | Finally you can start as many **Workers** as you want:
45 | 
46 | .. code-block:: bash
47 | 
48 |     $ spyder-ctrl.py worker &
49 |     $ spyder-ctrl.py worker &
50 |     $ spyder-ctrl.py worker &
51 | 
52 | Here we start three workers since this is a powerful node with a quad core CPU.
53 | 
54 | 
55 | Scaling the Crawl
56 | -----------------
57 | 
58 | With the default settings it is not possible to start workers on different
59 | nodes. Most of the time one node is powerful enough to crawl quite an amount of
60 | data. But there are times when you simply want to crawl using *many* nodes. This
61 | can be done by configuring the **ZeroMQ** transports to something like
62 | 
63 | 
64 | .. code-block:: python
65 | 
66 |     ZEROMQ_MASTER_PUSH = "tcp://NodeA:5005"
67 |     ZEROMQ_MASTER_SUB = "tcp://NodeA:5007"
68 | 
69 |     ZEROMQ_MGMT_MASTER = "tcp://NodeA:5008"
70 |     ZEROMQ_MGMT_WORKER = "tcp://NodeA:5009"
71 | 
72 |     ZEROMQ_LOGGING = "tcp://NodeA:5010"
73 | 
74 | Basically we have set up a 2-node crawl cluster. **NodeA** acts as logging sink
75 | and controls the crawl via the **Master**. **NodeB** is a pure **Worker** node.
76 | Only the **Master** actually *binds* **ZeroMQ** sockets; the **Workers** always
77 | *connect* to them, so the **Master** does not have to know where the
78 | **Workers** are really running.
79 | 
80 | 
81 | From here
82 | ---------
83 | 
84 | There is plenty of room for improvement and development ahead. Everything will
85 | be handled by Github tickets from now on and, if there is interest, we may set
86 | up a Google Group.
87 | 
--------------------------------------------------------------------------------
/docs-source/globals.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. |zmq| replace:: *ZeroMQ*
4 | .. |spyder| replace:: **Spyder**
5 | .. |pushpull| replace:: *PUSH/PULL*
6 | .. |pubsub| replace:: *PUB/SUB*
7 | .. |url| replace:: *URL*
8 | .. |urls| replace:: *URLs*
9 | .. |tornado| replace:: *Tornado*
--------------------------------------------------------------------------------
/docs-source/index.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: globals.rst
4 | 
5 | Welcome to |spyder|
6 | ===================
7 | 
8 | |spyder| is a scalable web-spider written in Python using the non-blocking
9 | |tornado| library and |zmq| as messaging layer. The messages are serialized
10 | using *Thrift*.
11 | 
12 | The architecture is very basic: a **Master** process contains the crawl
13 | **Frontier** that organises the |urls| that need to be crawled; several
14 | **Worker** processes actually download the content and extract new |urls| that
15 | should be crawled in the future. For storing the content you may attach a
16 | **Sink** to the **Master** and be informed about the interesting events for an
17 | |url|.
18 | 
19 | Table of Contents
20 | =================
21 | 
22 | .. toctree::
23 |    :maxdepth: 2
24 | 
25 |    release-notes
26 |    getting-started
27 |    crawler-design
28 |    libraries
29 |    api/spyderapi
30 |    roadmap
31 | 
32 | Indices and tables
33 | ==================
34 | 
35 | * :ref:`genindex`
36 | * :ref:`modindex`
37 | * :ref:`search`
38 | 
39 | 
--------------------------------------------------------------------------------
/docs-source/libraries.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: globals.rst
4 | 
5 | Libraries used in |spyder|
6 | ==========================
7 | 
8 | .. _seczmq:
9 | 
10 | ZeroMQ
11 | ------
12 | 
13 | With the emergence of multicore systems, Python's `Global Interpreter
14 | Lock `_ has become a major issue for scaling
15 | across cores. Libraries like `multiprocess `_ try to
16 | circumvent the `GIL` by forking child processes and establishing a messaging
17 | layer between them. This enables Python programmers to scale with the number of
18 | available cores but scaling across node boundaries is not possible using plain
19 | `multiprocess`.
20 | 
21 | At this point `ZeroMQ `_ comes to the rescue. As the name
22 | suggests, |zmq| is a message queue. But, unlike other more famous queues like
23 | `AMQP` or more lightweight ones like `STOMP` or `XMPP`, |zmq| does not need a
24 | global broker (that might act as *single point of failure*). It is instead a
25 | little bit of code around the plain *socket* interface that adds simple
26 | messaging patterns to them (it's like *sockets on steroids*).
27 | 
28 | The beauty of |zmq| lies in its simplicity. The programmer basically defines
29 | a *socket* to which one side **binds** and the other **connects** and a
30 | messaging pattern with which both sides communicate with each other. Once this
31 | is established, scaling across cores/nodes/data centers is as simple as pie.
32 | Four types of *sockets* are supported by |zmq|:
33 | 
34 | 1. `inproc` sockets can be used for **intra-process** communication (between
35 |    threads, e.g.)
36 | 
37 | 2. `ipc` sockets can be used for **inter-process** communication between
38 |    different processes *on the same node*.
39 | 
40 | 3. `tcp` sockets can be used for **inter-process** communication between
41 |    different processes *on different nodes*.
42 | 
43 | 4. `pgm` sockets can be used for **inter-process** communication between one and
44 |    many other processes *on many other nodes*.
45 | 
46 | So by simply changing the socket type from `ipc` to `tcp` the application can
47 | scale across node boundaries transparently for the programmer, i.e. by **not
48 | changing a single line of code**. Awesome!
49 | 
50 | This leaves us with the different messaging patterns. |zmq| supports all well
51 | known (at least to me) messaging patterns. The first one that comes to mind is
52 | of course the `PUB/SUB` pattern that allows one publisher to send messages to
53 | many subscribers. The `PUSH/PULL` pattern allows one master to send messages to
54 | only one of the available clients (the common producer/consumer pattern). With
55 | `REQ/REP` a simple request and response pattern is possible. Most of the
56 | patterns have a `non-blocking` equivalent.
57 | 
58 | 
59 | Messaging Patterns used in |spyder|
60 | +++++++++++++++++++++++++++++++++++
61 | 
62 | |zmq| is used as messaging layer to distribute the workload to an arbitrary
63 | number of worker processes which in turn send the result back to the master.
64 | In the context of |spyder| the master process controls the |urls| that should be
65 | crawled and sends them to the worker processes when they are due. One of the
66 | worker processes then downloads the content and possibly extracts new links from
67 | it. When finished it sends the result back to the master.
68 | 
69 | We do not use the `REQ/REP` pattern as it does not scale as easily as we need
70 | since we have to keep track of whom we sent the |url| to and we would have to do
71 | the load balancing ourselves.
72 | 
73 | Instead with the |pushpull| pattern we get the load balancing as a nice little
74 | gift. It comes with a *fair distribution policy* that simply distributes the
75 | messages to all workers in a *round-robin* way. In order to send the results
76 | back to the master we will use the |pubsub| pattern where the *publisher* is the
77 | worker process and the *subscriber* is the master process.
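
To make this concrete, here is a minimal standalone sketch of the pattern in
plain *pyzmq*. It is not |spyder| code: the ports are borrowed from the example
in :ref:`secgettingstarted`, the identity string is made up, and in a real
deployment the two halves would of course run in separate processes:

.. code-block:: python

    import zmq

    ctx = zmq.Context()

    # master: bind a PUSH socket for work and a SUB socket for results,
    # filtered on the master's own identity
    master_push = ctx.socket(zmq.PUSH)
    master_push.bind("tcp://*:5005")
    master_sub = ctx.socket(zmq.SUB)
    master_sub.bind("tcp://*:5007")
    master_sub.setsockopt(zmq.SUBSCRIBE, "master-1")

    # worker: connect a PULL socket for work and a PUB socket for results
    worker_pull = ctx.socket(zmq.PULL)
    worker_pull.connect("tcp://localhost:5005")
    worker_pub = ctx.socket(zmq.PUB)
    worker_pub.connect("tcp://localhost:5007")

    # the master pushes (identity, work) to exactly one worker ...
    master_push.send_multipart(["master-1", "http://www.dmoz.org/"])
    identity, url = worker_pull.recv_multipart()

    # ... and the worker publishes the result under the master's identity,
    # so only the master that sent the work receives the answer
    worker_pub.send_multipart([identity, "result for " + url])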
78 | 
79 | 
80 | 
81 | Users familiar with |zmq| might already have noted that this messaging setup is
82 | shamelessly *adapted* from `Mongrel2 `_. In the case of
83 | a *Web Server* as well as for a crawler this is a perfect fit as it helps you to
84 | scale **very** easily.
85 | 
86 | .. note:: There is another way to do this type of message pattern using
87 |    *XREQ/XREP*. Transition to this pattern is planned for the near future.
88 | 
89 | For a crawler there are two parts that we possibly want to scale: the worker
90 | *and* the master. While scaling the worker across several processes is somewhat
91 | obvious, scaling the master first seems to be of no relevance. But if you want
92 | to crawl large portions of the web (all German Internet pages, e.g.), you might
93 | experience difficulties as these are not only **many** |urls| but also **many**
94 | hosts you possibly want to connect to. While the number of |urls| might not be
95 | the limiting part, the number of hosts can be as they require a lot of queue
96 | switching.
97 | 
98 | .. note:: For more info on this, see the :ref:`seccrawlerdesign` document.
99 | 
100 | 
101 | What does all that mean in practice
102 | +++++++++++++++++++++++++++++++++++
103 | 
104 | The master process binds to one socket with a `PUSH` type and to another socket
105 | using the `SUB` type. On the `SUB` socket the master registers a |zmq| filter to
106 | only receive messages with a certain *topic*: its identity.
107 | 
108 | The worker connects to the `PUSH` socket using a `PULL` type socket and
109 | receives the |urls| from the master containing the master's identity. When the
110 | |url| has been processed it sends the result back to the master using the `PUB`
111 | socket that it has connected to the master's `SUB` socket. By setting the
112 | message's topic to the identity of the sending master, it is ensured that only
113 | the master process that sent this |url| receives the answer.
114 | 
115 | Future versions of |spyder| will thus be able to work with **n** master and **m**
116 | worker processes.
117 | 
118 | 
119 | .. _sectornado:
120 | 
121 | |tornado|
122 | ---------
123 | 
124 | `Tornado `_ is a *non-blocking* or *evented
125 | IO* library developed at FriendFeed (now Facebook) to run their Python front-end
126 | servers. Basically this is a
127 | 
128 | .. code-block:: python
129 | 
130 |     while True:
131 |         callback_for_event(event)
132 | 
133 | loop. The events are any *read* or *write* event on a number of sockets or files
134 | that are registered with the loop. So instead of starting one thread for each
135 | socket connection everything runs in one thread or even process. Although this
136 | might feel strange it has been shown to be **a lot** faster for network-intensive
137 | applications that potentially serve a large number of clients.
138 | 
139 | .. note:: For more info see the `C10k Problem `_
140 | 
141 | 
142 | An additional reason for choosing |tornado| was the nice integration with |zmq|.
143 | This not only makes programming with |zmq| easier but also makes it possible to
144 | easily write *non-blocking, evented* IO programs with Python and |zmq|.
--------------------------------------------------------------------------------
/docs-source/release-notes.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: globals.rst
4 | .. _secrelnotes:
5 | 
6 | Release Notes
7 | =============
8 | 
9 | Version 0.1
10 | -----------
11 | 
12 | This is the first release of the |spyder|, so I will only cover the known
13 | issues here.
14 | 
15 | Changes
16 | +++++++
17 | 
18 | * Initial Release with a working *master* and *worker* implementation
19 | 
20 | Known Issues
21 | ++++++++++++
22 | 
23 | * If a *worker* crashes or is being stopped, the URLs it is currently processing
24 |   might get lost in the *master* and never be crawled. There are several
25 |   precautions in place to track this problem in the future, but right now it is
26 |   a bug that might also result in a memory leak.
27 | 
--------------------------------------------------------------------------------
/docs-source/roadmap.rst:
--------------------------------------------------------------------------------
1 | .. vim: set fileencoding=UTF-8 :
2 | .. vim: set tw=80 :
3 | .. include:: globals.rst
4 | 
5 | Roadmap
6 | =======
7 | 
8 | Version 0.3
9 | +++++++++++
10 | 
11 | - Integration with `Supervisord`
12 | 
13 |   The current way of starting |spyder| is quite painful. Using `supervisord`
14 |   I want to start the master and worker processes automatically and, in case
15 |   of failures, have them restarted automatically.
16 | 
--------------------------------------------------------------------------------
/local.cfg.template:
--------------------------------------------------------------------------------
1 | [local]
2 | parts =
3 |     ${zmq:sharedzmq}
4 | #    ${zmq:localzmq}
5 | 
6 | #[environment]
7 | #ZMQ_DIR = /usr/local
8 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2008 Daniel Truemper truemped@googlemail.com
3 | #
4 | # setup.py 04-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | #
19 | #
20 | 
21 | from setuptools import setup, find_packages
22 | import re
23 | 
24 | __version__ = re.search( "__version__\s*=\s*'(.*)'", open('src/spyder/__init__.py').read(), re.M).group(1)
25 | assert __version__
26 | 
27 | long_description = open("README.rst").read()
28 | assert long_description
29 | 
30 | tests_require = ['coverage>=3.4', 'nose==1.1.2']
31 | 
32 | setup(
33 |     name = "spyder",
34 |     version = __version__,
35 |     description = "A python spider",
36 |     long_description = long_description,
37 |     author = "Daniel Truemper",
38 |     author_email = "truemped@googlemail.com",
39 |     url = "",
40 |     license = "Apache 2.0",
41 |     package_dir = { '' : 'src' },
42 |     packages = find_packages('src'),
43 |     include_package_data = True,
44 |     test_suite = 'nose.collector',
45 |     install_requires = [
46 |         'pyzmq>=2.0.10',
47 |         'tornado>=1.1',
48 |         'thrift>=0.5.0',
49 |         'pycurl>=7.19.0',
50 |         'pytz>=2010o',
51 |         'brownie>=0.4.1',
52 |     ],
53 |     tests_require = tests_require,
54 |     extras_require = {'test': tests_require},
55 |     entry_points = {
56 |         'console_scripts' : [
57 |             'spyder = spyder:spyder_admin_main',
58 |         ]
59 |     },
60 |     classifiers = [
61 |         'Intended Audience :: Developers',
62 |         'Development Status :: 3 - Alpha',
63 |         'Intended Audience :: Information Technology',
64 |         'License :: OSI Approved :: Apache Software License',
65 |         'Operating System :: POSIX :: Linux',
66 |         'Programming Language :: Python :: 2.6',
67 |         'Topic :: Internet :: WWW/HTTP',
68 |         'Topic :: Internet :: WWW/HTTP :: Indexing/Search',
69 |     ]
70 | )
71 | 
--------------------------------------------------------------------------------
/src/spyder/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # __init__.py 07-Jan-2011
3 | #
4 | """
5 | The Spyder.
6 | """
7 | 
8 | import os
9 | import shutil
10 | import stat
11 | import sys
12 | 
13 | from spyder.core.settings import Settings
14 | from spyder.thrift.gen.ttypes import CrawlUri
15 | 
16 | 
17 | __version__ = '0.2.0-dev'
18 | 
19 | 
20 | def copy_skeleton_dir(destination):
21 |     """
22 |     Copy the skeleton directory (spyder_template) to a new directory.
23 |     """
24 |     if not os.path.exists(destination):
25 |         os.makedirs(destination)
26 |     template_dir = os.path.join(__path__[0], 'spyder_template')
27 |     wanted_files = [".keep", "logging.conf"]
28 | 
29 |     for root, subdirs, files in os.walk(template_dir):
30 |         relative = root[len(template_dir) + 1:]
31 |         if relative:
32 |             os.mkdir(os.path.join(destination, relative))
33 | 
34 |         # prune hidden directories in place so os.walk does not descend into
35 |         # them (removing items while iterating the list would skip entries)
36 |         subdirs[:] = [d for d in subdirs if not d.startswith('.')]
37 | 
38 |         for filename in files:
39 |             if (not filename.endswith('.py') and \
40 |                 filename not in wanted_files) or \
41 |                 filename == "__init__.py":
42 | 
43 |                 continue
44 | 
45 |             path_old = os.path.join(root, filename)
46 |             path_new = os.path.join(destination, relative, filename)
47 |             fp_old = open(path_old, 'r')
48 |             fp_new = open(path_new, 'w')
49 |             fp_new.write(fp_old.read())
50 |             fp_old.close()
51 |             fp_new.close()
52 | 
53 |             try:
54 |                 shutil.copymode(path_old, path_new)
55 |                 if sys.platform.startswith('java'):
56 |                     # On Jython there is no os.access(); skip the fixup
57 |                     continue
58 |                 if not os.access(path_new, os.W_OK):
59 |                     st_new = os.stat(path_new)
60 |                     new_perm = stat.S_IMODE(st_new.st_mode) | stat.S_IWUSR
61 |                     os.chmod(path_new, new_perm)
62 |             except OSError:
63 |                 sys.stderr.write("Could not set permission bits on %s\n" %
64 |                     path_new)
65 | 
66 | 
67 | def spyder_admin_main():
68 |     """
69 |     Method for creating new environments for Spyders.
70 |     """
71 |     if len(sys.argv) != 2 or "start" != sys.argv[1]:
72 |         sys.stderr.write(
73 |             """Usage: 'spyder start'
74 | to start a new spyder in the current directory\n""")
75 |         sys.exit(1)
76 | 
77 |     copy_skeleton_dir(os.getcwd())
78 | 
79 | 
80 | def spyder_management(settings):
81 |     """
82 |     Start new master/worker/logsink processes.
83 |     """
84 | 
85 |     from spyder import logsink
86 |     import spyder.workerprocess as worker
87 |     import spyder.masterprocess as master
88 | 
89 |     effective_settings = Settings(settings)
90 | 
91 |     args = [a.lower() for a in sys.argv]
92 | 
93 |     if "master" in args:
94 |         args.remove("master")
95 |         master.main(effective_settings)
96 |     elif "worker" in args:
97 |         worker.main(effective_settings)
98 |     elif "logsink" in args:
99 |         logsink.main(effective_settings)
100 |     else:
101 |         print >> sys.stderr, """Usage: spyder-ctrl [master|worker|logsink]
102 | 
103 | 'master'\t\tstart a master process.
104 | 'worker'\t\tstart a worker process.
105 | 'logsink'\t\tstart a sink for logmessages.
106 | """
107 |         sys.exit(1)
108 | 
--------------------------------------------------------------------------------
/src/spyder/core/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # __init__.py 10-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | """
19 | Core modules used in the spyder.
20 | """
21 | 
--------------------------------------------------------------------------------
/src/spyder/core/constants.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # constants.py 10-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | """
19 | Several constants mainly for ZeroMQ topics and messages.
20 | """
21 | 
22 | # general topic for spyder related management tasks
23 | ZMQ_SPYDER_MGMT = 'spyder.'
24 | 
25 | ZMQ_SPYDER_MGMT_WORKER = ZMQ_SPYDER_MGMT + 'worker.'
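# payloads exchanged on the worker management topic while workers join the
# crawl (AVAIL), are told to shut down (QUIT) and confirm the shutdown
# (QUIT_ACK)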
26 | ZMQ_SPYDER_MGMT_WORKER_AVAIL = 'be here now'.encode()
27 | ZMQ_SPYDER_MGMT_WORKER_QUIT = 'quit'.encode()
28 | ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK = 'quit.ack'.encode()
29 | 
30 | # constants used in the optional_vars map of CrawlUris
31 | CURI_OPTIONAL_TRUE = "1".encode()
32 | CURI_OPTIONAL_FALSE = "0".encode()
33 | 
34 | # username and password fields
35 | CURI_SITE_USERNAME = "username".encode()
36 | CURI_SITE_PASSWORD = "password".encode()
37 | 
38 | # extraction finished field
39 | CURI_EXTRACTION_FINISHED = "extraction_finished".encode()
40 | 
41 | # extracted urls field
42 | CURI_EXTRACTED_URLS = "extracted_urls".encode()
43 | 
44 | # Some internal error states
45 | CURI_EUNCAUGHT_EXCEPTION = 710
46 | 
--------------------------------------------------------------------------------
/src/spyder/core/dnscache.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # dnscache.py 24-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | """
19 | A very simple dns cache.
20 | 
21 | Currently DNS resolution is blocking, but this should eventually get a
22 | non-blocking version.
23 | """
24 | 
25 | import socket
26 | 
27 | from brownie.caching import LRUCache as LRUDict
28 | 
29 | 
30 | class DnsCache(object):
31 |     """
32 |     This is a least recently used cache for hostname to IP address mappings.
33 |     If the cache has reached its maximum size, the least recently used key is
34 |     removed and a new DNS lookup is made.
35 | 
36 |     In addition you may add static mappings via the
37 |     ``settings.STATIC_DNS_MAPPINGS`` dict.
38 |     """
39 | 
40 |     def __init__(self, settings):
41 |         """
42 |         Initialize the lru cache and the static mappings.
43 |         """
44 |         self._cache = LRUDict(maxsize=settings.SIZE_DNS_CACHE)
45 |         self._static_cache = dict()
46 |         self._static_cache.update(settings.STATIC_DNS_MAPPINGS)
47 | 
48 |     def __getitem__(self, host_port_string):
49 |         """
50 |         Retrieve the item from the cache or resolve the hostname and store the
51 |         result in the cache.
52 | 
53 |         Returns a tuple of `(ip, port)`. At the moment we only support IPv4 but
54 |         this will probably change in the future.
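
        A short usage sketch (hostname and settings are illustrative)::

            cache = DnsCache(settings)
            (ip, port) = cache["www.example.com:80"]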
55 | """ 56 | if host_port_string in self._static_cache.keys(): 57 | return self._static_cache[host_port_string] 58 | 59 | if host_port_string not in self._cache: 60 | (hostname, port) = host_port_string.split(":") 61 | infos = socket.getaddrinfo(hostname, port, 0, 0, socket.SOL_TCP) 62 | for (_family, _socktype, _proto, _canoname, sockaddr) in infos: 63 | if len(sockaddr) == 2: 64 | # IPv4 (which we prefer) 65 | self._cache[host_port_string] = sockaddr 66 | 67 | return self._cache[host_port_string] 68 | -------------------------------------------------------------------------------- /src/spyder/core/log.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # logging.py 04-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A simple pyzmq logging mixin. 20 | """ 21 | 22 | import logging 23 | 24 | 25 | class LoggingMixin: 26 | """ 27 | Simple mixin for adding logging methods to a class. 28 | """ 29 | 30 | def __init__(self, pub_handler, log_level): 31 | """ 32 | Initialize the logger. 33 | """ 34 | self._logger = logging.getLogger() 35 | self._logger.addHandler(pub_handler) 36 | self._logger.setLevel(log_level) 37 | -------------------------------------------------------------------------------- /src/spyder/core/messages.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # messages.py 14-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Definitions of messages that are being sent via ZeroMQ Sockets. 20 | 21 | Plus some (de-)serialization helpers. 22 | """ 23 | from thrift import TSerialization 24 | 25 | from spyder.thrift.gen.ttypes import CrawlUri 26 | 27 | 28 | class DataMessage(object): 29 | """ 30 | Envelope class describing `data` messages. 31 | """ 32 | 33 | def __init__(self, message=None, identity=None, curi=None): 34 | """ 35 | Construct a new message. 
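
        In the first form, `message` is a raw multipart message as read from a
        ZeroMQ stream: `message[0]` carries the identity of the sending master
        and `message[1]` the Thrift-serialized `CrawlUri`.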
36 |         """
37 |         if message is not None:
38 |             self.identity = message[0]
39 |             self.serialized_curi = message[1]
40 |             self.curi = deserialize_crawl_uri(message[1])
41 |         elif identity is not None or curi is not None:
42 |             self.identity = identity
43 |             self.curi = curi
44 |         else:
45 |             self.identity = self.curi = None
46 | 
47 |     def serialize(self):
48 |         """
49 |         Return a new message envelope from the class members.
50 |         """
51 |         return [self.identity, serialize_crawl_uri(self.curi)]
52 | 
53 |     def __eq__(self, other):
54 |         return (self.identity == other.identity
55 |             and self.curi == other.curi)
56 | 
57 | 
58 | class MgmtMessage(object):
59 |     """
60 |     Envelope class describing `management` messages.
61 |     """
62 | 
63 |     def __init__(self, message=None, topic=None, identity=None, data=None):
64 |         """
65 |         Construct a new message and if given parse the serialized message.
66 |         """
67 |         if message is not None:
68 |             self.topic = message[0]
69 |             self.identity = message[1]
70 |             self.data = message[2]
71 |         elif topic is not None or identity is not None or data is not None:
72 |             self.topic = topic
73 |             self.identity = identity
74 |             self.data = data
75 |         else:
76 |             self.topic = self.identity = self.data = None
77 | 
78 |     def serialize(self):
79 |         """
80 |         Return a new message envelope from the class members.
81 |         """
82 |         return [self.topic, self.identity, self.data]
83 | 
84 |     def __eq__(self, other):
85 |         return (self.topic == other.topic
86 |             and self.identity == other.identity
87 |             and self.data == other.data)
88 | 
89 | 
90 | def deserialize_crawl_uri(serialized):
91 |     """
92 |     Deserialize a `CrawlUri` that has been serialized using Thrift.
93 |     """
94 |     return TSerialization.deserialize(CrawlUri(), serialized)
95 | 
96 | 
97 | def serialize_crawl_uri(crawl_uri):
98 |     """
99 |     Serialize a `CrawlUri` using Thrift.
100 |     """
101 |     return TSerialization.serialize(crawl_uri)
102 | 
--------------------------------------------------------------------------------
/src/spyder/core/mgmt.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # mgmt.py 10-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | """
19 | A management module for managing components via ZeroMQ.
20 | """
21 | 
22 | from zmq.eventloop.ioloop import IOLoop
23 | from zmq.eventloop.zmqstream import ZMQStream
24 | 
25 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER
26 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT
27 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK
28 | from spyder.core.messages import MgmtMessage
29 | 
30 | 
31 | class ZmqMgmt(object):
32 |     """
33 |     A class handling the management sockets via :class:`ZMQStream` objects.
34 |     """
35 | 
36 |     def __init__(self, subscriber, publisher, **kwargs):
37 |         """
38 |         Initialize the management interface.
39 | 
40 |         The `subscriber` socket is the socket used by the Master to send
41 |         commands to the workers. The `publisher` socket is used to send
42 |         answers (e.g. the quit acknowledgement) back to the Master.
43 | 
44 |         You have to set the `zmq.SUBSCRIBE` socket option yourself!
45 |         """
46 |         self._io_loop = kwargs.get('io_loop', IOLoop.instance())
47 | 
48 |         self._subscriber = subscriber
49 |         self._in_stream = ZMQStream(self._subscriber, self._io_loop)
50 | 
51 |         self._publisher = publisher
52 |         self._out_stream = ZMQStream(self._publisher, self._io_loop)
53 | 
54 |         self._callbacks = dict()
55 | 
56 |     def _receive(self, raw_msg):
57 |         """
58 |         Main method for receiving management messages.
59 | 
60 |         `raw_msg` is a multipart message where `raw_msg[0]` contains the
61 |         topic, `raw_msg[1]` the sender's identity and `raw_msg[2]` the actual
62 |         message data.
63 |         """
64 |         msg = MgmtMessage(raw_msg)
65 | 
66 |         if msg.topic in self._callbacks:
67 |             for callback in self._callbacks[msg.topic]:
68 |                 if callable(callback):
69 |                     callback(msg)
70 | 
71 |         if ZMQ_SPYDER_MGMT_WORKER_QUIT == msg.data:
72 |             self.stop()
73 | 
74 |     def start(self):
75 |         """
76 |         Start the MGMT interface.
77 |         """
78 |         self._in_stream.on_recv(self._receive)
79 | 
80 |     def stop(self):
81 |         """
82 |         Stop the MGMT interface.
83 |         """
84 |         self._in_stream.stop_on_recv()
85 |         self.publish(topic=ZMQ_SPYDER_MGMT_WORKER, identity=None,
86 |             data=ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK)
87 | 
88 |     def close(self):
89 |         """
90 |         Close all open sockets.
91 |         """
92 |         self._in_stream.close()
93 |         self._subscriber.close()
94 |         self._out_stream.close()
95 |         self._publisher.close()
96 | 
97 |     def add_callback(self, topic, callback):
98 |         """
99 |         Add a callback to the specified topic.
100 |         """
101 |         if not callable(callback):
102 |             raise ValueError('callback must be callable')
103 | 
104 |         if topic not in self._callbacks:
105 |             self._callbacks[topic] = []
106 | 
107 |         self._callbacks[topic].append(callback)
108 | 
109 |     def remove_callback(self, topic, callback):
110 |         """
111 |         Remove a callback from the specified topic.
112 |         """
113 |         if topic in self._callbacks and callback in self._callbacks[topic]:
114 |             self._callbacks[topic].remove(callback)
115 | 
116 |     def publish(self, topic=None, identity=None, data=None):
117 |         """
118 |         Publish a message to the intended audience.
119 |         """
120 |         assert topic is not None
121 |         assert data is not None
122 |         msg = MgmtMessage(topic=topic, identity=identity, data=data)
123 |         self._out_stream.send_multipart(msg.serialize())
124 | 
--------------------------------------------------------------------------------
/src/spyder/core/prioritizer.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # prioritizer.py 01-Feb-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | """
19 | URL prioritizers will calculate priorities of new URLs and the recrawling
20 | priority.
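
For example, assuming the crawl delta is configured as one day, a page that
keeps answering `304 Not Modified` is rescheduled after one day, then two days,
and so on, up to the configured maximum priority level.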
21 | """ 22 | 23 | 24 | class SimpleTimestampPrioritizer(object): 25 | """ 26 | A simple prioritizer where the priority is based on the timestamp of the 27 | next scheduled crawl of the URL. 28 | """ 29 | 30 | def __init__(self, settings): 31 | """ 32 | Initialize the number of available priorities and the priority delta 33 | between the priorities. 34 | """ 35 | self._priorities = settings.PRIORITIZER_NUM_PRIORITIES 36 | self._default_priority = settings.PRIORITIZER_DEFAULT_PRIORITY 37 | self._delta = settings.PRIORITIZER_CRAWL_DELTA 38 | 39 | def calculate_priority(self, curi): 40 | """ 41 | Calculate the new priority based on the :class:`CrawlUri`'s current 42 | priority and HTTP status code. 43 | 44 | This should return a tuple of ``(prio_level, prio)``. 45 | """ 46 | if curi.current_priority and curi.status_code == 304: 47 | prio_level = min(curi.current_priority + 1, self._priorities) 48 | else: 49 | prio_level = 1 50 | prio = self._delta * prio_level 51 | return (prio_level, prio) 52 | -------------------------------------------------------------------------------- /src/spyder/core/queueassignment.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # queueassignment.py 14-Mar-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A collection of queue assignment classes. 20 | """ 21 | from urlparse import urlparse 22 | 23 | from spyder.core.frontier import PROTOCOLS_DEFAULT_PORT 24 | 25 | 26 | class HostBasedQueueAssignment(object): 27 | """ 28 | This class will assign URLs to queues based on the hostnames. 29 | """ 30 | 31 | def __init__(self, dnscache): 32 | """ 33 | Initialize the assignment class. 34 | """ 35 | self._dns_cache = dnscache 36 | 37 | def get_identifier(self, url): 38 | """ 39 | Get the identifier for this url. 40 | """ 41 | parsed_url = urlparse(url) 42 | return parsed_url.hostname 43 | 44 | 45 | class IpBasedQueueAssignment(HostBasedQueueAssignment): 46 | """ 47 | This class will assign urls to queues based on the server's IP address. 48 | """ 49 | 50 | def __init__(self, dnscache): 51 | """ 52 | Call the parent only. 53 | """ 54 | HostBasedQueueAssignment.__init__(self, dnscache) 55 | 56 | def get_identifier(self, url): 57 | """ 58 | Get the identifier for this url.
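For example, ``http://example.org/foo`` maps to the cached IP address of ``example.org`` (port 80 is assumed for plain ``http``), so all hosts served from the same machine end up sharing one queue.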
59 | """ 60 | parsed_url = urlparse(url) 61 | 62 | # dns resolution and caching 63 | port = parsed_url.port 64 | if not port: 65 | port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme] 66 | 67 | (ip, port) = self._dns_cache["%s:%s" % (parsed_url.hostname, port)] 68 | 69 | return "%s" % (ip,) 70 | -------------------------------------------------------------------------------- /src/spyder/core/queueselector.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # queueselector.py 25-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A random queue selector. 20 | 21 | Based on the number of queues (i.e. `FrontEnd queues`) return the number of a 22 | queue, with a bias towards lower numbered queues. 23 | """ 24 | 25 | import random 26 | 27 | 28 | class BiasedQueueSelector(object): 29 | """ 30 | The default queue selector based on random selection with a bias towards 31 | lower numbered queues. 32 | """ 33 | 34 | def __init__(self, number_of_queues): 35 | """ 36 | Initialize the queue selector with the number of available queues. 37 | """ 38 | self._weights = [] 39 | self._sum_weights = 0 40 | self._enumerate_weights = [] 41 | self.reset_queues(number_of_queues) 42 | 43 | def reset_queues(self, number_of_queues): 44 | """ 45 | Recompute the queue weights for `number_of_queues` queues. 46 | """ 47 | self._weights = [1 / (float(i) * number_of_queues) 48 | for i in range(1, number_of_queues + 1)] 49 | self._sum_weights = sum(self._weights) 50 | self._enumerate_weights = [(i, w) for i, w in enumerate(self._weights)] 51 | 52 | def get_queue(self): 53 | """ 54 | Return the next queue to use. 55 | """ 56 | random_weight = random.random() * self._sum_weights 57 | for (i, weight) in self._enumerate_weights: 58 | random_weight -= weight 59 | if random_weight < 0: 60 | return i 61 | -------------------------------------------------------------------------------- /src/spyder/core/settings.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # settings.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Simple class for working with settings. 20 | 21 | Adapted from the Django based settings system. 22 | """ 23 | 24 | from spyder import defaultsettings 25 | 26 | 27 | class Settings(object): 28 | """ 29 | Class for handling spyder settings.
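A usage sketch (``mysettings`` is a hypothetical module; every attribute written in UPPERCASE overrides the corresponding default)::

    from spyder.core.settings import Settings
    import mysettings

    settings = Settings(settings=mysettings)
    print settings.USER_AGENT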
30 | """ 31 | 32 | def __init__(self, settings=None): 33 | """ 34 | Initialize the settings. 35 | """ 36 | 37 | # load the default settings 38 | for setting in dir(defaultsettings): 39 | if setting == setting.upper(): 40 | setattr(self, setting, getattr(defaultsettings, setting)) 41 | 42 | # now override with user settings 43 | if settings is not None: 44 | for setting in dir(settings): 45 | if setting == setting.upper(): 46 | setattr(self, setting, getattr(settings, setting)) 47 | -------------------------------------------------------------------------------- /src/spyder/core/sink.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # sink.py 02-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A sink of :class:`CrawlUri`. 20 | """ 21 | 22 | 23 | class AbstractCrawlUriSink(object): 24 | """ 25 | Abstract sink. Only overwrite the methods you are interested in. 26 | """ 27 | 28 | def process_successful_crawl(self, curi): 29 | """ 30 | We have crawled a uri successfully. If there are newly extracted links, 31 | add them alongside the original uri to the frontier. 32 | """ 33 | pass 34 | 35 | def process_not_found(self, curi): 36 | """ 37 | The uri we should have crawled was not found, i.e. HTTP Error 404. Do 38 | something with that. 39 | """ 40 | pass 41 | 42 | def process_redirect(self, curi): 43 | """ 44 | There have been too many redirects, i.e. in the default config there 45 | have been more than 3 redirects. 46 | """ 47 | pass 48 | 49 | def process_server_error(self, curi): 50 | """ 51 | There has been a server error, i.e. HTTP Error 50x. Maybe we should try 52 | to crawl this uri again a little bit later. 53 | """ 54 | pass 55 | -------------------------------------------------------------------------------- /src/spyder/core/uri_uniq.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # uri_uniq.py 31-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A simple filter for unique uris. 20 | """ 21 | 22 | import hashlib 23 | 24 | 25 | class UniqueUriFilter(object): 26 | """ 27 | A simple filter for unique uris. This is used to keep the frontier clean. 
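A short usage example (``sha1`` being one of the algorithms :mod:`hashlib` knows about)::

    unique = UniqueUriFilter('sha1')
    unique.is_known('http://example.org/', add_if_unknown=True)  # -> False
    unique.is_known('http://example.org/')  # -> True, seen above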
28 | """ 29 | 30 | def __init__(self, hash_method, depth=3): 31 | """ 32 | Create a new unique uri filter using the specified `hash_method`. 33 | 34 | `depth` is used to determine the number of nested dictionaries to use. 35 | Example: using `depth=2` the dictionary storing all hash values uses the 36 | first 2 hex characters as keys, i.e. if the hash value is `abc` then 37 | 38 | hashes[a][b] = [c,] 39 | 40 | This should reduce the number of lookups within a dictionary. 41 | """ 42 | self._hash = hash_method 43 | self._depth = depth 44 | self._hashes = dict() 45 | 46 | def is_known(self, url, add_if_unknown=False): 47 | """ 48 | Test whether the given `url` is known. If not, store it from now on. 49 | """ 50 | hash_method = hashlib.new(self._hash) 51 | hash_method.update(url) 52 | hash_value = hash_method.hexdigest() 53 | 54 | dictionary = self._hashes 55 | for i in range(0, self._depth): 56 | if hash_value[i] in dictionary: 57 | dictionary = dictionary[hash_value[i]] 58 | else: 59 | # unknown dict, add it now 60 | if i == self._depth - 1: 61 | dictionary[hash_value[i]] = [] 62 | else: 63 | dictionary[hash_value[i]] = dict() 64 | dictionary = dictionary[hash_value[i]] 65 | 66 | # now dictionary is the list at the deepest level 67 | if hash_value[self._depth:] in dictionary: 68 | return True 69 | else: 70 | # since we are still here, the nested list does not 71 | # contain the given rest, so remember it now 72 | if add_if_unknown: 73 | dictionary.append(hash_value[self._depth:]) 74 | return False 75 | -------------------------------------------------------------------------------- /src/spyder/core/worker.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # worker.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | This module contains a ZeroMQ based Worker abstraction. 20 | 21 | The `ZmqWorker` class expects one incoming and one outgoing `zmq.socket` as 22 | well as an instance of the `spyder.core.mgmt.ZmqMgmt` class. 23 | """ 24 | import traceback 25 | 26 | from zmq.eventloop.ioloop import IOLoop 27 | from zmq.eventloop.zmqstream import ZMQStream 28 | 29 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER 30 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT 31 | from spyder.core.log import LoggingMixin 32 | from spyder.core.messages import DataMessage 33 | 34 | 35 | class ZmqWorker(object, LoggingMixin): 36 | """ 37 | This is the ZMQ worker implementation. 38 | 39 | The worker will register a :class:`ZMQStream` with the configured 40 | :class:`zmq.Socket` and :class:`zmq.eventloop.ioloop.IOLoop` instance. 41 | 42 | Upon `ZMQStream.on_recv` the configured `processors` will be executed 43 | with the deserialized context and the result will be published through the 44 | configured `zmq.socket`.
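Any callable that accepts and returns a :class:`CrawlUri` can act as the ``processing`` argument; a do-nothing sketch::

    def noop_processing(curi):
        # inspect or modify the CrawlUri here
        return curi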
45 | """ 46 | 47 | def __init__(self, insocket, outsocket, mgmt, processing, log_handler, 48 | log_level, io_loop=None): 49 | """ 50 | Initialize the `ZMQStream` with the `insocket` and `io_loop` and store 51 | the `outsocket`. 52 | 53 | `insocket` should be of the type `zmq.socket.PULL` `outsocket` should 54 | be of the type `zmq.socket.PUB` 55 | 56 | `mgmt` is an instance of `spyder.core.mgmt.ZmqMgmt` that handles 57 | communication between master and worker processes. 58 | """ 59 | LoggingMixin.__init__(self, log_handler, log_level) 60 | 61 | self._insocket = insocket 62 | self._io_loop = io_loop or IOLoop.instance() 63 | self._outsocket = outsocket 64 | 65 | self._processing = processing 66 | self._mgmt = mgmt 67 | self._in_stream = ZMQStream(self._insocket, self._io_loop) 68 | self._out_stream = ZMQStream(self._outsocket, self._io_loop) 69 | 70 | def _quit(self, msg): 71 | """ 72 | The worker is quitting, stop receiving messages. 73 | """ 74 | if ZMQ_SPYDER_MGMT_WORKER_QUIT == msg.data: 75 | self.stop() 76 | 77 | def _receive(self, msg): 78 | """ 79 | We have a message! 80 | 81 | `msg` is a serialized version of a `DataMessage`. 82 | """ 83 | message = DataMessage(msg) 84 | 85 | try: 86 | # this is the real work we want to do 87 | curi = self._processing(message.curi) 88 | message.curi = curi 89 | except: 90 | # catch any uncaught exception and only log it as CRITICAL 91 | self._logger.critical( 92 | "worker::Uncaught exception executing the worker for URL %s!" % 93 | (message.curi.url,)) 94 | self._logger.critical("worker::%s" % (traceback.format_exc(),)) 95 | 96 | # finished, now send the result back to the master 97 | self._out_stream.send_multipart(message.serialize()) 98 | 99 | def start(self): 100 | """ 101 | Start the worker. 102 | """ 103 | self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self._quit) 104 | self._in_stream.on_recv(self._receive) 105 | 106 | def stop(self): 107 | """ 108 | Stop the worker. 109 | """ 110 | # stop receiving 111 | self._in_stream.stop_on_recv() 112 | self._mgmt.remove_callback(ZMQ_SPYDER_MGMT_WORKER, self._quit) 113 | # but work on anything we might already have 114 | self._in_stream.flush() 115 | self._out_stream.flush() 116 | 117 | def close(self): 118 | """ 119 | Close all open sockets. 120 | """ 121 | self._in_stream.close() 122 | self._insocket.close() 123 | self._out_stream.close() 124 | self._outsocket.close() 125 | 126 | 127 | class AsyncZmqWorker(ZmqWorker): 128 | """ 129 | Asynchronous version of the `ZmqWorker`. 130 | 131 | This worker differs in that the `self._processing` method should have two 132 | arguments: the message and the socket where the result should be sent to! 133 | """ 134 | 135 | def _receive(self, msg): 136 | """ 137 | We have a message! 138 | 139 | Instead of the synchronous version we do not handle serializing and 140 | sending the result to the `self._outsocket`. This has to be handled by 141 | the `self._processing` method. 
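A matching asynchronous processing sketch (``do_work`` is a hypothetical helper; the callable itself serializes and sends the result)::

    def async_processing(message, out_stream):
        message.curi = do_work(message.curi)
        out_stream.send_multipart(message.serialize())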
142 | """ 143 | message = DataMessage(msg) 144 | 145 | try: 146 | self._processing(message, self._out_stream) 147 | except: 148 | # catch any uncaught exception and only log it as CRITICAL 149 | self._logger.critical("Uncaught exception executing the worker!") 150 | self._logger.critical(traceback.format_exc()) 151 | -------------------------------------------------------------------------------- /src/spyder/defaultsettings.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # settings.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Module for the default spyder settings. 20 | """ 21 | import logging 22 | 23 | import pytz 24 | from datetime import timedelta 25 | 26 | 27 | # simple settings 28 | LOG_LEVEL_MASTER = logging.DEBUG 29 | LOG_LEVEL_WORKER = logging.DEBUG 30 | 31 | 32 | # my local timezone 33 | LOCAL_TIMEZONE = pytz.timezone('Europe/Berlin') 34 | 35 | 36 | # Fetch Processor 37 | USER_AGENT = "Mozilla/5.0 (compatible; spyder/0.1; " + \ 38 | "+http://github.com/retresco/spyder)" 39 | MAX_CLIENTS = 10 40 | MAX_SIMULTANEOUS_CONNECTIONS = 1 41 | FOLLOW_REDIRECTS = False 42 | MAX_REDIRECTS = 3 43 | USE_GZIP = True 44 | 45 | # Proxy configuration. Both PROXY_HOST and PROXY_PORT must be set! 46 | # PROXY_USERNAME and PROXY_PASSWORD are optional 47 | PROXY_HOST = None 48 | PROXY_PORT = None 49 | PROXY_USERNAME = '' 50 | PROXY_PASSWORD = '' 51 | 52 | # Timeout settings for requests. See tornado HTTPRequest class for explanation 53 | # defaults to 20.0 (float) 54 | REQUEST_TIMEOUT = 20.0 55 | CONNECT_TIMEOUT = REQUEST_TIMEOUT 56 | 57 | VALIDATE_CERTIFICATES = True 58 | 59 | # 60 | # static dns mappings. Mapping has to be like this: 61 | # "hostname:port" => ("xxx.xxx.xxx.xxx", port) 62 | # 63 | STATIC_DNS_MAPPINGS = dict() 64 | # Size of the DNS Cache. 65 | SIZE_DNS_CACHE = 1000 66 | 67 | 68 | # Callback for Master processes. 
69 | MASTER_CALLBACK = None 70 | # Interval for the periodic updater (surviving times where nothing is to be 71 | # crawled) 72 | MASTER_PERIODIC_UPDATE_INTERVAL = 60 * 1000 73 | 74 | 75 | # Frontier implementation to use 76 | FRONTIER_CLASS = 'spyder.core.frontier.SingleHostFrontier' 77 | # Filename storing the frontier state 78 | FRONTIER_STATE_FILE = "./state.db" 79 | # checkpointing interval (uris added/changed) 80 | FRONTIER_CHECKPOINTING = 1000 81 | # The number of URIs to keep inside the HEAP 82 | FRONTIER_HEAP_SIZE = 500 83 | # Minimum number of URIs in the HEAP 84 | FRONTIER_HEAP_MIN = 100 85 | # Download duration times this factor throttles the spyder 86 | FRONTIER_CRAWL_DELAY_FACTOR = 4 87 | # Minimum delay to wait before connecting the host again (s) 88 | FRONTIER_MIN_DELAY = 5 89 | 90 | # Number of simultaneously active queues 91 | FRONTIER_ACTIVE_QUEUES = 100 92 | # Number of URLs to be processed in one queue before it is put on hold 93 | FRONTIER_QUEUE_BUDGET = 50 94 | # Punishment of server errors with the queue 95 | FRONTIER_QUEUE_BUDGET_PUNISH = 5 96 | 97 | 98 | # Name of the prioritizer class to use 99 | PRIORITIZER_CLASS = 'spyder.core.prioritizer.SimpleTimestampPrioritizer' 100 | # The number of priority levels where URIs are being assigned to (lowest means 101 | # highest priority) 102 | PRIORITIZER_NUM_PRIORITIES = 10 103 | # default priority for new urls 104 | PRIORITIZER_DEFAULT_PRIORITY = 1 105 | # Default crawl delta for known urls 106 | PRIORITIZER_CRAWL_DELTA = timedelta(days=1) 107 | 108 | 109 | # Name of the queue selector to use 110 | QUEUE_SELECTOR_CLASS = 'spyder.core.queueselector.BiasedQueueSelector' 111 | 112 | 113 | # Name of the queue assignment class to use 114 | QUEUE_ASSIGNMENT_CLASS = 'spyder.core.queueassignment.HostBasedQueueAssignment' 115 | 116 | 117 | # The pipeline of link extractors 118 | SPYDER_EXTRACTOR_PIPELINE = [ 119 | 'spyder.processor.limiter.DefaultLimiter', 120 | 'spyder.processor.htmllinkextractor.DefaultHtmlLinkExtractor', 121 | 'spyder.processor.httpextractor.HttpExtractor', 122 | ] 123 | 124 | 125 | # Default HTML Extractor settings 126 | # maximum number of chars an element name may have 127 | REGEX_LINK_XTRACTOR_MAX_ELEMENT_LENGTH = 64 128 | 129 | 130 | # The pipeline of scope processors 131 | SPYDER_SCOPER_PIPELINE = [ 132 | 'spyder.processor.scoper.RegexScoper', 133 | 'spyder.processor.stripsessions.StripSessionIds', 134 | 'spyder.processor.cleanupquery.CleanupQueryString', 135 | ] 136 | 137 | # List of positive regular expressions for the crawl scope 138 | REGEX_SCOPE_POSITIVE = [ 139 | ] 140 | 141 | # List of negative regular expressions for the crawl scope 142 | REGEX_SCOPE_NEGATIVE = [ 143 | ] 144 | 145 | 146 | # List of 404 redirects 147 | HTTP_EXTRACTOR_404_REDIRECT = [ 148 | ] 149 | 150 | 151 | # Whether to remove anchors from extracted urls. 
152 | REMOVE_ANCHORS_FROM_LINKS = True 153 | 154 | 155 | # define a parent directory for unix sockets that will be created 156 | PARENT_SOCKET_DIRECTORY = "/tmp" 157 | 158 | # 159 | # improved settings 160 | # only edit if you are usually working behind a nuclear power plant's control 161 | # panel 162 | 163 | # ZeroMQ Master Push 164 | ZEROMQ_MASTER_PUSH = "ipc://%s/spyder-zmq-master-push.sock" % \ 165 | PARENT_SOCKET_DIRECTORY 166 | ZEROMQ_MASTER_PUSH_HWM = 10 167 | 168 | # ZeroMQ Fetcher 169 | ZEROMQ_WORKER_PROC_FETCHER_PULL = ZEROMQ_MASTER_PUSH 170 | ZEROMQ_WORKER_PROC_FETCHER_PUSH = "inproc://processing/fetcher/push" 171 | ZEROMQ_WORKER_PROC_FETCHER_PUSH_HWM = 10 172 | 173 | # ZeroMQ Extractor 174 | ZEROMQ_WORKER_PROC_EXTRACTOR_PULL = ZEROMQ_WORKER_PROC_FETCHER_PUSH 175 | ZEROMQ_WORKER_PROC_EXTRACTOR_PUB = "ipc://%s/spyder-zmq-master-sub.sock" % \ 176 | PARENT_SOCKET_DIRECTORY 177 | ZEROMQ_WORKER_PROC_EXTRACTOR_PUB_HWM = 10 178 | 179 | # ZeroMQ Master Sub 180 | ZEROMQ_MASTER_SUB = ZEROMQ_WORKER_PROC_EXTRACTOR_PUB 181 | 182 | # ZeroMQ Management Sockets 183 | ZEROMQ_MGMT_MASTER = "ipc://%s/spyder-zmq-mgmt-master.sock" % \ 184 | (PARENT_SOCKET_DIRECTORY,) 185 | ZEROMQ_MGMT_WORKER = "ipc://%s/spyder-zmq-mgmt-worker.sock" % \ 186 | (PARENT_SOCKET_DIRECTORY,) 187 | 188 | # ZeroMQ logging socket 189 | ZEROMQ_LOGGING = "ipc://%s/spyder-logging.sock" % (PARENT_SOCKET_DIRECTORY,) 190 | -------------------------------------------------------------------------------- /src/spyder/encoding.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # encoding.py 09-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | 20 | def get_content_type_encoding(curi): 21 | """ 22 | Determine the content encoding based on the `Content-Type` Header. 23 | 24 | `curi` is the :class:`CrawlUri`. 25 | """ 26 | content_type = "text/plain" 27 | charset = "" 28 | 29 | if curi.rep_header and "Content-Type" in curi.rep_header: 30 | (content_type, charset) = extract_content_type_encoding( 31 | curi.rep_header["Content-Type"]) 32 | 33 | if charset == "" and curi.content_body and len(curi.content_body) >= 512: 34 | # no charset information in the http header 35 | first_bytes = curi.content_body[:512].lower() 36 | ctypestart = first_bytes.find("content-type") 37 | if ctypestart != -1: 38 | # there is a html header 39 | ctypestart = first_bytes.find("content=\"", ctypestart) 40 | ctypeend = first_bytes.find("\"", ctypestart + 9) 41 | return extract_content_type_encoding( 42 | first_bytes[ctypestart + 9:ctypeend]) 43 | 44 | return (content_type, charset) 45 | 46 | 47 | def extract_content_type_encoding(content_type_string): 48 | """ 49 | Extract the content type and encoding information. 
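For example::

    >>> extract_content_type_encoding("text/html; charset=UTF-8")
    ('text/html', 'utf_8')

Note that dashes in the charset name are normalized to underscores.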
50 | """ 51 | charset = "" 52 | content_type = "" 53 | for part in content_type_string.split(";"): 54 | part = part.strip().lower() 55 | if part.startswith("charset"): 56 | charset = part.split("=")[1] 57 | charset = charset.replace("-", "_") 58 | else: 59 | content_type = part 60 | 61 | return (content_type, charset) 62 | -------------------------------------------------------------------------------- /src/spyder/import_util.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # import_util.py 07-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A custom import method for importing modules or classes from a string. 20 | """ 21 | 22 | 23 | def custom_import(module): 24 | """ 25 | A custom import method to import a module. 26 | see: stackoverflow.com: 547829/how-to-dynamically-load-a-python-class 27 | """ 28 | mod = __import__(module) 29 | components = module.split('.') 30 | for comp in components[1:]: 31 | mod = getattr(mod, comp) 32 | return mod 33 | 34 | 35 | def import_class(classstring): 36 | """ 37 | Import a class using a `classstring`. This string is split by `.` and the 38 | last part is interpreted as class name. 39 | """ 40 | (module_name, _sep, class_name) = classstring.rpartition('.') 41 | module = custom_import(module_name) 42 | return getattr(module, class_name) 43 | -------------------------------------------------------------------------------- /src/spyder/logsink.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # logsink.py 03-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Module for aggregating spyder logs.
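For example, a message ``["spyder.master.INFO", "some text"]`` is routed to the ``masterlog`` logger at level ``info``; topics with an unknown process part fall back to the root logger.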
20 | """ 21 | import logging 22 | import logging.config 23 | import signal 24 | import os.path 25 | import traceback 26 | 27 | import zmq 28 | from zmq.core.error import ZMQError 29 | from zmq.eventloop.ioloop import IOLoop 30 | from zmq.eventloop.zmqstream import ZMQStream 31 | 32 | 33 | LOGGERS = {"default": logging.getLogger()} 34 | 35 | LOGGERS['master'] = logging.getLogger('masterlog') 36 | LOGGERS['worker'] = logging.getLogger('workerlog') 37 | 38 | 39 | def log_zmq_message(msg): 40 | """ 41 | Log a specific message. 42 | 43 | The message has the format:: 44 | 45 | message = [topic, msg] 46 | 47 | `topic` is a string of the form:: 48 | 49 | topic = "spyder.process.LEVEL.subtopic" 50 | """ 51 | topic = msg[0].split(".") 52 | if len(topic) == 3: 53 | topic.append("SUBTOPIC") 54 | if topic[1] in LOGGERS: 55 | log = getattr(LOGGERS[topic[1]], topic[2].lower()) 56 | log("%s - %s" % (topic[3], msg[1].strip())) 57 | else: 58 | log = getattr(LOGGERS['default'], topic[2].lower()) 59 | log("%s - %s" % (topic[3], msg[1].strip())) 60 | 61 | 62 | def main(settings): 63 | """ 64 | Initialize the logger sink. 65 | """ 66 | 67 | if os.path.isfile('logging.conf'): 68 | logging.config.fileConfig('logging.conf') 69 | 70 | ctx = zmq.Context() 71 | io_loop = IOLoop.instance() 72 | 73 | log_sub = ctx.socket(zmq.SUB) 74 | log_sub.setsockopt(zmq.SUBSCRIBE, "") 75 | log_sub.bind(settings.ZEROMQ_LOGGING) 76 | 77 | log_stream = ZMQStream(log_sub, io_loop) 78 | 79 | log_stream.on_recv(log_zmq_message) 80 | 81 | def handle_shutdown_signal(_sig, _frame): 82 | """ 83 | Called from the os when a shutdown signal is fired. 84 | """ 85 | log_stream.stop_on_recv() 86 | log_stream.flush() 87 | io_loop.stop() 88 | 89 | # handle kill signals 90 | signal.signal(signal.SIGINT, handle_shutdown_signal) 91 | signal.signal(signal.SIGTERM, handle_shutdown_signal) 92 | 93 | try: 94 | io_loop.start() 95 | except ZMQError: 96 | LOGGERS['master'].debug("Caught a ZMQError. Hopefully during shutdown") 97 | LOGGERS['master'].debug(traceback.format_exc()) 98 | 99 | log_stream.close() 100 | ctx.term() 101 | -------------------------------------------------------------------------------- /src/spyder/masterprocess.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # masterprocess.py 31-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | This module contains the default architecture for master processes. 20 | 21 | The main task for master processes is to create and run the **Frontier**. 22 | Starting a master involves the following steps: 23 | 24 | 1. Bind to the configured |zmq| sockets 25 | 2. Start the management interface 26 | 3. Create the frontier 27 | 4. Start the master 28 | 29 | Once the master is up and you have configured a ``settings.MASTER_CALLBACK``, 30 | the callback will be called before the master is really started, i.e.
before the 31 | ``IOLoop.start()`` is called. This will allow you to insert *Seed* |urls|, e.g. 32 | """ 33 | 34 | import logging 35 | import os 36 | import signal 37 | import socket 38 | import traceback 39 | 40 | import zmq 41 | from zmq.core.error import ZMQError 42 | from zmq.eventloop.ioloop import IOLoop 43 | from zmq.log.handlers import PUBHandler 44 | 45 | from spyder.import_util import import_class 46 | from spyder.core.master import ZmqMaster 47 | from spyder.core.mgmt import ZmqMgmt 48 | 49 | 50 | def create_master_management(settings, zmq_context, io_loop): 51 | """ 52 | Create the management interface for master processes. 53 | """ 54 | listening_socket = zmq_context.socket(zmq.SUB) 55 | listening_socket.setsockopt(zmq.SUBSCRIBE, "") 56 | listening_socket.bind(settings.ZEROMQ_MGMT_WORKER) 57 | 58 | publishing_socket = zmq_context.socket(zmq.PUB) 59 | publishing_socket.bind(settings.ZEROMQ_MGMT_MASTER) 60 | 61 | return ZmqMgmt(listening_socket, publishing_socket, io_loop=io_loop) 62 | 63 | 64 | def create_frontier(settings, log_handler): 65 | """ 66 | Create the frontier to use. 67 | """ 68 | frontier = import_class(settings.FRONTIER_CLASS) 69 | return frontier(settings, log_handler) 70 | 71 | 72 | def main(settings): 73 | """ 74 | Main method for master processes. 75 | """ 76 | # create my own identity 77 | identity = "master:%s:%s" % (socket.gethostname(), os.getpid()) 78 | 79 | ctx = zmq.Context() 80 | io_loop = IOLoop.instance() 81 | 82 | # initialize the logging subsystem 83 | log_pub = ctx.socket(zmq.PUB) 84 | log_pub.connect(settings.ZEROMQ_LOGGING) 85 | zmq_logging_handler = PUBHandler(log_pub) 86 | zmq_logging_handler.root_topic = "spyder.master" 87 | logger = logging.getLogger() 88 | logger.addHandler(zmq_logging_handler) 89 | logger.setLevel(settings.LOG_LEVEL_MASTER) 90 | 91 | logger.info("process::Starting up the master") 92 | 93 | mgmt = create_master_management(settings, ctx, io_loop) 94 | frontier = create_frontier(settings, zmq_logging_handler) 95 | 96 | publishing_socket = ctx.socket(zmq.PUSH) 97 | publishing_socket.setsockopt(zmq.HWM, settings.ZEROMQ_MASTER_PUSH_HWM) 98 | publishing_socket.bind(settings.ZEROMQ_MASTER_PUSH) 99 | 100 | receiving_socket = ctx.socket(zmq.SUB) 101 | receiving_socket.setsockopt(zmq.SUBSCRIBE, "") 102 | receiving_socket.bind(settings.ZEROMQ_MASTER_SUB) 103 | 104 | master = ZmqMaster(settings, identity, receiving_socket, 105 | publishing_socket, mgmt, frontier, zmq_logging_handler, 106 | settings.LOG_LEVEL_MASTER, io_loop) 107 | 108 | def handle_shutdown_signal(_sig, _frame): 109 | """ 110 | Called from the os when a shutdown signal is fired. 111 | """ 112 | master.shutdown() 113 | # zmq 2.1 stops blocking calls, restart the ioloop 114 | io_loop.start() 115 | 116 | # handle kill signals 117 | signal.signal(signal.SIGINT, handle_shutdown_signal) 118 | signal.signal(signal.SIGTERM, handle_shutdown_signal) 119 | 120 | if settings.MASTER_CALLBACK: 121 | callback = import_class(settings.MASTER_CALLBACK) 122 | callback(settings, ctx, io_loop, frontier) 123 | 124 | mgmt.start() 125 | master.start() 126 | 127 | # this will block until the master stops 128 | try: 129 | io_loop.start() 130 | except ZMQError: 131 | logger.debug("Caught a ZMQError. 
Hopefully during shutdown") 132 | logger.debug(traceback.format_exc()) 133 | 134 | master.close() 135 | mgmt.close() 136 | 137 | logger.info("process::Master is down.") 138 | log_pub.close() 139 | 140 | ctx.term() 141 | -------------------------------------------------------------------------------- /src/spyder/processor/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # __init__.py 14-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Package for the default processors. 20 | """ 21 | -------------------------------------------------------------------------------- /src/spyder/processor/cleanupquery.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # cleanupquery.py 14-Apr-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # 19 | """ 20 | Processor to clean up the query string. At this point we want to strip any 21 | trailing '?' or '&' and optionally remove any anchors from it. 22 | """ 23 | from spyder.core.constants import CURI_EXTRACTED_URLS 24 | 25 | 26 | class CleanupQueryString(object): 27 | """ 28 | The processor for cleaning up the query string. 29 | """ 30 | 31 | def __init__(self, settings): 32 | """ 33 | Initialize me. 34 | """ 35 | self._remove_anchors = settings.REMOVE_ANCHORS_FROM_LINKS 36 | 37 | def __call__(self, curi): 38 | """ 39 | Remove any obsolete stuff from the query string. 40 | """ 41 | if CURI_EXTRACTED_URLS not in curi.optional_vars: 42 | return curi 43 | 44 | urls = [] 45 | for raw_url in curi.optional_vars[CURI_EXTRACTED_URLS].split('\n'): 46 | urls.append(self._cleanup_query_string(raw_url)) 47 | 48 | curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join(urls) 49 | return curi 50 | 51 | def _cleanup_query_string(self, raw_url): 52 | """ 53 | Strip trailing '?' and '&' characters (and anchors, if configured). 54 | """ 55 | url = raw_url 56 | if self._remove_anchors: 57 | begin = raw_url.find("#") 58 | if begin > -1: 59 | url = raw_url[:begin] 60 | 61 | if len(url) == 0: 62 | return raw_url 63 | 64 | while url and (url[-1] == '?' or url[-1] == '&'): 65 | url = url[:-1] 66 | 67 | return url 68 | -------------------------------------------------------------------------------- /src/spyder/processor/fetcher.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # fetcher.py 14-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Module for downloading content from the web. 20 | 21 | TODO: document pycurl's features, i.e. what it can download. 22 | """ 23 | 24 | import logging 25 | 26 | from urlparse import urlsplit 27 | 28 | from tornado.httpclient import AsyncHTTPClient, HTTPRequest 29 | from tornado.httputil import HTTPHeaders 30 | 31 | from zmq.eventloop.ioloop import IOLoop 32 | 33 | from spyder.core.constants import CURI_SITE_USERNAME 34 | from spyder.core.constants import CURI_SITE_PASSWORD 35 | from spyder.time import deserialize_date_time 36 | 37 | LOG = logging.getLogger('fetcher') 38 | 39 | 40 | class FetchProcessor(object): 41 | """ 42 | A processing class for downloading all kinds of stuff from the web. 43 | """ 44 | 45 | def __init__(self, settings, io_loop=None): 46 | """ 47 | Initialize the members. 48 | """ 49 | self._user_agent = settings.USER_AGENT 50 | assert self._user_agent 51 | 52 | self._io_loop = io_loop or IOLoop.instance() 53 | 54 | self._follow_redirects = settings.FOLLOW_REDIRECTS 55 | self._max_redirects = settings.MAX_REDIRECTS 56 | self._gzip = settings.USE_GZIP 57 | 58 | if settings.PROXY_HOST: 59 | proxy_port = settings.PROXY_PORT 60 | assert proxy_port 61 | assert isinstance(proxy_port, int) 62 | 63 | self._proxy_configuration = dict( 64 | host = settings.PROXY_HOST, 65 | port = settings.PROXY_PORT, 66 | user = settings.PROXY_USERNAME, 67 | password = settings.PROXY_PASSWORD 68 | ) 69 | 70 | self._validate_cert = settings.VALIDATE_CERTIFICATES 71 | self._request_timeout = settings.REQUEST_TIMEOUT 72 | self._connect_timeout = settings.CONNECT_TIMEOUT 73 | 74 | max_clients = settings.MAX_CLIENTS 75 | max_simultaneous_connections = settings.MAX_SIMULTANEOUS_CONNECTIONS 76 | 77 | self._client = AsyncHTTPClient(self._io_loop, 78 | max_clients=max_clients, 79 | max_simultaneous_connections=max_simultaneous_connections) 80 | 81 | def __call__(self, msg, out_stream): 82 | """ 83 | Work on the current `DataMessage` and send the result to `out_stream`.
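Since fetching happens asynchronously, this processor is meant to be driven by an :class:`AsyncZmqWorker` (a sketch; socket and mgmt setup omitted)::

    fetcher = FetchProcessor(settings, io_loop)
    worker = AsyncZmqWorker(insocket, outsocket, mgmt, fetcher,
                            log_handler, settings.LOG_LEVEL_WORKER, io_loop)
    worker.start()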
84 | """ 85 | # prepare the HTTPHeaders 86 | headers = prepare_headers(msg) 87 | 88 | last_modified = None 89 | if msg.curi.req_header: 90 | # check if we have a date when the page was last crawled 91 | if "Last-Modified" in msg.curi.req_header: 92 | last_modified = deserialize_date_time( 93 | msg.curi.req_header["Last-Modified"]) 94 | 95 | # check if we have username and password present 96 | auth_username = None 97 | auth_password = None 98 | if msg.curi.optional_vars and \ 99 | CURI_SITE_USERNAME in msg.curi.optional_vars and \ 100 | CURI_SITE_PASSWORD in msg.curi.optional_vars: 101 | 102 | auth_username = msg.curi.optional_vars[CURI_SITE_USERNAME] 103 | auth_password = msg.curi.optional_vars[CURI_SITE_PASSWORD] 104 | 105 | # create the request 106 | request = HTTPRequest(msg.curi.effective_url, 107 | method="GET", 108 | headers=headers, 109 | auth_username=auth_username, 110 | auth_password=auth_password, 111 | if_modified_since=last_modified, 112 | follow_redirects=self._follow_redirects, 113 | max_redirects=self._max_redirects, 114 | user_agent=self._user_agent, 115 | request_timeout = self._request_timeout, 116 | connect_timeout = self._connect_timeout, 117 | validate_cert = self._validate_cert) 118 | 119 | if hasattr(self, '_proxy_configuration'): 120 | request.proxy_host = self._proxy_configuration['host'] 121 | request.proxy_port = self._proxy_configuration['port'] 122 | request.proxy_username = \ 123 | self._proxy_configuration.get('user', None) 124 | request.proxy_password = \ 125 | self._proxy_configuration.get('password', None) 126 | 127 | LOG.info("proc.fetch::request for %s" % msg.curi.url) 128 | self._client.fetch(request, handle_response(msg, out_stream)) 129 | 130 | 131 | def prepare_headers(msg): 132 | """ 133 | Construct the :class:`HTTPHeaders` with all the necessary information for 134 | the request. 135 | """ 136 | # construct the headers 137 | headers = HTTPHeaders() 138 | 139 | if msg.curi.req_header: 140 | # check if we have a previous Etag 141 | if "Etag" in msg.curi.req_header: 142 | headers["If-None-Match"] = \ 143 | msg.curi.req_header["Etag"] 144 | 145 | # manually set the Host header since we are requesting using an IP 146 | host = urlsplit(msg.curi.url).hostname 147 | if host is None: 148 | LOG.error("proc.fetch::cannot extract hostname from url '%s'" % 149 | msg.curi.url) 150 | else: 151 | headers["Host"] = host 152 | 153 | return headers 154 | 155 | 156 | def handle_response(msg, out_stream): 157 | """ 158 | Decorator for the actual callback function that will extract interesting 159 | info and forward the response. 160 | """ 161 | def handle_server_response(response): 162 | """ 163 | The actual callback function. 164 | 165 | Extract interesting info from the response using 166 | :meth:`extract_info_from_response` and forward the result to the 167 | `out_stream`. 168 | """ 169 | extract_info_from_response(response, msg) 170 | LOG.info("proc.fetch::response for %s (took '%s'ms)" % 171 | (msg.curi.url, response.request_time)) 172 | if response.code >= 400: 173 | LOG.error("proc.fetch::response error: %s", response) 174 | out_stream.send_multipart(msg.serialize()) 175 | 176 | return handle_server_response 177 | 178 | 179 | def extract_info_from_response(response, msg): 180 | """ 181 | Extract the interesting information from a HTTPResponse. 
182 | """ 183 | msg.curi.status_code = response.code 184 | msg.curi.req_header = response.request.headers 185 | msg.curi.rep_header = response.headers 186 | msg.curi.req_time = response.request_time 187 | msg.curi.queue_time = response.time_info["queue"] 188 | msg.curi.content_body = response.body 189 | 190 | return msg 191 | -------------------------------------------------------------------------------- /src/spyder/processor/httpextractor.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Daniel Truemper truemped@googlemail.com 3 | # 4 | # httpextractor.py 17-Mar-2011 5 | # 6 | # Licensed to the Apache Software Foundation (ASF) under one 7 | # or more contributor license agreements. See the NOTICE file 8 | # distributed with this work for additional information 9 | # regarding copyright ownership. The ASF licenses this file 10 | # to you under the Apache License, Version 2.0 (the 11 | # "License"); you may not use this file except in compliance 12 | # with the License. You may obtain a copy of the License at 13 | # 14 | # http://www.apache.org/licenses/LICENSE-2.0 15 | # 16 | # Unless required by applicable law or agreed to in writing, 17 | # software distributed under the License is distributed on an 18 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | # KIND, either express or implied. See the License for the 20 | # specific language governing permissions and limitations 21 | # under the License. 22 | # 23 | # 24 | """ 25 | Link extractor for detecting links in HTTP codes. 26 | 27 | The main use case for this are HTTP redirects, e.g. In the case of a redirect 28 | the HTTP status code ``30X`` is present and the ``Location`` header indicates 29 | the new location. 30 | """ 31 | import urlparse 32 | 33 | from spyder.core.constants import CURI_EXTRACTED_URLS 34 | 35 | 36 | class HttpExtractor(object): 37 | """ 38 | The processor for extracting links from ``HTTP`` headers. 39 | """ 40 | 41 | def __init__(self, settings): 42 | """ 43 | Initialize the extractor. 44 | """ 45 | self._not_found_redirects = settings.HTTP_EXTRACTOR_404_REDIRECT 46 | 47 | def __call__(self, curi): 48 | """ 49 | Perform the URL extraction in case of a redirect code. 50 | 51 | I.e. if ``300 <= curi.status_code < 400``, then search for any 52 | HTTP ``Location`` header and append the given URL to the list of 53 | extracted URLs. 54 | """ 55 | 56 | if 300 <= curi.status_code < 400 and curi.rep_header and \ 57 | "Location" in curi.rep_header: 58 | 59 | link = curi.rep_header["Location"] 60 | 61 | if link.find("://") == -1: 62 | # a relative link. this is bad behaviour, but yeah, you know... 63 | link = urlparse.urljoin(curi.url, link) 64 | 65 | if link not in self._not_found_redirects: 66 | if not hasattr(curi, "optional_vars"): 67 | curi.optional_vars = dict() 68 | 69 | if not CURI_EXTRACTED_URLS in curi.optional_vars: 70 | curi.optional_vars[CURI_EXTRACTED_URLS] = link 71 | else: 72 | curi.optional_vars[CURI_EXTRACTED_URLS] += "\n" + link 73 | 74 | return curi 75 | -------------------------------------------------------------------------------- /src/spyder/processor/limiter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # limiter.py 18-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | A processor used for limiting the extraction and scoping processing steps. 20 | 21 | Basically this is used to keep `robots.txt` files from being processed. 22 | """ 23 | 24 | from spyder.core.constants import CURI_OPTIONAL_TRUE, CURI_EXTRACTION_FINISHED 25 | 26 | 27 | class DefaultLimiter(object): 28 | """ 29 | The default crawl limiter. 30 | """ 31 | 32 | def __init__(self, settings): 33 | """ 34 | Initialize the limiter with the given settings. 35 | """ 36 | pass 37 | 38 | def __call__(self, curi): 39 | """ 40 | Do the actual limiting. 41 | """ 42 | return self._do_not_process_robots(curi) 43 | 44 | def _do_not_process_robots(self, curi): 45 | """ 46 | Do not process `CrawlUris` if they are **robots.txt** files. 47 | """ 48 | if CURI_EXTRACTION_FINISHED not in curi.optional_vars and \ 49 | curi.effective_url.endswith("robots.txt"): 50 | curi.optional_vars[CURI_EXTRACTION_FINISHED] = CURI_OPTIONAL_TRUE 51 | 52 | return curi 53 | -------------------------------------------------------------------------------- /src/spyder/processor/scoper.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # scoper.py 24-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | The *Crawl Scope* defines which *URLs* the *Spyder* should process. The main 20 | use cases for it are: 21 | 22 | - only spider content from the *Seed* Hosts 23 | - do not spider images, css, videos 24 | 25 | and there are probably a lot of other reasons you want to have at least one 26 | scoper configured, otherwise you might end up downloading the internet. 27 | 28 | So each scoper should iterate over the 29 | ``curi.optional_vars[CURI_EXTRACTED_URLS]`` and determine if it should be 30 | downloaded or not. 31 | 32 | The :class:`RegexScoper` maintains a list of regular expressions that define 33 | the crawl scope. Two classes of expressions exist: positive and negative. 34 | The initial decision of the scoper is to not download its content. If a regex 35 | from the positive list matches, and no regex from the negative list matches, 36 | the *URL* is marked for downloading. In any other case, the *URL* will be 37 | abandoned. An example configuration follows the note below. 38 | 39 | .. note:: We should really split up the regex scoper and allow the user to 40 | configure more than just one scoper.
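For example, a scope that stays on one host and skips common image and stylesheet suffixes could look like this (hypothetical patterns)::

    REGEX_SCOPE_POSITIVE = [
        "^http://www\.example\.org/.*",
    ]
    REGEX_SCOPE_NEGATIVE = [
        ".*\.(jpg|jpeg|png|gif|css)$",
    ]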
41 | """ 42 | 43 | import re 44 | 45 | from spyder.core.constants import CURI_EXTRACTED_URLS 46 | 47 | 48 | class RegexScoper(object): 49 | """ 50 | The scoper based on regular expressions. 51 | 52 | There are two settings that influence this scoper: 53 | 54 | 1. ``settings.REGEX_SCOPE_POSITIVE`` 55 | 2. ``settings.REGEX_SCOPE_NEGATIVE`` 56 | 57 | Both have to be of type ``list``. The scoper is executed in the 58 | :meth:`__call__` method. 59 | """ 60 | 61 | def __init__(self, settings): 62 | """ 63 | Compile the regular expressions. 64 | """ 65 | self._positive_regex = [] 66 | for regex in settings.REGEX_SCOPE_POSITIVE: 67 | self._positive_regex.append(re.compile(regex)) 68 | 69 | self._negative_regex = [] 70 | for regex in settings.REGEX_SCOPE_NEGATIVE: 71 | self._negative_regex.append(re.compile(regex)) 72 | 73 | def __call__(self, curi): 74 | """ 75 | Filter all newly extracted URLs for those we want in this crawl. 76 | """ 77 | if CURI_EXTRACTED_URLS not in curi.optional_vars: 78 | return curi 79 | 80 | urls = [] 81 | for url in curi.optional_vars[CURI_EXTRACTED_URLS].split("\n"): 82 | add_url = False 83 | for regex in self._positive_regex: 84 | if regex.match(url): 85 | add_url = True 86 | 87 | for regex in self._negative_regex: 88 | if regex.match(url): 89 | add_url = False 90 | 91 | if add_url: 92 | urls.append(url) 93 | 94 | curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join(urls) 95 | return curi 96 | -------------------------------------------------------------------------------- /src/spyder/processor/stripsessions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # stripsessions.py 14-Apr-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # 19 | """ 20 | Processor to strip all session ids from the extracted URLs. It should be placed 21 | at the very end of the scoper chain in order to process only those URLs that 22 | are relevant for the crawl. 23 | 24 | It basically searches for 25 | 26 | sid= 27 | jsessionid= 28 | phpsessid= 29 | aspsessionid= 30 | """ 31 | from spyder.core.constants import CURI_EXTRACTED_URLS 32 | 33 | 34 | class StripSessionIds(object): 35 | """ 36 | The processor for removing session information from the query string. 37 | """ 38 | 39 | def __init__(self, settings): 40 | """ 41 | Initialize me. 42 | """ 43 | self._session_params = ['jsessionid=', 'phpsessid=', 44 | 'aspsessionid=', 'sid='] 45 | 46 | def __call__(self, curi): 47 | """ 48 | Main method stripping the session stuff from the query string.
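For example, ``http://example.org/p?sid=abc`` becomes ``http://example.org/p?`` here; the now dangling ``?`` is removed by the ``CleanupQueryString`` processor that follows in the default scoper pipeline.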
49 | """ 50 | if CURI_EXTRACTED_URLS not in curi.optional_vars: 51 | return curi 52 | 53 | urls = [] 54 | for raw_url in curi.optional_vars[CURI_EXTRACTED_URLS].split('\n'): 55 | urls.append(self._remove_session_ids(raw_url)) 56 | 57 | curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join(urls) 58 | return curi 59 | 60 | def _remove_session_ids(self, raw_url): 61 | """ 62 | Remove the session information. 63 | """ 64 | for session in self._session_params: 65 | url = raw_url.lower() 66 | begin = url.find(session) 67 | while begin > -1: 68 | end = url.find('&', begin) 69 | if end == -1: 70 | raw_url = raw_url[:begin] 71 | else: 72 | raw_url = "%s%s" % (raw_url[:begin], raw_url[end:]) 73 | url = raw_url.lower() 74 | begin = url.find(session) 75 | 76 | return raw_url 77 | -------------------------------------------------------------------------------- /src/spyder/spyder_template/log/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/9a2de6ec4c25d4dc85802305d5675a52c3ebb750/src/spyder/spyder_template/log/.keep -------------------------------------------------------------------------------- /src/spyder/spyder_template/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys = root, master, worker 3 | 4 | [handlers] 5 | keys = master, worker 6 | 7 | [formatters] 8 | keys = default 9 | 10 | [logger_root] 11 | level = NOTSET 12 | handlers = 13 | 14 | [logger_master] 15 | level = DEBUG 16 | handlers = master 17 | qualname = masterlog 18 | 19 | [handler_master] 20 | class = handlers.TimedRotatingFileHandler 21 | formatter = default 22 | args = ('log/master.log', 'D', 1, 10) 23 | 24 | [logger_worker] 25 | level = DEBUG 26 | handlers = worker 27 | qualname = workerlog 28 | 29 | [handler_worker] 30 | class = handlers.TimedRotatingFileHandler 31 | formatter = default 32 | args = ('log/worker.log', 'D', 1, 10) 33 | 34 | [formatter_default] 35 | format = [%(asctime)s] - %(levelname)s - %(message)s 36 | class = logging.Formatter 37 | -------------------------------------------------------------------------------- /src/spyder/spyder_template/master.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # master.py 21-Apr-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # 19 | """ 20 | Master module starting a crawl. 21 | """ 22 | from spyder import CrawlUri 23 | 24 | from sink import MySink 25 | 26 | 27 | def initialize(settings, zmq_ctx, io_loop, frontier): 28 | """ 29 | Initialize the **Master**. 30 | 31 | You may access and manipulate the `settings`, the process global `zmq_ctx`, 32 | *pyzmq's* `io_loop` and the `frontier`. 
33 | """ 34 | frontier.add_uri(CrawlUri("http://www.dmoz.org/Recreation/Boating/Sailing/"))) 35 | frontier.add_sink(MySink(settings)) 36 | -------------------------------------------------------------------------------- /src/spyder/spyder_template/settings.py: -------------------------------------------------------------------------------- 1 | # 2 | # settings.py 3 | # 4 | """ 5 | Your crawler specific settings. 6 | """ 7 | import logging 8 | 9 | LOG_LEVEL_MASTER = logging.INFO 10 | LOG_LEVEL_WORKER = logging.INFO 11 | 12 | USER_AGENT = "Mozilla/5.0 (compatible; spyder/0.1; " + \ 13 | "+http://github.com/retresco/spyder)" 14 | 15 | # callback for initializing the periodic crawling of the sitemap 16 | MASTER_CALLBACK = 'master.initialize' 17 | 18 | # List of positive regular expressions for the crawl scope 19 | REGEX_SCOPE_POSITIVE = [ 20 | "^http://www\.dmoz\.org/Recreation/Boating/Sailing/.*", 21 | ] 22 | 23 | # List of negative regular expressions for the crawl scope 24 | REGEX_SCOPE_NEGATIVE = [ 25 | "^http://www\.dmoz\.org/Recreation/Boating/Sailing/Racing/.*", 26 | ] 27 | -------------------------------------------------------------------------------- /src/spyder/spyder_template/sink.py: -------------------------------------------------------------------------------- 1 | # 2 | # sink.py 21-Apr-2011 3 | # 4 | """ 5 | Put your storage code here. 6 | """ 7 | from spyder.core.sink import AbstractCrawlUriSink 8 | 9 | 10 | class MySink(AbstractCrawlUriSink): 11 | """ 12 | This is my sink. 13 | """ 14 | 15 | def __init__(self, settings): 16 | """ 17 | Initialize me with some settings. 18 | """ 19 | pass 20 | 21 | def process_successful_crawl(self, curi): 22 | """ 23 | We have crawled a uri successfully. If there are newly extracted links, 24 | add them alongside the original uri to the frontier. 25 | """ 26 | pass 27 | 28 | def process_not_found(self, curi): 29 | """ 30 | The uri we should have crawled was not found, i.e. HTTP Error 404. Do 31 | something with that. 32 | """ 33 | pass 34 | 35 | def process_redirect(self, curi): 36 | """ 37 | There have been too many redirects, i.e. in the default config there 38 | have been more than 3 redirects. 39 | """ 40 | pass 41 | 42 | def process_server_error(self, curi): 43 | """ 44 | There has been a server error, i.e. HTTP Error 50x. Maybe we should try 45 | to crawl this uri again a little bit later. 46 | """ 47 | pass 48 | -------------------------------------------------------------------------------- /src/spyder/spyder_template/spyder-ctrl.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # spyder.py 02-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | import sys 20 | 21 | import spyder 22 | 23 | try: 24 | import settings 25 | except ImportError: 26 | print >> sys.stderr, \ 27 | """Cannot find settings.py in the directory containing %s""" % __file__ 28 | sys.exit(1) 29 | 30 | 31 | if __name__ == "__main__": 32 | spyder.spyder_management(settings) 33 | -------------------------------------------------------------------------------- /src/spyder/thrift/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # __init__.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Modules for working with the thrift generated code. 20 | """ 21 | -------------------------------------------------------------------------------- /src/spyder/thrift/gen/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # __init__.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | """ 19 | Generated modules from thrift. 20 | """ 21 | 22 | __all__ = ['ttypes', 'constants'] 23 | -------------------------------------------------------------------------------- /src/spyder/thrift/gen/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | 7 | from thrift.Thrift import * 8 | from ttypes import * 9 | 10 | -------------------------------------------------------------------------------- /src/spyder/time.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # time.py 15-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | #
19 | """
20 | Time related utilities.
21 | """
22 | from datetime import datetime
23 | 
24 | import pytz
25 | 
26 | SERVER_TIME_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"
27 | GMT = pytz.timezone('GMT')
28 | 
29 | 
30 | def serialize_date_time(date_time):
31 |     """
32 |     Create a string of the datetime.
33 |     """
34 |     return GMT.localize(date_time).strftime(SERVER_TIME_FORMAT)
35 | 
36 | 
37 | def deserialize_date_time(date_string):
38 |     """
39 |     Read a string as a datetime.
40 |     """
41 |     return datetime.strptime(date_string, SERVER_TIME_FORMAT)
42 | 
-------------------------------------------------------------------------------- /test/static/robots.txt: --------------------------------------------------------------------------------
1 | User-agent: *
2 | Disallow: /RealMedia/
3 | Disallow: /archiv/
4 | Disallow: /forum/
5 | Disallow: /cgi-bin/
6 | Disallow: /werbung/
7 | Disallow: /artikelversand/
8 | Disallow: /grossbild/
9 | Disallow: /druckbild/
10 | Disallow: /druckrezension/
11 | Disallow: /druckversion/
12 | Disallow: /active/
13 | Disallow: /staticgen/mobil/
14 | 
15 | #User-agent: Firefly/1.0
16 | #Disallow: /
17 | 
18 | User-agent: WebReaper
19 | Disallow: /
20 | 
21 | User-agent: Slurp
22 | Crawl-delay: 18
23 | 
-------------------------------------------------------------------------------- /test/test_async_worker.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # test_async_worker.py 14-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | import logging
19 | from logging import StreamHandler
20 | import sys
21 | import unittest
22 | 
23 | import zmq
24 | from zmq.eventloop.ioloop import IOLoop
25 | from zmq.eventloop.zmqstream import ZMQStream
26 | 
27 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER
28 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT
29 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK
30 | from spyder.core.mgmt import ZmqMgmt
31 | from spyder.core.worker import AsyncZmqWorker
32 | from spyder.core.messages import DataMessage, MgmtMessage
33 | from spyder.thrift.gen.ttypes import CrawlUri
34 | 
35 | 
36 | class ZmqTornadoIntegrationTest(unittest.TestCase):
37 | 
38 |     def setUp(self):
39 | 
40 |         # create the io_loop
41 |         self._io_loop = IOLoop.instance()
42 | 
43 |         # and the context
44 |         self._ctx = zmq.Context(1)
45 | 
46 |         # setup the mgmt sockets
47 |         self._setup_mgmt_sockets()
48 | 
49 |         # setup the data sockets
50 |         self._setup_data_sockets()
51 | 
52 |         # setup the management interface
53 |         self._mgmt = ZmqMgmt( self._mgmt_sockets['worker_sub'],
54 |             self._mgmt_sockets['worker_pub'], io_loop=self._io_loop)
55 |         self._mgmt.start()
56 |         self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end)
57 | 
58 |     def tearDown(self):
59 |         # stop the mgmt
60 |         self._mgmt.stop()
61 | 
62 |         # close all sockets
63 |         for socket in self._mgmt_sockets.itervalues():
64 |             socket.close()
65 |         for socket in self._worker_sockets.itervalues():
66 |             socket.close()
67 | 
68 |         # terminate the context
69 |         self._ctx.term()
70 | 
71 |     def _setup_mgmt_sockets(self):
72 | 
73 |         self._mgmt_sockets = dict()
74 | 
75 |         # address for the communication from master to worker(s)
76 |         mgmt_master_worker = 'inproc://master/worker/coordination/'
77 | 
78 |         # connect the master with the worker
79 |         # the master is a ZMQStream because we are sending msgs from the test
80 |         sock = self._ctx.socket(zmq.PUB)
81 |         sock.bind(mgmt_master_worker)
82 |         self._mgmt_sockets['tmp1'] = sock
83 |         self._mgmt_sockets['master_pub'] = ZMQStream(sock, self._io_loop)
84 |         # the worker stream is created inside the ZmqMgmt class
85 |         self._mgmt_sockets['worker_sub'] = self._ctx.socket(zmq.SUB)
86 |         self._mgmt_sockets['worker_sub'].setsockopt(zmq.SUBSCRIBE, "")
87 |         self._mgmt_sockets['worker_sub'].connect(mgmt_master_worker)
88 | 
89 |         # address for the communication from worker(s) to master
90 |         mgmt_worker_master = 'inproc://worker/master/coordination/'
91 | 
92 |         # connect the worker with the master
93 |         self._mgmt_sockets['worker_pub'] = self._ctx.socket(zmq.PUB)
94 |         self._mgmt_sockets['worker_pub'].bind(mgmt_worker_master)
95 |         sock = self._ctx.socket(zmq.SUB)
96 |         sock.setsockopt(zmq.SUBSCRIBE, "")
97 |         sock.connect(mgmt_worker_master)
98 |         self._mgmt_sockets['tmp2'] = sock
99 |         self._mgmt_sockets['master_sub'] = ZMQStream(sock, self._io_loop)
100 | 
101 |     def _setup_data_sockets(self):
102 | 
103 |         self._worker_sockets = dict()
104 | 
105 |         # address for master -> worker communication
106 |         data_master_worker = 'inproc://master/worker/pipeline/'
107 | 
108 |         sock = self._ctx.socket(zmq.PUSH)
109 |         sock.bind(data_master_worker)
110 |         self._worker_sockets['tmp3'] = sock
111 |         self._worker_sockets['master_push'] = ZMQStream(sock, self._io_loop)
112 |         self._worker_sockets['worker_pull'] = self._ctx.socket(zmq.PULL)
113 |         self._worker_sockets['worker_pull'].connect(data_master_worker)
114 | 
115 |         # address for worker -> master communication
116 |         data_worker_master = 'inproc://worker/master/pipeline/'
117 | 
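        # (added note) the worker publishes its results on a PUB socket;
        # the test subscribes with a SUB socket wrapped in a ZMQStream so
        # the assertions below run as callbacks on the shared io_loop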
118 |         self._worker_sockets['worker_pub'] = self._ctx.socket(zmq.PUB)
119 |         self._worker_sockets['worker_pub'].bind(data_worker_master)
120 |         sock = self._ctx.socket(zmq.SUB)
121 |         sock.setsockopt(zmq.SUBSCRIBE, "")
122 |         sock.connect(data_worker_master)
123 |         self._worker_sockets['tmp4'] = sock
124 |         self._worker_sockets['master_sub'] = ZMQStream(sock, self._io_loop)
125 | 
126 |     def on_mgmt_end(self, _msg):
127 |         self._io_loop.stop()
128 | 
129 | 
130 | class AsyncZmqWorkerIntegrationTest(ZmqTornadoIntegrationTest):
131 | 
132 |     def echo_processing(self, data_message, out_socket):
133 |         msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
134 |             data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
135 |         self._mgmt_sockets['master_pub'].send_multipart(msg.serialize())
136 |         out_socket.send_multipart(data_message.serialize())
137 | 
138 |     def test_that_async_worker_works(self):
139 |         worker = AsyncZmqWorker( self._worker_sockets['worker_pull'],
140 |             self._worker_sockets['worker_pub'],
141 |             self._mgmt,
142 |             self.echo_processing,
143 |             StreamHandler(sys.stdout),
144 |             logging.DEBUG,
145 |             self._io_loop)
146 | 
147 |         worker.start()
148 | 
149 |         curi = CrawlUri(url="http://localhost")
150 |         msg = DataMessage()
151 |         msg.identity = "me"
152 |         msg.curi = curi
153 | 
154 |         def assert_correct_data(msg2):
155 |             msg3 = DataMessage(msg2)
156 |             self.assertEqual(msg, msg3)
157 | 
158 |         self._worker_sockets['master_sub'].on_recv(assert_correct_data)
159 | 
160 |         def assert_correct_mgmt(raw_msg):
161 |             self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, MgmtMessage(raw_msg).data)
162 | 
163 |         self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt)
164 | 
165 |         self._worker_sockets['master_push'].send_multipart(msg.serialize())
166 | 
167 |         self._io_loop.start()
168 |         worker._in_stream.flush()
169 | 
170 | 
171 | if __name__ == '__main__':
172 |     unittest.main()
173 | 
-------------------------------------------------------------------------------- /test/test_cleanup_qs.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # test_cleanup_qs.py 14-Apr-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | #
19 | import unittest
20 | 
21 | from spyder.core.settings import Settings
22 | from spyder.processor.cleanupquery import CleanupQueryString
23 | 
24 | 
25 | class CleanupQueryStringTest(unittest.TestCase):
26 | 
27 |     def test_that_cleaning_qs_works(self):
28 |         s = Settings()
29 |         c = CleanupQueryString(s)
30 | 
31 |         self.assertEqual("http://tesT.com/t.html?p=a",
32 |             c._cleanup_query_string("http://tesT.com/t.html?p=a#top"))
33 | 
34 |         self.assertEqual("http://test.com/t.html",
35 |             c._cleanup_query_string("http://test.com/t.html?#top"))
36 | 
37 |         self.assertEqual("http://test.com/t.html?test=a",
38 |             c._cleanup_query_string("http://test.com/t.html?test=a&"))
39 | 
-------------------------------------------------------------------------------- /test/test_default_html_link_extractor.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # test_default_html_link_extractor.py 21-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | 
19 | import unittest
20 | 
21 | from spyder.core.constants import CURI_EXTRACTED_URLS
22 | from spyder.core.settings import Settings
23 | from spyder.processor.htmllinkextractor import DefaultHtmlLinkExtractor
24 | from spyder.thrift.gen.ttypes import CrawlUri
25 | 
26 | 
27 | class HtmlLinkExtractorTest(unittest.TestCase):
28 | 
29 |     def test_that_content_type_restriction_works(self):
30 |         xtor = DefaultHtmlLinkExtractor(Settings())
31 | 
32 |         curi = CrawlUri()
33 |         curi.rep_header = dict()
34 |         curi.rep_header["Content-Type"] = "text/html"
35 |         self.assertTrue(xtor._restrict_content_type(curi))
36 |         curi.rep_header["Content-Type"] = "pille/palle"
37 |         self.assertFalse(xtor._restrict_content_type(curi))
38 | 
39 |     def test_link_extraction_works(self):
40 | 
41 |         src = "<a href='http://www.google.de'> viel text</a>" + \
42 |             "und <a href='/relative.html'>" + \
43 |             "noch mehr!</a>" + \
44 |             "<a href='evenmorerelative.html'></a>"
45 | 
46 |         curi = CrawlUri()
47 |         curi.rep_header = dict()
48 |         curi.rep_header["Content-Type"] = "text/html; charset=utf-8"
49 |         curi.url = "http://www.bmg.bund.de/test/"
50 |         curi.content_body = src
51 |         curi.optional_vars = dict()
52 | 
53 |         xtor = DefaultHtmlLinkExtractor(Settings())
54 |         curi = xtor(curi)
55 | 
56 |         links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
57 |         self.assertEqual("http://www.google.de", links[0])
58 |         self.assertEqual("http://www.bmg.bund.de/relative.html", links[1])
59 |         self.assertEqual("http://www.bmg.bund.de/test/evenmorerelative.html",
60 |             links[2])
61 | 
62 |     def test_link_extraction_with_base_works(self):
63 | 
64 |         src = "<html><head><base href='http://www.bing.com/'></head>" + \
65 |             "<a href='http://www.google.de'> viel text</a>" + \
66 |             "und <a href='/relative.html'>" + \
67 |             "noch mehr!</a><a href='evenmorerelative.html'></a>"
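        # (added note) the <base href> above should make the extractor
        # resolve relative links against http://www.bing.com/ instead of
        # curi.url; the assertions below verify this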
68 | 
69 |         curi = CrawlUri()
70 |         curi.rep_header = dict()
71 |         curi.rep_header["Content-Type"] = "text/html; charset=utf-8"
72 |         curi.url = "http://www.bmg.bund.de/test/"
73 |         curi.content_body = src
74 |         curi.optional_vars = dict()
75 | 
76 |         xtor = DefaultHtmlLinkExtractor(Settings())
77 |         curi = xtor(curi)
78 | 
79 |         links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
80 |         self.assertEqual("http://www.google.de", links[0])
81 |         self.assertEqual("http://www.bing.com/relative.html", links[1])
82 |         self.assertEqual("http://www.bing.com/evenmorerelative.html",
83 |             links[2])
84 | 
85 |     def test_missing_encoding_works(self):
86 |         src = "<a href='http://www.google.de'> viel text</a>" + \
87 |             "und <a href='/relative.html'>" + \
88 |             "noch mehr!</a><a href='evenmorerelative.html'></a>"
89 | 
90 |         curi = CrawlUri()
91 |         curi.rep_header = dict()
92 |         curi.rep_header["Content-Type"] = "text/html"
93 |         curi.url = "http://www.bmg.bund.de/test/"
94 |         curi.content_body = src
95 |         curi.optional_vars = dict()
96 | 
97 |         xtor = DefaultHtmlLinkExtractor(Settings())
98 |         curi = xtor(curi)
99 | 
100 |         links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
101 |         self.assertEqual("http://www.google.de", links[0])
102 |         self.assertEqual("http://www.bmg.bund.de/relative.html", links[1])
103 |         self.assertEqual("http://www.bmg.bund.de/test/evenmorerelative.html",
104 |             links[2])
105 | 
106 | 
107 | if __name__ == '__main__':
108 |     unittest.main()
109 | 
-------------------------------------------------------------------------------- /test/test_dns_cache.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # test_dns_cache.py 25-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | 
19 | import unittest
20 | 
21 | from spyder.core.dnscache import DnsCache
22 | from spyder.core.settings import Settings
23 | 
24 | 
25 | class DnsCacheTest(unittest.TestCase):
26 | 
27 |     def test_dns_cache(self):
28 |         s = Settings()
29 |         s.SIZE_DNS_CACHE = 1
30 |         dns = DnsCache(s)
31 |         self.assertEqual(('127.0.0.1', 80), dns["localhost:80"])
32 |         self.assertEqual(('127.0.0.1', 81), dns["localhost:81"])
33 |         self.assertEqual(1, len(dns._cache))
34 | 
35 |     def test_static_dns_mapping(self):
36 |         s = Settings()
37 |         s.STATIC_DNS_MAPPINGS = {"localhost:123": ("-1.-1.-1.-1", 123)}
38 |         dns = DnsCache(s)
39 |         self.assertEqual(("-1.-1.-1.-1", 123), dns["localhost:123"])
40 |         self.assertEqual(('127.0.0.1', 80), dns["localhost:80"])
41 |         self.assertEqual(1, len(dns._cache))
42 | 
43 | 
44 | if __name__ == '__main__':
45 |     unittest.main()
46 | 
-------------------------------------------------------------------------------- /test/test_fetch_processor.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com
3 | #
4 | # test_fetch_processor.py 17-Jan-2011
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | import logging
19 | from logging import StreamHandler
20 | import sys
21 | 
22 | import os.path
23 | import time
24 | import random
25 | 
26 | import unittest
27 | 
28 | import tornado
29 | import tornado.httpserver
30 | import tornado.web
31 | 
32 | import zmq
33 | from zmq.eventloop.ioloop import IOLoop
34 | from zmq.eventloop.zmqstream import ZMQStream
35 | 
36 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER
37 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT
38 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK
39 | from spyder.core.messages import DataMessage, MgmtMessage
40 | from spyder.core.mgmt import ZmqMgmt
41 | from spyder.core.worker import AsyncZmqWorker
42 | from spyder.core.settings import Settings
43 | from spyder.processor.fetcher import FetchProcessor
44 | from spyder.encoding import extract_content_type_encoding
45 | from spyder.thrift.gen.ttypes import CrawlUri
46 | 
47 | 
48 | class ZmqTornadoIntegrationTest(unittest.TestCase):
49 | 
50 |     def setUp(self):
51 | 
52 |         # create the io_loop
53 |         self._io_loop = IOLoop.instance()
54 | 
55 |         # and the context
56 |         self._ctx = zmq.Context(1)
57 | 
58 |         # setup the mgmt sockets
59 |         self._setup_mgmt_sockets()
60 | 
61 |         # setup the data sockets
62 |         self._setup_data_sockets()
63 | 
64 |         # setup the management interface
65 |         self._mgmt = ZmqMgmt( self._mgmt_sockets['worker_sub'],
66 |             self._mgmt_sockets['worker_pub'], io_loop=self._io_loop)
67 |         self._mgmt.start()
68 |         self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end)
69 | 
70 |     def tearDown(self):
71 |         # stop the mgmt
72 |         self._mgmt.stop()
73 | 
74 |         # close all sockets
75 |         for socket in self._mgmt_sockets.itervalues():
76 |             socket.close()
77 |         for socket in self._worker_sockets.itervalues():
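            # (added note) every socket has to be closed before ctx.term(),
            # since zmq.Context.term() blocks until all sockets created from
            # the context are closed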
78 |             socket.close()
79 | 
80 |         # terminate the context
81 |         self._ctx.term()
82 | 
83 |     def _setup_mgmt_sockets(self):
84 | 
85 |         self._mgmt_sockets = dict()
86 | 
87 |         # address for the communication from master to worker(s)
88 |         mgmt_master_worker = 'inproc://master/worker/coordination/'
89 | 
90 |         # connect the master with the worker
91 |         # the master is a ZMQStream because we are sending msgs from the test
92 |         sock = self._ctx.socket(zmq.PUB)
93 |         sock.bind(mgmt_master_worker)
94 |         self._mgmt_sockets['tmp1'] = sock
95 |         self._mgmt_sockets['master_pub'] = ZMQStream(sock, self._io_loop)
96 |         # the worker stream is created inside the ZmqMgmt class
97 |         self._mgmt_sockets['worker_sub'] = self._ctx.socket(zmq.SUB)
98 |         self._mgmt_sockets['worker_sub'].setsockopt(zmq.SUBSCRIBE, "")
99 |         self._mgmt_sockets['worker_sub'].connect(mgmt_master_worker)
100 | 
101 |         # address for the communication from worker(s) to master
102 |         mgmt_worker_master = 'inproc://worker/master/coordination/'
103 | 
104 |         # connect the worker with the master
105 |         self._mgmt_sockets['worker_pub'] = self._ctx.socket(zmq.PUB)
106 |         self._mgmt_sockets['worker_pub'].bind(mgmt_worker_master)
107 |         sock = self._ctx.socket(zmq.SUB)
108 |         sock.setsockopt(zmq.SUBSCRIBE, "")
109 |         sock.connect(mgmt_worker_master)
110 |         self._mgmt_sockets['tmp2'] = sock
111 |         self._mgmt_sockets['master_sub'] = ZMQStream(sock, self._io_loop)
112 | 
113 |     def _setup_data_sockets(self):
114 | 
115 |         self._worker_sockets = dict()
116 | 
117 |         # address for master -> worker communication
118 |         data_master_worker = 'inproc://master/worker/pipeline/'
119 | 
120 |         sock = self._ctx.socket(zmq.PUSH)
121 |         sock.bind(data_master_worker)
122 |         self._worker_sockets['tmp3'] = sock
123 |         self._worker_sockets['master_push'] = ZMQStream(sock, self._io_loop)
124 |         self._worker_sockets['worker_pull'] = self._ctx.socket(zmq.PULL)
125 |         self._worker_sockets['worker_pull'].connect(data_master_worker)
126 | 
127 |         # address for worker -> master communication
128 |         data_worker_master = 'inproc://worker/master/pipeline/'
129 | 
130 |         self._worker_sockets['worker_pub'] = self._ctx.socket(zmq.PUB)
131 |         self._worker_sockets['worker_pub'].bind(data_worker_master)
132 |         sock = self._ctx.socket(zmq.SUB)
133 |         sock.setsockopt(zmq.SUBSCRIBE, "")
134 |         sock.connect(data_worker_master)
135 |         self._worker_sockets['tmp4'] = sock
136 |         self._worker_sockets['master_sub'] = ZMQStream(sock, self._io_loop)
137 | 
138 |     def on_mgmt_end(self, _msg):
139 |         self._io_loop.stop()
140 | 
141 | 
142 | class SimpleFetcherTestCase(ZmqTornadoIntegrationTest):
143 | 
144 |     port = 8085
145 | 
146 |     def setUp(self):
147 |         ZmqTornadoIntegrationTest.setUp(self)
148 | 
149 |         path = os.path.join(os.path.dirname(__file__), "static")
150 |         application = tornado.web.Application([
151 |             (r"/(.*)", tornado.web.StaticFileHandler, {"path": path}),
152 |         ])
153 |         self._server = tornado.httpserver.HTTPServer(application, io_loop =
154 |             self._io_loop)
155 |         self._server.listen(self.port)
156 | 
157 |     def tearDown(self):
158 |         ZmqTornadoIntegrationTest.tearDown(self)
159 |         self._server.stop()
160 | 
161 |     def test_content_type_encoding(self):
162 |         rep_header = dict()
163 |         rep_header["Content-Type"] = "text/html; charset=ISO-8859-1"
164 |         (ct, encoding) = extract_content_type_encoding(rep_header["Content-Type"])
165 |         self.assertEqual("text/html", ct)
166 |         self.assertEqual("iso_8859_1", encoding)
167 | 
168 |     def test_fetching_works(self):
169 | 
170 |         settings = Settings()
171 |         fetcher = FetchProcessor(settings, io_loop=self._io_loop)
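        # (added note) the FetchProcessor is handed to the worker as its
        # processing callable: each DataMessage pulled from 'worker_pull' is
        # fetched and the result is published on 'worker_pub'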
172 | 173 | worker = AsyncZmqWorker( self._worker_sockets['worker_pull'], 174 | self._worker_sockets['worker_pub'], 175 | self._mgmt, 176 | fetcher, 177 | StreamHandler(sys.stdout), 178 | logging.DEBUG, 179 | self._io_loop) 180 | worker.start() 181 | 182 | curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port, 183 | effective_url="http://127.0.0.1:%s/robots.txt" % self.port, 184 | ) 185 | msg = DataMessage() 186 | msg.identity = "me" 187 | msg.curi = curi 188 | 189 | self._worker_sockets['master_push'].send_multipart(msg.serialize()) 190 | 191 | def assert_expected_result_and_stop(raw_msg): 192 | msg = DataMessage(raw_msg) 193 | robots = open(os.path.join(os.path.dirname(__file__), 194 | "static/robots.txt")).read() 195 | self.assertEqual(robots, msg.curi.content_body) 196 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 197 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 198 | self._mgmt_sockets['master_pub'].send_multipart(death.serialize()) 199 | 200 | self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop) 201 | 202 | self._io_loop.start() 203 | 204 | 205 | if __name__ == '__main__': 206 | unittest.main() 207 | -------------------------------------------------------------------------------- /test/test_fetch_processor_last_modified_works.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_fetch_processor_last_modified_works.py 17-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | #
18 | import logging
19 | from logging import StreamHandler
20 | import sys
21 | 
22 | import os
23 | import os.path
24 | import time
25 | from datetime import datetime
26 | import random
27 | 
28 | import unittest
29 | 
30 | import tornado
31 | import tornado.httpserver
32 | import tornado.web
33 | 
34 | import zmq
35 | from zmq.eventloop.ioloop import IOLoop
36 | from zmq.eventloop.zmqstream import ZMQStream
37 | 
38 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER
39 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT
40 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK
41 | from spyder.core.messages import DataMessage, MgmtMessage
42 | from spyder.time import serialize_date_time
43 | from spyder.core.mgmt import ZmqMgmt
44 | from spyder.core.worker import AsyncZmqWorker
45 | from spyder.core.settings import Settings
46 | from spyder.processor.fetcher import FetchProcessor
47 | from spyder.thrift.gen.ttypes import CrawlUri
48 | 
49 | 
50 | class ZmqTornadoIntegrationTest(unittest.TestCase):
51 | 
52 |     def setUp(self):
53 | 
54 |         # create the io_loop
55 |         self._io_loop = IOLoop.instance()
56 | 
57 |         # and the context
58 |         self._ctx = zmq.Context(1)
59 | 
60 |         # setup the mgmt sockets
61 |         self._setup_mgmt_sockets()
62 | 
63 |         # setup the data sockets
64 |         self._setup_data_sockets()
65 | 
66 |         # setup the management interface
67 |         self._mgmt = ZmqMgmt( self._mgmt_sockets['worker_sub'],
68 |             self._mgmt_sockets['worker_pub'], io_loop=self._io_loop)
69 |         self._mgmt.start()
70 |         self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end)
71 | 
72 |     def tearDown(self):
73 |         # stop the mgmt
74 |         self._mgmt.stop()
75 | 
76 |         # close all sockets
77 |         for socket in self._mgmt_sockets.itervalues():
78 |             socket.close()
79 |         for socket in self._worker_sockets.itervalues():
80 |             socket.close()
81 | 
82 |         # terminate the context
83 |         self._ctx.term()
84 | 
85 |     def _setup_mgmt_sockets(self):
86 | 
87 |         self._mgmt_sockets = dict()
88 | 
89 |         # address for the communication from master to worker(s)
90 |         mgmt_master_worker = 'inproc://master/worker/coordination/'
91 | 
92 |         # connect the master with the worker
93 |         # the master is a ZMQStream because we are sending msgs from the test
94 |         sock = self._ctx.socket(zmq.PUB)
95 |         sock.bind(mgmt_master_worker)
96 |         self._mgmt_sockets['tmp1'] = sock
97 |         self._mgmt_sockets['master_pub'] = ZMQStream(sock, self._io_loop)
98 |         # the worker stream is created inside the ZmqMgmt class
99 |         self._mgmt_sockets['worker_sub'] = self._ctx.socket(zmq.SUB)
100 |         self._mgmt_sockets['worker_sub'].setsockopt(zmq.SUBSCRIBE, "")
101 |         self._mgmt_sockets['worker_sub'].connect(mgmt_master_worker)
102 | 
103 |         # address for the communication from worker(s) to master
104 |         mgmt_worker_master = 'inproc://worker/master/coordination/'
105 | 
106 |         # connect the worker with the master
107 |         self._mgmt_sockets['worker_pub'] = self._ctx.socket(zmq.PUB)
108 |         self._mgmt_sockets['worker_pub'].bind(mgmt_worker_master)
109 |         sock = self._ctx.socket(zmq.SUB)
110 |         sock.setsockopt(zmq.SUBSCRIBE, "")
111 |         sock.connect(mgmt_worker_master)
112 |         self._mgmt_sockets['tmp2'] = sock
113 |         self._mgmt_sockets['master_sub'] = ZMQStream(sock, self._io_loop)
114 | 
115 |     def _setup_data_sockets(self):
116 | 
117 |         self._worker_sockets = dict()
118 | 
119 |         # address for master -> worker communication
120 |         data_master_worker = 'inproc://master/worker/pipeline/'
121 | 
122 |         sock = self._ctx.socket(zmq.PUSH)
123 |         sock.bind(data_master_worker)
124 |         self._worker_sockets['tmp3'] =
sock 125 | self._worker_sockets['master_push'] = ZMQStream(sock, self._io_loop) 126 | self._worker_sockets['worker_pull'] = self._ctx.socket(zmq.PULL) 127 | self._worker_sockets['worker_pull'].connect(data_master_worker) 128 | 129 | # address for worker -> master communication 130 | data_worker_master = 'inproc://worker/master/pipeline/' 131 | 132 | self._worker_sockets['worker_pub'] = self._ctx.socket(zmq.PUB) 133 | self._worker_sockets['worker_pub'].bind(data_worker_master) 134 | sock = self._ctx.socket(zmq.SUB) 135 | sock.setsockopt(zmq.SUBSCRIBE, "") 136 | sock.connect(data_worker_master) 137 | self._worker_sockets['tmp4'] = sock 138 | self._worker_sockets['master_sub'] = ZMQStream(sock, self._io_loop) 139 | 140 | def on_mgmt_end(self, _msg): 141 | self._io_loop.stop() 142 | 143 | 144 | class SimpleFetcherTestCase(ZmqTornadoIntegrationTest): 145 | 146 | port = 8085 147 | 148 | def setUp(self): 149 | ZmqTornadoIntegrationTest.setUp(self) 150 | 151 | self._path = os.path.join(os.path.dirname(__file__), "static") 152 | application = tornado.web.Application([ 153 | (r"/(.*)", tornado.web.StaticFileHandler, {"path": self._path}), 154 | ]) 155 | self._server = tornado.httpserver.HTTPServer(application, io_loop = 156 | self._io_loop) 157 | self._server.listen(self.port) 158 | 159 | def tearDown(self): 160 | ZmqTornadoIntegrationTest.tearDown(self) 161 | self._server.stop() 162 | 163 | def test_fetching_last_modified_works(self): 164 | 165 | settings = Settings() 166 | fetcher = FetchProcessor(settings, io_loop=self._io_loop) 167 | 168 | worker = AsyncZmqWorker( self._worker_sockets['worker_pull'], 169 | self._worker_sockets['worker_pub'], 170 | self._mgmt, 171 | fetcher, 172 | StreamHandler(sys.stdout), 173 | logging.DEBUG, 174 | self._io_loop) 175 | worker.start() 176 | 177 | mtimestamp = datetime.fromtimestamp(os.stat(os.path.join(self._path, 178 | "robots.txt")).st_mtime) 179 | mtime = serialize_date_time(mtimestamp) 180 | curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port, 181 | effective_url="http://127.0.0.1:%s/robots.txt" % self.port, 182 | req_header = { "Last-Modified" : 183 | mtime } 184 | ) 185 | 186 | msg = DataMessage() 187 | msg.identity = "me" 188 | msg.curi = curi 189 | 190 | def assert_expected_result_and_stop(raw_msg): 191 | msg = DataMessage(raw_msg) 192 | self.assertEqual(304, msg.curi.status_code) 193 | self.assertEqual("", msg.curi.content_body) 194 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 195 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 196 | self._mgmt_sockets['master_pub'].send_multipart(death.serialize()) 197 | 198 | self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop) 199 | 200 | self._worker_sockets['master_push'].send_multipart(msg.serialize()) 201 | 202 | self._io_loop.start() 203 | 204 | 205 | if __name__ == '__main__': 206 | unittest.main() 207 | -------------------------------------------------------------------------------- /test/test_fetch_processor_with_etag.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_fetch_processor_with_etag.py 17-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | import logging
19 | from logging import StreamHandler
20 | import sys
21 | 
22 | import os.path
23 | import time
24 | import random
25 | 
26 | import unittest
27 | 
28 | import tornado
29 | import tornado.httpserver
30 | import tornado.web
31 | 
32 | import zmq
33 | from zmq.eventloop.ioloop import IOLoop
34 | from zmq.eventloop.zmqstream import ZMQStream
35 | 
36 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER
37 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT
38 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK
39 | from spyder.core.messages import DataMessage, MgmtMessage
40 | from spyder.core.mgmt import ZmqMgmt
41 | from spyder.core.worker import AsyncZmqWorker
42 | from spyder.core.settings import Settings
43 | from spyder.processor.fetcher import FetchProcessor
44 | from spyder.thrift.gen.ttypes import CrawlUri
45 | 
46 | 
47 | class ZmqTornadoIntegrationTest(unittest.TestCase):
48 | 
49 |     def setUp(self):
50 | 
51 |         # create the io_loop
52 |         self._io_loop = IOLoop.instance()
53 | 
54 |         # and the context
55 |         self._ctx = zmq.Context(1)
56 | 
57 |         # setup the mgmt sockets
58 |         self._setup_mgmt_sockets()
59 | 
60 |         # setup the data sockets
61 |         self._setup_data_sockets()
62 | 
63 |         # setup the management interface
64 |         self._mgmt = ZmqMgmt( self._mgmt_sockets['worker_sub'],
65 |             self._mgmt_sockets['worker_pub'], io_loop=self._io_loop)
66 |         self._mgmt.start()
67 |         self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end)
68 | 
69 |     def tearDown(self):
70 |         # stop the mgmt
71 |         self._mgmt.stop()
72 | 
73 |         # close all sockets
74 |         for socket in self._mgmt_sockets.itervalues():
75 |             socket.close()
76 |         for socket in self._worker_sockets.itervalues():
77 |             socket.close()
78 | 
79 |         # terminate the context
80 |         self._ctx.term()
81 | 
82 |     def _setup_mgmt_sockets(self):
83 | 
84 |         self._mgmt_sockets = dict()
85 | 
86 |         # address for the communication from master to worker(s)
87 |         mgmt_master_worker = 'inproc://master/worker/coordination/'
88 | 
89 |         # connect the master with the worker
90 |         # the master is a ZMQStream because we are sending msgs from the test
91 |         sock = self._ctx.socket(zmq.PUB)
92 |         sock.bind(mgmt_master_worker)
93 |         self._mgmt_sockets['tmp1'] = sock
94 |         self._mgmt_sockets['master_pub'] = ZMQStream(sock, self._io_loop)
95 |         # the worker stream is created inside the ZmqMgmt class
96 |         self._mgmt_sockets['worker_sub'] = self._ctx.socket(zmq.SUB)
97 |         self._mgmt_sockets['worker_sub'].setsockopt(zmq.SUBSCRIBE, "")
98 |         self._mgmt_sockets['worker_sub'].connect(mgmt_master_worker)
99 | 
100 |         # address for the communication from worker(s) to master
101 |         mgmt_worker_master = 'inproc://worker/master/coordination/'
102 | 
103 |         # connect the worker with the master
104 |         self._mgmt_sockets['worker_pub'] = self._ctx.socket(zmq.PUB)
105 |         self._mgmt_sockets['worker_pub'].bind(mgmt_worker_master)
106 |         sock = self._ctx.socket(zmq.SUB)
107 |         sock.setsockopt(zmq.SUBSCRIBE, "")
108 |         sock.connect(mgmt_worker_master)
109 |         self._mgmt_sockets['tmp2'] = sock
110 |         self._mgmt_sockets['master_sub'] =
ZMQStream(sock, self._io_loop) 111 | 112 | def _setup_data_sockets(self): 113 | 114 | self._worker_sockets = dict() 115 | 116 | # address for master -> worker communication 117 | data_master_worker = 'inproc://master/worker/pipeline/' 118 | 119 | sock = self._ctx.socket(zmq.PUSH) 120 | sock.bind(data_master_worker) 121 | self._worker_sockets['tmp3'] = sock 122 | self._worker_sockets['master_push'] = ZMQStream(sock, self._io_loop) 123 | self._worker_sockets['worker_pull'] = self._ctx.socket(zmq.PULL) 124 | self._worker_sockets['worker_pull'].connect(data_master_worker) 125 | 126 | # address for worker -> master communication 127 | data_worker_master = 'inproc://worker/master/pipeline/' 128 | 129 | self._worker_sockets['worker_pub'] = self._ctx.socket(zmq.PUB) 130 | self._worker_sockets['worker_pub'].bind(data_worker_master) 131 | sock = self._ctx.socket(zmq.SUB) 132 | sock.setsockopt(zmq.SUBSCRIBE, "") 133 | sock.connect(data_worker_master) 134 | self._worker_sockets['tmp4'] = sock 135 | self._worker_sockets['master_sub'] = ZMQStream(sock, self._io_loop) 136 | 137 | def on_mgmt_end(self, _msg): 138 | self._io_loop.stop() 139 | 140 | 141 | class SimpleFetcherTestCase(ZmqTornadoIntegrationTest): 142 | 143 | port = 8085 144 | 145 | def setUp(self): 146 | ZmqTornadoIntegrationTest.setUp(self) 147 | 148 | path = os.path.join(os.path.dirname(__file__), "static") 149 | application = tornado.web.Application([ 150 | (r"/(.*)", tornado.web.StaticFileHandler, {"path": path}), 151 | ]) 152 | self._server = tornado.httpserver.HTTPServer(application, io_loop = 153 | self._io_loop) 154 | self._server.listen(self.port) 155 | 156 | def tearDown(self): 157 | ZmqTornadoIntegrationTest.tearDown(self) 158 | self._server.stop() 159 | 160 | def test_fetching_etag_works(self): 161 | 162 | settings = Settings() 163 | fetcher = FetchProcessor(settings, io_loop=self._io_loop) 164 | 165 | worker = AsyncZmqWorker( self._worker_sockets['worker_pull'], 166 | self._worker_sockets['worker_pub'], 167 | self._mgmt, 168 | fetcher, 169 | StreamHandler(sys.stdout), 170 | logging.DEBUG, 171 | self._io_loop) 172 | worker.start() 173 | 174 | curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port, 175 | effective_url="http://127.0.0.1:%s/robots.txt" % self.port, 176 | req_header = { "Etag" : 177 | "\"3926227169c58185234888b60000c6eb1169577d\"" } 178 | ) 179 | 180 | msg = DataMessage() 181 | msg.identity = "me" 182 | msg.curi = curi 183 | 184 | def assert_expected_result_and_stop(raw_msg): 185 | msg = DataMessage(raw_msg) 186 | self.assertEqual(304, msg.curi.status_code) 187 | self.assertEqual("", msg.curi.content_body) 188 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 189 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 190 | self._mgmt_sockets['master_pub'].send_multipart(death.serialize()) 191 | 192 | self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop) 193 | 194 | self._worker_sockets['master_push'].send_multipart(msg.serialize()) 195 | 196 | self._io_loop.start() 197 | 198 | 199 | if __name__ == '__main__': 200 | unittest.main() 201 | -------------------------------------------------------------------------------- /test/test_http_extractor.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_http_extractor.py 17-Mar-2011 5 | # 6 | # Licensed to the Apache Software Foundation (ASF) under one 7 | # or more contributor license agreements. 
See the NOTICE file 8 | # distributed with this work for additional information 9 | # regarding copyright ownership. The ASF licenses this file 10 | # to you under the Apache License, Version 2.0 (the 11 | # "License"); you may not use this file except in compliance 12 | # with the License. You may obtain a copy of the License at 13 | # 14 | # http://www.apache.org/licenses/LICENSE-2.0 15 | # 16 | # Unless required by applicable law or agreed to in writing, 17 | # software distributed under the License is distributed on an 18 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 19 | # KIND, either express or implied. See the License for the 20 | # specific language governing permissions and limitations 21 | # under the License. 22 | # 23 | # 24 | import unittest 25 | 26 | from spyder.core.constants import CURI_EXTRACTED_URLS 27 | from spyder.core.settings import Settings 28 | from spyder.processor.httpextractor import HttpExtractor 29 | from spyder.thrift.gen.ttypes import CrawlUri 30 | 31 | 32 | class HttpExtractorTest(unittest.TestCase): 33 | 34 | def test_correct_extraction(self): 35 | 36 | s = Settings() 37 | 38 | curi = CrawlUri("http://localhost") 39 | curi.status_code = 302 40 | curi.rep_header = {"Location": "http://localhost/index.html"} 41 | curi.optional_vars = dict() 42 | 43 | xtor = HttpExtractor(s) 44 | curi = xtor(curi) 45 | 46 | self.assertTrue(CURI_EXTRACTED_URLS in curi.optional_vars) 47 | self.assertEquals("http://localhost/index.html", 48 | curi.optional_vars[CURI_EXTRACTED_URLS]) 49 | 50 | def test_only_on_redirect(self): 51 | 52 | s = Settings() 53 | 54 | curi = CrawlUri("http://localhost") 55 | curi.status_code = 200 56 | curi.rep_header = {"Location": "http://localhost/index.html"} 57 | curi.optional_vars = dict() 58 | 59 | xtor = HttpExtractor(s) 60 | curi = xtor(curi) 61 | 62 | self.assertFalse(CURI_EXTRACTED_URLS in curi.optional_vars) 63 | 64 | def test_relative_links(self): 65 | 66 | s = Settings() 67 | 68 | curi = CrawlUri("http://localhost") 69 | curi.status_code = 303 70 | curi.rep_header = {"Location": "/index.html"} 71 | curi.optional_vars = dict() 72 | 73 | xtor = HttpExtractor(s) 74 | curi = xtor(curi) 75 | 76 | self.assertTrue(CURI_EXTRACTED_URLS in curi.optional_vars) 77 | self.assertEquals("http://localhost/index.html", 78 | curi.optional_vars[CURI_EXTRACTED_URLS]) 79 | -------------------------------------------------------------------------------- /test/test_limiter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_limiter.py 18-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | import unittest 20 | 21 | from spyder.core.constants import CURI_EXTRACTION_FINISHED, CURI_OPTIONAL_TRUE 22 | from spyder.processor import limiter 23 | from spyder.thrift.gen.ttypes import CrawlUri 24 | 25 | 26 | class LimiterTestCase(unittest.TestCase): 27 | 28 | def test_do_not_process_robots_works(self): 29 | 30 | curi = CrawlUri() 31 | curi.effective_url = "http://127.0.0.1/robots.txt" 32 | curi.optional_vars = dict() 33 | 34 | l = limiter.DefaultLimiter(None) 35 | 36 | for i in range(2): 37 | l._do_not_process_robots(curi) 38 | self.assertEqual(CURI_OPTIONAL_TRUE, 39 | curi.optional_vars[CURI_EXTRACTION_FINISHED]) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /test/test_masterprocess.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_masterprocess.py 07-Feb-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | import logging 19 | import unittest 20 | 21 | import sys 22 | 23 | from spyder.core.settings import Settings 24 | from spyder import masterprocess 25 | 26 | 27 | class MasterProcessTest(unittest.TestCase): 28 | 29 | def test_create_frontier_works(self): 30 | 31 | handler = logging.StreamHandler(sys.stdout) 32 | s = Settings() 33 | s.FRONTIER_STATE_FILE = ":memory:" 34 | 35 | frontier = masterprocess.create_frontier(s, handler) 36 | 37 | self.assertTrue(frontier is not None) 38 | -------------------------------------------------------------------------------- /test/test_messages.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_messages.py 14-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | import unittest 20 | 21 | from spyder.core.messages import DataMessage, MgmtMessage 22 | from spyder.core.messages import serialize_crawl_uri, deserialize_crawl_uri 23 | from spyder.thrift.gen.ttypes import CrawlUri 24 | 25 | class TestMessages(unittest.TestCase): 26 | 27 | def test_that_serialization_works(self): 28 | 29 | curi = CrawlUri(url="http://localhost") 30 | 31 | serialized = serialize_crawl_uri(curi) 32 | deserialized = deserialize_crawl_uri(serialized) 33 | 34 | self.assertEqual(curi, deserialized) 35 | 36 | def test_that_data_messages_work(self): 37 | identity = "me myself and i" 38 | curi = CrawlUri(url="http://localhost") 39 | serialized = serialize_crawl_uri(curi) 40 | 41 | msg = DataMessage([identity, serialized]) 42 | 43 | self.assertEqual(identity, msg.identity) 44 | self.assertEqual(curi, msg.curi) 45 | self.assertEqual([identity, serialized], msg.serialize()) 46 | self.assertEqual(msg, DataMessage(msg.serialize())) 47 | 48 | def test_that_mgmt_messages_work(self): 49 | topic = "me" 50 | identity = "myself" 51 | data = "and i" 52 | 53 | msg = MgmtMessage([topic, identity, data]) 54 | 55 | self.assertEqual(topic, msg.topic) 56 | self.assertEqual(identity, msg.identity) 57 | self.assertEqual(data, msg.data) 58 | self.assertEqual([topic, identity, data], msg.serialize()) 59 | self.assertEqual(msg, MgmtMessage(msg.serialize())) 60 | 61 | def test_that_construction_works(self): 62 | msg = DataMessage(identity="me") 63 | self.assertEqual("me", msg.identity) 64 | self.assertEqual(None, msg.curi) 65 | 66 | msg = DataMessage(curi="bla") 67 | self.assertEqual("bla", msg.curi) 68 | self.assertEqual(None, msg.identity) 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /test/test_mgmt.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_mgmt.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | import unittest 20 | 21 | import time 22 | 23 | import zmq 24 | from zmq.eventloop.ioloop import IOLoop 25 | from zmq.eventloop.zmqstream import ZMQStream 26 | 27 | from spyder.core.messages import MgmtMessage 28 | from spyder.core.mgmt import ZmqMgmt 29 | from spyder.core.constants import * 30 | 31 | 32 | class ManagementIntegrationTest(unittest.TestCase): 33 | 34 | 35 | def setUp(self): 36 | self._io_loop = IOLoop.instance() 37 | self._ctx = zmq.Context(1) 38 | 39 | sock = self._ctx.socket(zmq.PUB) 40 | sock.bind('inproc://master/worker/coordination') 41 | self._master_pub_sock = sock 42 | self._master_pub = ZMQStream(sock, self._io_loop) 43 | 44 | self._worker_sub = self._ctx.socket(zmq.SUB) 45 | self._worker_sub.setsockopt(zmq.SUBSCRIBE, "") 46 | self._worker_sub.connect('inproc://master/worker/coordination') 47 | 48 | self._worker_pub = self._ctx.socket(zmq.PUB) 49 | self._worker_pub.bind( 'inproc://worker/master/coordination' ) 50 | 51 | sock = self._ctx.socket(zmq.SUB) 52 | sock.setsockopt(zmq.SUBSCRIBE, "") 53 | sock.connect( 'inproc://worker/master/coordination' ) 54 | self._master_sub_sock = sock 55 | self._master_sub = ZMQStream(sock, self._io_loop) 56 | 57 | self._topic = ZMQ_SPYDER_MGMT_WORKER + 'testtopic' 58 | 59 | def tearDown(self): 60 | self._master_pub.close() 61 | self._master_pub_sock.close() 62 | self._worker_sub.close() 63 | self._worker_pub.close() 64 | self._master_sub.close() 65 | self._master_sub_sock.close() 66 | self._ctx.term() 67 | 68 | def call_me(self, msg): 69 | self.assertEqual(self._topic, msg.topic) 70 | self.assertEqual('test'.encode(), msg.data) 71 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 72 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 73 | self._master_pub.send_multipart(death.serialize()) 74 | 75 | def on_end(self, msg): 76 | self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT, msg.data) 77 | self._io_loop.stop() 78 | 79 | 80 | def test_simple_mgmt_session(self): 81 | 82 | mgmt = ZmqMgmt(self._worker_sub, self._worker_pub, io_loop=self._io_loop) 83 | mgmt.start() 84 | 85 | self.assertRaises(ValueError, mgmt.add_callback, "test", "test") 86 | 87 | mgmt.add_callback(self._topic, self.call_me) 88 | mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_end) 89 | 90 | test_msg = MgmtMessage(topic=self._topic, data='test'.encode()) 91 | self._master_pub.send_multipart(test_msg.serialize()) 92 | 93 | def assert_correct_mgmt_answer(raw_msg): 94 | msg = MgmtMessage(raw_msg) 95 | self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg.data) 96 | mgmt.remove_callback(self._topic, self.call_me) 97 | mgmt.remove_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_end) 98 | self.assertEqual({}, mgmt._callbacks) 99 | 100 | self._master_sub.on_recv(assert_correct_mgmt_answer) 101 | 102 | self._io_loop.start() 103 | 104 | 105 | if __name__ == '__main__': 106 | unittest.main() 107 | -------------------------------------------------------------------------------- /test/test_multiple_frontier.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_multiple_frontier.py 31-Mar-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | import logging
19 | from logging import StreamHandler
20 | 
21 | from datetime import datetime
22 | from datetime import timedelta
23 | import time
24 | import unittest
25 | import sys
26 | 
27 | from spyder.core.frontier import MultipleHostFrontier
28 | from spyder.core.settings import Settings
29 | from spyder.time import serialize_date_time, deserialize_date_time
30 | from spyder.thrift.gen.ttypes import CrawlUri
31 | 
32 | 
33 | class MultipleHostFrontierTest(unittest.TestCase):
34 | 
35 |     def test_that_adding_uris_works(self):
36 | 
37 |         s = Settings()
38 |         s.FRONTIER_STATE_FILE = ":memory:"
39 | 
40 |         frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))
41 | 
42 |         now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
43 |         next_crawl_date = now + timedelta(days=1)
44 |         curi = CrawlUri("http://localhost")
45 |         curi.rep_header = { "Etag" : "123", "Date" : serialize_date_time(now) }
46 |         curi.current_priority = 2
47 | 
48 |         frontier.add_uri(curi)
49 | 
50 |         cur = frontier._front_end_queues._cursor
51 | 
52 |         curi = CrawlUri("http://foreignhost")
53 |         curi.rep_header = { "Etag" : "123", "Date" : serialize_date_time(now) }
54 |         curi.current_priority = 1
55 | 
56 |         frontier.add_uri(curi)
57 | 
58 |         idents = {"localhost": -1, "foreignhost": -1}
59 |         cur.execute("SELECT * FROM queue_identifiers")
60 |         for row in cur:
61 |             self.assertTrue(row['identifier'] in idents.keys())
62 |             idents["http://%s" % row['identifier']] = row['queue']
63 | 
64 |         cur.execute("SELECT * FROM queues")
65 |         for row in cur:
66 |             self.assertEqual(idents[row['url']], row['queue'])
67 | 
68 |         self.assertEqual(2, frontier._front_end_queues.get_queue_count())
69 | 
70 |     def test_queues_work(self):
71 | 
72 |         s = Settings()
73 |         s.FRONTIER_STATE_FILE = ":memory:"
74 |         s.FRONTIER_ACTIVE_QUEUES = 1
75 |         s.FRONTIER_QUEUE_BUDGET = 4
76 |         s.FRONTIER_QUEUE_BUDGET_PUNISH = 5
77 | 
78 |         frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))
79 | 
80 |         now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
81 |         curi1 = CrawlUri("http://localhost")
82 |         curi1.current_priority = 2
83 |         curi1.req_time = 0.4
84 | 
85 |         frontier.add_uri(curi1)
86 | 
87 |         cur = frontier._front_end_queues._cursor
88 | 
89 |         curi2 = CrawlUri("http://foreignhost")
90 |         curi2.current_priority = 1
91 |         curi2.req_time = 1.4
92 | 
93 |         frontier.add_uri(curi2)
94 | 
95 |         self.assertEqual(0, len(frontier._current_queues))
96 |         frontier._maybe_add_queues()
97 | 
98 |         self.assertEqual(1, len(frontier._current_queues))
99 |         for q1 in frontier._current_queues.keys():
100 |             pass
101 | 
102 |         self.assertEquals(4, frontier._budget_politeness[q1])
103 |         frontier._cleanup_budget_politeness()
104 |         self.assertEquals(4, frontier._budget_politeness[q1])
105 | 
106 |         frontier._update_heap()
107 |         self.assertEqual(1, len(frontier._current_queues))
108 | 
109 |         if q1 == 1:
110 |             curi1.status_code = 500
111 |             frontier.process_server_error(curi1)
112 |         else:
113 |             curi2.status_code = 500
114 |             frontier.process_server_error(curi2)
115 | 
116 |         self.assertEquals(-1, frontier._budget_politeness[q1])
117 | 
118 | 
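        # (added note) the politeness budget is now below zero, so the next
        # cleanup run is expected to retire this queue and make room for the
        # other host's queue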
frontier._cleanup_budget_politeness() 119 | 120 | self.assertEqual(1, len(frontier._current_queues)) 121 | for q2 in frontier._current_queues.keys(): 122 | pass 123 | 124 | self.assertEqual(4, frontier._budget_politeness[q2]) 125 | frontier._cleanup_budget_politeness() 126 | self.assertEqual(4, frontier._budget_politeness[q2]) 127 | 128 | frontier._update_heap() 129 | self.assertEqual(1, len(frontier._current_queues)) 130 | 131 | if q2 == 1: 132 | curi1.status_code = 200 133 | frontier.process_successful_crawl(curi1) 134 | else: 135 | curi2.status_code = 200 136 | frontier.process_successful_crawl(curi2) 137 | 138 | self.assertEqual(3, frontier._budget_politeness[q2]) 139 | 140 | frontier._cleanup_budget_politeness() 141 | 142 | def test_with_multiple_active_queues(self): 143 | 144 | s = Settings() 145 | s.FRONTIER_STATE_FILE = ":memory:" 146 | s.FRONTIER_ACTIVE_QUEUES = 2 147 | s.FRONTIER_QUEUE_BUDGET = 4 148 | s.FRONTIER_QUEUE_BUDGET_PUNISH = 5 149 | 150 | frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout)) 151 | 152 | now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6]) 153 | curi1 = CrawlUri("http://localhost") 154 | curi1.current_priority = 2 155 | curi1.req_time = 0.4 156 | 157 | frontier.add_uri(curi1) 158 | 159 | cur = frontier._front_end_queues._cursor 160 | 161 | curi2 = CrawlUri("http://www.google.de") 162 | curi2.current_priority = 1 163 | curi2.req_time = 1.4 164 | 165 | frontier.add_uri(curi2) 166 | 167 | self.assertEqual(0, len(frontier._current_queues)) 168 | frontier._maybe_add_queues() 169 | 170 | self.assertEqual(2, len(frontier._current_queues)) 171 | 172 | next_url = frontier.get_next() 173 | 174 | 175 | if __name__ == '__main__': 176 | unittest.main() 177 | -------------------------------------------------------------------------------- /test/test_queue_assignment.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_queue_assignment.py 31-Mar-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
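# Both strategies below answer the same question (which politeness queue does
# a URI belong to?) but key the queues differently: the host-based variant
# uses the hostname as-is, while the IP-based variant resolves the host
# through the DnsCache first, so virtual hosts sharing an IP share a queue.
# A quick sketch, taken from the tests themselves:
#
#   assign = HostBasedQueueAssignment(DnsCache(Settings()))
#   assign.get_identifier("http://www.google.com/pille/palle")  # 'www.google.com'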
17 | # 18 | import unittest 19 | 20 | from spyder.core.settings import Settings 21 | from spyder.core.dnscache import DnsCache 22 | from spyder.core.queueassignment import HostBasedQueueAssignment 23 | from spyder.core.queueassignment import IpBasedQueueAssignment 24 | 25 | class HostBasedQueueAssignmentTest(unittest.TestCase): 26 | 27 | def test_host_based_assignment(self): 28 | 29 | s = Settings() 30 | dns = DnsCache(s) 31 | assign = HostBasedQueueAssignment(dns) 32 | 33 | url = "http://www.google.com/pille/palle" 34 | self.assertEqual("www.google.com", assign.get_identifier(url)) 35 | 36 | 37 | 38 | class IpBasedQueueAssignmentTest(unittest.TestCase): 39 | 40 | def test_ip_based_assignment(self): 41 | 42 | s = Settings() 43 | dns = DnsCache(s) 44 | assign = IpBasedQueueAssignment(dns) 45 | 46 | url = "http://localhost:12345/this" 47 | self.assertEqual("127.0.0.1", assign.get_identifier(url)) 48 | 49 | if __name__ == '__main__': 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /test/test_queue_selector.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_queue_selector.py 25-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import unittest 20 | 21 | from collections import defaultdict 22 | 23 | from spyder.core.queueselector import BiasedQueueSelector 24 | 25 | 26 | class BiasedQueueSelectorTest(unittest.TestCase): 27 | 28 | def test_histogram(self): 29 | 30 | # create a selector with 10 queues 31 | selector = BiasedQueueSelector(10) 32 | 33 | histogram = defaultdict(int) 34 | 35 | for i in xrange(100000): 36 | histogram[selector.get_queue()] += 1 37 | 38 | for i in range(1,9): 39 | self.assertTrue(histogram[i] > histogram[i+1]) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /test/test_regex_scoper.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_regex_scoper.py 24-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
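# The RegexScoper rewrites CURI_EXTRACTED_URLS in place: a URL is kept only
# if it matches one of the REGEX_SCOPE_POSITIVE patterns and none of the
# REGEX_SCOPE_NEGATIVE ones, which is exactly what the two assertions in the
# test below verify.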
17 | # 18 | 19 | import unittest 20 | 21 | from spyder.core.constants import CURI_EXTRACTED_URLS 22 | from spyder.core.settings import Settings 23 | from spyder.thrift.gen.ttypes import CrawlUri 24 | 25 | from spyder.processor.scoper import RegexScoper 26 | 27 | class RegexScoperTest(unittest.TestCase): 28 | 29 | def test_regex_scoper(self): 30 | 31 | curi = CrawlUri() 32 | curi.optional_vars = dict() 33 | curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join([ 34 | "http://www.google.de/index.html", 35 | "ftp://www.google.de/pillepalle.avi", 36 | ]) 37 | 38 | settings = Settings() 39 | settings.REGEX_SCOPE_POSITIVE = [r'^.*\.html'] 40 | settings.REGEX_SCOPE_NEGATIVE = [r'^.*\.avi'] 41 | scoper = RegexScoper(settings) 42 | 43 | curi = scoper(curi) 44 | 45 | 46 | self.assertTrue("http://www.google.de/index.html" in 47 | curi.optional_vars[CURI_EXTRACTED_URLS]) 48 | self.assertFalse("ftp://www.google.de/pillepalle.avi" in 49 | curi.optional_vars[CURI_EXTRACTED_URLS]) 50 | 51 | 52 | if __name__ == '__main__': 53 | unittest.main() 54 | -------------------------------------------------------------------------------- /test/test_settings.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_settings.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import unittest 20 | 21 | 22 | class SettingsTest(unittest.TestCase): 23 | 24 | def test_loading_default_settings_works(self): 25 | 26 | from spyder import defaultsettings 27 | from spyder.core.settings import Settings 28 | 29 | settings = Settings() 30 | self.assertEqual(defaultsettings.ZEROMQ_MGMT_MASTER, 31 | settings.ZEROMQ_MGMT_MASTER) 32 | 33 | 34 | def test_loading_custom_settings_works(self): 35 | 36 | from spyder import defaultsettings 37 | from spyder.core.settings import Settings 38 | 39 | import test_settings_settings 40 | settings = Settings(test_settings_settings) 41 | 42 | self.assertEqual(test_settings_settings.ZEROMQ_MGMT_WORKER, 43 | settings.ZEROMQ_MGMT_WORKER) 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /test/test_settings_settings.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_settings_settings.py 10-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ZEROMQ_MGMT_WORKER = "test" 20 | -------------------------------------------------------------------------------- /test/test_sqlite_queues.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_sqlite_queues.py 25-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import unittest 20 | 21 | import time 22 | 23 | from spyder.core.sqlitequeues import SQLiteSingleHostUriQueue, UriNotFound 24 | 25 | 26 | class SqliteQueuesTest(unittest.TestCase): 27 | 28 | def test_adding_works(self): 29 | 30 | uri = ("http://localhost", "etag", int(time.time()*1000), 31 | int(time.time() * 1000), 1) 32 | 33 | q = SQLiteSingleHostUriQueue(":memory:") 34 | q.add_uri(uri) 35 | 36 | self.assertEqual(1, len(q)) 37 | 38 | cursor = q._connection.execute("SELECT * FROM queue") 39 | uri_res = cursor.fetchone() 40 | (url, etag, mod_date, next_date, prio) = uri 41 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 42 | self.assertEqual(url, url_res) 43 | self.assertEqual(etag, etag_res) 44 | self.assertEqual(mod_date, mod_date_res) 45 | self.assertEqual(prio, prio_res) 46 | self.assertEqual(next_date, next_date_res) 47 | 48 | q.close() 49 | 50 | def test_updating_works(self): 51 | 52 | uri = ("http://localhost", "etag", int(time.time()*1000), 53 | int(time.time() * 1000), 1) 54 | 55 | q = SQLiteSingleHostUriQueue(":memory:") 56 | q.add_uri(uri) 57 | 58 | uri = ("http://localhost", "etag", int(time.time()*1000), 59 | int(time.time() * 1000), 2) 60 | 61 | q.update_uri(uri) 62 | 63 | cursor = q._connection.execute("SELECT * FROM queue") 64 | uri_res = cursor.fetchone() 65 | (url, etag, mod_date, next_date, prio) = uri 66 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 67 | self.assertEqual(url, url_res) 68 | self.assertEqual(etag, etag_res) 69 | self.assertEqual(mod_date, mod_date_res) 70 | self.assertEqual(prio, prio_res) 71 | self.assertEqual(next_date, next_date_res) 72 | 73 | def test_adding_lists_works(self): 74 | 75 | uris = [("http://localhost", "etag", int(time.time()*1000), 76 | int(time.time() * 1010), 1), 77 | ] 78 | 79 | q = SQLiteSingleHostUriQueue(":memory:") 80 | q.add_uris(uris) 81 | 82 | cursor = q._connection.execute("SELECT * FROM queue") 83 | uri_res = cursor.fetchone() 84 | (url, etag, mod_date, next_date, prio) = uris[0] 85 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 86 | self.assertEqual(url, url_res) 87 | self.assertEqual(etag, etag_res) 88 | self.assertEqual(mod_date, mod_date_res) 89 | self.assertEqual(prio, prio_res) 90 | self.assertEqual(next_date, next_date_res) 91 | 92 | def test_updating_lists_works(self): 93 | 94 | uris = [("http://localhost", "etag", int(time.time()*1000), 95 | int(time.time() * 1000), 1), 96 | ] 97 | 
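# add the URI once, then update it: update_uris() must overwrite the stored
# row in place (same URL, new priority) instead of appending a duplicate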
98 | q = SQLiteSingleHostUriQueue(":memory:") 99 | q.add_uris(uris) 100 | 101 | uris = [("http://localhost", "etag", int(time.time()*1000), 102 | int(time.time() * 1000), 2), 103 | ] 104 | 105 | q.update_uris(uris) 106 | 107 | cursor = q._connection.execute("SELECT * FROM queue") 108 | uri_res = cursor.fetchone() 109 | (url, etag, mod_date, next_date, prio) = uris[0] 110 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 111 | self.assertEqual(url, url_res) 112 | self.assertEqual(etag, etag_res) 113 | self.assertEqual(mod_date, mod_date_res) 114 | self.assertEqual(prio, prio_res) 115 | self.assertEqual(next_date, next_date_res) 116 | 117 | def test_removing_lists_works(self): 118 | 119 | uris = [("http://localhost", "etag", int(time.time()*1000), 120 | int(time.time() * 1000), 1), 121 | ("http://foreignhost", "ETAG", int(time.time()*1000), 122 | int(time.time() * 1000), 2), 123 | ] 124 | 125 | q = SQLiteSingleHostUriQueue(":memory:") 126 | q.add_uris(uris) 127 | 128 | q.remove_uris(uris) 129 | 130 | cursor = q._connection.execute("SELECT * FROM queue") 131 | self.assertTrue(cursor.fetchone() is None) 132 | 133 | def test_iterating_over_all_uris_works(self): 134 | 135 | uris = [("http://localhost", "etag", int(time.time()*1000), 136 | int(time.time() * 1000), 1), 137 | ("http://foreignhost", "ETAG", int(time.time()*1000), 138 | int(time.time() * 1000), 2), 139 | ] 140 | urls = ["http://localhost", "http://foreignhost"] 141 | 142 | q = SQLiteSingleHostUriQueue(":memory:") 143 | q.add_uris(uris) 144 | 145 | uri = q.get_uri("http://foreignhost") 146 | self.assertEqual(uris[1], uri) 147 | 148 | self.assertRaises(UriNotFound, q.get_uri, "http://gibtsnuesch") 149 | 150 | for url in q.all_uris(): 151 | self.assertTrue(url in urls) 152 | 153 | def test_queue_head_works(self): 154 | 155 | uris = [("http://localhost", "etag", int(time.time()*1000), 156 | int(time.time() * 1000), 1), 157 | ("http://foreignhost", "ETAG", int(time.time()*1000), 158 | int(time.time() * 1001), 2), 159 | ] 160 | 161 | q = SQLiteSingleHostUriQueue(":memory:") 162 | q.add_uris(uris) 163 | 164 | (url1, etag1, mod_date1, next_date1, prio1) = uris[0] 165 | (url2, etag2, mod_date2, next_date2, prio2) = uris[1] 166 | 167 | for uri_res in q.queue_head(n=1, offset=0): 168 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 169 | self.assertEqual(url1, url_res) 170 | self.assertEqual(etag1, etag_res) 171 | self.assertEqual(mod_date1, mod_date_res) 172 | self.assertEqual(prio1, prio_res) 173 | self.assertEqual(next_date1, next_date_res) 174 | 175 | for uri_res in q.queue_head(n=1, offset=1): 176 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 177 | self.assertEqual(url2, url_res) 178 | self.assertEqual(etag2, etag_res) 179 | self.assertEqual(mod_date2, mod_date_res) 180 | self.assertEqual(prio2, prio_res) 181 | self.assertEqual(next_date2, next_date_res) 182 | 183 | uris.append(("http://localhost/1", "eTag", int(time.time()*1000), 184 | int(time.time()*1002), 1)) 185 | (url3, etag3, mod_date3, next_date3, prio3) = uris[2] 186 | q.add_uri(uris[2]) 187 | 188 | q.ignore_uri("http://localhost", 404) 189 | 190 | for uri_res in q.queue_head(n=1, offset=1): 191 | (url_res, etag_res, mod_date_res, next_date_res, prio_res) = uri_res 192 | self.assertEqual(url3, url_res) 193 | self.assertEqual(etag3, etag_res) 194 | self.assertEqual(mod_date3, mod_date_res) 195 | self.assertEqual(prio3, prio_res) 196 | self.assertEqual(next_date3, next_date_res) 197 | 198 | 199 | if __name__ == 
'__main__': 200 | unittest.main() 201 | -------------------------------------------------------------------------------- /test/test_strip_session_ids.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_strip_session_ids.py 14-Apr-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # 19 | import unittest 20 | 21 | from spyder.core.constants import CURI_EXTRACTED_URLS 22 | from spyder.core.settings import Settings 23 | from spyder.processor.stripsessions import StripSessionIds 24 | from spyder.thrift.gen.ttypes import CrawlUri 25 | 26 | 27 | class StripSessionIdsTest(unittest.TestCase): 28 | 29 | def test_that_stripping_session_stuff_works(self): 30 | 31 | s = StripSessionIds(Settings()) 32 | 33 | url = "http://pREis.de/traeger/index.php?sid=8429fb3ae210a2a0e28800b7f48d90f2" 34 | 35 | self.assertEqual("http://pREis.de/traeger/index.php?", 36 | s._remove_session_ids(url)) 37 | 38 | url = "http://preis.de/traeger/index.php?jsessionid=8429fb3ae210a2a0e28800b7f48d90f2" 39 | 40 | self.assertEqual("http://preis.de/traeger/index.php?", 41 | s._remove_session_ids(url)) 42 | 43 | url = "http://preis.de/traeger/index.php?phpsessid=8429fb3ae210a2a0e28800b7f48d90f2" 44 | 45 | self.assertEqual("http://preis.de/traeger/index.php?", 46 | s._remove_session_ids(url)) 47 | 48 | url = "http://preis.de/traeger/index.php?aspsessionid=8429fb3ae210a2a0e28800b7f48d90f2" 49 | 50 | self.assertEqual("http://preis.de/traeger/index.php?", 51 | s._remove_session_ids(url)) 52 | 53 | def test_that_with_uri_works(self): 54 | 55 | s = StripSessionIds(Settings()) 56 | 57 | urls = ["http://preis.de/traeger/index.php?sid=8429fb3ae210a2a0e28800b7f48d90f2", 58 | "http://preis.de/traeger/index.php?jsessionid=8429fb3ae210a2a0e28800b7f48d90f2", 59 | "http://preis.de/traeger/index.php?phpsessid=8429fb3ae210a2a0e28800b7f48d90f2", 60 | "http://preis.de/traeger/index.php?aspsessionid=8429fb3ae210a2a0e28800b7f48d90f2", 61 | ] 62 | 63 | curi = CrawlUri() 64 | curi.optional_vars = { CURI_EXTRACTED_URLS: "\n".join(urls) } 65 | 66 | curi = s(curi) 67 | clean_urls = curi.optional_vars[CURI_EXTRACTED_URLS].split('\n') 68 | 69 | for u in clean_urls: 70 | self.assertEqual("http://preis.de/traeger/index.php?", u) 71 | -------------------------------------------------------------------------------- /test/test_uri_unique_filter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_uri_unique_filter.py 31-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import unittest 20 | 21 | from spyder.core.uri_uniq import UniqueUriFilter 22 | 23 | class UniqueUriFilterTest(unittest.TestCase): 24 | 25 | def test_unknown_uris(self): 26 | 27 | unique_filter = UniqueUriFilter('sha1') 28 | 29 | self.assertFalse(unique_filter.is_known("http://www.google.de", 30 | add_if_unknown=True)) 31 | self.assertFalse(unique_filter.is_known("http://www.yahoo.com", 32 | add_if_unknown=True)) 33 | self.assertTrue(unique_filter.is_known("http://www.google.de")) 34 | self.assertTrue(unique_filter.is_known("http://www.yahoo.com")) 35 | 36 | 37 | if __name__ == '__main__': 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /test/test_worker.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_worker.py 11-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
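# The base class below builds a complete inproc master/worker harness: the
# test case plays the master (PUB for mgmt, PUSH for data) while the worker
# under test owns the matching SUB/PULL ends. With every transport inproc,
# the whole round trip runs inside a single process on one io_loop.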
17 | # 18 | 19 | import logging 20 | from logging import StreamHandler 21 | import sys 22 | 23 | import unittest 24 | 25 | import time 26 | 27 | import zmq 28 | from zmq import Socket 29 | from zmq.eventloop.ioloop import IOLoop 30 | from zmq.eventloop.zmqstream import ZMQStream 31 | 32 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER 33 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT 34 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK 35 | from spyder.core.mgmt import ZmqMgmt 36 | from spyder.core.worker import ZmqWorker, AsyncZmqWorker 37 | from spyder.core.messages import DataMessage, MgmtMessage 38 | from spyder.thrift.gen.ttypes import CrawlUri 39 | 40 | 41 | class ZmqTornadoIntegrationTest(unittest.TestCase): 42 | 43 | def setUp(self): 44 | 45 | # create the io_loop 46 | self._io_loop = IOLoop.instance() 47 | 48 | # and the context 49 | self._ctx = zmq.Context(1) 50 | 51 | # setup the mgmt sockets 52 | self._setup_mgmt_sockets() 53 | 54 | # setup the data sockets 55 | self._setup_data_sockets() 56 | 57 | # setup the management interface 58 | self._mgmt = ZmqMgmt( self._mgmt_sockets['worker_sub'], 59 | self._mgmt_sockets['worker_pub'], io_loop=self._io_loop) 60 | self._mgmt.start() 61 | self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end) 62 | 63 | def tearDown(self): 64 | # stop the mgmt 65 | self._mgmt.stop() 66 | 67 | # close all sockets 68 | for socket in self._mgmt_sockets.itervalues(): 69 | socket.close() 70 | for socket in self._worker_sockets.itervalues(): 71 | socket.close() 72 | 73 | # terminate the context 74 | self._ctx.term() 75 | 76 | def _setup_mgmt_sockets(self): 77 | 78 | self._mgmt_sockets = dict() 79 | 80 | # adress for the communication from master to worker(s) 81 | mgmt_master_worker = 'inproc://master/worker/coordination/' 82 | 83 | # connect the master with the worker 84 | # the master is a ZMQStream because we are sending msgs from the test 85 | sock = self._ctx.socket(zmq.PUB) 86 | sock.bind(mgmt_master_worker) 87 | self._mgmt_sockets['tmp1'] = sock 88 | self._mgmt_sockets['master_pub'] = ZMQStream(sock, self._io_loop) 89 | # the worker stream is created inside the ZmqMgmt class 90 | self._mgmt_sockets['worker_sub'] = self._ctx.socket(zmq.SUB) 91 | self._mgmt_sockets['worker_sub'].setsockopt(zmq.SUBSCRIBE, "") 92 | self._mgmt_sockets['worker_sub'].connect(mgmt_master_worker) 93 | 94 | # adress for the communication from worker(s) to master 95 | mgmt_worker_master = 'inproc://worker/master/coordination/' 96 | 97 | # connect the worker with the master 98 | self._mgmt_sockets['worker_pub'] = self._ctx.socket(zmq.PUB) 99 | self._mgmt_sockets['worker_pub'].bind(mgmt_worker_master) 100 | sock = self._ctx.socket(zmq.SUB) 101 | sock.setsockopt(zmq.SUBSCRIBE, "") 102 | sock.connect(mgmt_worker_master) 103 | self._mgmt_sockets['tmp2'] = sock 104 | self._mgmt_sockets['master_sub'] = ZMQStream(sock, self._io_loop) 105 | 106 | def _setup_data_sockets(self): 107 | 108 | self._worker_sockets = dict() 109 | 110 | # address for master -> worker communication 111 | data_master_worker = 'inproc://master/worker/pipeline/' 112 | 113 | sock = self._ctx.socket(zmq.PUSH) 114 | sock.bind(data_master_worker) 115 | self._worker_sockets['tmp3'] = sock 116 | self._worker_sockets['master_push'] = ZMQStream(sock, self._io_loop) 117 | self._worker_sockets['worker_pull'] = self._ctx.socket(zmq.PULL) 118 | self._worker_sockets['worker_pull'].connect(data_master_worker) 119 | 120 | # address for worker -> master communication 121 | 
data_worker_master = 'inproc://worker/master/pipeline/' 122 | 123 | self._worker_sockets['worker_pub'] = self._ctx.socket(zmq.PUB) 124 | self._worker_sockets['worker_pub'].bind(data_worker_master) 125 | sock = self._ctx.socket(zmq.SUB) 126 | sock.setsockopt(zmq.SUBSCRIBE, "") 127 | sock.connect(data_worker_master) 128 | self._worker_sockets['tmp4'] = sock 129 | self._worker_sockets['master_sub'] = ZMQStream(sock, self._io_loop) 130 | 131 | def on_mgmt_end(self, _msg): 132 | self._io_loop.stop() 133 | 134 | 135 | class ZmqWorkerIntegrationTest(ZmqTornadoIntegrationTest): 136 | 137 | def echo_processing(self, crawl_uri): 138 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 139 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 140 | self._mgmt_sockets['master_pub'].send_multipart(death.serialize()) 141 | return crawl_uri 142 | 143 | def test_that_stopping_worker_via_mgmt_works(self): 144 | 145 | worker = ZmqWorker(self._worker_sockets['worker_pull'], 146 | self._worker_sockets['worker_pub'], 147 | self._mgmt, 148 | self.echo_processing, 149 | StreamHandler(sys.stdout), 150 | logging.DEBUG, 151 | self._io_loop) 152 | 153 | worker.start() 154 | 155 | curi = CrawlUri(url="http://localhost") 156 | msg = DataMessage() 157 | msg.identity = "me" 158 | msg.curi = curi 159 | 160 | def assert_correct_data_answer(raw_msg): 161 | self.assertEqual(msg, DataMessage(raw_msg)) 162 | 163 | self._worker_sockets['master_sub'].on_recv(assert_correct_data_answer) 164 | 165 | def assert_correct_mgmt_answer(raw_msg): 166 | self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, MgmtMessage(raw_msg).data) 167 | 168 | self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_answer) 169 | 170 | self._worker_sockets['master_push'].send_multipart(msg.serialize()) 171 | 172 | self._io_loop.start() 173 | 174 | 175 | if __name__ == '__main__': 176 | unittest.main() 177 | -------------------------------------------------------------------------------- /test/test_workerprocess_extractor.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_workerprocess_extractor.py 19-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
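# Same harness idea as in test_worker.py, except the endpoints come from the
# Settings object: setUp() remaps the ZEROMQ_* addresses to inproc so that
# workerprocess.create_worker_extractor() talks to sockets owned by the test
# instead of real network endpoints.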
17 | # 18 | import sys 19 | import logging 20 | from logging import StreamHandler 21 | 22 | import unittest 23 | 24 | import zmq 25 | from zmq.eventloop.ioloop import IOLoop 26 | from zmq.eventloop.zmqstream import ZMQStream 27 | 28 | from spyder.core.constants import CURI_OPTIONAL_TRUE 29 | from spyder.core.constants import CURI_EXTRACTION_FINISHED 30 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER 31 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT 32 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK 33 | from spyder.core.messages import DataMessage, MgmtMessage 34 | from spyder.core.mgmt import ZmqMgmt 35 | from spyder.core.settings import Settings 36 | from spyder.processor import limiter 37 | from spyder.thrift.gen.ttypes import CrawlUri 38 | from spyder import workerprocess 39 | 40 | 41 | class ZmqTornadoIntegrationTest(unittest.TestCase): 42 | 43 | def setUp(self): 44 | 45 | # create the io_loop 46 | self._io_loop = IOLoop.instance() 47 | 48 | # and the context 49 | self._ctx = zmq.Context(1) 50 | 51 | self._settings = Settings() 52 | self._settings.ZEROMQ_MASTER_PUSH = 'inproc://spyder-zmq-master-push' 53 | self._settings.ZEROMQ_WORKER_PROC_FETCHER_PULL = \ 54 | self._settings.ZEROMQ_MASTER_PUSH 55 | self._settings.ZEROMQ_MASTER_SUB = 'inproc://spyder-zmq-master-sub' 56 | self._settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PUB = \ 57 | self._settings.ZEROMQ_MASTER_SUB 58 | 59 | self._settings.ZEROMQ_MGMT_MASTER = 'inproc://spyder-zmq-mgmt-master' 60 | self._settings.ZEROMQ_MGMT_WORKER = 'inproc://spyder-zmq-mgmt-worker' 61 | 62 | # setup the mgmt sockets 63 | self._setup_mgmt_sockets() 64 | 65 | # setup the data sockets 66 | self._setup_data_servers() 67 | 68 | # setup the management interface 69 | self._mgmt = ZmqMgmt( self._mgmt_sockets['worker_sub'], 70 | self._mgmt_sockets['worker_pub'], io_loop=self._io_loop) 71 | self._mgmt.start() 72 | self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end) 73 | 74 | def tearDown(self): 75 | # stop the mgmt 76 | self._mgmt.stop() 77 | 78 | # close all sockets 79 | for socket in self._mgmt_sockets.itervalues(): 80 | socket.close() 81 | for socket in self._worker_sockets.itervalues(): 82 | socket.close() 83 | 84 | # terminate the context 85 | self._ctx.term() 86 | 87 | def _setup_mgmt_sockets(self): 88 | 89 | self._mgmt_sockets = dict() 90 | 91 | # adress for the communication from master to worker(s) 92 | mgmt_master_worker = self._settings.ZEROMQ_MGMT_MASTER 93 | 94 | # connect the master with the worker 95 | # the master is a ZMQStream because we are sending msgs from the test 96 | sock = self._ctx.socket(zmq.PUB) 97 | sock.bind(mgmt_master_worker) 98 | self._mgmt_sockets['tmp1'] = sock 99 | self._mgmt_sockets['master_pub'] = ZMQStream(sock, self._io_loop) 100 | # the worker stream is created inside the ZmqMgmt class 101 | self._mgmt_sockets['worker_sub'] = self._ctx.socket(zmq.SUB) 102 | self._mgmt_sockets['worker_sub'].setsockopt(zmq.SUBSCRIBE, "") 103 | self._mgmt_sockets['worker_sub'].connect(mgmt_master_worker) 104 | 105 | # adress for the communication from worker(s) to master 106 | mgmt_worker_master = self._settings.ZEROMQ_MGMT_WORKER 107 | 108 | # connect the worker with the master 109 | self._mgmt_sockets['worker_pub'] = self._ctx.socket(zmq.PUB) 110 | self._mgmt_sockets['worker_pub'].bind(mgmt_worker_master) 111 | sock = self._ctx.socket(zmq.SUB) 112 | sock.setsockopt(zmq.SUBSCRIBE, "") 113 | sock.connect(mgmt_worker_master) 114 | self._mgmt_sockets['tmp2'] = sock 115 | 
self._mgmt_sockets['master_sub'] = ZMQStream(sock, self._io_loop) 116 | 117 | def _setup_data_servers(self): 118 | 119 | self._worker_sockets = dict() 120 | 121 | # address for master -> worker communication 122 | data_master_worker = self._settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PULL 123 | 124 | sock = self._ctx.socket(zmq.PUSH) 125 | sock.bind(data_master_worker) 126 | self._worker_sockets['tmp3'] = sock 127 | self._worker_sockets['master_push'] = ZMQStream(sock, self._io_loop) 128 | 129 | # address for worker -> master communication 130 | data_worker_master = self._settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PUB 131 | 132 | sock = self._ctx.socket(zmq.SUB) 133 | sock.setsockopt(zmq.SUBSCRIBE, "") 134 | sock.bind(data_worker_master) 135 | self._worker_sockets['tmp4'] = sock 136 | self._worker_sockets['master_sub'] = ZMQStream(sock, self._io_loop) 137 | 138 | def on_mgmt_end(self, _msg): 139 | self._io_loop.stop() 140 | 141 | 142 | class WorkerExtractorTestCase(ZmqTornadoIntegrationTest): 143 | 144 | def test_that_creating_extractor_works(self): 145 | 146 | self._settings.SPYDER_EXTRACTOR_PIPELINE = ['spyder.processor.limiter.DefaultLimiter',] 147 | 148 | extractor = workerprocess.create_worker_extractor(self._settings, 149 | self._mgmt, self._ctx, StreamHandler(sys.stdout), self._io_loop) 150 | extractor.start() 151 | 152 | curi = CrawlUri(url="http://localhost:80/robots.txt", 153 | effective_url="http://127.0.0.1:%s/robots.txt", 154 | optional_vars=dict(), 155 | ) 156 | msg = DataMessage() 157 | msg.identity = "me" 158 | msg.curi = curi 159 | 160 | def assert_expected_result_and_stop(raw_msg): 161 | msg2 = DataMessage(raw_msg) 162 | self.assertEqual(CURI_OPTIONAL_TRUE, 163 | msg2.curi.optional_vars[CURI_EXTRACTION_FINISHED]) 164 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 165 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 166 | self._mgmt_sockets['master_pub'].send_multipart(death.serialize()) 167 | 168 | self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop) 169 | 170 | def assert_correct_mgmt_message(raw_msg): 171 | self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, MgmtMessage(raw_msg).data) 172 | 173 | self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_message) 174 | 175 | self._worker_sockets['master_push'].send_multipart(msg.serialize()) 176 | 177 | self._io_loop.start() 178 | 179 | extractor._out_stream.close() 180 | extractor._outsocket.close() 181 | extractor._in_stream.close() 182 | extractor._insocket.close() 183 | 184 | 185 | if __name__ == '__main__': 186 | unittest.main() 187 | -------------------------------------------------------------------------------- /test/test_workerprocess_fetcher.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_workerprocess_fetcher.py 19-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
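# A pure wiring test: create_worker_fetcher() must return an AsyncZmqWorker
# whose processing callable is a FetchProcessor. Nothing is actually fetched,
# so the sockets are closed right after the assertions.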
17 | # 18 | import logging 19 | from logging import StreamHandler 20 | import sys 21 | 22 | import unittest 23 | import time 24 | 25 | import zmq 26 | from zmq.eventloop.ioloop import IOLoop 27 | from zmq.eventloop.zmqstream import ZMQStream 28 | 29 | from spyder.core.constants import CURI_OPTIONAL_TRUE 30 | from spyder.core.constants import CURI_EXTRACTION_FINISHED 31 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER 32 | from spyder.core.settings import Settings 33 | from spyder.core.worker import AsyncZmqWorker 34 | from spyder import workerprocess 35 | 36 | from spyder.processor.fetcher import FetchProcessor 37 | 38 | class WorkerFetcherTestCase(unittest.TestCase): 39 | 40 | def test_that_creating_fetcher_works(self): 41 | ctx = zmq.Context() 42 | io_loop = IOLoop.instance() 43 | 44 | def stop_looping(_msg): 45 | io_loop.stop() 46 | 47 | settings = Settings() 48 | 49 | master_push = ctx.socket(zmq.PUSH) 50 | master_push.bind(settings.ZEROMQ_MASTER_PUSH) 51 | 52 | fetcher = workerprocess.create_worker_fetcher(settings, {}, ctx, 53 | StreamHandler(sys.stdout), io_loop) 54 | 55 | self.assertTrue(isinstance(fetcher._processing, FetchProcessor)) 56 | self.assertTrue(isinstance(fetcher, AsyncZmqWorker)) 57 | 58 | fetcher._insocket.close() 59 | fetcher._outsocket.close() 60 | master_push.close() 61 | ctx.term() 62 | 63 | 64 | if __name__ == '__main__': 65 | unittest.main() 66 | -------------------------------------------------------------------------------- /test/test_workerprocess_mgmtintegration.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_workerprocess_mgmtintegration.py 18-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
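# End-to-end mgmt handshake: the test publishes QUIT on the master's PUB
# socket; the ZmqMgmt built by create_worker_management() must answer with
# QUIT_ACK on its own PUB socket and fire the registered callback that stops
# the io_loop.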
17 | # 18 | 19 | import unittest 20 | import time 21 | 22 | import zmq 23 | from zmq.eventloop.ioloop import IOLoop 24 | from zmq.eventloop.zmqstream import ZMQStream 25 | 26 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER 27 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT 28 | from spyder.core.constants import ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK 29 | from spyder.core.messages import MgmtMessage 30 | from spyder.core.settings import Settings 31 | from spyder.processor import limiter 32 | from spyder import workerprocess 33 | 34 | 35 | class WorkerProcessTestCase(unittest.TestCase): 36 | 37 | def test_that_creating_mgmt_works(self): 38 | 39 | ctx = zmq.Context() 40 | io_loop = IOLoop.instance() 41 | 42 | def stop_looping(_msg): 43 | io_loop.stop() 44 | 45 | settings = Settings() 46 | settings.ZEROMQ_MASTER_PUSH = 'inproc://spyder-zmq-master-push' 47 | settings.ZEROMQ_WORKER_PROC_FETCHER_PULL = \ 48 | settings.ZEROMQ_MASTER_PUSH 49 | settings.ZEROMQ_MASTER_SUB = 'inproc://spyder-zmq-master-sub' 50 | settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PUB = \ 51 | settings.ZEROMQ_MASTER_SUB 52 | 53 | settings.ZEROMQ_MGMT_MASTER = 'inproc://spyder-zmq-mgmt-master' 54 | settings.ZEROMQ_MGMT_WORKER = 'inproc://spyder-zmq-mgmt-worker' 55 | 56 | pubsocket = ctx.socket(zmq.PUB) 57 | pubsocket.bind(settings.ZEROMQ_MGMT_MASTER) 58 | pub_stream = ZMQStream(pubsocket, io_loop) 59 | 60 | subsocket = ctx.socket(zmq.SUB) 61 | subsocket.setsockopt(zmq.SUBSCRIBE, "") 62 | subsocket.bind(settings.ZEROMQ_MGMT_WORKER) 63 | sub_stream = ZMQStream(subsocket, io_loop) 64 | 65 | mgmt = workerprocess.create_worker_management(settings, ctx, io_loop) 66 | mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, stop_looping) 67 | mgmt.start() 68 | 69 | def assert_quit_message(msg): 70 | self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg.data) 71 | 72 | sub_stream.on_recv(assert_quit_message) 73 | 74 | death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER, 75 | data=ZMQ_SPYDER_MGMT_WORKER_QUIT) 76 | pub_stream.send_multipart(death.serialize()) 77 | 78 | io_loop.start() 79 | 80 | mgmt._out_stream.close() 81 | mgmt._in_stream.close() 82 | mgmt._publisher.close() 83 | mgmt._subscriber.close() 84 | pub_stream.close() 85 | pubsocket.close() 86 | sub_stream.close() 87 | subsocket.close() 88 | ctx.term() 89 | 90 | 91 | if __name__ == '__main__': 92 | unittest.main() 93 | -------------------------------------------------------------------------------- /test/test_workerprocess_processing.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_workerprocess_processing.py 18-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
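# create_processing_function() composes the extractor and scoper pipelines
# into a single callable; modules that cannot be imported or that lack the
# expected plugin interface make it raise ValueError. The contract, as a
# rough sketch (names as used in the test below):
#
#   processing = workerprocess.create_processing_function(settings, processors)
#   curi = processing(curi)  # every processor sees and returns the CrawlUri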
17 | # 18 | 19 | import unittest 20 | 21 | from spyder.core.constants import CURI_OPTIONAL_TRUE 22 | from spyder.core.constants import CURI_EXTRACTION_FINISHED 23 | from spyder.core.settings import Settings 24 | from spyder.processor import limiter 25 | from spyder.thrift.gen.ttypes import CrawlUri 26 | from spyder import workerprocess 27 | 28 | 29 | class WorkerProcessingUnittest(unittest.TestCase): 30 | 31 | def test_that_creating_processing_function_works(self): 32 | settings = Settings() 33 | processors = settings.SPYDER_EXTRACTOR_PIPELINE 34 | processors.extend(settings.SPYDER_SCOPER_PIPELINE) 35 | processors.append('test_workerprocess') 36 | self.assertRaises(ValueError, workerprocess.create_processing_function, 37 | settings, processors) 38 | 39 | processors.pop() 40 | processors.append('test_workerprocess_unspec') 41 | self.assertRaises(ValueError, workerprocess.create_processing_function, 42 | settings, processors) 43 | 44 | processors.pop() 45 | processing = workerprocess.create_processing_function(settings, 46 | processors) 47 | 48 | curi = CrawlUri(optional_vars=dict()) 49 | curi.effective_url = "http://127.0.0.1/robots.txt" 50 | curi2 = processing(curi) 51 | 52 | self.assertEqual(CURI_OPTIONAL_TRUE, 53 | curi2.optional_vars[CURI_EXTRACTION_FINISHED]) 54 | -------------------------------------------------------------------------------- /test/test_workerprocess_unspec.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2011 Daniel Truemper truemped@googlemail.com 3 | # 4 | # test_workerprocess_unspec.py 26-Jan-2011 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | def a_plugin_with_no_create_processor_method(): 20 | pass 21 | -------------------------------------------------------------------------------- /versions.cfg: -------------------------------------------------------------------------------- 1 | [versions] 2 | zeromq = 2.1.9 3 | tornado = 1.2 4 | Brownie = 0.5.1 5 | collective.recipe.sphinxbuilder = 0.7.0 6 | coverage = 3.5.1 7 | pbp.recipe.noserunner = 0.2.6 8 | pep8 = 0.6.1 9 | pycurl = 7.19.0 10 | pyflakes = 0.5.0 11 | pytz = 2011j 12 | pyzmq = 2.1.9 13 | thrift = 0.7.0 14 | 15 | #Required by: 16 | #pbp.recipe.noserunner 0.2.6 17 | nose = 1.1.2 18 | 19 | #Required by: 20 | #collective.recipe.sphinxbuilder 0.7.0 21 | zc.buildout = 1.5.2 22 | 23 | #Required by: 24 | #collective.recipe.sphinxbuilder 0.7.0 25 | zc.recipe.egg = 1.3.2 26 | --------------------------------------------------------------------------------