├── .gitignore ├── .pylintrc ├── LICENSE ├── MANIFEST.in ├── README.rst ├── bootstrap.py ├── crawluri.thrift ├── docs-source ├── api │ ├── entrypoints.rst │ ├── extractor.rst │ ├── fetcher.rst │ ├── frontier.rst │ ├── masterprocess.rst │ ├── queues.rst │ ├── scoper.rst │ ├── sink.rst │ ├── spyderapi.rst │ └── workerprocess.rst ├── conf.py ├── crawler-design.rst ├── getting-started.rst ├── globals.rst ├── index.rst ├── libraries.rst ├── release-notes.rst └── roadmap.rst ├── local.cfg.template ├── setup.py ├── src └── spyder │ ├── __init__.py │ ├── core │ ├── __init__.py │ ├── constants.py │ ├── dnscache.py │ ├── frontier.py │ ├── log.py │ ├── master.py │ ├── messages.py │ ├── mgmt.py │ ├── prioritizer.py │ ├── queueassignment.py │ ├── queueselector.py │ ├── settings.py │ ├── sink.py │ ├── sqlitequeues.py │ ├── uri_uniq.py │ └── worker.py │ ├── defaultsettings.py │ ├── encoding.py │ ├── import_util.py │ ├── logsink.py │ ├── masterprocess.py │ ├── processor │ ├── __init__.py │ ├── cleanupquery.py │ ├── fetcher.py │ ├── htmllinkextractor.py │ ├── httpextractor.py │ ├── limiter.py │ ├── scoper.py │ └── stripsessions.py │ ├── spyder_template │ ├── log │ │ └── .keep │ ├── logging.conf │ ├── master.py │ ├── settings.py │ ├── sink.py │ └── spyder-ctrl.py │ ├── thrift │ ├── __init__.py │ └── gen │ │ ├── __init__.py │ │ ├── constants.py │ │ └── ttypes.py │ ├── time.py │ └── workerprocess.py ├── test ├── static │ └── robots.txt ├── test_async_worker.py ├── test_cleanup_qs.py ├── test_default_html_link_extractor.py ├── test_dns_cache.py ├── test_fetch_processor.py ├── test_fetch_processor_last_modified_works.py ├── test_fetch_processor_with_etag.py ├── test_frontier.py ├── test_http_extractor.py ├── test_limiter.py ├── test_masterprocess.py ├── test_messages.py ├── test_mgmt.py ├── test_multiple_frontier.py ├── test_queue_assignment.py ├── test_queue_selector.py ├── test_regex_scoper.py ├── test_settings.py ├── test_settings_settings.py ├── test_sqlite_multiple_queues.py ├── test_sqlite_queues.py ├── test_strip_session_ids.py ├── test_uri_unique_filter.py ├── test_worker.py ├── test_workerprocess_extractor.py ├── test_workerprocess_fetcher.py ├── test_workerprocess_mgmtintegration.py ├── test_workerprocess_processing.py └── test_workerprocess_unspec.py └── versions.cfg /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/.gitignore -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/.pylintrc -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/MANIFEST.in -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/README.rst -------------------------------------------------------------------------------- /bootstrap.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/bootstrap.py -------------------------------------------------------------------------------- /crawluri.thrift: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/crawluri.thrift -------------------------------------------------------------------------------- /docs-source/api/entrypoints.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/api/entrypoints.rst -------------------------------------------------------------------------------- /docs-source/api/extractor.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/api/extractor.rst -------------------------------------------------------------------------------- /docs-source/api/fetcher.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/api/fetcher.rst -------------------------------------------------------------------------------- /docs-source/api/frontier.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/api/frontier.rst -------------------------------------------------------------------------------- /docs-source/api/masterprocess.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/api/masterprocess.rst -------------------------------------------------------------------------------- /docs-source/api/queues.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/api/queues.rst -------------------------------------------------------------------------------- /docs-source/api/scoper.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/api/scoper.rst -------------------------------------------------------------------------------- /docs-source/api/sink.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/api/sink.rst -------------------------------------------------------------------------------- /docs-source/api/spyderapi.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/api/spyderapi.rst -------------------------------------------------------------------------------- /docs-source/api/workerprocess.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/api/workerprocess.rst -------------------------------------------------------------------------------- /docs-source/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/conf.py -------------------------------------------------------------------------------- /docs-source/crawler-design.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/crawler-design.rst -------------------------------------------------------------------------------- /docs-source/getting-started.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/getting-started.rst -------------------------------------------------------------------------------- /docs-source/globals.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/globals.rst -------------------------------------------------------------------------------- /docs-source/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/index.rst -------------------------------------------------------------------------------- /docs-source/libraries.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/libraries.rst -------------------------------------------------------------------------------- /docs-source/release-notes.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/release-notes.rst -------------------------------------------------------------------------------- /docs-source/roadmap.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/docs-source/roadmap.rst -------------------------------------------------------------------------------- /local.cfg.template: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/local.cfg.template -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/setup.py -------------------------------------------------------------------------------- /src/spyder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/__init__.py -------------------------------------------------------------------------------- /src/spyder/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/__init__.py -------------------------------------------------------------------------------- /src/spyder/core/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/constants.py -------------------------------------------------------------------------------- /src/spyder/core/dnscache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/dnscache.py -------------------------------------------------------------------------------- /src/spyder/core/frontier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/frontier.py -------------------------------------------------------------------------------- /src/spyder/core/log.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/log.py -------------------------------------------------------------------------------- /src/spyder/core/master.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/master.py -------------------------------------------------------------------------------- /src/spyder/core/messages.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/messages.py -------------------------------------------------------------------------------- /src/spyder/core/mgmt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/mgmt.py -------------------------------------------------------------------------------- /src/spyder/core/prioritizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/prioritizer.py -------------------------------------------------------------------------------- /src/spyder/core/queueassignment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/queueassignment.py -------------------------------------------------------------------------------- /src/spyder/core/queueselector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/queueselector.py -------------------------------------------------------------------------------- /src/spyder/core/settings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/settings.py -------------------------------------------------------------------------------- /src/spyder/core/sink.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/sink.py -------------------------------------------------------------------------------- /src/spyder/core/sqlitequeues.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/sqlitequeues.py -------------------------------------------------------------------------------- /src/spyder/core/uri_uniq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/uri_uniq.py -------------------------------------------------------------------------------- /src/spyder/core/worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/core/worker.py -------------------------------------------------------------------------------- /src/spyder/defaultsettings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/defaultsettings.py -------------------------------------------------------------------------------- /src/spyder/encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/encoding.py -------------------------------------------------------------------------------- /src/spyder/import_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/import_util.py -------------------------------------------------------------------------------- /src/spyder/logsink.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/logsink.py -------------------------------------------------------------------------------- /src/spyder/masterprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/masterprocess.py -------------------------------------------------------------------------------- /src/spyder/processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/processor/__init__.py -------------------------------------------------------------------------------- /src/spyder/processor/cleanupquery.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/processor/cleanupquery.py -------------------------------------------------------------------------------- /src/spyder/processor/fetcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/processor/fetcher.py -------------------------------------------------------------------------------- /src/spyder/processor/htmllinkextractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/processor/htmllinkextractor.py -------------------------------------------------------------------------------- /src/spyder/processor/httpextractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/processor/httpextractor.py -------------------------------------------------------------------------------- /src/spyder/processor/limiter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/processor/limiter.py -------------------------------------------------------------------------------- /src/spyder/processor/scoper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/processor/scoper.py -------------------------------------------------------------------------------- /src/spyder/processor/stripsessions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/processor/stripsessions.py -------------------------------------------------------------------------------- /src/spyder/spyder_template/log/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/spyder/spyder_template/logging.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/spyder_template/logging.conf -------------------------------------------------------------------------------- /src/spyder/spyder_template/master.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/spyder_template/master.py -------------------------------------------------------------------------------- /src/spyder/spyder_template/settings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/spyder_template/settings.py -------------------------------------------------------------------------------- /src/spyder/spyder_template/sink.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/spyder_template/sink.py -------------------------------------------------------------------------------- /src/spyder/spyder_template/spyder-ctrl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/spyder_template/spyder-ctrl.py -------------------------------------------------------------------------------- /src/spyder/thrift/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/thrift/__init__.py -------------------------------------------------------------------------------- /src/spyder/thrift/gen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/thrift/gen/__init__.py -------------------------------------------------------------------------------- /src/spyder/thrift/gen/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/thrift/gen/constants.py -------------------------------------------------------------------------------- /src/spyder/thrift/gen/ttypes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/thrift/gen/ttypes.py -------------------------------------------------------------------------------- /src/spyder/time.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/time.py -------------------------------------------------------------------------------- /src/spyder/workerprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/src/spyder/workerprocess.py -------------------------------------------------------------------------------- /test/static/robots.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/static/robots.txt -------------------------------------------------------------------------------- /test/test_async_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_async_worker.py -------------------------------------------------------------------------------- /test/test_cleanup_qs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_cleanup_qs.py -------------------------------------------------------------------------------- /test/test_default_html_link_extractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_default_html_link_extractor.py -------------------------------------------------------------------------------- /test/test_dns_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_dns_cache.py -------------------------------------------------------------------------------- /test/test_fetch_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_fetch_processor.py -------------------------------------------------------------------------------- /test/test_fetch_processor_last_modified_works.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_fetch_processor_last_modified_works.py -------------------------------------------------------------------------------- /test/test_fetch_processor_with_etag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_fetch_processor_with_etag.py -------------------------------------------------------------------------------- /test/test_frontier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_frontier.py -------------------------------------------------------------------------------- /test/test_http_extractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_http_extractor.py -------------------------------------------------------------------------------- /test/test_limiter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_limiter.py -------------------------------------------------------------------------------- /test/test_masterprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_masterprocess.py -------------------------------------------------------------------------------- /test/test_messages.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_messages.py -------------------------------------------------------------------------------- /test/test_mgmt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_mgmt.py -------------------------------------------------------------------------------- /test/test_multiple_frontier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_multiple_frontier.py -------------------------------------------------------------------------------- /test/test_queue_assignment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_queue_assignment.py -------------------------------------------------------------------------------- /test/test_queue_selector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_queue_selector.py -------------------------------------------------------------------------------- /test/test_regex_scoper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_regex_scoper.py -------------------------------------------------------------------------------- /test/test_settings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_settings.py -------------------------------------------------------------------------------- /test/test_settings_settings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_settings_settings.py -------------------------------------------------------------------------------- /test/test_sqlite_multiple_queues.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_sqlite_multiple_queues.py -------------------------------------------------------------------------------- /test/test_sqlite_queues.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_sqlite_queues.py -------------------------------------------------------------------------------- /test/test_strip_session_ids.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_strip_session_ids.py -------------------------------------------------------------------------------- /test/test_uri_unique_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_uri_unique_filter.py -------------------------------------------------------------------------------- /test/test_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_worker.py -------------------------------------------------------------------------------- /test/test_workerprocess_extractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_workerprocess_extractor.py -------------------------------------------------------------------------------- /test/test_workerprocess_fetcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_workerprocess_fetcher.py -------------------------------------------------------------------------------- /test/test_workerprocess_mgmtintegration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_workerprocess_mgmtintegration.py -------------------------------------------------------------------------------- /test/test_workerprocess_processing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_workerprocess_processing.py -------------------------------------------------------------------------------- /test/test_workerprocess_unspec.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/test/test_workerprocess_unspec.py -------------------------------------------------------------------------------- /versions.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retresco/Spyder/HEAD/versions.cfg --------------------------------------------------------------------------------