├── .dockerignore ├── .gitignore ├── .travis.yml ├── Dockerfile ├── README.rst ├── alexa1k.csv ├── codecov.yml ├── deep-deep ├── check.sh ├── deepdeep │ ├── __init__.py │ ├── downloadermiddlewares.py │ ├── exports.py │ ├── extensions.py │ ├── goals.py │ ├── links.py │ ├── metrics.py │ ├── predictor.py │ ├── qlearning.py │ ├── queues.py │ ├── scheduler.py │ ├── score_pages.py │ ├── settings.py │ ├── spidermiddlewares.py │ ├── spiders │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── baseline.py │ │ ├── checker.py │ │ ├── extraction.py │ │ ├── formspider.py │ │ ├── qspider.py │ │ └── relevancy.py │ ├── utils.py │ └── vectorizers.py ├── scrapy.cfg ├── scripts │ ├── crawl-depth3.sh │ ├── crawl-depth4.sh │ ├── crawl-forms.py │ ├── crawl-keywords.py │ ├── crawl-relevant.py │ ├── explain-model.py │ ├── explain-predictions.py │ ├── fixup-gz.py │ ├── show-lda-topics.py │ └── train-lda.py ├── setup.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── mockserver.py │ ├── test_metrics.py │ ├── test_queues.py │ ├── test_relevancy_spider.py │ └── utils.py └── tox.ini ├── docs ├── Makefile ├── RL.rst ├── conf.py ├── index.rst ├── make.bat ├── scheduling.rst └── scrapy.rst ├── examples └── standalone.py ├── notebooks ├── Check Crawl Graph.ipynb ├── Crawl Graphs.ipynb ├── Explore Crawled Data.ipynb ├── RL for web crawling.ipynb └── Results.ipynb └── requirements.txt /.dockerignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/.dockerignore -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/.gitignore -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/.travis.yml -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/Dockerfile -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/README.rst -------------------------------------------------------------------------------- /alexa1k.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/alexa1k.csv -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/codecov.yml -------------------------------------------------------------------------------- /deep-deep/check.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/check.sh -------------------------------------------------------------------------------- /deep-deep/deepdeep/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deep-deep/deepdeep/downloadermiddlewares.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/downloadermiddlewares.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/exports.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/exports.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/extensions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/extensions.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/goals.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/goals.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/links.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/links.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/metrics.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/predictor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/predictor.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/qlearning.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/qlearning.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/queues.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/queues.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/scheduler.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/score_pages.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/score_pages.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/settings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/settings.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/spidermiddlewares.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/spidermiddlewares.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/spiders/__init__.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/spiders/_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/spiders/_base.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/spiders/baseline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/spiders/baseline.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/spiders/checker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/spiders/checker.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/spiders/extraction.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/spiders/extraction.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/spiders/formspider.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/spiders/formspider.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/spiders/qspider.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/spiders/qspider.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/spiders/relevancy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/spiders/relevancy.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/utils.py -------------------------------------------------------------------------------- /deep-deep/deepdeep/vectorizers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/deepdeep/vectorizers.py -------------------------------------------------------------------------------- /deep-deep/scrapy.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/scrapy.cfg -------------------------------------------------------------------------------- /deep-deep/scripts/crawl-depth3.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/scripts/crawl-depth3.sh -------------------------------------------------------------------------------- /deep-deep/scripts/crawl-depth4.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/scripts/crawl-depth4.sh -------------------------------------------------------------------------------- /deep-deep/scripts/crawl-forms.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/scripts/crawl-forms.py -------------------------------------------------------------------------------- /deep-deep/scripts/crawl-keywords.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/scripts/crawl-keywords.py -------------------------------------------------------------------------------- /deep-deep/scripts/crawl-relevant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/scripts/crawl-relevant.py -------------------------------------------------------------------------------- /deep-deep/scripts/explain-model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/scripts/explain-model.py -------------------------------------------------------------------------------- /deep-deep/scripts/explain-predictions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/scripts/explain-predictions.py -------------------------------------------------------------------------------- /deep-deep/scripts/fixup-gz.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/scripts/fixup-gz.py -------------------------------------------------------------------------------- /deep-deep/scripts/show-lda-topics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/scripts/show-lda-topics.py -------------------------------------------------------------------------------- /deep-deep/scripts/train-lda.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/scripts/train-lda.py -------------------------------------------------------------------------------- /deep-deep/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/setup.py -------------------------------------------------------------------------------- /deep-deep/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /deep-deep/tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/tests/conftest.py -------------------------------------------------------------------------------- /deep-deep/tests/mockserver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/tests/mockserver.py -------------------------------------------------------------------------------- /deep-deep/tests/test_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/tests/test_metrics.py -------------------------------------------------------------------------------- /deep-deep/tests/test_queues.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/tests/test_queues.py -------------------------------------------------------------------------------- /deep-deep/tests/test_relevancy_spider.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/tests/test_relevancy_spider.py -------------------------------------------------------------------------------- /deep-deep/tests/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/tests/utils.py -------------------------------------------------------------------------------- /deep-deep/tox.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/deep-deep/tox.ini -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/docs/Makefile -------------------------------------------------------------------------------- /docs/RL.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/docs/RL.rst -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/docs/conf.py -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/docs/index.rst -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/docs/make.bat -------------------------------------------------------------------------------- /docs/scheduling.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/docs/scheduling.rst -------------------------------------------------------------------------------- /docs/scrapy.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/docs/scrapy.rst -------------------------------------------------------------------------------- /examples/standalone.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/examples/standalone.py -------------------------------------------------------------------------------- /notebooks/Check Crawl Graph.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/notebooks/Check Crawl Graph.ipynb -------------------------------------------------------------------------------- /notebooks/Crawl Graphs.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/notebooks/Crawl Graphs.ipynb -------------------------------------------------------------------------------- /notebooks/Explore Crawled Data.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/notebooks/Explore Crawled Data.ipynb -------------------------------------------------------------------------------- /notebooks/RL for web crawling.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/notebooks/RL for web crawling.ipynb -------------------------------------------------------------------------------- /notebooks/Results.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/notebooks/Results.ipynb -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/deep-deep/HEAD/requirements.txt --------------------------------------------------------------------------------