├── .dockerignore ├── .gitignore ├── .gitmodules ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── autoscrape-server.py ├── autoscrape.py ├── autoscrape ├── __init__.py ├── backends │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── browser.py │ │ ├── dom.py │ │ ├── graph.py │ │ └── tags.py │ ├── requests │ │ ├── __init__.py │ │ ├── browser.py │ │ ├── dom.py │ │ └── tags.py │ ├── selenium │ │ ├── __init__.py │ │ ├── browser.py │ │ ├── dom.py │ │ └── tags.py │ └── warc │ │ ├── __init__.py │ │ ├── browser.py │ │ ├── dom.py │ │ └── tags.py ├── classification.py ├── cli │ ├── __init__.py │ └── scrape.py ├── control.py ├── filetypes.py ├── input_parser.py ├── scrapers │ ├── __init__.py │ ├── manual.py │ ├── null.py │ └── test.py ├── search │ ├── __init__.py │ ├── bfs.py │ └── graph.py ├── tasks.py ├── util │ ├── __init__.py │ └── warc.py └── vectorization │ ├── __init__.py │ ├── embeddings.py │ └── text.py ├── docker-compose.yml ├── extract.py ├── images ├── ai.png ├── code_embeddings.png ├── extraction of code.png ├── k-NN_small_data.png └── quickstart-video.png ├── requirements.api.txt ├── requirements.dev.txt ├── requirements.txt ├── setup.py ├── tests ├── common.sh ├── crawltest.sh ├── data │ ├── test_page.html │ └── test_page_large.cleaned.html ├── formsubmittest.sh ├── run_e2e_tests.sh ├── tag_test_data_page.html ├── tags.py ├── test_extractor.py ├── test_input_parser.py ├── test_tag_generation.py └── warctest.sh ├── tox.ini ├── train.py └── vectorize_data.py /.dockerignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/.dockerignore -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/.gitmodules -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/Dockerfile -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/MANIFEST.in -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/Makefile -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/README.rst -------------------------------------------------------------------------------- /autoscrape-server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape-server.py -------------------------------------------------------------------------------- /autoscrape.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape.py -------------------------------------------------------------------------------- /autoscrape/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/__init__.py -------------------------------------------------------------------------------- /autoscrape/backends/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autoscrape/backends/base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autoscrape/backends/base/browser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/backends/base/browser.py -------------------------------------------------------------------------------- /autoscrape/backends/base/dom.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/backends/base/dom.py -------------------------------------------------------------------------------- /autoscrape/backends/base/graph.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autoscrape/backends/base/tags.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/backends/base/tags.py -------------------------------------------------------------------------------- /autoscrape/backends/requests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autoscrape/backends/requests/browser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/backends/requests/browser.py -------------------------------------------------------------------------------- /autoscrape/backends/requests/dom.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/backends/requests/dom.py -------------------------------------------------------------------------------- /autoscrape/backends/requests/tags.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/backends/requests/tags.py -------------------------------------------------------------------------------- /autoscrape/backends/selenium/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autoscrape/backends/selenium/browser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/backends/selenium/browser.py -------------------------------------------------------------------------------- /autoscrape/backends/selenium/dom.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/backends/selenium/dom.py -------------------------------------------------------------------------------- /autoscrape/backends/selenium/tags.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/backends/selenium/tags.py -------------------------------------------------------------------------------- /autoscrape/backends/warc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autoscrape/backends/warc/browser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/backends/warc/browser.py -------------------------------------------------------------------------------- /autoscrape/backends/warc/dom.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/backends/warc/dom.py -------------------------------------------------------------------------------- /autoscrape/backends/warc/tags.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from autoscrape.backends.requests.tags import Tagger 3 | -------------------------------------------------------------------------------- /autoscrape/classification.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/classification.py -------------------------------------------------------------------------------- /autoscrape/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autoscrape/cli/scrape.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/cli/scrape.py -------------------------------------------------------------------------------- /autoscrape/control.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/control.py -------------------------------------------------------------------------------- /autoscrape/filetypes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/filetypes.py -------------------------------------------------------------------------------- /autoscrape/input_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/input_parser.py -------------------------------------------------------------------------------- /autoscrape/scrapers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/scrapers/__init__.py -------------------------------------------------------------------------------- /autoscrape/scrapers/manual.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/scrapers/manual.py -------------------------------------------------------------------------------- /autoscrape/scrapers/null.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/scrapers/null.py -------------------------------------------------------------------------------- /autoscrape/scrapers/test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/scrapers/test.py -------------------------------------------------------------------------------- /autoscrape/search/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autoscrape/search/bfs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/search/bfs.py -------------------------------------------------------------------------------- /autoscrape/search/graph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/search/graph.py -------------------------------------------------------------------------------- /autoscrape/tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/tasks.py -------------------------------------------------------------------------------- /autoscrape/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/util/__init__.py -------------------------------------------------------------------------------- /autoscrape/util/warc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/util/warc.py -------------------------------------------------------------------------------- /autoscrape/vectorization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autoscrape/vectorization/embeddings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/vectorization/embeddings.py -------------------------------------------------------------------------------- /autoscrape/vectorization/text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/autoscrape/vectorization/text.py -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/docker-compose.yml -------------------------------------------------------------------------------- /extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/extract.py -------------------------------------------------------------------------------- /images/ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/images/ai.png -------------------------------------------------------------------------------- /images/code_embeddings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/images/code_embeddings.png -------------------------------------------------------------------------------- /images/extraction of code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/images/extraction of code.png -------------------------------------------------------------------------------- /images/k-NN_small_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/images/k-NN_small_data.png -------------------------------------------------------------------------------- /images/quickstart-video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/images/quickstart-video.png -------------------------------------------------------------------------------- /requirements.api.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/requirements.api.txt -------------------------------------------------------------------------------- /requirements.dev.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/requirements.dev.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/requirements.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/setup.py -------------------------------------------------------------------------------- /tests/common.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tests/common.sh -------------------------------------------------------------------------------- /tests/crawltest.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tests/crawltest.sh -------------------------------------------------------------------------------- /tests/data/test_page.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tests/data/test_page.html -------------------------------------------------------------------------------- /tests/data/test_page_large.cleaned.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tests/data/test_page_large.cleaned.html -------------------------------------------------------------------------------- /tests/formsubmittest.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tests/formsubmittest.sh -------------------------------------------------------------------------------- /tests/run_e2e_tests.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tests/run_e2e_tests.sh -------------------------------------------------------------------------------- /tests/tag_test_data_page.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tests/tag_test_data_page.html -------------------------------------------------------------------------------- /tests/tags.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tests/tags.py -------------------------------------------------------------------------------- /tests/test_extractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tests/test_extractor.py -------------------------------------------------------------------------------- /tests/test_input_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tests/test_input_parser.py -------------------------------------------------------------------------------- /tests/test_tag_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tests/test_tag_generation.py -------------------------------------------------------------------------------- /tests/warctest.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tests/warctest.sh -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/tox.ini -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/train.py -------------------------------------------------------------------------------- /vectorize_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/HEAD/vectorize_data.py --------------------------------------------------------------------------------