├── .editorconfig ├── .env.example ├── .github ├── dependabot.yml └── workflows │ ├── codeql.yml │ └── pipeline.yaml ├── .gitignore ├── .gitmodules ├── .syft.yaml ├── .version.cache ├── .version.config ├── .vscode ├── extensions.json └── settings.json ├── LICENSE ├── Makefile ├── README.md ├── cicd └── test-unit-ci.sh ├── docs ├── logo.ico └── logo.png ├── pyproject.toml ├── src └── scrape_it_now │ ├── __init__.py │ ├── cli.py │ ├── helpers │ ├── __init__.py │ ├── http.py │ ├── identity.py │ ├── logging.py │ ├── monitoring.py │ ├── persistence.py │ ├── resources.py │ ├── threading.py │ └── trie.py │ ├── index.py │ ├── models │ ├── indexed.py │ ├── message.py │ ├── scraped.py │ └── state.py │ ├── persistence │ ├── azure_blob_storage.py │ ├── azure_queue_storage.py │ ├── azure_search.py │ ├── iblob.py │ ├── iqueue.py │ ├── isearch.py │ └── local_disk.py │ ├── resources │ └── ads-nl.txt │ └── scrape.py ├── tests ├── __init__.py ├── blob.py ├── conftest.py ├── queue.py ├── resources.py ├── scrape.py └── websites │ ├── azure.zip │ ├── bing.zip │ ├── craigslist.zip │ ├── google.zip │ ├── hackernews.zip │ ├── images.html │ ├── images │ └── banana.jpg │ ├── lemonde.zip │ ├── links.html │ ├── mermaid-export.sh │ ├── paragraphs.html │ ├── paragraphs.html.md │ └── servicepublic.zip └── uv.lock /.editorconfig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/.editorconfig -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/.env.example -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/.github/dependabot.yml -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/.github/workflows/codeql.yml -------------------------------------------------------------------------------- /.github/workflows/pipeline.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/.github/workflows/pipeline.yaml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/.gitmodules -------------------------------------------------------------------------------- /.syft.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/.syft.yaml -------------------------------------------------------------------------------- /.version.cache: -------------------------------------------------------------------------------- 1 | 20241003150343 2 | -------------------------------------------------------------------------------- /.version.config: -------------------------------------------------------------------------------- 1 | patch 2 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/.vscode/extensions.json -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/.vscode/settings.json -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/LICENSE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/README.md -------------------------------------------------------------------------------- /cicd/test-unit-ci.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/cicd/test-unit-ci.sh -------------------------------------------------------------------------------- /docs/logo.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/docs/logo.ico -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/docs/logo.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/pyproject.toml -------------------------------------------------------------------------------- /src/scrape_it_now/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.0+unknown" 2 | -------------------------------------------------------------------------------- /src/scrape_it_now/cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/cli.py -------------------------------------------------------------------------------- /src/scrape_it_now/helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/helpers/__init__.py -------------------------------------------------------------------------------- /src/scrape_it_now/helpers/http.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/helpers/http.py -------------------------------------------------------------------------------- /src/scrape_it_now/helpers/identity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/helpers/identity.py -------------------------------------------------------------------------------- /src/scrape_it_now/helpers/logging.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/helpers/logging.py -------------------------------------------------------------------------------- /src/scrape_it_now/helpers/monitoring.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/helpers/monitoring.py -------------------------------------------------------------------------------- /src/scrape_it_now/helpers/persistence.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/helpers/persistence.py -------------------------------------------------------------------------------- /src/scrape_it_now/helpers/resources.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/helpers/resources.py -------------------------------------------------------------------------------- /src/scrape_it_now/helpers/threading.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/helpers/threading.py -------------------------------------------------------------------------------- /src/scrape_it_now/helpers/trie.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/helpers/trie.py -------------------------------------------------------------------------------- /src/scrape_it_now/index.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/index.py -------------------------------------------------------------------------------- /src/scrape_it_now/models/indexed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/models/indexed.py -------------------------------------------------------------------------------- /src/scrape_it_now/models/message.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/models/message.py -------------------------------------------------------------------------------- /src/scrape_it_now/models/scraped.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/models/scraped.py -------------------------------------------------------------------------------- /src/scrape_it_now/models/state.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/models/state.py -------------------------------------------------------------------------------- /src/scrape_it_now/persistence/azure_blob_storage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/persistence/azure_blob_storage.py -------------------------------------------------------------------------------- /src/scrape_it_now/persistence/azure_queue_storage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/persistence/azure_queue_storage.py -------------------------------------------------------------------------------- /src/scrape_it_now/persistence/azure_search.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/persistence/azure_search.py -------------------------------------------------------------------------------- /src/scrape_it_now/persistence/iblob.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/persistence/iblob.py -------------------------------------------------------------------------------- /src/scrape_it_now/persistence/iqueue.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/persistence/iqueue.py -------------------------------------------------------------------------------- /src/scrape_it_now/persistence/isearch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/persistence/isearch.py -------------------------------------------------------------------------------- /src/scrape_it_now/persistence/local_disk.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/persistence/local_disk.py -------------------------------------------------------------------------------- /src/scrape_it_now/resources/ads-nl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/resources/ads-nl.txt -------------------------------------------------------------------------------- /src/scrape_it_now/scrape.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/src/scrape_it_now/scrape.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/blob.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/blob.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/conftest.py -------------------------------------------------------------------------------- /tests/queue.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/queue.py -------------------------------------------------------------------------------- /tests/resources.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/resources.py -------------------------------------------------------------------------------- /tests/scrape.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/scrape.py -------------------------------------------------------------------------------- /tests/websites/azure.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/azure.zip -------------------------------------------------------------------------------- /tests/websites/bing.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/bing.zip -------------------------------------------------------------------------------- /tests/websites/craigslist.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/craigslist.zip -------------------------------------------------------------------------------- /tests/websites/google.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/google.zip -------------------------------------------------------------------------------- /tests/websites/hackernews.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/hackernews.zip -------------------------------------------------------------------------------- /tests/websites/images.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/images.html -------------------------------------------------------------------------------- /tests/websites/images/banana.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/images/banana.jpg -------------------------------------------------------------------------------- /tests/websites/lemonde.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/lemonde.zip -------------------------------------------------------------------------------- /tests/websites/links.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/links.html -------------------------------------------------------------------------------- /tests/websites/mermaid-export.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/mermaid-export.sh -------------------------------------------------------------------------------- /tests/websites/paragraphs.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/paragraphs.html -------------------------------------------------------------------------------- /tests/websites/paragraphs.html.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/paragraphs.html.md -------------------------------------------------------------------------------- /tests/websites/servicepublic.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/tests/websites/servicepublic.zip -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clemlesne/scrape-it-now/HEAD/uv.lock --------------------------------------------------------------------------------