├── .dockerignore ├── .gitignore ├── .gitmodules ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── autoscrape-server.py ├── autoscrape.py ├── autoscrape ├── __init__.py ├── backends │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── browser.py │ │ ├── dom.py │ │ ├── graph.py │ │ └── tags.py │ ├── requests │ │ ├── __init__.py │ │ ├── browser.py │ │ ├── dom.py │ │ └── tags.py │ ├── selenium │ │ ├── __init__.py │ │ ├── browser.py │ │ ├── dom.py │ │ └── tags.py │ └── warc │ │ ├── __init__.py │ │ ├── browser.py │ │ ├── dom.py │ │ └── tags.py ├── classification.py ├── cli │ ├── __init__.py │ └── scrape.py ├── control.py ├── filetypes.py ├── input_parser.py ├── scrapers │ ├── __init__.py │ ├── manual.py │ ├── null.py │ └── test.py ├── search │ ├── __init__.py │ ├── bfs.py │ └── graph.py ├── tasks.py ├── util │ ├── __init__.py │ └── warc.py └── vectorization │ ├── __init__.py │ ├── embeddings.py │ └── text.py ├── docker-compose.yml ├── extract.py ├── images ├── ai.png ├── code_embeddings.png ├── extraction of code.png ├── k-NN_small_data.png └── quickstart-video.png ├── requirements.api.txt ├── requirements.dev.txt ├── requirements.txt ├── setup.py ├── tests ├── common.sh ├── crawltest.sh ├── data │ ├── test_page.html │ └── test_page_large.cleaned.html ├── formsubmittest.sh ├── run_e2e_tests.sh ├── tag_test_data_page.html ├── tags.py ├── test_extractor.py ├── test_input_parser.py ├── test_tag_generation.py └── warctest.sh ├── tox.ini ├── train.py └── vectorize_data.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git/ 2 | *.py[cod] 3 | *$py.class 4 | .cache/ 5 | *.pickle 6 | .idea/ 7 | .DS_Store 8 | __pycache__ 9 | 10 | *.pyc 11 | *.swp 12 | *.swo 13 | *.7z 14 | *.log 15 | 16 | # default output directory 17 | autoscrape-data/ 18 | 19 | # Algorithmic/Training Data-Related Files 20 | # Currently stored in a separate git repo (private, currently) 21 | # Will be released once ready, minimized, over LFS 22 | deps/ 23 | training_data/ 24 | *.pickle 25 | 26 | # python virtual environment 27 | venv 28 | 29 | # webdriver remote settings 30 | nodeConfig.json 31 | 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | *.swp 4 | *.swo 5 | *.7z 6 | *.log 7 | graph*.dot 8 | .idea 9 | .#* 10 | .python-version 11 | 12 | # default output directory 13 | autoscrape-data*/ 14 | 15 | # Algorithmic/Training Data-Related Files 16 | # Currently stored in a separate git repo (private, currently) 17 | # Will be released once ready, minimized, over LFS 18 | deps/ 19 | training_data/ 20 | *.pickle 21 | 22 | # python virtual environment 23 | venv 24 | 25 | # webdriver remote settings 26 | nodeConfig.json 27 | 28 | # python setup tools 29 | build/ 30 | dist/ 31 | autoscrape_py.egg-info/ 32 | autoscrape.egg-info/ 33 | 34 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "www"] 2 | path = www 3 | url = https://github.com/brandonrobertz/autoscrape-www 4 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.4-slim-buster AS deps 2 | 3 | # Install the Python deps (common across worker & web server, for now) 4 | 
RUN mkdir /app 5 | WORKDIR /app 6 | 7 | # Install Firefox deps (and curl, xvfb, vnc). Debian Buster has Firefox v68; 8 | # we'll install its dependencies and hope they satisfy _our_ Firefox version. 9 | RUN apt-get update \ 10 | && bash -c 'apt-get install -y --no-install-recommends $(apt-cache depends firefox-esr | awk "/Depends:/{print\$2}")' \ 11 | && apt-get install --no-install-recommends -y \ 12 | curl \ 13 | wget \ 14 | xauth \ 15 | xvfb \ 16 | xz-utils \ 17 | bzip2 \ 18 | postgresql-client \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | # Install Firefox. It's a separate step so it's easier to resume docker build. 22 | RUN curl -L https://download-installer.cdn.mozilla.net/pub/firefox/releases/64.0.2/linux-x86_64/en-US/firefox-64.0.2.tar.bz2 \ 23 | | tar jx -C /opt \ 24 | && ln -s /opt/firefox/firefox /usr/bin/firefox 25 | 26 | # Install geckodriver. It's a separate step so it's easier to resume docker build. 27 | RUN curl -L https://github.com/mozilla/geckodriver/releases/download/v0.23.0/geckodriver-v0.23.0-linux64.tar.gz \ 28 | | tar zx -C /usr/bin/ \ 29 | && chmod +x /usr/bin/geckodriver 30 | 31 | # Install the Python deps we use for integration tests. 32 | # 33 | # Integration tests don't rely on the Django stack, and that makes this 34 | # Dockerfile compile faster and cache better. 35 | #RUN pip install psycopg2-binary capybara-py selenium minio 36 | 37 | FROM deps AS pydeps 38 | 39 | COPY requirements.txt /app/ 40 | RUN pip install -r /app/requirements.txt 41 | 42 | COPY requirements.api.txt /app/ 43 | RUN pip install -r /app/requirements.api.txt 44 | 45 | FROM pydeps as base 46 | 47 | COPY autoscrape/ /app/autoscrape/ 48 | 49 | # Flask API server 50 | COPY autoscrape-server.py /app/ 51 | 52 | # Build AutoScrape WWW 53 | # Install Node.js 54 | 55 | RUN \ 56 | cd /tmp && \ 57 | curl https://nodejs.org/dist/v12.16.1/node-v12.16.1-linux-x64.tar.xz -o node-js.tar.xz && \ 58 | tar xvf node-js.tar.xz && \ 59 | rm -f node-js.tar.xz && \ 60 | cp -rfv node-v*/* / && \ 61 | rm -rf /tmp/node-* && \ 62 | npm install -g npm && \ 63 | printf '\n# Node.js\nexport PATH="node_modules/.bin:$PATH"' >> /root/.bashrc 64 | 65 | # Remember: this is a git submodule! 
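# NOTE: www/ is the autoscrape-www git submodule (see .gitmodules). Run
# `git submodule init && git submodule update` on the host before building,
# otherwise www/ will be empty and the npm build steps below will have
# nothing to work with.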
66 | COPY www/ /app/www/ 67 | RUN echo REACT_APP_API_HOST="http://localhost:5000" >> .env 68 | RUN cd /app/www && npm install && npm run download-hextractor && npm run build 69 | 70 | FROM autoscrape-worker-deps AS autoscrape-worker 71 | CMD [ "celery", "-A", "autoscrape.tasks", "worker", "--loglevel=info" ] 72 | 73 | FROM autoscrape-server-deps AS autoscrape-server 74 | EXPOSE 5000 75 | CMD [ "python", "autoscrape-server.py" ] 76 | 77 | FROM rabbitmq:3.7.8-management as rabbitmq 78 | EXPOSE 15672 79 | 80 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | recursive-include docs * 4 | recursive-include examples * 5 | recursive-include tests *.py 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | start: 2 | docker-compose down 3 | #docker-compose build --pull 4 | docker-compose up --build -t0 --abort-on-container-exit --renew-anon-volumes 5 | 6 | quickstart: 7 | docker-compose up -t0 --abort-on-container-exit 8 | 9 | build: 10 | docker-compose build --pull 11 | 12 | stop: 13 | docker-compose down -t0 14 | docker-compose rm -f -v # -v removes _anonymous_ volumes 15 | 16 | clean: stop 17 | if docker ps -a -q; then \ 18 | docker rm -f $$(docker ps -a -q) || exit 0; \ 19 | fi 20 | if docker images -q; then \ 21 | docker rmi -f $$(docker images -q) || exit 0; \ 22 | fi 23 | docker volume rm -f $$(docker volume ls | awk '{ print $$2 }') 24 | 25 | run_rabbitmq: 26 | docker run -it --rm --name rabbitmq -p 5672:5672 -p 15672:15672 rabbitmq:3-management 27 | 28 | update_www: 29 | git submodule init && git submodule update && cd www && git fetch && git reset origin/master --hard 30 | 31 | commit_www: 32 | git add www && git commit -m "Update www" 33 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | AutoScrape 2 | ========== 3 | 4 | .. image:: https://pypip.in/v/autoscrape/badge.svg 5 | :target: https://pypi.python.org/pypi/autoscrape/ 6 | 7 | .. image:: https://pypip.in/license/autoscrape/badge.svg 8 | :target: https://pypi.python.org/pypi/autoscrape/ 9 | 10 | 11 | .. figure:: https://github.com/brandonrobertz/autoscrape-py/blob/master/images/ai.png 12 | :alt: Artificial Informer Labs 13 | 14 | A project of `Artificial Informer Labs `__. 15 | 16 | AutoScrape is an automated scraper of structured data from interactive 17 | web pages. You point this scraper at a site, give it a little information 18 | and structured data can then be extracted. No brittle, site-specific 19 | programming necessary. 20 | 21 | This is an implementation of the web scraping framework described in the 22 | paper, `Robust Web Scraping in the Public Interest with AutoScrape `__ and presented at 23 | `Computation + Journalism Symposium 2019 `__. This is 24 | an experimental work in progress! 25 | 26 | Currently there are a few ways to use AutoScrape: 27 | 28 | - via a full Web interface for scraping (see bottom of page, make sure to pull in the submodule!) 29 | - as a local CLI python script 30 | - as a simplified web scraping framework 31 | 32 | Installation and running instructions are provided for both below. 33 | 34 | Quickstart 35 | ---------- 36 | 37 | Two ways, easiest first. 

::

   pip install autoscrape[all]
   autoscrape --backend requests --output outdir --maxdepth 2 https://bxroberts.org

This will install all dependencies for all backends and various options.

Or:

::

   git clone https://github.com/brandonrobertz/autoscrape-py
   cd autoscrape-py/
   pip install .[all]
   autoscrape --backend requests --output outdir --maxdepth 2 https://bxroberts.org

Either way, you can now use ``autoscrape`` from the command line.

Usage Examples
--------------

Here are some straightforward use cases for AutoScrape and how you'd use
the CLI tool to execute them. These, of course, assume you have the
dependencies installed.

Crawler Backends
~~~~~~~~~~~~~~~~

There are three backends available for driving AutoScrape: ``requests``,
``selenium`` and ``warc``. The ``requests`` backend (the default) is based on the
Python requests library and is only capable of crawling sites and submitting
simple HTTP forms. For more complex form interaction or JavaScript-powered
buttons, you'll need to use the ``selenium`` backend.

You can control the backend with the ``--backend`` option:

::

   autoscrape \
       --backend requests \
       --output requests_crawled_site \
       'https://some.page/to-crawl'

In order to use backends other than requests, you need to install
the proper dependencies. ``pip install autoscrape[all]`` will
install everything required for all backends/functionality, but
you can also install dependencies in isolation:

::

   Selenium backend:
   pip install autoscrape[selenium-backend]

   Crawl graph builder (for use in --save-graph):
   pip install autoscrape[graph]

   WARC backend:
   pip install autoscrape[warc-backend]

Note that for the Selenium backend, you need to install geckodriver or
chromedriver, depending on whether you're using Firefox or Chrome, respectively.
More information is below in the External Dependencies section.

Crawl
~~~~~

Crawl an entire website, saving all HTML and stylesheets (no
screenshots):

::

   autoscrape \
       --backend requests \
       --maxdepth -1 \
       --output crawled_site \
       'https://some.page/to-crawl'

Archive Page (Screenshot & Code)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Archive a single webpage, both code and full-content screenshot (PNG),
for future reference:

::

   autoscrape \
       --backend selenium \
       --full-page-screenshots \
       --load-images --maxdepth 0 \
       --save-screenshots --driver Firefox \
       --output archived_webpage \
       'https://some.page/to-archive'

Search Forms and Crawl Result Pages
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Query a web form, identified by containing the text "I'm a search form",
entering "NAME" into the first (0th) text input field and selecting January
20th, 1992 in the second (1st) date field.
Then click all buttons with 137 | the text "Next ->" to get all results pages: 138 | 139 | :: 140 | 141 | autoscrape \ 142 | --backend selenium \ 143 | --output search_query_data \ 144 | --form-match "I'm a search form" \ 145 | --input "i:0:NAME,d:1:1992-01-20" \ 146 | --next-match "Next ->" \ 147 | 'https://some.page/search?s=newquery' 148 | 149 | Setup for Standalone Local CLI 150 | ------------------------------ 151 | 152 | External Dependencies 153 | ~~~~~~~~~~~~~~~~~~~~~ 154 | 155 | If you want to use the ``selenium`` backend for interactive crawling, 156 | you need to have geckodriver installed. You can do that here: 157 | 158 | :: 159 | 160 | https://github.com/mozilla/geckodriver/releases 161 | 162 | Or through your package manager: 163 | 164 | :: 165 | apt install firefox-geckodriver 166 | 167 | Your ``geckodriver`` needs to be compatible with your current version of 168 | Firefox or you will get errors. If you install FF and the driver 169 | through your package manager, you *should* be okay, but it's 170 | not guaranteed. We have specific versions of both pinned in the 171 | ``Dockerfile``. 172 | 173 | If you prefer to use Chrome, you will need the ChromeDriver (we've 174 | tested using v2.41). It can be found in your distribution's package 175 | manager or here: 176 | 177 | :: 178 | 179 | https://sites.google.com/a/chromium.org/chromedriver/downloads 180 | 181 | Installing the remaining Python dependencies can be done using pip. 182 | 183 | Pip Install Method 184 | ~~~~~~~~~~~~~~~~~~ 185 | 186 | Next you need to set up your python virtual environment (Python 3.6 187 | required) and install the Python dependencies: 188 | 189 | :: 190 | 191 | pip install -r requirements.txt 192 | 193 | Running Standalone Scraper 194 | -------------------------- 195 | 196 | Environment Test Crawler 197 | ~~~~~~~~~~~~~~~~~~~~~~~~ 198 | 199 | You can run a test to ensure your webdriver is set up correctly by 200 | running the ``test`` crawler: 201 | 202 | :: 203 | 204 | ./autoscrape --backend selenium --show-browser [SITE_URL] 205 | 206 | The ``test`` crawler will just do a depth-first click-only crawl of an 207 | entire website. It will not interact with forms or POST data. Data will 208 | be saved to ``./autoscrape-data/`` (the default output directory). 209 | 210 | Manual Config-Based Scraper 211 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 212 | 213 | Autoscrape has a manually controlled mode, similar to wget, except this 214 | uses interactive capabilities and can input data to search forms, follow 215 | "next page"-type buttons, etc. This functionality can be used either as 216 | a standalone crawler/scraper or as a method to build a training set for 217 | the automated scrapers. 218 | 219 | Autoscrape manual-mode full options: 220 | 221 | :: 222 | 223 | AUTOSCRAPE - Interactively crawl, find searchable forms, 224 | input data to them and scrape data on the results, from an 225 | initial BASEURL. 226 | 227 | Usage: 228 | autoscrape [options] BASEURL 229 | 230 | General Options: 231 | --backend BACKEND 232 | The backend to use. Currently one of "selenium", "requests" or 233 | "warc". The requests browser is only capable of crawling, but 234 | is approximately 2-3.5x faster. WARC is for emulating browsing 235 | through Common Crawl archival data. 236 | [default: selenium] 237 | 238 | --loglevel LEVEL 239 | Loglevel, note that DEBUG is extremely verbose. 240 | [default: INFO] 241 | 242 | --quiet 243 | This will silence all logging to console. 

Crawl-Specific Options:
    --maxdepth DEPTH
        Maximum depth to crawl a site (in search of form
        if the option --form-match STRING is specified,
        see below). Setting to 0 means don't crawl at all,
        all operations are limited to the BASEURL page.
        Setting to -1 means unlimited maximum crawl depth.
        [default: 10]

    --max-pages NUM
        Maximum number of unique pages, in total, to fetch.
        AutoScrape will stop crawling once this is hit.

    --leave-host
        By default, autoscrape will not leave the host given
        in the BASEURL. This option lets the scraper leave
        the host.

    --only-links MATCH_STRING
        A whitelist of links to follow. All others will
        be ignored. Can be a string or a regex with
        multiple strings to match separated by a pipe
        (|) character.

    --ignore-links MATCH_STRING
        This option can be used to remove any links matching
        MATCH_STRING (can be a regex or just a string match)
        from consideration for clicking. Accepts the same
        argument format as --only-links.

    --link-priority SORT_STRING
        A string to sort the links by. In this case, any link
        containing "SORT_STRING" will be clicked before any other
        links. In most cases you probably want to use the
        whitelist, --only-links, option.

    --ignore-extensions IGNORE_EXTENSIONS
        Don't click on or download URLs pointing to files with
        these extensions.

    --result-page-links MATCH_STRINGS_LIST
        If specified, AutoScrape will click on any links matching
        this string when it arrives on a search result page.

Interactive Form Search Options:
    --form-match SEARCH_STRING
        The crawler will identify a form to search/scrape if it
        contains the specified string. If matched, it will be
        interactively scraped using the below instructions.

    --input INPUT_DESCRIPTION
        Interactive search descriptor. This describes how to
        interact with a matched form. The inputs are
        described in the following format:

        "c:0:True,i:0:atext,s:1:France,d:0:1991-01-20"

        A single input can be one of four types:
        checkbox ("c"), input box ("i"), option select
        ("s"), and date inputs ("d", with inputs in the
        "YYYY-MM-DD" format). The type is separated by a
        colon, and the input index position is next. (Each
        input type has its own list, so a form with one
        input, one checkbox, and one option select, will all
        be at index 0.) The final command, separated by
        another colon, describes what to do with the input.

        Multiple inputs are separated by a comma, so you can
        interact with multiple inputs before submitting the
        form.

        To illustrate this, the above command does the following:
        - first checkbox is checked (pass False to uncheck)
        - first input box gets filled with the string "atext"
        - second select input gets the "France" option chosen
        - first date input gets set to Jan 20, 1991

    --next-match NEXT_BTN_STRING
        A string to match a "next" button with, after
        searching a form. The scraper will continue to
        click "next" buttons after a search until no matches
        are found, unless limited by the --formdepth option
        (see below). [default: next page]

    --formdepth DEPTH
        How deep the scraper will iterate, by clicking
        "next" buttons.
        Zero means infinite depth.
        [default: 0]

    --form-submit-natural-click
        Some webpages make clicking a link element difficult
        due to JavaScript onClick events. In cases where a
        click does nothing, you can use this option to get
        the scraper to emulate a mouse click over the link's
        position on the page, activating any higher level JS
        interactions.

    --form-submit-wait SECONDS
        How many seconds to force wait after a submit to a form.
        This should be used in cases where the builtin
        wait-for-page-load isn't working properly (JS-heavy
        pages, etc). [default: 5]

Webdriver-Specific and General Options:
    --load-images
        By default, images on a page will not be fetched.
        This speeds up scrapes on sites and lowers bandwidth
        needs. This option fetches all images on a page.

    --show-browser
        By default, we hide the browser during operation.
        This option displays a browser window, mostly
        for debugging purposes.

    --driver DRIVER
        Which browser to use. Current support for "Firefox",
        "Chrome", and "remote". [default: Firefox]

    --browser-binary PATH_TO_BROWSER
        Path to a specific browser binary. If left blank
        selenium will pull the browser found on your path.

    --remote-hub URI
        If using "remote" driver, specify the hub URI to
        connect to. Needs the proto, address, port, and path.
        [default: http://localhost:4444/wd/hub]

WARC Options:
    --warc-directory PATH_TO_WARCS
        Path to the folder containing GZipped WARC files. These can be
        downloaded from Common Crawl. Required when using the "warc"
        backend.

    --warc-index-file PATH_TO_LEVELDB
        Path to the level DB database holding the URL-to-file
        index: URL => (filename, record_number)
        This will be generated from the WARCs in the --warc-directory
        specified if it's not already. Required when using the "warc"
        backend.

Data Saving Options:
    --output DIRECTORY_OR_URL
        If specified, this indicates where to save pages during a
        crawl. This directory will be created if it does not
        currently exist. This directory will have several
        sub-directories that contain the different types of pages
        found (e.g., search_pages, data_pages, screenshots).
        This can also accept a URL (e.g., http://localhost:5000/files)
        and AutoScrape will POST to that endpoint with each
        file scraped.
        [default: autoscrape-data]

    --keep-filename
        By default, we hash the files in a scrape in order to
        account for dynamic content under a single-page app
        (SPA) website implementation. This option will force
        the scraper to retain the original filename from the
        URL when saving scrape data.

    --save-screenshots
        This option makes the scraper save screenshots of each
        page, interaction, and search. Screenshots will be
        saved to the screenshots folder of the output dir.

    --full-page-screenshots
        By default, we only save the first displayed part of the
        webpage. The remaining portion that you can only see
        by scrolling down isn't captured. Setting this option
        forces AutoScrape to scroll down and capture the entire
        web content. This can fail in certain circumstances, like
        in API output mode, and should be used with care.
416 | 417 | --save-graph 418 | This option allows the scraper to build a directed graph 419 | of the entire scrape and will save it to the "graph" 420 | subdirectory under the output dir. The output file 421 | is a timestamped networkx pickled graph. 422 | 423 | --disable-style-saving 424 | By default, AutoScrape saves the stylesheets associated 425 | with a scraped page. To save storage, you can disable this 426 | functionality by using this option. 427 | 428 | AutoScrape Web UI (Docker) 429 | -------------------------- 430 | 431 | AutoScrape can be ran as a containerized cluster environment, where 432 | scrapes can be triggered and stopped via API calls and data can be 433 | streamed to this server. 434 | 435 | This requires the `autoscrape-www `__ submodule to be pulled: 436 | 437 | :: 438 | 439 | git submodule init 440 | git submodule update 441 | 442 | This will pull the browser-based UI into the `www/` folder. 443 | 444 | You need 445 | `docker-ce `__ and 446 | `docker-compose `__. Once you 447 | have these dependencies installed, simply run: 448 | 449 | :: 450 | 451 | docker-compose build --pull 452 | docker-compose up 453 | 454 | This will build the containers and launch a API server running on local 455 | port 5000. More information about the API calls can be found in 456 | ``autoscrape-server.py``. 457 | 458 | If you have make installed, you can simply run ``make start``. 459 | -------------------------------------------------------------------------------- /autoscrape-server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | from flask import ( 5 | Flask, request, jsonify, send_from_directory 6 | ) 7 | from flask_sqlalchemy import SQLAlchemy 8 | from sqlalchemy import create_engine 9 | from sqlalchemy_utils import database_exists, create_database 10 | 11 | import autoscrape.tasks as tasks 12 | 13 | 14 | connect_str = 'postgresql://%s:%s@%s/autoscrape' % ( 15 | os.environ["AUTOSCRAPE_DB_USER"], 16 | os.environ["AUTOSCRAPE_DB_PASSWORD"], 17 | os.environ["AUTOSCRAPE_DB_HOST"] 18 | ) 19 | 20 | engine = create_engine(connect_str) 21 | if not database_exists(engine.url): 22 | create_database(engine.url) 23 | 24 | app = Flask("autoscrape-server", static_url_path="", static_folder="www/build") 25 | app.config['SQLALCHEMY_DATABASE_URI'] = connect_str 26 | db = SQLAlchemy(app) 27 | 28 | 29 | class Data(db.Model): 30 | """ 31 | Store our scrape data here, indexed by the scrape ID, 32 | timestamp and fileclass. 
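    One row is stored per file POSTed back by a running scrape (see the
    /receive endpoint below); the "data" column holds the base64-encoded
    file contents.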
33 | """ 34 | __tablename__ = "data" 35 | id = db.Column(db.Integer, primary_key=True) 36 | timestamp = db.Column( 37 | db.DateTime, 38 | default=db.func.current_timestamp(), 39 | nullable=False 40 | ) 41 | task_id = db.Column(db.String, nullable=False) 42 | name = db.Column(db.String, nullable=False) 43 | fileclass = db.Column(db.String, nullable=False) 44 | data = db.Column(db.String, nullable=False) 45 | url = db.Column(db.String, nullable=False) 46 | 47 | db.UniqueConstraint('task_id', 'name', name='unique_name_per_task_1') 48 | 49 | def __init__(self, task_id, name, fileclass, data, url): 50 | self.task_id = task_id 51 | self.name = name 52 | self.fileclass = fileclass 53 | self.data = data 54 | self.url = url 55 | 56 | def __repr__(self): 57 | return '' % (self.name, self.fileclass) 58 | 59 | @property 60 | def serialize(self): 61 | return { 62 | "id": self.id, 63 | "timestamp": self.timestamp.isoformat(), 64 | "name": self.name, 65 | "fileclass": self.fileclass, 66 | "url": self.url, 67 | } 68 | 69 | 70 | @app.route("/", methods=["GET"]) 71 | @app.route("/scrape", methods=["GET"]) 72 | @app.route("/scrape/", methods=["GET"]) 73 | @app.route("/build-extractor", methods=["GET"]) 74 | @app.route("/download-data", methods=["GET"]) 75 | @app.route("/help", methods=["GET"]) 76 | def get_root(id=None): 77 | return send_from_directory("www/build", "index.html") 78 | 79 | 80 | @app.route("/", methods=["GET"]) 81 | def get_path(path): 82 | mimetypes = { 83 | '.wasm': 'application/wasm', 84 | } 85 | mimetype = mimetypes.get(path[-4:], None) 86 | return send_from_directory("www/build", path, mimetype=mimetype) 87 | 88 | 89 | @app.after_request 90 | def disable_cors(response): 91 | response.headers['Access-Control-Allow-Origin'] = '*' 92 | response.headers['Access-Control-Allow-Headers'] = '*' 93 | return response 94 | 95 | 96 | @app.route("/start", methods=["POST"]) 97 | def post_start(): 98 | """ 99 | This is the main endpoint for starting AutoScrape processes. This 100 | endpoint simply accepts the standard parameters as a JSON payload. 101 | Returns a status message and the scrape task ID, which can be used 102 | to query status or stop the scrape. 103 | 104 | Curl Example: 105 | curl http://localhost:5000/start -H 'content-type: application/json' \ 106 | --data '{"baseurl": "https://bxroberts.org",}' 107 | 108 | Success Returns: 109 | HTTP 200 OK 110 | {"status": "OK", "data": "SCRAPE-ID"} 111 | """ 112 | app.logger.debug("Starting AutoScrape job") 113 | args = request.get_json() 114 | app.logger.debug("Arguments: %s" % args) 115 | baseurl = args.pop("baseurl") 116 | # disables double logging in celery worker 117 | args["stdout"] = False 118 | args["loglevel"] = "DEBUG" 119 | args["output"] = os.environ.get( 120 | "AUTOSCRAPE_API_URL", 121 | "http://flask:5000/receive" 122 | ) 123 | app.logger.debug("Baseurl: %s" % baseurl) 124 | result = tasks.start.apply_async((baseurl, args)) 125 | app.logger.debug("Result: %s" % result) 126 | return jsonify({"status": "OK", "data": result.id}) 127 | 128 | 129 | @app.route("/status/", methods=["GET"]) 130 | def get_status(id): 131 | """ 132 | Get status about a running AutoScrape task specified by 133 | its task ID. 
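    Curl Example:
        curl http://localhost:5000/status/SCRAPE-ID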
134 | 135 | HTTP GET /status/SCRAPE-ID 136 | 137 | Success Returns: 138 | HTTP 200 OK 139 | {"status": "OK", "message": "STARTED", "traceback": None} 140 | """ 141 | result = tasks.app.AsyncResult(id) 142 | data = Data.query.filter_by( 143 | task_id=id, 144 | fileclass="screenshot" 145 | ).order_by( 146 | Data.timestamp.desc() 147 | ).first() 148 | app.logger.debug("Task state: %s" % result.state) 149 | response = { 150 | "status": "OK", 151 | "message": result.state, 152 | } 153 | if result.traceback: 154 | response["traceback"] = result.traceback 155 | if data: 156 | app.logger.debug("Data: %s" % data) 157 | response["data"] = data.data 158 | response["url"] = data.url 159 | return jsonify(response) 160 | 161 | 162 | @app.route("/stop/", methods=["POST"]) 163 | def get_stop(id): 164 | """ 165 | Stop a running AutoScrape task specified by a task ID. 166 | 167 | HTTP POST /stop/SCRAPE-ID 168 | [no data required] 169 | 170 | Success Returns: 171 | HTTP 200 OK 172 | {"status": "OK"} 173 | """ 174 | app.logger.debug("Stopping scraper task: %s" % id) 175 | result = tasks.app.AsyncResult(id) 176 | result.revoke(terminate=True, signal='SIGKILL') 177 | return jsonify({"status": "OK"}) 178 | 179 | 180 | @app.route("/receive/", methods=["POST"]) 181 | def receive_data(id): 182 | """ 183 | This is a callback endpoint for receiving scrape data from 184 | a running AutoScrape instance, configured to send its data 185 | to this endpoint. 186 | 187 | HTTP POST /receive 188 | { 189 | "name": "crawl_data/some_file_name.html", 190 | "data": "base64-encoded-file-data", 191 | "fileclass": "crawl_data|screenshots|downloads|..." 192 | } 193 | """ 194 | app.logger.debug("Task ID : %s" % id) 195 | args = request.get_json() 196 | name = args["name"] 197 | app.logger.debug("Name: %s" % name) 198 | fileclass = args["fileclass"] 199 | app.logger.debug("File class: %s" % (fileclass)) 200 | url = args["url"] 201 | app.logger.debug("URL: %s" % (url)) 202 | 203 | try: 204 | data = args["data"] 205 | app.logger.debug("Data: %s" % len(data)) 206 | # app.logger.debug("Decoded: %s" % decoded) 207 | except Exception as e: 208 | app.logger.debug("Error parsing POST JSON: %s" % e) 209 | data = None 210 | fileclass = None 211 | 212 | # TODO: write b64 data to postgres under task ID key 213 | scraped_data = Data(id, name, fileclass, data, url) 214 | db.session.add(scraped_data) 215 | db.session.commit() 216 | app.logger.debug("Updated task state") 217 | 218 | # TODO: store/dispatch this data somewhere 219 | return jsonify({"status": "OK"}) 220 | 221 | 222 | @app.route("/files/list/", methods=["GET"]) 223 | def list_files(id): 224 | """ 225 | Get a directory listing for a scrape's data, with 226 | an optional fileclass query param (only look at downloads, 227 | crawl_data, data_files, etc). Defaults to *all* data 228 | scraped, ordered by date. 
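    Example: GET /files/list/SCRAPE-ID?fileclass=downloads&page=2
    (both query parameters are optional; results are paginated)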
229 | """ 230 | filter_params = { 231 | "task_id": id, 232 | } 233 | 234 | fileclass = request.args.get("fileclass") 235 | if fileclass: 236 | filter_params["fileclass"] = fileclass 237 | 238 | page = int(request.args.get("page", 1)) 239 | pagination = Data.query.filter_by( 240 | **filter_params 241 | ).order_by( 242 | Data.timestamp.desc() 243 | ).paginate(page=page, error_out=False) 244 | 245 | return jsonify({ 246 | "status": "OK", 247 | "has_next": pagination.has_next, 248 | "has_prev": pagination.has_prev, 249 | "page": pagination.page, 250 | "data": [d.serialize for d in pagination.items] 251 | }) 252 | 253 | 254 | @app.route("/files/data//", methods=["GET"]) 255 | def get_file_data(task_id, file_id): 256 | """ 257 | Get the raw data for an individual file. 258 | """ 259 | app.logger.debug("Fetching task_id: %s, file_id: %s" % ( 260 | task_id, file_id)) 261 | 262 | data = Data.query.filter_by( 263 | task_id=task_id, 264 | id=file_id 265 | ).order_by( 266 | Data.timestamp.desc() 267 | ).first() 268 | 269 | app.logger.debug("Data: %s" % data) 270 | 271 | return jsonify({ 272 | "status": "OK", 273 | "data": { 274 | "scrape_id": task_id, 275 | "id": file_id, 276 | "name": data.name, 277 | "timestamp": data.timestamp, 278 | "data": data.data, 279 | "fileclass": data.fileclass, 280 | "url": data.url, 281 | } 282 | }) 283 | 284 | 285 | if __name__ == "__main__": 286 | db.create_all() 287 | app.run(host='0.0.0.0', port=5000) 288 | -------------------------------------------------------------------------------- /autoscrape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | from autoscrape.cli import scrape 4 | 5 | 6 | if __name__ == "__main__": 7 | scrape.main() 8 | -------------------------------------------------------------------------------- /autoscrape/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # flake8: noqa: F401 3 | __title__ = 'autoscrape-py' 4 | __author__ = 'Brandon Roberts (brandon@bxroberts.org)' 5 | __license__ = 'AGPLv3' 6 | __version__ = '1.6.14' 7 | 8 | 9 | from autoscrape.scrapers.test import TestScraper 10 | from autoscrape.scrapers.null import NullScraper 11 | from autoscrape.scrapers.manual import ManualControlScraper 12 | 13 | 14 | """ 15 | COMMAND Logical Control Flow Step 16 | -------- --------------------------------------------------------------- 17 | INIT (url) initialize & get entry point 18 | │ 19 | ↓ 20 | load page 🠤───────────────────┐ 21 | │ │ 22 | GET_CLICKABLE │ click a link based on likelihood 23 | SELECT_LINK (index) │ of finding a search form 24 | ↓ │ 25 | GET_FORMS ┌────🠦 look for search form (possibly classifier) ───┘ 26 | │ │ 27 | │ │ FOUND 28 | │ ↓ 29 | GET_INPUTS │ identify forms on page that require input 30 | │ (begin with config then move to heuristic then ML) 31 | │ │ 32 | │ ↓ 33 | │ initialize iterators for required inputs 34 | │ (begin with config/brute force, then RL) 35 | │ │ 36 | │ ↓ 37 | └─────── are we at the end of our iterators? 
38 | YES │ 39 | ↓ 40 | INPUT (index, chars) enter data into form inputs 🠤───────┐ 41 | │ │ 42 | ↓ │ 43 | SUBMIT (index) submit form and load next page │ 44 | │ │ 45 | ↓ │ 46 | ┌──────🠦 scrape the page │ 47 | │ │ │ 48 | │ ↓ │ 49 | GET_LINKS │ look for a next button ───────────┘ 50 | │ (classifier) NOT FOUND 51 | │ │ 52 | │ │ YES 53 | │ ↓ 54 | SELECT_LINK (index) └─── click the next button & load page 55 | """ 56 | -------------------------------------------------------------------------------- /autoscrape/backends/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/backends/__init__.py -------------------------------------------------------------------------------- /autoscrape/backends/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/backends/base/__init__.py -------------------------------------------------------------------------------- /autoscrape/backends/base/browser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from autoscrape.backends.base.tags import TaggerBase 3 | 4 | 5 | class BrowserBase(TaggerBase): 6 | def _no_tags(self, data, l_type="path"): 7 | clean = [] 8 | if type(data) == tuple: 9 | data = list(data) 10 | for p in data: 11 | name, t_args, kwargs = p 12 | args = list(t_args) 13 | if name == "click": 14 | if not args: 15 | continue 16 | args[0] = "[tag]" 17 | clean.append((name, args, kwargs)) 18 | return clean 19 | 20 | def click(self, tag, **kwargs): 21 | self.path.append(( 22 | "click", [tag], {"url": url} 23 | )) 24 | node = "Click\n text: %s\n hash: %s" % (text, hash) 25 | node_meta = { 26 | "click": tag, 27 | "click_text": text, 28 | "click_iterating_form": None, 29 | } 30 | self.graph.add_node( 31 | node, 32 | **node_meta 33 | ) 34 | self.graph.move_to_node(node) 35 | 36 | def fetch(self, url, initial=False): 37 | self.graph.add_root_node(node, url=url, action="fetch") 38 | 39 | def back(self): 40 | self.graph.move_to_parent() 41 | 42 | def input(self, tag, input): 43 | self.path.append(("input", ("", input,), {})) 44 | action = { 45 | "action": "input", 46 | "text": input, 47 | "tag": tag, 48 | } 49 | self.graph.add_action_to_current(action) 50 | 51 | def submit(self, tag, add_node=True): 52 | self.path.append(("submit", (tag,), {})) 53 | node = "Submit\n tag: %s" % (tag) 54 | node_meta = { 55 | "submit": tag, 56 | } 57 | self.graph.add_node(node, **node_meta) 58 | self.graph.move_to_node(node) 59 | 60 | @property 61 | def infinite_loop_detected(self): 62 | return False 63 | -------------------------------------------------------------------------------- /autoscrape/backends/base/dom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import logging 3 | import os 4 | import re 5 | import urllib 6 | 7 | from autoscrape.util import write_file, get_filename_from_url 8 | 9 | 10 | logger = logging.getLogger('AUTOSCRAPE') 11 | 12 | 13 | class DomBase: 14 | """ 15 | Stateful base of a web scraper. This class deals with finding and interacting 16 | with elements and tags. It also holds the base state variables like 17 | current url. 
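    Concrete backends (requests, selenium, warc) are expected to subclass
    this and implement the element_*/elements_by_path accessors below.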
18 | """ 19 | 20 | def __init__(self, leave_host=False, current_url=None, current_html=None): 21 | self.leave_host = leave_host 22 | self.current_url = current_url 23 | self.current_html = current_html 24 | 25 | def elements_by_path(self, path, from_element=None): 26 | """ 27 | Return element nodes matching a path where path could be xpath, 28 | css, etc, depending on the backend) 29 | """ 30 | raise NotImplementedError("DomBase.elements_by_path not implemented") 31 | 32 | def element_attr(self, element, name, default=None): 33 | """ 34 | For a given element and attribute name, return the value if it 35 | exists. 36 | """ 37 | raise NotImplementedError("DomBase.element_attr not implemented") 38 | 39 | def element_by_tag(self, tag): 40 | """ 41 | For a given tag, return the specified element. 42 | """ 43 | raise NotImplementedError("DomBase.element_by_tag not implemented") 44 | 45 | def get_stylesheet(self): 46 | """ 47 | Return the text of all loaded CSS stylesheets. 48 | """ 49 | raise NotImplementedError("DomBase.get_stylesheet not implemented") 50 | 51 | def element_tag_name(self): 52 | """ 53 | Return the tag name of the given element. 54 | """ 55 | raise NotImplementedError("DomBase.element_tag_name not implemented") 56 | 57 | def element_text(self, element, block=False): 58 | """ 59 | Return the text of an element, or the combined text of all its 60 | descendants (if block=True). 61 | """ 62 | raise NotImplementedError("DomBase.element_text not implemented") 63 | 64 | def element_value(self, element): 65 | """ 66 | Return the text value of an element (e.g., input element). Since 67 | this is usually called like element.value() or element.value, we 68 | wrap this functionality here. 69 | """ 70 | if not hasattr(element, "value"): 71 | raise NotImplementedError("DomBase.element_value not implemented") 72 | return element.value 73 | 74 | def element_name(self, element): 75 | """ 76 | Return the name of an element (e.g., input element). """ 77 | if not hasattr(element, "name"): 78 | raise NotImplementedError("DomBase.element_name not implemented") 79 | return element.name 80 | 81 | def element_displayed(self, element): 82 | """ 83 | Figure out if an element is displayed. If its not, we can't really 84 | interact with it (get its text or attribs, etc). This only effects 85 | the selenium backend right now. 86 | """ 87 | return True 88 | 89 | def download_file(self, url, return_data=False): 90 | """ 91 | Fetch the given url, returning a byte stream of the page data. This 92 | really is only useful in situations where the scraper is on a binary 93 | filetype, such as PDF, etc. 94 | 95 | Note that we're doing this as opposed to some XHR thing inside the 96 | selenium driver due to CORS issues. 97 | """ 98 | logger.debug("Fetching non-HTML page directly: %s" % url) 99 | user_agent = ( 100 | "Mozilla/5.0 " 101 | "(Windows NT 10.0; Win64; x64; rv:62.0) " 102 | "Gecko/20100101 Firefox/62.0" 103 | ) 104 | request = urllib.request.Request(url, headers={ 105 | "User-Agent": user_agent, 106 | "Referrer": self.page_url, 107 | }) 108 | 109 | try: 110 | response = urllib.request.urlopen(request) 111 | except urllib.error.HTTPError as e: 112 | logger.debug("[!] 
HTTP error while downloading: %s" % (e)) 113 | return 114 | 115 | data = response.read() 116 | action = { 117 | "action": "download_file", 118 | "url": url, 119 | } 120 | self.graph.add_action_to_current(action) 121 | if return_data: 122 | return data 123 | 124 | # always keep filename for downloads, for now 125 | if re.match("^https?://", self.output): 126 | dl_dir = "downloads" 127 | else: 128 | dl_dir = os.path.join(self.output, "downloads") 129 | 130 | parsed_filename = get_filename_from_url(url) 131 | logger.debug("Parsed output filename: %s" % parsed_filename) 132 | filepath = os.path.join(dl_dir, parsed_filename) 133 | write_file( 134 | filepath, data, fileclass="download", writetype="wb", 135 | output=self.output, url=self.page_url, 136 | ) 137 | 138 | def _no_tags(self, list, l_type="path"): 139 | clean = [] 140 | for p in list: 141 | name, args, kwargs = p 142 | if name == "click": 143 | args[0] = "tag" 144 | clean.append((name, args, kwargs)) 145 | return clean 146 | -------------------------------------------------------------------------------- /autoscrape/backends/base/graph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/backends/base/graph.py -------------------------------------------------------------------------------- /autoscrape/backends/base/tags.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from urllib.parse import urlparse 5 | 6 | from autoscrape.backends.base.dom import DomBase 7 | 8 | 9 | logger = logging.getLogger('AUTOSCRAPE') 10 | 11 | 12 | class TaggerBase(DomBase): 13 | """ 14 | Generates tags from a given page that can be used, in a stateless manner, 15 | to refer to unique elements on a web page. 16 | """ 17 | 18 | def __init__(self, **kwargs): 19 | super().__init__(**kwargs) 20 | self.base_host = urlparse(self.current_url).netloc 21 | 22 | def tag_from_element(self, element): 23 | """ 24 | For a given element, return a path (a.k.a. a "tag") leading 25 | to it. Path can be CSS, XPath, or whatever the backend supports. 26 | """ 27 | raise NotImplementedError("Tagger.tag_from_element not implemented") 28 | 29 | def clickable_sanity_check(self, element, href=None): 30 | """ 31 | Check a series of element nodes, checking their attributes and other 32 | attributes, determining if a element is actually 'clickable'. This 33 | check determines which nodes will end up as clickable options for 34 | the scraper on this page. 35 | """ 36 | if not href: 37 | raw_href = self.element_attr(element, "href") 38 | if not raw_href: 39 | return False 40 | 41 | if hasattr(self, "_normalize_url"): 42 | href = self._normalize_url(raw_href).split("#")[0] 43 | else: 44 | href = raw_href 45 | 46 | if href.split("#")[0] == self.current_url: 47 | return False 48 | 49 | # skip any weird protos ... we whitelist notrmal HTTP, 50 | # anchor tags and blank tags (to support JavaScript & btns) 51 | if href and \ 52 | not href.startswith("//") and \ 53 | not href.startswith("https:") and \ 54 | not href.startswith("http:") and \ 55 | not href.startswith("javascript"): 56 | return False 57 | 58 | # Don't leave base host ... configurable? 
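        # (leave_host corresponds to the --leave-host CLI option; unless it
        # is set, links whose host differs from the BASEURL host are skipped)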
59 | elem_host = urlparse(href).netloc 60 | if elem_host and not self.leave_host and elem_host != self.base_host: 61 | return False 62 | 63 | return True 64 | 65 | def get_clickable(self, path="//a"): 66 | """ 67 | Get all clickable element tags on the current page. 68 | """ 69 | tags = [] 70 | a_elems = self.elements_by_path(path) 71 | for element in a_elems: 72 | if not self.clickable_sanity_check(element): 73 | continue 74 | 75 | tag = self.tag_from_element(element) 76 | # No way to get back to here, so we can't use it 77 | if not tag: 78 | logger.warn("No tag for element %s" % (element)) 79 | return False 80 | 81 | tags.append(tag) 82 | return tags 83 | 84 | def get_inputs(self, form=None, itype=None, root_node=None): 85 | """ 86 | Get inputs, either for full page or by a form WebElement. 87 | Returns a list of tags. itype can be one of "text", "select", 88 | "checkbox", or None (all types), indicating the type of input. 89 | """ 90 | x_path = "//input" 91 | if itype == "select": 92 | x_path = "//select" 93 | elif itype == "date": 94 | x_path = "//input[@type='date']|//input[@structure='date']" 95 | elif itype: 96 | x_path = "//input[@type='%s']" % (itype) 97 | 98 | elem = root_node 99 | tags = [] 100 | if form is not None: 101 | elem = form 102 | x_path = ".%s" % x_path 103 | 104 | # radio checkboxes are grouped by name 105 | if itype == "radio": 106 | # store the radio group position by name here 107 | radio_names = [] 108 | elems = self.elements_by_path(x_path, from_element=elem) 109 | for radio in elems: 110 | name = self.element_attr(radio, "name") 111 | if name not in radio_names: 112 | radio_names.append(name) 113 | tags.append([]) 114 | radio_group_ix = radio_names.index(name) 115 | radio_tag = self.tag_from_element(radio) 116 | if not radio_tag: 117 | logger.warn("No tag for radio %s" % radio) 118 | continue 119 | tags[radio_group_ix].append(radio_tag) 120 | 121 | # all the rest are flat 122 | else: 123 | elems = self.elements_by_path(x_path, from_element=elem) 124 | for inp in elems: 125 | # this should weed out hidden/non-displayed inputs ... 126 | if not self.element_displayed(inp): 127 | continue 128 | # .. and just in case it didn't 129 | if self.element_attr(inp, "type") == "hidden": 130 | continue 131 | input_tag = self.tag_from_element(inp) 132 | if not input_tag: 133 | logger.warn("No tag for input %s" % input) 134 | continue 135 | 136 | tags.append(input_tag) 137 | 138 | return tags 139 | 140 | def get_forms(self): 141 | """ 142 | Get all tags to forms on a page and their respective text 143 | inputs. Tags are returned in a dict, with the form tag as 144 | the key and a list of input CSSPaths under the form. 145 | """ 146 | x_path = "//form" 147 | forms = self.elements_by_path(x_path) 148 | 149 | tags = {} 150 | for elem in forms: 151 | if not self.element_displayed(elem): 152 | continue 153 | tag = self.tag_from_element(elem) 154 | if not tag: 155 | logger.warn("No tag for element %s" % elem) 156 | continue 157 | 158 | tags[tag] = [ 159 | self.get_inputs(form=elem, itype="text"), 160 | self.get_inputs(form=elem, itype="select"), 161 | self.get_inputs(form=elem, itype="checkbox"), 162 | self.get_inputs(form=elem, itype="date"), 163 | self.get_inputs(form=elem, itype="radio"), 164 | ] 165 | 166 | return tags 167 | 168 | def get_buttons(self, in_form=False, path=None): 169 | """ 170 | Return all tags leading to a form link, button, or submit input button, 171 | optionally given a base form to look from. This is used to identify 172 | clickable things related to forms. 
173 | """ 174 | x_path = path or "|".join([ 175 | "//form//a", "//button", "//input[@type='button']", 176 | "//input[@type='submit']", "//table//a", 177 | ]) 178 | btns = self.elements_by_path(x_path) 179 | 180 | tags = [] 181 | for elem in btns: 182 | if not self.element_displayed(elem): 183 | continue 184 | tag = self.tag_from_element(elem) 185 | if not tag: 186 | logger.warn("No tag for element %s" % elem) 187 | continue 188 | tags.append(tag) 189 | return tags 190 | -------------------------------------------------------------------------------- /autoscrape/backends/requests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/backends/requests/__init__.py -------------------------------------------------------------------------------- /autoscrape/backends/requests/browser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import logging 3 | import re 4 | import time 5 | 6 | import requests 7 | from autoscrape.backends.base.browser import BrowserBase 8 | from autoscrape.backends.requests.tags import Tagger 9 | from autoscrape.search.graph import Graph 10 | 11 | 12 | logger = logging.getLogger('AUTOSCRAPE') 13 | 14 | 15 | class RequestsBrowser(BrowserBase, Tagger): 16 | """ 17 | A simple HTTP-requests based scraper, currently capable of only 18 | doing crawls and very basic HTTP Post requests, but is between 19 | 2x and 5x faster. 20 | 21 | bxroberts.org full crawl - requests backend 22 | real 0m34.066s 23 | user 0m23.062s 24 | sys 0m0.640s 25 | 26 | bxroberts.org full crawl - selenium backend 27 | real 2m5.373s 28 | user 1m0.872s 29 | sys 0m10.976s 30 | """ 31 | 32 | def __init__(self, leave_host=False, **kwargs): 33 | # requests Session 34 | self.s = requests.Session() 35 | self.s.headers.update({ 36 | "User-Agent": ( 37 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0)" 38 | "Gecko/20100101 Firefox/35.0" 39 | ) 40 | }) 41 | 42 | # set of clicked elements 43 | self.visited = set() 44 | 45 | # queue of the path that led us to the current page 46 | # this is in the form of (command, *args, **kwargs) 47 | self.path = [] 48 | 49 | # tree building 50 | self.graph = Graph() 51 | 52 | # setting to False, ensures crawl will stay on same host 53 | self.leave_host = leave_host 54 | 55 | self.current_url = None 56 | self.current_html = None 57 | 58 | def _no_endslash_url(self, url): 59 | # remove trailing slash 60 | url_noslash = re.sub(r"/$", "", url) 61 | # and trailing slash before query params 62 | return re.sub(r"/&", "&", url_noslash) 63 | 64 | def _check_and_set_visited(self, url): 65 | """ 66 | Take a URL and see if we've visited it (or a slash-ending 67 | variation) if we haven't this sets the URL to the 68 | visited set. Returns True if we've visited False if not. 69 | """ 70 | url_no_endslash = self._no_endslash_url(url) 71 | if url_no_endslash in self.visited or url in self.visited: 72 | return True 73 | self.visited.add(url) 74 | self.visited.add(url_no_endslash) 75 | return False 76 | 77 | def click(self, tag, **kwargs): 78 | element = self.element_by_tag(tag) 79 | text = self.element_text(element) 80 | url = None 81 | tag_name = self.element_tag_name(element) 82 | logger.debug(" - tag name: %s Text: %s" % (tag_name, text)) 83 | if tag_name == "a": 84 | raw_href = self.element_attr(element, "href") 85 | if not raw_href: 86 | logger.debug("[!] 
No HREF, skipping link...") 87 | return False 88 | 89 | url = self._normalize_url(raw_href) 90 | if self._check_and_set_visited(url): 91 | logger.debug("[!] Already visited URL %s" % (url)) 92 | return False 93 | 94 | logger.info("[+] Clicking link: %s" % url) 95 | if not self.fetch(url): 96 | logger.debug("[!] Fetch failed on %s" % (url)) 97 | return False 98 | elif tag_name == "input": 99 | element_type = element.type 100 | if element_type == "submit": 101 | parent_form = element.xpath(".//ancestor::form")[0] 102 | parent_form_tag = self.tag_from_element(parent_form) 103 | self.submit(parent_form_tag, add_node=False) 104 | url = self.current_url 105 | if self._check_and_set_visited(url): 106 | return False 107 | 108 | elif tag_name == "iframe": 109 | raw_href = self.element_attr(element, "src") 110 | if not raw_href: 111 | return False 112 | 113 | url = self._normalize_url(raw_href) 114 | if self._check_and_set_visited(url): 115 | return False 116 | 117 | logger.info("[+] Fetching iframe: %s" % url) 118 | if not self.fetch(url): 119 | return False 120 | else: 121 | raise NotImplementedError( 122 | "click not implemented for element: %s" % (tag_name) 123 | ) 124 | 125 | self.path.append(( 126 | "click", [tag], {"url": url} 127 | )) 128 | node = "Click\n text: %s\n URL: %s" % (text, url) 129 | node_meta = { 130 | "click": tag, 131 | "click_text": text, 132 | "click_iterating_form": None, 133 | } 134 | self.graph.add_node( 135 | node, 136 | **node_meta 137 | ) 138 | self.graph.move_to_node(node) 139 | return True 140 | 141 | def fetch(self, url, initial=False): 142 | """ 143 | Fetch a page from a given URL (entry point, typically). Most of 144 | the time we just want to click a link or submit a form using 145 | webdriver. 146 | """ 147 | logger.info("%s Fetching url=%s initial=%s" % ( 148 | ("[+]" if initial else " -"), url, initial, 149 | )) 150 | retries = 3 151 | success = True 152 | while True: 153 | try: 154 | response = self.s.get(url) 155 | break 156 | except requests.exceptions.ConnectionError as e: 157 | logger.error(" ! Connection error retrying...") 158 | logger.error(e) 159 | if not retries: 160 | logger.error(" ! Connection error, skipping URL...") 161 | return False 162 | time.sleep(30) 163 | retries -= 1 164 | 165 | if not response.text: 166 | logger.error(" ! Blank response. Skipping URL...") 167 | return False 168 | 169 | # Requests' encoding detection is faulty. 
The following 170 | # block will fix most issues 171 | if response.encoding and "utf" not in response.encoding.lower(): 172 | response.encoding = response.apparent_encoding 173 | self.current_html = response.text 174 | # this check fixes improper decoding of UTF byte order mark 175 | if self.current_html[:3] == "": 176 | self.current_html = self.current_html.encode( 177 | response.encoding 178 | ).decode("utf-8-sig") 179 | 180 | self.current_url = response.url 181 | self.dom = self._get_dom() 182 | 183 | if initial: 184 | self.path.append(("fetch", [url], {"initial": initial})) 185 | node = "Fetch\n url: %s" % url 186 | self.graph.add_root_node(node, url=url, action="fetch") 187 | 188 | return True 189 | 190 | def back(self): 191 | logger.info("[+] Going back...") 192 | logger.debug(" - current path-length=%s path=%s" % ( 193 | len(self.path), self._no_tags(self.path), 194 | )) 195 | # We're now where we started from 196 | self.path.pop() 197 | if not self.path: 198 | self.path = [] 199 | return 200 | 201 | prev = self.path[-1] 202 | if prev[0] == "fetch": 203 | self.graph.move_to_parent() 204 | self.fetch(prev[1][0]) 205 | 206 | elif prev[0] == "click": 207 | self.graph.move_to_parent() 208 | self.fetch(prev[2]["url"]) 209 | 210 | @property 211 | def page_html(self): 212 | return self.current_html 213 | 214 | @property 215 | def page_url(self): 216 | return self.current_url 217 | 218 | def get_clickable(self, **kwargs): 219 | logger.debug(" - Getting clickable...") 220 | self.dom = self._get_dom() 221 | tagger = Tagger( 222 | current_html=self.current_html, 223 | current_url=self.current_url, 224 | leave_host=self.leave_host, 225 | ) 226 | clickable = tagger.get_clickable() 227 | return clickable 228 | 229 | # def download_file(self, url): 230 | # response = self.s.get(url) 231 | # action = { 232 | # "action": "download_page", 233 | # "url": url, 234 | # } 235 | # self.graph.add_action_to_current(action) 236 | # return response.text 237 | 238 | def input(self, tag, input): 239 | """ 240 | Enter some input into an element by a given tag. 241 | """ 242 | logger.info("[+] Injecting text \"%s\" to input" % (input)) 243 | elem = self.element_by_tag(tag) 244 | 245 | input_name = self.element_name(elem) 246 | value = self.element_value(elem) 247 | logger.debug("Input name=%s value=%s" % (input_name, value)) 248 | 249 | elem.attrib["value"] = input 250 | 251 | self.path.append(("input", ("", input,), {})) 252 | action = { 253 | "action": "input", 254 | "text": input, 255 | "tag": tag, 256 | } 257 | self.graph.add_action_to_current(action) 258 | 259 | def submit(self, tag, add_node=True): 260 | """ 261 | Submit a form, by extracting the method and url, then 262 | constructing the params and sending the request, along 263 | with the form data. 
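        Note: only named input elements are serialized into the request at
        the moment (see the TODO on non-text input types below).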
264 | """ 265 | form = self.element_by_tag(tag) 266 | inputs = self.elements_by_path("//input", from_element=form) 267 | 268 | data = {} 269 | for input in inputs: 270 | # TODO: handle non-text type inputs 271 | name = self.element_name(input) 272 | if not name: 273 | continue 274 | value = self.element_value(input) 275 | data[name] = value 276 | 277 | action = self.element_attr(form, "action", default=self.current_url) 278 | method = self.element_attr(form, "method", default="get") 279 | url = self._normalize_url(action) 280 | 281 | request_kwargs = {} 282 | if method.lower() == "post": 283 | request_kwargs["data"] = data 284 | elif method.lower() == "get": 285 | params = [] 286 | for key in data.keys(): 287 | params.append([key, data.get(key, "")]) 288 | request_kwargs["params"] = params 289 | 290 | request = requests.Request( 291 | method, 292 | url, 293 | **request_kwargs 294 | ) 295 | prepped = request.prepare() 296 | response = self.s.send(prepped) 297 | self.current_url = response.url 298 | self.current_html = response.text 299 | self.dom = self._get_dom() 300 | 301 | # TODO: all higher level stuff 302 | if add_node: 303 | self.path.append(("submit", (tag,), {})) 304 | node = "Submit\n tag: %s" % (tag) 305 | node_meta = { 306 | "submit": tag, 307 | } 308 | self.graph.add_node(node, **node_meta) 309 | self.graph.move_to_node(node) 310 | 311 | def get_screenshot(self): 312 | return None 313 | -------------------------------------------------------------------------------- /autoscrape/backends/requests/dom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import lxml.html 3 | # use threads so we can run autoscrape inside celery 4 | from multiprocessing.pool import ThreadPool 5 | 6 | import requests 7 | from urllib.parse import urlparse, ParseResult 8 | 9 | from autoscrape.backends.base.dom import DomBase 10 | 11 | 12 | def download_stylesheet(css_url): 13 | response = requests.get(css_url) 14 | data = response.text 15 | if type(data) == bytes: 16 | return data.decode("utf-8") 17 | return data 18 | 19 | 20 | class Dom(DomBase): 21 | def __init__(self, **kwargs): 22 | super().__init__(**kwargs) 23 | self.dom = self._get_dom() 24 | 25 | def _get_dom(self): 26 | dom = None 27 | try: 28 | dom = lxml.html.fromstring(self.current_html) 29 | # this handles trying to load XML, RSS feed, etc 30 | except ValueError as e: 31 | if "Please use bytes input" in str(e): 32 | html_b = bytes(self.current_html, encoding="utf-8") 33 | dom = lxml.html.fromstring(html_b) 34 | else: 35 | raise e 36 | # if our page's HTML is just an element, like an 37 | # iframe, without a body or html then lxml will 38 | # return an element surrounded by a body and html. 39 | # so here we make element the root and use that as 40 | # our base DOM. 
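        # For example (illustrative): lxml.html.fromstring("<iframe src='x'/>")
        # may hand back just the inner element, wrapped in generated
        # <html>/<body> parents; the loop below climbs getparent() until no
        # parent remains, so later xpath()/cssselect() calls always run
        # against the full tree.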
41 | while True: 42 | parent = dom.getparent() 43 | if parent is None: 44 | break 45 | dom = parent 46 | return dom 47 | 48 | def element_attr(self, element, name, default=None): 49 | if not element.attrib: 50 | return default 51 | return element.attrib.get(name, default) 52 | 53 | def element_by_tag(self, tag): 54 | elements = self.dom.cssselect(tag) 55 | if not elements: 56 | return None 57 | return elements[0] 58 | 59 | def elements_by_path(self, xpath, from_element=None): 60 | if from_element is None: 61 | return self.dom.xpath(xpath) 62 | return from_element.xpath(xpath) 63 | 64 | def get_stylesheet(self, fetch_css=False): 65 | stylesheet_urls = [] 66 | for link in self.dom.xpath("//link"): 67 | if not link.attrib: 68 | continue 69 | l_type = link.attrib.get("type") 70 | l_rel = link.attrib.get("rel") 71 | l_href = link.attrib.get("href") 72 | if l_type != "text/css" and l_rel != "stylesheet": 73 | continue 74 | css_url = self._normalize_url(l_href) 75 | stylesheet_urls.append(css_url) 76 | 77 | pool = None 78 | try: 79 | pool = ThreadPool(8) 80 | except OSError: 81 | # operating system/container doesn't support threading 82 | pass 83 | 84 | if pool is not None: 85 | results = pool.map(download_stylesheet, stylesheet_urls) 86 | pool.close() 87 | # fallback to single threaded in case of threading not permitted 88 | else: 89 | results = [] 90 | for css_url in stylesheet_urls: 91 | results.append(download_stylesheet(css_url)) 92 | 93 | css = "\n".join(results) 94 | for style in self.dom.xpath("style"): 95 | css += style.text_content() 96 | return css 97 | 98 | def _normalize_url(self, url): 99 | argnames = ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'] 100 | inheritable = ['scheme', 'netloc', 'path'] 101 | parsed_current_url = urlparse(self.current_url) 102 | parsed_url = urlparse(url) 103 | 104 | args = [] 105 | for argname in argnames: 106 | value = getattr(parsed_url, argname, None) 107 | if not value and argname in inheritable: 108 | value = getattr(parsed_current_url, argname, '') 109 | args.append(value) 110 | 111 | pr = ParseResult(*args) 112 | normalized = pr.geturl() 113 | return normalized 114 | 115 | def element_text(self, element, block=False): 116 | if block and element is not None: 117 | return element.text_content() 118 | if element is None: 119 | return '' 120 | text = element.text 121 | if not text: 122 | return '' 123 | return text 124 | 125 | def element_name(self, element): 126 | return element.name 127 | 128 | def element_tag_name(self, element): 129 | if element is None: 130 | return "" 131 | return element.tag 132 | -------------------------------------------------------------------------------- /autoscrape/backends/requests/tags.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import logging 3 | 4 | from autoscrape.backends.base.tags import TaggerBase 5 | from autoscrape.backends.requests.dom import Dom 6 | 7 | 8 | logger = logging.getLogger('AUTOSCRAPE') 9 | 10 | 11 | class Tagger(TaggerBase, Dom): 12 | def tag_from_element(self, el): 13 | path = [] 14 | while el is not None: 15 | nth = 1 16 | parent = el.getparent() 17 | children = [] 18 | if parent is not None: 19 | children = parent.getchildren() 20 | for child in children: 21 | if child == el: 22 | break 23 | if child.tag == el.tag: 24 | nth += 1 25 | selector = "%s:nth-of-type(%s)" % ( 26 | el.tag, nth 27 | ) 28 | path.insert(0, selector) 29 | el = parent 30 | tag = " > ".join(path) 31 | return tag 32 | 33 | def get_inputs(self, 
form=None, itype=None, root_node=None): 34 | return super().get_inputs(form=form, itype=itype, root_node=self.dom) 35 | 36 | def get_buttons(self, in_form=False, path=None): 37 | x_path = path or "|".join([ 38 | "//form//a", "//input[@type='submit']", "//table//a", 39 | ]) 40 | return super().get_buttons(in_form=in_form, path=x_path) 41 | 42 | def get_clickable(self, path=None): 43 | clickable = super().get_clickable(path="//a|//iframe") 44 | return clickable 45 | 46 | def clickable_sanity_check(self, element): 47 | raw_href = self.element_attr(element, "href") 48 | 49 | tag_name = self.element_tag_name(element) 50 | if tag_name == "iframe": 51 | raw_href = self.element_attr(element, "src") 52 | 53 | if not raw_href: 54 | return False 55 | 56 | href = self._normalize_url(raw_href).split("#")[0] 57 | if href.split("#")[0] == self.current_url: 58 | return False 59 | 60 | # skip any weird protos ... we whitelist notrmal HTTP, 61 | # anchor tags and blank tags (to support JavaScript & btns) 62 | if href and href.startswith("javascript"): 63 | return False 64 | 65 | return super().clickable_sanity_check(element, href=href) 66 | -------------------------------------------------------------------------------- /autoscrape/backends/selenium/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/backends/selenium/__init__.py -------------------------------------------------------------------------------- /autoscrape/backends/selenium/dom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | 5 | try: 6 | from selenium.common.exceptions import ( 7 | NoSuchElementException, StaleElementReferenceException, 8 | NoSuchFrameException 9 | ) 10 | except ModuleNotFoundError: 11 | # we haven't installed selenium backend deps 12 | pass 13 | 14 | from autoscrape.backends.base.dom import DomBase 15 | 16 | 17 | logger = logging.getLogger('AUTOSCRAPE') 18 | 19 | 20 | class FrameTransparentList(list): 21 | def __init__(self, *args, **kwargs): 22 | self.driver = kwargs.pop("driver") 23 | super().__init__(*args, **kwargs) 24 | 25 | def __getitem__(self, index): 26 | raw_item = super().__getitem__(index) 27 | if not isinstance(raw_item, list): 28 | self.driver.switch_to.default_content() 29 | return raw_item 30 | iframe_ix, element = raw_item 31 | self.driver.switch_to.frame(iframe_ix) 32 | return element 33 | 34 | def __iter__(self): 35 | for raw_item in list.__iter__(self): 36 | if not isinstance(raw_item, list): 37 | self.driver.switch_to.default_content() 38 | yield raw_item 39 | continue 40 | iframe_ix, element = raw_item 41 | self.driver.switch_to.frame(iframe_ix) 42 | yield element 43 | self.driver.switch_to.default_content() 44 | 45 | 46 | class Dom(DomBase): 47 | def element_attr(self, element, name, default=None): 48 | return element.get_attribute(name) 49 | 50 | def iframe_capable_lookup(self, tag): 51 | try: 52 | self.driver.switch_to.default_content() 53 | return self.driver.find_element_by_css_selector(tag) 54 | except NoSuchElementException: 55 | pass 56 | self.driver.switch_to.default_content() 57 | iframes = self.driver.find_elements_by_tag_name("iframe") 58 | for iframe_ix in range(len(iframes)): 59 | try: 60 | self.driver.switch_to.frame(iframe_ix) 61 | except NoSuchFrameException: 62 | continue 63 | try: 64 | return 
self.driver.find_element_by_css_selector(tag) 65 | except NoSuchElementException: 66 | continue 67 | self.driver.switch_to.default_content() 68 | logger.debug("[!] No element found for tag: %s" % (tag)) 69 | return None 70 | 71 | def element_by_tag(self, tag): 72 | """ 73 | Take a tag and return the corresponding live element in the DOM. 74 | """ 75 | inside_id = False 76 | # escaping logic 77 | newtag = "" 78 | for c in tag: 79 | if c == "#": 80 | inside_id = True 81 | newtag += c 82 | continue 83 | 84 | # end of ID 85 | elif inside_id and re.search("\s", c): 86 | inside_id = False 87 | 88 | elif inside_id and c in self.css_escapables: 89 | for escapable in self.css_escapables: 90 | c = "\%s" % escapable 91 | 92 | newtag += c 93 | 94 | if newtag != tag: 95 | logger.debug("Original tag: %s, newtag: %s" % (tag, newtag)) 96 | tag = newtag 97 | 98 | return self.iframe_capable_lookup(tag) 99 | 100 | def elements_by_path(self, xpath, from_element=None): 101 | """ 102 | Get all elements, across all iframes. We output a 103 | FrameTransparentList which is a normal list of elements, but it will 104 | switch to the correct frame upon accessing/iterating to 105 | each element. 106 | """ 107 | if from_element is None: 108 | from_element = self.driver 109 | return from_element.find_elements_by_xpath(xpath) 110 | iframes = from_element.find_elements_by_tag_name("iframe") 111 | if not len(iframes): 112 | return from_element.find_elements_by_xpath(xpath) 113 | # gather all elements from iframe 114 | elements = from_element.find_elements_by_xpath(xpath) 115 | for iframe_ix in range(len(iframes)): 116 | self.driver.switch_to.frame(iframe_ix) 117 | for el in self.driver.find_elements_by_xpath(xpath): 118 | elements.append([iframe_ix, el]) 119 | self.driver.switch_to.default_content() 120 | self.driver.switch_to.default_content() 121 | return FrameTransparentList(elements, driver=self.driver) 122 | 123 | def get_stylesheet(self): 124 | script = """ 125 | return [].slice.call(document.styleSheets) 126 | .reduce((prev, styleSheet) => { 127 | try { 128 | if (styleSheet.cssRules) { 129 | return prev + 130 | [].slice.call(styleSheet.cssRules) 131 | .reduce(function (prev, cssRule) { 132 | return prev + cssRule.cssText; 133 | }, ''); 134 | } else { 135 | return prev; 136 | } 137 | } catch (e) { 138 | return prev + `@import url("${styleSheet.href}");` 139 | } 140 | }, '');""" 141 | return self.driver.execute_script(script) 142 | 143 | def _text_via_many_means(self, el): 144 | text = [] 145 | try: 146 | txt = el.text 147 | if txt: 148 | text.append(txt.strip()) 149 | except Exception as e: 150 | logger.error("Error getting text element: %s, Err: %s" % ( 151 | el, e)) 152 | 153 | title = el.get_attribute("title") 154 | if title: 155 | text.append(title.strip()) 156 | 157 | try: 158 | placeholder = el.get_attribute("placeholder") 159 | if placeholder: 160 | text.append(placeholder.strip()) 161 | except Exception as e: 162 | logger.error("Error getting placeholder: %s, Error: %s" % ( 163 | el, e)) 164 | 165 | img_els = el.find_elements_by_tag_name("img") 166 | for img in img_els: 167 | try: 168 | text.append(img.get_attribute("alt")) 169 | except StaleElementReferenceException as e: 170 | logger.error("Error getting image text: %s, Error: %s" % ( 171 | img, e 172 | )) 173 | 174 | if self.element_tag_name(el) == "input": 175 | text.append(el.get_attribute("value")) 176 | 177 | return " ".join(text).replace("\n", "").strip() 178 | 179 | def element_text(self, element, block=False): 180 | """ 181 | Get the text for all 
elements either under a given element 182 | or for a whole page (if element == None) 183 | """ 184 | if not block and element is not None: 185 | return self._text_via_many_means(element) 186 | 187 | if element is None: 188 | element = self.driver 189 | 190 | return element.text 191 | 192 | def element_tag_name(self, element): 193 | if element is None: 194 | return "" 195 | return element.tag_name 196 | 197 | def element_value(self, element): 198 | return element.get_attribute("value") 199 | 200 | def element_displayed(self, element): 201 | fn_names = ["is_displayed", "is_enabled"] 202 | for fn_name in fn_names: 203 | if not hasattr(element, fn_name): 204 | continue 205 | try: 206 | if not getattr(element, fn_name)(): 207 | return False 208 | except StaleElementReferenceException as e: 209 | pass 210 | return True 211 | -------------------------------------------------------------------------------- /autoscrape/backends/selenium/tags.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from autoscrape.backends.base.tags import TaggerBase 5 | from autoscrape.backends.selenium.dom import Dom 6 | 7 | 8 | logger = logging.getLogger('AUTOSCRAPE') 9 | 10 | 11 | class Tagger(TaggerBase, Dom): 12 | """ 13 | Generates tags from a given page that can be used, in a stateless manner, 14 | to refer to unique elements on a web page. 15 | """ 16 | 17 | def __init__(self, driver=None, current_url=None, leave_host=False): 18 | super().__init__(current_url=current_url, leave_host=leave_host) 19 | self.driver = driver 20 | 21 | def tag_from_element(self, element): 22 | """ 23 | Takes a WebDriver element and returns an CSSPath for finding it 24 | in the future. As far as I know, this is only really feasible 25 | using JavaScript (without resorting to a complicated tree walking 26 | algorithm ... which we may need to do if this ends up failing). 27 | 28 | Modified from: https://stackoverflow.com/a/12222317 29 | """ 30 | script = """ 31 | var getPathTo = function(el) { 32 | if (!(el instanceof Element)) 33 | return; 34 | var path = []; 35 | while (el.nodeType === Node.ELEMENT_NODE) { 36 | // if (el.id) { 37 | // path.unshift(`#${el.id}`); 38 | // break; 39 | // } 40 | var selector = el.nodeName.toLowerCase(); 41 | // // NOTE: we removed this because web pages often use 42 | // // strange characters in ID names which cause the CSS 43 | // // selector to fail upon lookup. If we only use traversal 44 | // // methods, we don't have that webpage-specific problem 45 | // if (el.id) { 46 | // selector += '#' + el.id; 47 | // path.unshift(selector); 48 | // break; 49 | // } 50 | 51 | var sib = el, nth = 1; 52 | while (sib = sib.previousElementSibling) { 53 | if (sib.nodeName.toLowerCase() == selector) 54 | nth++; 55 | } 56 | 57 | // // NOTE: always give a nth-of-type tag, even if 58 | // // if there's only a single sibling, just to be 59 | // // extra-specific 60 | // if (nth != 1) 61 | 62 | selector += ":nth-of-type("+nth+")"; 63 | path.unshift(selector); 64 | el = el.parentNode; 65 | } 66 | return path.join(" > "); 67 | } 68 | 69 | // NOTE: this used to have a toLowerCase on it, but it caused 70 | // problems with some pages. Leaving it as it was found in the 71 | // original DOM is best here. 
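        // Illustrative: the returned path ends up looking something like
        //   "html:nth-of-type(1) > body:nth-of-type(1) > div:nth-of-type(3) > a:nth-of-type(1)"
        // which element_by_tag() can later resolve as a CSS selector.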
72 | return getPathTo(arguments[0]); //.toLowerCase(); 73 | """ 74 | return self.driver.execute_script(script, element) 75 | 76 | def clickable_sanity_check(self, element): 77 | try: 78 | if not self.element_displayed(element): 79 | return False 80 | except Exception as e: 81 | logger.debug("[!] Skipping sanity check due to failure: %s" % (e)) 82 | return False 83 | return super().clickable_sanity_check(element) 84 | 85 | def get_inputs(self, form=None, itype=None, root_node=None): 86 | return super().get_inputs(form=form, itype=itype, root_node=self.driver) 87 | 88 | def get_clickable(self, path=None): 89 | """ 90 | Get all clickable element tags on the current page. 91 | 92 | TODO: In the future we may need to recurse the page to find 93 | other clickable types like JS-enabled divs, etc. 94 | """ 95 | xpath = path or "|".join([ 96 | "//a", "//button", "//input[@type='submit']", 97 | "//input[@type='button']" 98 | ]) 99 | return super().get_clickable(path=xpath) 100 | -------------------------------------------------------------------------------- /autoscrape/backends/warc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/backends/warc/__init__.py -------------------------------------------------------------------------------- /autoscrape/backends/warc/browser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import io 3 | import logging 4 | import os 5 | import pickle 6 | import sys 7 | 8 | from autoscrape.backends.requests.browser import RequestsBrowser 9 | from autoscrape.backends.requests.tags import Tagger 10 | from autoscrape.search.graph import Graph 11 | from autoscrape.util.warc import build_warc_index, _warc_records 12 | 13 | 14 | logger = logging.getLogger('AUTOSCRAPE') 15 | 16 | 17 | try: 18 | import plyvel 19 | import warcio 20 | except ModuleNotFoundError: 21 | pass 22 | 23 | 24 | class WARCBrowser(RequestsBrowser): 25 | def __init__(self, warc_index_file=None, warc_directory=None, 26 | filter_domain=None, leave_host=False, **kwargs): 27 | try: 28 | warcio 29 | except NameError: 30 | logger.debug( 31 | "WARC dependencies not installed." 32 | " (Hint: pip install autoscrape[warc-backend])" 33 | " Exiting." 34 | ) 35 | sys.exit(1) 36 | 37 | no_dir_msg = "Error: No warc_directory specified for WARCBrowser" 38 | assert warc_directory is not None, no_dir_msg 39 | 40 | no_index_msg = "Error: No warc_index_file specified for WARCBrowser" 41 | assert warc_index_file is not None, no_index_msg 42 | 43 | # leveldb directory 44 | self.warc_index_file = warc_index_file 45 | # directory containing Common Crawl WARCs 46 | self.warc_directory = warc_directory 47 | # only build index for a specific domain 48 | self.filter_domain = filter_domain 49 | 50 | # WARC index: URL => (filename, record_number) 51 | self.warc_index = plyvel.DB(self.warc_index_file, create_if_missing=True) 52 | build_warc_index( 53 | db=self.warc_index, warc_directory=self.warc_directory, 54 | filter_domain=self.filter_domain 55 | ) 56 | # WARC cache: filename => [record1, ..., recordN] 57 | self.warc_cache = {} 58 | self.warc_directory = warc_directory 59 | 60 | # how many WARC files to keep in memory at a given time 61 | # since the crawls are sequential, most files for a site 62 | # will exist in a segment of a few WARC files. 
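        # (Eviction happens in _load_warc_file(): once warc_cache_stack grows
        # past this size, the least-recently-used filename is popped off the
        # end and its records are dropped from warc_cache, keeping memory use
        # roughly bounded.)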
63 | self.warc_cache_size = 2 64 | # we're going to store the order the files have have been 65 | # accessed most recently here: 66 | # [most_recently_used_filename, ..., least_recently_used_filename] 67 | # This will be used to enforce our cache size. 68 | self.warc_cache_stack = [] 69 | 70 | # set of clicked elements 71 | self.visited = set() 72 | 73 | # queue of the path that led us to the current page 74 | # this is in the form of (command, *args, **kwargs) 75 | self.path = [] 76 | 77 | # tree building 78 | self.graph = Graph() 79 | 80 | # setting to False, ensures crawl will stay on same host 81 | self.leave_host = leave_host 82 | 83 | self.current_url = None 84 | self.current_html = None 85 | 86 | def _load_warc_file(self, filename): 87 | """ 88 | Take a specified WARC file, load it and keep it in memory in a quickly 89 | readable format (python dict). This operates directly on the class 90 | variable self.warc_cache and also handles maximum cache size pruning. 91 | """ 92 | logger.debug("[-] Loading WARC file: %s" % (filename)) 93 | if len(self.warc_cache_stack) > self.warc_cache_size: 94 | least_used = self.warc_cache_stack.pop() 95 | logger.debug(" - Removing WARC from memory: %s" % (filename)) 96 | del self.warc_cache[least_used] 97 | 98 | self.warc_cache[filename] = [] 99 | for record in _warc_records(filename): 100 | payload = record["payload"] 101 | if not payload: 102 | payload = "" 103 | self.warc_cache[filename].append({ 104 | "header": record["headers"], 105 | "payload": payload, 106 | }) 107 | 108 | def fetch(self, url, initial=False): 109 | """ 110 | Fetch a page from a given URL from the WARC archive (via 111 | an index). 112 | """ 113 | logger.info("%s Fetching url=%s initial=%s" % ( 114 | ("[+]" if initial else " -"), url, initial, 115 | )) 116 | url_b = bytes(url, "utf-8") 117 | data = self.warc_index.get(url_b) 118 | if not data: 119 | logger.debug("[!] 
Couldn't find URL in WARC index: %s" % (url)) 120 | return False 121 | else: 122 | filename, record_number = pickle.loads(data) 123 | logger.debug(" - Loading filename: %s record number: %s" % ( 124 | filename, record_number 125 | )) 126 | if filename not in self.warc_cache: 127 | self._load_warc_file(filename) 128 | warcfile = self.warc_cache[filename] 129 | record = warcfile[record_number] 130 | self.current_html = record["payload"] 131 | 132 | try: 133 | self.warc_cache_stack.remove(filename) 134 | except ValueError: 135 | pass 136 | 137 | self.warc_cache_stack.insert(0, filename) 138 | 139 | self.current_url = url 140 | self.dom = self._get_dom() 141 | 142 | if initial: 143 | self.path.append(("fetch", [url], {"initial": initial})) 144 | node = "Fetch\n url: %s" % url 145 | self.graph.add_root_node(node, url=url, action="fetch") 146 | 147 | return True 148 | -------------------------------------------------------------------------------- /autoscrape/backends/warc/dom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from autoscrape.backends.requests.dom import Dom 3 | -------------------------------------------------------------------------------- /autoscrape/backends/warc/tags.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from autoscrape.backends.requests.tags import Tagger 3 | -------------------------------------------------------------------------------- /autoscrape/classification.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | 4 | class KNN(object): 5 | pass 6 | -------------------------------------------------------------------------------- /autoscrape/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/cli/__init__.py -------------------------------------------------------------------------------- /autoscrape/cli/scrape.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | AUTOSCRAPE - Interactively crawl, find searchable forms, 4 | input data to them and scrape data on the results, from an 5 | initial BASEURL. 6 | 7 | Usage: 8 | autoscrape [options] BASEURL 9 | 10 | General Options: 11 | --backend BACKEND 12 | The backend to use. Currently one of "selenium", "requests" or 13 | "warc". The requests browser is only capable of crawling, but 14 | is approximately 2-3.5x faster. WARC is for emulating browsing 15 | through Common Crawl archival data. 16 | [default: selenium] 17 | 18 | --loglevel LEVEL 19 | Loglevel, note that DEBUG is extremely verbose. 20 | [default: INFO] 21 | 22 | --quiet 23 | This will silence all logging to console. 24 | 25 | Crawl-Specific Options: 26 | --maxdepth DEPTH 27 | Maximum depth to crawl a site (in search of form 28 | if the option --form-match STRING is specified, 29 | see below). Setting to 0 means don't crawl at all, 30 | all operations are limited to the BASEURL page. 31 | Setting to -1 means unlimited maximum crawl depth. 32 | [default: 10] 33 | 34 | --max-pages NUM 35 | Maximum number of unique pages, in total, to fetch. 36 | AutoScrape will stop crawling once this is hit. 37 | 38 | --leave-host 39 | By default, autoscrape will not leave the host given 40 | in the BASEURL. 
This option lets the scraper leave 41 | the host. 42 | 43 | --only-links MATCH_STRING 44 | A whitelist of links to follow. All others will 45 | be ignored. Can be a string or a regex with 46 | multiple strings to match separated by a pipe 47 | (|) character. 48 | 49 | --ignore-links MATCH_STRING 50 | This option can be used to remove any links matching 51 | MATCH_STRING (can be a regex or just a string match) 52 | from consideration for clicking. Accepts the same 53 | argument format as --only-links. 54 | 55 | --link-priority SORT_STRING 56 | A string to sort the links by. In this case, any link 57 | containing "SORT_STRING" will be clicked before any other 58 | links. In most cases you probably want to use the 59 | whitelist, --only-links, option. 60 | 61 | --ignore-extensions IGNORE_EXTENSIONS 62 | Don't click on or download URLs pointing to files with 63 | these extensions. 64 | 65 | --result-page-links MATCH_STRINGS_LIST 66 | If specified, AutoScrape will click on any links matching 67 | this string when it arrives on a search result page. 68 | 69 | Interactive Form Search Options: 70 | --form-match SEARCH_STRING 71 | The crawler will identify a form to search/scrape if it 72 | contains the specified string. If matched, it will be 73 | interactively scraped using the below instructions. 74 | 75 | --input INPUT_DESCRIPTION 76 | Interactive search descriptor. This describes how to 77 | interact with a matched form. The inputs are 78 | described in the following format: 79 | 80 | "c:0:True,i:0:atext,s:1:France:d:0:1991-01-20,r:1:0" 81 | 82 | A single-input type can be one of four types: 83 | checkbox ("c"), input box ("i"), option select 84 | ("s"), radio checkbox ("r"), and date inputs 85 | ("d", with inputs in the "YYYY-MM-DD" format). 86 | The type is separated by a colon, and the input 87 | index position is next. (Each input type has its 88 | own list, so a form with one input, one checkbox, 89 | and one option select, will all be at index 0.) 90 | The final command, sepearated by another colon, 91 | describes what to do with the input. 92 | 93 | Multiple inputs are separated by a comma, so you can 94 | interact with multiple inputs before submitting the 95 | form. 96 | 97 | To illustrate this, the above command does the following: 98 | - first input checkbox is checked (uncheck is False) 99 | - first input box gets filled with the string "first" 100 | - second select input gets the "France" option chosen 101 | - first date input gets set to Jan 20, 1991 102 | - the second radio checkbox group will have its 103 | first option selected 104 | 105 | --next-match NEXT_BTN_STRING 106 | A string to match a "next" button with, after 107 | searching a form. The scraper will continue to 108 | click "next" buttons after a search until no matches 109 | are found, unless limited by the --formdepth option 110 | (see below). 111 | 112 | --formdepth DEPTH 113 | How deep the scraper will iterate, by clicking 114 | "next" buttons. Zero means infinite depth. 115 | [default: 0] 116 | 117 | --form-submit-button-selector XPATH_SELECTOR 118 | A XPath selector indicating a submit button/element 119 | to click. Use this when the submit button to a form 120 | doesn't have any text or is a strange element type. 121 | 122 | --form-submit-natural-click 123 | Some webpages make clicking a link element difficult 124 | due to JavaScript onClick events. 
In cases where a 125 | click does nothing, you can use this option to get 126 | the scraper to emulate a mouse click over the link's 127 | poition on the page, activating any higher level JS 128 | interactions. 129 | 130 | --form-submit-wait SECONDS 131 | How many seconds to force wait after a submit to a form. 132 | This should be used in cases where the builtin 133 | wait-for-page-load isn't working properly (JS-heavy 134 | pages, etc). [default: 5] 135 | 136 | Webdriver-Specific and General Options: 137 | --page-timeout SECONDS 138 | Selenium has a lot of timeout settings used for 139 | determining when a page has loaded. In cases where 140 | pages take a very long time, this may need to be 141 | increased. If you have a fast-loading page, but 142 | AutoScrape is waiting for load, you can set this 143 | lower to speed things up. [default: 30] 144 | 145 | --force-page-wait SECONDS 146 | This forces AutoScrape to wait for the specified number 147 | of seconds after performing a navigation action. 148 | Increase this for slow sites or oddly loading single page 149 | apps, decrease this for fast sites or crawls. 150 | [default: 1] 151 | 152 | --load-images 153 | By default, images on a page will not be fetched. 154 | This speeds up scrapes on sites and lowers bandwidth 155 | needs. This option fetches all images on a page. 156 | 157 | --show-browser 158 | By default, we hide the browser during operation. 159 | This option displays a browser window, mostly 160 | for debugging purposes. 161 | 162 | --driver DRIVER 163 | Which browser to use. Current support for "Firefox", 164 | "Chrome", and "remote". [default: Firefox] 165 | 166 | --browser-binary PATH_TO_BROWSER 167 | Path to a specific browser binary. If left blank 168 | selenium will pull the browser found on your path. 169 | 170 | --remote-hub URI 171 | If using "remote" driver, specify the hub URI to 172 | connect to. Needs the proto, address, port, and path. 173 | [default: http://localhost:4444/wd/hub] 174 | 175 | WARC Options: 176 | --warc-directory PATH_TO_WARCS 177 | Path to the folder containing GZipped WARC files. These can be 178 | downloaded from Common Crawl. Required when using the "warc" 179 | backend. 180 | 181 | --warc-index-file PATH_TO_LEVELDB 182 | Path to the level DB database holding the URL-to-file 183 | index: URL => (filename, record_number) 184 | This will be generated from the WARCS in the --warc-directory 185 | speficied if it's not already. Required when using the "warc" 186 | backend. 187 | 188 | Data Saving Options: 189 | --output DIRECTORY_OR_URL 190 | If specified, this indicates where to save pages during a 191 | crawl. This directory will be created if it does not 192 | currently exist. This directory will have several 193 | sub-directories that contain the different types of pages 194 | found (i.e., search_pages, data_pages, screenshots). 195 | This can also accept a URL (i.e., http://localhost:5000/files) 196 | and AutoScrape will POST to that endpoint with each 197 | file scraped. 198 | [default: autoscrape-data] 199 | 200 | --keep-filename 201 | By default, we hash the files in a scrape in order to 202 | account for dynamic content under a single-page app 203 | (SPA) website implmentation. This option will force 204 | the scraper to retain the original filename, from the 205 | URL when saving scrape data. 206 | 207 | --save-screenshots 208 | This option makes the scraper save screenshots of each 209 | page, interaction, and search. 
Screenshots will be 210 | saved to the screenshots folder of the output dir. 211 | 212 | --full-page-screenshots 213 | By default, we only save the first displayed part of the 214 | webpage. The remaining portion that you can only see 215 | by scrolling down isn't captured. Setting this option 216 | forces AutoScrape to scroll down and capture the entire 217 | web content. This can fail in certain circumstances, like 218 | in API output mode and should be used with care. 219 | 220 | --save-graph 221 | This option allows the scraper to build a directed graph 222 | of the entire scrape and will save it to the "graph" 223 | subdirectory under the output dir. The output file 224 | is a timestamped networkx pickled graph. 225 | 226 | --disable-style-saving 227 | By default, AutoScrape saves the stylesheets associated 228 | with a scraped page. To save storage, you can disable this 229 | functionality by using this option. 230 | """ 231 | import logging 232 | 233 | from docopt import docopt 234 | 235 | import autoscrape 236 | 237 | 238 | logger = logging.getLogger('AUTOSCRAPE') 239 | 240 | 241 | def main(): 242 | docopt_args = docopt(__doc__) 243 | 244 | BASEURL = docopt_args.pop("BASEURL") 245 | 246 | # strip the -- and convert - to _ 247 | args = {} 248 | for option in docopt_args: 249 | args[option[2:].replace('-', '_')] = docopt_args[option] 250 | 251 | # configure stdout logging 252 | docopt_args["stdout"] = True 253 | if "quiet" in args: 254 | quiet = args.pop("quiet") 255 | args["stdout"] = not quiet 256 | 257 | scraper = autoscrape.ManualControlScraper(BASEURL, **args) 258 | 259 | logger.debug("AutoScrape starting with arguments: %s" % (docopt_args)) 260 | scraper.run() 261 | -------------------------------------------------------------------------------- /autoscrape/control.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import time 3 | import logging 4 | 5 | from autoscrape.backends.requests.browser import RequestsBrowser 6 | from autoscrape.vectorization.text import TextVectorizer 7 | 8 | # backends/vectorizers with optional dependencies 9 | try: 10 | from autoscrape.vectorization.ebmeddings import EmbeddingsVectorizer 11 | except ModuleNotFoundError: 12 | pass 13 | 14 | try: 15 | from autoscrape.backends.selenium.browser import SeleniumBrowser 16 | except ModuleNotFoundError: 17 | pass 18 | 19 | try: 20 | from autoscrape.backends.warc.browser import WARCBrowser 21 | except ModuleNotFoundError: 22 | pass 23 | 24 | 25 | logger = logging.getLogger('AUTOSCRAPE') 26 | 27 | 28 | class Controller: 29 | """ 30 | High-level control for scraping a web page. This allows us to control 31 | all of the possible scraper commands in an automated way, using a set 32 | of indices instead of tags. This way we can present vectors of options 33 | to a ML model. This abstraction also returns feature matrices for pages 34 | and elements on the webpage. 35 | """ 36 | 37 | def __init__(self, leave_host=False, driver="Firefox", browser_binary=None, 38 | remote_hub="http://localhost:4444/wd/hub", output=None, 39 | form_submit_natural_click=False, form_submit_wait=5, 40 | warc_index_file=None, warc_directory=None, 41 | force_page_wait=None, form_submit_button_selector=None, 42 | load_images=False, show_browser=False, page_timeout=None, 43 | html_embeddings_file=None, word_embeddings_file=None, 44 | backend="selenium", vectorizer="text"): 45 | """ 46 | Set up our WebDriver and misc utilities. 
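
        Illustrative usage (a minimal sketch, assuming the chosen backend's
        optional dependencies are installed):

            controller = Controller(backend="requests", output="autoscrape-data")
            controller.initialize("https://example.com")
            controller.select_link(0)   # click the first clickable tag found
            controller.back()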
47 | """ 48 | Browser = None 49 | self.backend = backend 50 | if backend == "selenium": 51 | Browser = SeleniumBrowser 52 | elif backend == "requests": 53 | Browser = RequestsBrowser 54 | elif backend == "warc": 55 | Browser = WARCBrowser 56 | else: 57 | raise NotImplementedError( 58 | "No backend found: %s" % (backend) 59 | ) 60 | 61 | self.scraper = Browser( 62 | leave_host=leave_host, driver=driver, 63 | browser_binary=browser_binary, remote_hub=remote_hub, 64 | form_submit_natural_click=form_submit_natural_click, 65 | form_submit_wait=form_submit_wait, 66 | form_submit_button_selector=form_submit_button_selector, 67 | warc_index_file=warc_index_file, warc_directory=warc_directory, 68 | load_images=load_images, show_browser=show_browser, 69 | output=output, page_timeout=page_timeout, 70 | ) 71 | 72 | Vectorizer = None 73 | if vectorizer == "text": 74 | self.vectorizer = TextVectorizer( 75 | scraper=self.scraper, controller=self 76 | ) 77 | elif vectorizer == "embeddings": 78 | self.vectorizer = EmbeddingsVectorizer( 79 | scraper=self.scraper, controller=self, 80 | html_embeddings_file=html_embeddings_file, 81 | word_embeddings_file=word_embeddings_file, 82 | ) 83 | else: 84 | raise NotImplementedError( 85 | "No vectorizer found: %s" % (vectorizer) 86 | ) 87 | 88 | # this flag marks vectors as stale. when this is true and 89 | # we try to access the link vectors, we'll re-load them 90 | self.stale = True 91 | 92 | self.clickable = None 93 | 94 | # simply a list of form tags, each forms input contents is 95 | # contained in the self.inputs multi-dimensional array, below 96 | self.forms = [] 97 | 98 | # this expands into the following format: 99 | # [ form_tag: 100 | # [ 101 | # [text input tags...], 102 | # [select input tags...], 103 | # [checkbox input tags...] 104 | # ], 105 | # other forms ..., 106 | # ] 107 | self.inputs = [] 108 | 109 | # TODO: the point of this wait is to ensure the DOM has stopped 110 | # mutating (loading results, etc). a proper fix for this is to 111 | # look at the count of DOM objects being queried for each index 112 | # type and detect when it stops changing. 113 | self.force_page_wait = force_page_wait 114 | if self.force_page_wait is not None: 115 | self.force_page_wait = int(self.force_page_wait) 116 | 117 | def load_indices(self): 118 | logger.debug("[.] 
Loading page vectors...") 119 | if self.backend == "selenium" and self.force_page_wait: 120 | logger.debug(" - Force waiting for %s seconds" % ( 121 | self.force_page_wait 122 | )) 123 | time.sleep(self.force_page_wait) 124 | 125 | self.clickable = None 126 | # self.clickable = self.scraper.get_clickable() 127 | logger.debug(" - Getting forms") 128 | forms_dict = self.scraper.get_forms() 129 | self.forms = list(forms_dict.keys()) 130 | logger.debug(" - Getting inputs") 131 | self.inputs = [tags for tags in forms_dict.values()] 132 | self.buttons = None # self.scraper.get_buttons() 133 | 134 | # logger.debug("Clickable links: %s" % (len(self.clickable))) 135 | # for i in range(len(self.clickable)): 136 | # t = self.clickable[i] 137 | # elem = self.scraper.element_by_tag(t) 138 | # text = "" 139 | # if elem: 140 | # text = elem.text.replace("\n", " ") 141 | # logger.debug(" %s - ...%s, %s" % (i, t[-25:], text)) 142 | 143 | # logger.debug("Forms: %s:" % (len(self.forms))) 144 | # for i in range(len(self.forms)): 145 | # t = self.forms[i] 146 | # text = "" 147 | # elem = self.scraper.element_by_tag(t) 148 | # if elem: 149 | # text = elem.text.replace("\n", " ") 150 | # logger.debug(" %s - ...%s, %s" % (i, t[-25:], text)) 151 | 152 | # logger.debug("Inputs: %s" % (len(self.inputs))) 153 | # for i in range(len(self.inputs)): 154 | # input_group = self.inputs[i] 155 | # for itype_ix in range(len(input_group)): 156 | # for t in input_group[itype_ix]: 157 | # elem = self.scraper.element_by_tag(t) 158 | # text = "" 159 | # placeholder = "" 160 | # if elem: 161 | # text = elem.text.replace("\n", " ") 162 | # placeholder = elem.get_attribute("placeholder") 163 | # logger.debug(" %s - ...%s, %s, %s" % ( 164 | # i, t[-25:], text, placeholder)) 165 | 166 | # logger.debug("Buttons: %s" % (len(self.buttons))) 167 | # for i in range(len(self.buttons)): 168 | # t = self.buttons[i] 169 | # elem = self.scraper.element_by_tag(t) 170 | # text = "" 171 | # value = "" 172 | # if elem: 173 | # text = elem.text.replace("\n", " ") 174 | # value = elem.get_attribute("value") 175 | # logger.debug(" %s - ...%s, %s, %s" % (i, t[-25:], text, value)) 176 | 177 | def initialize(self, url): 178 | """ 179 | Instantiate a web scraper, given a starting point URL. Also 180 | gets the links for the current page and sets its tag array. 181 | """ 182 | self.scraper.fetch(url, initial=True) 183 | self.load_indices() 184 | 185 | def select_link(self, index, iterating_form=False): 186 | if self.clickable is None: 187 | logger.debug(" - Getting links") 188 | self.clickable = self.scraper.get_clickable() 189 | if index >= len(self.clickable): 190 | logger.error( 191 | "[!] Critical error: link index exceeds clickable length." 192 | ) 193 | return False 194 | tag = self.clickable[index] 195 | clicked = self.scraper.click(tag, iterating_form=iterating_form) 196 | if clicked: 197 | self.load_indices() 198 | return clicked 199 | 200 | def select_button(self, index, iterating_form=False): 201 | if self.buttons is None: 202 | logger.debug(" - Getting buttons") 203 | self.buttons = self.scraper.get_buttons() 204 | tag = self.buttons[index] 205 | clicked = self.scraper.click(tag, iterating_form=iterating_form) 206 | if clicked: 207 | self.load_indices() 208 | return clicked 209 | 210 | def input(self, form_ix, index, chars): 211 | """ 212 | Add some string to a text input under a given form. 
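        (form_ix selects the form, index selects the text input within that
        form's input group, i.e. self.inputs[form_ix][0][index], following
        the layout described in __init__.)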
213 | """ 214 | tag = self.inputs[form_ix][0][index] 215 | self.scraper.input(tag, chars) 216 | 217 | def input_select_option(self, form_ix, index, option_str): 218 | """ 219 | Select an option for a select input under a given form. 220 | """ 221 | tag = self.inputs[form_ix][1][index] 222 | self.scraper.input_select_option(tag, option_str) 223 | 224 | def input_checkbox(self, form_ix, index, to_check): 225 | """ 226 | Check/uncheck a checkbox input under a given form. 227 | """ 228 | tag = self.inputs[form_ix][2][index] 229 | self.scraper.input_checkbox(tag, to_check) 230 | 231 | def input_date(self, form_ix, index, chars): 232 | """ 233 | Select a date from an input type="date". String needs to 234 | be in the MM-DD-YYYY format. 235 | """ 236 | tag = self.inputs[form_ix][3][index] 237 | self.scraper.input(tag, chars) 238 | 239 | def input_radio_option(self, form_ix, index, radio_index): 240 | """ 241 | Select a radio checkbox from a given form, checkbox group 242 | index and desired value by text. 243 | """ 244 | grp_tags = self.inputs[form_ix][4][index] 245 | tag = grp_tags[radio_index] 246 | self.scraper.input_checkbox(tag, True, radio=True) 247 | 248 | def submit(self, index): 249 | tag = self.forms[index] 250 | self.scraper.submit(tag) 251 | self.load_indices() 252 | 253 | def back(self): 254 | self.scraper.back() 255 | self.load_indices() 256 | -------------------------------------------------------------------------------- /autoscrape/filetypes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | TEXT_EXTENSIONS = ( 3 | # ASP Classic 4 | "asp", 5 | 6 | # ASP.NET 7 | "aspx", 8 | "axd", 9 | "asx", 10 | "asmx", 11 | "ashx", 12 | 13 | # CSS 14 | "css", 15 | 16 | # Coldfusion 17 | "cfm", 18 | 19 | # Erlang 20 | "yaws", 21 | 22 | # # Flash, treat as data 23 | # "swf", 24 | 25 | # HTML 26 | "html", 27 | "htm", 28 | "xhtml", 29 | "jhtml", 30 | "hta", 31 | 32 | # Java 33 | "jsp", 34 | "jspx", 35 | "wss", 36 | "do", 37 | "action", 38 | 39 | # JavaScript 40 | "js", 41 | 42 | # Perl 43 | "pl", 44 | 45 | # PHP 46 | "php", 47 | "php4", 48 | "php3", 49 | "phtml", 50 | 51 | # Python 52 | "py", 53 | 54 | # Ruby 55 | "rb", 56 | "rhtml", 57 | 58 | # SSI 59 | "shtml", 60 | 61 | # Other (C, perl etc.) 62 | # NOTE: these could also be data 63 | "cgi", 64 | "dll", 65 | 66 | # ASP.NET 67 | "axd", 68 | "asx", 69 | "asmx", 70 | "ashx", 71 | "aspx", 72 | 73 | # XML/Feed 74 | "xml", 75 | "rss", 76 | "atom", 77 | "svg", 78 | ) 79 | -------------------------------------------------------------------------------- /autoscrape/input_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import re 3 | 4 | 5 | class InputParser: 6 | def __init__(self, input): 7 | self.input = input 8 | 9 | def str2bool(self, string): 10 | if not string: 11 | return False 12 | if string.lower() in ["false", "no", "n", "0"]: 13 | return False 14 | return True 15 | 16 | def generate(self): 17 | """ 18 | Make a form input generator by parsing our input string. Output 19 | is a multidimensional array, where the first dimension is 20 | independent searches to attempt and the second dimension is 21 | which inputs for fill. 
Example: 22 | 23 | From this input string: 24 | 25 | "i:0:a%,c:1:True,s:2:France" 26 | 27 | We get the following output generator: 28 | 29 | [ 30 | [ 31 | { "index": 0, "string": "a%", "type": "input" } 32 | { "index": 1, "action": True, "type": "checkbox" } 33 | { "index": 2, "string": "France", "type": "select" } 34 | ], 35 | ..., 36 | [ 37 | { "index": 0, "string": "z%", "type": "input" }, 38 | { "index": 1, "action": True, "type": "checkbox" } 39 | { "index": 2, "string": "France", "type": "select" } 40 | ] 41 | ] 42 | 43 | This will do all searches with input 0 filled with a-z and the 44 | remaining specified inputs filled as desired (input 1, a checkbox, 45 | checked and input 2, a choice select, selected to the "France" 46 | option). 47 | """ 48 | # TODO: Find an overall way to support ranges without resulting 49 | # to clunky regex (which don't support ordering). Right now just 50 | # use GNU Parallel or something to do ranges. 51 | # split the independent searches first 52 | inputs = re.split(r'(?= self.max_pages: 124 | logger.info(" - Maximum pages %s reached, returning..." % self.max_pages) 125 | return 126 | 127 | link_vectors = self.control.vectorizer.link_vectors() 128 | link_zip = list(zip(range(len(link_vectors)), link_vectors)) 129 | link_zip = filter( 130 | lambda x: re.findall(self.result_page_links, x[1]), 131 | link_zip 132 | ) 133 | logger.debug(" - Candidate links: %s" % (link_zip)) 134 | # Click until we get no more matches 135 | for ix, text in link_zip: 136 | logger.info("[.] Clicking result page link: %s" % (text)) 137 | logger.debug(" - Current URL: %s" % (self.control.scraper.page_url)) 138 | if self.control.select_link(ix, iterating_form=True): 139 | self.total_pages += 1 140 | self.click_until_no_links(links) 141 | self.save_training_page(classname="data_pages") 142 | self.save_screenshot(classname="data_pages") 143 | self.control.back() 144 | 145 | def keep_clicking_next_btns(self): 146 | """ 147 | This looks for "next" buttons, or (in the future) page number 148 | links, and clicks them until one is not found. This saves the 149 | pages as it goes. 150 | """ 151 | logger.info("[*] Entering result page iteration routine") 152 | depth = 0 153 | while True: 154 | if self.formdepth and depth >= self.formdepth: 155 | logger.debug("[*] Max 'next' formdepth reached %s" % depth) 156 | break 157 | 158 | button_data = self.control.vectorizer.button_vectors() 159 | n_buttons = len(button_data) 160 | logger.info("[.] 
On result page %s" % (depth + 1)) 161 | logger.debug(" - Button vectors (%s): %s" % ( 162 | n_buttons, button_data 163 | )) 164 | 165 | if self.result_page_links: 166 | self.click_until_no_links(self.result_page_links) 167 | 168 | # element type, index, text 169 | next_found = None 170 | for ix in range(n_buttons): 171 | button_text = button_data[ix] 172 | logger.debug(" - Checking button: %s" % button_text) 173 | if re.findall(self.next_match.lower(), button_text.lower()): 174 | next_found = ("button", ix, button_text) 175 | break 176 | 177 | if not next_found: 178 | link_vectors = self.control.vectorizer.link_vectors() 179 | n_clickable = len(link_vectors) 180 | logger.debug(" - Button not found, searching %s links" % ( 181 | n_clickable 182 | )) 183 | for ix in range(n_clickable): 184 | link_text = link_vectors[ix] 185 | logger.debug(" - Checking clickable: %s" % link_text) 186 | if re.findall(self.next_match.lower(), link_text.lower()): 187 | next_found = ("link", ix, link_text) 188 | break 189 | 190 | # we didn't find a next match, break loop 191 | if next_found is None: 192 | logger.debug(" - Next button not found!") 193 | break 194 | else: 195 | ntype, ix, text = next_found 196 | logger.info("[.] Next button found! Clicking: %s" % (text)) 197 | depth += 1 198 | if ntype == "button": 199 | self.control.select_button(ix, iterating_form=True) 200 | elif ntype == "link": 201 | self.control.select_link(ix, iterating_form=True) 202 | 203 | # subsequent page loads get saved here 204 | self.save_training_page(classname="data_pages") 205 | self.save_screenshot(classname="data_pages") 206 | 207 | # check for infinite loop, this is based on the hash 208 | # of the previous few pages 209 | logger.debug(" - Checking for infinite loop...") 210 | if self.control.scraper.infinite_loop_detected: 211 | logger.debug(" - Infinte loop detected. Breaking.") 212 | break 213 | 214 | for _ in range(depth): 215 | logger.debug("[.] Going back from result page...") 216 | self.control.back() 217 | 218 | def scrape(self, depth=0): 219 | logger.info("[.] Crawl depth %s" % depth) 220 | logger.info(" - Total pages: %s of max: %s" % ( 221 | self.total_pages, self.max_pages 222 | )) 223 | if self.maxdepth != -1 and depth > self.maxdepth: 224 | logger.info(" - Maximum depth %s reached, returning..." % depth) 225 | self.control.back() 226 | return 227 | if self.max_pages is not None and self.total_pages >= self.max_pages: 228 | logger.info(" - Maximum pages %s reached, returning..." 
% self.max_pages) 229 | return 230 | if self.scraped: 231 | logger.debug(" - Scrape complete, not clicking anything else.") 232 | return 233 | 234 | if self.ignore_extensions and re.findall(self.ignore_extensions, 235 | self.control.scraper.page_url): 236 | logger.debug(" - Ignoring URL matching ignored extension: %s" % ( 237 | self.control.scraper.page_url 238 | )) 239 | return 240 | 241 | self.save_training_page(classname="crawl_pages") 242 | self.save_screenshot(classname="crawl_pages") 243 | form_vectors = self.control.vectorizer.form_vectors() 244 | 245 | # NOTE: we never get into this loop if self.input_gen is empty 246 | # this arises when input was not handed to the initializer 247 | for ix in range(len(form_vectors)): 248 | # don't bother with looking for forms if we didn't specify 249 | # th form_match option 250 | if not self.form_match: 251 | continue 252 | 253 | form_data = form_vectors[ix] 254 | 255 | # inputs are keyed by form index, purely here for debug purposes 256 | inputs = self.control.inputs[ix] 257 | logger.debug(" - Form: %s Text: %s" % (ix, form_data)) 258 | logger.debug(" - Inputs: %s" % inputs) 259 | 260 | if self.form_match.lower() not in form_data.lower(): 261 | continue 262 | 263 | logger.info("[*] Found an input form (No. %s on page)" % (ix)) 264 | self.save_training_page(classname="search_pages") 265 | self.save_screenshot(classname="search_pages") 266 | 267 | for input_phase in self.input_gen: 268 | logger.debug(" - Input plan: %s" % input_phase) 269 | for single_input in input_phase: 270 | input_index = single_input["index"] 271 | if single_input["type"] == "input": 272 | input_string = single_input["string"] 273 | logger.info("[.] Inputting %s to input %s" % ( 274 | input_string, ix 275 | )) 276 | self.control.input(ix, input_index, input_string) 277 | elif single_input["type"] == "select": 278 | input_string = single_input["string"] 279 | logger.info("[.] Selecting option %s in input %s" % ( 280 | input_string, input_index 281 | )) 282 | self.control.input_select_option( 283 | ix, input_index, input_string 284 | ) 285 | elif single_input["type"] == "checkbox": 286 | to_check = single_input["action"] 287 | logger.info("[.] %s checkbox input %s" % ( 288 | "Checking" if to_check else "Unchecking", 289 | input_index 290 | )) 291 | self.control.input_checkbox( 292 | ix, input_index, to_check 293 | ) 294 | elif single_input["type"] == "date": 295 | input_string = single_input["string"] 296 | logger.info("[.] Setting date to %s in date input %s" % ( 297 | input_string, ix)) 298 | self.control.input_date(ix, input_index, input_string) 299 | elif single_input["type"] == "radio": 300 | radio_index = single_input["string"] 301 | logger.info("[.] 
Selecting radio checkbox %s in group %s" % ( 302 | radio_index, input_index 303 | )) 304 | self.control.input_radio_option( 305 | ix, input_index, radio_index 306 | ) 307 | 308 | # capture post-input screenshot 309 | self.save_screenshot(classname="interaction_pages") 310 | 311 | # actually submit the page 312 | self.control.submit(ix) 313 | self.total_pages += 1 314 | 315 | # save the initial landing result page 316 | self.save_screenshot(classname="data_pages") 317 | self.save_training_page(classname="data_pages") 318 | 319 | # if we're looking for next buttons, click them 320 | if self.next_match: 321 | self.keep_clicking_next_btns() 322 | 323 | self.scraped = True 324 | self.control.back() 325 | 326 | logger.debug("[*] Completed iteration!") 327 | # Only scrape a single form, due to explicit, single 328 | # match configuration option 329 | if self.scraped: 330 | logger.info("[*] Scrape complete! Exiting.") 331 | return 332 | 333 | link_vectors = self.control.vectorizer.link_vectors() 334 | logger.debug("[.] Links on page: %s" % (link_vectors)) 335 | link_zip = list(zip(range(len(link_vectors)), link_vectors)) 336 | if self.ignore_links: 337 | logger.debug(" - Ignoring links matching: %s" % self.ignore_links) 338 | link_zip = filter( 339 | lambda x: not re.findall(self.ignore_links, x[1]), 340 | link_zip 341 | ) 342 | if self.only_links: 343 | logger.debug(" - Keeping only links matching: %s" % self.ignore_links) 344 | link_zip = filter( 345 | lambda x: re.findall(self.only_links, x[1]), 346 | link_zip 347 | ) 348 | if self.link_priority: 349 | logger.debug(" - Sorting by link priority: %s" % self.link_priority) 350 | link_zip.sort( 351 | key=lambda x: not re.findall(self.link_priority, x[1]) 352 | ) 353 | 354 | for ix, text in link_zip: 355 | logger.debug(" - Link index: %s text: %s" % (ix, text)) 356 | if self.maxdepth != -1 and depth == self.maxdepth: 357 | logger.debug(" - At maximum depth: %s, skipping links." % depth) 358 | break 359 | if self.max_pages is not None and self.total_pages >= self.max_pages: 360 | logger.info(" - Maximum pages reached, skipping links.") 361 | break 362 | if self.scraped: 363 | logger.debug(" - Scrape complete, not clicking anything else.") 364 | return 365 | 366 | logger.debug(" - Current URL: %s" % (self.control.scraper.page_url)) 367 | logger.debug(" - Attempting to click link text: %s" % text) 368 | if self.control.select_link(ix): 369 | logger.info("[.] Link clicked: %s" % (text)) 370 | logger.debug(" - Current URL: %s" % (self.control.scraper.page_url)) 371 | self.total_pages += 1 372 | self.scrape(depth=depth + 1) 373 | else: 374 | logger.debug(" - Click failed, skipping: %s" % text) 375 | 376 | logger.debug("[*] Searching forms and links on page complete") 377 | self.control.back() 378 | 379 | def run(self, *args, **kwargs): 380 | # we have to catch this so, in the case of failure, we 381 | # don't have random browser windows hanging around 382 | try: 383 | self.scrape(*args, **kwargs) 384 | except Exception as e: 385 | msg = "[!] Fatal error scraping: %s. Cleaning up, quitting." 
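            # Log the failure, shut down the webdriver if the backend has one,
            # optionally persist the crawl graph, then re-raise for the caller.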
386 | logger.error(msg % (e)) 387 | if hasattr(self.control.scraper, "driver"): 388 | self.control.scraper.driver.quit() 389 | if self.output and self.save_graph: 390 | self.save_scraper_graph() 391 | raise e 392 | # else: 393 | # logger.info("[+] AutoScrape run complete.") 394 | # if self.output and self.save_graph: 395 | # self.save_scraper_graph() 396 | try: 397 | self.control.scraper.driver.quit() 398 | except Exception: 399 | pass 400 | 401 | if self.return_data: 402 | return self.crawl_data 403 | -------------------------------------------------------------------------------- /autoscrape/scrapers/null.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import logging 3 | 4 | from . import BaseScraper 5 | from ..control import Controller 6 | from autoscrape.backends.requests.browser import RequestsBrowser 7 | from autoscrape.backends.selenium.browser import SeleniumBrowser 8 | 9 | 10 | logger = logging.getLogger('AUTOSCRAPE') 11 | 12 | 13 | class NullScraper(BaseScraper): 14 | """ 15 | A test scraper that just provides direct access to scraper and 16 | controller. For vectorizing documents. 17 | """ 18 | 19 | def __init__(self, *args, html_embeddings=None, word_embeddings=None, 20 | loglevel="INFO", scraper=True, controller=False, backend=None, 21 | driver="Chrome", vectorizer=False, **kwargs): 22 | super(NullScraper, self).setup_logging(loglevel=loglevel) 23 | 24 | Browser = None 25 | if backend == "selenium": 26 | Browser = SeleniumBrowser 27 | elif backend == "requests": 28 | Browser = RequestsBrowser 29 | else: 30 | raise NotImplementedError( 31 | "No backend found: %s" % (backend) 32 | ) 33 | 34 | if scraper: 35 | self.scraper = Browser(driver=driver) 36 | if controller: 37 | self.control = Controller( 38 | html_embeddings_file=html_embeddings, 39 | word_embeddings_file=word_embeddings, 40 | ) 41 | if vectorizer: 42 | self.vectorizer = self.control.vectorizer 43 | -------------------------------------------------------------------------------- /autoscrape/scrapers/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import logging 3 | 4 | from . import BaseScraper 5 | 6 | from autoscrape.backends.requests.browser import RequestsBrowser 7 | 8 | 9 | logger = logging.getLogger('AUTOSCRAPE') 10 | 11 | 12 | class TestScraper(BaseScraper): 13 | def __init__(self, baseurl, maxdepth=10, loglevel=None): 14 | """ 15 | Initialize our scraper and get the first page. 16 | """ 17 | super(TestScraper, self).setup_logging(loglevel=loglevel) 18 | self.scraper = RequestsBrowser() 19 | self.scraper.fetch(baseurl) 20 | self.maxdepth = maxdepth 21 | 22 | def run(self, depth=0, tags=None): 23 | """ 24 | This is the main recursive depth-first search of a site. It 25 | doesn't do anything but crawl a site DFS and ensure the tagging 26 | and web engine is working as it should. 27 | """ 28 | if depth > self.maxdepth: 29 | logger.debug("Maximum depth %s reached, returning..." % depth) 30 | self.scraper.back() 31 | return 32 | 33 | logger.debug("** DEPTH %s" % depth) 34 | 35 | if not tags: 36 | tags = self.scraper.get_clickable() 37 | 38 | logger.debug( 39 | "All tags at this depth \n %s" % ("\n ").join(tags)) 40 | 41 | for tag in tags: 42 | logger.debug("Attempting click on tag \n %s" % tag) 43 | 44 | if self.scraper.click(tag): 45 | logger.debug("Clicked! 
Recursing ...") 46 | self.run( 47 | depth=depth + 1, tags=self.scraper.get_clickable()) 48 | 49 | logger.debug("Going back...") 50 | self.scraper.back() 51 | -------------------------------------------------------------------------------- /autoscrape/search/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/search/__init__.py -------------------------------------------------------------------------------- /autoscrape/search/bfs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from collections import deque 3 | 4 | import networkx as nx 5 | 6 | 7 | class BFS(object): 8 | def __init__(self, root_node): 9 | # queue of remaining BFS nodes 10 | self.queue = deque() 11 | # store BFS scrape graph 12 | self.graph = nx.DiGraph() 13 | # current node (css tag) 14 | self.current = None 15 | 16 | def next(self): 17 | """ 18 | Get the next CSS path node to interact with. 19 | """ 20 | newcurrent = self.queue.popleft() 21 | self.current = newcurrent 22 | return newcurrent 23 | 24 | def add_root_node(self, node, **kwargs): 25 | """ 26 | Add an isolated node, make it the current node. This 27 | is for creating new subgraphs inside our graph. Does 28 | not add any edges. 29 | """ 30 | self.graph.add_node(node, **kwargs) 31 | self.current = node 32 | 33 | def add_node(self, node, **kwargs): 34 | """ 35 | Add a single node to the tree, with edges connecting 36 | to the current node. 37 | """ 38 | self.queue.append(node) 39 | self.graph.add_node(node, **kwargs) 40 | self.graph.add_edge(self.current, node) 41 | 42 | def add_nodes(self, nodes): 43 | """ 44 | Add a list of nodes to the current node in the graph. 45 | This handles adding the nodes and the edges. 46 | """ 47 | self.queue.extend(nodes) 48 | for node, meta in nodes: 49 | self.add_node(node, **meta) 50 | -------------------------------------------------------------------------------- /autoscrape/search/graph.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import logging 3 | 4 | try: 5 | import networkx as nx 6 | except ModuleNotFoundError: 7 | pass 8 | 9 | 10 | logger = logging.getLogger('AUTOSCRAPE') 11 | 12 | 13 | class Graph(object): 14 | def __init__(self): 15 | try: 16 | self.graph = nx.DiGraph() 17 | except NameError: 18 | logger.debug( 19 | "NetworkX not installed. Not building crawl graph." 20 | " (Hint: pip install autoscrape[graph])" 21 | ) 22 | self.graph = None 23 | # store scrape graph 24 | # current node (css tag) 25 | self.current = None 26 | 27 | def add_root_node(self, node, **kwargs): 28 | """ 29 | Add an isolated node, make it the current node. This 30 | is for creating new subgraphs inside our graph. Does 31 | not add any edges. 32 | """ 33 | if self.graph is None: 34 | return 35 | self.graph.add_node(node, **kwargs) 36 | self.current = node 37 | 38 | def add_node(self, node, **kwargs): 39 | """ 40 | Add a single node to the tree, with edges connecting 41 | to the current node. 42 | """ 43 | if self.graph is None: 44 | return 45 | self.graph.add_node(node, **kwargs) 46 | self.graph.add_edge(self.current, node) 47 | 48 | def add_nodes(self, nodes): 49 | """ 50 | Add a list of nodes to the current node in the graph. 51 | This handles adding the nodes and the edges. 
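        Each entry is expected to be a (node, meta) 2-tuple, for example
        (purely illustrative values) ("a#next-page", {"text": "Next"}).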
52 | """ 53 | if self.graph is None: 54 | return 55 | for node, meta in nodes: 56 | self.add_node(node, **meta) 57 | 58 | def add_meta_to_current(self, **meta): 59 | if self.graph is None: 60 | return 61 | self.graph.nodes[self.current].update(**meta) 62 | 63 | def add_action_to_current(self, action): 64 | if self.graph is None: 65 | return 66 | current_meta = self.graph.nodes[self.current] 67 | current_actions = current_meta.get("actions", []) 68 | current_actions.append(action) 69 | current_meta["actions"] = current_actions 70 | nx.set_node_attributes(self.graph, current_meta, name=self.current) 71 | 72 | def move_to_node(self, node): 73 | if self.graph is None: 74 | return 75 | self.current = node 76 | 77 | def move_to_parent(self): 78 | if self.graph is None: 79 | return 80 | try: 81 | preds = self.graph.predecessors(self.current) 82 | parent = self.graph.predecessors(self.current).__next__() 83 | except StopIteration: 84 | return 85 | self.move_to_node(parent) 86 | 87 | def save_graph(self, output_path): 88 | if self.graph is None: 89 | return 90 | nx.write_gpickle(self.graph, output_path) 91 | -------------------------------------------------------------------------------- /autoscrape/tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import os 3 | import re 4 | 5 | from celery import Celery 6 | 7 | from .scrapers.manual import ManualControlScraper 8 | 9 | 10 | backend = "rpc://" 11 | if os.environ.get("AUTOSCRAPE_DB_HOST"): 12 | backend = 'db+postgresql://%s:%s@%s/autoscrape' % ( 13 | os.environ["AUTOSCRAPE_DB_USER"], 14 | os.environ["AUTOSCRAPE_DB_PASSWORD"], 15 | os.environ["AUTOSCRAPE_DB_HOST"] 16 | ) 17 | 18 | 19 | app = Celery( 20 | 'tasks', 21 | broker=os.environ.get("AUTOSCRAPE_RABBITMQ_HOST"), 22 | backend=backend, 23 | ) 24 | 25 | app.conf.update( 26 | # CELERYD_MAX_TASKS_PER_CHILD=1, 27 | # CELERYD_PREFETCH_MULTIPLIER=1, 28 | # CELERY_ACKS_LATE=True, 29 | # CELERY_RESULT_PERSISTENT=True, 30 | # CELERY_TASK_PUBLISH_RETRY=False, 31 | # CELERY_TASK_RESULT_EXPIRES=None, 32 | CELERY_TRACK_STARTED=True, 33 | CELERY_BROKER_HEARTBEAT=10 34 | ) 35 | 36 | 37 | @app.task(bind=True) 38 | def start(self, baseurl, args): 39 | print("Starting ManualControlScraper", baseurl, args) 40 | # append task ID to receiver URI 41 | output = args.get("output") 42 | if output and re.match("^https?://", output): 43 | if output[-1] != "/": 44 | output += "/" 45 | output += str(self.request.id) 46 | args["output"] = output 47 | scraper = ManualControlScraper(baseurl, **args) 48 | scraper.run() 49 | -------------------------------------------------------------------------------- /autoscrape/util/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import base64 3 | import json 4 | import logging 5 | import os 6 | import re 7 | from urllib import parse 8 | 9 | import requests 10 | 11 | 12 | logger = logging.getLogger('AUTOSCRAPE') 13 | 14 | 15 | def get_filename_from_url(url): 16 | """ 17 | Take a fully-qualified URL and turn it into a filename. 
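    The goal is a flat, filesystem-safe name that still identifies the
    source URL.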
For 18 | example, turn a url like this: 19 | 20 | https://www.cia.gov/library/readingroom/docs/%5B15423241%5D.pdf 21 | 22 | Using the parsed URL: 23 | 24 | ParseResult(scheme='https', netloc='www.cia.gov', 25 | path='/library/readingroom/docs/%5B15423241%5D.pdf 26 | 27 | Returing this representation (a string): 28 | 29 | _library_readingroom_docs_%5B15423241%5D.pdf 30 | 31 | NOTE: If no extension is found on the page, .html is appended. 32 | """ 33 | parsed = parse.urlparse(url) 34 | host = parsed.netloc 35 | # split filename/path and extension 36 | file_parts = os.path.splitext(parsed.path) 37 | # file_part = file_parts[0].replace("/", "__") 38 | extension = file_parts[1] or ".html" 39 | # filename = "%s_%s" % (host, file_part) 40 | filename = url.replace("/", "_").replace(":", "_") 41 | if parsed.query: 42 | query_part = "_".join(parsed.query.split("&")) 43 | filename = "%s__%s" % (filename, query_part) 44 | return "%s%s" % (filename, extension) 45 | 46 | 47 | def get_extension_from_url(url): 48 | # try and extract the extension from the URL 49 | path = parse.urlparse(url).path 50 | ext = os.path.splitext(path)[1] 51 | ext = ext if ext else "html" 52 | if ext[0] == ".": 53 | ext = ext[1:] 54 | return ext 55 | 56 | 57 | def write_file(filepath, data, fileclass=None, writetype="w", output=None, 58 | url=None): 59 | """ 60 | Write out a scraped data file to disk or a remote callback, 61 | specified in output parameter. 62 | """ 63 | logger.debug("[.] Writing file: %s to: %s" % (filepath, output)) 64 | if not output: 65 | return 66 | 67 | # Rest API callback mode 68 | if re.match("^https?://", output): 69 | # (b64encode) bytes -> (decode) str 70 | if type(data) == bytes: 71 | encoded = base64.b64encode(data).decode() 72 | else: 73 | encoded = base64.b64encode(bytes(data, "utf-8")).decode() 74 | payload = { 75 | "name": filepath, 76 | "data": encoded, 77 | "fileclass": fileclass, 78 | "url": url, 79 | } 80 | if fileclass: 81 | payload["fileclass"] = fileclass 82 | post_data = json.dumps(payload).encode("utf-8") 83 | headers = { 84 | "content-type": "application/json" 85 | } 86 | r = requests.post( 87 | output, data=post_data, headers=headers 88 | ) 89 | r.status_code 90 | 91 | # filesystem mode 92 | else: 93 | dirpath = os.path.dirname(filepath) 94 | if not os.path.exists(dirpath): 95 | os.makedirs(dirpath) 96 | with open(filepath, writetype) as f: 97 | f.write(data) 98 | -------------------------------------------------------------------------------- /autoscrape/util/warc.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from multiprocessing import Pool 3 | import os 4 | import pickle 5 | 6 | try: 7 | import plyvel 8 | import warcio 9 | except ModuleNotFoundError: 10 | pass 11 | 12 | 13 | logger = logging.getLogger('AUTOSCRAPE') 14 | 15 | 16 | def _warc_record_sane(record): 17 | if record.rec_type != "response": 18 | return False 19 | if not record.rec_headers.get_header('WARC-Target-URI'): 20 | return False 21 | return True 22 | 23 | 24 | def _warc_records(filename): 25 | records = [] 26 | try: 27 | with open(filename, "rb") as f: 28 | for record in warcio.ArchiveIterator(f): 29 | if not _warc_record_sane(record): 30 | continue 31 | parsed_rec = { 32 | "uri": record.rec_headers.get_header('WARC-Target-URI'), 33 | "payload": record.content_stream().read().strip(), 34 | "headers": record.http_headers.headers, 35 | } 36 | yield parsed_rec 37 | except Exception as e: 38 | logger.error("[!] 
Error opening WARC file %s" % (filename)) 39 | logger.error(e) 40 | return records 41 | 42 | 43 | def _process_warcfile(filepath, filter_domain): 44 | found = 0 45 | if not filepath.endswith(".warc.gz"): 46 | return [] 47 | logger.debug(" - Parsing %s" % (filepath)) 48 | record_number = -1 49 | results = [] 50 | for record in _warc_records(filepath): 51 | record_number += 1 52 | uri = record["uri"] 53 | if filter_domain and filter_domain not in uri: 54 | continue 55 | logger.debug("URI: %s" % (uri)) 56 | found += 1 57 | uri_bytes = bytes(uri, "utf-8") 58 | value = pickle.dumps((filepath, record_number)) 59 | results.append((uri_bytes, value)) 60 | if found: 61 | logger.debug(" - Found %s records" % (found)) 62 | return results 63 | 64 | 65 | def build_warc_index(db=None, warc_directory=None, filter_domain=None): 66 | """ 67 | Read through all WARC files in warc_directory and build 68 | an index: URL => filename, record_number 69 | """ 70 | blank = True 71 | for rec in db.iterator(): 72 | blank = False 73 | break 74 | if not blank: 75 | logger.debug("[.] Already loaded WARC index.") 76 | return 77 | logger.info("[.] Building WARC index. This might take a while...") 78 | _, _, filenames = list(os.walk(warc_directory))[0] 79 | filepaths = [(os.path.join(warc_directory, n), filter_domain) for n in filenames] 80 | print(filepaths[0]) 81 | 82 | with Pool(4) as f: 83 | results_groups = f.starmap(_process_warcfile, filepaths) 84 | for results in results_groups: 85 | for uri_bytes, value in results: 86 | db.put(uri_bytes, value) 87 | -------------------------------------------------------------------------------- /autoscrape/vectorization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/vectorization/__init__.py -------------------------------------------------------------------------------- /autoscrape/vectorization/embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import logging 3 | import re 4 | 5 | import numpy as np 6 | 7 | 8 | logger = logging.getLogger('AUTOSCRAPE') 9 | 10 | 11 | class Embedding: 12 | def __init__(self, embeddings=None, t2id=None, id2t=None): 13 | self.embeddings = embeddings 14 | self.t2id = t2id 15 | self.id2t = id2t 16 | self.N, self.dim = embeddings.shape 17 | 18 | 19 | class EmbeddingsVectorizer: 20 | def __init__(self, html_embeddings_file=None, word_embeddings_file=None, 21 | scraper=None, controller=None, loglevel=None): 22 | """ 23 | Initialize our vectorizer with paths to the relevant word 24 | embedding files for our vectorization routines. 25 | 26 | Representations: 27 | - Element text vector (fasttext) 28 | - Element path vector ( body -> form -> div ... -> button ) 29 | 30 | Given state, q-function gets action-state pairs. We take 31 | the action giving the highest q-value. 32 | """ 33 | self.scraper = scraper 34 | self.controller = controller 35 | 36 | self.html = None 37 | if html_embeddings_file: 38 | logger.debug("[.] Loading HTML embeddings") 39 | self.html = self.load_embedding(html_embeddings_file) 40 | 41 | self.word = None 42 | if word_embeddings_file: 43 | logger.debug("[.] 
Loading word embeddings") 44 | self.word = self.load_embedding(word_embeddings_file) 45 | 46 | def embeddings_length(self, path): 47 | N = 0 48 | with open(path, "r") as f: 49 | for line in f: 50 | if N == 0 and re.match("^[0-9]+\s[0-9]+$", line): 51 | continue 52 | N += 1 53 | key, data = line.split(' ', 1) 54 | vec = [float(d) for d in data.split()] 55 | dim = len(vec) 56 | return N, dim 57 | 58 | def load_embedding(self, path): 59 | logger.info("[+] Loading embedding file %s..." % path) 60 | N, dim = self.embeddings_length(path) 61 | logger.info(" - vocab size: %s, dim: %s" % ( 62 | N, dim 63 | )) 64 | logger.debug(" - Allocating embedding matrix...") 65 | # token to ID (embedding row) 66 | t2id = dict() 67 | # ID to token 68 | id2t = dict() 69 | # embedding matrix 70 | embeddings = np.zeros(shape=(N, dim)) 71 | logger.debug(" - Reading embeddings into memory...") 72 | outputs = [(N // 10) * i for i in range(10)] 73 | with open(path, "r") as f: 74 | embed_id = 0 75 | for line in f: 76 | if embed_id == 0 and re.match("^[0-9]+\s[0-9]+$", line): 77 | continue 78 | if embed_id in outputs: 79 | pct_done = (embed_id / float(N)) * 100 80 | logger.info(" - %0.4f%% complete" % (pct_done)) 81 | key, data = line.split(' ', 1) 82 | vec = [float(d) for d in data.split()] 83 | embeddings[embed_id, :] = vec 84 | t2id[key] = embed_id 85 | id2t[embed_id] = key 86 | embed_id += 1 87 | 88 | logger.debug(" - Embeddings matrix: %s x %s" % embeddings.shape) 89 | return Embedding( 90 | embeddings=embeddings, 91 | t2id=t2id, 92 | id2t=id2t, 93 | ) 94 | 95 | def html_to_vector(self, html): 96 | x = np.zeros(self.html.dim) 97 | N = 0.0 98 | for t in html: 99 | N += 1 100 | if re.match("\s", t): 101 | t = "" 102 | id = self.html.t2id[t] 103 | x += self.html.embeddings[id] 104 | return x / N 105 | 106 | def text_to_vector(self, text): 107 | x = np.zeros(self.word.dim) 108 | N = 0.0 109 | for t in re.split("[^A-Za-z]", text): 110 | t = t.strip().lower() 111 | if not t: 112 | continue 113 | N += 1 114 | if re.match("\s", t): 115 | t = "" 116 | try: 117 | id = self.word.t2id[t] 118 | except Exception as e: 119 | logger.warn("Skipping word=%s, Error=%s" % ( 120 | t, e 121 | )) 122 | continue 123 | x += self.word.embeddings[id] 124 | return x / N 125 | 126 | def element_to_position_vector(self, element): 127 | return np.array([0.0]) 128 | 129 | def vectorize(self, html, text, element=None): 130 | x_html = self.html_to_vector(html) 131 | x_text = self.text_to_vector(text) 132 | concat_array = [x_html, x_text] 133 | if element: 134 | x_pos = self.element_to_position_vector(element) 135 | concat_array.append(x_pos) 136 | x = np.concatenate(concat_array) 137 | return x 138 | 139 | def page_vector(self): 140 | raise NotImplementedError( 141 | "EmbeddingsVectorizer.page_vector not implemented" 142 | ) 143 | 144 | def form_vectors(self): 145 | raise NotImplementedError( 146 | "EmbeddingsVectorizer.form_vectors not implemented" 147 | ) 148 | 149 | def button_vectors(self): 150 | raise NotImplementedError( 151 | "EmbeddingsVectorizer.button_vectors not implemented" 152 | ) 153 | 154 | def link_vectors(self): 155 | raise NotImplementedError( 156 | "EmbeddingsVectorizer.link_vectors not implemented" 157 | ) 158 | -------------------------------------------------------------------------------- /autoscrape/vectorization/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import logging 3 | 4 | 5 | logger = logging.getLogger('AUTOSCRAPE') 6 | 7 | 8 | class 
TextVectorizer: 9 | """ 10 | # for ix, text in link_vectors 11 | link_vectors = self.control.vectorizer.link_vectors() 12 | # for ix, text in button_vectors 13 | button_data = self.control.vectorizer.button_vectors() 14 | form_vectors = self.control.vectorizer.form_vectors() 15 | """ 16 | def __init__(self, scraper=None, controller=None): 17 | self.scraper = scraper 18 | self.controller = controller 19 | 20 | def page_vector(self, html): 21 | """ 22 | Get feature vector from currently loaded page. This should 23 | be used to determine what type of page we're on and what action 24 | we ought to take (continue crawl, enter input, scrape structured 25 | data, etc). 26 | """ 27 | return self.scraper.element_text(None, block=True) 28 | 29 | def form_vectors(self): 30 | """ 31 | Get a feature vector representing the forms on a page. This ought 32 | to be used in cases where the model indicates the page may be a 33 | search page, but where there are multiple forms. Or where you 34 | just want to determine if a form is interactive data search. 35 | Another alternative strategy would be to try the search and then 36 | look at the next page. 37 | """ 38 | logger.debug("[.] Loading form vectors") 39 | form_data = [] 40 | for tag in self.controller.forms: 41 | form = self.scraper.element_by_tag(tag) 42 | txt = self.scraper.element_text(form, block=True) 43 | form_data.append(txt) 44 | return form_data 45 | 46 | def button_vectors(self): 47 | logger.debug("[.] Building button vectors") 48 | buttons_data = [] 49 | for tag in self.controller.scraper.get_buttons(): 50 | elem = self.scraper.element_by_tag(tag) 51 | value = "" 52 | if elem is not None: 53 | value = self.scraper.element_value(elem) 54 | text = [] 55 | if value: 56 | text.append(value) 57 | if elem is not None: 58 | text.append(self.scraper.element_text(elem)) 59 | logger.debug(" - button value: %s, text: %s" % (value, text)) 60 | buttons_data.append(" ".join(text)) 61 | return buttons_data 62 | 63 | def link_vectors(self): 64 | """ 65 | Get a matrix of link vectors. These describe the text of the link 66 | in a way that a ML algorithm could decide how to prioritize the 67 | search pattern. 68 | """ 69 | logger.debug("[.] Building link vectors") 70 | buttons_data = [] 71 | for t in self.controller.scraper.get_clickable(): 72 | elem = self.scraper.element_by_tag(t) 73 | tag_name = self.scraper.element_tag_name(elem) 74 | text = "" 75 | if elem is None: 76 | logger.warn("[!] Link element couldn't be found: %s" % t) 77 | elif tag_name != "input": 78 | text = self.scraper.element_text(elem).replace("\n", " ") 79 | elif tag_name == "input": 80 | value = self.scraper.element_attr(elem, "value") 81 | text = value.replace("\n", " ") 82 | buttons_data.append(text) 83 | return buttons_data 84 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Bring up all the pieces necessary to run the workbench 2 | # Data persists in Docker volumes and in local dir 3 | 4 | # This file passes through all necessary env variables to requisite 5 | # Docker containers and makes them available when running commands via 6 | # `docker exec`. 
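#
# A typical local bring-up (a sketch; assumes Docker and docker-compose are
# installed) is:
#
#   docker-compose up --build flask celery
#
# which also starts the database and rabbitmq services they depend on and
# exposes the Flask API on http://localhost:5000.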
7 | 8 | version: '3.4' 9 | 10 | services: 11 | database: 12 | image: postgres:10.10 13 | environment: 14 | POSTGRES_USER: autoscrape 15 | POSTGRES_PASSWORD: autoscrape 16 | POSTGRES_DB: autoscrape 17 | PGDATA: /var/lib/postgresql/data/10.10 18 | networks: [ 'dev' ] 19 | volumes: 20 | - dbdata:/var/lib/postgresql/data 21 | 22 | rabbitmq: 23 | image: rabbitmq:3.7.8-management 24 | ports: [ '15672' ] # open management port, for debugging 25 | networks: [ 'dev' ] 26 | environment: 27 | # Use just one CPU 28 | RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS: '+S 1:1 +stbt ts +A 12' 29 | 30 | flask: 31 | build: 32 | context: . 33 | target: base 34 | volumes: 35 | - ./:/app:rw 36 | - virtualenvs:/root/.local/share/virtualenvs/:rw 37 | command: [ 'python3', 'autoscrape-server.py' ] 38 | ports: 39 | - '5000:5000' 40 | links: 41 | - rabbitmq 42 | depends_on: [ 'rabbitmq', 'database' ] 43 | networks: [ 'dev' ] 44 | environment: 45 | AUTOSCRAPE_RABBITMQ_HOST: amqp://guest:guest@rabbitmq/ 46 | AUTOSCRAPE_DB_HOST: database 47 | AUTOSCRAPE_DB_USER: autoscrape 48 | AUTOSCRAPE_DB_PASSWORD: autoscrape 49 | 50 | celery: 51 | build: 52 | context: . 53 | target: base 54 | shm_size: 6GB 55 | volumes: 56 | - ./:/app:rw 57 | - virtualenvs:/root/.local/share/virtualenvs/:rw 58 | command: [ 'celery', '-c', '1', '-A', 'autoscrape.tasks', 'worker', '--loglevel=info' ] 59 | #user: nobody 60 | links: 61 | - rabbitmq 62 | depends_on: [ 'rabbitmq', 'flask', 'database' ] 63 | networks: [ 'dev' ] 64 | environment: 65 | AUTOSCRAPE_RABBITMQ_HOST: amqp://guest:guest@rabbitmq/ 66 | AUTOSCRAPE_DB_HOST: database 67 | AUTOSCRAPE_DB_USER: autoscrape 68 | AUTOSCRAPE_DB_PASSWORD: autoscrape 69 | 70 | networks: 71 | dev: 72 | driver: bridge 73 | 74 | volumes: 75 | virtualenvs: {} 76 | dbdata: {} 77 | 78 | -------------------------------------------------------------------------------- /extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | """ 4 | Autoscrape Extractor - A wrapper around Hext for 5 | walking a directory and extracting all structured 6 | data using a provided Hext template. 7 | 8 | Hext templates can be created using the JavaScript 9 | UI found in ./hext_builder_ui. 10 | 11 | Usage: 12 | extract.py [options] 13 | 14 | Options: 15 | --output-file FILENAME 16 | By default, all output will be printed to stdout. 17 | This option directs all output to a specified file. 
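    Example (hypothetical invocation: the positional arguments for the Hext
    template and the scraped-HTML directory are assumptions, since they are
    not part of the usage text above):

        ./extract.py civil-results.hext autoscrape-data/data_pages \
            --output-file results.json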
18 | """ 19 | from docopt import docopt 20 | import html5lib 21 | import hext 22 | 23 | 24 | def parse_html_file(filepath): 25 | with open(filepath, "r") as f: 26 | html = f.read() 27 | return html5lib.parse( 28 | html, treebuilder='lxml', namespaceHTMLElements=False 29 | ) 30 | 31 | 32 | if __name__ == "__main__": 33 | docopt_args = docopt(__doc__) 34 | 35 | option = None 36 | if "build-template" in docopt_args.keys(): 37 | option = "build-template" 38 | docopt_args.pop("build-template") 39 | elif "extract" in docopt_args.keys(): 40 | option = "extract" 41 | docopt_args.pop("extract") 42 | 43 | # strip the -- and convert - to _, remove <> 44 | args = {} 45 | for option in docopt_args: 46 | args[option[2:].replace( 47 | '<', '' 48 | ).replace( 49 | '>', '' 50 | ).replace( 51 | '-', '_' 52 | )] = docopt_args[option] 53 | 54 | if option == "extract": 55 | # TODO: walk directory, feed files to below: 56 | rule = hext.Rule(strhext) 57 | document = hext.Html(strhtml) 58 | result = rule.extract(document) 59 | -------------------------------------------------------------------------------- /images/ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/images/ai.png -------------------------------------------------------------------------------- /images/code_embeddings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/images/code_embeddings.png -------------------------------------------------------------------------------- /images/extraction of code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/images/extraction of code.png -------------------------------------------------------------------------------- /images/k-NN_small_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/images/k-NN_small_data.png -------------------------------------------------------------------------------- /images/quickstart-video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/images/quickstart-video.png -------------------------------------------------------------------------------- /requirements.api.txt: -------------------------------------------------------------------------------- 1 | Flask>=1.0.2 2 | psycopg2-binary 3 | SQLAlchemy>=1.3.3 4 | sqlalchemy-utils>=0.33.11 5 | flask-sqlalchemy>=2.3.2 6 | celery>=4.4.0 7 | -------------------------------------------------------------------------------- /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | nose==1.3.7 2 | torchvision==0.2.1 3 | matplotlib==2.2.2 4 | scikit-learn==0.19.2 5 | scipy==1.1.0 6 | html2text==2018.1.9 7 | warcio>=1.7.3,<2.0.0 8 | pycld2==0.41 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | selenium>=3.141.0,<4.0.0 2 | lxml>=4.3.0 3 | html5lib>=1.0.1 4 | 
webencodings>=0.5.1 5 | docopt>=0.6.2 6 | networkx>=2.2 7 | numpy>=1.15.0 8 | cssselect>=1.1.0 9 | requests>=2.22.0 10 | lxml>=4.3.0 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from os import path 3 | import setuptools 4 | 5 | 6 | def get_long_description(): 7 | BASEDIR = path.abspath(path.dirname(__file__)) 8 | with open(path.join(BASEDIR, 'README.rst'), encoding='utf-8') as f: 9 | return f.read() 10 | 11 | 12 | setuptools.setup( 13 | name='autoscrape', 14 | version='1.6.14', 15 | description='An automated, programming-free web scraper for interactive sites', 16 | long_description=get_long_description(), 17 | author='Brandon Roberts', 18 | author_email='brandon@bxroberts.org', 19 | url='https://github.com/brandonrobertz/autoscrape-py', 20 | license='AGPLv3', 21 | classifiers=[ 22 | 'Development Status :: 3 - Alpha', 23 | 'Environment :: Console', 24 | 'Intended Audience :: Developers', 25 | 'Intended Audience :: End Users/Desktop', 26 | 'Intended Audience :: Science/Research', 27 | 'License :: OSI Approved :: GNU Affero General Public License v3', 28 | 'Natural Language :: English', 29 | 'Operating System :: OS Independent', 30 | 'Programming Language :: Python', 31 | 'Programming Language :: Python :: 3.5', 32 | 'Programming Language :: Python :: 3.6', 33 | 'Programming Language :: Python :: 3.7', 34 | 'Programming Language :: Python :: 3.8', 35 | 'Programming Language :: Python :: Implementation :: CPython', 36 | 'Programming Language :: Python :: Implementation :: PyPy', 37 | 'Topic :: Scientific/Engineering :: Information Analysis', 38 | 'Topic :: Software Development :: Libraries :: Python Modules', 39 | 'Topic :: Utilities' 40 | ], 41 | packages=[ 42 | 'autoscrape', 43 | 'autoscrape.cli', 44 | 'autoscrape.util', 45 | 'autoscrape.backends', 46 | 'autoscrape.backends.base', 47 | 'autoscrape.backends.selenium', 48 | 'autoscrape.backends.requests', 49 | 'autoscrape.backends.warc', 50 | 'autoscrape.scrapers', 51 | 'autoscrape.search', 52 | 'autoscrape.vectorization', 53 | ], 54 | entry_points={ 55 | 'console_scripts': [ 56 | 'autoscrape = autoscrape.cli.scrape:main', 57 | ] 58 | }, 59 | install_requires=[ 60 | 'lxml>=4.3.0', 61 | 'html5lib>=1.0.1', 62 | 'docopt>=0.6.2', 63 | 'cssselect>=1.1.0', 64 | 'requests>=2.22.0', 65 | ], 66 | extras_require={ 67 | 'selenium-backend': [ 68 | 'selenium>=3.141.0,<4.0.0', 69 | ], 70 | 'warc-backend': [ 71 | 'warcio>=1.7.3,<2.0.0', 72 | 'plyvel==1.2.0', 73 | ], 74 | 'embeddings-vectorizer': [ 75 | 'numpy>=1.15.0' 76 | ], 77 | 78 | 'graph': [ 79 | 'networkx>=2.2', 80 | ], 81 | 'all': [ 82 | 'selenium>=3.141.0,<4.0.0', 83 | 'networkx>=2.2', 84 | 'numpy>=1.15.0', 85 | 'warcio>=1.7.3,<2.0.0', 86 | 'plyvel==1.2.0', 87 | ], 88 | } 89 | ) 90 | -------------------------------------------------------------------------------- /tests/common.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_BASE="autoscrape-data-tests" 2 | BENCHMARK_LOG="${OUTPUT_BASE}/backend-benchmark.log" 3 | BACKENDS="selenium requests" 4 | TIME="$(which time) -p -a -o ${BENCHMARK_LOG}" 5 | 6 | AUTOSCRAPE="./autoscrape.py --save-graph --loglevel DEBUG --save-screenshots" 7 | 8 | mkdir -p ${OUTPUT_BASE} 9 | 10 | die () { 11 | echo "${*}" 12 | exit 1 13 | } 14 | 15 | add_benchmark_header () { 16 | benchmark="${1}" 17 | crawlname="${2}" 18 | echo "Running ${benchmark} ${crawlname}" | tee -a 
${BENCHMARK_LOG} 19 | } 20 | 21 | add_separator () { 22 | echo "======================================================================" 23 | } 24 | 25 | -------------------------------------------------------------------------------- /tests/crawltest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source tests/common.sh 4 | 5 | for backend in ${BACKENDS}; do 6 | output="${OUTPUT_BASE}/autoscrape-data-${backend}" 7 | rm -rf ${output} 8 | add_separator 9 | add_benchmark_header ${backend} "crawl" 10 | ${TIME} ${AUTOSCRAPE} \ 11 | --backend ${backend} \ 12 | --output ${output} \ 13 | https://bxroberts.org \ 14 | || die "Backend ${backend} failed crawling." 15 | echo "${backend} crawl complete!" 16 | done 17 | 18 | -------------------------------------------------------------------------------- /tests/data/test_page.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | Civil Search 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 25 | 26 | 27 | 28 | 29 | 30 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 52 | 53 | 54 | 55 | 56 | 57 |
[Remainder of the test fixture omitted: lines 58-96 are an HTML search form whose tags were stripped in this dump; the only surviving text nodes are "Civil Search Help", "Civil Search", and the "Civil Records Search" heading.]
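# How the shell test suites above are typically run (a sketch; assumes the
# package and a webdriver for the selenium backend are installed):
#
#   bash tests/crawltest.sh
#
# Timing for each backend listed in BACKENDS is appended to
# autoscrape-data-tests/backend-benchmark.log by the ${TIME} wrapper defined
# in tests/common.sh.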