├── .dockerignore
├── .gitignore
├── .gitmodules
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── autoscrape-server.py
├── autoscrape.py
├── autoscrape
    ├── __init__.py
    ├── backends
    │   ├── __init__.py
    │   ├── base
    │   │   ├── __init__.py
    │   │   ├── browser.py
    │   │   ├── dom.py
    │   │   ├── graph.py
    │   │   └── tags.py
    │   ├── requests
    │   │   ├── __init__.py
    │   │   ├── browser.py
    │   │   ├── dom.py
    │   │   └── tags.py
    │   ├── selenium
    │   │   ├── __init__.py
    │   │   ├── browser.py
    │   │   ├── dom.py
    │   │   └── tags.py
    │   └── warc
    │   │   ├── __init__.py
    │   │   ├── browser.py
    │   │   ├── dom.py
    │   │   └── tags.py
    ├── classification.py
    ├── cli
    │   ├── __init__.py
    │   └── scrape.py
    ├── control.py
    ├── filetypes.py
    ├── input_parser.py
    ├── scrapers
    │   ├── __init__.py
    │   ├── manual.py
    │   ├── null.py
    │   └── test.py
    ├── search
    │   ├── __init__.py
    │   ├── bfs.py
    │   └── graph.py
    ├── tasks.py
    ├── util
    │   ├── __init__.py
    │   └── warc.py
    └── vectorization
    │   ├── __init__.py
    │   ├── embeddings.py
    │   └── text.py
├── docker-compose.yml
├── extract.py
├── images
    ├── ai.png
    ├── code_embeddings.png
    ├── extraction of code.png
    ├── k-NN_small_data.png
    └── quickstart-video.png
├── requirements.api.txt
├── requirements.dev.txt
├── requirements.txt
├── setup.py
├── tests
    ├── common.sh
    ├── crawltest.sh
    ├── data
    │   ├── test_page.html
    │   └── test_page_large.cleaned.html
    ├── formsubmittest.sh
    ├── run_e2e_tests.sh
    ├── tag_test_data_page.html
    ├── tags.py
    ├── test_extractor.py
    ├── test_input_parser.py
    ├── test_tag_generation.py
    └── warctest.sh
├── tox.ini
├── train.py
└── vectorize_data.py


/.dockerignore:
--------------------------------------------------------------------------------
 1 | .git/
 2 | *.py[cod]
 3 | *$py.class
 4 | .cache/
 5 | *.pickle
 6 | .idea/
 7 | .DS_Store
 8 | __pycache__
 9 | 
10 | *.pyc
11 | *.swp
12 | *.swo
13 | *.7z
14 | *.log
15 | 
16 | # default output directory
17 | autoscrape-data/
18 | 
19 | # Algorithmic/Training Data-Related Files
20 | # Currently stored in a separate git repo (private, currently)
21 | # Will be released once ready, minimized, over LFS
22 | deps/
23 | training_data/
24 | *.pickle
25 | 
26 | # python virtual environment
27 | venv
28 | 
29 | # webdriver remote settings
30 | nodeConfig.json
31 | 
32 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | *.pyc
 3 | *.swp
 4 | *.swo
 5 | *.7z
 6 | *.log
 7 | graph*.dot
 8 | .idea
 9 | .#*
10 | .python-version
11 | 
12 | # default output directory
13 | autoscrape-data*/
14 | 
15 | # Algorithmic/Training Data-Related Files
16 | # Currently stored in a separate git repo (private, currently)
17 | # Will be released once ready, minimized, over LFS
18 | deps/
19 | training_data/
20 | *.pickle
21 | 
22 | # python virtual environment
23 | venv
24 | 
25 | # webdriver remote settings
26 | nodeConfig.json
27 | 
28 | # python setup tools
29 | build/
30 | dist/
31 | autoscrape_py.egg-info/
32 | autoscrape.egg-info/
33 | 
34 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "www"]
2 | 	path = www
3 | 	url = https://github.com/brandonrobertz/autoscrape-www
4 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.7.4-slim-buster AS deps
 2 | 
 3 | # Install the Python deps (common across worker & web server, for now)
 4 | RUN mkdir /app
 5 | WORKDIR /app
 6 | 
 7 | # Install Firefox deps (and curl, xvfb, vnc). Debian Buster has Firefox v68;
 8 | # we'll install its dependencies and hope they satisfy _our_ Firefox version.
 9 | RUN apt-get update \
10 |     && bash -c 'apt-get install -y --no-install-recommends $(apt-cache depends firefox-esr | awk "/Depends:/{print\$2}")' \
11 |     && apt-get install --no-install-recommends -y \
12 |         curl \
13 |         wget \
14 |         xauth \
15 |         xvfb \
16 |         xz-utils \
17 |         bzip2 \
18 |         postgresql-client \
19 |     && rm -rf /var/lib/apt/lists/*
20 | 
21 | # Install Firefox. It's a separate step so it's easier to resume docker build.
22 | RUN curl -L https://download-installer.cdn.mozilla.net/pub/firefox/releases/64.0.2/linux-x86_64/en-US/firefox-64.0.2.tar.bz2 \
23 |         | tar jx -C /opt \
24 |         && ln -s /opt/firefox/firefox /usr/bin/firefox
25 | 
26 | # Install geckodriver. It's a separate step so it's easier to resume docker build.
27 | RUN curl -L https://github.com/mozilla/geckodriver/releases/download/v0.23.0/geckodriver-v0.23.0-linux64.tar.gz \
28 |         | tar zx -C /usr/bin/ \
29 |         && chmod +x /usr/bin/geckodriver
30 | 
31 | # Install the Python deps we use for integration tests.
32 | #
33 | # Integration tests don't rely on the Django stack, and that makes this
34 | # Dockerfile compile faster and cache better.
35 | #RUN pip install psycopg2-binary capybara-py selenium minio
36 | 
37 | FROM deps AS pydeps
38 | 
39 | COPY requirements.txt /app/
40 | RUN pip install -r /app/requirements.txt
41 | 
42 | COPY requirements.api.txt /app/
43 | RUN pip install -r /app/requirements.api.txt
44 | 
45 | FROM pydeps as base
46 | 
47 | COPY autoscrape/ /app/autoscrape/
48 | 
49 | # Flask API server
50 | COPY autoscrape-server.py /app/
51 | 
52 | # Build AutoScrape WWW
53 | # Install Node.js
54 | 
55 | RUN \
56 |   cd /tmp && \
57 |   curl https://nodejs.org/dist/v12.16.1/node-v12.16.1-linux-x64.tar.xz -o node-js.tar.xz && \
58 |   tar xvf node-js.tar.xz && \
59 |   rm -f node-js.tar.xz && \
60 |   cp -rfv node-v*/* / && \
61 |   rm -rf /tmp/node-* && \
62 |   npm install -g npm && \
63 |   printf '\n# Node.js\nexport PATH="node_modules/.bin:$PATH"' >> /root/.bashrc
64 | 
65 | # Remember: this is a git submodule!
66 | COPY www/ /app/www/
67 | RUN echo REACT_APP_API_HOST="http://localhost:5000" >> .env
68 | RUN cd /app/www && npm install && npm run download-hextractor && npm run build
69 | 
70 | FROM autoscrape-worker-deps AS autoscrape-worker
71 | CMD [ "celery", "-A", "autoscrape.tasks", "worker", "--loglevel=info" ]
72 | 
73 | FROM autoscrape-server-deps AS autoscrape-server
74 | EXPOSE 5000
75 | CMD [ "python", "autoscrape-server.py" ]
76 | 
77 | FROM rabbitmq:3.7.8-management as rabbitmq
78 | EXPOSE 15672
79 | 
80 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
3 | recursive-include docs *
4 | recursive-include examples *
5 | recursive-include tests *.py
6 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | start:
 2 | 	docker-compose down
 3 | 	#docker-compose build --pull
 4 | 	docker-compose up --build -t0 --abort-on-container-exit --renew-anon-volumes
 5 | 
 6 | quickstart:
 7 | 	docker-compose up -t0 --abort-on-container-exit
 8 | 
 9 | build:
10 | 	docker-compose build --pull
11 | 
12 | stop:
13 | 	docker-compose down -t0
14 | 	docker-compose rm -f -v # -v removes _anonymous_ volumes
15 | 
16 | clean: stop
17 | 	if docker ps -a -q; then \
18 | 		docker rm -f $$(docker ps -a -q) || exit 0; \
19 | 	fi
20 | 	if docker images -q; then \
21 | 		docker rmi -f $$(docker images -q) || exit 0; \
22 | 	fi
23 | 	docker volume rm -f $$(docker volume ls | awk '{ print $$2 }')
24 | 
25 | run_rabbitmq:
26 | 	docker run -it --rm --name rabbitmq -p 5672:5672 -p 15672:15672 rabbitmq:3-management
27 | 
28 | update_www:
29 | 	git submodule init && git submodule update && cd www && git fetch && git reset origin/master --hard
30 | 
31 | commit_www:
32 | 	git add www && git commit -m "Update www"
33 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | AutoScrape
  2 | ==========
  3 | 
  4 | .. image:: https://pypip.in/v/autoscrape/badge.svg
  5 |         :target: https://pypi.python.org/pypi/autoscrape/
  6 | 
  7 | .. image:: https://pypip.in/license/autoscrape/badge.svg
  8 |         :target: https://pypi.python.org/pypi/autoscrape/
  9 | 
 10 | 
 11 | .. figure:: https://github.com/brandonrobertz/autoscrape-py/blob/master/images/ai.png
 12 |    :alt: Artificial Informer Labs
 13 | 
 14 | A project of `Artificial Informer Labs <https://artificialinformer.com>`__.
 15 | 
 16 | AutoScrape is an automated scraper of structured data from interactive
 17 | web pages. You point this scraper at a site, give it a little information
 18 | and structured data can then be extracted. No brittle, site-specific
 19 | programming necessary.
 20 | 
 21 | This is an implementation of the web scraping framework described in the
 22 | paper, `Robust Web Scraping in the Public Interest with AutoScrape <https://bxroberts.org/files/autoscrape.pdf>`__ and presented at
 23 | `Computation + Journalism Symposium 2019 <http://cplusj.org/>`__. This is
 24 | an experimental work in progress!
 25 | 
 26 | Currently there are a few ways to use  AutoScrape:
 27 | 
 28 | - via a full Web interface for scraping (see bottom of page, make sure to pull in the submodule!)
 29 | - as a local CLI python script
 30 | - as a simplified web scraping framework
 31 | 
 32 | Installation and running instructions are provided for both below.
 33 | 
 34 | Quickstart
 35 | ----------
 36 | 
 37 | Two ways, easiest first.
 38 | 
 39 | ::
 40 | 
 41 |     pip install autoscrape[all]
 42 |     autoscrape --backend requests --output outdir --maxdepth 2 https://bxroberts.org
 43 | 
 44 | This will install all dependencies for all backends and various options.
 45 | 
 46 | Or:
 47 | 
 48 | ::
 49 | 
 50 |     git clone https://github.com/brandonrobertz/autoscrape-py
 51 |     cd autoscrape-py/
 52 |     pip install .[all]
 53 |     autoscrape --backend requests --output outdir --maxdepth 2 https://bxroberts.org
 54 | 
 55 | Either way, you can now use ``autoscrape`` from the command line.
 56 | 
 57 | Usage Examples
 58 | --------------
 59 | 
 60 | Here are some straightforward use cases for AutoScrape and how you'd use
 61 | the CLI tool to execute them. These, of course, assume you have the
 62 | dependencies installed.
 63 | 
 64 | Crawler Backends
 65 | ~~~~~~~~~~~~~~~~
 66 | 
 67 | There are two backends available for driving AutoScrape: ``requests``,
 68 | ``selenium`` and ``warc``. The ``requests`` backend (the default) is based on the
 69 | Python requests library and is only capable of crawling sites and submitting
 70 | simple HTTP forms. For any interaction with forms or JavaScript powered
 71 | buttons, you'll need to use the ``selenium`` backend.
 72 | 
 73 | You can control the backened with the ``--backend`` option:
 74 | 
 75 | ::
 76 | 
 77 |     autoscrape \
 78 |       --backend requests \
 79 |       --output requests_crawled_site \
 80 |       'https://some.page/to-crawl'
 81 | 
 82 | In order to use backends other than requests, you need to install
 83 | the proper dependencies. `pip install autoscrape[all]` will
 84 | install everything required for all backends/functionality, but
 85 | you can also install dependencies in isolation:
 86 | 
 87 | ::
 88 |     Selenium backend:
 89 |     pip install autoscrape[selenium-backend]
 90 | 
 91 |     Crawl graph builder (for use in --save-graph)
 92 |     pip install autoscrape[graph]
 93 | 
 94 |     WARC backend:
 95 |     pip install autoscrape[warc-backend]
 96 | 
 97 | Note that for the Selenium backend, you need to install geckodriver or
 98 | chromedriver, depending if you're using Firefox or Chrome, respectively.
 99 | More information is below in the External Dependencies section.
100 | 
101 | Crawl
102 | ~~~~~
103 | 
104 | Crawl an entire website, saving all HTML and stylesheets (no
105 | screenshots):
106 | 
107 | ::
108 | 
109 |     autoscrape \
110 |       --backend requests \
111 |       --maxdepth -1 \
112 |       --output crawled_site \
113 |       'https://some.page/to-crawl'
114 | 
115 | Archive Page (Screenshot & Code)
116 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
117 | 
118 | Archive a single webpage, both code and full-content screenshot (PNG),
119 | for future reference:
120 | 
121 | ::
122 | 
123 |     autoscrape \
124 |       --backend selenium \
125 |       --full-page-screenshots \
126 |       --load-images --maxdepth 0 \
127 |       --save-screenshots --driver Firefox \
128 |       --output archived_webpage \
129 |       'https://some.page/to-archive'
130 | 
131 | Search Forms and Crawl Result Pages
132 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
133 | 
134 | Query a web form, identified by containing the text "I'm a search form",
135 | entering "NAME" into the first (0th) text input field and select January
136 | 20th, 1992 in the second (1st) date field. Then click all buttons with
137 | the text "Next ->" to get all results pages:
138 | 
139 | ::
140 | 
141 |     autoscrape \
142 |       --backend selenium \
143 |       --output search_query_data \
144 |       --form-match "I'm a search form" \
145 |       --input "i:0:NAME,d:1:1992-01-20" \
146 |       --next-match "Next ->" \
147 |       'https://some.page/search?s=newquery'
148 | 
149 | Setup for Standalone Local CLI
150 | ------------------------------
151 | 
152 | External Dependencies
153 | ~~~~~~~~~~~~~~~~~~~~~
154 | 
155 | If you want to use the ``selenium`` backend for interactive crawling,
156 | you need to have geckodriver installed. You can do that here:
157 | 
158 | ::
159 | 
160 |     https://github.com/mozilla/geckodriver/releases
161 | 
162 | Or through your package manager:
163 | 
164 | ::
165 |     apt install firefox-geckodriver
166 | 
167 | Your ``geckodriver`` needs to be compatible with your current version of
168 | Firefox or you will get errors. If you install FF and the driver
169 | through your package manager, you *should* be okay, but it's
170 | not guaranteed. We have specific versions of both pinned in the
171 | ``Dockerfile``.
172 | 
173 | If you prefer to use Chrome, you will need the ChromeDriver (we've
174 | tested using v2.41). It can be found in your distribution's package
175 | manager or here:
176 | 
177 | ::
178 | 
179 |     https://sites.google.com/a/chromium.org/chromedriver/downloads
180 | 
181 | Installing the remaining Python dependencies can be done using pip.
182 | 
183 | Pip Install Method
184 | ~~~~~~~~~~~~~~~~~~
185 | 
186 | Next you need to set up your python virtual environment (Python 3.6
187 | required) and install the Python dependencies:
188 | 
189 | ::
190 | 
191 |     pip install -r requirements.txt
192 | 
193 | Running Standalone Scraper
194 | --------------------------
195 | 
196 | Environment Test Crawler
197 | ~~~~~~~~~~~~~~~~~~~~~~~~
198 | 
199 | You can run a test to ensure your webdriver is set up correctly by
200 | running the ``test`` crawler:
201 | 
202 | ::
203 | 
204 |     ./autoscrape --backend selenium --show-browser [SITE_URL]
205 | 
206 | The ``test`` crawler will just do a depth-first click-only crawl of an
207 | entire website. It will not interact with forms or POST data. Data will
208 | be saved to ``./autoscrape-data/`` (the default output directory).
209 | 
210 | Manual Config-Based Scraper
211 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
212 | 
213 | Autoscrape has a manually controlled mode, similar to wget, except this
214 | uses interactive capabilities and can input data to search forms, follow
215 | "next page"-type buttons, etc. This functionality can be used either as
216 | a standalone crawler/scraper or as a method to build a training set for
217 | the automated scrapers.
218 | 
219 | Autoscrape manual-mode full options:
220 | 
221 | ::
222 | 
223 |     AUTOSCRAPE - Interactively crawl, find searchable forms,
224 |     input data to them and scrape data on the results, from an
225 |     initial BASEURL.
226 | 
227 |     Usage:
228 |         autoscrape [options] BASEURL
229 | 
230 |     General Options:
231 |         --backend BACKEND
232 |             The backend to use. Currently one of "selenium", "requests" or
233 |             "warc".  The requests browser is only capable of crawling, but
234 |             is approximately 2-3.5x faster. WARC is for emulating browsing
235 |             through Common Crawl archival data.
236 |             [default: selenium]
237 | 
238 |         --loglevel LEVEL
239 |             Loglevel, note that DEBUG is extremely verbose.
240 |             [default: INFO]
241 | 
242 |         --quiet
243 |             This will silence all logging to console.
244 | 
245 |     Crawl-Specific Options:
246 |         --maxdepth DEPTH
247 |             Maximum depth to crawl a site (in search of form
248 |             if the option --form-match STRING is specified,
249 |             see below). Setting to 0 means don't crawl at all,
250 |             all operations are limited to the BASEURL page.
251 |             Setting to -1 means unlimited maximum crawl depth.
252 |             [default: 10]
253 | 
254 |         --max-pages NUM
255 |             Maximum number of unique pages, in total, to fetch.
256 |             AutoScrape will stop crawling once this is hit.
257 | 
258 |         --leave-host
259 |             By default, autoscrape will not leave the host given
260 |             in the BASEURL. This option lets the scraper leave
261 |             the host.
262 | 
263 |         --only-links MATCH_STREING
264 |             A whitelist of links to follow. All others will
265 |             be ignored. Can be a string or a regex with
266 |             multiple strings to match separated by a pipe
267 |             (|) character.
268 | 
269 |         --ignore-links MATCH_STRING
270 |             This option can be used to remove any links matching
271 |             MATCH_STRING (can be a regex or just a string match)
272 |             from consideration for clicking. Accepts the same
273 |             argument format as --only-links.
274 | 
275 |         --link-priority SORT_STRING
276 |             A string to sort the links by. In this case, any link
277 |             containing "SORT_STRING" will be clicked before any other
278 |             links. In most cases you probably want to use the
279 |             whitelist, --only-links, option.
280 | 
281 |         --ignore-extensions IGNORE_EXTENSIONS
282 |             Don't click on or download URLs pointing to files with
283 |             these extensions.
284 | 
285 |         --result-page-links MATCH_STRINGS_LIST
286 |             If specified, AutoScrape will click on any links matching
287 |             this string when it arrives on a search result page.
288 | 
289 |     Interactive Form Search Options:
290 |         --form-match SEARCH_STRING
291 |             The crawler will identify a form to search/scrape if it
292 |             contains the specified string. If matched, it will be
293 |             interactively scraped using the below instructions.
294 | 
295 |         --input INPUT_DESCRIPTION
296 |             Interactive search descriptor. This describes how to
297 |             interact with a matched form. The inputs are
298 |             described in the following format:
299 | 
300 |             "c:0:True,i:0:atext,s:1:France:d:0:1991-01-20"
301 | 
302 |             A single-input type can be one of three types:
303 |             checkbox ("c"), input box ("i"), option select
304 |             ("s"), and date inputs ("d", with inputs in the
305 |             "YYYY-MM-DD" format). The type is separated by a
306 |             colon, and the input index position is next. (Each
307 |             input type has its own list, so a form with one
308 |             input, one checkbox, and one option select, will all
309 |             be at index 0.) The final command, sepearated by
310 |             another colon, describes what to do with the input.
311 | 
312 |             Multiple inputs are separated by a comma, so you can
313 |             interact with multiple inputs before submitting the
314 |             form.
315 | 
316 |             To illustrate this, the above command does the following:
317 |                 - first input checkbox is checked (uncheck is False)
318 |                 - first input box gets filled with the string "first"
319 |                 - second select input gets the "France" option chosen
320 |                 - first date input gets set to Jan 20, 1991
321 | 
322 |         --next-match NEXT_BTN_STRING
323 |             A string to match a "next" button with, after
324 |             searching a form.  The scraper will continue to
325 |             click "next" buttons after a search until no matches
326 |             are found, unless limited by the --formdepth option
327 |             (see below). [default: next page]
328 | 
329 |         --formdepth DEPTH
330 |             How deep the scraper will iterate, by clicking
331 |             "next" buttons. Zero means infinite depth.
332 |             [default: 0]
333 | 
334 |         --form-submit-natural-click
335 |             Some webpages make clicking a link element difficult
336 |             due to JavaScript onClick events. In cases where a
337 |             click does nothing, you can use this option to get
338 |             the scraper to emulate a mouse click over the link's
339 |             poition on the page, activating any higher level JS
340 |             interactions.
341 | 
342 |         --form-submit-wait SECONDS
343 |             How many seconds to force wait after a submit to a form.
344 |             This should be used in cases where the builtin
345 |             wait-for-page-load isn't working properly (JS-heavy
346 |             pages, etc). [default: 5]
347 | 
348 |     Webdriver-Specific and General Options:
349 |         --load-images
350 |             By default, images on a page will not be fetched.
351 |             This speeds up scrapes on sites and lowers bandwidth
352 |             needs. This option fetches all images on a page.
353 | 
354 |         --show-browser
355 |             By default, we hide the browser during operation.
356 |             This option displays a browser window, mostly
357 |             for debugging purposes.
358 | 
359 |         --driver DRIVER
360 |             Which browser to use. Current support for "Firefox",
361 |             "Chrome", and "remote". [default: Firefox]
362 | 
363 |         --browser-binary PATH_TO_BROWSER
364 |             Path to a specific browser binary. If left blank
365 |             selenium will pull the browser found on your path.
366 | 
367 |         --remote-hub URI
368 |             If using "remote" driver, specify the hub URI to
369 |             connect to. Needs the proto, address, port, and path.
370 |             [default: http://localhost:4444/wd/hub]
371 | 
372 |     WARC Options:
373 |         --warc-directory PATH_TO_WARCS
374 |             Path to the folder containing GZipped WARC files. These can be
375 |             downloaded from Common Crawl. Required when using the "warc"
376 |             backend.
377 | 
378 |         --warc-index-file PATH_TO_LEVELDB
379 |             Path to the level DB database holding the URL-to-file
380 |             index: URL => (filename, record_number)
381 |             This will be generated from the WARCS in the --warc-directory
382 |             speficied if it's not already. Required when using the "warc"
383 |             backend.
384 | 
385 |     Data Saving Options:
386 |         --output DIRECTORY_OR_URL
387 |             If specified, this indicates where to save pages during a
388 |             crawl. This directory will be created if it does not
389 |             currently exist.  This directory will have several
390 |             sub-directories that contain the different types of pages
391 |             found (i.e., search_pages, data_pages, screenshots).
392 |             This can also accept a URL (i.e., http://localhost:5000/files)
393 |             and AutoScrape will POST to that endpoint with each
394 |             file scraped.
395 |             [default: autoscrape-data]
396 | 
397 |         --keep-filename
398 |             By default, we hash the files in a scrape in order to
399 |             account for dynamic content under a single-page app
400 |             (SPA) website implmentation. This option will force
401 |             the scraper to retain the original filename, from the
402 |             URL when saving scrape data.
403 | 
404 |         --save-screenshots
405 |             This option makes the scraper save screenshots of each
406 |             page, interaction, and search. Screenshots will be
407 |             saved to the screenshots folder of the output dir.
408 | 
409 |         --full-page-screenshots
410 |             By default, we only save the first displayed part of the
411 |             webpage. The remaining portion that you can only see
412 |             by scrolling down isn't captured. Setting this option
413 |             forces AutoScrape to scroll down and capture the entire
414 |             web content. This can fail in certain circumstances, like
415 |             in API output mode and should be used with care.
416 | 
417 |         --save-graph
418 |             This option allows the scraper to build a directed graph
419 |             of the entire scrape and will save it to the "graph"
420 |             subdirectory under the output dir. The output file
421 |             is a timestamped networkx pickled graph.
422 | 
423 |         --disable-style-saving
424 |             By default, AutoScrape saves the stylesheets associated
425 |             with a scraped page. To save storage, you can disable this
426 |             functionality by using this option.
427 | 
428 | AutoScrape Web UI (Docker)
429 | --------------------------
430 | 
431 | AutoScrape can be ran as a containerized cluster environment, where
432 | scrapes can be triggered and stopped via API calls and data can be
433 | streamed to this server.
434 | 
435 | This requires the `autoscrape-www <https://github.com/brandonrobertz/autoscrape-www>`__ submodule to be pulled:
436 | 
437 | ::
438 | 
439 |     git submodule init
440 |     git submodule update
441 | 
442 | This will pull the browser-based UI into the `www/` folder.
443 | 
444 | You need
445 | `docker-ce <https://docs.docker.com/install/#server>`__ and
446 | `docker-compose <https://docs.docker.com/compose/install/>`__. Once you
447 | have these dependencies installed, simply run:
448 | 
449 | ::
450 | 
451 |     docker-compose build --pull
452 |     docker-compose up
453 | 
454 | This will build the containers and launch a API server running on local
455 | port 5000. More information about the API calls can be found in
456 | ``autoscrape-server.py``.
457 | 
458 | If you have make installed, you can simply run ``make start``.
459 | 


--------------------------------------------------------------------------------
/autoscrape-server.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import os
  3 | 
  4 | from flask import (
  5 |     Flask, request, jsonify, send_from_directory
  6 | )
  7 | from flask_sqlalchemy import SQLAlchemy
  8 | from sqlalchemy import create_engine
  9 | from sqlalchemy_utils import database_exists, create_database
 10 | 
 11 | import autoscrape.tasks as tasks
 12 | 
 13 | 
 14 | connect_str = 'postgresql://%s:%s@%s/autoscrape' % (
 15 |     os.environ["AUTOSCRAPE_DB_USER"],
 16 |     os.environ["AUTOSCRAPE_DB_PASSWORD"],
 17 |     os.environ["AUTOSCRAPE_DB_HOST"]
 18 | )
 19 | 
 20 | engine = create_engine(connect_str)
 21 | if not database_exists(engine.url):
 22 |     create_database(engine.url)
 23 | 
 24 | app = Flask("autoscrape-server", static_url_path="", static_folder="www/build")
 25 | app.config['SQLALCHEMY_DATABASE_URI'] = connect_str
 26 | db = SQLAlchemy(app)
 27 | 
 28 | 
 29 | class Data(db.Model):
 30 |     """
 31 |     Store our scrape data here, indexed by the scrape ID,
 32 |     timestamp and fileclass.
 33 |     """
 34 |     __tablename__ = "data"
 35 |     id = db.Column(db.Integer, primary_key=True)
 36 |     timestamp = db.Column(
 37 |         db.DateTime,
 38 |         default=db.func.current_timestamp(),
 39 |         nullable=False
 40 |     )
 41 |     task_id = db.Column(db.String, nullable=False)
 42 |     name = db.Column(db.String, nullable=False)
 43 |     fileclass = db.Column(db.String, nullable=False)
 44 |     data = db.Column(db.String, nullable=False)
 45 |     url = db.Column(db.String, nullable=False)
 46 | 
 47 |     db.UniqueConstraint('task_id', 'name', name='unique_name_per_task_1')
 48 | 
 49 |     def __init__(self, task_id, name, fileclass, data, url):
 50 |         self.task_id = task_id
 51 |         self.name = name
 52 |         self.fileclass = fileclass
 53 |         self.data = data
 54 |         self.url = url
 55 | 
 56 |     def __repr__(self):
 57 |         return '<Data %r, %r>' % (self.name, self.fileclass)
 58 | 
 59 |     @property
 60 |     def serialize(self):
 61 |         return {
 62 |             "id": self.id,
 63 |             "timestamp": self.timestamp.isoformat(),
 64 |             "name": self.name,
 65 |             "fileclass": self.fileclass,
 66 |             "url": self.url,
 67 |         }
 68 | 
 69 | 
 70 | @app.route("/", methods=["GET"])
 71 | @app.route("/scrape", methods=["GET"])
 72 | @app.route("/scrape/<id>", methods=["GET"])
 73 | @app.route("/build-extractor", methods=["GET"])
 74 | @app.route("/download-data", methods=["GET"])
 75 | @app.route("/help", methods=["GET"])
 76 | def get_root(id=None):
 77 |     return send_from_directory("www/build", "index.html")
 78 | 
 79 | 
 80 | @app.route("/<path:path>", methods=["GET"])
 81 | def get_path(path):
 82 |     mimetypes = {
 83 |         '.wasm': 'application/wasm',
 84 |     }
 85 |     mimetype = mimetypes.get(path[-4:], None)
 86 |     return send_from_directory("www/build", path, mimetype=mimetype)
 87 | 
 88 | 
 89 | @app.after_request
 90 | def disable_cors(response):
 91 |     response.headers['Access-Control-Allow-Origin'] = '*'
 92 |     response.headers['Access-Control-Allow-Headers'] = '*'
 93 |     return response
 94 | 
 95 | 
 96 | @app.route("/start", methods=["POST"])
 97 | def post_start():
 98 |     """
 99 |     This is the main endpoint for starting AutoScrape processes. This
100 |     endpoint simply accepts the standard parameters as a JSON payload.
101 |     Returns a status message and the scrape task ID, which can be used
102 |     to query status or stop the scrape.
103 | 
104 |     Curl Example:
105 |         curl http://localhost:5000/start -H 'content-type: application/json' \
106 |             --data '{"baseurl": "https://bxroberts.org",}'
107 | 
108 |     Success Returns:
109 |         HTTP 200 OK
110 |         {"status": "OK", "data": "SCRAPE-ID"}
111 |     """
112 |     app.logger.debug("Starting AutoScrape job")
113 |     args = request.get_json()
114 |     app.logger.debug("Arguments: %s" % args)
115 |     baseurl = args.pop("baseurl")
116 |     # disables double logging in celery worker
117 |     args["stdout"] = False
118 |     args["loglevel"] = "DEBUG"
119 |     args["output"] = os.environ.get(
120 |         "AUTOSCRAPE_API_URL",
121 |         "http://flask:5000/receive"
122 |     )
123 |     app.logger.debug("Baseurl: %s" % baseurl)
124 |     result = tasks.start.apply_async((baseurl, args))
125 |     app.logger.debug("Result: %s" % result)
126 |     return jsonify({"status": "OK", "data": result.id})
127 | 
128 | 
129 | @app.route("/status/<id>", methods=["GET"])
130 | def get_status(id):
131 |     """
132 |     Get status about a running AutoScrape task specified by
133 |     its task ID.
134 | 
135 |     HTTP GET /status/SCRAPE-ID
136 | 
137 |     Success Returns:
138 |         HTTP 200 OK
139 |         {"status": "OK", "message": "STARTED", "traceback": None}
140 |     """
141 |     result = tasks.app.AsyncResult(id)
142 |     data = Data.query.filter_by(
143 |         task_id=id,
144 |         fileclass="screenshot"
145 |     ).order_by(
146 |         Data.timestamp.desc()
147 |     ).first()
148 |     app.logger.debug("Task state: %s" % result.state)
149 |     response = {
150 |         "status": "OK",
151 |         "message": result.state,
152 |     }
153 |     if result.traceback:
154 |         response["traceback"] = result.traceback
155 |     if data:
156 |         app.logger.debug("Data: %s" % data)
157 |         response["data"] = data.data
158 |         response["url"] = data.url
159 |     return jsonify(response)
160 | 
161 | 
162 | @app.route("/stop/<id>", methods=["POST"])
163 | def get_stop(id):
164 |     """
165 |     Stop a running AutoScrape task specified by a task ID.
166 | 
167 |     HTTP POST /stop/SCRAPE-ID
168 |         [no data required]
169 | 
170 |     Success Returns:
171 |         HTTP 200 OK
172 |         {"status": "OK"}
173 |     """
174 |     app.logger.debug("Stopping scraper task: %s" % id)
175 |     result = tasks.app.AsyncResult(id)
176 |     result.revoke(terminate=True, signal='SIGKILL')
177 |     return jsonify({"status": "OK"})
178 | 
179 | 
180 | @app.route("/receive/<id>", methods=["POST"])
181 | def receive_data(id):
182 |     """
183 |     This is a callback endpoint for receiving scrape data from
184 |     a running AutoScrape instance, configured to send its data
185 |     to this endpoint.
186 | 
187 |     HTTP POST /receive
188 |         {
189 |             "name": "crawl_data/some_file_name.html",
190 |             "data": "base64-encoded-file-data",
191 |             "fileclass": "crawl_data|screenshots|downloads|..."
192 |         }
193 |     """
194 |     app.logger.debug("Task ID : %s" % id)
195 |     args = request.get_json()
196 |     name = args["name"]
197 |     app.logger.debug("Name: %s" % name)
198 |     fileclass = args["fileclass"]
199 |     app.logger.debug("File class: %s" % (fileclass))
200 |     url = args["url"]
201 |     app.logger.debug("URL: %s" % (url))
202 | 
203 |     try:
204 |         data = args["data"]
205 |         app.logger.debug("Data: %s" % len(data))
206 |         # app.logger.debug("Decoded: %s" % decoded)
207 |     except Exception as e:
208 |         app.logger.debug("Error parsing POST JSON: %s" % e)
209 |         data = None
210 |         fileclass = None
211 | 
212 |     # TODO: write b64 data to postgres under task ID key
213 |     scraped_data = Data(id, name, fileclass, data, url)
214 |     db.session.add(scraped_data)
215 |     db.session.commit()
216 |     app.logger.debug("Updated task state")
217 | 
218 |     # TODO: store/dispatch this data somewhere
219 |     return jsonify({"status": "OK"})
220 | 
221 | 
222 | @app.route("/files/list/<id>", methods=["GET"])
223 | def list_files(id):
224 |     """
225 |     Get a directory listing for a scrape's data, with
226 |     an optional fileclass query param (only look at downloads,
227 |     crawl_data, data_files, etc). Defaults to *all* data
228 |     scraped, ordered by date.
229 |     """
230 |     filter_params = {
231 |         "task_id": id,
232 |     }
233 | 
234 |     fileclass = request.args.get("fileclass")
235 |     if fileclass:
236 |         filter_params["fileclass"] = fileclass
237 | 
238 |     page = int(request.args.get("page", 1))
239 |     pagination = Data.query.filter_by(
240 |         **filter_params
241 |     ).order_by(
242 |         Data.timestamp.desc()
243 |     ).paginate(page=page, error_out=False)
244 | 
245 |     return jsonify({
246 |         "status": "OK",
247 |         "has_next": pagination.has_next,
248 |         "has_prev": pagination.has_prev,
249 |         "page": pagination.page,
250 |         "data": [d.serialize for d in pagination.items]
251 |     })
252 | 
253 | 
254 | @app.route("/files/data/<task_id>/<file_id>", methods=["GET"])
255 | def get_file_data(task_id, file_id):
256 |     """
257 |     Get the raw data for an individual file.
258 |     """
259 |     app.logger.debug("Fetching task_id: %s, file_id: %s" % (
260 |         task_id, file_id))
261 | 
262 |     data = Data.query.filter_by(
263 |         task_id=task_id,
264 |         id=file_id
265 |     ).order_by(
266 |         Data.timestamp.desc()
267 |     ).first()
268 | 
269 |     app.logger.debug("Data: %s" % data)
270 | 
271 |     return jsonify({
272 |         "status": "OK",
273 |         "data": {
274 |             "scrape_id": task_id,
275 |             "id": file_id,
276 |             "name": data.name,
277 |             "timestamp": data.timestamp,
278 |             "data": data.data,
279 |             "fileclass": data.fileclass,
280 |             "url": data.url,
281 |         }
282 |     })
283 | 
284 | 
285 | if __name__ == "__main__":
286 |     db.create_all()
287 |     app.run(host='0.0.0.0', port=5000)
288 | 


--------------------------------------------------------------------------------
/autoscrape.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 | from autoscrape.cli import scrape
4 | 
5 | 
6 | if __name__ == "__main__":
7 |     scrape.main()
8 | 


--------------------------------------------------------------------------------
/autoscrape/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | # flake8: noqa: F401
 3 | __title__ = 'autoscrape-py'
 4 | __author__ = 'Brandon Roberts (brandon@bxroberts.org)'
 5 | __license__ = 'AGPLv3'
 6 | __version__ = '1.6.14'
 7 | 
 8 | 
 9 | from autoscrape.scrapers.test import TestScraper
10 | from autoscrape.scrapers.null import NullScraper
11 | from autoscrape.scrapers.manual import ManualControlScraper
12 | 
13 | 
14 | """
15 | COMMAND        Logical Control Flow Step
16 | --------       ---------------------------------------------------------------
17 | INIT (url)               initialize & get entry point
18 |                                      │
19 |                                      ↓
20 |                                  load page    🠤───────────────────┐
21 |                                      │                            │
22 | GET_CLICKABLE                        │        click a link based on likelihood
23 | SELECT_LINK (index)                  │               of finding a search form
24 |                                      ↓                            │
25 | GET_FORMS    ┌────🠦 look for search form (possibly classifier) ───┘
26 |              │                       │
27 |              │                       │ FOUND
28 |              │                       ↓
29 | GET_INPUTS   │         identify forms on page that require input
30 |              │     (begin with config then move to heuristic then ML)
31 |              │                       │
32 |              │                       ↓
33 |              │      initialize iterators for required inputs
34 |              │      (begin with config/brute force, then RL)
35 |              │                       │
36 |              │                       ↓
37 |              └─────── are we at the end of our iterators?
38 |                 YES                  │
39 |                                      ↓
40 | INPUT (index, chars)     enter data into form inputs 🠤───────┐
41 |                                      │                       │
42 |                                      ↓                       │
43 | SUBMIT (index)          submit form and load next page       │
44 |                                      │                       │
45 |                                      ↓                       │
46 |                      ┌──────🠦 scrape the page                │
47 |                      │               │                       │
48 |                      │               ↓                       │
49 | GET_LINKS            │     look for a next button ───────────┘
50 |                      │         (classifier)        NOT FOUND
51 |                      │               │
52 |                      │               │ YES
53 |                      │               ↓
54 | SELECT_LINK (index)  └─── click the next button & load page
55 | """
56 | 


--------------------------------------------------------------------------------
/autoscrape/backends/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/backends/__init__.py


--------------------------------------------------------------------------------
/autoscrape/backends/base/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/backends/base/__init__.py


--------------------------------------------------------------------------------
/autoscrape/backends/base/browser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from autoscrape.backends.base.tags import TaggerBase
 3 | 
 4 | 
 5 | class BrowserBase(TaggerBase):
 6 |     def _no_tags(self, data, l_type="path"):
 7 |         clean = []
 8 |         if type(data) == tuple:
 9 |             data = list(data)
10 |         for p in data:
11 |             name, t_args, kwargs = p
12 |             args = list(t_args)
13 |             if name == "click":
14 |                 if not args:
15 |                     continue
16 |                 args[0] = "[tag]"
17 |             clean.append((name, args, kwargs))
18 |         return clean
19 | 
20 |     def click(self, tag, **kwargs):
21 |         self.path.append((
22 |             "click", [tag], {"url": url}
23 |         ))
24 |         node = "Click\n text: %s\n hash: %s" % (text, hash)
25 |         node_meta = {
26 |             "click": tag,
27 |             "click_text": text,
28 |             "click_iterating_form": None,
29 |         }
30 |         self.graph.add_node(
31 |             node,
32 |             **node_meta
33 |         )
34 |         self.graph.move_to_node(node)
35 | 
36 |     def fetch(self, url, initial=False):
37 |         self.graph.add_root_node(node, url=url, action="fetch")
38 | 
39 |     def back(self):
40 |         self.graph.move_to_parent()
41 | 
42 |     def input(self, tag, input):
43 |         self.path.append(("input", ("", input,), {}))
44 |         action = {
45 |             "action": "input",
46 |             "text": input,
47 |             "tag": tag,
48 |         }
49 |         self.graph.add_action_to_current(action)
50 | 
51 |     def submit(self, tag, add_node=True):
52 |         self.path.append(("submit", (tag,), {}))
53 |         node = "Submit\n tag: %s" % (tag)
54 |         node_meta = {
55 |             "submit": tag,
56 |         }
57 |         self.graph.add_node(node, **node_meta)
58 |         self.graph.move_to_node(node)
59 | 
60 |     @property
61 |     def infinite_loop_detected(self):
62 |         return False
63 | 


--------------------------------------------------------------------------------
/autoscrape/backends/base/dom.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | import logging
  3 | import os
  4 | import re
  5 | import urllib
  6 | 
  7 | from autoscrape.util import write_file, get_filename_from_url
  8 | 
  9 | 
 10 | logger = logging.getLogger('AUTOSCRAPE')
 11 | 
 12 | 
 13 | class DomBase:
 14 |     """
 15 |     Stateful base of a web scraper. This class deals with finding and interacting
 16 |     with elements and tags. It also holds the base state variables like
 17 |     current url.
 18 |     """
 19 | 
 20 |     def __init__(self, leave_host=False, current_url=None, current_html=None):
 21 |         self.leave_host = leave_host
 22 |         self.current_url = current_url
 23 |         self.current_html = current_html
 24 | 
 25 |     def elements_by_path(self, path, from_element=None):
 26 |         """
 27 |         Return element nodes matching a path where path could be xpath,
 28 |         css, etc, depending on the backend)
 29 |         """
 30 |         raise NotImplementedError("DomBase.elements_by_path not implemented")
 31 | 
 32 |     def element_attr(self, element, name, default=None):
 33 |         """
 34 |         For a given element and attribute name, return the value if it
 35 |         exists.
 36 |         """
 37 |         raise NotImplementedError("DomBase.element_attr not implemented")
 38 | 
 39 |     def element_by_tag(self, tag):
 40 |         """
 41 |         For a given tag, return the specified element.
 42 |         """
 43 |         raise NotImplementedError("DomBase.element_by_tag not implemented")
 44 | 
 45 |     def get_stylesheet(self):
 46 |         """
 47 |         Return the text of all loaded CSS stylesheets.
 48 |         """
 49 |         raise NotImplementedError("DomBase.get_stylesheet not implemented")
 50 | 
 51 |     def element_tag_name(self):
 52 |         """
 53 |         Return the tag name of the given element.
 54 |         """
 55 |         raise NotImplementedError("DomBase.element_tag_name not implemented")
 56 | 
 57 |     def element_text(self, element, block=False):
 58 |         """
 59 |         Return the text of an element, or the combined text of all its
 60 |         descendants (if block=True).
 61 |         """
 62 |         raise NotImplementedError("DomBase.element_text not implemented")
 63 | 
 64 |     def element_value(self, element):
 65 |         """
 66 |         Return the text value of an element (e.g., input element). Since
 67 |         this is usually called like element.value() or element.value, we
 68 |         wrap this functionality here.
 69 |         """
 70 |         if not hasattr(element, "value"):
 71 |             raise NotImplementedError("DomBase.element_value not implemented")
 72 |         return element.value
 73 | 
 74 |     def element_name(self, element):
 75 |         """
 76 |         Return the name of an element (e.g., input element).        """
 77 |         if not hasattr(element, "name"):
 78 |             raise NotImplementedError("DomBase.element_name not implemented")
 79 |         return element.name
 80 | 
 81 |     def element_displayed(self, element):
 82 |         """
 83 |         Figure out if an element is displayed. If its not, we can't really
 84 |         interact with it (get its text or attribs, etc). This only effects
 85 |         the selenium backend right now.
 86 |         """
 87 |         return True
 88 | 
 89 |     def download_file(self, url, return_data=False):
 90 |         """
 91 |         Fetch the given url, returning a byte stream of the page data. This
 92 |         really is only useful in situations where the scraper is on a binary
 93 |         filetype, such as PDF, etc.
 94 | 
 95 |         Note that we're doing this as opposed to some XHR thing inside the
 96 |         selenium driver due to CORS issues.
 97 |         """
 98 |         logger.debug("Fetching non-HTML page directly: %s" % url)
 99 |         user_agent = (
100 |             "Mozilla/5.0 "
101 |             "(Windows NT 10.0; Win64; x64; rv:62.0) "
102 |             "Gecko/20100101 Firefox/62.0"
103 |         )
104 |         request = urllib.request.Request(url, headers={
105 |             "User-Agent": user_agent,
106 |             "Referrer": self.page_url,
107 |         })
108 | 
109 |         try:
110 |             response = urllib.request.urlopen(request)
111 |         except urllib.error.HTTPError as e:
112 |             logger.debug("[!] HTTP error while downloading: %s" % (e))
113 |             return
114 | 
115 |         data = response.read()
116 |         action = {
117 |             "action": "download_file",
118 |             "url": url,
119 |         }
120 |         self.graph.add_action_to_current(action)
121 |         if return_data:
122 |             return data
123 | 
124 |         # always keep filename for downloads, for now
125 |         if re.match("^https?://", self.output):
126 |             dl_dir = "downloads"
127 |         else:
128 |             dl_dir = os.path.join(self.output, "downloads")
129 | 
130 |         parsed_filename = get_filename_from_url(url)
131 |         logger.debug("Parsed output filename: %s" % parsed_filename)
132 |         filepath = os.path.join(dl_dir, parsed_filename)
133 |         write_file(
134 |             filepath, data, fileclass="download", writetype="wb",
135 |             output=self.output, url=self.page_url,
136 |         )
137 | 
138 |     def _no_tags(self, list, l_type="path"):
139 |         clean = []
140 |         for p in list:
141 |             name, args, kwargs = p
142 |             if name == "click":
143 |                 args[0] = "tag"
144 |             clean.append((name, args, kwargs))
145 |         return clean
146 | 


--------------------------------------------------------------------------------
/autoscrape/backends/base/graph.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/backends/base/graph.py


--------------------------------------------------------------------------------
/autoscrape/backends/base/tags.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import logging
  3 | 
  4 | from urllib.parse import urlparse
  5 | 
  6 | from autoscrape.backends.base.dom import DomBase
  7 | 
  8 | 
  9 | logger = logging.getLogger('AUTOSCRAPE')
 10 | 
 11 | 
 12 | class TaggerBase(DomBase):
 13 |     """
 14 |     Generates tags from a given page that can be used, in a stateless manner,
 15 |     to refer to unique elements on a web page.
 16 |     """
 17 | 
 18 |     def __init__(self, **kwargs):
 19 |         super().__init__(**kwargs)
 20 |         self.base_host = urlparse(self.current_url).netloc
 21 | 
 22 |     def tag_from_element(self, element):
 23 |         """
 24 |         For a given element, return a path (a.k.a. a "tag") leading
 25 |         to it. Path can be CSS, XPath, or whatever the backend supports.
 26 |         """
 27 |         raise NotImplementedError("Tagger.tag_from_element not implemented")
 28 | 
 29 |     def clickable_sanity_check(self, element, href=None):
 30 |         """
 31 |         Check a series of element nodes, checking their attributes and other
 32 |         attributes, determining if a element is actually 'clickable'. This
 33 |         check determines which nodes will end up as clickable options for
 34 |         the scraper on this page.
 35 |         """
 36 |         if not href:
 37 |             raw_href = self.element_attr(element, "href")
 38 |             if not raw_href:
 39 |                 return False
 40 | 
 41 |             if hasattr(self, "_normalize_url"):
 42 |                 href = self._normalize_url(raw_href).split("#")[0]
 43 |             else:
 44 |                 href = raw_href
 45 | 
 46 |         if href.split("#")[0] == self.current_url:
 47 |             return False
 48 | 
 49 |         # skip any weird protos ... we whitelist notrmal HTTP,
 50 |         # anchor tags and blank tags (to support JavaScript & btns)
 51 |         if href and \
 52 |            not href.startswith("//") and \
 53 |            not href.startswith("https:") and \
 54 |            not href.startswith("http:") and \
 55 |            not href.startswith("javascript"):
 56 |             return False
 57 | 
 58 |         # Don't leave base host ... configurable?
 59 |         elem_host = urlparse(href).netloc
 60 |         if elem_host and not self.leave_host and elem_host != self.base_host:
 61 |             return False
 62 | 
 63 |         return True
 64 | 
 65 |     def get_clickable(self, path="//a"):
 66 |         """
 67 |         Get all clickable element tags on the current page.
 68 |         """
 69 |         tags = []
 70 |         a_elems = self.elements_by_path(path)
 71 |         for element in a_elems:
 72 |             if not self.clickable_sanity_check(element):
 73 |                 continue
 74 | 
 75 |             tag = self.tag_from_element(element)
 76 |             # No way to get back to here, so we can't use it
 77 |             if not tag:
 78 |                 logger.warn("No tag for element %s" % (element))
 79 |                 return False
 80 | 
 81 |             tags.append(tag)
 82 |         return tags
 83 | 
 84 |     def get_inputs(self, form=None, itype=None, root_node=None):
 85 |         """
 86 |         Get inputs, either for full page or by a form WebElement.
 87 |         Returns a list of tags. itype can be one of "text", "select",
 88 |         "checkbox", or None (all types), indicating the type of input.
 89 |         """
 90 |         x_path = "//input"
 91 |         if itype == "select":
 92 |             x_path = "//select"
 93 |         elif itype == "date":
 94 |             x_path = "//input[@type='date']|//input[@structure='date']"
 95 |         elif itype:
 96 |             x_path = "//input[@type='%s']" % (itype)
 97 | 
 98 |         elem = root_node
 99 |         tags = []
100 |         if form is not None:
101 |             elem = form
102 |             x_path = ".%s" % x_path
103 | 
104 |         # radio checkboxes are grouped by name
105 |         if itype == "radio":
106 |             # store the radio group position by name here
107 |             radio_names = []
108 |             elems = self.elements_by_path(x_path, from_element=elem)
109 |             for radio in elems:
110 |                 name = self.element_attr(radio, "name")
111 |                 if name not in radio_names:
112 |                     radio_names.append(name)
113 |                     tags.append([])
114 |                 radio_group_ix = radio_names.index(name)
115 |                 radio_tag = self.tag_from_element(radio)
116 |                 if not radio_tag:
117 |                     logger.warn("No tag for radio %s" % radio)
118 |                     continue
119 |                 tags[radio_group_ix].append(radio_tag)
120 | 
121 |         # all the rest are flat
122 |         else:
123 |             elems = self.elements_by_path(x_path, from_element=elem)
124 |             for inp in elems:
125 |                 # this should weed out hidden/non-displayed inputs ...
126 |                 if not self.element_displayed(inp):
127 |                     continue
128 |                 # .. and just in case it didn't
129 |                 if self.element_attr(inp, "type") == "hidden":
130 |                     continue
131 |                 input_tag = self.tag_from_element(inp)
132 |                 if not input_tag:
133 |                     logger.warn("No tag for input %s" % input)
134 |                     continue
135 | 
136 |                 tags.append(input_tag)
137 | 
138 |         return tags
139 | 
140 |     def get_forms(self):
141 |         """
142 |         Get all tags to forms on a page and their respective text
143 |         inputs. Tags are returned in a dict, with the form tag as
144 |         the key and a list of input CSSPaths under the form.
145 |         """
146 |         x_path = "//form"
147 |         forms = self.elements_by_path(x_path)
148 | 
149 |         tags = {}
150 |         for elem in forms:
151 |             if not self.element_displayed(elem):
152 |                 continue
153 |             tag = self.tag_from_element(elem)
154 |             if not tag:
155 |                 logger.warn("No tag for element %s" % elem)
156 |                 continue
157 | 
158 |             tags[tag] = [
159 |                 self.get_inputs(form=elem, itype="text"),
160 |                 self.get_inputs(form=elem, itype="select"),
161 |                 self.get_inputs(form=elem, itype="checkbox"),
162 |                 self.get_inputs(form=elem, itype="date"),
163 |                 self.get_inputs(form=elem, itype="radio"),
164 |             ]
165 | 
166 |         return tags
167 | 
168 |     def get_buttons(self, in_form=False, path=None):
169 |         """
170 |         Return all tags leading to a form link, button, or submit input button,
171 |         optionally given a base form to look from. This is used to identify
172 |         clickable things related to forms.
173 |         """
174 |         x_path = path or "|".join([
175 |             "//form//a", "//button", "//input[@type='button']",
176 |             "//input[@type='submit']", "//table//a",
177 |         ])
178 |         btns = self.elements_by_path(x_path)
179 | 
180 |         tags = []
181 |         for elem in btns:
182 |             if not self.element_displayed(elem):
183 |                 continue
184 |             tag = self.tag_from_element(elem)
185 |             if not tag:
186 |                 logger.warn("No tag for element %s" % elem)
187 |                 continue
188 |             tags.append(tag)
189 |         return tags
190 | 


--------------------------------------------------------------------------------
/autoscrape/backends/requests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/backends/requests/__init__.py


--------------------------------------------------------------------------------
/autoscrape/backends/requests/browser.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | import logging
  3 | import re
  4 | import time
  5 | 
  6 | import requests
  7 | from autoscrape.backends.base.browser import BrowserBase
  8 | from autoscrape.backends.requests.tags import Tagger
  9 | from autoscrape.search.graph import Graph
 10 | 
 11 | 
 12 | logger = logging.getLogger('AUTOSCRAPE')
 13 | 
 14 | 
 15 | class RequestsBrowser(BrowserBase, Tagger):
 16 |     """
 17 |     A simple HTTP-requests based scraper, currently capable of only
 18 |     doing crawls and very basic HTTP Post requests, but is between
 19 |     2x and 5x faster.
 20 | 
 21 |         bxroberts.org full crawl - requests backend
 22 |         real    0m34.066s
 23 |         user    0m23.062s
 24 |         sys     0m0.640s
 25 | 
 26 |         bxroberts.org full crawl - selenium backend
 27 |         real    2m5.373s
 28 |         user    1m0.872s
 29 |         sys     0m10.976s
 30 |     """
 31 | 
 32 |     def __init__(self, leave_host=False, **kwargs):
 33 |         # requests Session
 34 |         self.s = requests.Session()
 35 |         self.s.headers.update({
 36 |             "User-Agent": (
 37 |                 "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0)"
 38 |                 "Gecko/20100101 Firefox/35.0"
 39 |             )
 40 |         })
 41 | 
 42 |         # set of clicked elements
 43 |         self.visited = set()
 44 | 
 45 |         # queue of the path that led us to the current page
 46 |         # this is in the form of (command, *args, **kwargs)
 47 |         self.path = []
 48 | 
 49 |         # tree building
 50 |         self.graph = Graph()
 51 | 
 52 |         # setting to False, ensures crawl will stay on same host
 53 |         self.leave_host = leave_host
 54 | 
 55 |         self.current_url = None
 56 |         self.current_html = None
 57 | 
 58 |     def _no_endslash_url(self, url):
 59 |         # remove trailing slash
 60 |         url_noslash = re.sub(r"/$", "", url)
 61 |         # and trailing slash before query params
 62 |         return re.sub(r"/&", "&", url_noslash)
 63 | 
 64 |     def _check_and_set_visited(self, url):
 65 |         """
 66 |         Take a URL and see if we've visited it (or a slash-ending
 67 |         variation) if we haven't this sets the URL to the
 68 |         visited set. Returns True if we've visited False if not.
 69 |         """
 70 |         url_no_endslash = self._no_endslash_url(url)
 71 |         if url_no_endslash in self.visited or url in self.visited:
 72 |             return True
 73 |         self.visited.add(url)
 74 |         self.visited.add(url_no_endslash)
 75 |         return False
 76 | 
 77 |     def click(self, tag, **kwargs):
 78 |         element = self.element_by_tag(tag)
 79 |         text = self.element_text(element)
 80 |         url = None
 81 |         tag_name = self.element_tag_name(element)
 82 |         logger.debug(" - tag name: %s Text: %s" % (tag_name, text))
 83 |         if tag_name == "a":
 84 |             raw_href = self.element_attr(element, "href")
 85 |             if not raw_href:
 86 |                 logger.debug("[!] No HREF, skipping link...")
 87 |                 return False
 88 | 
 89 |             url = self._normalize_url(raw_href)
 90 |             if self._check_and_set_visited(url):
 91 |                 logger.debug("[!] Already visited URL %s" % (url))
 92 |                 return False
 93 | 
 94 |             logger.info("[+] Clicking link: %s" % url)
 95 |             if not self.fetch(url):
 96 |                 logger.debug("[!] Fetch failed on %s" % (url))
 97 |                 return False
 98 |         elif tag_name == "input":
 99 |             element_type = element.type
100 |             if element_type == "submit":
101 |                 parent_form = element.xpath(".//ancestor::form")[0]
102 |                 parent_form_tag = self.tag_from_element(parent_form)
103 |                 self.submit(parent_form_tag, add_node=False)
104 |                 url = self.current_url
105 |                 if self._check_and_set_visited(url):
106 |                     return False
107 | 
108 |         elif tag_name == "iframe":
109 |             raw_href = self.element_attr(element, "src")
110 |             if not raw_href:
111 |                 return False
112 | 
113 |             url = self._normalize_url(raw_href)
114 |             if self._check_and_set_visited(url):
115 |                 return False
116 | 
117 |             logger.info("[+] Fetching iframe: %s" % url)
118 |             if not self.fetch(url):
119 |                 return False
120 |         else:
121 |             raise NotImplementedError(
122 |                 "click not implemented for element: %s" % (tag_name)
123 |             )
124 | 
125 |         self.path.append((
126 |             "click", [tag], {"url": url}
127 |         ))
128 |         node = "Click\n text: %s\n URL: %s" % (text, url)
129 |         node_meta = {
130 |             "click": tag,
131 |             "click_text": text,
132 |             "click_iterating_form": None,
133 |         }
134 |         self.graph.add_node(
135 |             node,
136 |             **node_meta
137 |         )
138 |         self.graph.move_to_node(node)
139 |         return True
140 | 
141 |     def fetch(self, url, initial=False):
142 |         """
143 |         Fetch a page from a given URL (entry point, typically). Most of
144 |         the time we just want to click a link or submit a form using
145 |         webdriver.
146 |         """
147 |         logger.info("%s Fetching url=%s initial=%s" % (
148 |             ("[+]" if initial else " -"), url, initial,
149 |         ))
150 |         retries = 3
151 |         success = True
152 |         while True:
153 |             try:
154 |                 response = self.s.get(url)
155 |                 break
156 |             except requests.exceptions.ConnectionError as e:
157 |                 logger.error(" ! Connection error retrying...")
158 |                 logger.error(e)
159 |                 if not retries:
160 |                     logger.error(" ! Connection error, skipping URL...")
161 |                     return False
162 |                 time.sleep(30)
163 |             retries -= 1
164 | 
165 |         if not response.text:
166 |             logger.error(" ! Blank response. Skipping URL...")
167 |             return False
168 | 
169 |         # Requests' encoding detection is faulty. The following
170 |         # block will fix most issues
171 |         if response.encoding and "utf" not in response.encoding.lower():
172 |             response.encoding = response.apparent_encoding
173 |         self.current_html = response.text
174 |         # this check fixes improper decoding of UTF byte order mark
175 |         if self.current_html[:3] == "ï»¿":
176 |             self.current_html = self.current_html.encode(
177 |                 response.encoding
178 |             ).decode("utf-8-sig")
179 | 
180 |         self.current_url = response.url
181 |         self.dom = self._get_dom()
182 | 
183 |         if initial:
184 |             self.path.append(("fetch", [url], {"initial": initial}))
185 |             node = "Fetch\n url: %s" % url
186 |             self.graph.add_root_node(node, url=url, action="fetch")
187 | 
188 |         return True
189 | 
190 |     def back(self):
191 |         logger.info("[+] Going back...")
192 |         logger.debug(" - current path-length=%s path=%s" % (
193 |             len(self.path), self._no_tags(self.path),
194 |         ))
195 |         # We're now where we started from
196 |         self.path.pop()
197 |         if not self.path:
198 |             self.path = []
199 |             return
200 | 
201 |         prev = self.path[-1]
202 |         if prev[0] == "fetch":
203 |             self.graph.move_to_parent()
204 |             self.fetch(prev[1][0])
205 | 
206 |         elif prev[0] == "click":
207 |             self.graph.move_to_parent()
208 |             self.fetch(prev[2]["url"])
209 | 
210 |     @property
211 |     def page_html(self):
212 |         return self.current_html
213 | 
214 |     @property
215 |     def page_url(self):
216 |         return self.current_url
217 | 
218 |     def get_clickable(self, **kwargs):
219 |         logger.debug(" - Getting clickable...")
220 |         self.dom = self._get_dom()
221 |         tagger = Tagger(
222 |             current_html=self.current_html,
223 |             current_url=self.current_url,
224 |             leave_host=self.leave_host,
225 |         )
226 |         clickable = tagger.get_clickable()
227 |         return clickable
228 | 
229 |     # def download_file(self, url):
230 |     #     response = self.s.get(url)
231 |     #     action = {
232 |     #         "action": "download_page",
233 |     #         "url": url,
234 |     #     }
235 |     #     self.graph.add_action_to_current(action)
236 |     #     return response.text
237 | 
238 |     def input(self, tag, input):
239 |         """
240 |         Enter some input into an element by a given tag.
241 |         """
242 |         logger.info("[+] Injecting text \"%s\" to input" % (input))
243 |         elem = self.element_by_tag(tag)
244 | 
245 |         input_name = self.element_name(elem)
246 |         value = self.element_value(elem)
247 |         logger.debug("Input name=%s value=%s" % (input_name, value))
248 | 
249 |         elem.attrib["value"] = input
250 | 
251 |         self.path.append(("input", ("", input,), {}))
252 |         action = {
253 |             "action": "input",
254 |             "text": input,
255 |             "tag": tag,
256 |         }
257 |         self.graph.add_action_to_current(action)
258 | 
259 |     def submit(self, tag, add_node=True):
260 |         """
261 |         Submit a form, by extracting the method and url, then
262 |         constructing the params and sending the request, along
263 |         with the form data.
264 |         """
265 |         form = self.element_by_tag(tag)
266 |         inputs = self.elements_by_path("//input", from_element=form)
267 | 
268 |         data = {}
269 |         for input in inputs:
270 |             # TODO: handle non-text type inputs
271 |             name = self.element_name(input)
272 |             if not name:
273 |                 continue
274 |             value = self.element_value(input)
275 |             data[name] = value
276 | 
277 |         action = self.element_attr(form, "action", default=self.current_url)
278 |         method = self.element_attr(form, "method", default="get")
279 |         url = self._normalize_url(action)
280 | 
281 |         request_kwargs = {}
282 |         if method.lower() == "post":
283 |             request_kwargs["data"] = data
284 |         elif method.lower() == "get":
285 |             params = []
286 |             for key in data.keys():
287 |                 params.append([key, data.get(key, "")])
288 |             request_kwargs["params"] = params
289 | 
290 |         request = requests.Request(
291 |             method,
292 |             url,
293 |             **request_kwargs
294 |         )
295 |         prepped = request.prepare()
296 |         response = self.s.send(prepped)
297 |         self.current_url = response.url
298 |         self.current_html = response.text
299 |         self.dom = self._get_dom()
300 | 
301 |         # TODO: all higher level stuff
302 |         if add_node:
303 |             self.path.append(("submit", (tag,), {}))
304 |             node = "Submit\n tag: %s" % (tag)
305 |             node_meta = {
306 |                 "submit": tag,
307 |             }
308 |             self.graph.add_node(node, **node_meta)
309 |             self.graph.move_to_node(node)
310 | 
311 |     def get_screenshot(self):
312 |         return None
313 | 


--------------------------------------------------------------------------------
/autoscrape/backends/requests/dom.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | import lxml.html
  3 | # use threads so we can run autoscrape inside celery
  4 | from multiprocessing.pool import ThreadPool
  5 | 
  6 | import requests
  7 | from urllib.parse import urlparse, ParseResult
  8 | 
  9 | from autoscrape.backends.base.dom import DomBase
 10 | 
 11 | 
 12 | def download_stylesheet(css_url):
 13 |     response = requests.get(css_url)
 14 |     data = response.text
 15 |     if type(data) == bytes:
 16 |         return data.decode("utf-8")
 17 |     return data
 18 | 
 19 | 
 20 | class Dom(DomBase):
 21 |     def __init__(self, **kwargs):
 22 |         super().__init__(**kwargs)
 23 |         self.dom = self._get_dom()
 24 | 
 25 |     def _get_dom(self):
 26 |         dom = None
 27 |         try:
 28 |             dom = lxml.html.fromstring(self.current_html)
 29 |         # this handles trying to load XML, RSS feed, etc
 30 |         except ValueError as e:
 31 |             if "Please use bytes input" in str(e):
 32 |                 html_b = bytes(self.current_html, encoding="utf-8")
 33 |                 dom = lxml.html.fromstring(html_b)
 34 |             else:
 35 |                 raise e
 36 |         # if our page's HTML is just an element, like an
 37 |         # iframe, without a body or html then lxml will
 38 |         # return an element surrounded by a body and html.
 39 |         # so here we make element the root and use that as
 40 |         # our base DOM.
 41 |         while True:
 42 |             parent = dom.getparent()
 43 |             if parent is None:
 44 |                 break
 45 |             dom = parent
 46 |         return dom
 47 | 
 48 |     def element_attr(self, element, name, default=None):
 49 |         if not element.attrib:
 50 |             return default
 51 |         return element.attrib.get(name, default)
 52 | 
 53 |     def element_by_tag(self, tag):
 54 |         elements = self.dom.cssselect(tag)
 55 |         if not elements:
 56 |             return None
 57 |         return elements[0]
 58 | 
 59 |     def elements_by_path(self, xpath, from_element=None):
 60 |         if from_element is None:
 61 |             return self.dom.xpath(xpath)
 62 |         return from_element.xpath(xpath)
 63 | 
 64 |     def get_stylesheet(self, fetch_css=False):
 65 |         stylesheet_urls = []
 66 |         for link in self.dom.xpath("//link"):
 67 |             if not link.attrib:
 68 |                 continue
 69 |             l_type = link.attrib.get("type")
 70 |             l_rel = link.attrib.get("rel")
 71 |             l_href = link.attrib.get("href")
 72 |             if l_type != "text/css" and l_rel != "stylesheet":
 73 |                 continue
 74 |             css_url = self._normalize_url(l_href)
 75 |             stylesheet_urls.append(css_url)
 76 | 
 77 |         pool = None
 78 |         try:
 79 |             pool = ThreadPool(8)
 80 |         except OSError:
 81 |             # operating system/container doesn't support threading
 82 |             pass
 83 | 
 84 |         if pool is not None:
 85 |             results = pool.map(download_stylesheet, stylesheet_urls)
 86 |             pool.close()
 87 |         # fallback to single threaded in case of threading not permitted
 88 |         else:
 89 |             results = []
 90 |             for css_url in stylesheet_urls:
 91 |                 results.append(download_stylesheet(css_url))
 92 | 
 93 |         css = "\n".join(results)
 94 |         for style in self.dom.xpath("style"):
 95 |             css += style.text_content()
 96 |         return css
 97 | 
 98 |     def _normalize_url(self, url):
 99 |         argnames = ['scheme', 'netloc', 'path', 'params', 'query', 'fragment']
100 |         inheritable = ['scheme', 'netloc', 'path']
101 |         parsed_current_url = urlparse(self.current_url)
102 |         parsed_url = urlparse(url)
103 | 
104 |         args = []
105 |         for argname in argnames:
106 |             value = getattr(parsed_url, argname, None)
107 |             if not value and argname in inheritable:
108 |                 value = getattr(parsed_current_url, argname, '')
109 |             args.append(value)
110 | 
111 |         pr = ParseResult(*args)
112 |         normalized = pr.geturl()
113 |         return normalized
114 | 
115 |     def element_text(self, element, block=False):
116 |         if block and element is not None:
117 |             return element.text_content()
118 |         if element is None:
119 |             return ''
120 |         text = element.text
121 |         if not text:
122 |             return ''
123 |         return text
124 | 
125 |     def element_name(self, element):
126 |         return element.name
127 | 
128 |     def element_tag_name(self, element):
129 |         if element is None:
130 |             return ""
131 |         return element.tag
132 | 


--------------------------------------------------------------------------------
/autoscrape/backends/requests/tags.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import logging
 3 | 
 4 | from autoscrape.backends.base.tags import TaggerBase
 5 | from autoscrape.backends.requests.dom import Dom
 6 | 
 7 | 
 8 | logger = logging.getLogger('AUTOSCRAPE')
 9 | 
10 | 
11 | class Tagger(TaggerBase, Dom):
12 |     def tag_from_element(self, el):
13 |         path = []
14 |         while el is not None:
15 |             nth = 1
16 |             parent = el.getparent()
17 |             children = []
18 |             if parent is not None:
19 |                 children = parent.getchildren()
20 |             for child in children:
21 |                 if child == el:
22 |                     break
23 |                 if child.tag == el.tag:
24 |                     nth += 1
25 |             selector = "%s:nth-of-type(%s)" % (
26 |                 el.tag, nth
27 |             )
28 |             path.insert(0, selector)
29 |             el = parent
30 |         tag = " > ".join(path)
31 |         return tag
32 | 
33 |     def get_inputs(self, form=None, itype=None, root_node=None):
34 |         return super().get_inputs(form=form, itype=itype, root_node=self.dom)
35 | 
36 |     def get_buttons(self, in_form=False, path=None):
37 |         x_path = path or "|".join([
38 |             "//form//a", "//input[@type='submit']", "//table//a",
39 |         ])
40 |         return super().get_buttons(in_form=in_form, path=x_path)
41 | 
42 |     def get_clickable(self, path=None):
43 |         clickable = super().get_clickable(path="//a|//iframe")
44 |         return clickable
45 | 
46 |     def clickable_sanity_check(self, element):
47 |         raw_href = self.element_attr(element, "href")
48 | 
49 |         tag_name = self.element_tag_name(element)
50 |         if tag_name == "iframe":
51 |             raw_href = self.element_attr(element, "src")
52 | 
53 |         if not raw_href:
54 |             return False
55 | 
56 |         href = self._normalize_url(raw_href).split("#")[0]
57 |         if href.split("#")[0] == self.current_url:
58 |             return False
59 | 
60 |         # skip any weird protos ... we whitelist notrmal HTTP,
61 |         # anchor tags and blank tags (to support JavaScript & btns)
62 |         if href and href.startswith("javascript"):
63 |             return False
64 | 
65 |         return super().clickable_sanity_check(element, href=href)
66 | 


--------------------------------------------------------------------------------
/autoscrape/backends/selenium/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/backends/selenium/__init__.py


--------------------------------------------------------------------------------
/autoscrape/backends/selenium/dom.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import logging
  3 | import re
  4 | 
  5 | try:
  6 |     from selenium.common.exceptions import (
  7 |         NoSuchElementException, StaleElementReferenceException,
  8 |         NoSuchFrameException
  9 |     )
 10 | except ModuleNotFoundError:
 11 |     # we haven't installed selenium backend deps
 12 |     pass
 13 | 
 14 | from autoscrape.backends.base.dom import DomBase
 15 | 
 16 | 
 17 | logger = logging.getLogger('AUTOSCRAPE')
 18 | 
 19 | 
 20 | class FrameTransparentList(list):
 21 |     def __init__(self, *args, **kwargs):
 22 |         self.driver = kwargs.pop("driver")
 23 |         super().__init__(*args, **kwargs)
 24 | 
 25 |     def __getitem__(self, index):
 26 |         raw_item = super().__getitem__(index)
 27 |         if not isinstance(raw_item, list):
 28 |             self.driver.switch_to.default_content()
 29 |             return raw_item
 30 |         iframe_ix, element = raw_item
 31 |         self.driver.switch_to.frame(iframe_ix)
 32 |         return element
 33 | 
 34 |     def __iter__(self):
 35 |         for raw_item in list.__iter__(self):
 36 |             if not isinstance(raw_item, list):
 37 |                 self.driver.switch_to.default_content()
 38 |                 yield raw_item
 39 |                 continue
 40 |             iframe_ix, element = raw_item
 41 |             self.driver.switch_to.frame(iframe_ix)
 42 |             yield element
 43 |             self.driver.switch_to.default_content()
 44 | 
 45 | 
 46 | class Dom(DomBase):
 47 |     def element_attr(self, element, name, default=None):
 48 |         return element.get_attribute(name)
 49 | 
 50 |     def iframe_capable_lookup(self, tag):
 51 |         try:
 52 |             self.driver.switch_to.default_content()
 53 |             return self.driver.find_element_by_css_selector(tag)
 54 |         except NoSuchElementException:
 55 |             pass
 56 |         self.driver.switch_to.default_content()
 57 |         iframes = self.driver.find_elements_by_tag_name("iframe")
 58 |         for iframe_ix in range(len(iframes)):
 59 |             try:
 60 |                 self.driver.switch_to.frame(iframe_ix)
 61 |             except NoSuchFrameException:
 62 |                 continue
 63 |             try:
 64 |                 return self.driver.find_element_by_css_selector(tag)
 65 |             except NoSuchElementException:
 66 |                 continue
 67 |             self.driver.switch_to.default_content()
 68 |         logger.debug("[!] No element found for tag: %s" % (tag))
 69 |         return None
 70 | 
 71 |     def element_by_tag(self, tag):
 72 |         """
 73 |         Take a tag and return the corresponding live element in the DOM.
 74 |         """
 75 |         inside_id = False
 76 |         # escaping logic
 77 |         newtag = ""
 78 |         for c in tag:
 79 |             if c == "#":
 80 |                 inside_id = True
 81 |                 newtag += c
 82 |                 continue
 83 | 
 84 |             # end of ID
 85 |             elif inside_id and re.search("\s", c):
 86 |                 inside_id = False
 87 | 
 88 |             elif inside_id and c in self.css_escapables:
 89 |                 for escapable in self.css_escapables:
 90 |                     c = "\%s" % escapable
 91 | 
 92 |             newtag += c
 93 | 
 94 |         if newtag != tag:
 95 |             logger.debug("Original tag: %s, newtag: %s" % (tag, newtag))
 96 |             tag = newtag
 97 | 
 98 |         return self.iframe_capable_lookup(tag)
 99 | 
100 |     def elements_by_path(self, xpath, from_element=None):
101 |         """
102 |         Get all elements, across all iframes. We output a
103 |         FrameTransparentList which is a normal list of elements, but it will
104 |         switch to the correct frame upon accessing/iterating to
105 |         each element.
106 |         """
107 |         if from_element is None:
108 |             from_element = self.driver
109 |         return from_element.find_elements_by_xpath(xpath)
110 |         iframes = from_element.find_elements_by_tag_name("iframe")
111 |         if not len(iframes):
112 |             return from_element.find_elements_by_xpath(xpath)
113 |         # gather all elements from iframe
114 |         elements = from_element.find_elements_by_xpath(xpath)
115 |         for iframe_ix in range(len(iframes)):
116 |             self.driver.switch_to.frame(iframe_ix)
117 |             for el in self.driver.find_elements_by_xpath(xpath):
118 |                 elements.append([iframe_ix, el])
119 |             self.driver.switch_to.default_content()
120 |         self.driver.switch_to.default_content()
121 |         return FrameTransparentList(elements, driver=self.driver)
122 | 
123 |     def get_stylesheet(self):
124 |         script = """
125 |         return [].slice.call(document.styleSheets)
126 |           .reduce((prev, styleSheet) => {
127 |             try {
128 |               if (styleSheet.cssRules) {
129 |                 return prev +
130 |                   [].slice.call(styleSheet.cssRules)
131 |                     .reduce(function (prev, cssRule) {
132 |                       return prev + cssRule.cssText;
133 |                     }, '');
134 |               } else {
135 |                   return prev;
136 |               }
137 |             } catch (e) {
138 |               return prev + `@import url("${styleSheet.href}");`
139 |             }
140 |           }, '');"""
141 |         return self.driver.execute_script(script)
142 | 
143 |     def _text_via_many_means(self, el):
144 |         text = []
145 |         try:
146 |             txt = el.text
147 |             if txt:
148 |                 text.append(txt.strip())
149 |         except Exception as e:
150 |             logger.error("Error getting text element: %s, Err: %s" % (
151 |                 el, e))
152 | 
153 |         title = el.get_attribute("title")
154 |         if title:
155 |             text.append(title.strip())
156 | 
157 |         try:
158 |             placeholder = el.get_attribute("placeholder")
159 |             if placeholder:
160 |                 text.append(placeholder.strip())
161 |         except Exception as e:
162 |             logger.error("Error getting placeholder: %s, Error: %s" % (
163 |                 el, e))
164 | 
165 |         img_els = el.find_elements_by_tag_name("img")
166 |         for img in img_els:
167 |             try:
168 |                 text.append(img.get_attribute("alt"))
169 |             except StaleElementReferenceException as e:
170 |                 logger.error("Error getting image text: %s, Error: %s" % (
171 |                     img, e
172 |                 ))
173 | 
174 |         if self.element_tag_name(el) == "input":
175 |             text.append(el.get_attribute("value"))
176 | 
177 |         return " ".join(text).replace("\n", "").strip()
178 | 
179 |     def element_text(self, element, block=False):
180 |         """
181 |         Get the text for all elements either under a given element
182 |         or for a whole page (if element == None)
183 |         """
184 |         if not block and element is not None:
185 |             return self._text_via_many_means(element)
186 | 
187 |         if element is None:
188 |             element = self.driver
189 | 
190 |         return element.text
191 | 
192 |     def element_tag_name(self, element):
193 |         if element is None:
194 |             return ""
195 |         return element.tag_name
196 | 
197 |     def element_value(self, element):
198 |         return element.get_attribute("value")
199 | 
200 |     def element_displayed(self, element):
201 |         fn_names = ["is_displayed", "is_enabled"]
202 |         for fn_name in fn_names:
203 |             if not hasattr(element, fn_name):
204 |                 continue
205 |             try:
206 |                 if not getattr(element, fn_name)():
207 |                     return False
208 |             except StaleElementReferenceException as e:
209 |                 pass
210 |         return True
211 | 


--------------------------------------------------------------------------------
/autoscrape/backends/selenium/tags.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import logging
  3 | 
  4 | from autoscrape.backends.base.tags import TaggerBase
  5 | from autoscrape.backends.selenium.dom import Dom
  6 | 
  7 | 
  8 | logger = logging.getLogger('AUTOSCRAPE')
  9 | 
 10 | 
 11 | class Tagger(TaggerBase, Dom):
 12 |     """
 13 |     Generates tags from a given page that can be used, in a stateless manner,
 14 |     to refer to unique elements on a web page.
 15 |     """
 16 | 
 17 |     def __init__(self, driver=None, current_url=None, leave_host=False):
 18 |         super().__init__(current_url=current_url, leave_host=leave_host)
 19 |         self.driver = driver
 20 | 
 21 |     def tag_from_element(self, element):
 22 |         """
 23 |         Takes a WebDriver element and returns an CSSPath for finding it
 24 |         in the future. As far as I know, this is only really feasible
 25 |         using JavaScript (without resorting to a complicated tree walking
 26 |         algorithm ... which we may need to do if this ends up failing).
 27 | 
 28 |         Modified from: https://stackoverflow.com/a/12222317
 29 |         """
 30 |         script = """
 31 |             var getPathTo = function(el) {
 32 |                 if (!(el instanceof Element))
 33 |                     return;
 34 |                 var path = [];
 35 |                 while (el.nodeType === Node.ELEMENT_NODE) {
 36 |                     // if (el.id) {
 37 |                     //   path.unshift(`#${el.id}`);
 38 |                     //   break;
 39 |                     // }
 40 |                     var selector = el.nodeName.toLowerCase();
 41 |                     // // NOTE: we removed this because web pages often use
 42 |                     // // strange characters in ID names which cause the CSS
 43 |                     // // selector to fail upon lookup. If we only use traversal
 44 |                     // // methods, we don't have that webpage-specific problem
 45 |                     // if (el.id) {
 46 |                     //     selector += '#' + el.id;
 47 |                     //     path.unshift(selector);
 48 |                     //     break;
 49 |                     // }
 50 | 
 51 |                     var sib = el, nth = 1;
 52 |                     while (sib = sib.previousElementSibling) {
 53 |                         if (sib.nodeName.toLowerCase() == selector)
 54 |                            nth++;
 55 |                     }
 56 | 
 57 |                     // // NOTE: always give a nth-of-type tag, even if
 58 |                     // // if there's only a single sibling, just to be
 59 |                     // // extra-specific
 60 |                     // if (nth != 1)
 61 | 
 62 |                     selector += ":nth-of-type("+nth+")";
 63 |                     path.unshift(selector);
 64 |                     el = el.parentNode;
 65 |                 }
 66 |                 return path.join(" > ");
 67 |             }
 68 | 
 69 |             // NOTE: this used to have a toLowerCase on it, but it caused
 70 |             // problems with some pages. Leaving it as it was found in the
 71 |             // original DOM is best here.
 72 |             return getPathTo(arguments[0]); //.toLowerCase();
 73 |         """
 74 |         return self.driver.execute_script(script, element)
 75 | 
 76 |     def clickable_sanity_check(self, element):
 77 |         try:
 78 |             if not self.element_displayed(element):
 79 |                 return False
 80 |         except Exception as e:
 81 |             logger.debug("[!] Skipping sanity check due to failure: %s" % (e))
 82 |             return False
 83 |         return super().clickable_sanity_check(element)
 84 | 
 85 |     def get_inputs(self, form=None, itype=None, root_node=None):
 86 |         return super().get_inputs(form=form, itype=itype, root_node=self.driver)
 87 | 
 88 |     def get_clickable(self, path=None):
 89 |         """
 90 |         Get all clickable element tags on the current page.
 91 | 
 92 |         TODO: In the future we may need to recurse the page to find
 93 |         other clickable types like JS-enabled divs, etc.
 94 |         """
 95 |         xpath = path or "|".join([
 96 |             "//a", "//button", "//input[@type='submit']",
 97 |             "//input[@type='button']"
 98 |         ])
 99 |         return super().get_clickable(path=xpath)
100 | 


--------------------------------------------------------------------------------
/autoscrape/backends/warc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/backends/warc/__init__.py


--------------------------------------------------------------------------------
/autoscrape/backends/warc/browser.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | import io
  3 | import logging
  4 | import os
  5 | import pickle
  6 | import sys
  7 | 
  8 | from autoscrape.backends.requests.browser import RequestsBrowser
  9 | from autoscrape.backends.requests.tags import Tagger
 10 | from autoscrape.search.graph import Graph
 11 | from autoscrape.util.warc import build_warc_index, _warc_records
 12 | 
 13 | 
 14 | logger = logging.getLogger('AUTOSCRAPE')
 15 | 
 16 | 
 17 | try:
 18 |     import plyvel
 19 |     import warcio
 20 | except ModuleNotFoundError:
 21 |     pass
 22 | 
 23 | 
 24 | class WARCBrowser(RequestsBrowser):
 25 |     def __init__(self, warc_index_file=None, warc_directory=None,
 26 |                  filter_domain=None, leave_host=False, **kwargs):
 27 |         try:
 28 |             warcio
 29 |         except NameError:
 30 |             logger.debug(
 31 |                 "WARC dependencies not installed."
 32 |                 " (Hint: pip install autoscrape[warc-backend])"
 33 |                 " Exiting."
 34 |             )
 35 |             sys.exit(1)
 36 | 
 37 |         no_dir_msg = "Error: No warc_directory specified for WARCBrowser"
 38 |         assert warc_directory is not None, no_dir_msg
 39 | 
 40 |         no_index_msg = "Error: No warc_index_file specified for WARCBrowser"
 41 |         assert warc_index_file is not None, no_index_msg
 42 | 
 43 |         # leveldb directory
 44 |         self.warc_index_file = warc_index_file
 45 |         # directory containing Common Crawl WARCs
 46 |         self.warc_directory = warc_directory
 47 |         # only build index for a specific domain
 48 |         self.filter_domain = filter_domain
 49 | 
 50 |         # WARC index: URL => (filename, record_number)
 51 |         self.warc_index = plyvel.DB(self.warc_index_file, create_if_missing=True)
 52 |         build_warc_index(
 53 |             db=self.warc_index, warc_directory=self.warc_directory,
 54 |             filter_domain=self.filter_domain
 55 |         )
 56 |         # WARC cache: filename => [record1, ..., recordN]
 57 |         self.warc_cache = {}
 58 |         self.warc_directory = warc_directory
 59 | 
 60 |         # how many WARC files to keep in memory at a given time
 61 |         # since the crawls are sequential, most files for a site
 62 |         # will exist in a segment of a few WARC files.
 63 |         self.warc_cache_size = 2
 64 |         # we're going to store the order the files have have been
 65 |         # accessed most recently here:
 66 |         #     [most_recently_used_filename, ..., least_recently_used_filename]
 67 |         # This will be used to enforce our cache size.
 68 |         self.warc_cache_stack = []
 69 | 
 70 |         # set of clicked elements
 71 |         self.visited = set()
 72 | 
 73 |         # queue of the path that led us to the current page
 74 |         # this is in the form of (command, *args, **kwargs)
 75 |         self.path = []
 76 | 
 77 |         # tree building
 78 |         self.graph = Graph()
 79 | 
 80 |         # setting to False, ensures crawl will stay on same host
 81 |         self.leave_host = leave_host
 82 | 
 83 |         self.current_url = None
 84 |         self.current_html = None
 85 | 
 86 |     def _load_warc_file(self, filename):
 87 |         """
 88 |         Take a specified WARC file, load it and keep it in memory in a quickly
 89 |         readable format (python dict). This operates directly on the class
 90 |         variable self.warc_cache and also handles maximum cache size pruning.
 91 |         """
 92 |         logger.debug("[-] Loading WARC file: %s" % (filename))
 93 |         if len(self.warc_cache_stack) > self.warc_cache_size:
 94 |             least_used = self.warc_cache_stack.pop()
 95 |             logger.debug(" - Removing WARC from memory: %s" % (filename))
 96 |             del self.warc_cache[least_used]
 97 | 
 98 |         self.warc_cache[filename] = []
 99 |         for record in _warc_records(filename):
100 |             payload = record["payload"]
101 |             if not payload:
102 |                 payload = "<html></html>"
103 |             self.warc_cache[filename].append({
104 |                 "header": record["headers"],
105 |                 "payload": payload,
106 |             })
107 | 
108 |     def fetch(self, url, initial=False):
109 |         """
110 |         Fetch a page from a given URL from the WARC archive (via
111 |         an index).
112 |         """
113 |         logger.info("%s Fetching url=%s initial=%s" % (
114 |             ("[+]" if initial else " -"), url, initial,
115 |         ))
116 |         url_b = bytes(url, "utf-8")
117 |         data = self.warc_index.get(url_b)
118 |         if not data:
119 |             logger.debug("[!] Couldn't find URL in WARC index: %s" % (url))
120 |             return False
121 |         else:
122 |             filename, record_number = pickle.loads(data)
123 |             logger.debug(" -  Loading filename: %s record number: %s" % (
124 |                 filename, record_number
125 |             ))
126 |             if filename not in self.warc_cache:
127 |                 self._load_warc_file(filename)
128 |             warcfile = self.warc_cache[filename]
129 |             record = warcfile[record_number]
130 |             self.current_html = record["payload"]
131 | 
132 |             try:
133 |                 self.warc_cache_stack.remove(filename)
134 |             except ValueError:
135 |                 pass
136 | 
137 |             self.warc_cache_stack.insert(0, filename)
138 | 
139 |         self.current_url = url
140 |         self.dom = self._get_dom()
141 | 
142 |         if initial:
143 |             self.path.append(("fetch", [url], {"initial": initial}))
144 |             node = "Fetch\n url: %s" % url
145 |             self.graph.add_root_node(node, url=url, action="fetch")
146 | 
147 |         return True
148 | 


--------------------------------------------------------------------------------
/autoscrape/backends/warc/dom.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | from autoscrape.backends.requests.dom import Dom
3 | 


--------------------------------------------------------------------------------
/autoscrape/backends/warc/tags.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | from autoscrape.backends.requests.tags import Tagger
3 | 


--------------------------------------------------------------------------------
/autoscrape/classification.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | 
3 | 
4 | class KNN(object):
5 |     pass
6 | 


--------------------------------------------------------------------------------
/autoscrape/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/cli/__init__.py


--------------------------------------------------------------------------------
/autoscrape/cli/scrape.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | """
  3 | AUTOSCRAPE - Interactively crawl, find searchable forms,
  4 | input data to them and scrape data on the results, from an
  5 | initial BASEURL.
  6 | 
  7 | Usage:
  8 |     autoscrape [options] BASEURL
  9 | 
 10 | General Options:
 11 |     --backend BACKEND
 12 |         The backend to use. Currently one of "selenium", "requests" or
 13 |         "warc".  The requests browser is only capable of crawling, but
 14 |         is approximately 2-3.5x faster. WARC is for emulating browsing
 15 |         through Common Crawl archival data.
 16 |         [default: selenium]
 17 | 
 18 |     --loglevel LEVEL
 19 |         Loglevel, note that DEBUG is extremely verbose.
 20 |         [default: INFO]
 21 | 
 22 |     --quiet
 23 |         This will silence all logging to console.
 24 | 
 25 | Crawl-Specific Options:
 26 |     --maxdepth DEPTH
 27 |         Maximum depth to crawl a site (in search of form
 28 |         if the option --form-match STRING is specified,
 29 |         see below). Setting to 0 means don't crawl at all,
 30 |         all operations are limited to the BASEURL page.
 31 |         Setting to -1 means unlimited maximum crawl depth.
 32 |         [default: 10]
 33 | 
 34 |     --max-pages NUM
 35 |         Maximum number of unique pages, in total, to fetch.
 36 |         AutoScrape will stop crawling once this is hit.
 37 | 
 38 |     --leave-host
 39 |         By default, autoscrape will not leave the host given
 40 |         in the BASEURL. This option lets the scraper leave
 41 |         the host.
 42 | 
 43 |     --only-links MATCH_STRING
 44 |         A whitelist of links to follow. All others will
 45 |         be ignored. Can be a string or a regex with
 46 |         multiple strings to match separated by a pipe
 47 |         (|) character.
 48 | 
 49 |     --ignore-links MATCH_STRING
 50 |         This option can be used to remove any links matching
 51 |         MATCH_STRING (can be a regex or just a string match)
 52 |         from consideration for clicking. Accepts the same
 53 |         argument format as --only-links.
 54 | 
 55 |     --link-priority SORT_STRING
 56 |         A string to sort the links by. In this case, any link
 57 |         containing "SORT_STRING" will be clicked before any other
 58 |         links. In most cases you probably want to use the
 59 |         whitelist, --only-links, option.
 60 | 
 61 |     --ignore-extensions IGNORE_EXTENSIONS
 62 |         Don't click on or download URLs pointing to files with
 63 |         these extensions.
 64 | 
 65 |     --result-page-links MATCH_STRINGS_LIST
 66 |         If specified, AutoScrape will click on any links matching
 67 |         this string when it arrives on a search result page.
 68 | 
 69 | Interactive Form Search Options:
 70 |     --form-match SEARCH_STRING
 71 |         The crawler will identify a form to search/scrape if it
 72 |         contains the specified string. If matched, it will be
 73 |         interactively scraped using the below instructions.
 74 | 
 75 |     --input INPUT_DESCRIPTION
 76 |         Interactive search descriptor. This describes how to
 77 |         interact with a matched form. The inputs are
 78 |         described in the following format:
 79 | 
 80 |         "c:0:True,i:0:atext,s:1:France:d:0:1991-01-20,r:1:0"
 81 | 
 82 |         A single-input type can be one of four types:
 83 |         checkbox ("c"), input box ("i"), option select
 84 |         ("s"), radio checkbox ("r"),  and date inputs
 85 |         ("d", with inputs in the  "YYYY-MM-DD" format).
 86 |         The type is separated by a colon, and the input
 87 |         index position is next. (Each input type has its
 88 |         own list, so a form with one input, one checkbox,
 89 |         and one option select, will all be at index 0.)
 90 |         The final command, sepearated by another colon,
 91 |         describes what to do with the input.
 92 | 
 93 |         Multiple inputs are separated by a comma, so you can
 94 |         interact with multiple inputs before submitting the
 95 |         form.
 96 | 
 97 |         To illustrate this, the above command does the following:
 98 |             - first input checkbox is checked (uncheck is False)
 99 |             - first input box gets filled with the string "first"
100 |             - second select input gets the "France" option chosen
101 |             - first date input gets set to Jan 20, 1991
102 |             - the second radio checkbox group will have its
103 |               first option selected
104 | 
105 |     --next-match NEXT_BTN_STRING
106 |         A string to match a "next" button with, after
107 |         searching a form.  The scraper will continue to
108 |         click "next" buttons after a search until no matches
109 |         are found, unless limited by the --formdepth option
110 |         (see below).
111 | 
112 |     --formdepth DEPTH
113 |         How deep the scraper will iterate, by clicking
114 |         "next" buttons. Zero means infinite depth.
115 |         [default: 0]
116 | 
117 |     --form-submit-button-selector XPATH_SELECTOR
118 |         A XPath selector indicating a submit button/element
119 |         to click. Use this when the submit button to a form
120 |         doesn't have any text or is a strange element type.
121 | 
122 |     --form-submit-natural-click
123 |         Some webpages make clicking a link element difficult
124 |         due to JavaScript onClick events. In cases where a
125 |         click does nothing, you can use this option to get
126 |         the scraper to emulate a mouse click over the link's
127 |         poition on the page, activating any higher level JS
128 |         interactions.
129 | 
130 |     --form-submit-wait SECONDS
131 |         How many seconds to force wait after a submit to a form.
132 |         This should be used in cases where the builtin
133 |         wait-for-page-load isn't working properly (JS-heavy
134 |         pages, etc). [default: 5]
135 | 
136 | Webdriver-Specific and General Options:
137 |     --page-timeout SECONDS
138 |         Selenium has a lot of timeout settings used for
139 |         determining when a page has loaded. In cases where
140 |         pages take a very long time, this may need to be
141 |         increased. If you have a fast-loading page, but
142 |         AutoScrape is waiting for load, you can set this
143 |         lower to speed things up. [default: 30]
144 | 
145 |     --force-page-wait SECONDS
146 |        This forces AutoScrape to wait for the specified number
147 |        of seconds after performing a navigation action.
148 |        Increase this for slow sites or oddly loading single page
149 |        apps, decrease this for fast sites or crawls.
150 |        [default: 1]
151 | 
152 |     --load-images
153 |         By default, images on a page will not be fetched.
154 |         This speeds up scrapes on sites and lowers bandwidth
155 |         needs. This option fetches all images on a page.
156 | 
157 |     --show-browser
158 |         By default, we hide the browser during operation.
159 |         This option displays a browser window, mostly
160 |         for debugging purposes.
161 | 
162 |     --driver DRIVER
163 |         Which browser to use. Current support for "Firefox",
164 |         "Chrome", and "remote". [default: Firefox]
165 | 
166 |     --browser-binary PATH_TO_BROWSER
167 |         Path to a specific browser binary. If left blank
168 |         selenium will pull the browser found on your path.
169 | 
170 |     --remote-hub URI
171 |         If using "remote" driver, specify the hub URI to
172 |         connect to. Needs the proto, address, port, and path.
173 |         [default: http://localhost:4444/wd/hub]
174 | 
175 | WARC Options:
176 |     --warc-directory PATH_TO_WARCS
177 |         Path to the folder containing GZipped WARC files. These can be
178 |         downloaded from Common Crawl. Required when using the "warc"
179 |         backend.
180 | 
181 |     --warc-index-file PATH_TO_LEVELDB
182 |         Path to the level DB database holding the URL-to-file
183 |         index: URL => (filename, record_number)
184 |         This will be generated from the WARCS in the --warc-directory
185 |         speficied if it's not already. Required when using the "warc"
186 |         backend.
187 | 
188 | Data Saving Options:
189 |     --output DIRECTORY_OR_URL
190 |         If specified, this indicates where to save pages during a
191 |         crawl. This directory will be created if it does not
192 |         currently exist.  This directory will have several
193 |         sub-directories that contain the different types of pages
194 |         found (i.e., search_pages, data_pages, screenshots).
195 |         This can also accept a URL (i.e., http://localhost:5000/files)
196 |         and AutoScrape will POST to that endpoint with each
197 |         file scraped.
198 |         [default: autoscrape-data]
199 | 
200 |     --keep-filename
201 |         By default, we hash the files in a scrape in order to
202 |         account for dynamic content under a single-page app
203 |         (SPA) website implmentation. This option will force
204 |         the scraper to retain the original filename, from the
205 |         URL when saving scrape data.
206 | 
207 |     --save-screenshots
208 |         This option makes the scraper save screenshots of each
209 |         page, interaction, and search. Screenshots will be
210 |         saved to the screenshots folder of the output dir.
211 | 
212 |     --full-page-screenshots
213 |         By default, we only save the first displayed part of the
214 |         webpage. The remaining portion that you can only see
215 |         by scrolling down isn't captured. Setting this option
216 |         forces AutoScrape to scroll down and capture the entire
217 |         web content. This can fail in certain circumstances, like
218 |         in API output mode and should be used with care.
219 | 
220 |     --save-graph
221 |         This option allows the scraper to build a directed graph
222 |         of the entire scrape and will save it to the "graph"
223 |         subdirectory under the output dir. The output file
224 |         is a timestamped networkx pickled graph.
225 | 
226 |     --disable-style-saving
227 |         By default, AutoScrape saves the stylesheets associated
228 |         with a scraped page. To save storage, you can disable this
229 |         functionality by using this option.
230 | """
231 | import logging
232 | 
233 | from docopt import docopt
234 | 
235 | import autoscrape
236 | 
237 | 
238 | logger = logging.getLogger('AUTOSCRAPE')
239 | 
240 | 
241 | def main():
242 |     docopt_args = docopt(__doc__)
243 | 
244 |     BASEURL = docopt_args.pop("BASEURL")
245 | 
246 |     # strip the -- and convert - to _
247 |     args = {}
248 |     for option in docopt_args:
249 |         args[option[2:].replace('-', '_')] = docopt_args[option]
250 | 
251 |     # configure stdout logging
252 |     docopt_args["stdout"] = True
253 |     if "quiet" in args:
254 |         quiet = args.pop("quiet")
255 |         args["stdout"] = not quiet
256 | 
257 |     scraper = autoscrape.ManualControlScraper(BASEURL, **args)
258 | 
259 |     logger.debug("AutoScrape starting with arguments: %s" % (docopt_args))
260 |     scraper.run()
261 | 


--------------------------------------------------------------------------------
/autoscrape/control.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | import time
  3 | import logging
  4 | 
  5 | from autoscrape.backends.requests.browser import RequestsBrowser
  6 | from autoscrape.vectorization.text import TextVectorizer
  7 | 
  8 | # backends/vectorizers with optional dependencies
  9 | try:
 10 |     from autoscrape.vectorization.ebmeddings import EmbeddingsVectorizer
 11 | except ModuleNotFoundError:
 12 |     pass
 13 | 
 14 | try:
 15 |     from autoscrape.backends.selenium.browser import SeleniumBrowser
 16 | except ModuleNotFoundError:
 17 |     pass
 18 | 
 19 | try:
 20 |     from autoscrape.backends.warc.browser import WARCBrowser
 21 | except ModuleNotFoundError:
 22 |     pass
 23 | 
 24 | 
 25 | logger = logging.getLogger('AUTOSCRAPE')
 26 | 
 27 | 
 28 | class Controller:
 29 |     """
 30 |     High-level control for scraping a web page. This allows us to control
 31 |     all of the possible scraper commands in an automated way, using a set
 32 |     of indices instead of tags. This way we can present vectors of options
 33 |     to a ML model. This abstraction also returns feature matrices for pages
 34 |     and elements on the webpage.
 35 |     """
 36 | 
 37 |     def __init__(self, leave_host=False, driver="Firefox", browser_binary=None,
 38 |                  remote_hub="http://localhost:4444/wd/hub", output=None,
 39 |                  form_submit_natural_click=False, form_submit_wait=5,
 40 |                  warc_index_file=None, warc_directory=None,
 41 |                  force_page_wait=None, form_submit_button_selector=None,
 42 |                  load_images=False, show_browser=False, page_timeout=None,
 43 |                  html_embeddings_file=None, word_embeddings_file=None,
 44 |                  backend="selenium", vectorizer="text"):
 45 |         """
 46 |         Set up our WebDriver and misc utilities.
 47 |         """
 48 |         Browser = None
 49 |         self.backend = backend
 50 |         if backend == "selenium":
 51 |             Browser = SeleniumBrowser
 52 |         elif backend == "requests":
 53 |             Browser = RequestsBrowser
 54 |         elif backend == "warc":
 55 |             Browser = WARCBrowser
 56 |         else:
 57 |             raise NotImplementedError(
 58 |                 "No backend found: %s" % (backend)
 59 |             )
 60 | 
 61 |         self.scraper = Browser(
 62 |             leave_host=leave_host, driver=driver,
 63 |             browser_binary=browser_binary, remote_hub=remote_hub,
 64 |             form_submit_natural_click=form_submit_natural_click,
 65 |             form_submit_wait=form_submit_wait,
 66 |             form_submit_button_selector=form_submit_button_selector,
 67 |             warc_index_file=warc_index_file, warc_directory=warc_directory,
 68 |             load_images=load_images, show_browser=show_browser,
 69 |             output=output, page_timeout=page_timeout,
 70 |         )
 71 | 
 72 |         Vectorizer = None
 73 |         if vectorizer == "text":
 74 |             self.vectorizer = TextVectorizer(
 75 |                 scraper=self.scraper, controller=self
 76 |             )
 77 |         elif vectorizer == "embeddings":
 78 |             self.vectorizer = EmbeddingsVectorizer(
 79 |                 scraper=self.scraper, controller=self,
 80 |                 html_embeddings_file=html_embeddings_file,
 81 |                 word_embeddings_file=word_embeddings_file,
 82 |             )
 83 |         else:
 84 |             raise NotImplementedError(
 85 |                 "No vectorizer found: %s" % (vectorizer)
 86 |             )
 87 | 
 88 |         # this flag marks vectors as stale. when this is true and
 89 |         # we try to access the link vectors, we'll re-load them
 90 |         self.stale = True
 91 | 
 92 |         self.clickable = None
 93 | 
 94 |         # simply a list of form tags, each forms input contents is
 95 |         # contained in the self.inputs multi-dimensional array, below
 96 |         self.forms = []
 97 | 
 98 |         # this expands into the following format:
 99 |         # [ form_tag:
100 |         #   [
101 |         #     [text input tags...],
102 |         #     [select input tags...],
103 |         #     [checkbox input tags...]
104 |         #   ],
105 |         #   other forms ...,
106 |         # ]
107 |         self.inputs = []
108 | 
109 |         # TODO: the point of this wait is to ensure the DOM has stopped
110 |         # mutating (loading results, etc). a proper fix for this is to
111 |         # look at the count of DOM objects being queried for each index
112 |         # type and detect when it stops changing.
113 |         self.force_page_wait = force_page_wait
114 |         if self.force_page_wait is not None:
115 |             self.force_page_wait = int(self.force_page_wait)
116 | 
117 |     def load_indices(self):
118 |         logger.debug("[.] Loading page vectors...")
119 |         if self.backend == "selenium" and self.force_page_wait:
120 |             logger.debug(" - Force waiting for %s seconds" % (
121 |                 self.force_page_wait
122 |             ))
123 |             time.sleep(self.force_page_wait)
124 | 
125 |         self.clickable = None
126 |         # self.clickable = self.scraper.get_clickable()
127 |         logger.debug(" - Getting forms")
128 |         forms_dict = self.scraper.get_forms()
129 |         self.forms = list(forms_dict.keys())
130 |         logger.debug(" - Getting inputs")
131 |         self.inputs = [tags for tags in forms_dict.values()]
132 |         self.buttons = None # self.scraper.get_buttons()
133 | 
134 |         # logger.debug("Clickable links: %s" % (len(self.clickable)))
135 |         # for i in range(len(self.clickable)):
136 |         #     t = self.clickable[i]
137 |         #     elem = self.scraper.element_by_tag(t)
138 |         #     text = ""
139 |         #     if elem:
140 |         #         text = elem.text.replace("\n", " ")
141 |         #     logger.debug("  %s - ...%s, %s" % (i, t[-25:], text))
142 | 
143 |         # logger.debug("Forms: %s:" % (len(self.forms)))
144 |         # for i in range(len(self.forms)):
145 |         #     t = self.forms[i]
146 |         #     text = ""
147 |         #     elem = self.scraper.element_by_tag(t)
148 |         #     if elem:
149 |         #         text = elem.text.replace("\n", " ")
150 |         #     logger.debug("  %s - ...%s, %s" % (i, t[-25:], text))
151 | 
152 |         # logger.debug("Inputs: %s" % (len(self.inputs)))
153 |         # for i in range(len(self.inputs)):
154 |         #     input_group = self.inputs[i]
155 |         #     for itype_ix in range(len(input_group)):
156 |         #         for t in input_group[itype_ix]:
157 |         #             elem = self.scraper.element_by_tag(t)
158 |         #             text = ""
159 |         #             placeholder = ""
160 |         #             if elem:
161 |         #                 text = elem.text.replace("\n", " ")
162 |         #                 placeholder = elem.get_attribute("placeholder")
163 |         #             logger.debug("  %s - ...%s, %s, %s" % (
164 |         #                 i, t[-25:], text, placeholder))
165 | 
166 |         # logger.debug("Buttons: %s" % (len(self.buttons)))
167 |         # for i in range(len(self.buttons)):
168 |         #     t = self.buttons[i]
169 |         #     elem = self.scraper.element_by_tag(t)
170 |         #     text = ""
171 |         #     value = ""
172 |         #     if elem:
173 |         #         text = elem.text.replace("\n", " ")
174 |         #         value = elem.get_attribute("value")
175 |         #     logger.debug("  %s - ...%s, %s, %s" % (i, t[-25:], text, value))
176 | 
177 |     def initialize(self, url):
178 |         """
179 |         Instantiate a web scraper, given a starting point URL. Also
180 |         gets the links for the current page and sets its tag array.
181 |         """
182 |         self.scraper.fetch(url, initial=True)
183 |         self.load_indices()
184 | 
185 |     def select_link(self, index, iterating_form=False):
186 |         if self.clickable is None:
187 |             logger.debug(" - Getting links")
188 |             self.clickable = self.scraper.get_clickable()
189 |         if index >= len(self.clickable):
190 |             logger.error(
191 |                 "[!] Critical error: link index exceeds clickable length."
192 |             )
193 |             return False
194 |         tag = self.clickable[index]
195 |         clicked = self.scraper.click(tag, iterating_form=iterating_form)
196 |         if clicked:
197 |             self.load_indices()
198 |         return clicked
199 | 
200 |     def select_button(self, index, iterating_form=False):
201 |         if self.buttons is None:
202 |             logger.debug(" - Getting buttons")
203 |             self.buttons = self.scraper.get_buttons()
204 |         tag = self.buttons[index]
205 |         clicked = self.scraper.click(tag, iterating_form=iterating_form)
206 |         if clicked:
207 |             self.load_indices()
208 |         return clicked
209 | 
210 |     def input(self, form_ix, index, chars):
211 |         """
212 |         Add some string to a text input under a given form.
213 |         """
214 |         tag = self.inputs[form_ix][0][index]
215 |         self.scraper.input(tag, chars)
216 | 
217 |     def input_select_option(self, form_ix, index, option_str):
218 |         """
219 |         Select an option for a select input under a given form.
220 |         """
221 |         tag = self.inputs[form_ix][1][index]
222 |         self.scraper.input_select_option(tag, option_str)
223 | 
224 |     def input_checkbox(self, form_ix, index, to_check):
225 |         """
226 |         Check/uncheck a checkbox input under a given form.
227 |         """
228 |         tag = self.inputs[form_ix][2][index]
229 |         self.scraper.input_checkbox(tag, to_check)
230 | 
231 |     def input_date(self, form_ix, index, chars):
232 |         """
233 |         Select a date from an input type="date". String needs to
234 |         be in the MM-DD-YYYY format.
235 |         """
236 |         tag = self.inputs[form_ix][3][index]
237 |         self.scraper.input(tag, chars)
238 | 
239 |     def input_radio_option(self, form_ix, index, radio_index):
240 |         """
241 |         Select a radio checkbox from a given form, checkbox group
242 |         index and desired value by text.
243 |         """
244 |         grp_tags = self.inputs[form_ix][4][index]
245 |         tag = grp_tags[radio_index]
246 |         self.scraper.input_checkbox(tag, True, radio=True)
247 | 
248 |     def submit(self, index):
249 |         tag = self.forms[index]
250 |         self.scraper.submit(tag)
251 |         self.load_indices()
252 | 
253 |     def back(self):
254 |         self.scraper.back()
255 |         self.load_indices()
256 | 


--------------------------------------------------------------------------------
/autoscrape/filetypes.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | TEXT_EXTENSIONS = (
 3 |     # ASP Classic
 4 |     "asp",
 5 | 
 6 |     # ASP.NET
 7 |     "aspx",
 8 |     "axd",
 9 |     "asx",
10 |     "asmx",
11 |     "ashx",
12 | 
13 |     # CSS
14 |     "css",
15 | 
16 |     # Coldfusion
17 |     "cfm",
18 | 
19 |     # Erlang
20 |     "yaws",
21 | 
22 |     # # Flash, treat as data
23 |     # "swf",
24 | 
25 |     # HTML
26 |     "html",
27 |     "htm",
28 |     "xhtml",
29 |     "jhtml",
30 |     "hta",
31 | 
32 |     # Java
33 |     "jsp",
34 |     "jspx",
35 |     "wss",
36 |     "do",
37 |     "action",
38 | 
39 |     # JavaScript
40 |     "js",
41 | 
42 |     # Perl
43 |     "pl",
44 | 
45 |     # PHP
46 |     "php",
47 |     "php4",
48 |     "php3",
49 |     "phtml",
50 | 
51 |     # Python
52 |     "py",
53 | 
54 |     # Ruby
55 |     "rb",
56 |     "rhtml",
57 | 
58 |     # SSI
59 |     "shtml",
60 | 
61 |     # Other (C, perl etc.)
62 |     # NOTE: these could also be data
63 |     "cgi",
64 |     "dll",
65 | 
66 |     # ASP.NET
67 |     "axd",
68 |     "asx",
69 |     "asmx",
70 |     "ashx",
71 |     "aspx",
72 | 
73 |     # XML/Feed
74 |     "xml",
75 |     "rss",
76 |     "atom",
77 |     "svg",
78 | )
79 | 


--------------------------------------------------------------------------------
/autoscrape/input_parser.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | import re
  3 | 
  4 | 
  5 | class InputParser:
  6 |     def __init__(self, input):
  7 |         self.input = input
  8 | 
  9 |     def str2bool(self, string):
 10 |         if not string:
 11 |             return False
 12 |         if string.lower() in ["false", "no", "n", "0"]:
 13 |             return False
 14 |         return True
 15 | 
 16 |     def generate(self):
 17 |         """
 18 |         Make a form input generator by parsing our input string. Output
 19 |         is a multidimensional array, where the first dimension is
 20 |         independent searches to attempt and the second dimension is
 21 |         which inputs for fill. Example:
 22 | 
 23 |         From this input string:
 24 | 
 25 |           "i:0:a%,c:1:True,s:2:France"
 26 | 
 27 |         We get the following output generator:
 28 | 
 29 |             [
 30 |               [
 31 |                 { "index": 0, "string": "a%", "type": "input" }
 32 |                 { "index": 1, "action": True, "type": "checkbox" }
 33 |                 { "index": 2, "string": "France", "type": "select" }
 34 |               ],
 35 |               ...,
 36 |               [
 37 |                 { "index": 0, "string": "z%", "type": "input" },
 38 |                 { "index": 1, "action": True, "type": "checkbox" }
 39 |                 { "index": 2, "string": "France", "type": "select" }
 40 |               ]
 41 |             ]
 42 | 
 43 |         This will do all searches with input 0 filled with a-z and the
 44 |         remaining specified inputs filled as desired (input 1, a checkbox,
 45 |         checked and input 2, a choice select, selected to the "France"
 46 |         option).
 47 |         """
 48 |         # TODO: Find an overall way to support ranges without resulting
 49 |         # to clunky regex (which don't support ordering). Right now just
 50 |         # use GNU Parallel or something to do ranges.
 51 |         # split the independent searches first
 52 |         inputs = re.split(r'(?<!\\);', self.input)
 53 |         for inp in inputs:
 54 |             indiv_search = []
 55 |             # split the inputs to be filled per search
 56 |             indiv_inputs_list = re.split(r'(?<!\\),', inp)
 57 |             for indiv_inputs in indiv_inputs_list:
 58 |                 # input type, input index, action
 59 |                 itype, ix, action = indiv_inputs.split(":", 2)
 60 |                 action = action.replace("\,", ",").replace("\;", ";")
 61 |                 ix = int(ix)
 62 |                 if itype == "i":
 63 |                     indiv_search.append({
 64 |                         "index": ix,
 65 |                         "type": "input",
 66 |                         "string": action,
 67 |                     })
 68 |                 elif itype == "s":
 69 |                     indiv_search.append({
 70 |                         "index": ix,
 71 |                         "type": "select",
 72 |                         "string": action,
 73 |                     })
 74 |                 elif itype == "c":
 75 |                     indiv_search.append({
 76 |                         "index": ix,
 77 |                         "type": "checkbox",
 78 |                         "action": self.str2bool(action),
 79 |                     })
 80 |                 elif itype == "d":
 81 |                     # make sure our date field is correctly
 82 |                     # formatted. non-matching date fields
 83 |                     # will not make it to the input in webdriver
 84 |                     # datefmt = re.match("[0-9]{4}\-[0-9]{2}\-[0-9]{2}", action)
 85 |                     # assert datefmt, "Bad Date! Dates need to be: YYYY-MM-DD"
 86 |                     indiv_search.append({
 87 |                         "index": ix,
 88 |                         "type": "date",
 89 |                         "string": action,
 90 |                     })
 91 |                 elif itype == "r":
 92 |                     indiv_search.append({
 93 |                         "index": ix,
 94 |                         "type": "radio",
 95 |                         "string": int(action),
 96 |                     })
 97 |                 else:
 98 |                     raise Exception("Invalid input type found: %s" % itype)
 99 | 
100 |             yield indiv_search
101 | 


--------------------------------------------------------------------------------
/autoscrape/scrapers/__init__.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | import datetime
  3 | import hashlib
  4 | import logging
  5 | import os
  6 | import re
  7 | import sys
  8 | import time
  9 | 
 10 | from autoscrape.filetypes import TEXT_EXTENSIONS
 11 | from autoscrape.util import (
 12 |     get_filename_from_url, get_extension_from_url, write_file
 13 | )
 14 | 
 15 | 
 16 | logger = logging.getLogger('AUTOSCRAPE')
 17 | 
 18 | 
 19 | class BaseScraper(object):
 20 |     """
 21 |     A base class for common scraper functionality like loglevel
 22 |     parsing & setup, file saving, etc.
 23 |     """
 24 | 
 25 |     def setup_logging(self, loglevel=None, stdout=False):
 26 |         if loglevel == "DEBUG":
 27 |             loglevel = logging.DEBUG
 28 |         elif not loglevel or loglevel == "INFO":
 29 |             loglevel = logging.INFO
 30 |         elif loglevel == "WARN":
 31 |             loglevel = logging.WARN
 32 |         elif loglevel == "ERROR":
 33 |             loglevel = logging.ERROR
 34 | 
 35 |         logger.setLevel(loglevel)
 36 |         if stdout:
 37 |             console_handler = logging.StreamHandler(stream=sys.stdout)
 38 |             logger.addHandler(console_handler)
 39 | 
 40 |     def save_screenshot(self, classname=None):
 41 |         """
 42 |         Save a screenshot of the current page window. The files
 43 |         created by this are quite large and also only contain the
 44 |         first, above-the-fold content.
 45 | 
 46 |         Screenshots are saved by the timestamp and the "class"
 47 |         of the page currently visited (data_page, etc).
 48 |         """
 49 |         if not self.output or not self.save_screenshots:
 50 |             return
 51 | 
 52 |         if classname not in self.training_classes:
 53 |             raise ValueError("Base class speficied: %s" % classname)
 54 | 
 55 |         if self.full_page_screenshots:
 56 |             # adjust the window size to the document size, this captures most
 57 |             # web configurations except cases where a footer, for example, has
 58 |             # been placed absolutely outside of the body and hangs beneath
 59 |             # NOTE: this comes from here: https://stackoverflow.com/a/52572919
 60 |             original_size = self.control.scraper.driver.get_window_size()
 61 |             required_width = self.control.scraper.driver.execute_script(
 62 |                 'return document.body.parentNode.scrollWidth'
 63 |             )
 64 |             required_height = self.control.scraper.driver.execute_script(
 65 |                 'return document.body.parentNode.scrollHeight'
 66 |             )
 67 |             logger.debug(
 68 |                 "Required width=%s, height=%s, Original width=%s, height=%s" % (
 69 |                     required_width, required_height, original_size["width"],
 70 |                     original_size["height"]
 71 |                 ))
 72 |             self.control.scraper.driver.set_window_size(
 73 |                 required_width, required_height
 74 |             )
 75 | 
 76 |         if re.match("^https?://", self.output):
 77 |             screenshot_dir = "screenshots"
 78 |         else:
 79 |             screenshot_dir = os.path.join(self.output, "screenshots")
 80 | 
 81 |         filepath = os.path.join(screenshot_dir, "%s_%s.png" % (
 82 |             int(time.time()), classname
 83 |         ))
 84 | 
 85 |         png = None
 86 |         if hasattr(self.control.scraper, "driver"):
 87 |             logger.debug("[.] Saving screenshot to file: %s." % filepath)
 88 |             png = self.control.scraper.get_screenshot()
 89 | 
 90 |         # if self.control.scraper.driver_name == "Firefox":
 91 |         #     try:
 92 |         #         html_el = self.control.scraper.driver.find_element_by_tag_name(
 93 |         #             "html"
 94 |         #         )
 95 |         #         png = html_el.screenshot_as_png
 96 |         #         logger.debug("HTML root for screenshot: %s" % html_el)
 97 |         #         logger.debug("DIR: %s" % dir(html_el))
 98 |         #     except Exception as e:
 99 |         #         logger.warn("Error saving screenshot: %s" % e)
100 | 
101 |         # elif not png:
102 |         #     try:
103 |         #         png = self.control.scraper.driver.get_screenshot_as_png()
104 |         #     except Exception as e:
105 |         #         logger.warn("Error saving screenshot: %s" % e)
106 | 
107 |         if png and self.output:
108 |             write_file(
109 |                 filepath, png, fileclass="screenshot",
110 |                 writetype="wb", output=self.output,
111 |                 url=self.control.scraper.page_url,
112 |             )
113 | 
114 |         if self.full_page_screenshots:
115 |             # restore original window size to avoid side effects
116 |             self.control.scraper.driver.set_window_size(
117 |                 original_size['width'], original_size['height']
118 |             )
119 | 
120 |     def save_training_page(self, classname=None, url=None):
121 |         """
122 |         Writes the current page to the output data directory (if provided)
123 |         to the given class folder.
124 |         """
125 |         if not self.output and not self.return_data:
126 |             return
127 | 
128 |         logger.debug("[.] Saving training page for class: %s" % classname)
129 |         if classname not in self.training_classes:
130 |             raise ValueError("Base class speficied: %s" % classname)
131 | 
132 |         data = None
133 |         if url is None:
134 |             # TODO: migrate over
135 |             data = self.control.scraper.page_html
136 |             url = self.control.scraper.page_url
137 |         else:
138 |             data = self.control.scraper.download_file(url)
139 | 
140 |         ext = get_extension_from_url(url)
141 |         link_to_text = ext in TEXT_EXTENSIONS
142 |         if not link_to_text:
143 |             data = self.control.scraper.download_file(url, return_data=True)
144 |             print("DOWNLOADS")
145 |             classname = "downloads"
146 | 
147 |         # always keep filename for downloads, for now
148 |         if not self.output or re.match("^https?://", self.output):
149 |             classdir = classname
150 |         else:
151 |             classdir = os.path.join(self.output, classname)
152 | 
153 | 
154 |         # we had some kind of error downloading
155 |         if data is None:
156 |             return
157 | 
158 |         # hash the contents of the file, so we don't *not* save dynamic
159 |         # JS pages with the same URl and that we *don't* excessively save
160 |         # the same page over and over
161 |         if type(data) == bytes:
162 |             sha256 = hashlib.sha256()
163 |             sha256.update(data)
164 |             h = sha256.digest().hex()
165 |             writetype = "wb"
166 |         else:
167 |             h = hashlib.sha256(data.encode("utf-8")).digest().hex()
168 |             writetype = "w"
169 | 
170 |         logger.debug("URL: %s, Hash: %s, Extension: %s" % (url, h, ext))
171 | 
172 |         # don't use the hash, use the filename from URL
173 |         if self.keep_filename:
174 |             parsed_filename = get_filename_from_url(url)
175 |             logger.debug("Parsed output filename: %s" % parsed_filename)
176 |             filepath = os.path.join(classdir, parsed_filename)
177 |         # use the hash as the output filename
178 |         else:
179 |             filepath = os.path.join(classdir, "%s.%s" % (h, ext))
180 | 
181 |         crawl_data = None
182 |         if self.output:
183 |             logger.info(" - Saving page: %s" % (filepath))
184 |             write_file(
185 |                 filepath, data, fileclass=classname,
186 |                 writetype=writetype, output=self.output,
187 |                 url=self.control.scraper.page_url,
188 |             )
189 |         if self.return_data:
190 |             now = datetime.datetime.utcnow()
191 |             crawl_data = {
192 |                 "url": url,
193 |                 "html": data,
194 |                 "fileclass": classname,
195 |                 "date": now.isoformat(timespec="seconds") + "Z",
196 |             }
197 | 
198 |         # only save stylesheets for web content types
199 |         if link_to_text and not self.disable_style_saving:
200 |             logger.debug(" -  Saving stylesheet")
201 |             style_filepath = "%s.css" % filepath
202 |             # this will save stylesheet as filepath.html.css
203 |             stylesheet = self.control.scraper.get_stylesheet()
204 |             if self.output:
205 |                 write_file(
206 |                     style_filepath, stylesheet,
207 |                     fileclass=classname, output=self.output,
208 |                     url=self.control.scraper.page_url,
209 |                 )
210 |             if self.return_data:
211 |                 crawl_data["css"] = stylesheet
212 | 
213 |         if self.return_data:
214 |             self.crawl_data.append(crawl_data)
215 | 
216 |     def save_scraper_graph(self):
217 |         """
218 |         Saves our graph that was built throughout the scrape. This can
219 |         be used to visualize the scrape, debug it, and replicate it.
220 | 
221 |         Graph is saved to the graph subdirectory of the output
222 |         path, with the filename a microscond timestamp with the .gpickle
223 |         extension.
224 | 
225 |         If output is not set, then calling this function has no
226 |         effect.
227 |         """
228 |         if not self.output or not self.save_graph:
229 |             logger.debug("[!] No output or save-graph options. Not saving")
230 |             return
231 | 
232 |         filename = "%s.gpickle" % int(time.time() * 1000)
233 |         basedir = os.path.join(self.output, "graph")
234 |         if not os.path.exists(basedir):
235 |             os.makedirs(basedir)
236 | 
237 |         graph_save_path = os.path.join(basedir, filename)
238 |         logger.debug("[.] Saving graph to: %s" % graph_save_path)
239 |         self.control.scraper.graph.save_graph(graph_save_path)
240 | 


--------------------------------------------------------------------------------
/autoscrape/scrapers/manual.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | import logging
  3 | import re
  4 | 
  5 | from autoscrape.scrapers import BaseScraper
  6 | from autoscrape.control import Controller
  7 | from autoscrape.input_parser import InputParser
  8 | 
  9 | 
 10 | logger = logging.getLogger('AUTOSCRAPE')
 11 | 
 12 | 
 13 | class ManualControlScraper(BaseScraper):
 14 |     """
 15 |     A Depth-First Search scraper that looks for forms, inputs, and next
 16 |     buttons by some manual criteria and iterates accordingly.
 17 |     """
 18 |     training_classes = [
 19 |         "data_pages", "error_pages", "links_to_documents",
 20 |         "links_to_search", "search_pages", "crawl_pages",
 21 |         "interaction_pages",
 22 |     ]
 23 | 
 24 |     # TODO: save the path to the matched search form to a file. then
 25 |     # upon subsequent loads, we can try that path first and then go
 26 |     # from there. this way the manual control scraper can learn from
 27 |     # self-exploration based on manual params
 28 |     #
 29 |     # Another related idea: it would be nice if we had an API that
 30 |     # saved data about successful runs and loading/replaying them
 31 |     # so users can get familiar with the concepts of self-exploration
 32 |     # and self-learning but without having to get the ML concepts.
 33 | 
 34 |     def __init__(self, baseurl, loglevel=None, stdout=None, maxdepth=10,
 35 |                  max_pages=None,
 36 |                  formdepth=0, next_match=None, form_match=None,
 37 |                  output=None, keep_filename=False, disable_style_saving=False,
 38 |                  save_screenshots=False, save_graph=False,
 39 |                  full_page_screenshots=False, input=None, leave_host=False,
 40 |                  driver="Firefox", browser_binary=None,
 41 |                  remote_hub="http://localhost:4444/wd/hub",
 42 |                  link_priority=None, ignore_links=None, only_links=None,
 43 |                  ignore_extensions=None, result_page_links=None,
 44 |                  form_submit_natural_click=False, form_submit_wait=5,
 45 |                  force_page_wait=None, form_submit_button_selector=None,
 46 |                  load_images=False, show_browser=False, warc_index_file=None,
 47 |                  warc_directory=None, return_data=False, page_timeout=None,
 48 |                  backend="selenium"):
 49 |         # setup logging, etc
 50 |         super().setup_logging(
 51 |             loglevel=loglevel, stdout=stdout
 52 |         )
 53 |         if page_timeout is not None:
 54 |             page_timeout = int(page_timeout or 0)
 55 |         # set up web scraper controller
 56 |         self.control = Controller(
 57 |             leave_host=leave_host, driver=driver, remote_hub=remote_hub,
 58 |             form_submit_natural_click=form_submit_natural_click,
 59 |             form_submit_wait=int(form_submit_wait or 0),
 60 |             form_submit_button_selector=form_submit_button_selector,
 61 |             force_page_wait=int(force_page_wait or 0),
 62 |             load_images=load_images, show_browser=show_browser,
 63 |             warc_index_file=warc_index_file, warc_directory=warc_directory,
 64 |             output=output, backend=backend, browser_binary=browser_binary,
 65 |             page_timeout=page_timeout,
 66 |         )
 67 |         self.control.initialize(baseurl)
 68 |         # depth of DFS in search of form
 69 |         self.maxdepth = int(maxdepth or 0)
 70 |         # current depth of iterating through 'next' form buttons
 71 |         self.formdepth = int(formdepth or 0)
 72 |         # max total pages to fetch
 73 |         self.max_pages = int(max_pages) if max_pages else None
 74 |         # keep number of fetched pages here.
 75 |         self.total_pages = 0
 76 |         # match for link to identify a "next" button
 77 |         self.next_match = next_match
 78 |         # string to match a form (by element text) we want to scrape
 79 |         self.form_match = form_match
 80 |         # Where to write training data from crawl
 81 |         self.output = output
 82 |         # Whether or not to return crawled data upon completion. This can be
 83 |         # used along with output, or on its own. This will store all
 84 |         # data in memory (in self.crawl_data) during the crawl, so beware!
 85 |         self.return_data = return_data
 86 |         self.crawl_data = []
 87 |         # If this is true, do not use a file hash for the filename
 88 |         self.keep_filename = keep_filename
 89 |         # Disable saving of stylesheets for web content types
 90 |         self.disable_style_saving = disable_style_saving
 91 |         # To save screenshots or not (they're large and expensive)
 92 |         self.save_screenshots = save_screenshots
 93 |         # To save the whole, scrolled down page screenshot, this can
 94 |         # fail in certain circumstances
 95 |         self.full_page_screenshots = full_page_screenshots
 96 |         # Whether to save the crawl & interaction traversal graph
 97 |         self.save_graph = save_graph
 98 |         # string used to match link text in order to sort them higher
 99 |         self.link_priority = link_priority
100 |         # string or regex to be used to omit links from clickable
101 |         self.ignore_links = ignore_links
102 |         # string or regex to use to not go to URLs with extensions
103 |         self.ignore_extensions = ignore_extensions
104 |         # Whitelisted links to click
105 |         self.only_links = only_links
106 |         # Apply any link clicking rules to the results pages
107 |         self.result_page_links = result_page_links
108 |         # attempt a position-based "natural click" over the element
109 |         self.form_submit_natural_click = form_submit_natural_click
110 |         # a period of seconds to force a wait after a submit
111 |         self.form_submit_wait = int(form_submit_wait or 0)
112 |         # a generator, outputting individual form interaction plans
113 |         if input:
114 |             self.input_gen = InputParser(input).generate()
115 |         # if not specified, do nothing with forms
116 |         else:
117 |             self.input_gen = [[]]
118 |         # whether or not we've successfully scraped what we want
119 |         self.scraped = False
120 | 
121 |     def click_until_no_links(self, links):
122 |         logger.info("[.] Clicking result page links...")
123 |         if self.max_pages is not None and self.total_pages >= self.max_pages:
124 |             logger.info(" - Maximum pages %s reached, returning..." % self.max_pages)
125 |             return
126 | 
127 |         link_vectors = self.control.vectorizer.link_vectors()
128 |         link_zip = list(zip(range(len(link_vectors)), link_vectors))
129 |         link_zip = filter(
130 |             lambda x: re.findall(self.result_page_links, x[1]),
131 |             link_zip
132 |         )
133 |         logger.debug(" - Candidate links: %s" % (link_zip))
134 |         # Click until we get no more matches
135 |         for ix, text in link_zip:
136 |             logger.info("[.] Clicking result page link: %s" % (text))
137 |             logger.debug(" - Current URL: %s" % (self.control.scraper.page_url))
138 |             if self.control.select_link(ix, iterating_form=True):
139 |                 self.total_pages += 1
140 |                 self.click_until_no_links(links)
141 |                 self.save_training_page(classname="data_pages")
142 |                 self.save_screenshot(classname="data_pages")
143 |                 self.control.back()
144 | 
145 |     def keep_clicking_next_btns(self):
146 |         """
147 |         This looks for "next" buttons, or (in the future) page number
148 |         links, and clicks them until one is not found. This saves the
149 |         pages as it goes.
150 |         """
151 |         logger.info("[*] Entering result page iteration routine")
152 |         depth = 0
153 |         while True:
154 |             if self.formdepth and depth >= self.formdepth:
155 |                 logger.debug("[*] Max 'next' formdepth reached %s" % depth)
156 |                 break
157 | 
158 |             button_data = self.control.vectorizer.button_vectors()
159 |             n_buttons = len(button_data)
160 |             logger.info("[.] On result page %s" % (depth + 1))
161 |             logger.debug(" - Button vectors (%s): %s" % (
162 |                 n_buttons, button_data
163 |             ))
164 | 
165 |             if self.result_page_links:
166 |                 self.click_until_no_links(self.result_page_links)
167 | 
168 |             # element type, index, text
169 |             next_found = None
170 |             for ix in range(n_buttons):
171 |                 button_text = button_data[ix]
172 |                 logger.debug(" - Checking button: %s" % button_text)
173 |                 if re.findall(self.next_match.lower(), button_text.lower()):
174 |                     next_found = ("button", ix, button_text)
175 |                     break
176 | 
177 |             if not next_found:
178 |                 link_vectors = self.control.vectorizer.link_vectors()
179 |                 n_clickable = len(link_vectors)
180 |                 logger.debug(" - Button not found, searching %s links" % (
181 |                     n_clickable
182 |                 ))
183 |                 for ix in range(n_clickable):
184 |                     link_text = link_vectors[ix]
185 |                     logger.debug(" - Checking clickable: %s" % link_text)
186 |                     if re.findall(self.next_match.lower(), link_text.lower()):
187 |                         next_found = ("link", ix, link_text)
188 |                         break
189 | 
190 |             # we didn't find a next match, break loop
191 |             if next_found is None:
192 |                 logger.debug(" - Next button not found!")
193 |                 break
194 |             else:
195 |                 ntype, ix, text = next_found
196 |                 logger.info("[.] Next button found! Clicking: %s" % (text))
197 |                 depth += 1
198 |                 if ntype == "button":
199 |                     self.control.select_button(ix, iterating_form=True)
200 |                 elif ntype == "link":
201 |                     self.control.select_link(ix, iterating_form=True)
202 | 
203 |                 # subsequent page loads get saved here
204 |                 self.save_training_page(classname="data_pages")
205 |                 self.save_screenshot(classname="data_pages")
206 | 
207 |             # check for infinite loop, this is based on the hash
208 |             # of the previous few pages
209 |             logger.debug(" - Checking for infinite loop...")
210 |             if self.control.scraper.infinite_loop_detected:
211 |                 logger.debug(" - Infinte loop detected. Breaking.")
212 |                 break
213 | 
214 |         for _ in range(depth):
215 |             logger.debug("[.] Going back from result page...")
216 |             self.control.back()
217 | 
218 |     def scrape(self, depth=0):
219 |         logger.info("[.] Crawl depth %s" % depth)
220 |         logger.info(" - Total pages: %s of max: %s" % (
221 |             self.total_pages, self.max_pages
222 |         ))
223 |         if self.maxdepth != -1 and depth > self.maxdepth:
224 |             logger.info(" - Maximum depth %s reached, returning..." % depth)
225 |             self.control.back()
226 |             return
227 |         if self.max_pages is not None and self.total_pages >= self.max_pages:
228 |             logger.info(" - Maximum pages %s reached, returning..." % self.max_pages)
229 |             return
230 |         if self.scraped:
231 |             logger.debug(" - Scrape complete, not clicking anything else.")
232 |             return
233 | 
234 |         if self.ignore_extensions and re.findall(self.ignore_extensions,
235 |                                                  self.control.scraper.page_url):
236 |             logger.debug(" - Ignoring URL matching ignored extension: %s" % (
237 |                 self.control.scraper.page_url
238 |             ))
239 |             return
240 | 
241 |         self.save_training_page(classname="crawl_pages")
242 |         self.save_screenshot(classname="crawl_pages")
243 |         form_vectors = self.control.vectorizer.form_vectors()
244 | 
245 |         # NOTE: we never get into this loop if self.input_gen is empty
246 |         # this arises when input was not handed to the initializer
247 |         for ix in range(len(form_vectors)):
248 |             # don't bother with looking for forms if we didn't specify
249 |             # th form_match option
250 |             if not self.form_match:
251 |                 continue
252 | 
253 |             form_data = form_vectors[ix]
254 | 
255 |             # inputs are keyed by form index, purely here for debug purposes
256 |             inputs = self.control.inputs[ix]
257 |             logger.debug(" - Form: %s Text: %s" % (ix, form_data))
258 |             logger.debug(" - Inputs: %s" % inputs)
259 | 
260 |             if self.form_match.lower() not in form_data.lower():
261 |                 continue
262 | 
263 |             logger.info("[*] Found an input form (No. %s on page)" % (ix))
264 |             self.save_training_page(classname="search_pages")
265 |             self.save_screenshot(classname="search_pages")
266 | 
267 |             for input_phase in self.input_gen:
268 |                 logger.debug(" - Input plan: %s" % input_phase)
269 |                 for single_input in input_phase:
270 |                     input_index = single_input["index"]
271 |                     if single_input["type"] == "input":
272 |                         input_string = single_input["string"]
273 |                         logger.info("[.] Inputting %s to input %s" % (
274 |                             input_string, ix
275 |                         ))
276 |                         self.control.input(ix, input_index, input_string)
277 |                     elif single_input["type"] == "select":
278 |                         input_string = single_input["string"]
279 |                         logger.info("[.] Selecting option %s in input %s" % (
280 |                             input_string, input_index
281 |                         ))
282 |                         self.control.input_select_option(
283 |                             ix, input_index, input_string
284 |                         )
285 |                     elif single_input["type"] == "checkbox":
286 |                         to_check = single_input["action"]
287 |                         logger.info("[.] %s checkbox input %s" % (
288 |                             "Checking" if to_check else "Unchecking",
289 |                             input_index
290 |                         ))
291 |                         self.control.input_checkbox(
292 |                             ix, input_index, to_check
293 |                         )
294 |                     elif single_input["type"] == "date":
295 |                         input_string = single_input["string"]
296 |                         logger.info("[.] Setting date to %s in date input %s" % (
297 |                             input_string, ix))
298 |                         self.control.input_date(ix, input_index, input_string)
299 |                     elif single_input["type"] == "radio":
300 |                         radio_index = single_input["string"]
301 |                         logger.info("[.] Selecting radio checkbox %s in group %s" % (
302 |                             radio_index, input_index
303 |                         ))
304 |                         self.control.input_radio_option(
305 |                             ix, input_index, radio_index
306 |                         )
307 | 
308 |                 # capture post-input screenshot
309 |                 self.save_screenshot(classname="interaction_pages")
310 | 
311 |                 # actually submit the page
312 |                 self.control.submit(ix)
313 |                 self.total_pages += 1
314 | 
315 |                 # save the initial landing result page
316 |                 self.save_screenshot(classname="data_pages")
317 |                 self.save_training_page(classname="data_pages")
318 | 
319 |                 # if we're looking for next buttons, click them
320 |                 if self.next_match:
321 |                     self.keep_clicking_next_btns()
322 | 
323 |                 self.scraped = True
324 |                 self.control.back()
325 | 
326 |             logger.debug("[*] Completed iteration!")
327 |             # Only scrape a single form, due to explicit, single
328 |             # match configuration option
329 |             if self.scraped:
330 |                 logger.info("[*] Scrape complete! Exiting.")
331 |                 return
332 | 
333 |         link_vectors = self.control.vectorizer.link_vectors()
334 |         logger.debug("[.] Links on page: %s" % (link_vectors))
335 |         link_zip = list(zip(range(len(link_vectors)), link_vectors))
336 |         if self.ignore_links:
337 |             logger.debug(" - Ignoring links matching: %s" % self.ignore_links)
338 |             link_zip = filter(
339 |                 lambda x: not re.findall(self.ignore_links, x[1]),
340 |                 link_zip
341 |             )
342 |         if self.only_links:
343 |             logger.debug(" - Keeping only links matching: %s" % self.ignore_links)
344 |             link_zip = filter(
345 |                 lambda x: re.findall(self.only_links, x[1]),
346 |                 link_zip
347 |             )
348 |         if self.link_priority:
349 |             logger.debug(" - Sorting by link priority: %s" % self.link_priority)
350 |             link_zip.sort(
351 |                 key=lambda x: not re.findall(self.link_priority, x[1])
352 |             )
353 | 
354 |         for ix, text in link_zip:
355 |             logger.debug(" - Link index: %s text: %s" % (ix, text))
356 |             if self.maxdepth != -1 and depth == self.maxdepth:
357 |                 logger.debug(" - At maximum depth: %s, skipping links." % depth)
358 |                 break
359 |             if self.max_pages is not None and self.total_pages >= self.max_pages:
360 |                 logger.info(" - Maximum pages reached, skipping links.")
361 |                 break
362 |             if self.scraped:
363 |                 logger.debug(" - Scrape complete, not clicking anything else.")
364 |                 return
365 | 
366 |             logger.debug(" - Current URL: %s" % (self.control.scraper.page_url))
367 |             logger.debug(" - Attempting to click link text: %s" % text)
368 |             if self.control.select_link(ix):
369 |                 logger.info("[.] Link clicked: %s" % (text))
370 |                 logger.debug(" - Current URL: %s" % (self.control.scraper.page_url))
371 |                 self.total_pages += 1
372 |                 self.scrape(depth=depth + 1)
373 |             else:
374 |                 logger.debug(" - Click failed, skipping: %s" % text)
375 | 
376 |         logger.debug("[*] Searching forms and links on page complete")
377 |         self.control.back()
378 | 
379 |     def run(self, *args, **kwargs):
380 |         # we have to catch this so, in the case of failure, we
381 |         # don't have random browser windows hanging around
382 |         try:
383 |             self.scrape(*args, **kwargs)
384 |         except Exception as e:
385 |             msg = "[!] Fatal error scraping: %s. Cleaning up, quitting."
386 |             logger.error(msg % (e))
387 |             if hasattr(self.control.scraper, "driver"):
388 |                 self.control.scraper.driver.quit()
389 |             if self.output and self.save_graph:
390 |                 self.save_scraper_graph()
391 |             raise e
392 |         # else:
393 |         #     logger.info("[+] AutoScrape run complete.")
394 |         #     if self.output and self.save_graph:
395 |         #         self.save_scraper_graph()
396 |         try:
397 |             self.control.scraper.driver.quit()
398 |         except Exception:
399 |             pass
400 | 
401 |         if self.return_data:
402 |             return self.crawl_data
403 | 


--------------------------------------------------------------------------------
/autoscrape/scrapers/null.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import logging
 3 | 
 4 | from . import BaseScraper
 5 | from ..control import Controller
 6 | from autoscrape.backends.requests.browser import RequestsBrowser
 7 | from autoscrape.backends.selenium.browser import SeleniumBrowser
 8 | 
 9 | 
10 | logger = logging.getLogger('AUTOSCRAPE')
11 | 
12 | 
13 | class NullScraper(BaseScraper):
14 |     """
15 |     A test scraper that just provides direct access to scraper and
16 |     controller. For vectorizing documents.
17 |     """
18 | 
19 |     def __init__(self, *args, html_embeddings=None, word_embeddings=None,
20 |                  loglevel="INFO", scraper=True, controller=False, backend=None,
21 |                  driver="Chrome", vectorizer=False, **kwargs):
22 |         super(NullScraper, self).setup_logging(loglevel=loglevel)
23 | 
24 |         Browser = None
25 |         if backend == "selenium":
26 |             Browser = SeleniumBrowser
27 |         elif backend == "requests":
28 |             Browser = RequestsBrowser
29 |         else:
30 |             raise NotImplementedError(
31 |                 "No backend found: %s" % (backend)
32 |             )
33 | 
34 |         if scraper:
35 |             self.scraper = Browser(driver=driver)
36 |         if controller:
37 |             self.control = Controller(
38 |                 html_embeddings_file=html_embeddings,
39 |                 word_embeddings_file=word_embeddings,
40 |             )
41 |         if vectorizer:
42 |             self.vectorizer = self.control.vectorizer
43 | 


--------------------------------------------------------------------------------
/autoscrape/scrapers/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import logging
 3 | 
 4 | from . import BaseScraper
 5 | 
 6 | from autoscrape.backends.requests.browser import RequestsBrowser
 7 | 
 8 | 
 9 | logger = logging.getLogger('AUTOSCRAPE')
10 | 
11 | 
12 | class TestScraper(BaseScraper):
13 |     def __init__(self, baseurl, maxdepth=10, loglevel=None):
14 |         """
15 |         Initialize our scraper and get the first page.
16 |         """
17 |         super(TestScraper, self).setup_logging(loglevel=loglevel)
18 |         self.scraper = RequestsBrowser()
19 |         self.scraper.fetch(baseurl)
20 |         self.maxdepth = maxdepth
21 | 
22 |     def run(self, depth=0, tags=None):
23 |         """
24 |         This is the main recursive depth-first search of a site. It
25 |         doesn't do anything but crawl a site DFS and ensure the tagging
26 |         and web engine is working as it should.
27 |         """
28 |         if depth > self.maxdepth:
29 |             logger.debug("Maximum depth %s reached, returning..." % depth)
30 |             self.scraper.back()
31 |             return
32 | 
33 |         logger.debug("** DEPTH %s" % depth)
34 | 
35 |         if not tags:
36 |             tags = self.scraper.get_clickable()
37 | 
38 |         logger.debug(
39 |             "All tags at this depth \n    %s" % ("\n    ").join(tags))
40 | 
41 |         for tag in tags:
42 |             logger.debug("Attempting click on tag \n    %s" % tag)
43 | 
44 |             if self.scraper.click(tag):
45 |                 logger.debug("Clicked! Recursing ...")
46 |                 self.run(
47 |                     depth=depth + 1, tags=self.scraper.get_clickable())
48 | 
49 |         logger.debug("Going back...")
50 |         self.scraper.back()
51 | 


--------------------------------------------------------------------------------
/autoscrape/search/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/search/__init__.py


--------------------------------------------------------------------------------
/autoscrape/search/bfs.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | from collections import deque
 3 | 
 4 | import networkx as nx
 5 | 
 6 | 
 7 | class BFS(object):
 8 |     def __init__(self, root_node):
 9 |         # queue of remaining BFS nodes
10 |         self.queue = deque()
11 |         # store BFS scrape graph
12 |         self.graph = nx.DiGraph()
13 |         # current node (css tag)
14 |         self.current = None
15 | 
16 |     def next(self):
17 |         """
18 |         Get the next CSS path node to interact with.
19 |         """
20 |         newcurrent = self.queue.popleft()
21 |         self.current = newcurrent
22 |         return newcurrent
23 | 
24 |     def add_root_node(self, node, **kwargs):
25 |         """
26 |         Add an isolated node, make it the current node. This
27 |         is for creating new subgraphs inside our graph. Does
28 |         not add any edges.
29 |         """
30 |         self.graph.add_node(node, **kwargs)
31 |         self.current = node
32 | 
33 |     def add_node(self, node, **kwargs):
34 |         """
35 |         Add a single node to the tree, with edges connecting
36 |         to the current node.
37 |         """
38 |         self.queue.append(node)
39 |         self.graph.add_node(node, **kwargs)
40 |         self.graph.add_edge(self.current, node)
41 | 
42 |     def add_nodes(self, nodes):
43 |         """
44 |         Add a list of nodes to the current node in the graph.
45 |         This handles adding the nodes and the edges.
46 |         """
47 |         self.queue.extend(nodes)
48 |         for node, meta in nodes:
49 |             self.add_node(node, **meta)
50 | 


--------------------------------------------------------------------------------
/autoscrape/search/graph.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import logging
 3 | 
 4 | try:
 5 |     import networkx as nx
 6 | except ModuleNotFoundError:
 7 |     pass
 8 | 
 9 | 
10 | logger = logging.getLogger('AUTOSCRAPE')
11 | 
12 | 
13 | class Graph(object):
14 |     def __init__(self):
15 |         try:
16 |             self.graph = nx.DiGraph()
17 |         except NameError:
18 |             logger.debug(
19 |                 "NetworkX not installed. Not building crawl graph."
20 |                 " (Hint: pip install autoscrape[graph])"
21 |             )
22 |             self.graph = None
23 |         # store scrape graph
24 |         # current node (css tag)
25 |         self.current = None
26 | 
27 |     def add_root_node(self, node, **kwargs):
28 |         """
29 |         Add an isolated node, make it the current node. This
30 |         is for creating new subgraphs inside our graph. Does
31 |         not add any edges.
32 |         """
33 |         if self.graph is None:
34 |             return
35 |         self.graph.add_node(node, **kwargs)
36 |         self.current = node
37 | 
38 |     def add_node(self, node, **kwargs):
39 |         """
40 |         Add a single node to the tree, with edges connecting
41 |         to the current node.
42 |         """
43 |         if self.graph is None:
44 |             return
45 |         self.graph.add_node(node, **kwargs)
46 |         self.graph.add_edge(self.current, node)
47 | 
48 |     def add_nodes(self, nodes):
49 |         """
50 |         Add a list of nodes to the current node in the graph.
51 |         This handles adding the nodes and the edges.
52 |         """
53 |         if self.graph is None:
54 |             return
55 |         for node, meta in nodes:
56 |             self.add_node(node, **meta)
57 | 
58 |     def add_meta_to_current(self, **meta):
59 |         if self.graph is None:
60 |             return
61 |         self.graph.nodes[self.current].update(**meta)
62 | 
63 |     def add_action_to_current(self, action):
64 |         if self.graph is None:
65 |             return
66 |         current_meta = self.graph.nodes[self.current]
67 |         current_actions = current_meta.get("actions", [])
68 |         current_actions.append(action)
69 |         current_meta["actions"] = current_actions
70 |         nx.set_node_attributes(self.graph, current_meta, name=self.current)
71 | 
72 |     def move_to_node(self, node):
73 |         if self.graph is None:
74 |             return
75 |         self.current = node
76 | 
77 |     def move_to_parent(self):
78 |         if self.graph is None:
79 |             return
80 |         try:
81 |             preds = self.graph.predecessors(self.current)
82 |             parent = self.graph.predecessors(self.current).__next__()
83 |         except StopIteration:
84 |             return
85 |         self.move_to_node(parent)
86 | 
87 |     def save_graph(self, output_path):
88 |         if self.graph is None:
89 |             return
90 |         nx.write_gpickle(self.graph, output_path)
91 | 


--------------------------------------------------------------------------------
/autoscrape/tasks.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import os
 3 | import re
 4 | 
 5 | from celery import Celery
 6 | 
 7 | from .scrapers.manual import ManualControlScraper
 8 | 
 9 | 
10 | backend = "rpc://"
11 | if os.environ.get("AUTOSCRAPE_DB_HOST"):
12 |     backend = 'db+postgresql://%s:%s@%s/autoscrape' % (
13 |         os.environ["AUTOSCRAPE_DB_USER"],
14 |         os.environ["AUTOSCRAPE_DB_PASSWORD"],
15 |         os.environ["AUTOSCRAPE_DB_HOST"]
16 |     )
17 | 
18 | 
19 | app = Celery(
20 |     'tasks',
21 |     broker=os.environ.get("AUTOSCRAPE_RABBITMQ_HOST"),
22 |     backend=backend,
23 | )
24 | 
25 | app.conf.update(
26 |     # CELERYD_MAX_TASKS_PER_CHILD=1,
27 |     # CELERYD_PREFETCH_MULTIPLIER=1,
28 |     # CELERY_ACKS_LATE=True,
29 |     # CELERY_RESULT_PERSISTENT=True,
30 |     # CELERY_TASK_PUBLISH_RETRY=False,
31 |     # CELERY_TASK_RESULT_EXPIRES=None,
32 |     CELERY_TRACK_STARTED=True,
33 |     CELERY_BROKER_HEARTBEAT=10
34 | )
35 | 
36 | 
37 | @app.task(bind=True)
38 | def start(self, baseurl, args):
39 |     print("Starting ManualControlScraper", baseurl, args)
40 |     # append task ID to receiver URI
41 |     output = args.get("output")
42 |     if output and re.match("^https?://", output):
43 |         if output[-1] != "/":
44 |             output += "/"
45 |         output += str(self.request.id)
46 |         args["output"] = output
47 |     scraper = ManualControlScraper(baseurl, **args)
48 |     scraper.run()
49 | 


--------------------------------------------------------------------------------
/autoscrape/util/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import base64
 3 | import json
 4 | import logging
 5 | import os
 6 | import re
 7 | from urllib import parse
 8 | 
 9 | import requests
10 | 
11 | 
12 | logger = logging.getLogger('AUTOSCRAPE')
13 | 
14 | 
15 | def get_filename_from_url(url):
16 |     """
17 |     Take a fully-qualified URL and turn it into a filename. For
18 |     example, turn a url like this:
19 | 
20 |         https://www.cia.gov/library/readingroom/docs/%5B15423241%5D.pdf
21 | 
22 |     Using the parsed URL:
23 | 
24 |         ParseResult(scheme='https', netloc='www.cia.gov',
25 |             path='/library/readingroom/docs/%5B15423241%5D.pdf
26 | 
27 |     Returing this representation (a string):
28 | 
29 |         _library_readingroom_docs_%5B15423241%5D.pdf
30 | 
31 |     NOTE: If no extension is found on the page, .html is appended.
32 |     """
33 |     parsed = parse.urlparse(url)
34 |     host = parsed.netloc
35 |     # split filename/path and extension
36 |     file_parts = os.path.splitext(parsed.path)
37 |     # file_part = file_parts[0].replace("/", "__")
38 |     extension = file_parts[1] or ".html"
39 |     # filename = "%s_%s" % (host, file_part)
40 |     filename = url.replace("/", "_").replace(":", "_")
41 |     if parsed.query:
42 |         query_part = "_".join(parsed.query.split("&"))
43 |         filename = "%s__%s" % (filename, query_part)
44 |     return "%s%s" % (filename, extension)
45 | 
46 | 
47 | def get_extension_from_url(url):
48 |     # try and extract the extension from the URL
49 |     path = parse.urlparse(url).path
50 |     ext = os.path.splitext(path)[1]
51 |     ext = ext if ext else "html"
52 |     if ext[0] == ".":
53 |         ext = ext[1:]
54 |     return ext
55 | 
56 | 
57 | def write_file(filepath, data, fileclass=None, writetype="w", output=None,
58 |                url=None):
59 |     """
60 |     Write out a scraped data file to disk or a remote callback,
61 |     specified in output parameter.
62 |     """
63 |     logger.debug("[.] Writing file: %s to: %s" % (filepath, output))
64 |     if not output:
65 |         return
66 | 
67 |     # Rest API callback mode
68 |     if re.match("^https?://", output):
69 |         # (b64encode) bytes -> (decode) str
70 |         if type(data) == bytes:
71 |             encoded = base64.b64encode(data).decode()
72 |         else:
73 |             encoded = base64.b64encode(bytes(data, "utf-8")).decode()
74 |         payload = {
75 |             "name": filepath,
76 |             "data": encoded,
77 |             "fileclass": fileclass,
78 |             "url": url,
79 |         }
80 |         if fileclass:
81 |             payload["fileclass"] = fileclass
82 |             post_data = json.dumps(payload).encode("utf-8")
83 |             headers = {
84 |                 "content-type": "application/json"
85 |             }
86 |             r = requests.post(
87 |                 output, data=post_data, headers=headers
88 |             )
89 |             r.status_code
90 | 
91 |     # filesystem mode
92 |     else:
93 |         dirpath = os.path.dirname(filepath)
94 |         if not os.path.exists(dirpath):
95 |             os.makedirs(dirpath)
96 |         with open(filepath, writetype) as f:
97 |             f.write(data)
98 | 


--------------------------------------------------------------------------------
/autoscrape/util/warc.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from multiprocessing import Pool
 3 | import os
 4 | import pickle
 5 | 
 6 | try:
 7 |     import plyvel
 8 |     import warcio
 9 | except ModuleNotFoundError:
10 |     pass
11 | 
12 | 
13 | logger = logging.getLogger('AUTOSCRAPE')
14 | 
15 | 
16 | def _warc_record_sane(record):
17 |     if record.rec_type != "response":
18 |         return False
19 |     if not record.rec_headers.get_header('WARC-Target-URI'):
20 |         return False
21 |     return True
22 | 
23 | 
24 | def _warc_records(filename):
25 |     records = []
26 |     try:
27 |         with open(filename, "rb") as f:
28 |             for record in warcio.ArchiveIterator(f):
29 |                 if not _warc_record_sane(record):
30 |                     continue
31 |                 parsed_rec = {
32 |                     "uri": record.rec_headers.get_header('WARC-Target-URI'),
33 |                     "payload": record.content_stream().read().strip(),
34 |                     "headers": record.http_headers.headers,
35 |                 }
36 |                 yield parsed_rec
37 |     except Exception as e:
38 |         logger.error("[!] Error opening WARC file %s" % (filename))
39 |         logger.error(e)
40 |     return records
41 | 
42 | 
43 | def _process_warcfile(filepath, filter_domain):
44 |     found = 0
45 |     if not filepath.endswith(".warc.gz"):
46 |         return []
47 |     logger.debug(" - Parsing %s" % (filepath))
48 |     record_number = -1
49 |     results = []
50 |     for record in _warc_records(filepath):
51 |         record_number += 1
52 |         uri = record["uri"]
53 |         if filter_domain and filter_domain not in uri:
54 |             continue
55 |         logger.debug("URI: %s" % (uri))
56 |         found += 1
57 |         uri_bytes = bytes(uri, "utf-8")
58 |         value = pickle.dumps((filepath, record_number))
59 |         results.append((uri_bytes, value))
60 |     if found:
61 |         logger.debug(" - Found %s records" % (found))
62 |     return results
63 | 
64 | 
65 | def build_warc_index(db=None, warc_directory=None, filter_domain=None):
66 |     """
67 |     Read through all WARC files in warc_directory and build
68 |     an index: URL => filename, record_number
69 |     """
70 |     blank = True
71 |     for rec in db.iterator():
72 |         blank = False
73 |         break
74 |     if not blank:
75 |         logger.debug("[.] Already loaded WARC index.")
76 |         return
77 |     logger.info("[.] Building WARC index. This might take a while...")
78 |     _, _, filenames = list(os.walk(warc_directory))[0]
79 |     filepaths = [(os.path.join(warc_directory, n), filter_domain) for n in filenames]
80 |     print(filepaths[0])
81 | 
82 |     with Pool(4) as f:
83 |         results_groups = f.starmap(_process_warcfile, filepaths)
84 |     for results in results_groups:
85 |         for uri_bytes, value in results:
86 |             db.put(uri_bytes, value)
87 | 


--------------------------------------------------------------------------------
/autoscrape/vectorization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/autoscrape/vectorization/__init__.py


--------------------------------------------------------------------------------
/autoscrape/vectorization/embeddings.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | import logging
  3 | import re
  4 | 
  5 | import numpy as np
  6 | 
  7 | 
  8 | logger = logging.getLogger('AUTOSCRAPE')
  9 | 
 10 | 
 11 | class Embedding:
 12 |     def __init__(self, embeddings=None, t2id=None, id2t=None):
 13 |         self.embeddings = embeddings
 14 |         self.t2id = t2id
 15 |         self.id2t = id2t
 16 |         self.N, self.dim = embeddings.shape
 17 | 
 18 | 
 19 | class EmbeddingsVectorizer:
 20 |     def __init__(self, html_embeddings_file=None, word_embeddings_file=None,
 21 |                  scraper=None, controller=None, loglevel=None):
 22 |         """
 23 |         Initialize our vectorizer with paths to the relevant word
 24 |         embedding files for our vectorization routines.
 25 | 
 26 |         Representations:
 27 |             - Element text vector (fasttext)
 28 |             - Element path vector ( body -> form -> div ... -> button )
 29 | 
 30 |         Given state, q-function gets action-state pairs. We take
 31 |         the action giving the highest q-value.
 32 |         """
 33 |         self.scraper = scraper
 34 |         self.controller = controller
 35 | 
 36 |         self.html = None
 37 |         if html_embeddings_file:
 38 |             logger.debug("[.] Loading HTML embeddings")
 39 |             self.html = self.load_embedding(html_embeddings_file)
 40 | 
 41 |         self.word = None
 42 |         if word_embeddings_file:
 43 |             logger.debug("[.] Loading word embeddings")
 44 |             self.word = self.load_embedding(word_embeddings_file)
 45 | 
 46 |     def embeddings_length(self, path):
 47 |         N = 0
 48 |         with open(path, "r") as f:
 49 |             for line in f:
 50 |                 if N == 0 and re.match("^[0-9]+\s[0-9]+$", line):
 51 |                     continue
 52 |                 N += 1
 53 |             key, data = line.split(' ', 1)
 54 |             vec = [float(d) for d in data.split()]
 55 |             dim = len(vec)
 56 |         return N, dim
 57 | 
 58 |     def load_embedding(self, path):
 59 |         logger.info("[+] Loading embedding file %s..." % path)
 60 |         N, dim = self.embeddings_length(path)
 61 |         logger.info(" - vocab size: %s, dim: %s" % (
 62 |             N, dim
 63 |         ))
 64 |         logger.debug(" - Allocating embedding matrix...")
 65 |         # token to ID (embedding row)
 66 |         t2id = dict()
 67 |         # ID to token
 68 |         id2t = dict()
 69 |         # embedding matrix
 70 |         embeddings = np.zeros(shape=(N, dim))
 71 |         logger.debug(" - Reading embeddings into memory...")
 72 |         outputs = [(N // 10) * i for i in range(10)]
 73 |         with open(path, "r") as f:
 74 |             embed_id = 0
 75 |             for line in f:
 76 |                 if embed_id == 0 and re.match("^[0-9]+\s[0-9]+$", line):
 77 |                     continue
 78 |                 if embed_id in outputs:
 79 |                     pct_done = (embed_id / float(N)) * 100
 80 |                     logger.info(" - %0.4f%% complete" % (pct_done))
 81 |                 key, data = line.split(' ', 1)
 82 |                 vec = [float(d) for d in data.split()]
 83 |                 embeddings[embed_id, :] = vec
 84 |                 t2id[key] = embed_id
 85 |                 id2t[embed_id] = key
 86 |                 embed_id += 1
 87 | 
 88 |         logger.debug(" - Embeddings matrix: %s x %s" % embeddings.shape)
 89 |         return Embedding(
 90 |             embeddings=embeddings,
 91 |             t2id=t2id,
 92 |             id2t=id2t,
 93 |         )
 94 | 
 95 |     def html_to_vector(self, html):
 96 |         x = np.zeros(self.html.dim)
 97 |         N = 0.0
 98 |         for t in html:
 99 |             N += 1
100 |             if re.match("\s", t):
101 |                 t = "</s>"
102 |             id = self.html.t2id[t]
103 |             x += self.html.embeddings[id]
104 |         return x / N
105 | 
106 |     def text_to_vector(self, text):
107 |         x = np.zeros(self.word.dim)
108 |         N = 0.0
109 |         for t in re.split("[^A-Za-z]", text):
110 |             t = t.strip().lower()
111 |             if not t:
112 |                 continue
113 |             N += 1
114 |             if re.match("\s", t):
115 |                 t = "</s>"
116 |             try:
117 |                 id = self.word.t2id[t]
118 |             except Exception as e:
119 |                 logger.warn("Skipping word=%s,  Error=%s" % (
120 |                     t, e
121 |                 ))
122 |                 continue
123 |             x += self.word.embeddings[id]
124 |         return x / N
125 | 
126 |     def element_to_position_vector(self, element):
127 |         return np.array([0.0])
128 | 
129 |     def vectorize(self, html, text, element=None):
130 |         x_html = self.html_to_vector(html)
131 |         x_text = self.text_to_vector(text)
132 |         concat_array = [x_html, x_text]
133 |         if element:
134 |             x_pos = self.element_to_position_vector(element)
135 |             concat_array.append(x_pos)
136 |         x = np.concatenate(concat_array)
137 |         return x
138 | 
139 |     def page_vector(self):
140 |         raise NotImplementedError(
141 |             "EmbeddingsVectorizer.page_vector not implemented"
142 |         )
143 | 
144 |     def form_vectors(self):
145 |         raise NotImplementedError(
146 |             "EmbeddingsVectorizer.form_vectors not implemented"
147 |         )
148 | 
149 |     def button_vectors(self):
150 |         raise NotImplementedError(
151 |             "EmbeddingsVectorizer.button_vectors not implemented"
152 |         )
153 | 
154 |     def link_vectors(self):
155 |         raise NotImplementedError(
156 |             "EmbeddingsVectorizer.link_vectors not implemented"
157 |         )
158 | 


--------------------------------------------------------------------------------
/autoscrape/vectorization/text.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import logging
 3 | 
 4 | 
 5 | logger = logging.getLogger('AUTOSCRAPE')
 6 | 
 7 | 
 8 | class TextVectorizer:
 9 |     """
10 |     # for ix, text in link_vectors
11 |     link_vectors = self.control.vectorizer.link_vectors()
12 |     # for ix, text in button_vectors
13 |     button_data = self.control.vectorizer.button_vectors()
14 |     form_vectors = self.control.vectorizer.form_vectors()
15 |     """
16 |     def __init__(self, scraper=None, controller=None):
17 |         self.scraper = scraper
18 |         self.controller = controller
19 | 
20 |     def page_vector(self, html):
21 |         """
22 |         Get feature vector from currently loaded page. This should
23 |         be used to determine what type of page we're on and what action
24 |         we ought to take (continue crawl, enter input, scrape structured
25 |         data, etc).
26 |         """
27 |         return self.scraper.element_text(None, block=True)
28 | 
29 |     def form_vectors(self):
30 |         """
31 |         Get a feature vector representing the forms on a page. This ought
32 |         to be used in cases where the model indicates the page may be a
33 |         search page, but where there are multiple forms. Or where you
34 |         just want to determine if a form is interactive data search.
35 |         Another alternative strategy would be to try the search and then
36 |         look at the next page.
37 |         """
38 |         logger.debug("[.] Loading form vectors")
39 |         form_data = []
40 |         for tag in self.controller.forms:
41 |             form = self.scraper.element_by_tag(tag)
42 |             txt = self.scraper.element_text(form, block=True)
43 |             form_data.append(txt)
44 |         return form_data
45 | 
46 |     def button_vectors(self):
47 |         logger.debug("[.] Building button vectors")
48 |         buttons_data = []
49 |         for tag in self.controller.scraper.get_buttons():
50 |             elem = self.scraper.element_by_tag(tag)
51 |             value = ""
52 |             if elem is not None:
53 |                 value = self.scraper.element_value(elem)
54 |             text = []
55 |             if value:
56 |                 text.append(value)
57 |             if elem is not None:
58 |                 text.append(self.scraper.element_text(elem))
59 |             logger.debug(" - button value: %s, text: %s" % (value, text))
60 |             buttons_data.append(" ".join(text))
61 |         return buttons_data
62 | 
63 |     def link_vectors(self):
64 |         """
65 |         Get a matrix of link vectors. These describe the text of the link
66 |         in a way that a ML algorithm could decide how to prioritize the
67 |         search pattern.
68 |         """
69 |         logger.debug("[.] Building link vectors")
70 |         buttons_data = []
71 |         for t in self.controller.scraper.get_clickable():
72 |             elem = self.scraper.element_by_tag(t)
73 |             tag_name = self.scraper.element_tag_name(elem)
74 |             text = ""
75 |             if elem is None:
76 |                 logger.warn("[!] Link element couldn't be found: %s" % t)
77 |             elif tag_name != "input":
78 |                 text = self.scraper.element_text(elem).replace("\n", " ")
79 |             elif tag_name == "input":
80 |                 value = self.scraper.element_attr(elem, "value")
81 |                 text = value.replace("\n", " ")
82 |             buttons_data.append(text)
83 |         return buttons_data
84 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | # Bring up all the pieces necessary to run the workbench
 2 | # Data persists in Docker volumes and in local dir
 3 | 
 4 | # This file passes through all necessary env variables to requisite
 5 | # Docker containers and makes them available when running commands via
 6 | # `docker exec`.
 7 | 
 8 | version: '3.4'
 9 | 
10 | services:
11 |   database:
12 |     image: postgres:10.10
13 |     environment:
14 |       POSTGRES_USER: autoscrape
15 |       POSTGRES_PASSWORD: autoscrape
16 |       POSTGRES_DB: autoscrape
17 |       PGDATA: /var/lib/postgresql/data/10.10
18 |     networks: [ 'dev' ]
19 |     volumes:
20 |       - dbdata:/var/lib/postgresql/data
21 | 
22 |   rabbitmq:
23 |     image: rabbitmq:3.7.8-management
24 |     ports: [ '15672' ] # open management port, for debugging
25 |     networks: [ 'dev' ]
26 |     environment:
27 |       # Use just one CPU
28 |       RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS: '+S 1:1 +stbt ts +A 12'
29 | 
30 |   flask:
31 |     build:
32 |       context: .
33 |       target: base
34 |     volumes:
35 |       - ./:/app:rw
36 |       - virtualenvs:/root/.local/share/virtualenvs/:rw
37 |     command: [ 'python3', 'autoscrape-server.py' ]
38 |     ports:
39 |       - '5000:5000'
40 |     links:
41 |       - rabbitmq
42 |     depends_on: [ 'rabbitmq', 'database' ]
43 |     networks: [ 'dev' ]
44 |     environment:
45 |       AUTOSCRAPE_RABBITMQ_HOST: amqp://guest:guest@rabbitmq/
46 |       AUTOSCRAPE_DB_HOST: database
47 |       AUTOSCRAPE_DB_USER: autoscrape
48 |       AUTOSCRAPE_DB_PASSWORD: autoscrape
49 | 
50 |   celery:
51 |     build:
52 |       context: .
53 |       target: base
54 |     shm_size: 6GB
55 |     volumes:
56 |       - ./:/app:rw
57 |       - virtualenvs:/root/.local/share/virtualenvs/:rw
58 |     command: [ 'celery', '-c', '1', '-A', 'autoscrape.tasks', 'worker', '--loglevel=info' ]
59 |     #user: nobody
60 |     links:
61 |       - rabbitmq
62 |     depends_on: [ 'rabbitmq', 'flask', 'database' ]
63 |     networks: [ 'dev' ]
64 |     environment:
65 |       AUTOSCRAPE_RABBITMQ_HOST: amqp://guest:guest@rabbitmq/
66 |       AUTOSCRAPE_DB_HOST: database
67 |       AUTOSCRAPE_DB_USER: autoscrape
68 |       AUTOSCRAPE_DB_PASSWORD: autoscrape
69 | 
70 | networks:
71 |   dev:
72 |     driver: bridge
73 | 
74 | volumes:
75 |   virtualenvs: {}
76 |   dbdata: {}
77 | 
78 | 


--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: UTF-8 -*-
 3 | """
 4 | Autoscrape Extractor - A wrapper around Hext for
 5 | walking a directory and extracting all structured
 6 | data using a provided Hext template.
 7 | 
 8 | Hext templates can be created using the JavaScript
 9 | UI found in ./hext_builder_ui.
10 | 
11 | Usage:
12 |     extract.py <input-directory> <hext-template> [options]
13 | 
14 | Options:
15 |     --output-file FILENAME
16 |         By default, all output will be printed to stdout.
17 |         This option directs all output to a specified file.
18 | """
19 | from docopt import docopt
20 | import html5lib
21 | import hext
22 | 
23 | 
24 | def parse_html_file(filepath):
25 |     with open(filepath, "r") as f:
26 |         html = f.read()
27 |     return html5lib.parse(
28 |         html, treebuilder='lxml', namespaceHTMLElements=False
29 |     )
30 | 
31 | 
32 | if __name__ == "__main__":
33 |     docopt_args = docopt(__doc__)
34 | 
35 |     option = None
36 |     if "build-template" in docopt_args.keys():
37 |         option = "build-template"
38 |         docopt_args.pop("build-template")
39 |     elif "extract" in docopt_args.keys():
40 |         option = "extract"
41 |         docopt_args.pop("extract")
42 | 
43 |     # strip the -- and convert - to _, remove <>
44 |     args = {}
45 |     for option in docopt_args:
46 |         args[option[2:].replace(
47 |             '<', ''
48 |         ).replace(
49 |             '>', ''
50 |         ).replace(
51 |             '-', '_'
52 |         )] = docopt_args[option]
53 | 
54 |     if option == "extract":
55 |         # TODO: walk directory, feed files to below:
56 |         rule = hext.Rule(strhext)
57 |         document = hext.Html(strhtml)
58 |         result = rule.extract(document)
59 | 


--------------------------------------------------------------------------------
/images/ai.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/images/ai.png


--------------------------------------------------------------------------------
/images/code_embeddings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/images/code_embeddings.png


--------------------------------------------------------------------------------
/images/extraction of code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/images/extraction of code.png


--------------------------------------------------------------------------------
/images/k-NN_small_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/images/k-NN_small_data.png


--------------------------------------------------------------------------------
/images/quickstart-video.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonrobertz/autoscrape-py/360f791c167c54336179120df4f52b30bf323bd0/images/quickstart-video.png


--------------------------------------------------------------------------------
/requirements.api.txt:
--------------------------------------------------------------------------------
1 | Flask>=1.0.2
2 | psycopg2-binary
3 | SQLAlchemy>=1.3.3
4 | sqlalchemy-utils>=0.33.11
5 | flask-sqlalchemy>=2.3.2
6 | celery>=4.4.0
7 | 


--------------------------------------------------------------------------------
/requirements.dev.txt:
--------------------------------------------------------------------------------
1 | nose==1.3.7
2 | torchvision==0.2.1
3 | matplotlib==2.2.2
4 | scikit-learn==0.19.2
5 | scipy==1.1.0
6 | html2text==2018.1.9
7 | warcio>=1.7.3,<2.0.0
8 | pycld2==0.41
9 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | selenium>=3.141.0,<4.0.0
 2 | lxml>=4.3.0
 3 | html5lib>=1.0.1
 4 | webencodings>=0.5.1
 5 | docopt>=0.6.2
 6 | networkx>=2.2
 7 | numpy>=1.15.0
 8 | cssselect>=1.1.0
 9 | requests>=2.22.0
10 | lxml>=4.3.0
11 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from os import path
 3 | import setuptools
 4 | 
 5 | 
 6 | def get_long_description():
 7 |     BASEDIR = path.abspath(path.dirname(__file__))
 8 |     with open(path.join(BASEDIR, 'README.rst'), encoding='utf-8') as f:
 9 |         return f.read()
10 | 
11 | 
12 | setuptools.setup(
13 |     name='autoscrape',
14 |     version='1.6.14',
15 |     description='An automated, programming-free web scraper for interactive sites',
16 |     long_description=get_long_description(),
17 |     author='Brandon Roberts',
18 |     author_email='brandon@bxroberts.org',
19 |     url='https://github.com/brandonrobertz/autoscrape-py',
20 |     license='AGPLv3',
21 |     classifiers=[
22 |         'Development Status :: 3 - Alpha',
23 |         'Environment :: Console',
24 |         'Intended Audience :: Developers',
25 |         'Intended Audience :: End Users/Desktop',
26 |         'Intended Audience :: Science/Research',
27 |         'License :: OSI Approved :: GNU Affero General Public License v3',
28 |         'Natural Language :: English',
29 |         'Operating System :: OS Independent',
30 |         'Programming Language :: Python',
31 |         'Programming Language :: Python :: 3.5',
32 |         'Programming Language :: Python :: 3.6',
33 |         'Programming Language :: Python :: 3.7',
34 |         'Programming Language :: Python :: 3.8',
35 |         'Programming Language :: Python :: Implementation :: CPython',
36 |         'Programming Language :: Python :: Implementation :: PyPy',
37 |         'Topic :: Scientific/Engineering :: Information Analysis',
38 |         'Topic :: Software Development :: Libraries :: Python Modules',
39 |         'Topic :: Utilities'
40 |     ],
41 |     packages=[
42 |         'autoscrape',
43 |         'autoscrape.cli',
44 |         'autoscrape.util',
45 |         'autoscrape.backends',
46 |         'autoscrape.backends.base',
47 |         'autoscrape.backends.selenium',
48 |         'autoscrape.backends.requests',
49 |         'autoscrape.backends.warc',
50 |         'autoscrape.scrapers',
51 |         'autoscrape.search',
52 |         'autoscrape.vectorization',
53 |     ],
54 |     entry_points={
55 |         'console_scripts': [
56 |             'autoscrape = autoscrape.cli.scrape:main',
57 |         ]
58 |     },
59 |     install_requires=[
60 |         'lxml>=4.3.0',
61 |         'html5lib>=1.0.1',
62 |         'docopt>=0.6.2',
63 |         'cssselect>=1.1.0',
64 |         'requests>=2.22.0',
65 |     ],
66 |     extras_require={
67 |         'selenium-backend': [
68 |             'selenium>=3.141.0,<4.0.0',
69 |         ],
70 |         'warc-backend': [
71 |             'warcio>=1.7.3,<2.0.0',
72 |             'plyvel==1.2.0',
73 |         ],
74 |         'embeddings-vectorizer': [
75 |             'numpy>=1.15.0'
76 |         ],
77 | 
78 |         'graph': [
79 |             'networkx>=2.2',
80 |         ],
81 |         'all': [
82 |             'selenium>=3.141.0,<4.0.0',
83 |             'networkx>=2.2',
84 |             'numpy>=1.15.0',
85 |             'warcio>=1.7.3,<2.0.0',
86 |             'plyvel==1.2.0',
87 |         ],
88 |     }
89 | )
90 | 


--------------------------------------------------------------------------------
/tests/common.sh:
--------------------------------------------------------------------------------
 1 | OUTPUT_BASE="autoscrape-data-tests"
 2 | BENCHMARK_LOG="${OUTPUT_BASE}/backend-benchmark.log"
 3 | BACKENDS="selenium requests"
 4 | TIME="$(which time) -p -a -o ${BENCHMARK_LOG}"
 5 | 
 6 | AUTOSCRAPE="./autoscrape.py --save-graph --loglevel DEBUG --save-screenshots"
 7 | 
 8 | mkdir -p ${OUTPUT_BASE}
 9 | 
10 | die () {
11 |   echo "${*}"
12 |   exit 1
13 | }
14 | 
15 | add_benchmark_header () {
16 |   benchmark="${1}"
17 |   crawlname="${2}"
18 |   echo "Running ${benchmark} ${crawlname}" | tee -a ${BENCHMARK_LOG}
19 | }
20 | 
21 | add_separator () {
22 |   echo "======================================================================"
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/tests/crawltest.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | source tests/common.sh
 4 | 
 5 | for backend in ${BACKENDS}; do
 6 |   output="${OUTPUT_BASE}/autoscrape-data-${backend}"
 7 |   rm -rf ${output}
 8 |   add_separator
 9 |   add_benchmark_header ${backend} "crawl"
10 |   ${TIME} ${AUTOSCRAPE} \
11 |     --backend ${backend} \
12 |     --output ${output} \
13 |     https://bxroberts.org \
14 |     || die "Backend ${backend} failed crawling."
15 |   echo "${backend} crawl complete!"
16 | done
17 | 
18 | 


--------------------------------------------------------------------------------
/tests/data/test_page.html:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | <!DOCTYPE html>
  4 | <!--[if lt IE 7 ]> <html class="ie6 no-css3" lang="en"> <![endif]-->
  5 | <!--[if IE 7 ]>    <html class="ie7 no-css3" lang="en"> <![endif]-->
  6 | <!--[if IE 8 ]>    <html class="ie8 no-css3" lang="en"> <![endif]-->
  7 | <!--[if IE 9 ]>    <html class="ie9" lang="en"> <![endif]-->
  8 | <!--[if (gt IE 9)|!(IE)]><!--> <html lang="en"> <!--<![endif]-->
  9 | <head>
 10 |           <meta charset="UTF-8">
 11 |             <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
 12 |               <title>Civil Search</title>
 13 |                 <link rel="icon" href="/i/favicon.ico" type="image/x-icon">
 14 |                   <link rel="shortcut icon" href="/i/favicon.ico" type="image/x-icon">
 15 |                     <link rel="stylesheet" href="/i/css/apex.min.css?v=4.2.5.00.08" type="text/css" />
 16 |                     <!--[if IE]><link rel="stylesheet" href="/i/css/apex_ie.min.css?v=4.2.5.00.08" type="text/css" /><![endif]-->
 17 |                     <link rel="stylesheet" href="/i/libraries/jquery-ui/1.8.22/themes/base/jquery-ui.min.css" type="text/css" />
 18 | 
 19 | 
 20 | 
 21 | 
 22 |         <script type="text/javascript">
 23 | var apex_img_dir = "/i/", htmldb_Img_Dir = apex_img_dir;
 24 |         </script>
 25 |         <script src="/i/libraries/apex/minified/desktop_all.min.js?v=4.2.5.00.08" type="text/javascript"></script>
 26 |         <script src="/i/libraries/apex/minified/legacy.min.js?v=4.2.5.00.08" type="text/javascript"></script>
 27 | 
 28 | 
 29 | 
 30 |         <script type="text/javascript">
 31 | var htmldb_delete_message='Would you like to perform this delete action?';
 32 | 
 33 | 
 34 |         </script>
 35 | 
 36 |         <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
 37 | 
 38 |           <meta name="viewport" content="width=device-width, initial-scale=1.0">
 39 |             <link rel="stylesheet" href="/i/themes/theme_101/css/4_1.css?v=2">
 40 |               <script src="/i/themes/theme_101/js/4_1.js"></script>
 41 |               <!-- Begin Google Analytics -->
 42 |         <script>
 43 |   (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
 44 |             (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
 45 |                     m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
 46 |             })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
 47 | 
 48 |   ga('create', 'UA-49567328-1', 'travis.tx.us');
 49 |   ga('send', 'pageview');
 50 | 
 51 |         </script>
 52 | 
 53 | 
 54 | </head>
 55 | <body >
 56 |           <!--[if lte IE 6]><div id="outdated-browser">You are using an outdated web browser. For a list of supported browsers, please reference the Oracle Application Express Installation Guide.</div><![endif]-->
 57 |             <form action="wwv_flow.accept" method="post" name="wwv_flow" id="wwvFlowForm" novalidate >
 58 |                     <input type="hidden" name="p_flow_id" value="105" id="pFlowId" /><input type="hidden" name="p_flow_step_id" value="10" id="pFlowStepId" /><input type="hidden" name="p_instance" value="4326555053940" id="pInstance" /><input type="hidden" name="p_page_submission_id" value="1450644969779" id="pPageSubmissionId" /><input type="hidden" name="p_request" value="" id="pRequest" />
 59 |                       <div id="uBodyContainer"><header id="uHeader">
 60 |                                         
 61 |                                         <hgroup>
 62 |                                         <!--    <a href="f?p=105:1:4326555053940" id="uLogo"><img src="wwv_flow_file_mgr.get_file?p_security_group_id=2652307492609429&p_fname=spacer.gif"  /></a> -->
 63 |                                               <a href="http://www.traviscountyclerk.org/eclerk/Content.do?code=Home" id="uLogo"><img src="wwv_flow_file_mgr.get_file?p_security_group_id=2652307492609429&p_fname=spacer.gif"  /></a>
 64 |                                                   <div class="uAPEXNavBar">
 65 |                                                                 <ul>
 66 |                                                                                 <li><span></span></li>
 67 |                                                                                         
 68 |                                                                                       </ul>
 69 |                                                                                           </div>
 70 |                                                                                               
 71 |                                                                                             </hgroup>
 72 |                                                                                               <nav>
 73 |                                                                                                           <ul>
 74 |                                                                                                                         <li><a href="javascript:apex.submit('T_HOME');">Home</a></li><li><a href="javascript:apex.submit('T_MISDEANOR SEARCH');">Misdemeanor</a></li><li><a href="javascript:apex.submit('T_CIVIL');" class="active">Civil</a></li><li><a href="javascript:apex.submit('T_PROBATE');">Probate</a></li><li><a href="javascript:apex.submit('T_RECORDING');">Recording</a></li>
 75 |                                                                                                                             </ul>
 76 |                                                                                                                               </nav>
 77 |                                                                                                                                 <div id="uBreadcrumbs" >
 78 |                                                                                                                                           <table summary="" cellspacing="0" cellpadding="0" border="0" width="100%"><tr><td align="right"><a href="javascript:popUp2(&#x27;f?p=105:8:4326555053940:::::&#x27;,%201000,%20650);" class="uButton uHotButton" id="B397118382427113335" style="margin-top:5px; margin-bottom:-5px;" role="button"><span>Civil Search Help</span></a> </td></tr></table><ul><li class="active"><span>Civil Search</span></li> </ul>
 79 |                                                                                                                                             <div class="uBreadcrumbsBG">
 80 |                                                                                                                                                         <div class="uLeft"></div>
 81 |                                                                                                                                                             <div class="uRight"></div>
 82 |                                                                                                                                                               </div>
 83 |                                                                                                                                 </div>
 84 |                                                                                                                                   
 85 |                               </header>
 86 | 
 87 |                               <div id="uOneCol">
 88 |                                         
 89 |                                         <section class="uRegion clearfix" id="R397117782275113323" style="width:750px">
 90 |                                                   <div class="uRegionHeading">
 91 |                                                               <h1>Civil Records Search</h1>
 92 |                                                                   <span class="uButtonContainer">
 93 |                                                                                 
 94 |                                                                               </span>
 95 |                                                                                 </div>
 96 |                                                                                   <div cight"><label for="P10_MIDDLE_INIT" class="uOptional"><a class="uHelpLink" href="javascript:popupFieldHelp('397118760966113349',ers="PARTY">APPLICANT #1</td><td  headers="FULL_NAME">ABEYTA JENNIFER</td><td  headers="CASE_TYPE_DESC">CV OCCUPATIONAL DRIVERS LIC   </td><td  headers="FILING_DATE">07&#x2F;22&#x2F;2010</td><td  headers="CASE_STATUS_DESC">DISPOSED</td><td  headers="ATTORNEY_NAME"> - </td><td  headers="PERSON_BUSINESS">PERSON</td></tr>
 97 |                                                                                           <tr><td  headers="CAUSE_NUMBER"><a href="f&#x3F;p&#x3D;105&#x3A;39&#x3A;0&#x3A;&#x3A;NO&#x3A;&#x3A;P39_CASE_ID,P39_PARTY_ID,P39_PARTY_TYPE,P39_PARTY_NUMBER&#x3A;941346,2237610,&#x23;Pf&#x3F;p&#x3D;105&#x3A;39&#x3A;0&#x3A;&#x3A;NO&#x3A;&#x3A;P39_CASE_ID,P39_PARTY_ID,P39_PARTY_TYPE,P39_PARTY_NUMBER&#x3A;948147,2257928,&#x23;PARTY_TYPE&#x23;,&#x23;PARTY_NUMBER&#x23;" >C-1-CV-10-002045</a></td><td  headers="PARTY">APPLICANT #1</td><td ,P39_PARTY_TYPE,P39_PARTY_NUMBER&#x3A;176582,769559,&#x23;PARTY_TYPE&#x23;,&#x23;PARTY_NUMBER&#x23;" >C-1-CV-06-004213</a></td><td  headers="PARTY">APPLICANT #1</td><td  headers="FULL_NAME">ACOSTA JUAN ANGEL</td><td  headers="CASE_TYPE_DESC">CV OCCUPATIONAL DRIVERS LIC   </td><td  headers="FILING_DATE">06&#x2F;23&#x2F;2006</td><td  headers="CASE_STATUS_DESC">DISPOSED</td><td  headers="ATTORNEY_NAME"> - </td><td  headers="PERSON_BUSINESS">PERSON</td></tr>
 98 |                                                                                           <tr><td  headers="CAUSE_NUMBER"><a href="f&#x3F;p&#x3D;105&#x3A;39&#x3A;0&#x3A;&#x3A;NO&#x3A;&#x3A;P39_CASE_ID,P39_PARTY_ID,P39_PARTY_TYPE,P39_PARTY_NUMBER&#x3A;931982,2202642,&#x23;PARTY_TYPE&#x23;,&#x23;PARTY_NUMBER&#x23;" >C-1-CV-09-011470</a></td><td  headers="PARTY">APPLICANT #1</td><td  heaLICANT #1</td><td  headers="FULL_NAME">ADAMS BLAKE</td><td  headers="CASE_TYPE_DESC">CV OCCUPATIONAL DRIVERS LIC   </td><td  headers="FILING_DATE">12&#x2F;17&#x2F;2010</td><td  headers="CASE_STATUS_DESC">DISPOSED</td><td  headers="ATTORNEY_NAME">BENNETT ALAN                  </td><td  headers="PERSON_BUSINESS">PERSON</td></tr>
 99 |                                                                                           <tr><td  headers="CAUSE_NUMBER"><a href="f&#x3F;p&#x3D;105&#x3A;39&#x3A;0&#x3A;&#x3A;NO&#x3A;&#x3A;P39_CASE_ID,P39_PARTY_ID,P39_PARTY_TYPE,P39_PARTY_NUMBER&#x3A;958716,2288960,&#x23;PARTY_TYPE&#x23;,&#x23;PARTY_NUMBER&#x23;" >C-1-CV-10-004917</a></td><td  headers="PARTY">APPLICANT #1</td><td  headers="FULL_NAME">ADAMS CHRISTOPHER</td><td  headers="CASE_TYPE_DESC">CV OCCUPATIONAL DRIVERS LIC   </td><td  headers="FILING_DATE"CANT #1</td><td  headers="FULL_NAME">ADAMS VEACHEL</td><td  headers="CASE_TYPE_DESC">CV OCCUPATIONAL DRIVERS LIC   </td><td  headers="FILING_DATE">12&#x2F;05&#x2F;2008</td><td  headers="CASE_STATUS_DESC">DISPOSED</td><td  headers="ATTORNEY_NAME"> - </td><td  headers="PERSON_BUSINESS">PERSON</td></tr>
100 |                                                                                           <tr><td  headers="CAUSE_NUMBER"><a href="f&#x3F;p&#x3D;105&#x3A;39&#x3A;0&#x3A;&#x3A;NO&#x3A;&#x3A;P39_CASE_ID,P39_PARTY_ID,P39_PARTY_TYPE,OHN STILLMAN          </td><td  headers="PERSON_BUSINESS">PERSO&#x3A;39&#x3A;0&#x3A;&#x3A;NO&#x3A;&#x3A;P39_CASE_ID,P39_PARTY_ID,P39_PARTY_TYPE,P39_PARTY_NUMBER&#x3A;978834,2349638,&#x23;PARTY_TYPE&#x23;,&#x23;PARTY_NUMBER&#x23;" >C-1-CV-10-009856</a></td><td  headers="PARTY">APPLICANT #1</td><td  headers="FULL_NAME">ALCANTAR ABRAHAM</td><td  headers="CASE_TYPE_DESC">CV OCCUPATIONAL DRIVERS LIC   </td><td  headers="FILING_DATE">10&#x2F;12&#x2F;2010</td><td  headers="CASE_STATUS_DESC">DISPOSED</td><t</a></td><td  headers="PARTY">APPLICANT #1</td><td  headers="FULL_NAME">ALEMAR LILA</td><td  headers="CASE_TYPE_DESC">CV PROTECTIVE ORDER           </td><td  headers="FILING_DATE">09&#x2F;05&#x2F;2008</td><td  headers="CASE_STATUS_DESC">DISPOSED</td><td  headers="ATTORNEY_NAME">DAMEN MARION A.               </td><td  headers="PERSON_BUSINESS">PERSON</td></tr>
101 |                                                                                           <tr><td  headers="CAUSE_NUMBER"><a href="f&#x3F;p&#x3D;105&#x3A;39&#x3A;0&#x3A;CASE_STATUS_DESC">DISPOSED</td><td  headers="ATTORNEY_NAME">RIT></tr>
102 |                                                                                           <tr><td  headers="CAUSE_NUMBER"><a href="f&#x3F;p&#x3D;105&#x3A;39&#x3A;0&#x3A;&#x3A;NO&#x3A;&#x3A;P39_CASE_ID,P39_PARTY_ID,P39_PARTY_TYPE,P39_PARTY_NUMBER&#x3A;952306,2270361,&#x23;apex.widget.datepicker("#P10_FILING_DATE_TO",{"buttonImage":"\u002Fi\u002Fasfdcldr.gif","buttonImageOnly":true,"buttonText":"Popup Calendar: Filing Date To","showTime":false,"defaultDate":new Date(2018,7,12,02,03,10),"showOn":"focus","showOtherMonths":false,"yearRange":"1950:sysdate","changeMonth":true,"changeYear":true},"mm\u002Fdd\u002Fyy","en");})();
103 |                                                                                           (function(){apex.widget.report.init("R397120976658113360",{"styleMouseOver":"#F0F0F0","styleChecked":"#E8E8E8","internalRegionId":"397120976658113360"});})();
104 | 
105 | 
106 | });</script>
107 | 
108 | </body>
109 | </html>
110 | 


--------------------------------------------------------------------------------
/tests/formsubmittest.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | URL="https://utdirect.utexas.edu/apps/degree/degrees/nlogon/"
 3 | 
 4 | source tests/common.sh
 5 | 
 6 | for backend in ${BACKENDS}; do
 7 |   output="${OUTPUT_BASE}/autoscrape-data-formsubmit-${backend}"
 8 |   rm -rf ${output}
 9 |   add_separator
10 |   add_benchmark_header ${backend} "form submitter"
11 |   ${TIME} ${AUTOSCRAPE} \
12 |     --backend ${backend} \
13 |     --maxdepth 1 \
14 |     --formdepth 1 \
15 |     --form-match "List students starting from" \
16 |     --input "i:0:SMITH\, JOHN" \
17 |     --next-match "Next Page" \
18 |     --output ${output} \
19 |     ${URL} \
20 |     || die "Backend ${backend} failed submitting forms"
21 | done
22 | 
23 | 


--------------------------------------------------------------------------------
/tests/run_e2e_tests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | source tests/common.sh
 4 | 
 5 | die () {
 6 |   echo "${*}"
 7 |   exit 1
 8 | }
 9 | 
10 | rm -f ${BENCHMARK_LOG}
11 | ./tests/crawltest.sh && ./tests/formsubmittest.sh \
12 |   || die "Tests failed."
13 | 
14 | echo "Tests complete!"
15 | 
16 | cat ${BENCHMARK_LOG}
17 | 


--------------------------------------------------------------------------------
/tests/tag_test_data_page.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
2 | <html xmlns="http://www.w3.org/1999/xhtml" lang="EN"> <head>  <title>Degrees and Dates of Attendance</title> <style type="text/css"> @import url(https://utdirect.utexas.edu/utdirect/system/css/utd_typography.css); @import url(https://utdirect.utexas.edu/utdirect/system/css/custom_utdirect.css); @import url(https://utdirect.utexas.edu/utdirect/system/css/utd_subnav.css); </style> <link rel="stylesheet" href="https://utdirect.utexas.edu/utdirect/system/css/utd_css_print.css" media="print"> <link rel="stylesheet" type="text/css" href="degv.css" media="all"> </head> <body class="minWidth" onload="utd_load_a();" onunload="utd_unload_a();"> <div id="skip"><a href="#main_content">Skip to Main Content</a></div> <div id="header"> <!-- HEADER --> <div id="mainnavtext"> <!-- MAINNAVTEXT --> <ul> <li> <a href="https://utdirect.utexas.edu/utdirect/help/index.WBX?c=0" onclick="utd_openNPW('https://utdirect.utexas.edu/utdirect/help/index.WBX?c=0','win','height=400,width=500,location=no,menubar=no,resizable=yes,scrollbars=yes,status=no,toolbar=no'); return false;"> UT Direct Help </a> </li> <li class="last_item"> <a href="https://utdirect.utexas.edu/security-443/logoff.cgi?goto=http://www.utexas.edu"> Logoff </a> </li> <li class="last_item"> <a href="https://utdirect.utexas.edu/utdirect/index.WBX?t=MYHOME" title="My Home Tab" accesskey="1"> <img alt="UT Direct Home" title="UT Direct Home" src="https://utdirect.utexas.edu/utdirect/system/images/home.png"> </a> </li> <li class="last_item"> <a id="search_link" href="#"> <img alt="Search UT Direct" title="Search UT Direct" src="https://utdirect.utexas.edu/utdirect/system/images/search.png"> </a> </li> </ul> </div> <!-- MAINNAVTEXT CLOSE --> <div id="search_display"> <!-- SEARCH_DISPLAY --> <form name="utd_search" id="search_form" action="https://utdirect.utexas.edu/utdirect/search/index.WBX" method="get"> <input type="hidden" name="s_component" value="0"> <input type="hidden" name="s_environment" value="PUTD"> <input type="hidden" name="s_security_flag" value="P"> <label for="search_type" class="nodisplaytext">Select the type of search to perform:</label> <select id="search_type" name="s_search_type"> <option value="SS">any word</option> <option value="AW">all words</option> <option value="EX">exact phrase</option> </select> <label for="search_term" class="nodisplaytext">Enter a search term:</label> <input id="search_term" type="text" name="input_srch_string" size="72" maxlength="126" title="You may search for a maximum of 7 words" value=""> <input type="submit" value="search"> </form> </div> <!-- SEARCH_DISPLAY CLOSE --> <div id="header_bottom"> <!-- HEADER_BOTTOM --> </div> <!-- HEADER_BOTTOM CLOSE --> <div id="ut_ribbon" onclick="location.href='http://www.utexas.edu';" style="cursor:pointer;" title="Link to University of Texas homepage"> <!-- UT_RIBBON --> </div> <!-- UT_RIBBON CLOSE --> </div> <!-- HEADER CLOSE --> <div id="utd_container"> <!-- UTD_CONTAINER --> <div id="fullpage"> <!-- CONTENT_FULLPAGE --> <a name="main_content"></a> <div id="utd_toppage_fullpage"> <!-- UTD_TOPPAGE_FULLPAGE --> <div class="pageTitle noTabs"> <!-- PAGETITLE --> <div id="pgTitle"><h1>Degrees and Dates of Attendance</h1></div> <div id="utilities"> <!-- UTILITIES --> <ul> </ul> </div> <!-- UTILITIES CLOSE --> </div> <!-- PAGETITLE CLOSE --> </div><!-- UTD_TOPPAGE CLOSE--> <div id="service_content"><p><strong>Office of the Registrar</strong></p> <div class="spacer">&#160;</div> <div class="rgpage"> <div class="intro"> This service provides public directory information for all current and former students whose records are computerized. Generally, this includes students enrolled from Fall 1977 to the present. For official certification of this information, contact Enrollment Certification at (512) 475-7644. For verbal verification of students enrolled prior to Fall 1977, please contact Diploma Services at (512) 475-7619. <br> <span class="em_statement">If you graduate in May 2018, your degrees will appear in our system after 5pm on Friday, May 25, 2018.</span> </div> <div class="spacer">&#160;</div> <div class="spacer">&#160;</div> <form id="form_degv" name="form_degv" action="degv.WBX" method="post"> <input type="hidden" id="s_last_name_read" name="s_last_name_read" value="AARON, DEBRA LYNN"> <input type="hidden" id="s_last_isn_read" name="s_last_isn_read" value="790244"> <input type="hidden" id="s_first_name_read" name="s_first_name_read" value="AARNOS, SEPPO"> <input type="hidden" id="s_first_isn_read" name="s_first_isn_read" value="829359"> <input type="hidden" id="s_paging_sw" name="s_paging_sw"> <input type="hidden" id="s_first_time_sw" name="s_first_time_sw" value="X"> <fieldset> <legend>Start name entry</legend> <label for="s_start_name">List students starting from <span class="label_bold">lastname, firstname</span></label> <input type="text" id="s_start_name" name="s_start_name" maxlength="25" size="25"> <input type="submit" id="start_page" name="start_page" value="Submit"> </fieldset> <div class="spacer">&#160;</div> <label class="button" for="prev_page_1">Previous Page</label> <input type="button" id="prev_page_1" name="prev_page" value="Previous Page" onclick="prevPage();"> <label class="button" for="next_page_1">Next Page</label> <input type="button" id="next_page_1" name="next_page" value="Next Page" onclick="nextPage();"> <div class="spacer">&#160;</div> <div class="spacer">&#160;</div> <h2>Search results beginning with AARNOS, SEPPO:</h2> <div class="spacer">&#160;</div> <table class="student_listing" border="0" cellspacing="0" cellpadding="3" summary="UT-Austin students and degrees awarded, presented in alphabetical order."> <tr> <th id="student_name_th">Student name</th> <th id="first_sem_th">First semester enrolled</th> <th id="last_sem_th">Last semester enrolled</th> <th id="degree_name_th">Degree name</th> <th id="major_name_th">Major name</th> <th id="degree_date_th">Date degree received</th> <th id="honors_th">Honors received</th> <th id="sp_honors_th">Special honors received</th> <th id="degree_note_th">Degree notes</th> </tr> <tr class="student_main_info"> <td class="student_name" headers="student_name_th" nowrap>AARNOS, SEPPO</td> <td class="first_sem" headers="first_sem_th" nowrap> <span class="sem_label">first semester</span> Summer 1987 </td> <td class="last_sem" headers="last_sem_th" nowrap> <span class="sem_label">last semester</span> Summer 1987 </td> </tr> <tr class="student_main_info"> <td class="student_name" headers="student_name_th" nowrap>AARON, ANGELA GAE</td> <td class="first_sem" headers="first_sem_th" nowrap> <span class="sem_label">first semester</span> Summer 1984 </td> <td class="last_sem" headers="last_sem_th" nowrap> <span class="sem_label">last semester</span> Fall 1986 </td> </tr> <tr class="student_main_info"> <td class="student_name" headers="student_name_th" nowrap>AARON, ANN-MARIE CERDA</td> <td class="first_sem" headers="first_sem_th" nowrap> <span class="sem_label">first semester</span> Fall 1979 </td> <td class="last_sem" headers="last_sem_th" nowrap> <span class="sem_label">last semester</span> Spring 1982 </td> </tr> <tr> <td class="degree_name" headers="degree_name_th" style="padding-left: 3em">DOCTOR OF JURISPRUDENCE</td> <td class="major_name" headers="major_name_th"><span class="major_name_label">major</span> LAW - JURISPRUDENCE</td> <td class="degree_date" headers="degree_date_th">May 22, 1982</td> </tr> <tr class="student_main_info"> <td class="student_name" headers="student_name_th" nowrap>AARON, ARIEL PATRICE</td> <td class="first_sem" headers="first_sem_th" nowrap> <span class="sem_label">first semester</span> Fall 2005 </td> <td class="last_sem" headers="last_sem_th" nowrap> <span class="sem_label">last semester</span> Fall 2012 </td> </tr> <tr class="student_main_info"> <td class="student_name" headers="student_name_th" nowrap>AARON, BENNETT HARRISON</td> <td class="first_sem" headers="first_sem_th" nowrap> <span class="sem_label">first semester</span> Summer 2008 </td> <td class="last_sem" headers="last_sem_th" nowrap> <span class="sem_label">last semester</span> Spring 2012 </td> </tr> <tr> <td class="degree_name" headers="degree_name_th" style="padding-left: 3em">BACHELOR OF ARTS</td> <td class="major_name" headers="major_name_th"><span class="major_name_label">major</span> AMERICAN STUDIES</td> <td class="degree_date" headers="degree_date_th">May 19, 2012</td> </tr> <tr class="student_main_info"> <td class="student_name" headers="student_name_th" nowrap>AARON, CHARLES WAYNE</td> <td class="first_sem" headers="first_sem_th" nowrap> <span class="sem_label">first semester</span> Fall 1997 </td> <td class="last_sem" headers="last_sem_th" nowrap> <span class="sem_label">last semester</span> Spring 2000 </td> </tr> <tr> <td class="degree_name" headers="degree_name_th" style="padding-left: 3em">BACHELOR OF SCIENCE IN MECHANICAL ENGINEERING</td> <td class="major_name" headers="major_name_th"><span class="major_name_label">major</span> MECHANICAL ENGINEERING</td> <td class="degree_date" headers="degree_date_th">May 20, 2000</td> </tr> <tr class="student_main_info"> <td class="student_name" headers="student_name_th" nowrap>AARON, CHELSEA LYNN</td> <td class="first_sem" headers="first_sem_th" nowrap> <span class="sem_label">first semester</span> Fall 2011 </td> <td class="last_sem" headers="last_sem_th" nowrap> <span class="sem_label">last semester</span> Summer 2015 </td> </tr> <tr> <td class="degree_name" headers="degree_name_th" style="padding-left: 3em">BACHELOR OF SCIENCE AND ARTS</td> <td class="major_name" headers="major_name_th"><span class="major_name_label">major</span> BIOLOGY</td> <td class="degree_date" headers="degree_date_th">Aug 17, 2015</td> </tr> <tr class="student_main_info"> <td class="student_name" headers="student_name_th" nowrap>AARON, DAVID L.</td> <td class="first_sem" headers="first_sem_th" nowrap> <span class="sem_label">first semester</span> Fall 1997 </td> <td class="last_sem" headers="last_sem_th" nowrap> <span class="sem_label">last semester</span> Summer 2002 </td> </tr> <tr> <td class="degree_name" headers="degree_name_th" style="padding-left: 3em">BACHELOR OF ARTS</td> <td class="major_name" headers="major_name_th"><span class="major_name_label">major</span> GOVERNMENT</td> <td class="degree_date" headers="degree_date_th">Aug 19, 2002</td> </tr> <tr class="student_main_info"> <td class="student_name" headers="student_name_th" nowrap>AARON, DEBRA BETH</td> <td class="first_sem" headers="first_sem_th" nowrap> <span class="sem_label">first semester</span> Fall 1981 </td> <td class="last_sem" headers="last_sem_th" nowrap> <span class="sem_label">last semester</span> Spring 1983 </td> </tr> <tr class="student_main_info"> <td class="student_name" headers="student_name_th" nowrap>AARON, DEBRA LYNN</td> <td class="first_sem" headers="first_sem_th" nowrap> <span class="sem_label">first semester</span> Fall 1982 </td> <td class="last_sem" headers="last_sem_th" nowrap> <span class="sem_label">last semester</span> Spring 1985 </td> </tr> <tr> <td class="filler" colspan="3">&#160;</td> </tr> </table> <div class="spacer">&#160;</div> <div class="spacer">&#160;</div> <label class="button" for="prev_page_2">Previous Page</label> <input type="button" id="prev_page_2" name="prev_page" value="Previous Page" onclick="prevPage();"> <label class="button" for="next_page_2">Next Page</label> <input type="button" id="next_page_2" name="next_page" value="Next Page" onclick="nextPage();"> </form> </div> <p id="sis_ferpa_banner"> The Family Educational Rights and Privacy Act of 1974 restricts access to student records. These legal restrictions apply to all users. You are responsible for maintaining the confidentiality of records made available to you. For more information, please visit the <a href="http://registrar.utexas.edu/staff/ferpa/"> UT FERPA website. </a> </p> </div> <!-- SERVICE_CONTENT --> </div> <!-- CONTENT CLOSE or CONTENT_FULLPAGE CLOSE--> </div> <!-- UTD_CONTAINER CLOSE --> <div id="utd_close" class="#min_width_style"> <!--div "footer_top" contains the departmental footer div and the site map divs --> <div id="footer_top"> <div id="dept_footer"> <ul> <li class="first_item">Comments to: <a href="mailto:certification@austin.utexas.edu" onmouseover="status='Send us your comments.'; return true;" onmouseout="status=' '; return true;">Certifications</a></li> <li class=""><a href="http://www.utexas.edu/student/registrar/" onmouseover="status='Office of the Registrar'; return true;" onmouseout="status=' '; return true;">Office of the Registrar</a></li> <li class="last_item"> </ul> <div> </div><!-- close for legal notice --> </div> <!-- close DEPT_FOOTER div --> <div id="site_map"> <h3>Sitemap <span class="pad10"></span></h3> <div id="site_map1"> <ul> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=ACADEMICS&amp;s_env=PUTD&amp;s_sf=S">Academics</a></li> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=ADMINISTRATIVEAPPLICATIONS&amp;s_env=PUTD&amp;s_sf=S">Administrative Applications</a></li> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=ADMISSIONS&amp;s_env=PUTD&amp;s_sf=S">Admissions</a></li> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=AROUNDCAMPUS&amp;s_env=PUTD&amp;s_sf=S">Around Campus</a></li> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=COLLEGES&amp;s_env=PUTD&amp;s_sf=S">Colleges, Schools and Academic Units</a></li> </ul> </div> <div id="site_map2"> <ul> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=COMPUTINGANDTECHNOLOGY&amp;s_env=PUTD&amp;s_sf=S">Computing and Technology</a></li> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=DIRECTORIESANDOFFICES&amp;s_env=PUTD&amp;s_sf=S">Directories and Offices</a></li> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=EMPLOYMENTANDCAREERS&amp;s_env=PUTD&amp;s_sf=S">Employment and Careers</a></li> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=FINANCES&amp;s_env=PUTD&amp;s_sf=S">Finances</a></li> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=GRADUATION&amp;s_env=PUTD&amp;s_sf=S">Graduation</a></li> </ul> </div> <div id="site_map3"> <ul> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=INSTRUCTION&amp;s_env=PUTD&amp;s_sf=S">Instruction</a></li> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=NEWS&amp;s_env=PUTD&amp;s_sf=S">News, Events and Calendars</a></li> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=PERSONALINFO&amp;s_env=PUTD&amp;s_sf=S">Personal Info</a></li> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=RESEARCH&amp;s_env=PUTD&amp;s_sf=S">Research</a></li> <li><a href="https://utdirect.utexas.edu/utdirect/sitemap/index.WBX?s_cin=WORKINGATUT&amp;s_env=PUTD&amp;s_sf=S">Working at UT</a></li> </ul> </div> <div id="ut_wordmark_footer" onclick="location.href='http://www.utexas.edu';" style="cursor:pointer;" title="Link to The University of Texas at Austin home page"><img src="https://utdirect.utexas.edu/utdirect/system/images/ut_logo_footer.png" width="130" height="69" alt="the university of texas at austin"><br> The University of Texas at Austin<br> 1 University Station<br> Austin, Texas 78712 </div> </div><!-- close site_map div --> </div> <!-- close for footer_top --> <div id="footer_bottom" class="_bottom"> <!--div "footer_tower" is absolutely positioned on top of the "footer_top" and "footer_bottom" and contains the graphic with the wordmark and tower in the background--> <div id="footer_tower"></div> <div id="footer_ribbon" class=""> <div id="ut_footer"> <ul> <li><a href="mailto:utd-stewards@utlists.utexas.edu" target="_top" onmouseover="status='Send a comment to the UT Direct team.'; return true;" onmouseout="status=' '; return true;">UT Direct Comments</a></li> <li><a href="http://www.utexas.edu/web-privacy-policy" target="_blank" onmouseover="status='Read the UT Austin Web Privacy Policy.'; return true;" onmouseout="status=' '; return true;">Web Privacy Policy</a></li> <li class="last_item"><a href="http://www.utexas.edu/web-accessibility-policy" target="_blank" accesskey="0">Web Accessibility</a></li> </ul> <div id="ut_copyright">&#169;<a href="http://www.utexas.edu" target="_blank" onmouseover="status='Go to the UT Home Page.'; return true;" onmouseout="status=' '; return true;">The University of Texas at Austin</a> 2011</div> </div> <!-- close ut_footer --> </div> <!-- close footer_ribbon --> </div> <!-- close footer_bottom --> </div> <!-- utd_close --> </body> </html>
3 | 


--------------------------------------------------------------------------------
/tests/tags.py:
--------------------------------------------------------------------------------
 1 | def read_file(filename):
 2 |     with open(filename) as f:
 3 |         return f.read()
 4 | 
 5 | 
 6 | def test_good_html_parse():
 7 |     html = read_file("tests/tag_test_data_page.html")
 8 |     assert(html)
 9 |     assert(len(html) > 0)
10 | 


--------------------------------------------------------------------------------
/tests/test_extractor.py:
--------------------------------------------------------------------------------
 1 | CASES = [{
 2 |     "extractor": """
 3 | <article class="lawyer">
 4 |   <a>
 5 |     <img style:replace(/background: url\((.*)\);/, "\1"):prepend("https://texasbar.com"):profileImg />
 6 |   </a>
 7 |   <div>
 8 |     <h3>
 9 |       <a href:prepend("https://texasbar.com"):profileLink >
10 |         <span:nth-child(1) @text:namePrefix />
11 |         <span:nth-child(2) @text:nameFirst />
12 |         <span:nth-child(3) @text:nameLast />
13 |         <span:nth-child(4) @text:nameSuffix/>
14 |       </a>
15 |     </h3>
16 |     <h5 @text:affiliation />
17 |     <p:attribute-count(0) @text:practiceLocation />
18 |     <p class="address" @text:address />
19 |     <p class="areas" @text:practiceArea />
20 |   </div>
21 | </article>""",
22 |     "html": """
23 | <article class="lawyer">
24 |   <div class="no-img">
25 |     <a href="/AM/Template.cfm?Section=Find_A_Lawyer&amp;template=/Customsource/MemberDirectory/MemberDirectoryDetail.cfm&amp;ContactID=282365">
26 |       <!--
27 |         <img src="/AM/templates/layoutcatalog/1601/Images/ProfilePhotoPlaceholder.gif" title="State Bar Seal" alt="State Bar Seal" class="avatar"/>
28 |       -->
29 |       <img style="background: url(/AM/templates/layoutcatalog/1601/Images/ProfilePhotoPlaceholder.gif);" class="avatar">
30 |     </a>
31 |   </div>
32 |   <div class="avatar-column">
33 |     <!-- Determine which status class should be displayed -->
34 |     <h3>
35 |       <span class="status-icon blue triangle"></span>
36 |       <a href="/AM/Template.cfm?Section=Find_A_Lawyer&amp;template=/Customsource/MemberDirectory/MemberDirectoryDetail.cfm&amp;ContactID=282365">
37 |         <span class="honorific-prefix"></span>
38 |         <span class="given-name">John H.</span>
39 |         <span class="family-name">Smither</span>
40 |         <span class="honorific-suffix"></span>
41 |       </a>
42 |     </h3>
43 |     <h5>Vinson &amp; Elkins LLP</h5>
44 |     <p>
45 |     <strong>Primary Practice Location:</strong>
46 |     Houston,&nbsp;TX
47 |     </p>
48 |     <p class="address">
49 |     1001 Fannin St Ste 2801<br> Houston, TX&nbsp;77002
50 |     </p>
51 |     <p class="areas">
52 |     <strong>Practice Areas:</strong>
53 |     None Specified by Attorney
54 |     </p>
55 |     <a href="/AM/Template.cfm?Section=Find_A_Lawyer&amp;template=/Customsource/MemberDirectory/MemberDirectoryDetail.cfm&amp;ContactID=282365" class="read-more">Full profile</a>
56 |     <!--
57 |       <h4 class="wai">Memberships</h4>
58 |       <ul class="memberships">
59 |       </ul>
60 |     -->
61 |   </div>
62 |   <div class="contact">
63 |     <a href="http://h2vx.com/vcf/https://texasbar.com/AM/Template.cfm?Section=Find_A_Lawyer&amp;template=/Customsource/MemberDirectory/vcard.cfm&amp;ContactID=282365">Download vCard <i class="fa fa-download"></i></a>
64 |     <a href="tel:713-758-2466">Tel: 713-758-2466 <i class="fa fa-phone"></i></a>
65 |   </div>
66 |   <div class="badges">
67 |   </div>
68 | </article>
69 |     """,
70 | }, {
71 |     "extractor": """""",
72 |     "html": """""",
73 | }]
74 | 


--------------------------------------------------------------------------------
/tests/test_input_parser.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from autoscrape.input_parser import InputParser
 4 | 
 5 | 
 6 | class TestInputParser(unittest.TestCase):
 7 |     def test_normal_string_parsing(self):
 8 |         inp_str = "i:0:string%,c:1:True,s:2:France"
 9 |         inputs_gen = InputParser(inp_str).generate()
10 |         inputs = list(inputs_gen)
11 |         self.assertEqual(len(inputs), 1)
12 |         first_search = inputs[0]
13 |         self.assertEqual(len(first_search), 3)
14 | 
15 |     def test_normal_string_multi_search_parsing(self):
16 |         inp_str = "i:0:string1%,s:2:France;i:0:string2%,s:2:Russia"
17 |         inputs_gen = InputParser(inp_str).generate()
18 |         inputs = list(inputs_gen)
19 |         self.assertEqual(len(inputs), 2)
20 |         search = inputs[0]
21 |         self.assertEqual(len(search), 2)
22 |         self.assertEqual(search[0]["string"], "string1%")
23 |         self.assertEqual(search[0]["type"], "input")
24 |         self.assertEqual(search[0]["index"], 0)
25 |         self.assertEqual(search[1]["string"], "France")
26 |         self.assertEqual(search[1]["type"], "select")
27 |         self.assertEqual(search[1]["index"], 2)
28 |         search = inputs[1]
29 |         self.assertEqual(len(search), 2)
30 |         self.assertEqual(search[0]["string"], "string2%")
31 |         self.assertEqual(search[0]["type"], "input")
32 |         self.assertEqual(search[0]["index"], 0)
33 |         self.assertEqual(search[1]["string"], "Russia")
34 |         self.assertEqual(search[1]["type"], "select")
35 |         self.assertEqual(search[1]["index"], 2)
36 | 
37 | 
38 | if __name__ == "__main__":
39 |     unittest.main()
40 | 


--------------------------------------------------------------------------------
/tests/test_tag_generation.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import urllib
 3 | 
 4 | import autoscrape
 5 | 
 6 | 
 7 | class TestTagGeneration(unittest.TestCase):
 8 |     @classmethod
 9 |     def setUpClass(self):
10 |         self.filename = "test_page_large.cleaned.html"
11 |         self.url = "http://localhost:8000/%s" % (self.filename)
12 | 
13 |     def test_test_server_running(self):
14 |         """
15 |         Make sure our test HTML server is online. We need this to be
16 |         loaded to continue the remaining tests.
17 |         """
18 |         html = None
19 |         try:
20 |             html = urllib.request.urlopen(self.url).read()
21 |         except urllib.error.URLError:
22 |             pass
23 |         msg = "Test server not running! HINT: python -m http.server --directory ./tests/data/"
24 |         self.assertIsNotNone(
25 |             html, msg=msg
26 |         )
27 | 
28 |     def _get_page_source(self):
29 |         with open("tests/data/%s" % (self.filename), "r") as f:
30 |             return f.read()
31 | 
32 |     def test_requests_backend_can_load_page(self):
33 |         """
34 |         Load the test page and make sure it matches our test page.
35 |         """
36 |         self.requests_browser = autoscrape.backends.requests.browser.RequestsBrowser()
37 |         self.requests_browser.fetch(self.url)
38 |         loaded_html = self.requests_browser.page_html
39 |         self.assertIsNotNone(loaded_html)
40 |         raw_html = self._get_page_source()
41 |         self.assertEqual(raw_html, loaded_html)
42 | 
43 |     def test_selenium_backend_can_load_page(self):
44 |         """
45 |         Test that we can load the test page and that it doesn't get
46 |         mutated by the browser (it's clean so this shouldn't happen).
47 |         We do this so that later we can measure differences in the CSS
48 |         path algorithms, not differences between represented pages.
49 |         """
50 |         self.selenium_browser = autoscrape.backends.selenium.browser.SeleniumBrowser()
51 |         self.selenium_browser.fetch(self.url)
52 |         loaded_html = self.selenium_browser.page_html
53 |         self.assertIsNotNone(loaded_html)
54 |         raw_html = self._get_page_source()
55 |         self.assertEqual(raw_html, loaded_html)
56 | 
57 |     def test_backend_tags_match(self):
58 |         self.requests_browser = autoscrape.backends.requests.browser.RequestsBrowser()
59 |         self.requests_browser.fetch(self.url)
60 |         self.selenium_browser = autoscrape.backends.selenium.browser.SeleniumBrowser()
61 |         self.selenium_browser.fetch(self.url)
62 |         sel_clickable = self.selenium_browser.get_clickable()
63 |         req_clickable = self.requests_browser.get_clickable()
64 |         self.assertTrue(len(sel_clickable) > 0)
65 |         self.assertTrue(len(req_clickable) > 0)
66 |         # self.assertEqual(len(sel_clickable), len(req_clickable))
67 |         # self.assertEqual(sel_clickable, req_clickable)
68 | 
69 | 
70 | if __name__ == "__main__":
71 |     unittest.main()
72 | 


--------------------------------------------------------------------------------
/tests/warctest.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | time ./autoscrape.py \
 4 |   --maxdepth 10 \
 5 |   --loglevel DEBUG \
 6 |   --backend warc \
 7 |   --warc-directory training_data/common-crawl/filtered/ \
 8 |   --warc-index-file /tmp/level.db \
 9 |   --output '' \
10 |   'http://calendar.uconn.edu/2017/day/065/433/'
11 | 
12 | rm -rf /tmp/level.db
13 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py35,py36,py37
 3 | 
 4 | [testenv]
 5 | commands = nosetests
 6 | 
 7 | [testenv:py35]
 8 | deps = -rrequirements.txt
 9 | 
10 | [testenv:py36]
11 | deps = {[testenv:py35]deps}
12 | 
13 | [testenv:py37]
14 | deps = {[testenv:py35]deps}
15 | 
16 | [flake8]
17 | ignore=E128,E501,F403
18 | # E128 continuation line under-indented for visual indent
19 | # E501 line too long (X > 79 characters)
20 | # F403 'from xyz import *' used; unable to detect undefined names
21 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: UTF-8 -*-
 3 | import argparse
 4 | import pickle
 5 | 
 6 | import numpy as np
 7 | from sklearn.model_selection import cross_val_score
 8 | from sklearn.neighbors import KNeighborsClassifier
 9 | from sklearn.svm import SVC
10 | 
11 | 
12 | def parse_args():
13 |     desc = "Convenience script for training autoscrape models."
14 |     parser = argparse.ArgumentParser(
15 |         description=desc
16 |     )
17 |     parser.add_argument(
18 |         "--data", type=str, required=True,
19 |         help="Input data pickle."
20 |     )
21 |     parser.add_argument(
22 |         "--output", type=str, required=True,
23 |         help="Fileame to output trained model to."
24 |     )
25 |     parser.add_argument(
26 |         "--model", type=str, default="kNN",
27 |         choices=["kNN", "SVC"],
28 |         help="Which classifier to use (default: kNN)."
29 |     )
30 |     return parser.parse_args()
31 | 
32 | 
33 | def load_data(filepath):
34 |     print("Loading data...")
35 |     with open(filepath, "rb") as f:
36 |         return pickle.load(f)
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     args = parse_args()
41 | 
42 |     MODEL = "k-NN"
43 |     X = []
44 |     y = []
45 | 
46 |     data = load_data(args.data)
47 | 
48 |     print("X shape", data.X.shape)
49 |     for i in range(data.X.shape[0]):
50 |         x = data.X[i]
51 |         if np.any(np.isnan(x)):
52 |             continue
53 |         X.append(x)
54 |         y.append(data.y[i][0])
55 | 
56 |     X = np.array(X)
57 |     y = np.array(y)
58 | 
59 |     if args.model == "kNN":
60 |         model = KNeighborsClassifier(1)
61 |     elif args.model == "SVC":
62 |         print("Fitting SVC model..")
63 |         model = SVC()
64 |     else:
65 |         raise NotImplementedError("Bad model selected: %s" % args.model)
66 | 
67 |     # model.fit(X, y)
68 | 
69 |     scores = cross_val_score(model, X, y, cv=3)
70 |     print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
71 | 
72 |     # print("Predicting on training data...")
73 |     # y_pred = model.predict(X)
74 | 
75 |     # print("Complete!")
76 |     # print(classification_report(y, y_pred))
77 | 
78 |     print("Saving model...")
79 |     with open(args.output, "wb") as f:
80 |         pickle.dump(model, f)
81 | 
82 |     print("Done!")
83 | 


--------------------------------------------------------------------------------
/vectorize_data.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: UTF-8 -*-
  3 | import argparse
  4 | import numpy as np
  5 | import os
  6 | import pickle
  7 | import html2text
  8 | 
  9 | from autoscrape.vectorization import Vectorizer
 10 | 
 11 | 
 12 | class Data:
 13 |     def __init__(self, X=None, y=None):
 14 |         self.X = X
 15 |         self.y = y
 16 | 
 17 |     def dump(self, filepath):
 18 |         with open(filepath, "wb") as f:
 19 |             pickle.dump(self, f)
 20 | 
 21 |     def load(self, filepath):
 22 |         with open(filepath, "rb") as f:
 23 |             tmp_d = pickle.load(f)
 24 |             self.X = tmp_d.X
 25 |             self.y = tmp_d.y
 26 | 
 27 | 
 28 | def parse_args():
 29 |     desc = "Convenience script for vectorizing webpage training data."
 30 | 
 31 |     parser = argparse.ArgumentParser(
 32 |         description=desc,
 33 |     )
 34 | 
 35 |     parser.add_argument(
 36 |         "--html_embeddings", type=str, required=True,
 37 |         help="Location of HTML character embeddings file."
 38 |     )
 39 |     parser.add_argument(
 40 |         "--word_embeddings", type=str, required=True,
 41 |         help="Location of word embeddings file."
 42 |     )
 43 |     parser.add_argument(
 44 |         "--output_file", type=str, default="data.pickle",
 45 |         help="Output file for data matrices."
 46 |     )
 47 |     parser.add_argument(
 48 |         '--loglevel', type=str, default="INFO",
 49 |         choices=["DEBUG", "INFO", "WARN", "ERROR"],
 50 |         help="Loglevel (default: INFO)"
 51 |     )
 52 |     # parser.add_argument(
 53 |     #     '--driver', type=str, default="Firefox",
 54 |     #     choices=["Firefox", "Chrome", "remote"],
 55 |     #     help="Which browser driver to use",
 56 |     # )
 57 |     parser.add_argument(
 58 |         "dir", type=str,
 59 |         help=("""
 60 | Location of directory containing training HTML data. This directory needs to have the following subdirectories, which correspond to classes: data_pages, error_pages, links_to_documents, links_to_search, search_pages
 61 | """
 62 |               )
 63 |     )
 64 | 
 65 |     args = parser.parse_args()
 66 |     return args
 67 | 
 68 | 
 69 | def load_file(filename):
 70 |     with open(filename, "r") as f:
 71 |         return f.read()
 72 | 
 73 | 
 74 | if __name__ == "__main__":
 75 |     args = parse_args()
 76 | 
 77 |     cls_data = {}
 78 |     total_records = 0
 79 |     for root, dirs, files in os.walk(args.dir):
 80 |         cls = root.split("/")[-1]
 81 |         if not files or not cls:
 82 |             continue
 83 | 
 84 |         cls_data[cls] = []
 85 |         for file in files:
 86 |             filepath = os.path.join(root, file)
 87 |             cls_data[cls].append(filepath)
 88 | 
 89 |         records = len(cls_data[cls])
 90 |         print("Class=%s Records=%s" % (cls, records))
 91 |         total_records += records
 92 | 
 93 |     print("Total records: %s" % total_records)
 94 | 
 95 |     print("Loading vectorizer")
 96 |     vectorizer = Vectorizer(
 97 |         html_embeddings_file=args.html_embeddings,
 98 |         word_embeddings_file=args.word_embeddings,
 99 |         loglevel=args.loglevel,
100 |     )
101 | 
102 |     dim = vectorizer.html.dim + vectorizer.word.dim
103 |     print("Vector dimension: %s" % dim)
104 | 
105 |     X = np.zeros(shape=(total_records, dim))
106 |     y = np.zeros(shape=(total_records, 1))
107 | 
108 |     html2text.config.BODY_WIDTH = 0
109 | 
110 |     base_dir = os.path.abspath(os.curdir)
111 | 
112 |     keys = list(cls_data.keys())
113 |     I = 0
114 |     for ix in range(len(keys)):
115 |         cls = keys[ix]
116 |         for file in cls_data[cls]:
117 |             print("I=%s" % I, end="\r")
118 |             abs_path = os.path.join(base_dir, file)
119 |             print("File=%s, Absolute Path=%s" % (file, abs_path))
120 |             html = load_file(abs_path)
121 |             parser = html2text.HTML2Text()
122 |             parser.feed(html)
123 |             text = parser.close()
124 |             x = vectorizer.vectorize(html, text)
125 |             X[I, :] = x
126 |             y[I, :] = [keys.index(cls)]
127 |             print("x=%s" % x)
128 |             I += 1
129 | 
130 |     print("X: %s" % X)
131 |     print("y: %s" % y)
132 | 
133 |     data = Data(X=X, y=y)
134 |     data.dump(args.output_file)
135 | 


--------------------------------------------------------------------------------