├── .github └── workflows │ └── pythonpackage.yml ├── .gitignore ├── CHANGES.txt ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── requirements.txt ├── scrape ├── __init__.py ├── crawler.py ├── orderedset.py ├── scrape.py └── utils.py ├── setup.py └── testing ├── admissions.html ├── courses.html ├── dropouts.html ├── extra.html ├── faculty.html ├── home.html ├── students.html ├── test.pdf ├── test.txt ├── test1.html └── test_scrape.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: [3.6, 3.7, 3.8, 3.9] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | - name: Lint with flake8 25 | run: | 26 | pip install flake8 27 | # stop the build if there are Python syntax errors or undefined names 28 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 30 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Test with nose 32 | run: | 33 | pip install nose 34 | nosetests 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask instance folder 57 | instance/ 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | 65 | # IPython Notebook 66 | .ipynb_checkpoints 67 | 68 | # pyenv 69 | .python-version 70 | 71 | # dotenv 72 | .env 73 | 74 | # Vim 75 | # swap 76 | [._]*.s[a-w][a-z] 77 | [._]s[a-w][a-z] 78 | # session 79 | Session.vim 80 | # temporary 81 | .netrwhist 82 | *~ 83 | # auto-generated tag files 84 | tags 85 | 86 | # Backup files 87 | *.bak 88 | 89 | # Local files 90 | *.local* 91 | 92 | # BitTorrent sync 93 | *.bts 94 | 95 | # PyCharm 96 | *.idea/ 97 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | 0.11.3 2 | ------ 3 | 4 | - Adjust to relocation of module for container abstract base classes 5 | 6 | 0.11.2 7 | ------ 8 | 9 | - Bump lxml from 4.6.3 to 4.6.5 to fix security vulnerability 10 | 11 | 0.11.1 12 | ------ 13 | 14 | - Bump lxml from 4.6.2 to 4.6.3 to fix security vulnerability 15 | - Retire support for Python 3.5 due to lxml 4.6.3 incompatibility 16 | 17 | 0.11.0 18 | ------ 19 | 20 | - Retire support for Python 2 21 | - Retire Travis CI build and enable Python 3.8 and 3.9 in GitHub Actions 22 | - Enable local file access for wkhtmltopdf to fix failure in embedding images in PDFs 23 | 24 | 0.10.2 25 | ------ 26 | 27 | - Bump lxml from 4.3.0 to 4.6.2 for security patch 28 | - Remove support for Python 3.4 as not supported in latest lxml version 29 | 30 | 0.10.1 31 | ------ 32 | 33 | - Bugfix: TypeError when attempting to hash unencoded Unicode-objects 34 | 35 | 0.10.0 36 | ------ 37 | 38 | - Test python 3.7 and 3.8 in Travis CI/GitHub Actions 39 | - Replace cgi.escape with html.escape in Python 3 due to removal of cgi.escape in 3.8 40 | - Reformat using Black 41 | 42 | 0.9.15 43 | ------ 44 | 45 | - travis CI does not support 3.7 yet, removing that version from build 46 | 47 | 0.9.14 48 | ------ 49 | 50 | - added versions 3.6 and 3.7 to travis CI build, removed 2.6 and 3.3 51 | - 2.6 and 3.3 deprecated by lxml 52 | 53 | 0.9.13 54 | ------ 55 | 56 | - 3.7 added as supported version in setup 57 | - Updated LICENSE and requirements.txt 58 | 59 | 0.9.12 60 | ------ 61 | 62 | - 3.6 added as supported version in setup 63 | - Updated LICENSE 64 | 65 | 0.9.11 66 | ------ 67 | 68 | - Bugfix: MissingSchema during requests get 69 | - Bugfix: Check for Python 2 should have been for Python 3 70 | 71 | 0.9.10 72 | ------ 73 | 74 | - More refactoring 75 | 76 | 0.9.9 77 | ------ 78 | 79 | - Converted markdown README to rst 80 | 81 | 0.9.8 82 | ------ 83 | 84 | - Changed Utility classifier to Utilities 85 | 86 | 0.9.7 87 | ------ 88 | 89 | - Replaced compat.py with six module 90 | - Made imports relative rather than from PATH 91 | - More refactoring 92 | 93 | 0.9.6 94 | ------ 95 | 96 | - Bugfix: Remove non-links through filtering by protocol 97 | - Refactorings 98 | 99 | 0.9.5 100 | ------ 101 | 102 | - Bugfix: Properly join internal and base URLs for crawling 103 | 104 | 0.9.4 105 | ------ 106 | 107 | - Retired support for 3.2 as tldextract doesn't 
support it 108 | 109 | 0.9.3 110 | ------ 111 | 112 | - Moved crawling functions into a Crawler class 113 | - General refactorings to docstrings, function names, etc. 114 | - Consolidated max_pages and max_links arguments as max_crawls 115 | - Added tldextract module for getting URL domain, suffixes 116 | 117 | 0.9.2 118 | ------ 119 | 120 | - Added compat.py file 121 | - Moved compatible builtin definitions to __init__ 122 | - Added requests cache 123 | 124 | 0.9.1 125 | ------ 126 | 127 | - Updated version in requirements and setup keywords 128 | - Removed --use-mirrors for 3.5 support 129 | 130 | 0.9.0 131 | ------ 132 | 133 | - Bugfix: Fixed comparison of duplicate URLs when crawling 134 | 135 | 0.8.11 136 | ------ 137 | 138 | - Bugfix: Improper check of domain when being restrictive 139 | 140 | 0.8.10 141 | ------ 142 | 143 | - Strip '/' from end of urls when crawling 144 | 145 | 0.8.9 146 | ------ 147 | 148 | - Added argument for cache link size & fixed up others 149 | 150 | 0.8.8 151 | ------ 152 | 153 | - Updated README and setup 154 | 155 | 0.8.7 156 | ------ 157 | 158 | - added CSV as a format 159 | 160 | 0.8.6 161 | ------ 162 | 163 | - added environ variable SCRAPE_DISABLE_IMGS to not save images 164 | 165 | 0.8.5 166 | ------ 167 | 168 | - warn user that saving images during crawling is slow 169 | 170 | 0.8.4 171 | ------ 172 | 173 | - moved print_text() from crawl.py back to scrape.py 174 | 175 | 0.8.3 176 | ------ 177 | 178 | - fixed bad formatting in readme usage 179 | 180 | 0.8.2 181 | ------ 182 | 183 | - ignore-load-errors removed from wkhtmltopdf executable 184 | 185 | 0.8.1 186 | ------ 187 | 188 | - removed extra schema adding 189 | 190 | 0.8.0 191 | ------ 192 | 193 | - fixed bug where added url schema not reflected in query 194 | 195 | 0.7.9 196 | ------ 197 | 198 | - moved file crawling to new file 199 | - avoid overwrite prompt in tests 200 | 201 | 0.7.8 202 | ------ 203 | 204 | - updated program description 205 | - removed overwriting test due to issues with it 206 | 207 | 0.7.7 208 | ------ 209 | 210 | - no longer defaults to overwriting files, added program flags/a prompt 211 | - adding renaming mechanism if choosing to not overwrite a file 212 | - some function reorganizing 213 | 214 | 0.7.6 215 | ------ 216 | 217 | - added print text to stdout option 218 | - removed extra newline appended in re_filter 219 | - wrapped pdfkit import in try/except as it isnt essential 220 | 221 | 0.7.5 222 | ------ 223 | 224 | - removed extra urlparse import 225 | 226 | 0.7.4 227 | ------ 228 | 229 | - added option to not save images 230 | - images are now only saved if saving to HTML or PDF 231 | - checks if outfilename has extension before adding new one 232 | - fixed domains being sometimes mismatched to urls 233 | - fixed extension being unnecessary appended to urls (for the most part) 234 | 235 | 0.7.3 236 | ------ 237 | 238 | - development status reverted to beta 239 | 240 | 0.7.2 241 | ------ 242 | 243 | - now saves images with PART.html files (but not css yet) 244 | - added module level docstrings 245 | 246 | 0.7.1 247 | ------ 248 | 249 | - added EOFError handling 250 | 251 | 0.7.0 252 | ------ 253 | 254 | - fixed crawl not returning filenames to add to infilenames 255 | - fixed re_filter adding duplicate matches 256 | - fixed domain unboundlocalerror 257 | 258 | 0.6.9 259 | ------ 260 | 261 | - fixed bug where query not found in urls due to trailing / 262 | 263 | 0.6.8 264 | ------ 265 | 266 | - updated program usage 267 | 268 | 0.6.7 269 | ------ 270 | 271 | - fixed 
bounds check on out file names 272 | 273 | 0.6.6 274 | ------ 275 | 276 | - added out file names as a program argument 277 | - fixed bug where re-writing multiple files 278 | - fixed bug where writing only the first file when writing single file 279 | 280 | 0.6.5 281 | ------ 282 | 283 | - major improvement to remove_whitespace() 284 | 285 | 0.6.4 286 | ------ 287 | 288 | - more docstring improvements 289 | 290 | 0.6.3 291 | ------ 292 | 293 | - began process of making docstrings conform to pep257 294 | - increased size of link cache from 10 to 100 295 | - remove the newline at start of text files 296 | - add newlines between lines filtered by regex 297 | - remove_whitespace now removes newlines that are 3 in a row or more 298 | 299 | 0.6.2 300 | ------ 301 | 302 | - stylistic changes 303 | - files are now read in 1K chunks 304 | 305 | 0.6.1 306 | ------ 307 | 308 | - remove consecutive whitespace before writing text files 309 | - empty text files no longer written 310 | 311 | 0.6.0 312 | ------ 313 | 314 | - fixed bug where single out file name wasn't properly constructed 315 | - out file names are all returned as lowercase now 316 | 317 | 0.5.9 318 | ------ 319 | 320 | - fixed bug where text wouldn't write unless xpath specified 321 | 322 | 0.5.8 323 | ------ 324 | 325 | - can now parse HTML using XPath and save to all formats 326 | - remove carriage returns in scraped text files 327 | 328 | 0.5.7 329 | ------ 330 | 331 | - added maximum out file name length of 24 characters 332 | 333 | 0.5.6 334 | ------ 335 | 336 | - fixed urls not being properly added under file_types 337 | 338 | 0.5.5 339 | ------ 340 | 341 | - fixed UnboundLocalError in write_single_file 342 | 343 | 0.5.4 344 | ------ 345 | 346 | - fixed redefinition of out_file_name in write_to_text 347 | 348 | 0.5.3 349 | ------ 350 | 351 | - fixed IndexError in write_to_text 352 | 353 | 0.5.2 354 | ------ 355 | 356 | - small fix for finding single out file name 357 | 358 | 0.5.1 359 | ------ 360 | 361 | - remade method to find single out file name 362 | 363 | 0.5.0 364 | ------ 365 | 366 | - can now save to single or multiple output files/directories 367 | - added tests for writing to single or multiple files 368 | - preserves original lines/newlines when parsing/writing files 369 | 370 | 0.4.11 371 | ------ 372 | 373 | - changed generator.next() to next(generator) for python 3 compatibility 374 | 375 | 0.4.10 376 | ------ 377 | 378 | - forgot to remove all occurrences of xrange 379 | 380 | 0.4.9 381 | ------ 382 | 383 | - changed unicode decode to ascii decode when writing html to disk 384 | 385 | 0.4.8 386 | ------ 387 | 388 | - added missing python 3 compatibilities 389 | 390 | 0.4.7 391 | ------ 392 | 393 | - fixed urlparse importerror in utils.py for python 3 users 394 | 395 | 0.4.6 396 | ------ 397 | 398 | - fixed html => text 399 | - all conversions fixed, test_scrape.py added to keep it this way 400 | - added pdfkit to requirements.txt 401 | 402 | 0.4.5 403 | ------ 404 | 405 | - added docstrings to all functions 406 | - fixed IOError when trying to convert local html to html 407 | - fixed IOError when trying to convert local html to pdf 408 | - fixed saving scraped files to text, was saving PART filenames instead 409 | 410 | 0.4.4 411 | ------ 412 | 413 | - prompts for filetype from user if none entered 414 | - modularized a couple functions 415 | 416 | 0.4.3 417 | ------ 418 | 419 | - fixed out_file naming 420 | - pep8 and pylint reformatting 421 | 422 | 0.4.2 423 | ------ 424 | 425 | - removed read_part_files in place 
of get_part_files as pdfkit reads filenames 426 | 427 | 0.4.1 428 | ------ 429 | 430 | - fixed bug preventing writing scraped urls to pdf 431 | 432 | 0.4.0 433 | ------ 434 | 435 | - can now read in text and filter it 436 | - recognizes local files, no need for user to enter special flag 437 | - moved html/ files to testing/ and added a text file to it 438 | - added better distinction between input and output files 439 | - changed instances of file to f_name in utils 440 | - pep8 reformatting 441 | 442 | 0.3.9 443 | ------ 444 | 445 | - add scheme to urls if none present 446 | - fixed bug where raw_html was calling get_html rather than get_raw_html 447 | 448 | 0.3.8 449 | ------ 450 | 451 | - made distinction between links and pages with multiple links on them 452 | - use --maxpages to set the maximum number of pages to get links from 453 | - use --maxlinks to set the maximum number of links to parse 454 | - improved the argument help messages 455 | - improved notes/description in README 456 | 457 | 0.3.7 458 | ------ 459 | 460 | - fixes to page caching and writing PART files 461 | - use --local to read in local html files 462 | - use --max to indicate max number of pages to crawl 463 | - changed program description and keywords 464 | 465 | 0.3.6 466 | ------ 467 | 468 | - cleanup using pylint as reference 469 | 470 | 0.3.5 471 | ------ 472 | 473 | - updated long program description in readme 474 | - added pypi monthly downloads image in readme 475 | 476 | 0.3.4 477 | ------ 478 | 479 | - updated description header in readme 480 | 481 | 0.3.3 482 | ------ 483 | 484 | - added file conversion to program description 485 | 486 | 0.3.2 487 | ------ 488 | 489 | - added travis-ci build status to readme 490 | 491 | 0.3.1 492 | ------ 493 | 494 | - updated program description and added extra installation instructions 495 | - added .travis.yml and requirements.txt 496 | 497 | 0.3.0 498 | ------ 499 | 500 | - added read option for user inputted html files, currently writes files individually and not grouped, to do next is add grouping option 501 | - added html/ directory containing test html files 502 | - made relative imports explicit using absolute_import 503 | - added proxies to utils.py 504 | 505 | 0.2.10 506 | ------ 507 | 508 | - moved OrderedSet class to orderedset.py rather than utils.py 509 | 510 | 0.2.9 511 | ------ 512 | 513 | - updated program description and keywords in setup.py 514 | 515 | 0.2.8 516 | ------ 517 | 518 | - restricts crawling to seed domain by default, changed --strict to --nonstrict for crawling outside given website 519 | 520 | 0.2.5 521 | ------ 522 | 523 | - added requests to install_requires in setup.py 524 | 525 | 0.2.4 526 | ------ 527 | 528 | - added attributes flag which specifies which tag attributes to extract from a given page, such as text, href, etc. 
529 | 530 | 0.2.3 531 | ------ 532 | 533 | - updated flags and flag help messages 534 | - verbose now by default and reduced number of messages, use --quiet to silence messages 535 | - changed name of --files flag to --html for saving output as html 536 | - added --text flag, default is still text 537 | 538 | 0.2.2 539 | ------ 540 | 541 | - fixed character encoding issue, all unicode now 542 | 543 | 0.2.1 544 | ------ 545 | 546 | - improvements to exception handling for proper PART file removal 547 | 548 | 0.2.0 549 | ------ 550 | 551 | - pages are now saved as they are crawled to PART.html files and processed/removed as necessary, this greatly saves on program memory 552 | - added a page cache with a limit of 10 for greater duplicate protection 553 | - added --files option for keeping webpages as PART.html instead of saving as text or pdf, this also organizes them into a subdirectory named after the seed url's domain 554 | - changed --restrict flag to --strict for restricting the domain to the seed domain while crawling 555 | - more --verbose messages being printed 556 | 557 | 0.1.10 558 | ------ 559 | 560 | - now compares urls scheme-less before updating links to prevent http:// and https:// duplicates and replaced set_scheme with remove_scheme in utils.py 561 | - renamed write_pages to write_links 562 | 563 | 0.1.9 564 | ------ 565 | 566 | - added behavior for --crawl keywords in crawl method 567 | - added a domain check before outputting crawled message or adding to crawled links 568 | - domain key in args is now set to base domain for proper --restrict behavior 569 | - clean_url now rstrips / character for proper link crawling 570 | - resolve_url now rstrips / character for proper out_file writing 571 | - updated description of --crawl flag 572 | 573 | 0.1.8 574 | ------ 575 | 576 | - removed url fragments 577 | - replaced set_base with urlparse method urljoin 578 | - out_file name construction now uses urlparse 'path' member 579 | - raw_links is now an OrderedSet to try to eliminate as much processing as possible 580 | - added clear method to OrderedSet in utils.py 581 | 582 | 0.1.7 583 | ------ 584 | 585 | - removed validate_domain and replaced it with a lambda instead 586 | - replaced domain with base_url in set_base as should have been done before 587 | - crawled message no longer prints if url was a duplicate 588 | 589 | 0.1.6 590 | ------ 591 | 592 | - uncommented import __version__ 593 | 594 | 0.1.5 595 | ------ 596 | 597 | - set_domain was replaced by set_base, proper solution for links that are relative 598 | - fixed verbose behavior 599 | - updated description in README 600 | 601 | 0.1.4 602 | ------ 603 | 604 | - fixed output file generation, was using domain instead of base_url 605 | - minor code cleanup 606 | 607 | 0.1.3 608 | ------ 609 | 610 | - blank lines are no longer written to text unless as a page separator 611 | - style tags now ignored alongside script tags when getting text 612 | 613 | 0.1.2 614 | ------ 615 | 616 | - added shebang 617 | 618 | 0.1.1 619 | ------ 620 | 621 | - uncommented import __version__ 622 | 623 | 0.1.0 624 | ------ 625 | 626 | - reformatting to conform with PEP 8 627 | - added regexp support for matching crawl keywords and filter text keywords 628 | - improved url resolution by correcting domains and schemes 629 | - added --restrict option to restrict crawler links to only those with seed domain 630 | - made text the default write option rather than pdf, can now use --pdf to change that 631 | - removed page number being written to text, 
separator is now just a single blank line 632 | - improved construction of output file name 633 | 634 | 0.0.11 635 | ------ 636 | 637 | - fixed missing comma in install_requires in setup.py 638 | - also labeled now as beta as there are still some kinks with crawling 639 | 640 | 0.0.10 641 | ------ 642 | 643 | - now ignoring pdfkit load errors only if more than one link to try to prevent an empty pdf being created in case of error 644 | 645 | 0.0.9 646 | ------ 647 | 648 | - pdfkit now ignores load errors and writes as many pages as possible 649 | 650 | 0.0.8 651 | ------ 652 | 653 | - better implementation of crawler, can now scrape entire websites 654 | - added OrderedSet class to utils.py 655 | 656 | 0.0.7 657 | ------ 658 | 659 | - changed --keywords to --filter and positional arg url to urls 660 | 661 | 0.0.6 662 | ------ 663 | 664 | - use --keywords flag for filtering text 665 | - can pass multiple links now 666 | - will not write empty files anymore 667 | 668 | 0.0.5 669 | ------ 670 | 671 | - added --verbose argument for use with pdfkit 672 | - improved output file name processing 673 | 674 | 0.0.4 675 | ------ 676 | 677 | - accepts 0 or 1 url's, allowing a call with just --version 678 | 679 | 0.0.3 680 | ------ 681 | 682 | - Moved utils.py to scrape/ 683 | 684 | 0.0.2 685 | ------ 686 | 687 | - First entry 688 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2015-2021 Hunter Hammond (huntrar@gmail.com) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGES.txt 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | scrape |PyPI Version| |Total Downloads| 2 | ====================================================== 3 | 4 | a command-line web scraping tool 5 | -------------------------------- 6 | 7 | scrape is a rule-based web crawler and information extraction tool 8 | capable of manipulating and merging new and existing documents. XML Path 9 | Language (XPath) and regular expressions are used to define rules for 10 | filtering content and web traversal. Output may be converted into text, 11 | csv, pdf, and/or HTML formats. 
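As a quick illustration (the URL below is only a placeholder, and the regexps
are arbitrary examples of the flags documented under Usage), a typical
invocation of the installed scrape command might crawl pages whose URLs match
a pattern, filter the scraped text with another pattern, and save the result
as text::

    scrape https://example.com --crawl "docs" --filter "python" --text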
12 | 13 | Installation 14 | ------------ 15 | 16 | :: 17 | 18 | pip install scrape 19 | 20 | or 21 | 22 | :: 23 | 24 | pip install git+https://github.com/huntrar/scrape.git#egg=scrape 25 | 26 | or 27 | 28 | :: 29 | 30 | git clone https://github.com/huntrar/scrape 31 | cd scrape 32 | python setup.py install 33 | 34 | You must `install 35 | wkhtmltopdf `__ 36 | to save files to pdf. 37 | 38 | Usage 39 | ----- 40 | 41 | :: 42 | 43 | usage: scrape.py [-h] [-a [ATTRIBUTES [ATTRIBUTES ...]]] [-all] 44 | [-c [CRAWL [CRAWL ...]]] [-C] [--csv] [-cs [CACHE_SIZE]] 45 | [-f [FILTER [FILTER ...]]] [--html] [-i] [-m] 46 | [-max MAX_CRAWLS] [-n] [-ni] [-no] [-o [OUT [OUT ...]]] [-ow] 47 | [-p] [-pt] [-q] [-s] [-t] [-v] [-x [XPATH]] 48 | [QUERY [QUERY ...]] 49 | 50 | a command-line web scraping tool 51 | 52 | positional arguments: 53 | QUERY URLs/files to scrape 54 | 55 | optional arguments: 56 | -h, --help show this help message and exit 57 | -a [ATTRIBUTES [ATTRIBUTES ...]], --attributes [ATTRIBUTES [ATTRIBUTES ...]] 58 | extract text using tag attributes 59 | -all, --crawl-all crawl all pages 60 | -c [CRAWL [CRAWL ...]], --crawl [CRAWL [CRAWL ...]] 61 | regexp rules for following new pages 62 | -C, --clear-cache clear requests cache 63 | --csv write files as csv 64 | -cs [CACHE_SIZE], --cache-size [CACHE_SIZE] 65 | size of page cache (default: 1000) 66 | -f [FILTER [FILTER ...]], --filter [FILTER [FILTER ...]] 67 | regexp rules for filtering text 68 | --html write files as HTML 69 | -i, --images save page images 70 | -m, --multiple save to multiple files 71 | -max MAX_CRAWLS, --max-crawls MAX_CRAWLS 72 | max number of pages to crawl 73 | -n, --nonstrict allow crawler to visit any domain 74 | -ni, --no-images do not save page images 75 | -no, --no-overwrite do not overwrite files if they exist 76 | -o [OUT [OUT ...]], --out [OUT [OUT ...]] 77 | specify outfile names 78 | -ow, --overwrite overwrite a file if it exists 79 | -p, --pdf write files as pdf 80 | -pt, --print print text output 81 | -q, --quiet suppress program output 82 | -s, --single save to a single file 83 | -t, --text write files as text 84 | -v, --version display current version 85 | -x [XPATH], --xpath [XPATH] 86 | filter HTML using XPath 87 | 88 | Author 89 | ------ 90 | 91 | - Hunter Hammond (huntrar@gmail.com) 92 | 93 | Notes 94 | ----- 95 | 96 | - Input to scrape can be links, files, or a combination of the two, 97 | allowing you to create new files constructed from both existing and 98 | newly scraped content. 99 | - Multiple input files/URLs are saved to multiple output 100 | files/directories by default. To consolidate them, use the --single 101 | flag. 102 | - Images are automatically included when saving as pdf or HTML; this 103 | involves making additional HTTP requests, adding a significant amount 104 | of processing time. If you wish to forgo this feature use the 105 | --no-images flag, or set the environment variable 106 | SCRAPE\_DISABLE\_IMGS. 107 | - Requests cache is enabled by default to cache webpages, it can be 108 | disabled by setting the environment variable SCRAPE\_DISABLE\_CACHE. 109 | - Pages are saved temporarily as PART.html files during processing. 110 | Unless saving pages as HTML, these files are removed automatically 111 | upon conversion or exit. 112 | - To crawl pages with no restrictions use the --crawl-all flag, or 113 | filter which pages to crawl by URL keywords by passing one or more 114 | regexps to --crawl. 
115 | - If you want the crawler to follow links outside of the given URL's
116 |   domain, use --nonstrict.
117 | - Crawling can be stopped with Ctrl-C, or alternatively by setting
118 |   the maximum number of pages to crawl using --max-crawls. A page
119 |   may contain zero or many links to more pages.
120 | - The text output of scraped files can be printed to stdout rather than
121 |   saved by entering --print.
122 | - Filtering HTML can be done using --xpath, while filtering text is
123 |   done by entering one or more regexps to --filter.
124 | - If you only want to extract specific tag attributes rather than
125 |   specifying an entire XPath, use --attributes. The default is to
126 |   extract only text attributes, but you can specify one or many
127 |   different attributes (such as href, src, title, or any other
128 |   available attribute).
129 | 
130 | .. |PyPI Version| image:: https://img.shields.io/pypi/v/scrape.svg
131 |    :target: https://pypi.python.org/pypi/scrape
132 | .. |Total Downloads| image:: https://pepy.tech/badge/scrape
133 |    :target: https://pepy.tech/project/scrape
134 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | argparse==1.4.0
2 | lxml==4.6.5
3 | pdfkit==0.6.1
4 | requests==2.25.1
5 | requests-cache==0.4.13
6 | six==1.15.0
7 | tldextract==3.1.0
8 | 
--------------------------------------------------------------------------------
/scrape/__init__.py:
--------------------------------------------------------------------------------
1 | """scrape is a rule-based web crawler and information extraction tool capable of manipulating and merging new and existing documents. XML Path Language (XPath) and regular expressions are used to define rules for filtering content and web traversal. Output may be converted into text, csv, pdf, and/or HTML formats.
2 | """
3 | 
4 | __version__ = "0.11.3"
5 | 
--------------------------------------------------------------------------------
/scrape/crawler.py:
--------------------------------------------------------------------------------
1 | """A class to crawl webpages."""
2 | 
3 | from __future__ import absolute_import, print_function
4 | import sys
5 | 
6 | import lxml.html as lh
7 | 
8 | from .orderedset import OrderedSet
9 | from .
import utils 10 | 11 | 12 | class Crawler(object): 13 | """Follows and saves webpages to PART.html files.""" 14 | 15 | def __init__(self, args, seed_url=None): 16 | """Set seed URL and program arguments""" 17 | self.seed_url = seed_url 18 | self.args = args 19 | self.page_cache = [] 20 | 21 | def get_new_links(self, url, resp): 22 | """Get new links from a URL and filter them.""" 23 | links_on_page = resp.xpath("//a/@href") 24 | links = [utils.clean_url(u, url) for u in links_on_page] 25 | 26 | # Remove non-links through filtering by protocol 27 | links = [x for x in links if utils.check_protocol(x)] 28 | 29 | # Restrict new URLs by the domain of the input URL 30 | if not self.args["nonstrict"]: 31 | domain = utils.get_domain(url) 32 | links = [x for x in links if utils.get_domain(x) == domain] 33 | 34 | # Filter URLs by regex keywords, if any 35 | if self.args["crawl"]: 36 | links = utils.re_filter(links, self.args["crawl"]) 37 | return links 38 | 39 | def limit_reached(self, num_crawls): 40 | """Check if number of pages crawled have reached a limit.""" 41 | return self.args["max_crawls"] and num_crawls >= self.args["max_crawls"] 42 | 43 | def page_crawled(self, page_resp): 44 | """Check if page has been crawled by hashing its text content. 45 | 46 | Add new pages to the page cache. 47 | Return whether page was found in cache. 48 | """ 49 | page_text = utils.parse_text(page_resp) 50 | page_hash = utils.hash_text("".join(page_text)) 51 | if page_hash not in self.page_cache: 52 | utils.cache_page(self.page_cache, page_hash, self.args["cache_size"]) 53 | return False 54 | return True 55 | 56 | def crawl_links(self, seed_url=None): 57 | """Find new links given a seed URL and follow them breadth-first. 58 | 59 | Save page responses as PART.html files. 60 | Return the PART.html filenames created during crawling. 61 | """ 62 | if seed_url is not None: 63 | self.seed_url = seed_url 64 | 65 | if self.seed_url is None: 66 | sys.stderr.write("Crawling requires a seed URL.\n") 67 | return [] 68 | 69 | prev_part_num = utils.get_num_part_files() 70 | crawled_links = set() 71 | uncrawled_links = OrderedSet() 72 | 73 | uncrawled_links.add(self.seed_url) 74 | try: 75 | while uncrawled_links: 76 | # Check limit on number of links and pages to crawl 77 | if self.limit_reached(len(crawled_links)): 78 | break 79 | url = uncrawled_links.pop(last=False) 80 | 81 | # Remove protocol, fragments, etc. 
to get unique URLs 82 | unique_url = utils.remove_protocol(utils.clean_url(url)) 83 | if unique_url not in crawled_links: 84 | raw_resp = utils.get_raw_resp(url) 85 | if raw_resp is None: 86 | if not self.args["quiet"]: 87 | sys.stderr.write("Failed to parse {0}.\n".format(url)) 88 | continue 89 | 90 | resp = lh.fromstring(raw_resp) 91 | if self.page_crawled(resp): 92 | continue 93 | 94 | crawled_links.add(unique_url) 95 | new_links = self.get_new_links(url, resp) 96 | uncrawled_links.update(new_links) 97 | if not self.args["quiet"]: 98 | print("Crawled {0} (#{1}).".format(url, len(crawled_links))) 99 | 100 | # Write page response to PART.html file 101 | utils.write_part_file( 102 | self.args, url, raw_resp, resp, len(crawled_links) 103 | ) 104 | except (KeyboardInterrupt, EOFError): 105 | pass 106 | 107 | curr_part_num = utils.get_num_part_files() 108 | return utils.get_part_filenames(curr_part_num, prev_part_num) 109 | -------------------------------------------------------------------------------- /scrape/orderedset.py: -------------------------------------------------------------------------------- 1 | from collections.abc import MutableSet 2 | 3 | 4 | class OrderedSet(MutableSet): 5 | def __init__(self, iterable=None): 6 | self.end = end = [] 7 | end += [None, end, end] # sentinel node for doubly linked list 8 | self.map = {} # key --> [key, prev, next] 9 | if iterable is not None: 10 | self |= iterable 11 | 12 | def __len__(self): 13 | return len(self.map) 14 | 15 | def __contains__(self, key): 16 | return key in self.map 17 | 18 | def add(self, key): 19 | if key not in self.map: 20 | end = self.end 21 | curr = end[1] 22 | curr[2] = end[1] = self.map[key] = [key, curr, end] 23 | 24 | def update(self, iterable): 25 | for item in iterable: 26 | self.add(item) 27 | 28 | def discard(self, key): 29 | if key in self.map: 30 | key, prev, next = self.map.pop(key) 31 | prev[2] = next 32 | next[1] = prev 33 | 34 | def __iter__(self): 35 | end = self.end 36 | curr = end[2] 37 | while curr is not end: 38 | yield curr[0] 39 | curr = curr[2] 40 | 41 | def __reversed__(self): 42 | end = self.end 43 | curr = end[1] 44 | while curr is not end: 45 | yield curr[0] 46 | curr = curr[1] 47 | 48 | def pop(self, last=True): 49 | if not self: 50 | raise KeyError("set is empty") 51 | key = self.end[1][0] if last else self.end[2][0] 52 | self.discard(key) 53 | return key 54 | 55 | def clear(self): 56 | while self: 57 | self.pop() 58 | 59 | def __repr__(self): 60 | if not self: 61 | return "%s()" % (self.__class__.__name__,) 62 | return "%s(%r)" % (self.__class__.__name__, list(self)) 63 | 64 | def __eq__(self, other): 65 | if isinstance(other, OrderedSet): 66 | return len(self) == len(other) and list(self) == list(other) 67 | return set(self) == set(other) 68 | -------------------------------------------------------------------------------- /scrape/scrape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ scrape - a command-line web scraping tool 3 | 4 | written by Hunter Hammond (huntrar@gmail.com) 5 | """ 6 | 7 | from __future__ import absolute_import, print_function 8 | from argparse import ArgumentParser 9 | import os 10 | import sys 11 | 12 | from six.moves import input 13 | from six import iterkeys 14 | 15 | from .crawler import Crawler 16 | from . 
import utils, __version__ 17 | 18 | 19 | def get_parser(): 20 | """Parse command-line arguments.""" 21 | parser = ArgumentParser(description="a command-line web scraping tool") 22 | parser.add_argument( 23 | "query", metavar="QUERY", type=str, nargs="*", help="URLs/files to scrape" 24 | ) 25 | parser.add_argument( 26 | "-a", 27 | "--attributes", 28 | type=str, 29 | nargs="*", 30 | help="extract text using tag attributes", 31 | ) 32 | parser.add_argument( 33 | "-all", "--crawl-all", help="crawl all pages", action="store_true" 34 | ) 35 | parser.add_argument( 36 | "-c", 37 | "--crawl", 38 | type=str, 39 | nargs="*", 40 | help="regexp rules for following new pages", 41 | ) 42 | parser.add_argument( 43 | "-C", "--clear-cache", help="clear requests cache", action="store_true" 44 | ) 45 | parser.add_argument("--csv", help="write files as csv", action="store_true") 46 | parser.add_argument( 47 | "-cs", 48 | "--cache-size", 49 | type=int, 50 | nargs="?", 51 | help="size of page cache (default: 1000)", 52 | default=1000, 53 | ) 54 | parser.add_argument( 55 | "-f", "--filter", type=str, nargs="*", help="regexp rules for filtering text" 56 | ) 57 | parser.add_argument("--html", help="write files as HTML", action="store_true") 58 | parser.add_argument("-i", "--images", action="store_true", help="save page images") 59 | parser.add_argument( 60 | "-m", "--multiple", help="save to multiple files", action="store_true" 61 | ) 62 | parser.add_argument( 63 | "-max", "--max-crawls", type=int, help="max number of pages to crawl" 64 | ) 65 | parser.add_argument( 66 | "-n", 67 | "--nonstrict", 68 | action="store_true", 69 | help="allow crawler to visit any domain", 70 | ) 71 | parser.add_argument( 72 | "-ni", "--no-images", action="store_true", help="do not save page images" 73 | ) 74 | parser.add_argument( 75 | "-no", 76 | "--no-overwrite", 77 | action="store_true", 78 | help="do not overwrite files if they exist", 79 | ) 80 | parser.add_argument( 81 | "-o", "--out", type=str, nargs="*", help="specify outfile names" 82 | ) 83 | parser.add_argument( 84 | "-ow", "--overwrite", action="store_true", help="overwrite a file if it exists" 85 | ) 86 | parser.add_argument("-p", "--pdf", help="write files as pdf", action="store_true") 87 | parser.add_argument("-pt", "--print", help="print text output", action="store_true") 88 | parser.add_argument( 89 | "-q", "--quiet", help="suppress program output", action="store_true" 90 | ) 91 | parser.add_argument( 92 | "-s", "--single", help="save to a single file", action="store_true" 93 | ) 94 | parser.add_argument("-t", "--text", help="write files as text", action="store_true") 95 | parser.add_argument( 96 | "-v", "--version", help="display current version", action="store_true" 97 | ) 98 | parser.add_argument( 99 | "-x", "--xpath", type=str, nargs="?", help="filter HTML using XPath" 100 | ) 101 | return parser 102 | 103 | 104 | def write_files(args, infilenames, outfilename): 105 | """Write scraped or local file(s) in desired format. 106 | 107 | Keyword arguments: 108 | args -- program arguments (dict) 109 | infilenames -- names of user-inputted and/or downloaded files (list) 110 | outfilename -- name of output file (str) 111 | 112 | Remove PART(#).html files after conversion unless otherwise specified. 
113 | """ 114 | write_actions = { 115 | "print": utils.print_text, 116 | "pdf": utils.write_pdf_files, 117 | "csv": utils.write_csv_files, 118 | "text": utils.write_text_files, 119 | } 120 | try: 121 | for action in iterkeys(write_actions): 122 | if args[action]: 123 | write_actions[action](args, infilenames, outfilename) 124 | finally: 125 | if args["urls"] and not args["html"]: 126 | utils.remove_part_files() 127 | 128 | 129 | def write_single_file(args, base_dir, crawler): 130 | """Write to a single output file and/or subdirectory.""" 131 | if args["urls"] and args["html"]: 132 | # Create a directory to save PART.html files in 133 | domain = utils.get_domain(args["urls"][0]) 134 | if not args["quiet"]: 135 | print("Storing html files in {0}/".format(domain)) 136 | utils.mkdir_and_cd(domain) 137 | 138 | infilenames = [] 139 | for query in args["query"]: 140 | if query in args["files"]: 141 | infilenames.append(query) 142 | elif query.strip("/") in args["urls"]: 143 | if args["crawl"] or args["crawl_all"]: 144 | # Crawl and save HTML files/image files to disk 145 | infilenames += crawler.crawl_links(query) 146 | else: 147 | raw_resp = utils.get_raw_resp(query) 148 | if raw_resp is None: 149 | return False 150 | 151 | prev_part_num = utils.get_num_part_files() 152 | utils.write_part_file(args, query, raw_resp) 153 | curr_part_num = prev_part_num + 1 154 | infilenames += utils.get_part_filenames(curr_part_num, prev_part_num) 155 | 156 | # Convert output or leave as PART.html files 157 | if args["html"]: 158 | # HTML files have been written already, so return to base directory 159 | os.chdir(base_dir) 160 | else: 161 | # Write files to text or pdf 162 | if infilenames: 163 | if args["out"]: 164 | outfilename = args["out"][0] 165 | else: 166 | outfilename = utils.get_single_outfilename(args) 167 | if outfilename: 168 | write_files(args, infilenames, outfilename) 169 | else: 170 | utils.remove_part_files() 171 | return True 172 | 173 | 174 | def write_multiple_files(args, base_dir, crawler): 175 | """Write to multiple output files and/or subdirectories.""" 176 | for i, query in enumerate(args["query"]): 177 | if query in args["files"]: 178 | # Write files 179 | if args["out"] and i < len(args["out"]): 180 | outfilename = args["out"][i] 181 | else: 182 | outfilename = ".".join(query.split(".")[:-1]) 183 | write_files(args, [query], outfilename) 184 | elif query in args["urls"]: 185 | # Scrape/crawl urls 186 | domain = utils.get_domain(query) 187 | if args["html"]: 188 | # Create a directory to save PART.html files in 189 | if not args["quiet"]: 190 | print("Storing html files in {0}/".format(domain)) 191 | utils.mkdir_and_cd(domain) 192 | 193 | if args["crawl"] or args["crawl_all"]: 194 | # Crawl and save HTML files/image files to disk 195 | infilenames = crawler.crawl_links(query) 196 | else: 197 | raw_resp = utils.get_raw_resp(query) 198 | if raw_resp is None: 199 | return False 200 | 201 | # Saves page as PART.html file 202 | prev_part_num = utils.get_num_part_files() 203 | utils.write_part_file(args, query, raw_resp) 204 | curr_part_num = prev_part_num + 1 205 | infilenames = utils.get_part_filenames(curr_part_num, prev_part_num) 206 | 207 | # Convert output or leave as PART.html files 208 | if args["html"]: 209 | # HTML files have been written already, so return to base dir 210 | os.chdir(base_dir) 211 | else: 212 | # Write files to text or pdf 213 | if infilenames: 214 | if args["out"] and i < len(args["out"]): 215 | outfilename = args["out"][i] 216 | else: 217 | outfilename = 
utils.get_outfilename(query, domain) 218 | write_files(args, infilenames, outfilename) 219 | else: 220 | sys.stderr.write( 221 | "Failed to retrieve content from {0}.\n".format(query) 222 | ) 223 | return True 224 | 225 | 226 | def split_input(args): 227 | """Split query input into local files and URLs.""" 228 | args["files"] = [] 229 | args["urls"] = [] 230 | for arg in args["query"]: 231 | if os.path.isfile(arg): 232 | args["files"].append(arg) 233 | else: 234 | args["urls"].append(arg.strip("/")) 235 | 236 | 237 | def detect_output_type(args): 238 | """Detect whether to save to a single or multiple files.""" 239 | if not args["single"] and not args["multiple"]: 240 | # Save to multiple files if multiple files/URLs entered 241 | if len(args["query"]) > 1 or len(args["out"]) > 1: 242 | args["multiple"] = True 243 | else: 244 | args["single"] = True 245 | 246 | 247 | def scrape(args): 248 | """Scrape webpage content.""" 249 | try: 250 | base_dir = os.getcwd() 251 | if args["out"] is None: 252 | args["out"] = [] 253 | 254 | # Detect whether to save to a single or multiple files 255 | detect_output_type(args) 256 | 257 | # Split query input into local files and URLs 258 | split_input(args) 259 | 260 | if args["urls"]: 261 | # Add URL extensions and schemes and update query and URLs 262 | urls_with_exts = [utils.add_url_suffix(x) for x in args["urls"]] 263 | args["query"] = [ 264 | utils.add_protocol(x) if x in args["urls"] else x 265 | for x in urls_with_exts 266 | ] 267 | args["urls"] = [x for x in args["query"] if x not in args["files"]] 268 | 269 | # Print error if attempting to convert local files to HTML 270 | if args["files"] and args["html"]: 271 | sys.stderr.write("Cannot convert local files to HTML.\n") 272 | args["files"] = [] 273 | 274 | # Instantiate web crawler if necessary 275 | crawler = None 276 | if args["crawl"] or args["crawl_all"]: 277 | crawler = Crawler(args) 278 | 279 | if args["single"]: 280 | return write_single_file(args, base_dir, crawler) 281 | elif args["multiple"]: 282 | return write_multiple_files(args, base_dir, crawler) 283 | 284 | except (KeyboardInterrupt, Exception): 285 | if args["html"]: 286 | try: 287 | os.chdir(base_dir) 288 | except OSError: 289 | pass 290 | else: 291 | utils.remove_part_files() 292 | raise 293 | 294 | 295 | def prompt_filetype(args): 296 | """Prompt user for filetype if none specified.""" 297 | valid_types = ("print", "text", "csv", "pdf", "html") 298 | if not any(args[x] for x in valid_types): 299 | try: 300 | filetype = input( 301 | "Print or save output as ({0}): ".format(", ".join(valid_types)) 302 | ).lower() 303 | while filetype not in valid_types: 304 | filetype = input( 305 | "Invalid entry. Choose from ({0}): ".format(", ".join(valid_types)) 306 | ).lower() 307 | except (KeyboardInterrupt, EOFError): 308 | return 309 | args[filetype] = True 310 | 311 | 312 | def prompt_save_images(args): 313 | """Prompt user to save images when crawling (for pdf and HTML formats).""" 314 | if args["images"] or args["no_images"]: 315 | return 316 | 317 | if (args["pdf"] or args["html"]) and (args["crawl"] or args["crawl_all"]): 318 | save_msg = ( 319 | "Choosing to save images will greatly slow the" 320 | " crawling process.\nSave images anyways? 
(y/n): " 321 | ) 322 | try: 323 | save_images = utils.confirm_input(input(save_msg)) 324 | except (KeyboardInterrupt, EOFError): 325 | return 326 | 327 | args["images"] = save_images 328 | args["no_images"] = not save_images 329 | 330 | 331 | def command_line_runner(): 332 | """Handle command-line interaction.""" 333 | parser = get_parser() 334 | args = vars(parser.parse_args()) 335 | if args["version"]: 336 | print(__version__) 337 | return 338 | if args["clear_cache"]: 339 | utils.clear_cache() 340 | print("Cleared {0}.".format(utils.CACHE_DIR)) 341 | return 342 | if not args["query"]: 343 | parser.print_help() 344 | return 345 | 346 | # Enable cache unless user sets environ variable SCRAPE_DISABLE_CACHE 347 | if not os.getenv("SCRAPE_DISABLE_CACHE"): 348 | utils.enable_cache() 349 | 350 | # Save images unless user sets environ variable SCRAPE_DISABLE_IMGS 351 | if os.getenv("SCRAPE_DISABLE_IMGS"): 352 | args["no_images"] = True 353 | 354 | # Prompt user for filetype if none specified 355 | prompt_filetype(args) 356 | 357 | # Prompt user to save images when crawling (for pdf and HTML formats) 358 | prompt_save_images(args) 359 | 360 | # Scrape webpage content 361 | scrape(args) 362 | 363 | 364 | if __name__ == "__main__": 365 | command_line_runner() 366 | -------------------------------------------------------------------------------- /scrape/utils.py: -------------------------------------------------------------------------------- 1 | """scrape utility functions. 2 | 3 | Functions include: 4 | Web requests and requests caching 5 | Document caching 6 | Text processing 7 | HTML parsing 8 | URL processing 9 | File processing 10 | User input and sanitation 11 | Miscellaneous 12 | """ 13 | 14 | from __future__ import print_function 15 | import glob 16 | import hashlib 17 | import os 18 | import random 19 | import re 20 | import shutil 21 | import string 22 | import sys 23 | import time 24 | 25 | import lxml.html as lh 26 | 27 | try: 28 | import pdfkit as pk 29 | except ImportError: 30 | pass 31 | import requests 32 | from requests.exceptions import MissingSchema 33 | from six import PY2 34 | from six.moves import input, xrange as range 35 | from six.moves.urllib.parse import urlparse, urljoin 36 | from six.moves.urllib.request import getproxies 37 | import tldextract 38 | 39 | if PY2: 40 | from cgi import escape 41 | else: 42 | from html import escape 43 | 44 | USER_AGENTS = ( 45 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) " 46 | "Gecko/20100101 Firefox/11.0", 47 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) " "Gecko/20100 101 Firefox/22.0", 48 | "Mozilla/5.0 (Windows NT 6.1; rv:11.0) " "Gecko/20100101 Firefox/11.0", 49 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) " 50 | "AppleWebKit/536.5 (KHTML, like Gecko) " 51 | "Chrome/19.0.1084.46 Safari/536.5", 52 | "Mozilla/5.0 (Windows; Windows NT 6.1) " 53 | "AppleWebKit/536.5 (KHTML, like Gecko) " 54 | "Chrome/19.0.1084.46 Safari/536.5", 55 | ) 56 | 57 | 58 | XDG_CACHE_DIR = os.environ.get( 59 | "XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache") 60 | ) 61 | CACHE_DIR = os.path.join(XDG_CACHE_DIR, "scrape") 62 | CACHE_FILE = os.path.join(CACHE_DIR, "cache{0}".format("" if PY2 else "3")) 63 | 64 | # Web requests and requests caching functions 65 | # 66 | 67 | 68 | def get_proxies(): 69 | """Get available proxies to use with requests library.""" 70 | proxies = getproxies() 71 | filtered_proxies = {} 72 | for key, value in proxies.items(): 73 | if key.startswith("http://"): 74 | if not 
value.startswith("http://"): 75 | filtered_proxies[key] = "http://{0}".format(value) 76 | else: 77 | filtered_proxies[key] = value 78 | return filtered_proxies 79 | 80 | 81 | def get_resp(url): 82 | """Get webpage response as an lxml.html.HtmlElement object.""" 83 | try: 84 | headers = {"User-Agent": random.choice(USER_AGENTS)} 85 | try: 86 | request = requests.get(url, headers=headers, proxies=get_proxies()) 87 | except MissingSchema: 88 | url = add_protocol(url) 89 | request = requests.get(url, headers=headers, proxies=get_proxies()) 90 | return lh.fromstring(request.text.encode("utf-8") if PY2 else request.text) 91 | except Exception: 92 | sys.stderr.write("Failed to retrieve {0}.\n".format(url)) 93 | raise 94 | 95 | 96 | def get_raw_resp(url): 97 | """Get webpage response as a unicode string.""" 98 | try: 99 | headers = {"User-Agent": random.choice(USER_AGENTS)} 100 | try: 101 | request = requests.get(url, headers=headers, proxies=get_proxies()) 102 | except MissingSchema: 103 | url = add_protocol(url) 104 | request = requests.get(url, headers=headers, proxies=get_proxies()) 105 | return request.text.encode("utf-8") if PY2 else request.text 106 | except Exception: 107 | sys.stderr.write("Failed to retrieve {0} as str.\n".format(url)) 108 | raise 109 | 110 | 111 | def enable_cache(): 112 | """Enable requests library cache.""" 113 | try: 114 | import requests_cache 115 | except ImportError as err: 116 | sys.stderr.write("Failed to enable cache: {0}\n".format(str(err))) 117 | return 118 | if not os.path.exists(CACHE_DIR): 119 | os.makedirs(CACHE_DIR) 120 | requests_cache.install_cache(CACHE_FILE) 121 | 122 | 123 | def clear_cache(): 124 | """Clear requests library cache.""" 125 | for cache in glob.glob("{0}*".format(CACHE_FILE)): 126 | os.remove(cache) 127 | 128 | 129 | # Document caching functions 130 | # 131 | 132 | 133 | def hash_text(text): 134 | """Return MD5 hash of a string.""" 135 | md5 = hashlib.md5() 136 | md5.update(text.encode("utf-8")) 137 | return md5.hexdigest() 138 | 139 | 140 | def cache_page(page_cache, page_hash, cache_size): 141 | """Add a page to the page cache.""" 142 | page_cache.append(page_hash) 143 | if len(page_cache) > cache_size: 144 | page_cache.pop(0) 145 | 146 | 147 | # Text processing functions 148 | # 149 | 150 | 151 | def re_filter(text, regexps): 152 | """Filter text using regular expressions.""" 153 | if not regexps: 154 | return text 155 | 156 | matched_text = [] 157 | compiled_regexps = [re.compile(x) for x in regexps] 158 | for line in text: 159 | if line in matched_text: 160 | continue 161 | 162 | for regexp in compiled_regexps: 163 | found = regexp.search(line) 164 | if found and found.group(): 165 | matched_text.append(line) 166 | 167 | return matched_text or text 168 | 169 | 170 | def remove_whitespace(text): 171 | """Remove unnecessary whitespace while keeping logical structure. 172 | 173 | Keyword arguments: 174 | text -- text to remove whitespace from (list) 175 | 176 | Retain paragraph structure but remove other whitespace, 177 | such as between words on a line and at the start and end of the text. 
178 | """ 179 | clean_text = [] 180 | curr_line = "" 181 | # Remove any newlines that follow two lines of whitespace consecutively 182 | # Also remove whitespace at start and end of text 183 | while text: 184 | if not curr_line: 185 | # Find the first line that is not whitespace and add it 186 | curr_line = text.pop(0) 187 | while not curr_line.strip() and text: 188 | curr_line = text.pop(0) 189 | if curr_line.strip(): 190 | clean_text.append(curr_line) 191 | else: 192 | # Filter the rest of the lines 193 | curr_line = text.pop(0) 194 | if not text: 195 | # Add the final line if it is not whitespace 196 | if curr_line.strip(): 197 | clean_text.append(curr_line) 198 | continue 199 | 200 | if curr_line.strip(): 201 | clean_text.append(curr_line) 202 | else: 203 | # If the current line is whitespace then make sure there is 204 | # no more than one consecutive line of whitespace following 205 | if not text[0].strip(): 206 | if len(text) > 1 and text[1].strip(): 207 | clean_text.append(curr_line) 208 | else: 209 | clean_text.append(curr_line) 210 | 211 | # Now filter each individual line for extraneous whitespace 212 | cleaner_text = [] 213 | for line in clean_text: 214 | clean_line = " ".join(line.split()) 215 | if not clean_line.strip(): 216 | clean_line += "\n" 217 | cleaner_text.append(clean_line) 218 | return cleaner_text 219 | 220 | 221 | def parse_text(infile, xpath=None, filter_words=None, attributes=None): 222 | """Filter text using XPath, regex keywords, and tag attributes. 223 | 224 | Keyword arguments: 225 | infile -- HTML or text content to parse (list) 226 | xpath -- an XPath expression (str) 227 | filter_words -- regex keywords (list) 228 | attributes -- HTML tag attributes (list) 229 | 230 | Return a list of strings of text. 231 | """ 232 | infiles = [] 233 | text = [] 234 | if xpath is not None: 235 | infile = parse_html(infile, xpath) 236 | if isinstance(infile, list): 237 | if isinstance(infile[0], lh.HtmlElement): 238 | infiles = list(infile) 239 | else: 240 | text = [line + "\n" for line in infile] 241 | elif isinstance(infile, lh.HtmlElement): 242 | infiles = [infile] 243 | else: 244 | text = [infile] 245 | else: 246 | infiles = [infile] 247 | 248 | if attributes is not None: 249 | attributes = [clean_attr(x) for x in attributes] 250 | attributes = [x for x in attributes if x] 251 | else: 252 | attributes = ["text()"] 253 | 254 | if not text: 255 | text_xpath = "//*[not(self::script) and not(self::style)]" 256 | for attr in attributes: 257 | for infile in infiles: 258 | if isinstance(infile, lh.HtmlElement): 259 | new_text = infile.xpath("{0}/{1}".format(text_xpath, attr)) 260 | else: 261 | # re.split preserves delimiters place in the list 262 | new_text = [x for x in re.split("(\n)", infile) if x] 263 | text += new_text 264 | 265 | if filter_words is not None: 266 | text = re_filter(text, filter_words) 267 | return [ 268 | "".join(x for x in line if x in string.printable) 269 | for line in remove_whitespace(text) 270 | if line 271 | ] 272 | 273 | 274 | def get_parsed_text(args, infilename): 275 | """Parse and return text content of infiles. 276 | 277 | Keyword arguments: 278 | args -- program arguments (dict) 279 | infilenames -- name of user-inputted and/or downloaded file (str) 280 | 281 | Return a list of strings of text. 
282 | """ 283 | parsed_text = [] 284 | if infilename.endswith(".html"): 285 | # Convert HTML to lxml object for content parsing 286 | html = lh.fromstring(read_files(infilename)) 287 | text = None 288 | else: 289 | html = None 290 | text = read_files(infilename) 291 | 292 | if html is not None: 293 | parsed_text = parse_text( 294 | html, args["xpath"], args["filter"], args["attributes"] 295 | ) 296 | elif text is not None: 297 | parsed_text = parse_text(text, args["xpath"], args["filter"]) 298 | else: 299 | if not args["quiet"]: 300 | sys.stderr.write("Failed to parse text from {0}.\n".format(infilename)) 301 | return parsed_text 302 | 303 | 304 | # HTML parsing functions 305 | # 306 | 307 | 308 | def clean_attr(attr): 309 | """Append @ to attributes and resolve text -> text() for XPath.""" 310 | if attr: 311 | if "text" in attr: 312 | return "text()" 313 | else: 314 | attr = attr.lstrip("@") 315 | if attr: 316 | return "@" + attr 317 | return None 318 | 319 | 320 | def parse_html(infile, xpath): 321 | """Filter HTML using XPath.""" 322 | if not isinstance(infile, lh.HtmlElement): 323 | infile = lh.fromstring(infile) 324 | infile = infile.xpath(xpath) 325 | if not infile: 326 | raise ValueError("XPath {0} returned no results.".format(xpath)) 327 | return infile 328 | 329 | 330 | # URL processing functions 331 | # 332 | 333 | 334 | def get_domain(url): 335 | """Get the domain of a URL using tldextract.""" 336 | return tldextract.extract(url).domain 337 | 338 | 339 | def add_protocol(url): 340 | """Add protocol to URL.""" 341 | if not check_protocol(url): 342 | return "http://{0}".format(url) 343 | return url 344 | 345 | 346 | def check_protocol(url): 347 | """Check URL for a protocol.""" 348 | if url and (url.startswith("http://") or url.startswith("https://")): 349 | return True 350 | return False 351 | 352 | 353 | def remove_protocol(url): 354 | """Remove protocol from URL.""" 355 | if check_protocol(url): 356 | return url.replace("http://", "").replace("https://", "") 357 | return url 358 | 359 | 360 | def clean_url(url, base_url=None): 361 | """Add base netloc and path to internal URLs and remove www, fragments.""" 362 | parsed_url = urlparse(url) 363 | 364 | fragment = "{url.fragment}".format(url=parsed_url) 365 | if fragment: 366 | url = url.split(fragment)[0] 367 | 368 | # Identify internal URLs and fix their format 369 | netloc = "{url.netloc}".format(url=parsed_url) 370 | if base_url is not None and not netloc: 371 | parsed_base = urlparse(base_url) 372 | split_base = "{url.scheme}://{url.netloc}{url.path}/".format(url=parsed_base) 373 | url = urljoin(split_base, url) 374 | netloc = "{url.netloc}".format(url=urlparse(url)) 375 | 376 | if "www." in netloc: 377 | url = url.replace(netloc, netloc.replace("www.", "")) 378 | return url.rstrip(string.punctuation) 379 | 380 | 381 | def has_suffix(url): 382 | """Return whether the url has a suffix using tldextract.""" 383 | return bool(tldextract.extract(url).suffix) 384 | 385 | 386 | def add_url_suffix(url): 387 | """Add .com suffix to URL if none found.""" 388 | url = url.rstrip("/") 389 | if not has_suffix(url): 390 | return "{0}.com".format(url) 391 | return url 392 | 393 | 394 | # File processing functions 395 | # 396 | 397 | 398 | def get_outfilename(url, domain=None): 399 | """Construct the output filename from domain and end of path.""" 400 | if domain is None: 401 | domain = get_domain(url) 402 | 403 | path = "{url.path}".format(url=urlparse(url)) 404 | if "." 
in path: 405 | tail_url = path.split(".")[-2] 406 | else: 407 | tail_url = path 408 | 409 | if tail_url: 410 | if "/" in tail_url: 411 | tail_pieces = [x for x in tail_url.split("/") if x] 412 | tail_url = tail_pieces[-1] 413 | 414 | # Keep length of return string below or equal to max_len 415 | max_len = 24 416 | if domain: 417 | max_len -= len(domain) + 1 418 | if len(tail_url) > max_len: 419 | if "-" in tail_url: 420 | tail_pieces = [x for x in tail_url.split("-") if x] 421 | tail_url = tail_pieces.pop(0) 422 | if len(tail_url) > max_len: 423 | tail_url = tail_url[:max_len] 424 | else: 425 | # Add as many tail pieces that can fit 426 | tail_len = 0 427 | for piece in tail_pieces: 428 | tail_len += len(piece) 429 | if tail_len <= max_len: 430 | tail_url += "-" + piece 431 | else: 432 | break 433 | else: 434 | tail_url = tail_url[:max_len] 435 | 436 | if domain: 437 | return "{0}-{1}".format(domain, tail_url).lower() 438 | return tail_url 439 | return domain.lower() 440 | 441 | 442 | def get_single_outfilename(args): 443 | """Use first possible entry in query as filename.""" 444 | for arg in args["query"]: 445 | if arg in args["files"]: 446 | return (".".join(arg.split(".")[:-1])).lower() 447 | for url in args["urls"]: 448 | if arg.strip("/") in url: 449 | domain = get_domain(url) 450 | return get_outfilename(url, domain) 451 | sys.stderr.write("Failed to construct a single out filename.\n") 452 | return "" 453 | 454 | 455 | def remove_file(filename): 456 | """Remove a file from disk.""" 457 | try: 458 | os.remove(filename) 459 | return True 460 | except (OSError, IOError): 461 | return False 462 | 463 | 464 | def modify_filename_id(filename): 465 | """Modify filename to have a unique numerical identifier.""" 466 | split_filename = os.path.splitext(filename) 467 | id_num_re = re.compile("(\(\d\))") 468 | id_num = re.findall(id_num_re, split_filename[-2]) 469 | if id_num: 470 | new_id_num = int(id_num[-1].lstrip("(").rstrip(")")) + 1 471 | 472 | # Reconstruct filename with incremented id and its extension 473 | filename = "".join( 474 | ( 475 | re.sub(id_num_re, "({0})".format(new_id_num), split_filename[-2]), 476 | split_filename[-1], 477 | ) 478 | ) 479 | else: 480 | split_filename = os.path.splitext(filename) 481 | 482 | # Reconstruct filename with new id and its extension 483 | filename = "".join(("{0} (2)".format(split_filename[-2]), split_filename[-1])) 484 | return filename 485 | 486 | 487 | def overwrite_file_check(args, filename): 488 | """If filename exists, overwrite or modify it to be unique.""" 489 | if not args["overwrite"] and os.path.exists(filename): 490 | # Confirm overwriting of the file, or modify filename 491 | if args["no_overwrite"]: 492 | overwrite = False 493 | else: 494 | try: 495 | overwrite = confirm_input( 496 | input("Overwrite {0}? (yes/no): ".format(filename)) 497 | ) 498 | except (KeyboardInterrupt, EOFError): 499 | sys.exit() 500 | if not overwrite: 501 | new_filename = modify_filename_id(filename) 502 | while os.path.exists(new_filename): 503 | new_filename = modify_filename_id(new_filename) 504 | return new_filename 505 | return filename 506 | 507 | 508 | def print_text(args, infilenames, outfilename=None): 509 | """Print text content of infiles to stdout. 
510 | 511 | Keyword arguments: 512 | args -- program arguments (dict) 513 | infilenames -- names of user-inputted and/or downloaded files (list) 514 | outfilename -- only used for interface purposes (None) 515 | """ 516 | for infilename in infilenames: 517 | parsed_text = get_parsed_text(args, infilename) 518 | if parsed_text: 519 | for line in parsed_text: 520 | print(line) 521 | print("") 522 | 523 | 524 | def write_pdf_files(args, infilenames, outfilename): 525 | """Write pdf file(s) to disk using pdfkit. 526 | 527 | Keyword arguments: 528 | args -- program arguments (dict) 529 | infilenames -- names of user-inputted and/or downloaded files (list) 530 | outfilename -- name of output pdf file (str) 531 | """ 532 | if not outfilename.endswith(".pdf"): 533 | outfilename = outfilename + ".pdf" 534 | outfilename = overwrite_file_check(args, outfilename) 535 | 536 | options = {"enable-local-file-access": None} 537 | try: 538 | if args["multiple"]: 539 | # Multiple files are written one at a time, so infilenames will 540 | # never contain more than one file here 541 | infilename = infilenames[0] 542 | if not args["quiet"]: 543 | print("Attempting to write to {0}.".format(outfilename)) 544 | else: 545 | options["quiet"] = None 546 | 547 | if args["xpath"]: 548 | # Process HTML with XPath before writing 549 | html = parse_html(read_files(infilename), args["xpath"]) 550 | if isinstance(html, list): 551 | if isinstance(html[0], str): 552 | pk.from_string("\n".join(html), outfilename, options=options) 553 | else: 554 | pk.from_string( 555 | "\n".join(lh.tostring(x, encoding="unicode") for x in html), 556 | outfilename, 557 | options=options, 558 | ) 559 | elif isinstance(html, str): 560 | pk.from_string(html, outfilename, options=options) 561 | else: 562 | pk.from_string(lh.tostring(html, encoding="unicode"), outfilename, options=options) 563 | else: 564 | pk.from_file(infilename, outfilename, options=options) 565 | elif args["single"]: 566 | if not args["quiet"]: 567 | print( 568 | "Attempting to write {0} page(s) to {1}.".format( 569 | len(infilenames), outfilename 570 | ) 571 | ) 572 | else: 573 | options["quiet"] = None 574 | 575 | if args["xpath"]: 576 | # Process HTML with XPath before writing 577 | html = parse_html(read_files(infilenames), args["xpath"]) 578 | if isinstance(html, list): 579 | if isinstance(html[0], str): 580 | pk.from_string("\n".join(html), outfilename, options=options) 581 | else: 582 | pk.from_string( 583 | "\n".join(lh.tostring(x, encoding="unicode") for x in html), 584 | outfilename, 585 | options=options, 586 | ) 587 | elif isinstance(html, str): 588 | pk.from_string(html, outfilename, options=options) 589 | else: 590 | pk.from_string(lh.tostring(html, encoding="unicode"), outfilename, options=options) 591 | else: 592 | pk.from_file(infilenames, outfilename, options=options) 593 | return True 594 | except (OSError, IOError) as err: 595 | sys.stderr.write( 596 | "An error occurred while writing {0}:\n{1}".format(outfilename, str(err)) 597 | ) 598 | return False 599 | 600 | 601 | def write_csv_files(args, infilenames, outfilename): 602 | """Write csv file(s) to disk. 
603 | 604 | Keyword arguments: 605 | args -- program arguments (dict) 606 | infilenames -- names of user-inputted and/or downloaded files (list) 607 | outfilename -- name of output text file (str) 608 | """ 609 | 610 | def csv_convert(line): 611 | """Strip punctuation and insert commas""" 612 | clean_line = [] 613 | for word in line.split(" "): 614 | clean_line.append(word.strip(string.punctuation)) 615 | return ", ".join(clean_line) 616 | 617 | if not outfilename.endswith(".csv"): 618 | outfilename = outfilename + ".csv" 619 | outfilename = overwrite_file_check(args, outfilename) 620 | 621 | all_text = [] # Text must be aggregated if writing to a single output file 622 | for i, infilename in enumerate(infilenames): 623 | parsed_text = get_parsed_text(args, infilename) 624 | if parsed_text: 625 | if args["multiple"]: 626 | if not args["quiet"]: 627 | print("Attempting to write to {0}.".format(outfilename)) 628 | 629 | csv_text = [csv_convert(x) for x in parsed_text] 630 | print(csv_text) 631 | write_file(csv_text, outfilename) 632 | elif args["single"]: 633 | all_text += parsed_text 634 | # Newline added between multiple files being aggregated 635 | if len(infilenames) > 1 and i < len(infilenames) - 1: 636 | all_text.append("\n") 637 | 638 | # Write all text to a single output file 639 | if args["single"] and all_text: 640 | if not args["quiet"]: 641 | print( 642 | "Attempting to write {0} page(s) to {1}.".format( 643 | len(infilenames), outfilename 644 | ) 645 | ) 646 | 647 | csv_text = [csv_convert(x) for x in all_text] 648 | print(csv_text) 649 | write_file(csv_text, outfilename) 650 | 651 | 652 | def write_text_files(args, infilenames, outfilename): 653 | """Write text file(s) to disk. 654 | 655 | Keyword arguments: 656 | args -- program arguments (dict) 657 | infilenames -- names of user-inputted and/or downloaded files (list) 658 | outfilename -- name of output text file (str) 659 | """ 660 | if not outfilename.endswith(".txt"): 661 | outfilename = outfilename + ".txt" 662 | outfilename = overwrite_file_check(args, outfilename) 663 | 664 | all_text = [] # Text must be aggregated if writing to a single output file 665 | for i, infilename in enumerate(infilenames): 666 | parsed_text = get_parsed_text(args, infilename) 667 | if parsed_text: 668 | if args["multiple"]: 669 | if not args["quiet"]: 670 | print("Attempting to write to {0}.".format(outfilename)) 671 | write_file(parsed_text, outfilename) 672 | elif args["single"]: 673 | all_text += parsed_text 674 | # Newline added between multiple files being aggregated 675 | if len(infilenames) > 1 and i < len(infilenames) - 1: 676 | all_text.append("\n") 677 | 678 | # Write all text to a single output file 679 | if args["single"] and all_text: 680 | if not args["quiet"]: 681 | print( 682 | "Attempting to write {0} page(s) to {1}.".format( 683 | len(infilenames), outfilename 684 | ) 685 | ) 686 | write_file(all_text, outfilename) 687 | 688 | 689 | def write_file(data, outfilename): 690 | """Write a single file to disk.""" 691 | if not data: 692 | return False 693 | try: 694 | with open(outfilename, "w") as outfile: 695 | for line in data: 696 | if line: 697 | outfile.write(line) 698 | return True 699 | except (OSError, IOError) as err: 700 | sys.stderr.write( 701 | "An error occurred while writing {0}:\n{1}".format(outfilename, str(err)) 702 | ) 703 | return False 704 | 705 | 706 | def get_num_part_files(): 707 | """Get the number of PART.html files currently saved to disk.""" 708 | num_parts = 0 709 | for filename in 
os.listdir(os.getcwd()): 710 | if filename.startswith("PART") and filename.endswith(".html"): 711 | num_parts += 1 712 | return num_parts 713 | 714 | 715 | def write_part_images(url, raw_html, html, filename): 716 | """Write image file(s) associated with HTML to disk, substituting filenames. 717 | 718 | Keyword arguments: 719 | url -- the URL from which the HTML was extracted (str) 720 | raw_html -- unparsed HTML file content (str) 721 | html -- parsed HTML file content (lxml.html.HtmlElement) 722 | filename -- the PART.html filename (str) 723 | 724 | Return raw HTML with image names replaced with local image filenames. 725 | """ 726 | save_dirname = "{0}_files".format(os.path.splitext(filename)[0]) 727 | if not os.path.exists(save_dirname): 728 | os.makedirs(save_dirname) 729 | images = html.xpath("//img/@src") 730 | internal_image_urls = [x for x in images if x.startswith("/")] 731 | 732 | headers = {"User-Agent": random.choice(USER_AGENTS)} 733 | for img_url in images: 734 | img_name = img_url.split("/")[-1] 735 | if "?" in img_name: 736 | img_name = img_name.split("?")[0] 737 | if not os.path.splitext(img_name)[1]: 738 | img_name = "{0}.jpeg".format(img_name) 739 | 740 | try: 741 | full_img_name = os.path.join(save_dirname, img_name) 742 | with open(full_img_name, "wb") as img: 743 | if img_url in internal_image_urls: 744 | # Internal images need base url added 745 | full_img_url = "{0}{1}".format(url.rstrip("/"), img_url) 746 | else: 747 | # External image 748 | full_img_url = img_url 749 | img_content = requests.get( 750 | full_img_url, headers=headers, proxies=get_proxies() 751 | ).content 752 | img.write(img_content) 753 | raw_html = raw_html.replace(escape(img_url), full_img_name) 754 | except (OSError, IOError): 755 | pass 756 | time.sleep(random.uniform(0, 0.5)) # Slight delay between downloads 757 | return raw_html 758 | 759 | 760 | def write_part_file(args, url, raw_html, html=None, part_num=None): 761 | """Write PART.html file(s) to disk, images in PART_files directory. 
762 | 763 | Keyword arguments: 764 | args -- program arguments (dict) 765 | raw_html -- unparsed HTML file content (str) 766 | html -- parsed HTML file content (lxml.html.HtmlElement) (default: None) 767 | part_num -- PART(#).html file number (int) (default: None) 768 | """ 769 | if part_num is None: 770 | part_num = get_num_part_files() + 1 771 | filename = "PART{0}.html".format(part_num) 772 | 773 | # Decode bytes to str under Python 3 774 | if not PY2 and isinstance(raw_html, bytes): 775 | raw_html = raw_html.decode("ascii", "ignore") 776 | 777 | # Convert html to an lh.HtmlElement object for parsing/saving images 778 | if html is None: 779 | html = lh.fromstring(raw_html) 780 | 781 | # Parse HTML if XPath entered 782 | if args["xpath"]: 783 | raw_html = parse_html(html, args["xpath"]) 784 | if isinstance(raw_html, list): 785 | if not isinstance(raw_html[0], lh.HtmlElement): 786 | raise ValueError("XPath should return an HtmlElement object.") 787 | else: 788 | if not isinstance(raw_html, lh.HtmlElement): 789 | raise ValueError("XPath should return an HtmlElement object.") 790 | 791 | # Write HTML and possibly images to disk 792 | if raw_html: 793 | if not args["no_images"] and (args["pdf"] or args["html"]): 794 | raw_html = write_part_images(url, raw_html, html, filename) 795 | with open(filename, "w") as part: 796 | if not isinstance(raw_html, list): 797 | raw_html = [raw_html] 798 | if isinstance(raw_html[0], lh.HtmlElement): 799 | for elem in raw_html: 800 | part.write(lh.tostring(elem, encoding="unicode")) 801 | else: 802 | for line in raw_html: 803 | part.write(line) 804 | 805 | 806 | def get_part_filenames(num_parts=None, start_num=0): 807 | """Get numbered PART.html filenames.""" 808 | if num_parts is None: 809 | num_parts = get_num_part_files() 810 | return ["PART{0}.html".format(i) for i in range(start_num + 1, num_parts + 1)] 811 | 812 | 813 | def read_files(filenames): 814 | """Read a file into memory; with a list, return the first file's contents.""" 815 | if isinstance(filenames, list): 816 | for filename in filenames: 817 | with open(filename, "r") as infile: 818 | return infile.read() 819 | else: 820 | with open(filenames, "r") as infile: 821 | return infile.read() 822 | 823 | 824 | def remove_part_images(filename): 825 | """Remove PART(#)_files directory containing images from disk.""" 826 | dirname = "{0}_files".format(os.path.splitext(filename)[0]) 827 | if os.path.exists(dirname): 828 | shutil.rmtree(dirname) 829 | 830 | 831 | def remove_part_files(num_parts=None): 832 | """Remove PART(#).html files and image directories from disk.""" 833 | filenames = get_part_filenames(num_parts) 834 | for filename in filenames: 835 | remove_part_images(filename) 836 | remove_file(filename) 837 | 838 | 839 | # User input and sanitization functions 840 | # 841 | 842 | 843 | def confirm_input(user_input): 844 | """Check user input for yes, no, or an exit signal.""" 845 | if isinstance(user_input, list): 846 | user_input = "".join(user_input) 847 | 848 | try: 849 | u_inp = user_input.lower().strip() 850 | except AttributeError: 851 | u_inp = user_input 852 | 853 | # Check for exit signal 854 | if u_inp in ("q", "quit", "exit"): 855 | sys.exit() 856 | if u_inp in ("y", "yes"): 857 | return True 858 | return False 859 | 860 | 861 | # Miscellaneous functions 862 | # 863 | 864 | 865 | def mkdir_and_cd(dirname): 866 | """Change directory and/or create it if necessary.""" 867 | if not os.path.exists(dirname): 868 | os.makedirs(dirname) 869 | os.chdir(dirname) 870 | else: 871 | os.chdir(dirname) 872 | 
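# --- Illustrative usage sketch (editor's addition, not part of utils.py) ---
# A minimal look at how the URL and filename helpers defined above compose,
# assuming the package layout shown in this repository (scrape/utils.py is
# importable as scrape.utils). The input values are hypothetical; the expected
# outputs in the comments are derived from the function bodies above.

from scrape import utils

url = utils.add_protocol(utils.add_url_suffix("example"))
print(url)  # http://example.com (".com" appended, then "http://" prefixed)
print(utils.clean_url("/docs/page#intro", base_url=url))  # http://example.com/docs/page
print(utils.get_outfilename(url + "/docs/getting-started.html"))  # example-getting-started
print(utils.modify_filename_id("report.txt"))  # report (2).txt
print(utils.modify_filename_id("report (2).txt"))  # report (3).txt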
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | import scrape 5 | import os 6 | 7 | 8 | def read(*names): 9 | values = dict() 10 | extensions = [".txt", ".rst"] 11 | for name in names: 12 | value = "" 13 | for extension in extensions: 14 | filename = name + extension 15 | if os.path.isfile(filename): 16 | value = open(name + extension).read() 17 | break 18 | values[name] = value 19 | return values 20 | 21 | 22 | with open( 23 | os.path.join(os.path.abspath(os.path.dirname(__file__)), "README.rst"), 24 | encoding="utf-8", 25 | ) as f: 26 | long_description = f.read() 27 | 28 | 29 | setup( 30 | name="scrape", 31 | version=scrape.__version__, 32 | description="a command-line web scraping tool", 33 | long_description=long_description, 34 | long_description_content_type="text/x-rst", 35 | classifiers=[ 36 | "Development Status :: 4 - Beta", 37 | "Environment :: Console", 38 | "Environment :: Web Environment", 39 | "Intended Audience :: End Users/Desktop", 40 | "Intended Audience :: Developers", 41 | "Intended Audience :: System Administrators", 42 | "License :: OSI Approved :: MIT License", 43 | "Operating System :: OS Independent", 44 | "Programming Language :: Python :: 3", 45 | "Programming Language :: Python :: 3.6", 46 | "Programming Language :: Python :: 3.7", 47 | "Programming Language :: Python :: 3.8", 48 | "Programming Language :: Python :: 3.9", 49 | "Topic :: Utilities", 50 | "Topic :: Text Processing", 51 | ], 52 | keywords="web crawler scraper scrape crawl download filter save webpages websites images docs document documentation pdf csv html lxml", 53 | author="Hunter H", 54 | author_email="huntrar@gmail.com", 55 | maintainer="Hunter H", 56 | maintainer_email="huntrar@gmail.com", 57 | url="https://github.com/huntrar/scrape", 58 | license="MIT", 59 | packages=find_packages(), 60 | entry_points={"console_scripts": ["scrape = scrape.scrape:command_line_runner"]}, 61 | install_requires=["lxml", "pdfkit", "requests", "six", "tldextract"], 62 | ) 63 | -------------------------------------------------------------------------------- /testing/admissions.html: -------------------------------------------------------------------------------- 1 | 2 | ADMISSION TO ONLINE COLLEGE 3 |

4 | Aplicants are considered for admission to Online College 5 | on the basis of their ISP, quality of their home pages and 6 | quantity of emails exchanged per day. 7 |

8 | It is recommended that students prepare for enrollment in 9 | Online College by signing up for DSL service and 10 | buying a new computer. 11 |

12 | Back to Online College home page 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /testing/courses.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | ONLINE COLLEGE COURSES: 5 |

6 | Online College offers degrees in 7 | beginning and advanced Web-surfing, 8 | email writing courses and the sociology of 9 | chat rooms. 10 |

11 | Online College provides its students with many 12 | opportunities for extracurricular activities.
Check 13 | the admissions criteria to see if 14 | you qualify to enroll in Online College. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /testing/dropouts.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Sorry, there are no dropouts from Online College! 4 | Maybe they are too busy with 5 | extracurricular activities to find the 6 | time to drop out. 7 |
8 | home 9 | 10 | 11 | -------------------------------------------------------------------------------- /testing/extra.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Students at Online College 4 | participate 5 | in 6 | 7 | 8 | a large number of extracurricular 9 | activities.

The favorite activities are 10 | dating, partying, and doing laundry. 11 |
Online College is proud to be the 12 | national leader in cutting classes. 13 | 14 | To see why, check out our course 15 | offerings.

16 | Back to Online Home page 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /testing/faculty.html: -------------------------------------------------------------------------------- 1 | 2 | Comments from faculty at Online College: 3 | 4 |

"Online College students are exceptionally diligent. 5 | They take Web-surfing very seriously."

6 | "Students seem to prefer sending me emails to submitting 7 | term papers." 8 |

back to Online College home page 9 |

10 | Do you want to hear from out dropouts? 11 | 12 | -------------------------------------------------------------------------------- /testing/home.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | WELCOME TO ONLINE COLLEGE! 4 |

5 | Online College is a senior college in the Online University. 6 | More than 200,000 students are not enrolled in the academic 7 | programs offered through the Online University. 8 |

9 | Prospective students should apply for admission. 10 |
11 | We offer a full schedule of undergraduate 12 | courses as well as extracurricular 13 | ativities. 14 |

You can read testimonials from 15 | students, 16 | faculty and dropouts. 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /testing/students.html: -------------------------------------------------------------------------------- 1 | See what our students have to 2 | say 3 | about 4 | Online 5 | College:

6 | 7 | "Online College is the best! My professors showed me lots of 8 | cool Web sites and were always available for a chat." 9 |

10 | "My favorite course is Physics of Cyberspace." 11 |

12 | "When you take virtual classes, you don't have to watch anyone 13 | yawn." 14 |

15 |

16 | Try to contact our alumnae. 17 | Tell them we would appreciate a donation. 18 |

19 | 20 | Back to Online College home page 21 | -------------------------------------------------------------------------------- /testing/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huntrar/scrape/1dfd98bb0a308ef2a45b1e5dd136c38b17c27bc7/testing/test.pdf -------------------------------------------------------------------------------- /testing/test.txt: -------------------------------------------------------------------------------- 1 | ADMISSION TO ONLINE COLLEGE 2 | Aplicants are considered for admission to Online College 3 | on the basis of their ISP, quality of their home pages and 4 | quantity of emails exchanged per day. 5 | It is recommended that students prepare for enrollment in 6 | Online College by signing up for DSL service and 7 | buying a new computer. 8 | Back to Online College home page 9 | 10 | ONLINE COLLEGE COURSES: 11 | Online College offers degrees in 12 | beginning and advanced Web-surfing, 13 | email writing courses and the sociology of 14 | chat rooms. 15 | Online College provides its students with many 16 | opportunities for 17 | extracurricular activities 18 | . 19 | Check 20 | the 21 | admissions criteria 22 | to see if 23 | you qualify to enroll in Online College. 24 | 25 | Sorry, there are no dropouts from Online College! 26 | Maybe they are too busy with 27 | extracurricular activities 28 | to find the 29 | time to drop out. 30 | home 31 | 32 | Students at Online College 33 | participate 34 | in 35 | 36 | a large number of extracurricular 37 | activities. 38 | The favorite activities are 39 | dating, partying, and doing laundry. 40 | Online College is proud to be the 41 | national leader in cutting classes. 42 | To see why, check out our 43 | course 44 | offerings 45 | . 46 | Back to Online Home page 47 | 48 | Comments from faculty at Online College: 49 | "Online College students are exceptionally diligent. 50 | They take Web-surfing very seriously." 51 | "Students seem to prefer sending me emails to submitting 52 | term papers." 53 | back to Online College home page 54 | Do you want to hear from out 55 | dropouts? 56 | 57 | WELCOME TO ONLINE COLLEGE! 58 | Online College is a senior college in the Online University. 59 | More than 200,000 students are not enrolled in the academic 60 | programs offered through the Online University. 61 | Prospective students should apply for 62 | admission 63 | . 64 | We offer a full schedule of 65 | undergraduate 66 | courses 67 | as well as 68 | extracurricular 69 | ativities. 70 | You can read testimonials from 71 | students, 72 | faculty 73 | and 74 | dropouts 75 | . 76 | 77 | See what our students have to 78 | say 79 | about 80 | Online 81 | College: 82 | "Online College is the best! My professors showed me lots of 83 | cool Web sites and were always available for a chat." 84 | "My favorite course is Physics of Cyberspace." 85 | "When you take virtual classes, you don't have to watch anyone 86 | yawn." 87 | Try to contact our 88 | alumnae. 89 | Tell them we would appreciate a donation. 90 | Back to Online College home page 91 | 92 | One, two 93 | Buckle my shoe. 94 | 95 | Three, four 96 | Shut the door. 97 | Five, six 98 | Pick up sticks. -------------------------------------------------------------------------------- /testing/test1.html: -------------------------------------------------------------------------------- 1 | One, two 2 | Buckle my shoe. 3 | < P > 4 | Three, four 5 | Shut the door. 6 | Five, six 7 | Pick up sticks. 
8 | -------------------------------------------------------------------------------- /testing/test_scrape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Unit tests for scrape""" 4 | 5 | import os 6 | import shutil 7 | import sys 8 | import unittest 9 | 10 | from scrape import scrape, utils 11 | 12 | 13 | class ScrapeTestCase(unittest.TestCase): 14 | def call_scrape(self, cmd, filetype, num_files=None): 15 | if not isinstance(cmd, list): 16 | cmd = [cmd] 17 | parser = scrape.get_parser() 18 | args = vars(parser.parse_args(cmd)) 19 | 20 | args["overwrite"] = True # Avoid overwrite prompt 21 | if args["crawl"] or args["crawl_all"]: 22 | args["no_images"] = True # Avoid save image prompt when crawling 23 | args[filetype] = True 24 | if num_files is not None: 25 | args[num_files] = True 26 | return scrape.scrape(args) 27 | 28 | def setUp(self): 29 | self.original_files = os.listdir(os.getcwd()) 30 | self.html_files = [x for x in self.original_files if x.endswith(".html")] 31 | self.text_files = [x for x in self.original_files if x.endswith(".txt")] 32 | self.query = self.html_files + self.text_files 33 | 34 | def tearDown(self): 35 | pass 36 | 37 | def assert_exists_and_rm(self, filename): 38 | self.assertTrue(os.path.isfile(filename)) 39 | if filename not in self.original_files: 40 | self.assertTrue(utils.remove_file(filename)) 41 | 42 | def delete_subdir(self, domain): 43 | """Delete subdirectory containing HTML files if no other data in it""" 44 | subdir_path = "{0}/{1}".format(os.getcwd(), domain) 45 | files = os.listdir(subdir_path) 46 | files_to_rm = [x for x in files if x.startswith("PART") and x.endswith(".html")] 47 | 48 | if len(files_to_rm) != len(files): 49 | for filename in files_to_rm: 50 | os.remove(filename) 51 | else: 52 | shutil.rmtree(subdir_path) 53 | 54 | def get_single_outfilename(self, query): 55 | """Use first possible entry in query as filename""" 56 | if not isinstance(query, list): 57 | query = [query] 58 | for arg in query: 59 | if arg in self.html_files or arg in self.text_files: 60 | return (".".join(arg.split(".")[:-1])).lower() 61 | sys.stderr.write("Failed to construct a single out filename.\n") 62 | return "" 63 | 64 | """to_pdf functions require wkhtmltopdf executable to run 65 | def test_query_to_multi_pdf(self): 66 | self.call_scrape(self.query, 'pdf', 'multiple') 67 | for filename in self.html_files + self.text_files: 68 | outfilename = '.'.join(filename.split('.')[:-1]) + '.pdf' 69 | self.assert_exists_and_rm(outfilename) 70 | 71 | def test_query_to_single_pdf(self): 72 | self.call_scrape(self.query, 'pdf', 'single') 73 | outfilename = self.get_single_outfilename(self.query) + '.pdf' 74 | self.assert_exists_and_rm(outfilename) 75 | 76 | def test_html_to_pdf(self): 77 | self.call_scrape(self.html_files, 'pdf') 78 | outfilenames = [x.replace('.html', '.pdf') for x in self.html_files] 79 | 80 | # Assert new files have been created, then assert their deletion 81 | for outfilename in outfilenames: 82 | self.assert_exists_and_rm(outfilename) 83 | 84 | def test_text_to_pdf(self): 85 | self.call_scrape(self.text_files, 'pdf') 86 | outfilenames = [x.replace('.txt', '.pdf') for x in self.text_files] 87 | 88 | # Assert new files have been created, then assert their deletion 89 | for outfilename in outfilenames: 90 | self.assert_exists_and_rm(outfilename) 91 | """ 92 | 93 | def test_query_to_multi_text(self): 94 | self.call_scrape(self.query, "text", "multiple") 95 | for filename in 
self.html_files + self.text_files: 96 | outfilename = ".".join(filename.split(".")[:-1]) + ".txt" 97 | self.assert_exists_and_rm(outfilename) 98 | 99 | def test_query_to_single_text(self): 100 | self.call_scrape(self.query, "text", "single") 101 | outfilename = self.get_single_outfilename(self.query) + ".txt" 102 | self.assert_exists_and_rm(outfilename) 103 | 104 | def test_html_to_text(self): 105 | self.call_scrape(self.html_files, "text") 106 | outfilenames = [x.replace(".html", ".txt") for x in self.html_files] 107 | 108 | # Assert new files have been created, then assert their deletion 109 | for outfilename in outfilenames: 110 | self.assert_exists_and_rm(outfilename) 111 | 112 | 113 | if __name__ == "__main__": 114 | unittest.main() 115 | --------------------------------------------------------------------------------
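# --- Illustrative usage sketch (editor's addition, not part of test_scrape.py) ---
# The tests above drive scrape programmatically through call_scrape(); the sketch
# below mirrors that pattern outside the test harness, assuming the entry points
# shown in this repository (scrape.get_parser and scrape.scrape). The input
# filenames are stand-ins for local HTML pages such as the fixtures above, and
# the flag keys ("overwrite", "text", "multiple") are the same ones call_scrape()
# sets on the parsed argument dict.

from scrape import scrape

parser = scrape.get_parser()
args = vars(parser.parse_args(["home.html", "courses.html"]))
args["overwrite"] = True  # skip the interactive overwrite prompt, as the tests do
args["text"] = True       # request .txt output (the filetype flag used by the text tests)
args["multiple"] = True   # write one output file per input file
scrape.scrape(args)

# The suite itself can be run from the testing/ directory, where the fixture
# files it collects in setUp() live, e.g.:
#     python test_scrape.py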