├── .github └── workflows │ └── pythonpackage.yml ├── .gitignore ├── CHANGES.txt ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── requirements.txt ├── scrape ├── __init__.py ├── crawler.py ├── orderedset.py ├── scrape.py └── utils.py ├── setup.py └── testing ├── admissions.html ├── courses.html ├── dropouts.html ├── extra.html ├── faculty.html ├── home.html ├── students.html ├── test.pdf ├── test.txt ├── test1.html └── test_scrape.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: [3.6, 3.7, 3.8, 3.9] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | - name: Lint with flake8 25 | run: | 26 | pip install flake8 27 | # stop the build if there are Python syntax errors or undefined names 28 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 30 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Test with nose 32 | run: | 33 | pip install nose 34 | nosetests 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask instance folder 57 | instance/ 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | 65 | # IPython Notebook 66 | .ipynb_checkpoints 67 | 68 | # pyenv 69 | .python-version 70 | 71 | # dotenv 72 | .env 73 | 74 | # Vim 75 | # swap 76 | [._]*.s[a-w][a-z] 77 | [._]s[a-w][a-z] 78 | # session 79 | Session.vim 80 | # temporary 81 | .netrwhist 82 | *~ 83 | # auto-generated tag files 84 | tags 85 | 86 | # Backup files 87 | *.bak 88 | 89 | # Local files 90 | *.local* 91 | 92 | # BitTorrent sync 93 | *.bts 94 | 95 | # PyCharm 96 | *.idea/ 97 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | 0.11.3 2 | ------ 3 | 4 | - Adjust to relocation of module for container abstract base classes 5 | 6 | 0.11.2 7 | ------ 8 | 9 | - Bump lxml from 4.6.3 to 4.6.5 to fix security vulnerability 10 | 11 | 0.11.1 12 | ------ 13 | 14 | - Bump lxml from 4.6.2 to 4.6.3 to fix security vulnerability 15 | - Retire support for Python 3.5 due to lxml 4.6.3 incompatibility 16 | 17 | 0.11.0 18 | ------ 19 | 20 | - Retire support for Python 2 21 | - Retire Travis CI build and enable Python 3.8 and 3.9 in GitHub Actions 22 | - Enable local file access for wkhtmltopdf to fix failure in embedding images in PDFs 23 | 24 | 0.10.2 25 | ------ 26 | 27 | - Bump lxml from 4.3.0 to 4.6.2 for security patch 28 | - Remove support for Python 3.4 as not supported in latest lxml version 29 | 30 | 0.10.1 31 | ------ 32 | 33 | - Bugfix: TypeError when attempting to hash unencoded Unicode-objects 34 | 35 | 0.10.0 36 | ------ 37 | 38 | - Test python 3.7 and 3.8 in Travis CI/GitHub Actions 39 | - Replace cgi.escape with html.escape in Python 3 due to removal of cgi.escape in 3.8 40 | - Reformat using Black 41 | 42 | 0.9.15 43 | ------ 44 | 45 | - travis CI does not support 3.7 yet, removing that version from build 46 | 47 | 0.9.14 48 | ------ 49 | 50 | - added versions 3.6 and 3.7 to travis CI build, removed 2.6 and 3.3 51 | - 2.6 and 3.3 deprecated by lxml 52 | 53 | 0.9.13 54 | ------ 55 | 56 | - 3.7 added as supported version in setup 57 | - Updated LICENSE and requirements.txt 58 | 59 | 0.9.12 60 | ------ 61 | 62 | - 3.6 added as supported version in setup 63 | - Updated LICENSE 64 | 65 | 0.9.11 66 | ------ 67 | 68 | - Bugfix: MissingSchema during requests get 69 | - Bugfix: Check for Python 2 should have been for Python 3 70 | 71 | 0.9.10 72 | ------ 73 | 74 | - More refactoring 75 | 76 | 0.9.9 77 | ------ 78 | 79 | - Converted markdown README to rst 80 | 81 | 0.9.8 82 | ------ 83 | 84 | - Changed Utility classifier to Utilities 85 | 86 | 0.9.7 87 | ------ 88 | 89 | - Replaced compat.py with six module 90 | - Made imports relative rather than from PATH 91 | - More refactoring 92 | 93 | 0.9.6 94 | ------ 95 | 96 | - Bugfix: Remove non-links through filtering by protocol 97 | - Refactorings 98 | 99 | 0.9.5 100 | ------ 101 | 102 | - Bugfix: Properly join internal and base URLs for crawling 103 | 104 | 0.9.4 105 | ------ 106 | 107 | - Retired support for 3.2 as tldextract doesn't 
support it 108 | 109 | 0.9.3 110 | ------ 111 | 112 | - Moved crawling functions into a Crawler class 113 | - General refactorings to docstrings, function names, etc. 114 | - Consolidated max_pages and max_links arguments as max_crawls 115 | - Added tldextract module for getting URL domain, suffixes 116 | 117 | 0.9.2 118 | ------ 119 | 120 | - Added compat.py file 121 | - Moved compatible builtin definitions to __init__ 122 | - Added requests cache 123 | 124 | 0.9.1 125 | ------ 126 | 127 | - Updated version in requirements and setup keywords 128 | - Removed --use-mirrors for 3.5 support 129 | 130 | 0.9.0 131 | ------ 132 | 133 | - Bugfix: Fixed comparison of duplicate URLs when crawling 134 | 135 | 0.8.11 136 | ------ 137 | 138 | - Bugfix: Improper check of domain when being restrictive 139 | 140 | 0.8.10 141 | ------ 142 | 143 | - Strip '/' from end of urls when crawling 144 | 145 | 0.8.9 146 | ------ 147 | 148 | - Added argument for cache link size & fixed up others 149 | 150 | 0.8.8 151 | ------ 152 | 153 | - Updated README and setup 154 | 155 | 0.8.7 156 | ------ 157 | 158 | - added CSV as a format 159 | 160 | 0.8.6 161 | ------ 162 | 163 | - added environ variable SCRAPE_DISABLE_IMGS to not save images 164 | 165 | 0.8.5 166 | ------ 167 | 168 | - warn user that saving images during crawling is slow 169 | 170 | 0.8.4 171 | ------ 172 | 173 | - moved print_text() from crawl.py back to scrape.py 174 | 175 | 0.8.3 176 | ------ 177 | 178 | - fixed bad formatting in readme usage 179 | 180 | 0.8.2 181 | ------ 182 | 183 | - ignore-load-errors removed from wkhtmltopdf executable 184 | 185 | 0.8.1 186 | ------ 187 | 188 | - removed extra schema adding 189 | 190 | 0.8.0 191 | ------ 192 | 193 | - fixed bug where added url schema not reflected in query 194 | 195 | 0.7.9 196 | ------ 197 | 198 | - moved file crawling to new file 199 | - avoid overwrite prompt in tests 200 | 201 | 0.7.8 202 | ------ 203 | 204 | - updated program description 205 | - removed overwriting test due to issues with it 206 | 207 | 0.7.7 208 | ------ 209 | 210 | - no longer defaults to overwriting files, added program flags/a prompt 211 | - adding renaming mechanism if choosing to not overwrite a file 212 | - some function reorganizing 213 | 214 | 0.7.6 215 | ------ 216 | 217 | - added print text to stdout option 218 | - removed extra newline appended in re_filter 219 | - wrapped pdfkit import in try/except as it isnt essential 220 | 221 | 0.7.5 222 | ------ 223 | 224 | - removed extra urlparse import 225 | 226 | 0.7.4 227 | ------ 228 | 229 | - added option to not save images 230 | - images are now only saved if saving to HTML or PDF 231 | - checks if outfilename has extension before adding new one 232 | - fixed domains being sometimes mismatched to urls 233 | - fixed extension being unnecessary appended to urls (for the most part) 234 | 235 | 0.7.3 236 | ------ 237 | 238 | - development status reverted to beta 239 | 240 | 0.7.2 241 | ------ 242 | 243 | - now saves images with PART.html files (but not css yet) 244 | - added module level docstrings 245 | 246 | 0.7.1 247 | ------ 248 | 249 | - added EOFError handling 250 | 251 | 0.7.0 252 | ------ 253 | 254 | - fixed crawl not returning filenames to add to infilenames 255 | - fixed re_filter adding duplicate matches 256 | - fixed domain unboundlocalerror 257 | 258 | 0.6.9 259 | ------ 260 | 261 | - fixed bug where query not found in urls due to trailing / 262 | 263 | 0.6.8 264 | ------ 265 | 266 | - updated program usage 267 | 268 | 0.6.7 269 | ------ 270 | 271 | - fixed 
bounds check on out file names 272 | 273 | 0.6.6 274 | ------ 275 | 276 | - added out file names as a program argument 277 | - fixed bug where re-writing multiple files 278 | - fixed bug where writing only the first file when writing single file 279 | 280 | 0.6.5 281 | ------ 282 | 283 | - major improvement to remove_whitespace() 284 | 285 | 0.6.4 286 | ------ 287 | 288 | - more docstring improvements 289 | 290 | 0.6.3 291 | ------ 292 | 293 | - began process of making docstrings conform to pep257 294 | - increased size of link cache from 10 to 100 295 | - remove the newline at start of text files 296 | - add newlines between lines filtered by regex 297 | - remove_whitespace now removes newlines that are 3 in a row or more 298 | 299 | 0.6.2 300 | ------ 301 | 302 | - stylistic changes 303 | - files are now read in 1K chunks 304 | 305 | 0.6.1 306 | ------ 307 | 308 | - remove consecutive whitespace before writing text files 309 | - empty text files no longer written 310 | 311 | 0.6.0 312 | ------ 313 | 314 | - fixed bug where single out file name wasn't properly constructed 315 | - out file names are all returned as lowercase now 316 | 317 | 0.5.9 318 | ------ 319 | 320 | - fixed bug where text wouldn't write unless xpath specified 321 | 322 | 0.5.8 323 | ------ 324 | 325 | - can now parse HTML using XPath and save to all formats 326 | - remove carriage returns in scraped text files 327 | 328 | 0.5.7 329 | ------ 330 | 331 | - added maximum out file name length of 24 characters 332 | 333 | 0.5.6 334 | ------ 335 | 336 | - fixed urls not being properly added under file_types 337 | 338 | 0.5.5 339 | ------ 340 | 341 | - fixed UnboundLocalError in write_single_file 342 | 343 | 0.5.4 344 | ------ 345 | 346 | - fixed redefinition of out_file_name in write_to_text 347 | 348 | 0.5.3 349 | ------ 350 | 351 | - fixed IndexError in write_to_text 352 | 353 | 0.5.2 354 | ------ 355 | 356 | - small fix for finding single out file name 357 | 358 | 0.5.1 359 | ------ 360 | 361 | - remade method to find single out file name 362 | 363 | 0.5.0 364 | ------ 365 | 366 | - can now save to single or multiple output files/directories 367 | - added tests for writing to single or multiple files 368 | - preserves original lines/newlines when parsing/writing files 369 | 370 | 0.4.11 371 | ------ 372 | 373 | - changed generator.next() to next(generator) for python 3 compatibility 374 | 375 | 0.4.10 376 | ------ 377 | 378 | - forgot to remove all occurrences of xrange 379 | 380 | 0.4.9 381 | ------ 382 | 383 | - changed unicode decode to ascii decode when writing html to disk 384 | 385 | 0.4.8 386 | ------ 387 | 388 | - added missing python 3 compatibilities 389 | 390 | 0.4.7 391 | ------ 392 | 393 | - fixed urlparse importerror in utils.py for python 3 users 394 | 395 | 0.4.6 396 | ------ 397 | 398 | - fixed html => text 399 | - all conversions fixed, test_scrape.py added to keep it this way 400 | - added pdfkit to requirements.txt 401 | 402 | 0.4.5 403 | ------ 404 | 405 | - added docstrings to all functions 406 | - fixed IOError when trying to convert local html to html 407 | - fixed IOError when trying to convert local html to pdf 408 | - fixed saving scraped files to text, was saving PART filenames instead 409 | 410 | 0.4.4 411 | ------ 412 | 413 | - prompts for filetype from user if none entered 414 | - modularized a couple functions 415 | 416 | 0.4.3 417 | ------ 418 | 419 | - fixed out_file naming 420 | - pep8 and pylint reformatting 421 | 422 | 0.4.2 423 | ------ 424 | 425 | - removed read_part_files in place 
of get_part_files as pdfkit reads filenames 426 | 427 | 0.4.1 428 | ------ 429 | 430 | - fixed bug preventing writing scraped urls to pdf 431 | 432 | 0.4.0 433 | ------ 434 | 435 | - can now read in text and filter it 436 | - recognizes local files, no need for user to enter special flag 437 | - moved html/ files to testing/ and added a text file to it 438 | - added better distinction between input and output files 439 | - changed instances of file to f_name in utils 440 | - pep8 reformatting 441 | 442 | 0.3.9 443 | ------ 444 | 445 | - add scheme to urls if none present 446 | - fixed bug where raw_html was calling get_html rather than get_raw_html 447 | 448 | 0.3.8 449 | ------ 450 | 451 | - made distinction between links and pages with multiple links on them 452 | - use --maxpages to set the maximum number of pages to get links from 453 | - use --maxlinks to set the maximum number of links to parse 454 | - improved the argument help messages 455 | - improved notes/description in README 456 | 457 | 0.3.7 458 | ------ 459 | 460 | - fixes to page caching and writing PART files 461 | - use --local to read in local html files 462 | - use --max to indicate max number of pages to crawl 463 | - changed program description and keywords 464 | 465 | 0.3.6 466 | ------ 467 | 468 | - cleanup using pylint as reference 469 | 470 | 0.3.5 471 | ------ 472 | 473 | - updated long program description in readme 474 | - added pypi monthly downloads image in readme 475 | 476 | 0.3.4 477 | ------ 478 | 479 | - updated description header in readme 480 | 481 | 0.3.3 482 | ------ 483 | 484 | - added file conversion to program description 485 | 486 | 0.3.2 487 | ------ 488 | 489 | - added travis-ci build status to readme 490 | 491 | 0.3.1 492 | ------ 493 | 494 | - updated program description and added extra installation instructions 495 | - added .travis.yml and requirements.txt 496 | 497 | 0.3.0 498 | ------ 499 | 500 | - added read option for user inputted html files, currently writes files individually and not grouped, to do next is add grouping option 501 | - added html/ directory containing test html files 502 | - made relative imports explicit using absolute_import 503 | - added proxies to utils.py 504 | 505 | 0.2.10 506 | ------ 507 | 508 | - moved OrderedSet class to orderedset.py rather than utils.py 509 | 510 | 0.2.9 511 | ------ 512 | 513 | - updated program description and keywords in setup.py 514 | 515 | 0.2.8 516 | ------ 517 | 518 | - restricts crawling to seed domain by default, changed --strict to --nonstrict for crawling outside given website 519 | 520 | 0.2.5 521 | ------ 522 | 523 | - added requests to install_requires in setup.py 524 | 525 | 0.2.4 526 | ------ 527 | 528 | - added attributes flag which specifies which tag attributes to extract from a given page, such as text, href, etc. 
529 | 530 | 0.2.3 531 | ------ 532 | 533 | - updated flags and flag help messages 534 | - verbose now by default and reduced number of messages, use --quiet to silence messages 535 | - changed name of --files flag to --html for saving output as html 536 | - added --text flag, default is still text 537 | 538 | 0.2.2 539 | ------ 540 | 541 | - fixed character encoding issue, all unicode now 542 | 543 | 0.2.1 544 | ------ 545 | 546 | - improvements to exception handling for proper PART file removal 547 | 548 | 0.2.0 549 | ------ 550 | 551 | - pages are now saved as they are crawled to PART.html files and processed/removed as necessary, this greatly saves on program memory 552 | - added a page cache with a limit of 10 for greater duplicate protection 553 | - added --files option for keeping webpages as PART.html instead of saving as text or pdf, this also organizes them into a subdirectory named after the seed url's domain 554 | - changed --restrict flag to --strict for restricting the domain to the seed domain while crawling 555 | - more --verbose messages being printed 556 | 557 | 0.1.10 558 | ------ 559 | 560 | - now compares urls scheme-less before updating links to prevent http:// and https:// duplicates and replaced set_scheme with remove_scheme in utils.py 561 | - renamed write_pages to write_links 562 | 563 | 0.1.9 564 | ------ 565 | 566 | - added behavior for --crawl keywords in crawl method 567 | - added a domain check before outputting crawled message or adding to crawled links 568 | - domain key in args is now set to base domain for proper --restrict behavior 569 | - clean_url now rstrips / character for proper link crawling 570 | - resolve_url now rstrips / character for proper out_file writing 571 | - updated description of --crawl flag 572 | 573 | 0.1.8 574 | ------ 575 | 576 | - removed url fragments 577 | - replaced set_base with urlparse method urljoin 578 | - out_file name construction now uses urlparse 'path' member 579 | - raw_links is now an OrderedSet to try to eliminate as much processing as possible 580 | - added clear method to OrderedSet in utils.py 581 | 582 | 0.1.7 583 | ------ 584 | 585 | - removed validate_domain and replaced it with a lambda instead 586 | - replaced domain with base_url in set_base as should have been done before 587 | - crawled message no longer prints if url was a duplicate 588 | 589 | 0.1.6 590 | ------ 591 | 592 | - uncommented import __version__ 593 | 594 | 0.1.5 595 | ------ 596 | 597 | - set_domain was replaced by set_base, proper solution for links that are relative 598 | - fixed verbose behavior 599 | - updated description in README 600 | 601 | 0.1.4 602 | ------ 603 | 604 | - fixed output file generation, was using domain instead of base_url 605 | - minor code cleanup 606 | 607 | 0.1.3 608 | ------ 609 | 610 | - blank lines are no longer written to text unless as a page separator 611 | - style tags now ignored alongside script tags when getting text 612 | 613 | 0.1.2 614 | ------ 615 | 616 | - added shebang 617 | 618 | 0.1.1 619 | ------ 620 | 621 | - uncommented import __version__ 622 | 623 | 0.1.0 624 | ------ 625 | 626 | - reformatting to conform with PEP 8 627 | - added regexp support for matching crawl keywords and filter text keywords 628 | - improved url resolution by correcting domains and schemes 629 | - added --restrict option to restrict crawler links to only those with seed domain 630 | - made text the default write option rather than pdf, can now use --pdf to change that 631 | - removed page number being written to text, 
separator is now just a single blank line 632 | - improved construction of output file name 633 | 634 | 0.0.11 635 | ------ 636 | 637 | - fixed missing comma in install_requires in setup.py 638 | - also labeled now as beta as there are still some kinks with crawling 639 | 640 | 0.0.10 641 | ------ 642 | 643 | - now ignoring pdfkit load errors only if more than one link to try to prevent an empty pdf being created in case of error 644 | 645 | 0.0.9 646 | ------ 647 | 648 | - pdfkit now ignores load errors and writes as many pages as possible 649 | 650 | 0.0.8 651 | ------ 652 | 653 | - better implementation of crawler, can now scrape entire websites 654 | - added OrderedSet class to utils.py 655 | 656 | 0.0.7 657 | ------ 658 | 659 | - changed --keywords to --filter and positional arg url to urls 660 | 661 | 0.0.6 662 | ------ 663 | 664 | - use --keywords flag for filtering text 665 | - can pass multiple links now 666 | - will not write empty files anymore 667 | 668 | 0.0.5 669 | ------ 670 | 671 | - added --verbose argument for use with pdfkit 672 | - improved output file name processing 673 | 674 | 0.0.4 675 | ------ 676 | 677 | - accepts 0 or 1 url's, allowing a call with just --version 678 | 679 | 0.0.3 680 | ------ 681 | 682 | - Moved utils.py to scrape/ 683 | 684 | 0.0.2 685 | ------ 686 | 687 | - First entry 688 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2015-2021 Hunter Hammond (huntrar@gmail.com) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGES.txt 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | scrape |PyPI Version| |Total Downloads| 2 | ====================================================== 3 | 4 | a command-line web scraping tool 5 | -------------------------------- 6 | 7 | scrape is a rule-based web crawler and information extraction tool 8 | capable of manipulating and merging new and existing documents. XML Path 9 | Language (XPath) and regular expressions are used to define rules for 10 | filtering content and web traversal. Output may be converted into text, 11 | csv, pdf, and/or HTML formats. 
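As a quick illustration (the URL below is only a placeholder, and the regexps
are arbitrary examples of the flags documented under Usage), a typical
invocation of the installed scrape command might crawl pages whose URLs match
a pattern, filter the scraped text with another pattern, and save the result
as text::

    scrape https://example.com --crawl "docs" --filter "python" --text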
12 | 13 | Installation 14 | ------------ 15 | 16 | :: 17 | 18 | pip install scrape 19 | 20 | or 21 | 22 | :: 23 | 24 | pip install git+https://github.com/huntrar/scrape.git#egg=scrape 25 | 26 | or 27 | 28 | :: 29 | 30 | git clone https://github.com/huntrar/scrape 31 | cd scrape 32 | python setup.py install 33 | 34 | You must `install 35 | wkhtmltopdf `__ 36 | to save files to pdf. 37 | 38 | Usage 39 | ----- 40 | 41 | :: 42 | 43 | usage: scrape.py [-h] [-a [ATTRIBUTES [ATTRIBUTES ...]]] [-all] 44 | [-c [CRAWL [CRAWL ...]]] [-C] [--csv] [-cs [CACHE_SIZE]] 45 | [-f [FILTER [FILTER ...]]] [--html] [-i] [-m] 46 | [-max MAX_CRAWLS] [-n] [-ni] [-no] [-o [OUT [OUT ...]]] [-ow] 47 | [-p] [-pt] [-q] [-s] [-t] [-v] [-x [XPATH]] 48 | [QUERY [QUERY ...]] 49 | 50 | a command-line web scraping tool 51 | 52 | positional arguments: 53 | QUERY URLs/files to scrape 54 | 55 | optional arguments: 56 | -h, --help show this help message and exit 57 | -a [ATTRIBUTES [ATTRIBUTES ...]], --attributes [ATTRIBUTES [ATTRIBUTES ...]] 58 | extract text using tag attributes 59 | -all, --crawl-all crawl all pages 60 | -c [CRAWL [CRAWL ...]], --crawl [CRAWL [CRAWL ...]] 61 | regexp rules for following new pages 62 | -C, --clear-cache clear requests cache 63 | --csv write files as csv 64 | -cs [CACHE_SIZE], --cache-size [CACHE_SIZE] 65 | size of page cache (default: 1000) 66 | -f [FILTER [FILTER ...]], --filter [FILTER [FILTER ...]] 67 | regexp rules for filtering text 68 | --html write files as HTML 69 | -i, --images save page images 70 | -m, --multiple save to multiple files 71 | -max MAX_CRAWLS, --max-crawls MAX_CRAWLS 72 | max number of pages to crawl 73 | -n, --nonstrict allow crawler to visit any domain 74 | -ni, --no-images do not save page images 75 | -no, --no-overwrite do not overwrite files if they exist 76 | -o [OUT [OUT ...]], --out [OUT [OUT ...]] 77 | specify outfile names 78 | -ow, --overwrite overwrite a file if it exists 79 | -p, --pdf write files as pdf 80 | -pt, --print print text output 81 | -q, --quiet suppress program output 82 | -s, --single save to a single file 83 | -t, --text write files as text 84 | -v, --version display current version 85 | -x [XPATH], --xpath [XPATH] 86 | filter HTML using XPath 87 | 88 | Author 89 | ------ 90 | 91 | - Hunter Hammond (huntrar@gmail.com) 92 | 93 | Notes 94 | ----- 95 | 96 | - Input to scrape can be links, files, or a combination of the two, 97 | allowing you to create new files constructed from both existing and 98 | newly scraped content. 99 | - Multiple input files/URLs are saved to multiple output 100 | files/directories by default. To consolidate them, use the --single 101 | flag. 102 | - Images are automatically included when saving as pdf or HTML; this 103 | involves making additional HTTP requests, adding a significant amount 104 | of processing time. If you wish to forgo this feature use the 105 | --no-images flag, or set the environment variable 106 | SCRAPE\_DISABLE\_IMGS. 107 | - Requests cache is enabled by default to cache webpages, it can be 108 | disabled by setting the environment variable SCRAPE\_DISABLE\_CACHE. 109 | - Pages are saved temporarily as PART.html files during processing. 110 | Unless saving pages as HTML, these files are removed automatically 111 | upon conversion or exit. 112 | - To crawl pages with no restrictions use the --crawl-all flag, or 113 | filter which pages to crawl by URL keywords by passing one or more 114 | regexps to --crawl. 
115 | - If you want the crawler to follow links outside of the given URL's
116 |   domain, use --nonstrict.
117 | - Crawling can be stopped with Ctrl-C, or alternatively by setting
118 |   the maximum number of pages to crawl using --max-crawls. A page
119 |   may contain zero or many links to more pages.
120 | - The text output of scraped files can be printed to stdout rather than
121 |   saved by entering --print.
122 | - Filtering HTML can be done using --xpath, while filtering text is
123 |   done by entering one or more regexps to --filter.
124 | - If you only want to extract specific tag attributes rather than
125 |   specifying an entire XPath, use --attributes. The default is to
126 |   extract only text attributes, but you can specify one or many
127 |   different attributes (such as href, src, title, or any other
128 |   available attribute).
129 | 
130 | .. |PyPI Version| image:: https://img.shields.io/pypi/v/scrape.svg
131 |    :target: https://pypi.python.org/pypi/scrape
132 | .. |Total Downloads| image:: https://pepy.tech/badge/scrape
133 |    :target: https://pepy.tech/project/scrape
134 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | argparse==1.4.0
2 | lxml==4.6.5
3 | pdfkit==0.6.1
4 | requests==2.25.1
5 | requests-cache==0.4.13
6 | six==1.15.0
7 | tldextract==3.1.0
8 | 
--------------------------------------------------------------------------------
/scrape/__init__.py:
--------------------------------------------------------------------------------
1 | """scrape is a rule-based web crawler and information extraction tool capable of manipulating and merging new and existing documents. XML Path Language (XPath) and regular expressions are used to define rules for filtering content and web traversal. Output may be converted into text, csv, pdf, and/or HTML formats.
2 | """
3 | 
4 | __version__ = "0.11.3"
5 | 
--------------------------------------------------------------------------------
/scrape/crawler.py:
--------------------------------------------------------------------------------
1 | """A class to crawl webpages."""
2 | 
3 | from __future__ import absolute_import, print_function
4 | import sys
5 | 
6 | import lxml.html as lh
7 | 
8 | from .orderedset import OrderedSet
9 | from .
import utils 10 | 11 | 12 | class Crawler(object): 13 | """Follows and saves webpages to PART.html files.""" 14 | 15 | def __init__(self, args, seed_url=None): 16 | """Set seed URL and program arguments""" 17 | self.seed_url = seed_url 18 | self.args = args 19 | self.page_cache = [] 20 | 21 | def get_new_links(self, url, resp): 22 | """Get new links from a URL and filter them.""" 23 | links_on_page = resp.xpath("//a/@href") 24 | links = [utils.clean_url(u, url) for u in links_on_page] 25 | 26 | # Remove non-links through filtering by protocol 27 | links = [x for x in links if utils.check_protocol(x)] 28 | 29 | # Restrict new URLs by the domain of the input URL 30 | if not self.args["nonstrict"]: 31 | domain = utils.get_domain(url) 32 | links = [x for x in links if utils.get_domain(x) == domain] 33 | 34 | # Filter URLs by regex keywords, if any 35 | if self.args["crawl"]: 36 | links = utils.re_filter(links, self.args["crawl"]) 37 | return links 38 | 39 | def limit_reached(self, num_crawls): 40 | """Check if number of pages crawled have reached a limit.""" 41 | return self.args["max_crawls"] and num_crawls >= self.args["max_crawls"] 42 | 43 | def page_crawled(self, page_resp): 44 | """Check if page has been crawled by hashing its text content. 45 | 46 | Add new pages to the page cache. 47 | Return whether page was found in cache. 48 | """ 49 | page_text = utils.parse_text(page_resp) 50 | page_hash = utils.hash_text("".join(page_text)) 51 | if page_hash not in self.page_cache: 52 | utils.cache_page(self.page_cache, page_hash, self.args["cache_size"]) 53 | return False 54 | return True 55 | 56 | def crawl_links(self, seed_url=None): 57 | """Find new links given a seed URL and follow them breadth-first. 58 | 59 | Save page responses as PART.html files. 60 | Return the PART.html filenames created during crawling. 61 | """ 62 | if seed_url is not None: 63 | self.seed_url = seed_url 64 | 65 | if self.seed_url is None: 66 | sys.stderr.write("Crawling requires a seed URL.\n") 67 | return [] 68 | 69 | prev_part_num = utils.get_num_part_files() 70 | crawled_links = set() 71 | uncrawled_links = OrderedSet() 72 | 73 | uncrawled_links.add(self.seed_url) 74 | try: 75 | while uncrawled_links: 76 | # Check limit on number of links and pages to crawl 77 | if self.limit_reached(len(crawled_links)): 78 | break 79 | url = uncrawled_links.pop(last=False) 80 | 81 | # Remove protocol, fragments, etc. 
to get unique URLs 82 | unique_url = utils.remove_protocol(utils.clean_url(url)) 83 | if unique_url not in crawled_links: 84 | raw_resp = utils.get_raw_resp(url) 85 | if raw_resp is None: 86 | if not self.args["quiet"]: 87 | sys.stderr.write("Failed to parse {0}.\n".format(url)) 88 | continue 89 | 90 | resp = lh.fromstring(raw_resp) 91 | if self.page_crawled(resp): 92 | continue 93 | 94 | crawled_links.add(unique_url) 95 | new_links = self.get_new_links(url, resp) 96 | uncrawled_links.update(new_links) 97 | if not self.args["quiet"]: 98 | print("Crawled {0} (#{1}).".format(url, len(crawled_links))) 99 | 100 | # Write page response to PART.html file 101 | utils.write_part_file( 102 | self.args, url, raw_resp, resp, len(crawled_links) 103 | ) 104 | except (KeyboardInterrupt, EOFError): 105 | pass 106 | 107 | curr_part_num = utils.get_num_part_files() 108 | return utils.get_part_filenames(curr_part_num, prev_part_num) 109 | -------------------------------------------------------------------------------- /scrape/orderedset.py: -------------------------------------------------------------------------------- 1 | from collections.abc import MutableSet 2 | 3 | 4 | class OrderedSet(MutableSet): 5 | def __init__(self, iterable=None): 6 | self.end = end = [] 7 | end += [None, end, end] # sentinel node for doubly linked list 8 | self.map = {} # key --> [key, prev, next] 9 | if iterable is not None: 10 | self |= iterable 11 | 12 | def __len__(self): 13 | return len(self.map) 14 | 15 | def __contains__(self, key): 16 | return key in self.map 17 | 18 | def add(self, key): 19 | if key not in self.map: 20 | end = self.end 21 | curr = end[1] 22 | curr[2] = end[1] = self.map[key] = [key, curr, end] 23 | 24 | def update(self, iterable): 25 | for item in iterable: 26 | self.add(item) 27 | 28 | def discard(self, key): 29 | if key in self.map: 30 | key, prev, next = self.map.pop(key) 31 | prev[2] = next 32 | next[1] = prev 33 | 34 | def __iter__(self): 35 | end = self.end 36 | curr = end[2] 37 | while curr is not end: 38 | yield curr[0] 39 | curr = curr[2] 40 | 41 | def __reversed__(self): 42 | end = self.end 43 | curr = end[1] 44 | while curr is not end: 45 | yield curr[0] 46 | curr = curr[1] 47 | 48 | def pop(self, last=True): 49 | if not self: 50 | raise KeyError("set is empty") 51 | key = self.end[1][0] if last else self.end[2][0] 52 | self.discard(key) 53 | return key 54 | 55 | def clear(self): 56 | while self: 57 | self.pop() 58 | 59 | def __repr__(self): 60 | if not self: 61 | return "%s()" % (self.__class__.__name__,) 62 | return "%s(%r)" % (self.__class__.__name__, list(self)) 63 | 64 | def __eq__(self, other): 65 | if isinstance(other, OrderedSet): 66 | return len(self) == len(other) and list(self) == list(other) 67 | return set(self) == set(other) 68 | -------------------------------------------------------------------------------- /scrape/scrape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ scrape - a command-line web scraping tool 3 | 4 | written by Hunter Hammond (huntrar@gmail.com) 5 | """ 6 | 7 | from __future__ import absolute_import, print_function 8 | from argparse import ArgumentParser 9 | import os 10 | import sys 11 | 12 | from six.moves import input 13 | from six import iterkeys 14 | 15 | from .crawler import Crawler 16 | from . 
import utils, __version__ 17 | 18 | 19 | def get_parser(): 20 | """Parse command-line arguments.""" 21 | parser = ArgumentParser(description="a command-line web scraping tool") 22 | parser.add_argument( 23 | "query", metavar="QUERY", type=str, nargs="*", help="URLs/files to scrape" 24 | ) 25 | parser.add_argument( 26 | "-a", 27 | "--attributes", 28 | type=str, 29 | nargs="*", 30 | help="extract text using tag attributes", 31 | ) 32 | parser.add_argument( 33 | "-all", "--crawl-all", help="crawl all pages", action="store_true" 34 | ) 35 | parser.add_argument( 36 | "-c", 37 | "--crawl", 38 | type=str, 39 | nargs="*", 40 | help="regexp rules for following new pages", 41 | ) 42 | parser.add_argument( 43 | "-C", "--clear-cache", help="clear requests cache", action="store_true" 44 | ) 45 | parser.add_argument("--csv", help="write files as csv", action="store_true") 46 | parser.add_argument( 47 | "-cs", 48 | "--cache-size", 49 | type=int, 50 | nargs="?", 51 | help="size of page cache (default: 1000)", 52 | default=1000, 53 | ) 54 | parser.add_argument( 55 | "-f", "--filter", type=str, nargs="*", help="regexp rules for filtering text" 56 | ) 57 | parser.add_argument("--html", help="write files as HTML", action="store_true") 58 | parser.add_argument("-i", "--images", action="store_true", help="save page images") 59 | parser.add_argument( 60 | "-m", "--multiple", help="save to multiple files", action="store_true" 61 | ) 62 | parser.add_argument( 63 | "-max", "--max-crawls", type=int, help="max number of pages to crawl" 64 | ) 65 | parser.add_argument( 66 | "-n", 67 | "--nonstrict", 68 | action="store_true", 69 | help="allow crawler to visit any domain", 70 | ) 71 | parser.add_argument( 72 | "-ni", "--no-images", action="store_true", help="do not save page images" 73 | ) 74 | parser.add_argument( 75 | "-no", 76 | "--no-overwrite", 77 | action="store_true", 78 | help="do not overwrite files if they exist", 79 | ) 80 | parser.add_argument( 81 | "-o", "--out", type=str, nargs="*", help="specify outfile names" 82 | ) 83 | parser.add_argument( 84 | "-ow", "--overwrite", action="store_true", help="overwrite a file if it exists" 85 | ) 86 | parser.add_argument("-p", "--pdf", help="write files as pdf", action="store_true") 87 | parser.add_argument("-pt", "--print", help="print text output", action="store_true") 88 | parser.add_argument( 89 | "-q", "--quiet", help="suppress program output", action="store_true" 90 | ) 91 | parser.add_argument( 92 | "-s", "--single", help="save to a single file", action="store_true" 93 | ) 94 | parser.add_argument("-t", "--text", help="write files as text", action="store_true") 95 | parser.add_argument( 96 | "-v", "--version", help="display current version", action="store_true" 97 | ) 98 | parser.add_argument( 99 | "-x", "--xpath", type=str, nargs="?", help="filter HTML using XPath" 100 | ) 101 | return parser 102 | 103 | 104 | def write_files(args, infilenames, outfilename): 105 | """Write scraped or local file(s) in desired format. 106 | 107 | Keyword arguments: 108 | args -- program arguments (dict) 109 | infilenames -- names of user-inputted and/or downloaded files (list) 110 | outfilename -- name of output file (str) 111 | 112 | Remove PART(#).html files after conversion unless otherwise specified. 
113 | """ 114 | write_actions = { 115 | "print": utils.print_text, 116 | "pdf": utils.write_pdf_files, 117 | "csv": utils.write_csv_files, 118 | "text": utils.write_text_files, 119 | } 120 | try: 121 | for action in iterkeys(write_actions): 122 | if args[action]: 123 | write_actions[action](args, infilenames, outfilename) 124 | finally: 125 | if args["urls"] and not args["html"]: 126 | utils.remove_part_files() 127 | 128 | 129 | def write_single_file(args, base_dir, crawler): 130 | """Write to a single output file and/or subdirectory.""" 131 | if args["urls"] and args["html"]: 132 | # Create a directory to save PART.html files in 133 | domain = utils.get_domain(args["urls"][0]) 134 | if not args["quiet"]: 135 | print("Storing html files in {0}/".format(domain)) 136 | utils.mkdir_and_cd(domain) 137 | 138 | infilenames = [] 139 | for query in args["query"]: 140 | if query in args["files"]: 141 | infilenames.append(query) 142 | elif query.strip("/") in args["urls"]: 143 | if args["crawl"] or args["crawl_all"]: 144 | # Crawl and save HTML files/image files to disk 145 | infilenames += crawler.crawl_links(query) 146 | else: 147 | raw_resp = utils.get_raw_resp(query) 148 | if raw_resp is None: 149 | return False 150 | 151 | prev_part_num = utils.get_num_part_files() 152 | utils.write_part_file(args, query, raw_resp) 153 | curr_part_num = prev_part_num + 1 154 | infilenames += utils.get_part_filenames(curr_part_num, prev_part_num) 155 | 156 | # Convert output or leave as PART.html files 157 | if args["html"]: 158 | # HTML files have been written already, so return to base directory 159 | os.chdir(base_dir) 160 | else: 161 | # Write files to text or pdf 162 | if infilenames: 163 | if args["out"]: 164 | outfilename = args["out"][0] 165 | else: 166 | outfilename = utils.get_single_outfilename(args) 167 | if outfilename: 168 | write_files(args, infilenames, outfilename) 169 | else: 170 | utils.remove_part_files() 171 | return True 172 | 173 | 174 | def write_multiple_files(args, base_dir, crawler): 175 | """Write to multiple output files and/or subdirectories.""" 176 | for i, query in enumerate(args["query"]): 177 | if query in args["files"]: 178 | # Write files 179 | if args["out"] and i < len(args["out"]): 180 | outfilename = args["out"][i] 181 | else: 182 | outfilename = ".".join(query.split(".")[:-1]) 183 | write_files(args, [query], outfilename) 184 | elif query in args["urls"]: 185 | # Scrape/crawl urls 186 | domain = utils.get_domain(query) 187 | if args["html"]: 188 | # Create a directory to save PART.html files in 189 | if not args["quiet"]: 190 | print("Storing html files in {0}/".format(domain)) 191 | utils.mkdir_and_cd(domain) 192 | 193 | if args["crawl"] or args["crawl_all"]: 194 | # Crawl and save HTML files/image files to disk 195 | infilenames = crawler.crawl_links(query) 196 | else: 197 | raw_resp = utils.get_raw_resp(query) 198 | if raw_resp is None: 199 | return False 200 | 201 | # Saves page as PART.html file 202 | prev_part_num = utils.get_num_part_files() 203 | utils.write_part_file(args, query, raw_resp) 204 | curr_part_num = prev_part_num + 1 205 | infilenames = utils.get_part_filenames(curr_part_num, prev_part_num) 206 | 207 | # Convert output or leave as PART.html files 208 | if args["html"]: 209 | # HTML files have been written already, so return to base dir 210 | os.chdir(base_dir) 211 | else: 212 | # Write files to text or pdf 213 | if infilenames: 214 | if args["out"] and i < len(args["out"]): 215 | outfilename = args["out"][i] 216 | else: 217 | outfilename = 
utils.get_outfilename(query, domain) 218 | write_files(args, infilenames, outfilename) 219 | else: 220 | sys.stderr.write( 221 | "Failed to retrieve content from {0}.\n".format(query) 222 | ) 223 | return True 224 | 225 | 226 | def split_input(args): 227 | """Split query input into local files and URLs.""" 228 | args["files"] = [] 229 | args["urls"] = [] 230 | for arg in args["query"]: 231 | if os.path.isfile(arg): 232 | args["files"].append(arg) 233 | else: 234 | args["urls"].append(arg.strip("/")) 235 | 236 | 237 | def detect_output_type(args): 238 | """Detect whether to save to a single or multiple files.""" 239 | if not args["single"] and not args["multiple"]: 240 | # Save to multiple files if multiple files/URLs entered 241 | if len(args["query"]) > 1 or len(args["out"]) > 1: 242 | args["multiple"] = True 243 | else: 244 | args["single"] = True 245 | 246 | 247 | def scrape(args): 248 | """Scrape webpage content.""" 249 | try: 250 | base_dir = os.getcwd() 251 | if args["out"] is None: 252 | args["out"] = [] 253 | 254 | # Detect whether to save to a single or multiple files 255 | detect_output_type(args) 256 | 257 | # Split query input into local files and URLs 258 | split_input(args) 259 | 260 | if args["urls"]: 261 | # Add URL extensions and schemes and update query and URLs 262 | urls_with_exts = [utils.add_url_suffix(x) for x in args["urls"]] 263 | args["query"] = [ 264 | utils.add_protocol(x) if x in args["urls"] else x 265 | for x in urls_with_exts 266 | ] 267 | args["urls"] = [x for x in args["query"] if x not in args["files"]] 268 | 269 | # Print error if attempting to convert local files to HTML 270 | if args["files"] and args["html"]: 271 | sys.stderr.write("Cannot convert local files to HTML.\n") 272 | args["files"] = [] 273 | 274 | # Instantiate web crawler if necessary 275 | crawler = None 276 | if args["crawl"] or args["crawl_all"]: 277 | crawler = Crawler(args) 278 | 279 | if args["single"]: 280 | return write_single_file(args, base_dir, crawler) 281 | elif args["multiple"]: 282 | return write_multiple_files(args, base_dir, crawler) 283 | 284 | except (KeyboardInterrupt, Exception): 285 | if args["html"]: 286 | try: 287 | os.chdir(base_dir) 288 | except OSError: 289 | pass 290 | else: 291 | utils.remove_part_files() 292 | raise 293 | 294 | 295 | def prompt_filetype(args): 296 | """Prompt user for filetype if none specified.""" 297 | valid_types = ("print", "text", "csv", "pdf", "html") 298 | if not any(args[x] for x in valid_types): 299 | try: 300 | filetype = input( 301 | "Print or save output as ({0}): ".format(", ".join(valid_types)) 302 | ).lower() 303 | while filetype not in valid_types: 304 | filetype = input( 305 | "Invalid entry. Choose from ({0}): ".format(", ".join(valid_types)) 306 | ).lower() 307 | except (KeyboardInterrupt, EOFError): 308 | return 309 | args[filetype] = True 310 | 311 | 312 | def prompt_save_images(args): 313 | """Prompt user to save images when crawling (for pdf and HTML formats).""" 314 | if args["images"] or args["no_images"]: 315 | return 316 | 317 | if (args["pdf"] or args["html"]) and (args["crawl"] or args["crawl_all"]): 318 | save_msg = ( 319 | "Choosing to save images will greatly slow the" 320 | " crawling process.\nSave images anyways? 
(y/n): " 321 | ) 322 | try: 323 | save_images = utils.confirm_input(input(save_msg)) 324 | except (KeyboardInterrupt, EOFError): 325 | return 326 | 327 | args["images"] = save_images 328 | args["no_images"] = not save_images 329 | 330 | 331 | def command_line_runner(): 332 | """Handle command-line interaction.""" 333 | parser = get_parser() 334 | args = vars(parser.parse_args()) 335 | if args["version"]: 336 | print(__version__) 337 | return 338 | if args["clear_cache"]: 339 | utils.clear_cache() 340 | print("Cleared {0}.".format(utils.CACHE_DIR)) 341 | return 342 | if not args["query"]: 343 | parser.print_help() 344 | return 345 | 346 | # Enable cache unless user sets environ variable SCRAPE_DISABLE_CACHE 347 | if not os.getenv("SCRAPE_DISABLE_CACHE"): 348 | utils.enable_cache() 349 | 350 | # Save images unless user sets environ variable SCRAPE_DISABLE_IMGS 351 | if os.getenv("SCRAPE_DISABLE_IMGS"): 352 | args["no_images"] = True 353 | 354 | # Prompt user for filetype if none specified 355 | prompt_filetype(args) 356 | 357 | # Prompt user to save images when crawling (for pdf and HTML formats) 358 | prompt_save_images(args) 359 | 360 | # Scrape webpage content 361 | scrape(args) 362 | 363 | 364 | if __name__ == "__main__": 365 | command_line_runner() 366 | -------------------------------------------------------------------------------- /scrape/utils.py: -------------------------------------------------------------------------------- 1 | """scrape utility functions. 2 | 3 | Functions include: 4 | Web requests and requests caching 5 | Document caching 6 | Text processing 7 | HTML parsing 8 | URL processing 9 | File processing 10 | User input and sanitation 11 | Miscellaneous 12 | """ 13 | 14 | from __future__ import print_function 15 | import glob 16 | import hashlib 17 | import os 18 | import random 19 | import re 20 | import shutil 21 | import string 22 | import sys 23 | import time 24 | 25 | import lxml.html as lh 26 | 27 | try: 28 | import pdfkit as pk 29 | except ImportError: 30 | pass 31 | import requests 32 | from requests.exceptions import MissingSchema 33 | from six import PY2 34 | from six.moves import input, xrange as range 35 | from six.moves.urllib.parse import urlparse, urljoin 36 | from six.moves.urllib.request import getproxies 37 | import tldextract 38 | 39 | if PY2: 40 | from cgi import escape 41 | else: 42 | from html import escape 43 | 44 | USER_AGENTS = ( 45 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) " 46 | "Gecko/20100101 Firefox/11.0", 47 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) " "Gecko/20100 101 Firefox/22.0", 48 | "Mozilla/5.0 (Windows NT 6.1; rv:11.0) " "Gecko/20100101 Firefox/11.0", 49 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) " 50 | "AppleWebKit/536.5 (KHTML, like Gecko) " 51 | "Chrome/19.0.1084.46 Safari/536.5", 52 | "Mozilla/5.0 (Windows; Windows NT 6.1) " 53 | "AppleWebKit/536.5 (KHTML, like Gecko) " 54 | "Chrome/19.0.1084.46 Safari/536.5", 55 | ) 56 | 57 | 58 | XDG_CACHE_DIR = os.environ.get( 59 | "XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache") 60 | ) 61 | CACHE_DIR = os.path.join(XDG_CACHE_DIR, "scrape") 62 | CACHE_FILE = os.path.join(CACHE_DIR, "cache{0}".format("" if PY2 else "3")) 63 | 64 | # Web requests and requests caching functions 65 | # 66 | 67 | 68 | def get_proxies(): 69 | """Get available proxies to use with requests library.""" 70 | proxies = getproxies() 71 | filtered_proxies = {} 72 | for key, value in proxies.items(): 73 | if key.startswith("http://"): 74 | if not 
value.startswith("http://"): 75 | filtered_proxies[key] = "http://{0}".format(value) 76 | else: 77 | filtered_proxies[key] = value 78 | return filtered_proxies 79 | 80 | 81 | def get_resp(url): 82 | """Get webpage response as an lxml.html.HtmlElement object.""" 83 | try: 84 | headers = {"User-Agent": random.choice(USER_AGENTS)} 85 | try: 86 | request = requests.get(url, headers=headers, proxies=get_proxies()) 87 | except MissingSchema: 88 | url = add_protocol(url) 89 | request = requests.get(url, headers=headers, proxies=get_proxies()) 90 | return lh.fromstring(request.text.encode("utf-8") if PY2 else request.text) 91 | except Exception: 92 | sys.stderr.write("Failed to retrieve {0}.\n".format(url)) 93 | raise 94 | 95 | 96 | def get_raw_resp(url): 97 | """Get webpage response as a unicode string.""" 98 | try: 99 | headers = {"User-Agent": random.choice(USER_AGENTS)} 100 | try: 101 | request = requests.get(url, headers=headers, proxies=get_proxies()) 102 | except MissingSchema: 103 | url = add_protocol(url) 104 | request = requests.get(url, headers=headers, proxies=get_proxies()) 105 | return request.text.encode("utf-8") if PY2 else request.text 106 | except Exception: 107 | sys.stderr.write("Failed to retrieve {0} as str.\n".format(url)) 108 | raise 109 | 110 | 111 | def enable_cache(): 112 | """Enable requests library cache.""" 113 | try: 114 | import requests_cache 115 | except ImportError as err: 116 | sys.stderr.write("Failed to enable cache: {0}\n".format(str(err))) 117 | return 118 | if not os.path.exists(CACHE_DIR): 119 | os.makedirs(CACHE_DIR) 120 | requests_cache.install_cache(CACHE_FILE) 121 | 122 | 123 | def clear_cache(): 124 | """Clear requests library cache.""" 125 | for cache in glob.glob("{0}*".format(CACHE_FILE)): 126 | os.remove(cache) 127 | 128 | 129 | # Document caching functions 130 | # 131 | 132 | 133 | def hash_text(text): 134 | """Return MD5 hash of a string.""" 135 | md5 = hashlib.md5() 136 | md5.update(text.encode("utf-8")) 137 | return md5.hexdigest() 138 | 139 | 140 | def cache_page(page_cache, page_hash, cache_size): 141 | """Add a page to the page cache.""" 142 | page_cache.append(page_hash) 143 | if len(page_cache) > cache_size: 144 | page_cache.pop(0) 145 | 146 | 147 | # Text processing functions 148 | # 149 | 150 | 151 | def re_filter(text, regexps): 152 | """Filter text using regular expressions.""" 153 | if not regexps: 154 | return text 155 | 156 | matched_text = [] 157 | compiled_regexps = [re.compile(x) for x in regexps] 158 | for line in text: 159 | if line in matched_text: 160 | continue 161 | 162 | for regexp in compiled_regexps: 163 | found = regexp.search(line) 164 | if found and found.group(): 165 | matched_text.append(line) 166 | 167 | return matched_text or text 168 | 169 | 170 | def remove_whitespace(text): 171 | """Remove unnecessary whitespace while keeping logical structure. 172 | 173 | Keyword arguments: 174 | text -- text to remove whitespace from (list) 175 | 176 | Retain paragraph structure but remove other whitespace, 177 | such as between words on a line and at the start and end of the text. 
178 | """ 179 | clean_text = [] 180 | curr_line = "" 181 | # Remove any newlines that follow two lines of whitespace consecutively 182 | # Also remove whitespace at start and end of text 183 | while text: 184 | if not curr_line: 185 | # Find the first line that is not whitespace and add it 186 | curr_line = text.pop(0) 187 | while not curr_line.strip() and text: 188 | curr_line = text.pop(0) 189 | if curr_line.strip(): 190 | clean_text.append(curr_line) 191 | else: 192 | # Filter the rest of the lines 193 | curr_line = text.pop(0) 194 | if not text: 195 | # Add the final line if it is not whitespace 196 | if curr_line.strip(): 197 | clean_text.append(curr_line) 198 | continue 199 | 200 | if curr_line.strip(): 201 | clean_text.append(curr_line) 202 | else: 203 | # If the current line is whitespace then make sure there is 204 | # no more than one consecutive line of whitespace following 205 | if not text[0].strip(): 206 | if len(text) > 1 and text[1].strip(): 207 | clean_text.append(curr_line) 208 | else: 209 | clean_text.append(curr_line) 210 | 211 | # Now filter each individual line for extraneous whitespace 212 | cleaner_text = [] 213 | for line in clean_text: 214 | clean_line = " ".join(line.split()) 215 | if not clean_line.strip(): 216 | clean_line += "\n" 217 | cleaner_text.append(clean_line) 218 | return cleaner_text 219 | 220 | 221 | def parse_text(infile, xpath=None, filter_words=None, attributes=None): 222 | """Filter text using XPath, regex keywords, and tag attributes. 223 | 224 | Keyword arguments: 225 | infile -- HTML or text content to parse (list) 226 | xpath -- an XPath expression (str) 227 | filter_words -- regex keywords (list) 228 | attributes -- HTML tag attributes (list) 229 | 230 | Return a list of strings of text. 231 | """ 232 | infiles = [] 233 | text = [] 234 | if xpath is not None: 235 | infile = parse_html(infile, xpath) 236 | if isinstance(infile, list): 237 | if isinstance(infile[0], lh.HtmlElement): 238 | infiles = list(infile) 239 | else: 240 | text = [line + "\n" for line in infile] 241 | elif isinstance(infile, lh.HtmlElement): 242 | infiles = [infile] 243 | else: 244 | text = [infile] 245 | else: 246 | infiles = [infile] 247 | 248 | if attributes is not None: 249 | attributes = [clean_attr(x) for x in attributes] 250 | attributes = [x for x in attributes if x] 251 | else: 252 | attributes = ["text()"] 253 | 254 | if not text: 255 | text_xpath = "//*[not(self::script) and not(self::style)]" 256 | for attr in attributes: 257 | for infile in infiles: 258 | if isinstance(infile, lh.HtmlElement): 259 | new_text = infile.xpath("{0}/{1}".format(text_xpath, attr)) 260 | else: 261 | # re.split preserves delimiters place in the list 262 | new_text = [x for x in re.split("(\n)", infile) if x] 263 | text += new_text 264 | 265 | if filter_words is not None: 266 | text = re_filter(text, filter_words) 267 | return [ 268 | "".join(x for x in line if x in string.printable) 269 | for line in remove_whitespace(text) 270 | if line 271 | ] 272 | 273 | 274 | def get_parsed_text(args, infilename): 275 | """Parse and return text content of infiles. 276 | 277 | Keyword arguments: 278 | args -- program arguments (dict) 279 | infilenames -- name of user-inputted and/or downloaded file (str) 280 | 281 | Return a list of strings of text. 
282 | """ 283 | parsed_text = [] 284 | if infilename.endswith(".html"): 285 | # Convert HTML to lxml object for content parsing 286 | html = lh.fromstring(read_files(infilename)) 287 | text = None 288 | else: 289 | html = None 290 | text = read_files(infilename) 291 | 292 | if html is not None: 293 | parsed_text = parse_text( 294 | html, args["xpath"], args["filter"], args["attributes"] 295 | ) 296 | elif text is not None: 297 | parsed_text = parse_text(text, args["xpath"], args["filter"]) 298 | else: 299 | if not args["quiet"]: 300 | sys.stderr.write("Failed to parse text from {0}.\n".format(infilename)) 301 | return parsed_text 302 | 303 | 304 | # HTML parsing functions 305 | # 306 | 307 | 308 | def clean_attr(attr): 309 | """Append @ to attributes and resolve text -> text() for XPath.""" 310 | if attr: 311 | if "text" in attr: 312 | return "text()" 313 | else: 314 | attr = attr.lstrip("@") 315 | if attr: 316 | return "@" + attr 317 | return None 318 | 319 | 320 | def parse_html(infile, xpath): 321 | """Filter HTML using XPath.""" 322 | if not isinstance(infile, lh.HtmlElement): 323 | infile = lh.fromstring(infile) 324 | infile = infile.xpath(xpath) 325 | if not infile: 326 | raise ValueError("XPath {0} returned no results.".format(xpath)) 327 | return infile 328 | 329 | 330 | # URL processing functions 331 | # 332 | 333 | 334 | def get_domain(url): 335 | """Get the domain of a URL using tldextract.""" 336 | return tldextract.extract(url).domain 337 | 338 | 339 | def add_protocol(url): 340 | """Add protocol to URL.""" 341 | if not check_protocol(url): 342 | return "http://{0}".format(url) 343 | return url 344 | 345 | 346 | def check_protocol(url): 347 | """Check URL for a protocol.""" 348 | if url and (url.startswith("http://") or url.startswith("https://")): 349 | return True 350 | return False 351 | 352 | 353 | def remove_protocol(url): 354 | """Remove protocol from URL.""" 355 | if check_protocol(url): 356 | return url.replace("http://", "").replace("https://", "") 357 | return url 358 | 359 | 360 | def clean_url(url, base_url=None): 361 | """Add base netloc and path to internal URLs and remove www, fragments.""" 362 | parsed_url = urlparse(url) 363 | 364 | fragment = "{url.fragment}".format(url=parsed_url) 365 | if fragment: 366 | url = url.split(fragment)[0] 367 | 368 | # Identify internal URLs and fix their format 369 | netloc = "{url.netloc}".format(url=parsed_url) 370 | if base_url is not None and not netloc: 371 | parsed_base = urlparse(base_url) 372 | split_base = "{url.scheme}://{url.netloc}{url.path}/".format(url=parsed_base) 373 | url = urljoin(split_base, url) 374 | netloc = "{url.netloc}".format(url=urlparse(url)) 375 | 376 | if "www." in netloc: 377 | url = url.replace(netloc, netloc.replace("www.", "")) 378 | return url.rstrip(string.punctuation) 379 | 380 | 381 | def has_suffix(url): 382 | """Return whether the url has a suffix using tldextract.""" 383 | return bool(tldextract.extract(url).suffix) 384 | 385 | 386 | def add_url_suffix(url): 387 | """Add .com suffix to URL if none found.""" 388 | url = url.rstrip("/") 389 | if not has_suffix(url): 390 | return "{0}.com".format(url) 391 | return url 392 | 393 | 394 | # File processing functions 395 | # 396 | 397 | 398 | def get_outfilename(url, domain=None): 399 | """Construct the output filename from domain and end of path.""" 400 | if domain is None: 401 | domain = get_domain(url) 402 | 403 | path = "{url.path}".format(url=urlparse(url)) 404 | if "." 
in path: 405 | tail_url = path.split(".")[-2] 406 | else: 407 | tail_url = path 408 | 409 | if tail_url: 410 | if "/" in tail_url: 411 | tail_pieces = [x for x in tail_url.split("/") if x] 412 | tail_url = tail_pieces[-1] 413 | 414 | # Keep length of return string below or equal to max_len 415 | max_len = 24 416 | if domain: 417 | max_len -= len(domain) + 1 418 | if len(tail_url) > max_len: 419 | if "-" in tail_url: 420 | tail_pieces = [x for x in tail_url.split("-") if x] 421 | tail_url = tail_pieces.pop(0) 422 | if len(tail_url) > max_len: 423 | tail_url = tail_url[:max_len] 424 | else: 425 | # Add as many tail pieces that can fit 426 | tail_len = 0 427 | for piece in tail_pieces: 428 | tail_len += len(piece) 429 | if tail_len <= max_len: 430 | tail_url += "-" + piece 431 | else: 432 | break 433 | else: 434 | tail_url = tail_url[:max_len] 435 | 436 | if domain: 437 | return "{0}-{1}".format(domain, tail_url).lower() 438 | return tail_url 439 | return domain.lower() 440 | 441 | 442 | def get_single_outfilename(args): 443 | """Use first possible entry in query as filename.""" 444 | for arg in args["query"]: 445 | if arg in args["files"]: 446 | return (".".join(arg.split(".")[:-1])).lower() 447 | for url in args["urls"]: 448 | if arg.strip("/") in url: 449 | domain = get_domain(url) 450 | return get_outfilename(url, domain) 451 | sys.stderr.write("Failed to construct a single out filename.\n") 452 | return "" 453 | 454 | 455 | def remove_file(filename): 456 | """Remove a file from disk.""" 457 | try: 458 | os.remove(filename) 459 | return True 460 | except (OSError, IOError): 461 | return False 462 | 463 | 464 | def modify_filename_id(filename): 465 | """Modify filename to have a unique numerical identifier.""" 466 | split_filename = os.path.splitext(filename) 467 | id_num_re = re.compile("(\(\d\))") 468 | id_num = re.findall(id_num_re, split_filename[-2]) 469 | if id_num: 470 | new_id_num = int(id_num[-1].lstrip("(").rstrip(")")) + 1 471 | 472 | # Reconstruct filename with incremented id and its extension 473 | filename = "".join( 474 | ( 475 | re.sub(id_num_re, "({0})".format(new_id_num), split_filename[-2]), 476 | split_filename[-1], 477 | ) 478 | ) 479 | else: 480 | split_filename = os.path.splitext(filename) 481 | 482 | # Reconstruct filename with new id and its extension 483 | filename = "".join(("{0} (2)".format(split_filename[-2]), split_filename[-1])) 484 | return filename 485 | 486 | 487 | def overwrite_file_check(args, filename): 488 | """If filename exists, overwrite or modify it to be unique.""" 489 | if not args["overwrite"] and os.path.exists(filename): 490 | # Confirm overwriting of the file, or modify filename 491 | if args["no_overwrite"]: 492 | overwrite = False 493 | else: 494 | try: 495 | overwrite = confirm_input( 496 | input("Overwrite {0}? (yes/no): ".format(filename)) 497 | ) 498 | except (KeyboardInterrupt, EOFError): 499 | sys.exit() 500 | if not overwrite: 501 | new_filename = modify_filename_id(filename) 502 | while os.path.exists(new_filename): 503 | new_filename = modify_filename_id(new_filename) 504 | return new_filename 505 | return filename 506 | 507 | 508 | def print_text(args, infilenames, outfilename=None): 509 | """Print text content of infiles to stdout. 
510 | 511 | Keyword arguments: 512 | args -- program arguments (dict) 513 | infilenames -- names of user-inputted and/or downloaded files (list) 514 | outfilename -- only used for interface purposes (None) 515 | """ 516 | for infilename in infilenames: 517 | parsed_text = get_parsed_text(args, infilename) 518 | if parsed_text: 519 | for line in parsed_text: 520 | print(line) 521 | print("") 522 | 523 | 524 | def write_pdf_files(args, infilenames, outfilename): 525 | """Write pdf file(s) to disk using pdfkit. 526 | 527 | Keyword arguments: 528 | args -- program arguments (dict) 529 | infilenames -- names of user-inputted and/or downloaded files (list) 530 | outfilename -- name of output pdf file (str) 531 | """ 532 | if not outfilename.endswith(".pdf"): 533 | outfilename = outfilename + ".pdf" 534 | outfilename = overwrite_file_check(args, outfilename) 535 | 536 | options = {"enable-local-file-access": None} 537 | try: 538 | if args["multiple"]: 539 | # Multiple files are written one at a time, so infilenames will 540 | # never contain more than one file here 541 | infilename = infilenames[0] 542 | if not args["quiet"]: 543 | print("Attempting to write to {0}.".format(outfilename)) 544 | else: 545 | options["quiet"] = None 546 | 547 | if args["xpath"]: 548 | # Process HTML with XPath before writing 549 | html = parse_html(read_files(infilename), args["xpath"]) 550 | if isinstance(html, list): 551 | if isinstance(html[0], str): 552 | pk.from_string("\n".join(html), outfilename, options=options) 553 | else: 554 | pk.from_string( 555 | "\n".join(lh.tostring(x, encoding="unicode") for x in html), 556 | outfilename, 557 | options=options, 558 | ) 559 | elif isinstance(html, str): 560 | pk.from_string(html, outfilename, options=options) 561 | else: 562 | pk.from_string(lh.tostring(html, encoding="unicode"), outfilename, options=options) 563 | else: 564 | pk.from_file(infilename, outfilename, options=options) 565 | elif args["single"]: 566 | if not args["quiet"]: 567 | print( 568 | "Attempting to write {0} page(s) to {1}.".format( 569 | len(infilenames), outfilename 570 | ) 571 | ) 572 | else: 573 | options["quiet"] = None 574 | 575 | if args["xpath"]: 576 | # Process HTML with XPath before writing 577 | html = parse_html(read_files(infilenames), args["xpath"]) 578 | if isinstance(html, list): 579 | if isinstance(html[0], str): 580 | pk.from_string("\n".join(html), outfilename, options=options) 581 | else: 582 | pk.from_string( 583 | "\n".join(lh.tostring(x, encoding="unicode") for x in html), 584 | outfilename, 585 | options=options, 586 | ) 587 | elif isinstance(html, str): 588 | pk.from_string(html, outfilename, options=options) 589 | else: 590 | pk.from_string(lh.tostring(html, encoding="unicode"), outfilename, options=options) 591 | else: 592 | pk.from_file(infilenames, outfilename, options=options) 593 | return True 594 | except (OSError, IOError) as err: 595 | sys.stderr.write( 596 | "An error occurred while writing {0}:\n{1}".format(outfilename, str(err)) 597 | ) 598 | return False 599 | 600 | 601 | def write_csv_files(args, infilenames, outfilename): 602 | """Write csv file(s) to disk. 
603 | 604 | Keyword arguments: 605 | args -- program arguments (dict) 606 | infilenames -- names of user-inputted and/or downloaded files (list) 607 | outfilename -- name of output text file (str) 608 | """ 609 | 610 | def csv_convert(line): 611 | """Strip punctuation and insert commas""" 612 | clean_line = [] 613 | for word in line.split(" "): 614 | clean_line.append(word.strip(string.punctuation)) 615 | return ", ".join(clean_line) 616 | 617 | if not outfilename.endswith(".csv"): 618 | outfilename = outfilename + ".csv" 619 | outfilename = overwrite_file_check(args, outfilename) 620 | 621 | all_text = [] # Text must be aggregated if writing to a single output file 622 | for i, infilename in enumerate(infilenames): 623 | parsed_text = get_parsed_text(args, infilename) 624 | if parsed_text: 625 | if args["multiple"]: 626 | if not args["quiet"]: 627 | print("Attempting to write to {0}.".format(outfilename)) 628 | 629 | csv_text = [csv_convert(x) for x in parsed_text] 630 | print(csv_text) 631 | write_file(csv_text, outfilename) 632 | elif args["single"]: 633 | all_text += parsed_text 634 | # Newline added between multiple files being aggregated 635 | if len(infilenames) > 1 and i < len(infilenames) - 1: 636 | all_text.append("\n") 637 | 638 | # Write all text to a single output file 639 | if args["single"] and all_text: 640 | if not args["quiet"]: 641 | print( 642 | "Attempting to write {0} page(s) to {1}.".format( 643 | len(infilenames), outfilename 644 | ) 645 | ) 646 | 647 | csv_text = [csv_convert(x) for x in all_text] 648 | print(csv_text) 649 | write_file(csv_text, outfilename) 650 | 651 | 652 | def write_text_files(args, infilenames, outfilename): 653 | """Write text file(s) to disk. 654 | 655 | Keyword arguments: 656 | args -- program arguments (dict) 657 | infilenames -- names of user-inputted and/or downloaded files (list) 658 | outfilename -- name of output text file (str) 659 | """ 660 | if not outfilename.endswith(".txt"): 661 | outfilename = outfilename + ".txt" 662 | outfilename = overwrite_file_check(args, outfilename) 663 | 664 | all_text = [] # Text must be aggregated if writing to a single output file 665 | for i, infilename in enumerate(infilenames): 666 | parsed_text = get_parsed_text(args, infilename) 667 | if parsed_text: 668 | if args["multiple"]: 669 | if not args["quiet"]: 670 | print("Attempting to write to {0}.".format(outfilename)) 671 | write_file(parsed_text, outfilename) 672 | elif args["single"]: 673 | all_text += parsed_text 674 | # Newline added between multiple files being aggregated 675 | if len(infilenames) > 1 and i < len(infilenames) - 1: 676 | all_text.append("\n") 677 | 678 | # Write all text to a single output file 679 | if args["single"] and all_text: 680 | if not args["quiet"]: 681 | print( 682 | "Attempting to write {0} page(s) to {1}.".format( 683 | len(infilenames), outfilename 684 | ) 685 | ) 686 | write_file(all_text, outfilename) 687 | 688 | 689 | def write_file(data, outfilename): 690 | """Write a single file to disk.""" 691 | if not data: 692 | return False 693 | try: 694 | with open(outfilename, "w") as outfile: 695 | for line in data: 696 | if line: 697 | outfile.write(line) 698 | return True 699 | except (OSError, IOError) as err: 700 | sys.stderr.write( 701 | "An error occurred while writing {0}:\n{1}".format(outfilename, str(err)) 702 | ) 703 | return False 704 | 705 | 706 | def get_num_part_files(): 707 | """Get the number of PART.html files currently saved to disk.""" 708 | num_parts = 0 709 | for filename in 
os.listdir(os.getcwd()): 710 | if filename.startswith("PART") and filename.endswith(".html"): 711 | num_parts += 1 712 | return num_parts 713 | 714 | 715 | def write_part_images(url, raw_html, html, filename): 716 | """Write image file(s) associated with HTML to disk, substituting filenames. 717 | 718 | Keyword arguments: 719 | url -- the URL from which the HTML was extracted (str) 720 | raw_html -- unparsed HTML file content (str) 721 | html -- parsed HTML file content (lxml.html.HtmlElement) 722 | filename -- the PART.html filename (str) 723 | 724 | Return raw HTML with image names replaced with local image filenames. 725 | """ 726 | save_dirname = "{0}_files".format(os.path.splitext(filename)[0]) 727 | if not os.path.exists(save_dirname): 728 | os.makedirs(save_dirname) 729 | images = html.xpath("//img/@src") 730 | internal_image_urls = [x for x in images if x.startswith("/")] 731 | 732 | headers = {"User-Agent": random.choice(USER_AGENTS)} 733 | for img_url in images: 734 | img_name = img_url.split("/")[-1] 735 | if "?" in img_name: 736 | img_name = img_name.split("?")[0] 737 | if not os.path.splitext(img_name)[1]: 738 | img_name = "{0}.jpeg".format(img_name) 739 | 740 | try: 741 | full_img_name = os.path.join(save_dirname, img_name) 742 | with open(full_img_name, "wb") as img: 743 | if img_url in internal_image_urls: 744 | # Internal images need base url added 745 | full_img_url = "{0}{1}".format(url.rstrip("/"), img_url) 746 | else: 747 | # External image 748 | full_img_url = img_url 749 | img_content = requests.get( 750 | full_img_url, headers=headers, proxies=get_proxies() 751 | ).content 752 | img.write(img_content) 753 | raw_html = raw_html.replace(escape(img_url), full_img_name) 754 | except (OSError, IOError): 755 | pass 756 | time.sleep(random.uniform(0, 0.5)) # Slight delay between downloads 757 | return raw_html 758 | 759 | 760 | def write_part_file(args, url, raw_html, html=None, part_num=None): 761 | """Write PART.html file(s) to disk, images in PART_files directory. 
762 | 763 | Keyword arguments: 764 | args -- program arguments (dict) 765 | raw_html -- unparsed HTML file content (str) 766 | html -- parsed HTML file content (lxml.html.HtmlElement) (default: None) 767 | part_num -- PART(#).html file number (int) (default: None) 768 | """ 769 | if part_num is None: 770 | part_num = get_num_part_files() + 1 771 | filename = "PART{0}.html".format(part_num) 772 | 773 | # Decode bytes to str under Python 3 774 | if not PY2 and isinstance(raw_html, bytes): 775 | raw_html = raw_html.decode("ascii", "ignore") 776 | 777 | # Convert html to an lh.HtmlElement object for parsing/saving images 778 | if html is None: 779 | html = lh.fromstring(raw_html) 780 | 781 | # Parse HTML if XPath entered 782 | if args["xpath"]: 783 | raw_html = parse_html(html, args["xpath"]) 784 | if isinstance(raw_html, list): 785 | if not isinstance(raw_html[0], lh.HtmlElement): 786 | raise ValueError("XPath should return an HtmlElement object.") 787 | else: 788 | if not isinstance(raw_html, lh.HtmlElement): 789 | raise ValueError("XPath should return an HtmlElement object.") 790 | 791 | # Write HTML and possibly images to disk 792 | if raw_html: 793 | if not args["no_images"] and (args["pdf"] or args["html"]): 794 | raw_html = write_part_images(url, raw_html, html, filename) 795 | with open(filename, "w") as part: 796 | if not isinstance(raw_html, list): 797 | raw_html = [raw_html] 798 | if isinstance(raw_html[0], lh.HtmlElement): 799 | for elem in raw_html: 800 | part.write(lh.tostring(elem, encoding="unicode")) 801 | else: 802 | for line in raw_html: 803 | part.write(line) 804 | 805 | 806 | def get_part_filenames(num_parts=None, start_num=0): 807 | """Get numbered PART.html filenames.""" 808 | if num_parts is None: 809 | num_parts = get_num_part_files() 810 | return ["PART{0}.html".format(i) for i in range(start_num + 1, num_parts + 1)] 811 | 812 | 813 | def read_files(filenames): 814 | """Read a file into memory; with a list, return the first file's contents.""" 815 | if isinstance(filenames, list): 816 | for filename in filenames: 817 | with open(filename, "r") as infile: 818 | return infile.read() 819 | else: 820 | with open(filenames, "r") as infile: 821 | return infile.read() 822 | 823 | 824 | def remove_part_images(filename): 825 | """Remove PART(#)_files directory containing images from disk.""" 826 | dirname = "{0}_files".format(os.path.splitext(filename)[0]) 827 | if os.path.exists(dirname): 828 | shutil.rmtree(dirname) 829 | 830 | 831 | def remove_part_files(num_parts=None): 832 | """Remove PART(#).html files and image directories from disk.""" 833 | filenames = get_part_filenames(num_parts) 834 | for filename in filenames: 835 | remove_part_images(filename) 836 | remove_file(filename) 837 | 838 | 839 | # User input and sanitization functions 840 | # 841 | 842 | 843 | def confirm_input(user_input): 844 | """Check user input for yes, no, or an exit signal.""" 845 | if isinstance(user_input, list): 846 | user_input = "".join(user_input) 847 | 848 | try: 849 | u_inp = user_input.lower().strip() 850 | except AttributeError: 851 | u_inp = user_input 852 | 853 | # Check for exit signal 854 | if u_inp in ("q", "quit", "exit"): 855 | sys.exit() 856 | if u_inp in ("y", "yes"): 857 | return True 858 | return False 859 | 860 | 861 | # Miscellaneous functions 862 | # 863 | 864 | 865 | def mkdir_and_cd(dirname): 866 | """Change directory and/or create it if necessary.""" 867 | if not os.path.exists(dirname): 868 | os.makedirs(dirname) 869 | os.chdir(dirname) 870 | else: 871 | os.chdir(dirname) 872 | 
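# --- Illustrative usage sketch (editor's addition, not part of utils.py) ---
# A minimal look at how the URL and filename helpers defined above compose,
# assuming the package layout shown in this repository (scrape/utils.py is
# importable as scrape.utils). The input values are hypothetical; the expected
# outputs in the comments are derived from the function bodies above.

from scrape import utils

url = utils.add_protocol(utils.add_url_suffix("example"))
print(url)  # http://example.com (".com" appended, then "http://" prefixed)
print(utils.clean_url("/docs/page#intro", base_url=url))  # http://example.com/docs/page
print(utils.get_outfilename(url + "/docs/getting-started.html"))  # example-getting-started
print(utils.modify_filename_id("report.txt"))  # report (2).txt
print(utils.modify_filename_id("report (2).txt"))  # report (3).txt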
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | import scrape 5 | import os 6 | 7 | 8 | def read(*names): 9 | values = dict() 10 | extensions = [".txt", ".rst"] 11 | for name in names: 12 | value = "" 13 | for extension in extensions: 14 | filename = name + extension 15 | if os.path.isfile(filename): 16 | value = open(name + extension).read() 17 | break 18 | values[name] = value 19 | return values 20 | 21 | 22 | with open( 23 | os.path.join(os.path.abspath(os.path.dirname(__file__)), "README.rst"), 24 | encoding="utf-8", 25 | ) as f: 26 | long_description = f.read() 27 | 28 | 29 | setup( 30 | name="scrape", 31 | version=scrape.__version__, 32 | description="a command-line web scraping tool", 33 | long_description=long_description, 34 | long_description_content_type="text/x-rst", 35 | classifiers=[ 36 | "Development Status :: 4 - Beta", 37 | "Environment :: Console", 38 | "Environment :: Web Environment", 39 | "Intended Audience :: End Users/Desktop", 40 | "Intended Audience :: Developers", 41 | "Intended Audience :: System Administrators", 42 | "License :: OSI Approved :: MIT License", 43 | "Operating System :: OS Independent", 44 | "Programming Language :: Python :: 3", 45 | "Programming Language :: Python :: 3.6", 46 | "Programming Language :: Python :: 3.7", 47 | "Programming Language :: Python :: 3.8", 48 | "Programming Language :: Python :: 3.9", 49 | "Topic :: Utilities", 50 | "Topic :: Text Processing", 51 | ], 52 | keywords="web crawler scraper scrape crawl download filter save webpages websites images docs document documentation pdf csv html lxml", 53 | author="Hunter H", 54 | author_email="huntrar@gmail.com", 55 | maintainer="Hunter H", 56 | maintainer_email="huntrar@gmail.com", 57 | url="https://github.com/huntrar/scrape", 58 | license="MIT", 59 | packages=find_packages(), 60 | entry_points={"console_scripts": ["scrape = scrape.scrape:command_line_runner"]}, 61 | install_requires=["lxml", "pdfkit", "requests", "six", "tldextract"], 62 | ) 63 | -------------------------------------------------------------------------------- /testing/admissions.html: -------------------------------------------------------------------------------- 1 | 2 | ADMISSION TO ONLINE COLLEGE 3 |

4 | Aplicants are considered for admission to Online College 5 | on the basis of their ISP, quality of their home pages and 6 | quantity of emails exchanged per day. 7 |

8 | It is recommended that students prepare for enrollment in 9 | Online College by signing up for DSL service and 10 | buying a new computer. 11 |

12 | Back to Online College home page 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /testing/courses.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | ONLINE COLLEGE COURSES: 5 |

6 | Online College offers degrees in 7 | beginning and advanced Web-surfing, 8 | email writing courses and the sociology of 9 | chat rooms. 10 |

11 | Online College provides its students with many 12 | opportunities for extracurricular activities.
Check 13 | the admissions criteria to see if 14 | you qualify to enroll in Online College. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /testing/dropouts.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Sorry, there are no dropouts from Online College! 4 | Maybe they are too busy with 5 | extracurricular activities to find the 6 | time to drop out. 7 |
8 | home 9 | 10 | 11 | -------------------------------------------------------------------------------- /testing/extra.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Students at Online College 4 | participate 5 | in 6 | 7 | 8 | a large number of extracurricular 9 | activities.

The favorite activities are 10 | dating, partying, and doing laundry. 11 |
Online College is proud to be the 12 | national leader in cutting classes. 13 | 14 | To see why, check out our course 15 | offerings.

16 | Back to Online Home page 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /testing/faculty.html: -------------------------------------------------------------------------------- 1 | 2 | Comments from faculty at Online College: 3 | 4 |

"Online College students are exceptionally diligent. 5 | They take Web-surfing very seriously."

6 | "Students seem to prefer sending me emails to submitting 7 | term papers." 8 |

back to Online College home page 9 |

10 | Do you want to hear from out dropouts? 11 | 12 | -------------------------------------------------------------------------------- /testing/home.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | WELCOME TO ONLINE COLLEGE! 4 |

5 | Online College is a senior college in the Online University. 6 | More than 200,000 students are not enrolled in the academic 7 | programs offered through the Online University. 8 |

9 | Prospective students should apply for admission. 10 |
11 | We offer a full schedule of undergraduate 12 | courses as well as extracurricular 13 | ativities. 14 |

You can read testimonials from 15 | students, 16 | faculty and dropouts. 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /testing/students.html: -------------------------------------------------------------------------------- 1 | See what our students have to 2 | say 3 | about 4 | Online 5 | College:

6 | 7 | "Online College is the best! My professors showed me lots of 8 | cool Web sites and were always available for a chat." 9 |

10 | "My favorite course is Physics of Cyberspace." 11 |

12 | "When you take virtual classes, you don't have to watch anyone 13 | yawn." 14 |

15 |

16 | Try to contact our alumnae. 17 | Tell them we would appreciate a donation. 18 |

19 | 20 | Back to Online College home page 21 | -------------------------------------------------------------------------------- /testing/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huntrar/scrape/1dfd98bb0a308ef2a45b1e5dd136c38b17c27bc7/testing/test.pdf -------------------------------------------------------------------------------- /testing/test.txt: -------------------------------------------------------------------------------- 1 | ADMISSION TO ONLINE COLLEGE 2 | Aplicants are considered for admission to Online College 3 | on the basis of their ISP, quality of their home pages and 4 | quantity of emails exchanged per day. 5 | It is recommended that students prepare for enrollment in 6 | Online College by signing up for DSL service and 7 | buying a new computer. 8 | Back to Online College home page 9 | 10 | ONLINE COLLEGE COURSES: 11 | Online College offers degrees in 12 | beginning and advanced Web-surfing, 13 | email writing courses and the sociology of 14 | chat rooms. 15 | Online College provides its students with many 16 | opportunities for 17 | extracurricular activities 18 | . 19 | Check 20 | the 21 | admissions criteria 22 | to see if 23 | you qualify to enroll in Online College. 24 | 25 | Sorry, there are no dropouts from Online College! 26 | Maybe they are too busy with 27 | extracurricular activities 28 | to find the 29 | time to drop out. 30 | home 31 | 32 | Students at Online College 33 | participate 34 | in 35 | 36 | a large number of extracurricular 37 | activities. 38 | The favorite activities are 39 | dating, partying, and doing laundry. 40 | Online College is proud to be the 41 | national leader in cutting classes. 42 | To see why, check out our 43 | course 44 | offerings 45 | . 46 | Back to Online Home page 47 | 48 | Comments from faculty at Online College: 49 | "Online College students are exceptionally diligent. 50 | They take Web-surfing very seriously." 51 | "Students seem to prefer sending me emails to submitting 52 | term papers." 53 | back to Online College home page 54 | Do you want to hear from out 55 | dropouts? 56 | 57 | WELCOME TO ONLINE COLLEGE! 58 | Online College is a senior college in the Online University. 59 | More than 200,000 students are not enrolled in the academic 60 | programs offered through the Online University. 61 | Prospective students should apply for 62 | admission 63 | . 64 | We offer a full schedule of 65 | undergraduate 66 | courses 67 | as well as 68 | extracurricular 69 | ativities. 70 | You can read testimonials from 71 | students, 72 | faculty 73 | and 74 | dropouts 75 | . 76 | 77 | See what our students have to 78 | say 79 | about 80 | Online 81 | College: 82 | "Online College is the best! My professors showed me lots of 83 | cool Web sites and were always available for a chat." 84 | "My favorite course is Physics of Cyberspace." 85 | "When you take virtual classes, you don't have to watch anyone 86 | yawn." 87 | Try to contact our 88 | alumnae. 89 | Tell them we would appreciate a donation. 90 | Back to Online College home page 91 | 92 | One, two 93 | Buckle my shoe. 94 | 95 | Three, four 96 | Shut the door. 97 | Five, six 98 | Pick up sticks. -------------------------------------------------------------------------------- /testing/test1.html: -------------------------------------------------------------------------------- 1 | One, two 2 | Buckle my shoe. 3 | < P > 4 | Three, four 5 | Shut the door. 6 | Five, six 7 | Pick up sticks. 
8 | -------------------------------------------------------------------------------- /testing/test_scrape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Unit tests for scrape""" 4 | 5 | import os 6 | import shutil 7 | import sys 8 | import unittest 9 | 10 | from scrape import scrape, utils 11 | 12 | 13 | class ScrapeTestCase(unittest.TestCase): 14 | def call_scrape(self, cmd, filetype, num_files=None): 15 | if not isinstance(cmd, list): 16 | cmd = [cmd] 17 | parser = scrape.get_parser() 18 | args = vars(parser.parse_args(cmd)) 19 | 20 | args["overwrite"] = True # Avoid overwrite prompt 21 | if args["crawl"] or args["crawl_all"]: 22 | args["no_images"] = True # Avoid save image prompt when crawling 23 | args[filetype] = True 24 | if num_files is not None: 25 | args[num_files] = True 26 | return scrape.scrape(args) 27 | 28 | def setUp(self): 29 | self.original_files = os.listdir(os.getcwd()) 30 | self.html_files = [x for x in self.original_files if x.endswith(".html")] 31 | self.text_files = [x for x in self.original_files if x.endswith(".txt")] 32 | self.query = self.html_files + self.text_files 33 | 34 | def tearDown(self): 35 | pass 36 | 37 | def assert_exists_and_rm(self, filename): 38 | self.assertTrue(os.path.isfile(filename)) 39 | if filename not in self.original_files: 40 | self.assertTrue(utils.remove_file(filename)) 41 | 42 | def delete_subdir(self, domain): 43 | """Delete subdirectory containing HTML files if no other data in it""" 44 | subdir_path = "{0}/{1}".format(os.getcwd(), domain) 45 | files = os.listdir(subdir_path) 46 | files_to_rm = [x for x in files if x.startswith("PART") and x.endswith(".html")] 47 | 48 | if len(files_to_rm) != len(files): 49 | for filename in files_to_rm: 50 | os.remove(filename) 51 | else: 52 | shutil.rmtree(subdir_path) 53 | 54 | def get_single_outfilename(self, query): 55 | """Use first possible entry in query as filename""" 56 | if not isinstance(query, list): 57 | query = [query] 58 | for arg in query: 59 | if arg in self.html_files or arg in self.text_files: 60 | return (".".join(arg.split(".")[:-1])).lower() 61 | sys.stderr.write("Failed to construct a single out filename.\n") 62 | return "" 63 | 64 | """to_pdf functions require wkhtmltopdf executable to run 65 | def test_query_to_multi_pdf(self): 66 | self.call_scrape(self.query, 'pdf', 'multiple') 67 | for filename in self.html_files + self.text_files: 68 | outfilename = '.'.join(filename.split('.')[:-1]) + '.pdf' 69 | self.assert_exists_and_rm(outfilename) 70 | 71 | def test_query_to_single_pdf(self): 72 | self.call_scrape(self.query, 'pdf', 'single') 73 | outfilename = self.get_single_outfilename(self.query) + '.pdf' 74 | self.assert_exists_and_rm(outfilename) 75 | 76 | def test_html_to_pdf(self): 77 | self.call_scrape(self.html_files, 'pdf') 78 | outfilenames = [x.replace('.html', '.pdf') for x in self.html_files] 79 | 80 | # Assert new files have been created, then assert their deletion 81 | for outfilename in outfilenames: 82 | self.assert_exists_and_rm(outfilename) 83 | 84 | def test_text_to_pdf(self): 85 | self.call_scrape(self.text_files, 'pdf') 86 | outfilenames = [x.replace('.txt', '.pdf') for x in self.text_files] 87 | 88 | # Assert new files have been created, then assert their deletion 89 | for outfilename in outfilenames: 90 | self.assert_exists_and_rm(outfilename) 91 | """ 92 | 93 | def test_query_to_multi_text(self): 94 | self.call_scrape(self.query, "text", "multiple") 95 | for filename in 
self.html_files + self.text_files: 96 | outfilename = ".".join(filename.split(".")[:-1]) + ".txt" 97 | self.assert_exists_and_rm(outfilename) 98 | 99 | def test_query_to_single_text(self): 100 | self.call_scrape(self.query, "text", "single") 101 | outfilename = self.get_single_outfilename(self.query) + ".txt" 102 | self.assert_exists_and_rm(outfilename) 103 | 104 | def test_html_to_text(self): 105 | self.call_scrape(self.html_files, "text") 106 | outfilenames = [x.replace(".html", ".txt") for x in self.html_files] 107 | 108 | # Assert new files have been created, then assert their deletion 109 | for outfilename in outfilenames: 110 | self.assert_exists_and_rm(outfilename) 111 | 112 | 113 | if __name__ == "__main__": 114 | unittest.main() 115 | --------------------------------------------------------------------------------
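# --- Illustrative usage sketch (editor's addition, not part of test_scrape.py) ---
# The tests above drive scrape programmatically through call_scrape(); the sketch
# below mirrors that pattern outside the test harness, assuming the entry points
# shown in this repository (scrape.get_parser and scrape.scrape). The input
# filenames are stand-ins for local HTML pages such as the fixtures above, and
# the flag keys ("overwrite", "text", "multiple") are the same ones call_scrape()
# sets on the parsed argument dict.

from scrape import scrape

parser = scrape.get_parser()
args = vars(parser.parse_args(["home.html", "courses.html"]))
args["overwrite"] = True  # skip the interactive overwrite prompt, as the tests do
args["text"] = True       # request .txt output (the filetype flag used by the text tests)
args["multiple"] = True   # write one output file per input file
scrape.scrape(args)

# The suite itself can be run from the testing/ directory, where the fixture
# files it collects in setUp() live, e.g.:
#     python test_scrape.py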