├── Chapter04_FirstWebScraper.ipynb ├── Chapter05_AdvancedHTMLParsing.ipynb ├── Chapter06_Web-Crawlers.ipynb ├── Chapter07_CrawlingModels.ipynb ├── Chapter08_Scrapy └── wikiSpider │ ├── build │ └── lib │ │ └── wikiSpider │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ ├── article.py │ │ ├── articleItems.py │ │ ├── articlePipelines.py │ │ ├── articles.py │ │ └── articlesMoreRules.py │ ├── project.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── entry_points.txt │ └── top_level.txt │ ├── scrapinghub.yml │ ├── scrapy.cfg │ ├── setup.py │ └── wikiSpider │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── items.cpython-39.pyc │ ├── pipelines.cpython-39.pyc │ └── settings.cpython-39.pyc │ ├── articles.csv │ ├── articles.json │ ├── articles.xml │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── article.cpython-39.pyc │ ├── articleItems.cpython-39.pyc │ ├── articlePipelines.cpython-39.pyc │ └── articlesMoreRules.cpython-39.pyc │ ├── article.py │ ├── articleItems.py │ ├── articlePipelines.py │ ├── articles.py │ └── articlesMoreRules.py ├── Chapter09_StoringData.ipynb ├── Chapter10_ReadingDocuments.ipynb ├── Chapter11_CleaningYourDirtyData.ipynb ├── Chapter12_NaturalLanguages.ipynb ├── Chapter13_CrawlingThroughFormsAndLogins.ipynb ├── Chapter14_JavaScript.ipynb ├── Chapter15_API-Scraping.ipynb ├── Chapter16_ImageProcessing.ipynb ├── Chapter16_ImageProcessingFiles ├── .DS_Store ├── kitten.jpg ├── kitten_blurred.jpg ├── test.png ├── text.png ├── text_2.png ├── text_bad.png ├── text_cleaned.png └── textoutput.txt ├── Chapter16_Parallel ├── multiprocess.py ├── multiprocess_example.py ├── multiprocess_queue.py ├── multithreaded.py ├── multithreaded_class.py ├── multithreaded_example.py ├── multithreaded_queue.py ├── threading_crawler.py └── threading_example.py ├── Chapter17_ScrapingTraps.ipynb ├── Chapter18_Testing.ipynb ├── Chapter19_WebScrapingInParallel.ipynb ├── Chapter20_WebScrapingProxies.ipynb ├── README.md ├── captcha.png ├── captchas ├── .DS_Store ├── 2F8S.png ├── 2PHU.png ├── 2XMT.png ├── 34PQ.png ├── 37QR.png ├── 3A68.png ├── 3BE8.png ├── 3DXJ.png ├── 3FDB.png ├── 3FK7.png ├── 3JW5.png ├── 3LNK.png ├── 3QZZ.png ├── 3RW7.png ├── 3TCH.png ├── 3YL8.png ├── 3ZV3.png ├── 46VL.png ├── 49K7.png ├── 4AUM.png ├── 4QXS.png ├── 4VHT.png ├── 4WSU.png ├── 52X6.png ├── 56AZ.png ├── 65KQ.png ├── 696R.png ├── 6DM4.png ├── 6MGR.png ├── 6PQ8.png ├── 7994.png ├── 7B63.png ├── 7CA2.png ├── 7HSD.png ├── 7MGH.png ├── 7R6J.png ├── 7RK3.png ├── 7VUC.png ├── 7X8F.png ├── 7Y4S.png ├── 832C.png ├── 83CA.png ├── 8696.png ├── 88MU.png ├── 8D8L.png ├── 8MB6.png ├── 8N2Q.png ├── 8N6D.png ├── 8NMS.png ├── 8PRQ.png ├── 8XAQ.png ├── 8YEP.png ├── 9D6N.png ├── 9J8K.png ├── 9J9F.png ├── A23U.png ├── A5HM.png ├── ACQC.png ├── ADU5.png ├── AK6F.png ├── ALX2.png ├── APAR.png ├── AQF2.png ├── ASMW.png ├── BGKH.png ├── BX48.png ├── C6TJ.png ├── CFGF.png ├── CQ34.png ├── CRET.png ├── CX5M.png ├── D675.png ├── DCSR.png ├── DJFF.png ├── DPML.png ├── DTKQ.png ├── DU9H.png ├── DZQW.png ├── E34Y.png ├── E88R.png ├── EASL.png ├── EFZZ.png ├── EJZV.png ├── EKJC.png ├── EMS3.png ├── ERU6.png ├── EW7Q.png ├── EYPK.png ├── FAAS.png ├── FFNS.png ├── FNT9.png ├── FP6Z.png ├── FPL3.png ├── FUYF.png ├── FX8M.png ├── GEV7.png ├── GQ7W.png ├── GSAZ.png ├── GVPA.png ├── GWH9.png ├── GZ45.png ├── H2U5.png ├── 
HCEA.png ├── HF4F.png ├── HH9N.png ├── HKUM.png ├── JTM7.png ├── K3WQ.png ├── K4U4.png ├── KE7H.png ├── KQCT.png ├── KUR6.png ├── L4V8.png ├── L95D.png ├── LADE.png ├── LDS9.png ├── LH74.png ├── MBE7.png ├── MD5K.png ├── MKTX.png ├── MMB9.png ├── N82K.png ├── NB7Y.png ├── NEQT.png ├── NNMB.png ├── NYJE.png ├── P9UC.png ├── PC5N.png ├── PCEV.png ├── PK7W.png ├── PQWA.png ├── PWF9.png ├── QDKW.png ├── QJJX.png ├── QLAX.png ├── QPP7.png ├── QR3C.png ├── QTHL.png ├── QTP6.png ├── QUEB.png ├── QX4A.png ├── QYWB.png ├── R66E.png ├── RKE5.png ├── RLZ7.png ├── RQTM.png ├── RSE8.png ├── S2UB.png ├── S5QK.png ├── S8Z8.png ├── SAAM.png ├── SZ34.png ├── T2CS.png ├── T46Y.png ├── TJ8H.png ├── TJZS.png ├── TLRX.png ├── TNBP.png ├── TUCS.png ├── TVLQ.png ├── U6GN.png ├── U7CH.png ├── U9EH.png ├── UGA6.png ├── UKXW.png ├── UNKE.png ├── UQZE.png ├── USE7.png ├── UUFN.png ├── V3YY.png ├── V9AH.png ├── VBUM.png ├── VCMC.png ├── VCUD.png ├── VDS5.png ├── VFC5.png ├── VTWE.png ├── W43T.png ├── WAP7.png ├── WH3C.png ├── WKU5.png ├── X5ZS.png ├── X7D4.png ├── XE48.png ├── XER2.png ├── XFCT.png ├── XPGT.png ├── XU8Y.png ├── Y2Z3.png ├── Y5L5.png ├── YAGV.png ├── YEZY.png ├── YGV4.png ├── YMZM.png ├── YRQ8.png ├── YSRA.png ├── ZCBP.png ├── ZNYD.png ├── ZTNL.png └── ZXBW.png ├── downloaded ├── cdn.oreillystatic.com │ ├── ajax │ │ └── libs │ │ │ └── jquery │ │ │ └── 3.3.1 │ │ │ ├── jquery.min.js │ │ │ └── jquery.min.js? │ ├── images │ │ ├── dei │ │ │ ├── deij-odot.svg │ │ │ └── deij-odot.svg? │ │ └── sitewide-headers │ │ │ ├── oreilly_logo_mark_red.svg │ │ │ └── oreilly_logo_mark_red.svg? │ └── oreilly │ │ └── images │ │ ├── amazon-appstore-logo.png │ │ ├── amazon-appstore-logo.png? │ │ ├── app-store-logo.png │ │ ├── app-store-logo.png? │ │ ├── cert-vendor-logos.png │ │ ├── cert-vendor-logos.png? │ │ ├── google-play-logo.png │ │ ├── google-play-logo.png? │ │ ├── home-video-testimonial-thumb1-711x400-20221020.jpg │ │ ├── home-video-testimonial-thumb1-711x400-20221020.jpg? │ │ ├── home-video-testimonial-thumb2-400x225-20221019.jpg │ │ ├── home-video-testimonial-thumb2-400x225-20221019.jpg? │ │ ├── home-video-testimonial-thumb3-711x400-20230201.jpg │ │ ├── home-video-testimonial-thumb3-711x400-20230201.jpg? │ │ ├── home_plot3_lot_600x600.jpg │ │ ├── home_plot3_lot_600x600.jpg? │ │ ├── interactive_laptop_780x426.png │ │ ├── interactive_laptop_780x426.png? │ │ ├── laptop-flat-courses-20230228.png │ │ ├── laptop-flat-courses-20230228.png? │ │ ├── roku-tv-logo.png │ │ └── roku-tv-logo.png? ├── cdnapisec.kaltura.com │ └── p │ │ └── 1681692 │ │ └── sp │ │ └── 168169200 │ │ └── embedIframeJs │ │ └── uiconf_id │ │ └── 47268383 │ │ └── partner_id │ │ ├── 1681692 │ │ └── 1681692? ├── pythonscraping.com │ ├── wp-content │ │ ├── plugins │ │ │ └── pagelayer │ │ │ │ └── js │ │ │ │ ├── combined.js │ │ │ │ └── combined.js?ver=1.5.9 │ │ ├── themes │ │ │ └── popularfx │ │ │ │ └── js │ │ │ │ ├── navigation.js │ │ │ │ └── navigation.js?ver=1.2.0 │ │ └── uploads │ │ │ ├── 2021 │ │ │ └── 08 │ │ │ │ ├── home1.jpg │ │ │ │ ├── home1.jpg? │ │ │ │ ├── logo01-e1681353135199.png │ │ │ │ └── logo01-e1681353135199.png? │ │ │ └── 2023 │ │ │ └── 04 │ │ │ ├── python-logo-e1681354047443.png │ │ │ └── python-logo-e1681354047443.png? 
│ └── wp-includes │ │ └── js │ │ └── jquery │ │ ├── jquery-migrate.min.js │ │ ├── jquery-migrate.min.js?ver=3.4.0 │ │ ├── jquery.min.js │ │ └── jquery.min.js?ver=3.6.3 └── www.googletagmanager.com │ ├── ns.html │ └── ns.html?id=GTM-5P4V6Z ├── drivers ├── .DS_Store └── chromedriver_mac_arm64 │ ├── LICENSE.chromedriver │ └── chromedriver ├── editors.csv ├── foo.pdf ├── logo.jpg ├── output.txt ├── page.jpg ├── pythonscraping.png ├── result.html └── test.csv /Chapter04_FirstWebScraper.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Writing Your First Web Scraper" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "b'<html>\\n<head>\\n<title>A Useful Page</title>\\n</head>\\n<body>\\n
<h1>An Interesting Title</h1>\\n<div>\\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\\n</div>
\\n</body>\\n</html>\\n'\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "from urllib.request import urlopen\n", 25 | "\n", 26 | "html = urlopen('http://pythonscraping.com/pages/page1.html')\n", 27 | "print(html.read())" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "
<h1>An Interesting Title</h1>
\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "from urllib.request import urlopen\n", 45 | "from bs4 import BeautifulSoup\n", 46 | "\n", 47 | "html = urlopen('http://www.pythonscraping.com/pages/page1.html')\n", 48 | "bs = BeautifulSoup(html.read(), 'html.parser')\n", 49 | "print(bs.h1)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 34, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "[]" 61 | ] 62 | }, 63 | "execution_count": 34, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "from urllib.request import urlopen\n", 70 | "from bs4 import BeautifulSoup\n", 71 | "\n", 72 | "html = urlopen('https://en.wikipedia.org/wiki/Iron_Gwazi')\n", 73 | "bs = BeautifulSoup(html.read(), 'html.parser')\n", 74 | "# 'class':['mw-file-description']\n", 75 | "#bs.find_all(attrs={'class': ['mw-ui-icon-wikimedia-listBullet', 'vector-icon']})\n", 76 | "\n", 77 | "bs.find_all(_class='mw-ui-icon-wikimedia-listBullet')" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 2, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "The server could not be found!\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "from urllib.request import urlopen\n", 95 | "from urllib.error import HTTPError\n", 96 | "from urllib.error import URLError\n", 97 | "\n", 98 | "try:\n", 99 | " html = urlopen(\"https://pythonscrapingthisurldoesnotexist.com\")\n", 100 | "except HTTPError as e:\n", 101 | " print(\"The server returned an HTTP error\")\n", 102 | "except URLError as e:\n", 103 | " print(\"The server could not be found!\")\n", 104 | "else:\n", 105 | " print(html.read())" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 5, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "
<h1>An Interesting Title</h1>
\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "from urllib.request import urlopen\n", 123 | "from urllib.error import HTTPError\n", 124 | "from bs4 import BeautifulSoup\n", 125 | "\n", 126 | "\n", 127 | "def getTitle(url):\n", 128 | " try:\n", 129 | " html = urlopen(url)\n", 130 | " except HTTPError as e:\n", 131 | " return None\n", 132 | " try:\n", 133 | " bsObj = BeautifulSoup(html.read(), \"lxml\")\n", 134 | " title = bsObj.body.h1\n", 135 | " except AttributeError as e:\n", 136 | " return None\n", 137 | " return title\n", 138 | "\n", 139 | "\n", 140 | "title = getTitle(\"http://www.pythonscraping.com/pages/page1.html\")\n", 141 | "if title == None:\n", 142 | " print(\"Title could not be found\")\n", 143 | "else:\n", 144 | " print(title)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [] 155 | } 156 | ], 157 | "metadata": { 158 | "kernelspec": { 159 | "display_name": "Python 3 (ipykernel)", 160 | "language": "python", 161 | "name": "python3" 162 | }, 163 | "language_info": { 164 | "codemirror_mode": { 165 | "name": "ipython", 166 | "version": 3 167 | }, 168 | "file_extension": ".py", 169 | "mimetype": "text/x-python", 170 | "name": "python", 171 | "nbconvert_exporter": "python", 172 | "pygments_lexer": "ipython3", 173 | "version": "3.9.12" 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 2 178 | } 179 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/build/lib/wikiSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter08_Scrapy/wikiSpider/build/lib/wikiSpider/__init__.py -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/build/lib/wikiSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class Article(scrapy.Item): 12 | url = scrapy.Field() 13 | title = scrapy.Field() 14 | text = scrapy.Field() 15 | lastUpdated = scrapy.Field() 16 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/build/lib/wikiSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class WikispiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/build/lib/wikiSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from wikiSpider.items import Article 3 | from string import whitespace 4 | 5 | class WikispiderPipeline(object): 6 | def process_item(self, article, spider): 7 | article['lastUpdated'] = article['lastUpdated'].replace('This page was last edited on', '') 8 | article['lastUpdated'] = article['lastUpdated'].strip() 9 | article['lastUpdated'] = datetime.strptime(article['lastUpdated'], '%d %B %Y, at %H:%M.') 10 | article['text'] = [line for line in article['text'] if line not in whitespace] 11 | article['text'] = ''.join(article['text']) 12 | return article 13 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/build/lib/wikiSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for wikiSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'wikiSpider' 13 | 14 | SPIDER_MODULES = ['wikiSpider.spiders'] 15 | NEWSPIDER_MODULE = 'wikiSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'wikiSpider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'wikiSpider.middlewares.WikispiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'wikiSpider.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'wikiSpider.pipelines.WikispiderPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/build/lib/wikiSpider/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/build/lib/wikiSpider/spiders/article.py: -------------------------------------------------------------------------------- 1 | from scrapy import Spider, Request 2 | 3 | class ArticleSpider(Spider): 4 | name='article' 5 | 6 | def start_requests(self): 7 | urls = [ 8 | "http://en.wikipedia.org/wiki/Python_%28programming_language%29", 9 | "https://en.wikipedia.org/wiki/Functional_programming", 10 | "https://en.wikipedia.org/wiki/Monty_Python"] 11 | return [Request(url=url, callback=self.parse) for url in urls] 12 | 13 | def parse(self, response): 14 | url = response.url 15 | title = response.css('h1::text').extract_first() 16 | print('URL is: {}'.format(url)) 17 | print('Title is: {}'.format(title)) 18 | 19 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/build/lib/wikiSpider/spiders/articleItems.py: -------------------------------------------------------------------------------- 1 | from scrapy.linkextractors import LinkExtractor 2 | from scrapy.spiders import CrawlSpider, Rule 3 | from wikiSpider.items import Article 4 | 5 | class ArticleSpider(CrawlSpider): 6 | name = 'articleItems' 7 | allowed_domains = ['wikipedia.org'] 8 | start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life'] 9 | rules = [ 10 | Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'), callback='parse_items', follow=True), 11 | ] 12 | 13 | def parse_items(self, response): 14 | article = Article() 15 | article['url'] = response.url 16 | article['title'] = response.css('h1::text').extract_first() 17 | article['text'] = response.xpath('//div[@id="mw-content-text"]//text()').extract() 18 | lastUpdated = response.css('li#footer-info-lastmod::text').extract_first() 19 | article['lastUpdated'] = lastUpdated.replace('This page was last edited on ', '') 20 | return article -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/build/lib/wikiSpider/spiders/articlePipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.linkextractors import LinkExtractor 2 | from scrapy.spiders import CrawlSpider, Rule 3 | from wikiSpider.items import Article 4 | 5 | class ArticleSpider(CrawlSpider): 6 | name = 'articlePipelines' 7 | allowed_domains = ['wikipedia.org'] 8 | start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life'] 9 | rules = [ 10 | Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'), callback='parse_items', follow=True), 11 | ] 12 | 13 | def parse_items(self, response): 14 | article = Article() 15 | article['url'] = response.url 16 | article['title'] = response.css('h1::text').extract_first() 17 | article['text'] = response.xpath('//div[@id="mw-content-text"]//text()').extract() 18 | article['lastUpdated'] = response.css('li#footer-info-lastmod::text').extract_first() 19 | return article -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/build/lib/wikiSpider/spiders/articles.py: -------------------------------------------------------------------------------- 1 | from scrapy.linkextractors import LinkExtractor 2 | 
from scrapy.spiders import CrawlSpider, Rule 3 | 4 | class ArticleSpider(CrawlSpider): 5 | name = 'articles' 6 | allowed_domains = ['wikipedia.org'] 7 | start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life'] 8 | rules = [Rule(LinkExtractor(allow=r'.*'), callback='parse_items', follow=True)] 9 | 10 | def parse_items(self, response): 11 | url = response.url 12 | title = response.css('h1::text').extract_first() 13 | text = response.xpath('//div[@id="mw-content-text"]//text()').extract() 14 | lastUpdated = response.css('li#footer-info-lastmod::text').extract_first() 15 | lastUpdated = lastUpdated.replace('This page was last edited on ', '') 16 | print('URL is: {}'.format(url)) 17 | print('title is: {} '.format(title)) 18 | print('text is: {}'.format(text)) 19 | print('Last updated: {}'.format(lastUpdated)) 20 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/build/lib/wikiSpider/spiders/articlesMoreRules.py: -------------------------------------------------------------------------------- 1 | from scrapy.linkextractors import LinkExtractor 2 | from scrapy.spiders import CrawlSpider, Rule 3 | 4 | class ArticleSpider(CrawlSpider): 5 | name = 'articles' 6 | allowed_domains = ['wikipedia.org'] 7 | start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life'] 8 | rules = [ 9 | Rule(LinkExtractor(allow='^(/wiki/)((?!:).)*$'), callback='parse_items', follow=True, cb_kwargs={'is_article': True}), 10 | Rule(LinkExtractor(allow='.*'), callback='parse_items', cb_kwargs={'is_article': False}) 11 | ] 12 | 13 | def parse_items(self, response, is_article): 14 | print(response.url) 15 | title = response.css('h1::text').extract_first() 16 | if is_article: 17 | url = response.url 18 | text = response.xpath('//div[@id="mw-content-text"]//text()').extract() 19 | lastUpdated = response.css('li#footer-info-lastmod::text').extract_first() 20 | lastUpdated = lastUpdated.replace('This page was last edited on ', '') 21 | print('Title is: {} '.format(title)) 22 | print('title is: {} '.format(title)) 23 | print('text is: {}'.format(text)) 24 | else: 25 | print('This is not an article: {}'.format(title)) 26 | 27 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/project.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: project 3 | Version: 1.0 4 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/project.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | project.egg-info/PKG-INFO 3 | project.egg-info/SOURCES.txt 4 | project.egg-info/dependency_links.txt 5 | project.egg-info/entry_points.txt 6 | project.egg-info/top_level.txt 7 | wikiSpider/__init__.py 8 | wikiSpider/items.py 9 | wikiSpider/middlewares.py 10 | wikiSpider/pipelines.py 11 | wikiSpider/settings.py 12 | wikiSpider/spiders/__init__.py 13 | wikiSpider/spiders/article.py 14 | wikiSpider/spiders/articleItems.py 15 | wikiSpider/spiders/articlePipelines.py 16 | wikiSpider/spiders/articles.py 17 | wikiSpider/spiders/articlesMoreRules.py -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/project.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | 
-------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/project.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [scrapy] 2 | settings = wikiSpider.settings 3 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/project.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | wikiSpider 2 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | project: 624829 2 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = wikiSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = wikiSpider 12 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name = 'project', 7 | version = '1.0', 8 | packages = find_packages(), 9 | entry_points = {'scrapy': ['settings = wikiSpider.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter08_Scrapy/wikiSpider/wikiSpider/__init__.py -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter08_Scrapy/wikiSpider/wikiSpider/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/__pycache__/items.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter08_Scrapy/wikiSpider/wikiSpider/__pycache__/items.cpython-39.pyc -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/__pycache__/pipelines.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter08_Scrapy/wikiSpider/wikiSpider/__pycache__/pipelines.cpython-39.pyc -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/__pycache__/settings.cpython-39.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter08_Scrapy/wikiSpider/wikiSpider/__pycache__/settings.cpython-39.pyc -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class Article(scrapy.Item): 12 | url = scrapy.Field() 13 | title = scrapy.Field() 14 | text = scrapy.Field() 15 | lastUpdated = scrapy.Field() 16 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class WikispiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from wikiSpider.items import Article 3 | from string import whitespace 4 | 5 | class WikispiderPipeline(object): 6 | def process_item(self, article, spider): 7 | article['lastUpdated'] = article['lastUpdated'].replace('This page was last edited on', '') 8 | article['lastUpdated'] = article['lastUpdated'].strip() 9 | article['lastUpdated'] = datetime.strptime(article['lastUpdated'], '%d %B %Y, at %H:%M.') 10 | article['text'] = [line for line in article['text'] if line not in whitespace] 11 | article['text'] = ''.join(article['text']) 12 | return article 13 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for wikiSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'wikiSpider' 13 | 14 | SPIDER_MODULES = ['wikiSpider.spiders'] 15 | NEWSPIDER_MODULE = 'wikiSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'wikiSpider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'wikiSpider.middlewares.WikispiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'wikiSpider.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # 
Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'wikiSpider.pipelines.WikispiderPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/__pycache__/article.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/__pycache__/article.cpython-39.pyc -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/__pycache__/articleItems.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/__pycache__/articleItems.cpython-39.pyc -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/__pycache__/articlePipelines.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/__pycache__/articlePipelines.cpython-39.pyc -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/__pycache__/articlesMoreRules.cpython-39.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/__pycache__/articlesMoreRules.cpython-39.pyc -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/article.py: -------------------------------------------------------------------------------- 1 | from scrapy import Spider, Request 2 | 3 | class ArticleSpider(Spider): 4 | name='article' 5 | 6 | def start_requests(self): 7 | urls = [ 8 | "http://en.wikipedia.org/wiki/Python_%28programming_language%29", 9 | "https://en.wikipedia.org/wiki/Functional_programming", 10 | "https://en.wikipedia.org/wiki/Monty_Python"] 11 | return [Request(url=url, callback=self.parse) for url in urls] 12 | 13 | def parse(self, response): 14 | url = response.url 15 | title = response.css('h1::text').extract_first() 16 | print('URL is: {}'.format(url)) 17 | print('Title is: {}'.format(title)) 18 | 19 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/articleItems.py: -------------------------------------------------------------------------------- 1 | from scrapy.linkextractors import LinkExtractor 2 | from scrapy.spiders import CrawlSpider, Rule 3 | from wikiSpider.items import Article 4 | 5 | class ArticleSpider(CrawlSpider): 6 | name = 'articleItems' 7 | allowed_domains = ['wikipedia.org'] 8 | start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life'] 9 | rules = [ 10 | Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'), callback='parse_items', follow=True), 11 | ] 12 | 13 | def parse_items(self, response): 14 | article = Article() 15 | article['url'] = response.url 16 | article['title'] = response.css('h1::text').extract_first() 17 | article['text'] = response.xpath('//div[@id="mw-content-text"]//text()').extract() 18 | lastUpdated = response.css('li#footer-info-lastmod::text').extract_first() 19 | article['lastUpdated'] = lastUpdated.replace('This page was last edited on ', '') 20 | return article -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/articlePipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.linkextractors import LinkExtractor 2 | from scrapy.spiders import CrawlSpider, Rule 3 | from wikiSpider.items import Article 4 | 5 | class ArticleSpider(CrawlSpider): 6 | name = 'articlePipelines' 7 | allowed_domains = ['wikipedia.org'] 8 | start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life'] 9 | rules = [ 10 | Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'), callback='parse_items', follow=True), 11 | ] 12 | 13 | def parse_items(self, response): 14 | article = Article() 15 | article['url'] = response.url 16 | article['title'] = response.css('h1::text').extract_first() 17 | article['text'] = response.xpath('//div[@id="mw-content-text"]//text()').extract() 18 | article['lastUpdated'] = response.css('li#footer-info-lastmod::text').extract_first() 19 | return article -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/articles.py: -------------------------------------------------------------------------------- 1 | from scrapy.linkextractors import LinkExtractor 2 | from scrapy.spiders import 
CrawlSpider, Rule 3 | 4 | class ArticleSpider(CrawlSpider): 5 | name = 'articles' 6 | allowed_domains = ['wikipedia.org'] 7 | start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life'] 8 | rules = [Rule(LinkExtractor(allow=r'.*'), callback='parse_items', follow=True)] 9 | 10 | def parse_items(self, response): 11 | url = response.url 12 | title = response.css('h1::text').extract_first() 13 | text = response.xpath('//div[@id="mw-content-text"]//text()').extract() 14 | lastUpdated = response.css('li#footer-info-lastmod::text').extract_first() 15 | lastUpdated = lastUpdated.replace('This page was last edited on ', '') 16 | print('URL is: {}'.format(url)) 17 | print('title is: {} '.format(title)) 18 | print('text is: {}'.format(text)) 19 | print('Last updated: {}'.format(lastUpdated)) 20 | -------------------------------------------------------------------------------- /Chapter08_Scrapy/wikiSpider/wikiSpider/spiders/articlesMoreRules.py: -------------------------------------------------------------------------------- 1 | from scrapy.linkextractors import LinkExtractor 2 | from scrapy.spiders import CrawlSpider, Rule 3 | 4 | class ArticleSpider(CrawlSpider): 5 | # Renamed from 'articles', which collided with the spider in articles.py 6 | name = 'articlesMoreRules' 7 | allowed_domains = ['wikipedia.org'] 8 | start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life'] 9 | rules = [ 10 | Rule(LinkExtractor(allow='^(/wiki/)((?!:).)*$'), callback='parse_items', follow=True, cb_kwargs={'is_article': True}), 11 | Rule(LinkExtractor(allow='.*'), callback='parse_items', cb_kwargs={'is_article': False}) 12 | ] 13 | 14 | def parse_items(self, response, is_article): 15 | print(response.url) 16 | title = response.css('h1::text').extract_first() 17 | if is_article: 18 | url = response.url 19 | text = response.xpath('//div[@id="mw-content-text"]//text()').extract() 20 | lastUpdated = response.css('li#footer-info-lastmod::text').extract_first() 21 | lastUpdated = lastUpdated.replace('This page was last edited on ', '') 22 | print('Title is: {} '.format(title)) 23 | print('URL is: {}'.format(url)) 24 | print('text is: {}'.format(text)) 25 | else: 26 | print('This is not an article: {}'.format(title)) 27 | 28 | -------------------------------------------------------------------------------- /Chapter13_CrawlingThroughFormsAndLogins.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 79, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Hello there, Ryan Mitchell!\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import requests\n", 18 | "\n", 19 | "params = {'firstname': 'Ryan', 'lastname': 'Mitchell'}\n", 20 | "r = requests.post(\n", 21 | " 'https://pythonscraping.com/pages/files/processing.php',\n", 22 | " params)\n", 23 | "print(r.text)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 80, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "{\"status\":\"0\",\"errmsg\":\"You have already subscribed.\"}\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "import requests\n", 41 | "params = {\n", 42 | " 'firstname': 'Ryan',\n", 43 | " 'lastname': 'Mitchell',\n", 44 | " 'email': 'ryanemitchell@gmail.com',\n", 45 | " 'action': 'eclg_add_newsletter'\n", 46 | "}\n", 47 | "r = requests.post('https://pythonscraping.com/wp-admin/admin-ajax.php',\n", 48 | " params)\n", 49 | "print(r.text)" 50 | ] 51 | }, 52 | {
53 | "cell_type": "code", 54 | "execution_count": 83, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "Sorry, there was an error uploading your file.\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "import requests\n", 67 | "\n", 68 | "files = {'uploadFile': open('logo.jpg', 'rb')}\n", 69 | "r = requests.post('http://pythonscraping.com/pages/processing2.php', files=files)\n", 70 | "print(r.text)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 86, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "\n", 83 | "
<h2>Welcome to the Website!</h2>
\n", 84 | "You have logged in successfully!
Check out your profile!\n", 85 | "Cookie is set to:\n", 86 | "{'loggedin': '1', 'username': 'Ryan'}\n", 87 | "Going to profile page...\n", 88 | "Hey Ryan! Looks like you're still logged into the site!\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "import requests\n", 94 | "\n", 95 | "params = {'username': 'Ryan', 'password': 'password'}\n", 96 | "r = requests.post(\n", 97 | " 'https://pythonscraping.com/pages/cookies/welcome.php',\n", 98 | " params)\n", 99 | "print(r.text)\n", 100 | "\n", 101 | "print('Cookie is set to:')\n", 102 | "print(r.cookies.get_dict())\n", 103 | "print('Going to profile page...')\n", 104 | "r = requests.get('https://pythonscraping.com/pages/cookies/profile.php', \n", 105 | " cookies=r.cookies)\n", 106 | "print(r.text)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 85, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Cookie is set to:\n", 119 | "{'loggedin': '1', 'username': 'Ryan'}\n", 120 | "Going to profile page...\n", 121 | "Hey Ryan! Looks like you're still logged into the site!\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "import requests\n", 127 | "\n", 128 | "session = requests.Session()\n", 129 | "\n", 130 | "params = {'username': 'Ryan', 'password': 'password'}\n", 131 | "s = session.post('https://pythonscraping.com/pages/cookies/welcome.php', params)\n", 132 | "print('Cookie is set to:')\n", 133 | "print(s.cookies.get_dict())\n", 134 | "print('Going to profile page...')\n", 135 | "s = session.get('https://pythonscraping.com/pages/cookies/profile.php')\n", 136 | "print(s.text)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 88, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "
<p>Hello ryan.</p><p>You entered password as your password.</p>
\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "import requests\n", 154 | "from requests.auth import AuthBase\n", 155 | "from requests.auth import HTTPBasicAuth\n", 156 | "\n", 157 | "auth = HTTPBasicAuth('ryan', 'password')\n", 158 | "r = requests.post(\n", 159 | " url='https://pythonscraping.com/pages/auth/login.php', auth=auth)\n", 160 | "print(r.text)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [] 169 | } 170 | ], 171 | "metadata": { 172 | "kernelspec": { 173 | "display_name": "Python 3 (ipykernel)", 174 | "language": "python", 175 | "name": "python3" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.9.12" 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 2 192 | } 193 | -------------------------------------------------------------------------------- /Chapter14_JavaScript.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from selenium import webdriver\n", 10 | "from selenium.webdriver.chrome.service import Service\n", 11 | "from webdriver_manager.chrome import ChromeDriverManager\n", 12 | "\n", 13 | "CHROMEDRIVER_PATH = ChromeDriverManager().install()\n", 14 | "driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))\n", 15 | "driver.get(\"http://www.python.org\")\n", 16 | "time.sleep(2)\n", 17 | "driver.close()" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 15, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "CHROMEDRIVER_PATH = ChromeDriverManager().install()" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 16, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "Here is some important text you want to retrieve!\n", 39 | "A button to click!\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "from selenium import webdriver\n", 45 | "from selenium.webdriver.common.by import By\n", 46 | "from selenium.webdriver.chrome.options import Options\n", 47 | "import time\n", 48 | "\n", 49 | "chrome_options = Options()\n", 50 | "chrome_options.add_argument(\"--headless\")\n", 51 | "driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=chrome_options)\n", 52 | "driver.get('http://pythonscraping.com/pages/javascript/ajaxDemo.html')\n", 53 | "time.sleep(3)\n", 54 | "print(driver.find_element(By.CSS_SELECTOR, '#content').text)\n", 55 | "driver.close()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 18, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "Here is some important text you want to retrieve!\n", 68 | "A button to click!\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "from selenium import webdriver\n", 74 | "from selenium.webdriver.common.by import By\n", 75 | "from selenium.webdriver.support.ui import WebDriverWait\n", 76 | "from selenium.webdriver.support import expected_conditions as EC\n", 77 | "\n", 78 | "chrome_options = Options()\n", 79 | "chrome_options.add_argument(\"--headless\")\n", 80 | "driver = 
webdriver.Chrome(\n", 81 | " service=Service(CHROMEDRIVER_PATH),\n", 82 | " options=chrome_options)\n", 83 | "\n", 84 | "driver.get('http://pythonscraping.com/pages/javascript/ajaxDemo.html')\n", 85 | "try:\n", 86 | " element = WebDriverWait(driver, 10).until(\n", 87 | " EC.presence_of_element_located((By.ID, 'loadedButton')))\n", 88 | "finally:\n", 89 | " print(driver.find_element(By.ID, 'content').text)\n", 90 | " driver.close()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 35, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "Timing out after 10 seconds and returning\n", 103 | "\n", 104 | "The Destination Page!\n", 105 | "\n", 106 | "\n", 107 | "\n", 108 | "This is the page you are looking for!\n", 109 | "\n", 110 | "\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "from selenium import webdriver\n", 116 | "from selenium.webdriver.chrome.options import Options\n", 117 | "from selenium.common.exceptions import StaleElementReferenceException\n", 118 | "import time\n", 119 | "\n", 120 | "def waitForLoad(driver):\n", 121 | " elem = driver.find_element(By.TAG_NAME, \"html\")\n", 122 | " count = 0\n", 123 | " for _ in range(0, 20):\n", 124 | " try:\n", 125 | " elem == driver.find_element(By.TAG_NAME, \"html\")\n", 126 | " except StaleElementReferenceException:\n", 127 | " return\n", 128 | " time.sleep(0.5)\n", 129 | " print(\"Timing out after 10 seconds and returning\")\n", 130 | " \n", 131 | "chrome_options = Options()\n", 132 | "chrome_options.add_argument(\"--headless\")\n", 133 | "driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=chrome_options)\n", 134 | "driver.get(\"http://pythonscraping.com/pages/javascript/redirectDemo1.html\")\n", 135 | "waitForLoad(driver)\n", 136 | "print(driver.page_source)\n", 137 | "driver.close()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 9, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "This is the page you are looking for!\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "from selenium.webdriver.common.by import By\n", 155 | "from selenium.webdriver.support.ui import WebDriverWait\n", 156 | "from selenium.webdriver.chrome.options import Options\n", 157 | "from selenium.webdriver.support import expected_conditions as EC\n", 158 | "from selenium.common.exceptions import TimeoutException\n", 159 | "\n", 160 | "chrome_options = Options()\n", 161 | "chrome_options.add_argument(\"--headless\")\n", 162 | "driver = webdriver.Chrome(\n", 163 | " executable_path='drivers/chromedriver', \n", 164 | " options=chrome_options)\n", 165 | "driver.get('http://pythonscraping.com/pages/javascript/redirectDemo1.html')\n", 166 | "try:\n", 167 | " bodyElement = WebDriverWait(driver, 15).until(EC.presence_of_element_located(\n", 168 | " (By.XPATH, '//body[contains(text(), \"This is the page you are looking for!\")]')))\n", 169 | " print(bodyElement.text)\n", 170 | "except TimeoutException:\n", 171 | " print('Did not find the element')" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "from webdriver_manager.firefox import GeckoDriverManager\n", 181 | "print(GeckoDriverManager().install())" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 31, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stderr", 191 
| "output_type": "stream", 192 | "text": [ 193 | "[WDM] - Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 10.2M/10.2M [00:01<00:00, 7.44MB/s]\n" 194 | ] 195 | }, 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "'/Users/RSpecht/.wdm/drivers/edgedriver/mac64/113.0.1774.57/msedgedriver'" 200 | ] 201 | }, 202 | "execution_count": 31, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "from webdriver_manager.microsoft import EdgeChromiumDriverManager\n", 209 | "print(EdgeChromiumDriverManager().install())" 210 | ] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3 (ipykernel)", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.9.12" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 2 234 | } 235 | -------------------------------------------------------------------------------- /Chapter16_ImageProcessingFiles/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter16_ImageProcessingFiles/.DS_Store -------------------------------------------------------------------------------- /Chapter16_ImageProcessingFiles/kitten.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter16_ImageProcessingFiles/kitten.jpg -------------------------------------------------------------------------------- /Chapter16_ImageProcessingFiles/kitten_blurred.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter16_ImageProcessingFiles/kitten_blurred.jpg -------------------------------------------------------------------------------- /Chapter16_ImageProcessingFiles/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter16_ImageProcessingFiles/test.png -------------------------------------------------------------------------------- /Chapter16_ImageProcessingFiles/text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter16_ImageProcessingFiles/text.png -------------------------------------------------------------------------------- /Chapter16_ImageProcessingFiles/text_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter16_ImageProcessingFiles/text_2.png -------------------------------------------------------------------------------- /Chapter16_ImageProcessingFiles/text_bad.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter16_ImageProcessingFiles/text_bad.png
--------------------------------------------------------------------------------
/Chapter16_ImageProcessingFiles/text_cleaned.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/Chapter16_ImageProcessingFiles/text_cleaned.png
--------------------------------------------------------------------------------
/Chapter16_ImageProcessingFiles/textoutput.txt:
--------------------------------------------------------------------------------
1 | This is some text, written in Arial, that \
2 | Tesseract. Here are some symbols:
3 |
--------------------------------------------------------------------------------
/Chapter16_Parallel/multiprocess.py:
--------------------------------------------------------------------------------
1 | from urllib.request import urlopen
2 | from bs4 import BeautifulSoup
3 | import re
4 | import random
5 | 
6 | from multiprocessing import Process, Queue
7 | import os
8 | import time
9 | 
10 | def getLinks(bsObj, visited):
11 |     print('Getting links in {}'.format(os.getpid()))
12 |     links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
13 |     return [link for link in links if link.attrs['href'] not in visited]
14 | 
15 | def scrape_article(path, queue):
16 |     # The queue carries a single shared list of visited paths;
17 |     # take it, record this path, and put it back for the other process
18 |     visited = queue.get()
19 |     visited.append(path)
20 |     queue.put(visited)
21 |     print("Process {} list is now: {}".format(os.getpid(), visited))
22 |     html = urlopen('http://en.wikipedia.org{}'.format(path))
23 |     time.sleep(5)
24 |     bsObj = BeautifulSoup(html, 'html.parser')
25 |     title = bsObj.find('h1').get_text()
26 |     print('Scraping {} in process {}'.format(title, os.getpid()))
27 |     links = getLinks(bsObj, visited)
28 |     if len(links) > 0:
29 |         newArticle = links[random.randint(0, len(links)-1)].attrs['href']
30 |         print(newArticle)
31 |         scrape_article(newArticle, queue)
32 | 
33 | processes = []
34 | queue = Queue()
35 | # Seed the queue with an empty visited list before the processes start
36 | queue.put([])
37 | processes.append(Process(target=scrape_article, args=('/wiki/Kevin_Bacon', queue,)))
38 | processes.append(Process(target=scrape_article, args=('/wiki/Monty_Python', queue,)))
39 | 
40 | for p in processes:
41 |     p.start()
42 | 
--------------------------------------------------------------------------------
/Chapter16_Parallel/multiprocess_example.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Process
2 | import time
3 | 
4 | def print_time(threadName, delay, iterations):
5 |     start = int(time.time())
6 |     for i in range(0, iterations):
7 |         time.sleep(delay)
8 |         seconds_elapsed = str(int(time.time()) - start)
9 |         print(threadName if threadName else seconds_elapsed)
10 | 
11 | 
12 | processes = []
13 | processes.append(Process(target=print_time, args=(None, 1, 100)))
14 | processes.append(Process(target=print_time, args=("Fizz", 3, 33)))
15 | processes.append(Process(target=print_time, args=("Buzz", 5, 20)))
16 | 
17 | for p in processes:
18 |     p.start()
19 | 
20 | for p in processes:
21 |     p.join()
22 | 
23 | print("Program complete")
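24 | 
25 | # Editor's sketch (not in the original file): the same fan-out/join pattern
26 | # can also be written with a process pool, which does the start/join
27 | # bookkeeping itself:
28 | #
29 | #     from multiprocessing import Pool
30 | #
31 | #     with Pool(3) as pool:
32 | #         pool.starmap(print_time, [(None, 1, 100), ('Fizz', 3, 33), ('Buzz', 5, 20)])
--------------------------------------------------------------------------------
/Chapter16_Parallel/multiprocess_queue.py:
--------------------------------------------------------------------------------
1 | 
2 | from urllib.request import urlopen
3 | from bs4 import BeautifulSoup
4 | import re
5 | import random
6 | from multiprocessing import Process, Queue
7 |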
import os
8 | import time
9 | 
10 | 
11 | def task_delegator(taskQueue, foundUrlsQueue):
12 |     #Initialize with a task for each process
13 |     visited = ['/wiki/Kevin_Bacon', '/wiki/Monty_Python']
14 |     taskQueue.put('/wiki/Kevin_Bacon')
15 |     taskQueue.put('/wiki/Monty_Python')
16 | 
17 |     while 1:
18 |         #Check to see if there are new links in the foundUrlsQueue for processing
19 |         if not foundUrlsQueue.empty():
20 |             links = [link for link in foundUrlsQueue.get() if link not in visited]
21 |             for link in links:
22 |                 #Add new link to the taskQueue
23 |                 taskQueue.put(link)
24 |                 #Add new link to the visited list
25 |                 visited.append(link)
26 | 
27 | def get_links(bsObj):
28 |     links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
29 |     return [link.attrs['href'] for link in links]
30 | 
31 | def scrape_article(taskQueue, foundUrlsQueue):
32 |     while 1:
33 |         while taskQueue.empty():
34 |             #Sleep 100 ms while waiting for the task queue
35 |             #This should be rare
36 |             time.sleep(.1)
37 |         path = taskQueue.get()
38 |         html = urlopen('http://en.wikipedia.org{}'.format(path))
39 |         time.sleep(5)
40 |         bsObj = BeautifulSoup(html, 'html.parser')
41 |         title = bsObj.find('h1').get_text()
42 |         print('Scraping {} in process {}'.format(title, os.getpid()))
43 |         links = get_links(bsObj)
44 |         #Send these to the delegator for processing
45 |         foundUrlsQueue.put(links)
46 | 
47 | 
48 | processes = []
49 | taskQueue = Queue()
50 | foundUrlsQueue = Queue()
51 | processes.append(Process(target=task_delegator, args=(taskQueue, foundUrlsQueue,)))
52 | processes.append(Process(target=scrape_article, args=(taskQueue, foundUrlsQueue,)))
53 | processes.append(Process(target=scrape_article, args=(taskQueue, foundUrlsQueue,)))
54 | 
55 | for p in processes:
56 |     p.start()
57 | 
--------------------------------------------------------------------------------
/Chapter16_Parallel/multithreaded.py:
--------------------------------------------------------------------------------
1 | from urllib.request import urlopen
2 | from bs4 import BeautifulSoup
3 | import re
4 | import random
5 | 
6 | import _thread
7 | import time
8 | 
9 | visited = []
10 | def getLinks(thread_name, bsObj):
11 |     print('Getting links in {}'.format(thread_name))
12 |     links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
13 |     # Compare hrefs (not Tag objects) against the visited paths
14 |     return [link for link in links if link.attrs['href'] not in visited]
15 | 
16 | def scrape_article(thread_name, path):
17 |     visited.append(path)
18 |     html = urlopen('http://en.wikipedia.org{}'.format(path))
19 |     time.sleep(5)
20 |     bsObj = BeautifulSoup(html, 'html.parser')
21 |     title = bsObj.find('h1').get_text()
22 |     print('Scraping {} in thread {}'.format(title, thread_name))
23 |     links = getLinks(thread_name, bsObj)
24 |     if len(links) > 0:
25 |         newArticle = links[random.randint(0, len(links)-1)].attrs['href']
26 |         print(newArticle)
27 |         scrape_article(thread_name, newArticle)
28 | 
29 | 
30 | try:
31 |     _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon',))
32 |     _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python',))
33 | except:
34 |     print('Error: unable to start threads')
35 | 
36 | while 1:
37 |     pass
--------------------------------------------------------------------------------
/Chapter16_Parallel/multithreaded_class.py:
--------------------------------------------------------------------------------
1 | from urllib.request import urlopen
2 | from bs4 import BeautifulSoup
3 | import re
4 | import random
5 | 
6 | import _thread
7 |
import time
8 | 
9 | visited = []
10 | def getLinks(thread_name, bsObj):
11 |     print('Getting links in {}'.format(thread_name))
12 |     links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
13 |     # Compare hrefs (not Tag objects) against the visited paths
14 |     return [link for link in links if link.attrs['href'] not in visited]
15 | 
16 | # Define a function for the thread
17 | def scrape_article(thread_name, path):
18 |     visited.append(path)
19 |     html = urlopen('http://en.wikipedia.org{}'.format(path))
20 |     time.sleep(5)
21 |     bsObj = BeautifulSoup(html, 'html.parser')
22 |     title = bsObj.find('h1').get_text()
23 |     print('Scraping {} in thread {}'.format(title, thread_name))
24 |     links = getLinks(thread_name, bsObj)
25 |     if len(links) > 0:
26 |         newArticle = links[random.randint(0, len(links)-1)].attrs['href']
27 |         print(newArticle)
28 |         scrape_article(thread_name, newArticle)
29 | 
30 | 
31 | # Create two threads as follows
32 | try:
33 |     _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon',))
34 |     _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python',))
35 | except:
36 |     print('Error: unable to start threads')
37 | 
38 | while 1:
39 |     pass
--------------------------------------------------------------------------------
/Chapter16_Parallel/multithreaded_example.py:
--------------------------------------------------------------------------------
1 | import _thread
2 | import time
3 | 
4 | def print_time(threadName, delay, iterations):
5 |     start = int(time.time())
6 |     for i in range(0, iterations):
7 |         time.sleep(delay)
8 |         seconds_elapsed = str(int(time.time()) - start)
9 |         print(threadName if threadName else seconds_elapsed)
10 | 
11 | try:
12 |     _thread.start_new_thread(print_time, (None, 1, 100))
13 |     _thread.start_new_thread(print_time, ("Fizz", 3, 33))
14 |     _thread.start_new_thread(print_time, ("Buzz", 5, 20))
15 | except:
16 |     print("Error: unable to start thread")
17 | 
18 | while 1:
19 |     pass
--------------------------------------------------------------------------------
/Chapter16_Parallel/multithreaded_queue.py:
--------------------------------------------------------------------------------
1 | from urllib.request import urlopen
2 | from bs4 import BeautifulSoup
3 | import re
4 | import random
5 | import _thread
6 | from queue import Queue
7 | import time
8 | import pymysql
9 | 
10 | 
11 | def storage(queue):
12 |     conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='', db='mysql', charset='utf8')
13 |     cur = conn.cursor()
14 |     cur.execute('USE wiki_threads')
15 |     while 1:
16 |         if not queue.empty():
17 |             article = queue.get()
18 |             cur.execute('SELECT * FROM pages WHERE path = %s', (article["path"],))
19 |             if cur.rowcount == 0:
20 |                 print("Storing article {}".format(article["title"]))
21 |                 cur.execute('INSERT INTO pages (title, path) VALUES (%s, %s)', (article["title"], article["path"]))
22 |                 conn.commit()
23 |             else:
24 |                 print("Article already exists: {}".format(article['title']))
25 | 
26 | visited = []
27 | def getLinks(thread_name, bsObj):
28 |     print('Getting links in {}'.format(thread_name))
29 |     links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
30 |     # Compare hrefs (not Tag objects) against the visited paths
31 |     return [link for link in links if link.attrs['href'] not in visited]
32 | 
33 | def scrape_article(thread_name, path, queue):
34 |     visited.append(path)
35 |     html = urlopen('http://en.wikipedia.org{}'.format(path))
36 |     time.sleep(5)
37 |     bsObj = BeautifulSoup(html, 'html.parser')
38 |     title = bsObj.find('h1').get_text()
39 |     print('Added {} for storage in thread
{}'.format(title, thread_name))
40 |     queue.put({"title":title, "path":path})
41 |     links = getLinks(thread_name, bsObj)
42 |     if len(links) > 0:
43 |         newArticle = links[random.randint(0, len(links)-1)].attrs['href']
44 |         scrape_article(thread_name, newArticle, queue)
45 | 
46 | queue = Queue()
47 | try:
48 |     _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon', queue,))
49 |     _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python', queue,))
50 |     _thread.start_new_thread(storage, (queue,))
51 | except:
52 |     print('Error: unable to start threads')
53 | 
54 | while 1:
55 |     pass
--------------------------------------------------------------------------------
/Chapter16_Parallel/threading_crawler.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import time
3 | 
4 | class Crawler(threading.Thread):
5 |     def __init__(self):
6 |         threading.Thread.__init__(self)
7 |         self.done = False
8 | 
9 |     def isDone(self):
10 |         return self.done
11 | 
12 |     def run(self):
13 |         time.sleep(5)
14 |         self.done = True
15 |         raise Exception('Something bad happened!')
16 | 
17 | t = Crawler()
18 | t.start()
19 | 
20 | while True:
21 |     time.sleep(1)
22 |     if t.isDone():
23 |         print('Done')
24 |         break
25 |     # Restart the crawler if its thread has died (Thread.isAlive was removed in Python 3.9)
26 |     if not t.is_alive():
27 |         t = Crawler()
28 |         t.start()
29 | 
30 | 
--------------------------------------------------------------------------------
/Chapter16_Parallel/threading_example.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import time
3 | 
4 | def print_time(threadName, delay, iterations):
5 |     start = int(time.time())
6 |     for i in range(0, iterations):
7 |         time.sleep(delay)
8 |         seconds_elapsed = str(int(time.time()) - start)
9 |         print('{} {}'.format(seconds_elapsed, threadName))
10 | 
11 | threading.Thread(target=print_time, args=('Fizz', 3, 33)).start()
12 | threading.Thread(target=print_time, args=('Buzz', 5, 20)).start()
13 | threading.Thread(target=print_time, args=('Counter', 1, 100)).start()
14 | 
15 | 
--------------------------------------------------------------------------------
/Chapter17_ScrapingTraps.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "\n",
13 | "\n",
14 | "ACCEPT\n",
15 | "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7\n",
16 | "\n",
17 | "\n",
18 | "ACCEPT-ENCODING\n",
19 | "gzip, deflate, br\n",
20 | "\n",
21 | "\n",
22 | "CONNECTION\n",
23 | "keep-alive\n",
24 | "\n",
25 | "\n",
26 | "CONTENT-LENGTH\n",
27 | "\n",
28 | "\n",
29 | "\n",
30 | "CONTENT-TYPE\n",
31 | "\n",
32 | "\n",
33 | "\n",
34 | "HOST\n",
35 | "www.whatismybrowser.com\n",
36 | "\n",
37 | "\n",
38 | "USER-AGENT\n",
39 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36\n",
40 | "\n",
41 | ">\n"
42 | ]
43 | }
44 | ],
45 | "source": [
46 | "import requests\n",
47 | "from bs4 import BeautifulSoup\n",
48 | "\n",
49 | "session = requests.Session()\n",
50 | "headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',\n",
51 |
"           'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'}\n",
52 | "url = 'https://www.whatismybrowser.com/'\\\n",
53 | "    'developers/what-http-headers-is-my-browser-sending'\n",
54 | "req = session.get(url, headers=headers)\n",
55 | "\n",
56 | "bs = BeautifulSoup(req.text, 'html.parser')\n",
57 | "print(bs.find('table', {'class':'table-striped'}).get_text())"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 4,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "from webdriver_manager.chrome import ChromeDriverManager\n",
67 | "CHROMEDRIVER_PATH = ChromeDriverManager().install()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 7,
73 | "metadata": {},
74 | "outputs": [
75 | {
76 | "name": "stdout",
77 | "output_type": "stream",
78 | "text": [
79 | "[{'domain': '.pythonscraping.com', 'expiry': 1722996491, 'httpOnly': False, 'name': '_ga', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'GA1.1.285394841.1688436491'}, {'domain': '.pythonscraping.com', 'expiry': 1722996491, 'httpOnly': False, 'name': '_ga_G60J5CGY1N', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'GS1.1.1688436491.1.0.1688436491.0.0.0'}]\n"
80 | ]
81 | }
82 | ],
83 | "source": [
84 | "from selenium import webdriver\n",
85 | "from selenium.webdriver.chrome.options import Options\n",
86 | "from selenium.webdriver.chrome.service import Service\n",
87 | "chrome_options = Options()\n",
88 | "chrome_options.add_argument(\"--headless\")\n",
89 | "driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=chrome_options)\n",
90 | "\n",
91 | "driver.get('http://pythonscraping.com')\n",
92 | "driver.implicitly_wait(1)\n",
93 | "print(driver.get_cookies())"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 9,
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "name": "stdout",
103 | "output_type": "stream",
104 | "text": [
105 | "[{'domain': '.pythonscraping.com', 'expiry': 1722997590, 'httpOnly': False, 'name': '_ga', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'GA1.1.678086850.1688437590'}, {'domain': '.pythonscraping.com', 'expiry': 1722997590, 'httpOnly': False, 'name': '_ga_G60J5CGY1N', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'GS1.1.1688437589.1.0.1688437589.0.0.0'}]\n",
106 | "[{'domain': '.pythonscraping.com', 'expiry': 1722997597, 'httpOnly': False, 'name': '_ga_G60J5CGY1N', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'GS1.1.1688437589.1.1.1688437597.0.0.0'}, {'domain': '.pythonscraping.com', 'expiry': 1722997597, 'httpOnly': False, 'name': '_ga', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'GA1.1.678086850.1688437590'}]\n"
107 | ]
108 | }
109 | ],
110 | "source": [
111 | "from selenium import webdriver\n",
112 | "from selenium.webdriver.chrome.options import Options\n",
113 | "from selenium.webdriver.chrome.service import Service\n",
114 | "chrome_options = Options()\n",
115 | "chrome_options.add_argument(\"--headless\")\n",
116 | "driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=chrome_options)\n",
117 | "\n",
118 | "driver.get('http://pythonscraping.com')\n",
119 | "driver.implicitly_wait(1)\n",
120 | "\n",
121 | "savedCookies = driver.get_cookies()\n",
122 | "print(savedCookies)\n",
123 | "\n",
124 | "driver2 = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=chrome_options)\n",
125 | "\n",
126 | "driver2.get('http://pythonscraping.com')\n",
127 | "driver2.delete_all_cookies()\n",
128 | "for cookie in savedCookies:\n",
129 | "    driver2.add_cookie(cookie)\n",
130 | "\n",
131 | "driver2.get('http://pythonscraping.com')\n",
132 | "driver2.implicitly_wait(1)\n",
133 | "print(driver2.get_cookies())"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 15,
139 | "metadata": {},
140 | "outputs": [
141 | {
142 | "name": "stdout",
143 | "output_type": "stream",
144 | "text": [
145 | "The link http://pythonscraping.com/dontgohere is a trap\n",
146 | "Do not change value of phone\n",
147 | "Do not change value of email\n"
148 | ]
149 | }
150 | ],
151 | "source": [
152 | "from selenium import webdriver\n",
153 | "from selenium.webdriver.chrome.service import Service\n",
154 | "from selenium.webdriver.chrome.options import Options\n",
155 | "from selenium.webdriver.common.by import By\n",
156 | "\n",
157 | "driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))\n",
158 | "\n",
159 | "driver.get('http://pythonscraping.com/pages/itsatrap.html')\n",
160 | "links = driver.find_elements(By.TAG_NAME, 'a')\n",
161 | "for link in links:\n",
162 | "    if not link.is_displayed():\n",
163 | "        print(f'The link {link.get_attribute(\"href\")} is a trap')\n",
164 | "\n",
165 | "fields = driver.find_elements(By.TAG_NAME, 'input')\n",
166 | "for field in fields:\n",
167 | "    if not field.is_displayed():\n",
168 | "        print(f'Do not change value of {field.get_attribute(\"name\")}')"
169 | ]
170 | },
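171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "Editor's sketch (not part of the original notebook): the cookies Selenium collected above can also be copied into a plain `requests.Session`, so that non-browser requests present the same session state. This assumes a `driver` from one of the cells above is still open."
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "import requests\n",
185 | "\n",
186 | "# Copy each Selenium cookie into the requests cookie jar\n",
187 | "session = requests.Session()\n",
188 | "for cookie in driver.get_cookies():\n",
189 | "    session.cookies.set(cookie['name'], cookie['value'], domain=cookie['domain'])\n",
190 | "print(session.get('http://pythonscraping.com').status_code)"
191 | ]
192 | },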
driver2.add_cookie(cookie)\n", 130 | "\n", 131 | "driver2.get('http://pythonscraping.com')\n", 132 | "driver.implicitly_wait(1)\n", 133 | "print(driver2.get_cookies())" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 15, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "The link http://pythonscraping.com/dontgohere is a trap\n", 146 | "Do not change value of phone\n", 147 | "Do not change value of email\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "from selenium import webdriver\n", 153 | "from selenium.webdriver.remote.webelement import WebElement\n", 154 | "from selenium.webdriver.chrome.options import Options\n", 155 | "from selenium.webdriver.common.by import By\n", 156 | "\n", 157 | "driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))\n", 158 | "\n", 159 | "driver.get('http://pythonscraping.com/pages/itsatrap.html')\n", 160 | "links = driver.find_elements(By.TAG_NAME, 'a')\n", 161 | "for link in links:\n", 162 | " if not link.is_displayed():\n", 163 | " print(f'The link {link.get_attribute(\"href\")} is a trap')\n", 164 | "\n", 165 | "fields = driver.find_elements(By.TAG_NAME, 'input')\n", 166 | "for field in fields:\n", 167 | " if not field.is_displayed():\n", 168 | " print(f'Do not change value of {field.get_attribute(\"name\")}')" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3 (ipykernel)", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.9.12" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 2 200 | } 201 | -------------------------------------------------------------------------------- /Chapter18_Testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | ".\n", 13 | "----------------------------------------------------------------------\n", 14 | "Ran 1 test in 0.001s\n", 15 | "\n", 16 | "OK\n" 17 | ] 18 | }, 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "Setting up the test\n", 24 | "Tearing down the test\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "import unittest\n", 30 | "\n", 31 | "class TestAddition(unittest.TestCase):\n", 32 | " def setUp(self):\n", 33 | " print('Setting up the test')\n", 34 | "\n", 35 | " def tearDown(self):\n", 36 | " print('Tearing down the test')\n", 37 | "\n", 38 | " def test_twoPlusTwo(self):\n", 39 | " total = 2+2\n", 40 | " self.assertEqual(4, total);\n", 41 | "\n", 42 | "if __name__ == '__main__':\n", 43 | " unittest.main(argv=[''], exit=False)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stderr", 53 | "output_type": "stream", 54 | "text": [ 55 | "." 
56 | ]
57 | },
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "Setting up the test\n",
63 | "Tearing down the test\n"
64 | ]
65 | },
66 | {
67 | "name": "stderr",
68 | "output_type": "stream",
69 | "text": [
70 | "..\n",
71 | "----------------------------------------------------------------------\n",
72 | "Ran 3 tests in 0.719s\n",
73 | "\n",
74 | "OK\n"
75 | ]
76 | },
77 | {
78 | "name": "stdout",
79 | "output_type": "stream",
80 | "text": [
81 | "Once deleted, variables cannot be recovered. Proceed (y/[n])? \n",
82 | "Nothing done.\n"
83 | ]
84 | }
85 | ],
86 | "source": [
87 | "from urllib.request import urlopen\n",
88 | "from bs4 import BeautifulSoup\n",
89 | "import unittest\n",
90 | "\n",
91 | "class TestWikipedia(unittest.TestCase):\n",
92 | "    @classmethod\n",
93 | "    def setUpClass(cls):\n",
94 | "        url = 'http://en.wikipedia.org/wiki/Monty_Python'\n",
95 | "        cls.bs = BeautifulSoup(urlopen(url), 'html.parser')\n",
96 | "\n",
97 | "    def test_titleText(self):\n",
98 | "        pageTitle = self.bs.find('h1').get_text()\n",
99 | "        self.assertEqual('Monty Python', pageTitle)\n",
100 | "\n",
101 | "    def test_contentExists(self):\n",
102 | "        content = self.bs.find('div',{'id':'mw-content-text'})\n",
103 | "        self.assertIsNotNone(content)\n",
104 | "\n",
105 | "\n",
106 | "if __name__ == '__main__':\n",
107 | "    unittest.main(argv=[''], exit=False)\n",
108 | "    %reset\n"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 5,
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "name": "stderr",
118 | "output_type": "stream",
119 | "text": [
120 | "."
121 | ]
122 | },
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "Setting up the test\n",
128 | "Tearing down the test\n"
129 | ]
130 | },
131 | {
132 | "name": "stderr",
133 | "output_type": "stream",
134 | "text": [
135 | ".\n",
136 | "----------------------------------------------------------------------\n",
137 | "Ran 2 tests in 7.159s\n",
138 | "\n",
139 | "OK\n"
140 | ]
141 | },
142 | {
143 | "name": "stdout",
144 | "output_type": "stream",
145 | "text": [
146 | "Done!\n",
147 | "Once deleted, variables cannot be recovered. Proceed (y/[n])?
y\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "from urllib.request import urlopen\n", 153 | "from bs4 import BeautifulSoup\n", 154 | "import unittest\n", 155 | "import re\n", 156 | "import random\n", 157 | "from urllib.parse import unquote\n", 158 | "\n", 159 | "class TestWikipedia(unittest.TestCase):\n", 160 | "\n", 161 | " def test_PageProperties(self):\n", 162 | " self.url = 'http://en.wikipedia.org/wiki/Monty_Python'\n", 163 | " #Test the first 10 pages we encounter\n", 164 | " for i in range(1, 10):\n", 165 | " self.bs = BeautifulSoup(urlopen(self.url), 'html.parser')\n", 166 | " titles = self.titleMatchesURL()\n", 167 | " self.assertEqual(titles[0], titles[1])\n", 168 | " self.assertTrue(self.contentExists())\n", 169 | " self.url = self.getNextLink()\n", 170 | " print('Done!')\n", 171 | "\n", 172 | " def titleMatchesURL(self):\n", 173 | " pageTitle = self.bs.find('h1').get_text()\n", 174 | " urlTitle = self.url[(self.url.index('/wiki/')+6):]\n", 175 | " urlTitle = urlTitle.replace('_', ' ')\n", 176 | " urlTitle = unquote(urlTitle)\n", 177 | " return [pageTitle.lower(), urlTitle.lower()]\n", 178 | "\n", 179 | " def contentExists(self):\n", 180 | " content = self.bs.find('div',{'id':'mw-content-text'})\n", 181 | " if content is not None:\n", 182 | " return True\n", 183 | " return False\n", 184 | "\n", 185 | " def getNextLink(self):\n", 186 | " # Returns random link on page, using technique from Chapter 3\n", 187 | " links = self.bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))\n", 188 | " randomLink = random.SystemRandom().choice(links)\n", 189 | " return f'https://wikipedia.org{randomLink.attrs[\"href\"]}'\n", 190 | " \n", 191 | "\n", 192 | "if __name__ == '__main__':\n", 193 | " unittest.main(argv=[''], exit=False)\n", 194 | " %reset" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 23, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "from webdriver_manager.chrome import ChromeDriverManager\n", 204 | "\n", 205 | "CHROMEDRIVER_PATH = ChromeDriverManager().install()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 19, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "Hello there, Ryan Mitchell!\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "from selenium import webdriver\n", 223 | "from selenium.webdriver.chrome.service import Service\n", 224 | "from selenium.webdriver.common.keys import Keys\n", 225 | "from selenium.webdriver import ActionChains\n", 226 | "from selenium.webdriver.chrome.options import Options\n", 227 | "from selenium.webdriver.common.by import By\n", 228 | "\n", 229 | "chrome_options = Options()\n", 230 | "chrome_options.add_argument(\"--headless\")\n", 231 | "\n", 232 | "driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=chrome_options)\n", 233 | "\n", 234 | "driver.get('http://pythonscraping.com/pages/files/form.html')\n", 235 | "\n", 236 | "firstnameField = driver.find_element(By.NAME, 'firstname')\n", 237 | "lastnameField = driver.find_element(By.NAME, 'lastname')\n", 238 | "submitButton = driver.find_element(By.ID, 'submit')\n", 239 | "\n", 240 | "### METHOD 1 ###\n", 241 | "firstnameField.send_keys('Ryan')\n", 242 | "lastnameField.send_keys('Mitchell')\n", 243 | "submitButton.click()\n", 244 | "################\n", 245 | "\n", 246 | "### METHOD 2 ###\n", 247 | "#actions = 
ActionChains(driver).click(firstnameField).send_keys('Ryan').click(lastnameField).send_keys('Mitchell').send_keys(Keys.RETURN)\n", 248 | "#actions.perform()\n", 249 | "################\n", 250 | "\n", 251 | "print(driver.find_element(By.TAG_NAME, 'body').text)\n", 252 | "\n", 253 | "driver.close()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 24, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "from webdriver_manager.chrome import ChromeDriverManager\n", 263 | "\n", 264 | "CHROMEDRIVER_PATH = ChromeDriverManager().install()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 20, 270 | "metadata": { 271 | "scrolled": true 272 | }, 273 | "outputs": [ 274 | { 275 | "name": "stderr", 276 | "output_type": "stream", 277 | "text": [ 278 | ".\n", 279 | "----------------------------------------------------------------------\n", 280 | "Ran 1 test in 3.137s\n", 281 | "\n", 282 | "OK\n" 283 | ] 284 | }, 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "Once deleted, variables cannot be recovered. Proceed (y/[n])? y\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "from selenium import webdriver\n", 295 | "from selenium.webdriver import ActionChains\n", 296 | "from selenium.webdriver.chrome.options import Options\n", 297 | "from selenium.webdriver.common.by import By\n", 298 | "from selenium.webdriver.chrome.service import Service\n", 299 | "import unittest\n", 300 | "\n", 301 | "\n", 302 | "class TestDragAndDrop(unittest.TestCase):\n", 303 | " driver = None\n", 304 | "\n", 305 | " def setUp(self):\n", 306 | " chrome_options = Options()\n", 307 | " chrome_options.add_argument(\"--headless\")\n", 308 | " self.driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=chrome_options)\n", 309 | " url = 'http://pythonscraping.com/pages/javascript/draggableDemo.html'\n", 310 | " self.driver.get(url)\n", 311 | "\n", 312 | " def tearDown(self):\n", 313 | " self.driver.close()\n", 314 | "\n", 315 | " def test_drag(self):\n", 316 | " element = self.driver.find_element(By.ID, 'draggable')\n", 317 | " target = self.driver.find_element(By.ID, 'div2')\n", 318 | " actions = ActionChains(self.driver)\n", 319 | " actions.drag_and_drop(element, target).perform()\n", 320 | " self.assertEqual('You are definitely not a bot!', self.driver.find_element(By.ID, 'message').text)\n", 321 | "\n", 322 | "if __name__ == '__main__':\n", 323 | " unittest.main(argv=[''], exit=False)\n", 324 | " %reset" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 25, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "from webdriver_manager.chrome import ChromeDriverManager\n", 334 | "\n", 335 | "CHROMEDRIVER_PATH = ChromeDriverManager().install()" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 29, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "True" 347 | ] 348 | }, 349 | "execution_count": 29, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 | "from selenium.webdriver.chrome.options import Options\n", 356 | "from selenium import webdriver\n", 357 | "from selenium.webdriver.chrome.service import Service\n", 358 | "\n", 359 | "chrome_options = Options()\n", 360 | "chrome_options.add_argument(\"--headless\")\n", 361 | "driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=chrome_options)\n", 362 | 
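"driver.get('http://www.pythonscraping.com/')\n",
363 | "driver.get_screenshot_as_file('pythonscraping.png')"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "The screenshot above is clipped to the driver's default window size. A small follow-on sketch (an editor's addition, not from the book, assuming the `driver` from the previous cell is still open): resize the window to the dimensions the page itself reports before capturing again."
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": null,
376 | "metadata": {},
377 | "outputs": [],
378 | "source": [
379 | "# Resize the headless window to the page's full scroll size before capturing\n",
380 | "width = driver.execute_script('return document.body.scrollWidth')\n",
381 | "height = driver.execute_script('return document.body.scrollHeight')\n",
382 | "driver.set_window_size(width, height)\n",
383 | "driver.get_screenshot_as_file('pythonscraping_full.png')"
384 | ]
385 | },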
"driver.get('http://www.pythonscraping.com/')\n", 363 | "driver.get_screenshot_as_file('pythonscraping.png')" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [] 372 | } 373 | ], 374 | "metadata": { 375 | "kernelspec": { 376 | "display_name": "Python 3 (ipykernel)", 377 | "language": "python", 378 | "name": "python3" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 3 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": "text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython3", 390 | "version": "3.9.12" 391 | } 392 | }, 393 | "nbformat": 4, 394 | "nbformat_minor": 2 395 | } 396 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web Scraping with Python Code Samples 2 | 3 | These code samples are for the book Web Scraping with Python 2nd Edition 4 | 5 | If you're looking for the first edition code files, they can be found in the v1 directory. 6 | 7 | Most code for the second edition is contained in Jupyter notebooks. Although these files can be viewed directly in your browser in Github, some formatting changes and oddities may occur. I recommend that you clone the repository, install Jupyter, and view them locally for the best experience. 8 | 9 | The web changes, libraries update, and make mistakes and typos more frequently than I'd like to admit! If you think you've spotted an error, please feel free to make a pull request against this repository. 10 | -------------------------------------------------------------------------------- /captcha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captcha.png -------------------------------------------------------------------------------- /captchas/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/.DS_Store -------------------------------------------------------------------------------- /captchas/2F8S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/2F8S.png -------------------------------------------------------------------------------- /captchas/2PHU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/2PHU.png -------------------------------------------------------------------------------- /captchas/2XMT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/2XMT.png -------------------------------------------------------------------------------- /captchas/34PQ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/34PQ.png 
-------------------------------------------------------------------------------- /captchas/37QR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/37QR.png -------------------------------------------------------------------------------- /captchas/3A68.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/3A68.png -------------------------------------------------------------------------------- /captchas/3BE8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/3BE8.png -------------------------------------------------------------------------------- /captchas/3DXJ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/3DXJ.png -------------------------------------------------------------------------------- /captchas/3FDB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/3FDB.png -------------------------------------------------------------------------------- /captchas/3FK7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/3FK7.png -------------------------------------------------------------------------------- /captchas/3JW5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/3JW5.png -------------------------------------------------------------------------------- /captchas/3LNK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/3LNK.png -------------------------------------------------------------------------------- /captchas/3QZZ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/3QZZ.png -------------------------------------------------------------------------------- /captchas/3RW7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/3RW7.png -------------------------------------------------------------------------------- /captchas/3TCH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/3TCH.png -------------------------------------------------------------------------------- /captchas/3YL8.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/3YL8.png -------------------------------------------------------------------------------- /captchas/3ZV3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/3ZV3.png -------------------------------------------------------------------------------- /captchas/46VL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/46VL.png -------------------------------------------------------------------------------- /captchas/49K7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/49K7.png -------------------------------------------------------------------------------- /captchas/4AUM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/4AUM.png -------------------------------------------------------------------------------- /captchas/4QXS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/4QXS.png -------------------------------------------------------------------------------- /captchas/4VHT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/4VHT.png -------------------------------------------------------------------------------- /captchas/4WSU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/4WSU.png -------------------------------------------------------------------------------- /captchas/52X6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/52X6.png -------------------------------------------------------------------------------- /captchas/56AZ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/56AZ.png -------------------------------------------------------------------------------- /captchas/65KQ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/65KQ.png -------------------------------------------------------------------------------- /captchas/696R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/696R.png -------------------------------------------------------------------------------- /captchas/6DM4.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/6DM4.png -------------------------------------------------------------------------------- /captchas/6MGR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/6MGR.png -------------------------------------------------------------------------------- /captchas/6PQ8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/6PQ8.png -------------------------------------------------------------------------------- /captchas/7994.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/7994.png -------------------------------------------------------------------------------- /captchas/7B63.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/7B63.png -------------------------------------------------------------------------------- /captchas/7CA2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/7CA2.png -------------------------------------------------------------------------------- /captchas/7HSD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/7HSD.png -------------------------------------------------------------------------------- /captchas/7MGH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/7MGH.png -------------------------------------------------------------------------------- /captchas/7R6J.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/7R6J.png -------------------------------------------------------------------------------- /captchas/7RK3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/7RK3.png -------------------------------------------------------------------------------- /captchas/7VUC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/7VUC.png -------------------------------------------------------------------------------- /captchas/7X8F.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/7X8F.png 
-------------------------------------------------------------------------------- /captchas/7Y4S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/7Y4S.png -------------------------------------------------------------------------------- /captchas/832C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/832C.png -------------------------------------------------------------------------------- /captchas/83CA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/83CA.png -------------------------------------------------------------------------------- /captchas/8696.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/8696.png -------------------------------------------------------------------------------- /captchas/88MU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/88MU.png -------------------------------------------------------------------------------- /captchas/8D8L.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/8D8L.png -------------------------------------------------------------------------------- /captchas/8MB6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/8MB6.png -------------------------------------------------------------------------------- /captchas/8N2Q.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/8N2Q.png -------------------------------------------------------------------------------- /captchas/8N6D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/8N6D.png -------------------------------------------------------------------------------- /captchas/8NMS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/8NMS.png -------------------------------------------------------------------------------- /captchas/8PRQ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/8PRQ.png -------------------------------------------------------------------------------- /captchas/8XAQ.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/8XAQ.png
--------------------------------------------------------------------------------
/captchas/8YEP.png through /captchas/ZXBW.png:
--------------------------------------------------------------------------------
Binary CAPTCHA images. Every entry in this range follows the same pattern as
/captchas/8XAQ.png above: the file path, then its raw download URL of the form
https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/captchas/<NAME>.png
The names, in order: 8YEP, 9D6N, 9J8K, 9J9F, A23U, A5HM, ACQC, ADU5, AK6F,
ALX2, APAR, AQF2, ASMW, BGKH, BX48, C6TJ, CFGF, CQ34, CRET, CX5M, D675, DCSR,
DJFF, DPML, DTKQ, DU9H, DZQW, E34Y, E88R, EASL, EFZZ, EJZV, EKJC, EMS3, ERU6,
EW7Q, EYPK, FAAS, FFNS, FNT9, FP6Z, FPL3, FUYF, FX8M, GEV7, GQ7W, GSAZ, GVPA,
GWH9, GZ45, H2U5, HCEA, HF4F, HH9N, HKUM, JTM7, K3WQ, K4U4, KE7H, KQCT, KUR6,
L4V8, L95D, LADE, LDS9, LH74, MBE7, MD5K, MKTX, MMB9, N82K, NB7Y, NEQT, NNMB,
NYJE, P9UC, PC5N, PCEV, PK7W, PQWA, PWF9, QDKW, QJJX, QLAX, QPP7, QR3C, QTHL,
QTP6, QUEB, QX4A, QYWB, R66E, RKE5, RLZ7, RQTM, RSE8, S2UB, S5QK, S8Z8, SAAM,
SZ34, T2CS, T46Y, TJ8H, TJZS, TLRX, TNBP, TUCS, TVLQ, U6GN, U7CH, U9EH, UGA6,
UKXW, UNKE, UQZE, USE7, UUFN, V3YY, V9AH, VBUM, VCMC, VCUD, VDS5, VFC5, VTWE,
W43T, WAP7, WH3C, WKU5, X5ZS, X7D4, XE48, XER2, XFCT, XPGT, XU8Y, Y2Z3, Y5L5,
YAGV, YEZY, YGV4, YMZM, YRQ8, YSRA, ZCBP, ZNYD, ZTNL, ZXBW.
--------------------------------------------------------------------------------
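The four-character filenames double as labels: each name appears to be the
solution text of the CAPTCHA it contains, which would make this folder a small
labeled dataset for OCR experiments elsewhere in the repository. A minimal
sketch of loading it that way with Pillow, assuming a local checkout with a
captchas/ directory and assuming the filename-as-label convention holds (the
listing itself does not state it):
--------------------------------------------------------------------------------
from pathlib import Path

from PIL import Image  # pip install pillow

def load_captchas(root='captchas'):
    """Yield (label, image) pairs, treating the filename stem as the label,
    e.g. captchas/8XAQ.png -> ('8XAQ', <PIL.Image.Image>)."""
    for path in sorted(Path(root).glob('*.png')):
        yield path.stem, Image.open(path)

for label, image in load_captchas():
    print(label, image.size)
--------------------------------------------------------------------------------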
/downloaded/cdn.oreillystatic.com/images/dei/deij-odot.svg (also saved as deij-odot.svg?):
--------------------------------------------------------------------------------
(SVG markup was not captured by this dump; only 80 empty numbered lines
survive, in both copies.)
--------------------------------------------------------------------------------
/downloaded/cdn.oreillystatic.com/images/sitewide-headers/oreilly_logo_mark_red.svg (also saved as oreilly_logo_mark_red.svg?):
--------------------------------------------------------------------------------
(SVG markup was not captured; the only surviving text in either copy is the
title "oreilly_logo_mark_red_d30000".)
--------------------------------------------------------------------------------
/downloaded/cdn.oreillystatic.com/oreilly/images/ (binary images; each file is
saved twice, the second copy under the same name plus a trailing "?", and each
entry points at
https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/downloaded/cdn.oreillystatic.com/oreilly/images/<NAME>):
amazon-appstore-logo.png, app-store-logo.png, cert-vendor-logos.png,
google-play-logo.png, home-video-testimonial-thumb1-711x400-20221020.jpg,
home-video-testimonial-thumb2-400x225-20221019.jpg,
home-video-testimonial-thumb3-711x400-20230201.jpg,
home_plot3_lot_600x600.jpg, interactive_laptop_780x426.png,
laptop-flat-courses-20230228.png, roku-tv-logo.png
--------------------------------------------------------------------------------
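A tree like this downloaded/ mirror, including the name-plus-"?" twins, is the
characteristic output of walking a page's asset URLs and writing each one to a
local path derived from the URL itself. A minimal sketch of that idea, assuming
requests and BeautifulSoup are available; mirror_assets and the example page
URL are illustrative, not this repository's actual code:
--------------------------------------------------------------------------------
import os
from urllib.parse import urljoin, urlparse
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup

def mirror_assets(page_url, out_dir='downloaded'):
    """Save every <img> on a page under out_dir/<host>/<path>, mirroring
    the directory layout of the listing above."""
    soup = BeautifulSoup(requests.get(page_url).text, 'html.parser')
    for img in soup.find_all('img', src=True):
        asset_url = urljoin(page_url, img['src'])
        parsed = urlparse(asset_url)
        # This sketch drops the query string when building the local name;
        # saving the raw src verbatim instead is what yields "name.png?"
        # style twins like the ones listed above.
        local_path = os.path.join(out_dir, parsed.netloc, parsed.path.lstrip('/'))
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        urlretrieve(asset_url, local_path)

mirror_assets('https://pythonscraping.com')
--------------------------------------------------------------------------------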
/downloaded/pythonscraping.com/wp-content/themes/popularfx/js/navigation.js (an identical copy is saved as navigation.js?ver=1.2.0):
--------------------------------------------------------------------------------
/**
 * File navigation.js.
 *
 * Handles toggling the navigation menu for small screens and enables TAB key
 * navigation support for dropdown menus.
 */
( function() {
    var container, button, menu, links, i, len;

    container = document.getElementById( 'site-navigation' );
    if ( ! container ) {
        return;
    }

    button = document.getElementsByClassName( 'menu-toggle' )[0];
    if ( 'undefined' === typeof button ) {
        return;
    }

    menu = container.getElementsByTagName( 'ul' )[0];

    // Hide menu toggle button if menu is empty and return early.
    if ( 'undefined' === typeof menu ) {
        button.style.display = 'none';
        return;
    }

    if ( -1 === menu.className.indexOf( 'nav-menu' ) ) {
        menu.className += ' nav-menu';
    }

    button.onclick = function() {
        if ( container.classList.contains( 'hidden-mobile' ) == true ) {
            container.className = container.className.replace( 'hidden-mobile', '' );
            button.setAttribute( 'aria-expanded', 'false' );
        } else {
            container.className += ' hidden-mobile';
            button.setAttribute( 'aria-expanded', 'true' );
        }
    };

    // Close small menu when user clicks outside
    document.addEventListener( 'click', function( event ) {
        var isClickInside = container.contains( event.target );

        if ( ! isClickInside ) {
            container.className = container.className.replace( ' toggled', '' );
            button.setAttribute( 'aria-expanded', 'false' );
        }
    } );

    // Get all the link elements within the menu.
    links = menu.getElementsByTagName( 'a' );

    // Each time a menu link is focused or blurred, toggle focus.
    for ( i = 0, len = links.length; i < len; i++ ) {
        links[i].addEventListener( 'focus', toggleFocus, true );
        links[i].addEventListener( 'blur', toggleFocus, true );
    }

    /**
     * Sets or removes .focus class on an element.
     */
    function toggleFocus() {
        var self = this;

        // Move up through the ancestors of the current link until we hit .nav-menu.
        while ( -1 === self.className.indexOf( 'nav-menu' ) ) {
            // On li elements toggle the class .focus.
            if ( 'li' === self.tagName.toLowerCase() ) {
                if ( -1 !== self.className.indexOf( 'focus' ) ) {
                    self.className = self.className.replace( ' focus', '' );
                } else {
                    self.className += ' focus';
                }
            }

            self = self.parentElement;
        }
    }

    /**
     * Toggles `focus` class to allow submenu access on tablets.
     */
    ( function() {
        var touchStartFn,
            parentLink = container.querySelectorAll( '.menu-item-has-children > a, .page_item_has_children > a' );

        if ( 'ontouchstart' in window ) {
            touchStartFn = function( e ) {
                var menuItem = this.parentNode;

                if ( ! menuItem.classList.contains( 'focus' ) ) {
                    e.preventDefault();
                    for ( i = 0; i < menuItem.parentNode.children.length; ++i ) {
                        if ( menuItem === menuItem.parentNode.children[i] ) {
                            continue;
                        }
                        menuItem.parentNode.children[i].classList.remove( 'focus' );
                    }
                    menuItem.classList.add( 'focus' );
                } else {
                    menuItem.classList.remove( 'focus' );
                }
            };

            for ( i = 0; i < parentLink.length; ++i ) {
                parentLink[i].addEventListener( 'touchstart', touchStartFn, false );
            }
        }
    }( container ) );
}() );

/**
 * File skip-link-focus-fix.js.
 *
 * Helps with accessibility for keyboard only users.
 *
 * Learn more: https://git.io/vWdr2
 */
( function() {
    var isIe = /(trident|msie)/i.test( navigator.userAgent );

    if ( isIe && document.getElementById && window.addEventListener ) {
        window.addEventListener( 'hashchange', function() {
            var id = location.hash.substring( 1 ),
                element;

            if ( ! ( /^[A-z0-9_-]+$/.test( id ) ) ) {
                return;
            }

            element = document.getElementById( id );

            if ( element ) {
                if ( ! ( /^(?:a|select|input|button|textarea)$/i.test( element.tagName ) ) ) {
                    element.tabIndex = -1;
                }

                element.focus();
            }
        }, false );
    }
}() );
--------------------------------------------------------------------------------
/downloaded/pythonscraping.com/wp-content/uploads/2021/08/home1.jpg (also saved as home1.jpg?):
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/downloaded/pythonscraping.com/wp-content/uploads/2021/08/home1.jpg
--------------------------------------------------------------------------------
/downloaded/pythonscraping.com/wp-content/uploads/2021/08/logo01-e1681353135199.png (also saved as logo01-e1681353135199.png?):
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/downloaded/pythonscraping.com/wp-content/uploads/2021/08/logo01-e1681353135199.png
--------------------------------------------------------------------------------
/downloaded/pythonscraping.com/wp-content/uploads/2023/04/python-logo-e1681354047443.png (also saved as python-logo-e1681354047443.png?):
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/downloaded/pythonscraping.com/wp-content/uploads/2023/04/python-logo-e1681354047443.png
--------------------------------------------------------------------------------
jQuery Migrate v3.4.0 | (c) OpenJS Foundation and other contributors | jquery.org/license */ 2 | "undefined"==typeof jQuery.migrateMute&&(jQuery.migrateMute=!0),function(t){"use strict";"function"==typeof define&&define.amd?define(["jquery"],function(e){return t(e,window)}):"object"==typeof module&&module.exports?module.exports=t(require("jquery"),window):t(jQuery,window)}(function(s,n){"use strict";function e(e){return 0<=function(e,t){for(var r=/^(\d+)\.(\d+)\.(\d+)/,n=r.exec(e)||[],o=r.exec(t)||[],a=1;a<=3;a++){if(+n[a]>+o[a])return 1;if(+n[a]<+o[a])return-1}return 0}(s.fn.jquery,e)}s.migrateVersion="3.4.0";var t=Object.create(null),o=(s.migrateDisablePatches=function(){for(var e=0;e\x20\t\r\n\f]*)[^>]*)\/>/gi),_=(s.UNSAFE_restoreLegacyHtmlPrefilter=function(){s.migrateEnablePatches("self-closed-tags")},d(s,"htmlPrefilter",function(e){var t,r;return(r=(t=e).replace(O,"<$1>"))!==t&&C(t)!==C(r)&&i("self-closed-tags","HTML tags must be properly nested and closed: "+t),e.replace(O,"<$1>")},"self-closed-tags"),s.migrateDisablePatches("self-closed-tags"),s.fn.offset);return d(s.fn,"offset",function(){var e=this[0];return!e||e.nodeType&&e.getBoundingClientRect?_.apply(this,arguments):(i("offset-valid-elem","jQuery.fn.offset() requires a valid DOM element"),arguments.length?this:void 0)},"offset-valid-elem"),s.ajax&&(H=s.param,d(s,"param",function(e,t){var r=s.ajaxSettings&&s.ajaxSettings.traditional;return void 0===t&&r&&(i("param-ajax-traditional","jQuery.param() no longer uses jQuery.ajaxSettings.traditional"),t=r),H.call(this,e,t)},"param-ajax-traditional")),u(s.fn,"andSelf",s.fn.addBack,"andSelf","jQuery.fn.andSelf() is deprecated and removed, use jQuery.fn.addBack()"),s.Deferred&&(E=s.Deferred,M=[["resolve","done",s.Callbacks("once memory"),s.Callbacks("once memory"),"resolved"],["reject","fail",s.Callbacks("once memory"),s.Callbacks("once memory"),"rejected"],["notify","progress",s.Callbacks("memory"),s.Callbacks("memory")]],d(s,"Deferred",function(e){var a=E(),i=a.promise();function t(){var o=arguments;return s.Deferred(function(n){s.each(M,function(e,t){var r="function"==typeof o[e]&&o[e];a[t[1]](function(){var e=r&&r.apply(this,arguments);e&&"function"==typeof e.promise?e.promise().done(n.resolve).fail(n.reject).progress(n.notify):n[t[0]+"With"](this===i?n.promise():this,r?[e]:arguments)})}),o=null}).promise()}return u(a,"pipe",t,"deferred-pipe","deferred.pipe() is deprecated"),u(i,"pipe",t,"deferred-pipe","deferred.pipe() is deprecated"),e&&e.call(a,a),a},"deferred-pipe"),s.Deferred.exceptionHook=E.exceptionHook),s}); 3 |
-------------------------------------------------------------------------------- /downloaded/www.googletagmanager.com/ns.html: -------------------------------------------------------------------------------- 7 | ns --------------------------------------------------------------------------------
/drivers/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/drivers/.DS_Store -------------------------------------------------------------------------------- /drivers/chromedriver_mac_arm64/chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/drivers/chromedriver_mac_arm64/chromedriver -------------------------------------------------------------------------------- /editors.csv: -------------------------------------------------------------------------------- 1 | Name,Developer,Initial release,Latest release,Programming language,Cost (US$),License,GUI,TUI or CLI 2 | Version,Date 3 | Acme,Rob Pike,1993,Plan 9 and Inferno,,C,No cost,MITGPL-2.0-onlyLPL-1.02,, 4 | AkelPad,Alexey KuznetsovAlexander Shengalts,2003,4.9.8[1],2016-07-18,C,No cost,BSD-2-Clause,, 5 | Alphatk,Vince Darley,1999,8.3.3[2],2004-12-10,,$40,"Proprietary, with BSD components",, 6 | Atom,GitHub,2014,1.63.1[3],2022-11-23,"HTML, CSS, JavaScript, C++",No cost,MIT,, 7 | BBEdit,Rich Siegel,1992,14.6.3[4],2023-01-19,"Objective-C, Objective-C++","No cost for most features, $49.99 for full version",Proprietary,, 8 | Bluefish,Bluefish Development Team,1999,2.2.13[5],2023-02-23,C,No cost,GPL-3.0-or-later,, 9 | Brackets,Adobe Systems,2012,2.1.3[6],2022-10-31,"HTML, CSS, JavaScript, C++",No cost,MIT,, 10 | Coda,Panic,2007,2.7.7[7],2020-11-05,Objective-C,$99,Proprietary,, 11 | ConTEXT,ConTEXT Project Ltd,1999,0.98.6[8],2009-08-14,Object Pascal (Delphi),No cost,BSD-3-Clause,, 12 | Crimson Editor,Ingyu Kang,1999,3.72-r286m[9],2011-10-01,C++,No cost,Proprietary,, 13 | CudaText,UVViewSoft[a],2015,1.170.5[10],2022-09-10,Object Pascal (Lazarus),No cost,MPL-2.0,, 14 | ed,Ken Thompson,1970,unchanged from original,,C,No cost,?,, 15 | Editra,Cody Precord,2007,0.7.20,2013-01-05,Python,No cost,wxWindows license,, 16 | EmEditor,"Emurasoft, Inc.",1997,21.3.0,2021-11-24,C++,$39.99,Shareware,, 17 | epsilon,Lugaru Software,1984,14.00[11],2020-10-20,C,$250,Proprietary,, 18 | FeatherPad,Pedram Pourang,2016,1.3.3[12],2022-10-07,"C++, Qt",No cost,GPL-3.0-or-later,, 19 | gedit,GNU Project,2000,44.2[13],2023-01-19,C,No cost,GPL-2.0-or-later,, 20 | Geany,Enrico Tröger,2005,1.38.0[14],2021-10-09,"C, GTK+",No cost,GPL-2.0-or-later,, 21 | GNU Emacs,Richard Stallman,1984,28.2[15],2022-09-12,"C, Emacs Lisp",No cost,GPL-3.0-or-later,, 22 | GNU nano,Chris Allegretta,1999,7.2[16],2023-01-18,C,No
cost,GPL-3.0-or-later,, 23 | IA Writer,Information Architects,2010.09.22,5.6,2020-12-07,"Objective-C (iOS), Objective-C (macOS), C# (Windows), Java (Android)",$29.99,Proprietary,, 24 | JED,John E. Davis,1992,0.99-19,2009-12-13,"C, S-Lang",No cost,GPL-2.0-or-later,, 25 | jEdit,Slava Pestov,1998,5.6.0[17],2020-09-03,Java,No cost,GPL-2.0-or-later,, 26 | JOE,Joseph Allen,1988,4.6[18],2018-01-09,C,No cost,GPL,, 27 | JOVE,Johnathon Payne,1983,4.16.0.73,2010-07-11,C,No cost,JOVE license,, 28 | Kate,KDE Project,2000-12,22.12.0[19],2022-11-30,C++,No cost,GPL,, 29 | KEDIT,"Mansfield Software Group, Inc.",1983,1.6.1,2016-12-05,C,$129,Proprietary,, 30 | Komodo Edit,Activestate,2007,12.0.1[20],2020-02-10,"Python, JavaScript, Perl, Tcl, PHP, Ruby",No cost,"MPL, GPL, LGPL",, 31 | Komodo IDE,Activestate,2001,12.0.1,2020-02-10,"Python, JavaScript, Perl, Tcl, PHP, Ruby",$295,Proprietary,, 32 | KWrite,KDE Project,2000,21.12.2[21],2022-02-03,C++,No cost,GPL,, 33 | LE,Alexander V. Lukyanov,1997,1.16.8[22],2021-02-05,C++,No cost,GPL-3.0-or-later,, 34 | Leo,Edward K. Ream,1996,,,Python,No cost,MIT,, 35 | Light Table,Chris Granger,2012,0.8.1[23],2016-01-22,ClojureScript,No cost,MIT,, 36 | mcedit,Miguel de Icaza,1994,4.8.29[24][25],2023-01-21,"C, Python, PHP, Javascript, Perl, Tcl, Ruby",No cost,GPL-3.0-or-later,, 37 | Metapad,Alexander Davidson,1999,3.6[26][27],2011-05-28,C,No cost,GPL-3.0-or-later,, 38 | mg,Dave Conroy,1986,current[b],2020-07-22,C,No cost,Public domain,, 39 | MinEd,Thomas Wolff,1992,2022.27,2022-12-23,C,No cost,GPL,, 40 | MS-DOS Editor,Microsoft,1991,2.0.026,,,"Bundled with MS-DOS, Microsoft Windows",Proprietary,, 41 | ne,Sebastiano VignaTodd LewisDaniele Filaretti,1993,,,C,No cost,GPL-3.0-or-later,, 42 | NEdit,Mark Edel,1991,5.7[29],2017-02-08,C,No cost,GPL-2.0-or-later,, 43 | Notepad,Microsoft,1985,11.2302.16.0,,MASM (originally),Bundled with Microsoft Windows,Proprietary,, 44 | Notepad++,Don Ho,2003-11-25,8.5.1[30],2023-03-24,C++,No cost,GPL-3.0-or-later,, 45 | nvi,Keith Bostic,1994,1.81.6[31],2007-11-18,C,No cost,BSD-3-Clause,, 46 | Pico,University of Washington,1992,4.64,,C,No cost,Apache-2.0,, 47 | PolyEdit,PolySoft Solutions,1998,5.4,2010‑04‑07,,$27.95,Shareware,, 48 | PSPad,Jan Fiala,2002,5.0.7[32],2023-03-10,Object Pascal (Delphi),No cost,Proprietary,, 49 | RJ TextEd,Rickard Johansson,2004,15.63[33],2022-09-29,Object Pascal (Delphi),No cost,Proprietary,, 50 | Sam,Rob Pike,1980s (early),stable,,C,No cost,LPL (OSI approved),, 51 | SciTE,Neil Hodgson,1999,5.3.4[34],2023-03-08,C++,$41.99 for macOS. 
No cost for others,HPND,, 52 | SlickEdit,"SlickEdit, Inc.",1988,25.0.0,2020-10-20,"C, Slick-C",$299,Proprietary,, 53 | Smultron,Peter Borg,2004,12.0.6,2020-01-03,Objective-C,$7.99,Proprietary,, 54 | Source Insight,Source Dynamics,?,4.0.0084,2017-02-26,Source Insight macro language,$239-$255,Proprietary,, 55 | SubEthaEdit,TheCodingMonkeys,2003,5.2.4[35],2022-01-08,,No cost,MIT,, 56 | Sublime Text,"Jon Skinner, Sublime HQ",2008,4 (Build 4143)[36],2022-11-11,"C++, Python",$99,Shareware,, 57 | TED Notepad,Juraj Simlovic,2001,6.3.1[37],2021-12-01,C,No cost,Proprietary,, 58 | Textadept,Mitchell,2007,,,"C, Lua",No cost,MIT,, 59 | TextEdit,Apple Inc.,2001,1.18,2022-10-24,,No cost (also bundled with macOS),BSD-3-Clause,, 60 | TextMate,MacroMates,2004-10-10,2.0.23[38],2021-10-12,Objective-C++,No cost,GPL-3.0-or-later,, 61 | TextPad,Helios Software Solutions,1992,8.14.2[39],2022-11-22,,$30.00 (£16.50),Shareware,, 62 | TextWrangler,Bare Bones Software,2003,5.5.2,2016-09-20,,No cost,Proprietary,, 63 | The SemWare Editor,Sammy Mitchell,1985-11,4.43[40],2022-05,"C, SAL",No cost,BSD-2-Clause,, 64 | UltraEdit,IDM Computer Solutions,1994,25.0,2018-03-12,C++,$99.95,Proprietary,, 65 | VEDIT,"Ted Green, Greenview Data",1980,6.24.2,2015-01-12,"Assembly, C","$89 standard, $239 Pro64",Proprietary,, 66 | vi,Bill Joy,1976,3.7,,C,No cost,BSD-4-Clause or CDDL,, 67 | Vim,Bram Moolenaar,1991,9.0[41],2022-06-28,"C, Vim script",No cost,Vim,, 68 | Visual Studio Code,Microsoft,2015,1.77.3[42],2023-04-12,"JavaScript, TypeScript",No cost,Source code: MIT[43] Binaries built by Microsoft: Proprietary[44][45],, 69 | XEmacs,Lucid Inc.,1991,21.4.22[46],2009-01-30,"C, Emacs Lisp",No cost,GPL-2.0-or-later,, 70 | XNEdit,Olaf Wintermann,2018,1.4,2022-01-14,C,No cost,GPL-2.0-or-later,, 71 | Name,Developer,Initial release,Latest release,Programming language,Cost (US$),License,GUI,TUI or CLI 72 | Version,Date 73 | -------------------------------------------------------------------------------- /foo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/foo.pdf -------------------------------------------------------------------------------- /logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/logo.jpg -------------------------------------------------------------------------------- /output.txt: -------------------------------------------------------------------------------- 1 | This is some text, written in Arial, that will be read by 2 | Tesseract. 
Here are some symbols: !|@#$%*&*() 3 | -------------------------------------------------------------------------------- /page.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/page.jpg -------------------------------------------------------------------------------- /pythonscraping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/REMitchell/python-scraping/2183ee8eac05c0f6b6ffb12cc5a418bff95b0242/pythonscraping.png -------------------------------------------------------------------------------- /result.html: -------------------------------------------------------------------------------- 1 | ["Client not found"] -------------------------------------------------------------------------------- /test.csv: -------------------------------------------------------------------------------- 1 | number,number plus 2,number times 2 2 | 0,2,0 3 | 1,3,2 4 | 2,4,4 5 | 3,5,6 6 | 4,6,8 7 | 5,7,10 8 | 6,8,12 9 | 7,9,14 10 | 8,10,16 11 | 9,11,18 12 | --------------------------------------------------------------------------------