├── .gitignore
├── CHANGES.txt
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── docs
│   ├── exporting_tc.png
│   └── saving_tc.png
├── requirements.txt
├── seleniumcrawler
│   ├── __init__.py
│   ├── config.py
│   ├── filesystem.py
│   ├── handle.py
│   ├── makeparsed.py
│   ├── sites
│   │   ├── __init__.py
│   │   ├── forbes
│   │   │   ├── __init__.py
│   │   │   ├── forbes.py
│   │   │   ├── forbes.tc
│   │   │   └── forbes_raw.py
│   │   ├── hackaday
│   │   │   ├── __init__.py
│   │   │   ├── hackaday.py
│   │   │   ├── hackaday.tc
│   │   │   └── hackaday_raw.py
│   │   ├── hnews
│   │   │   ├── __init__.py
│   │   │   ├── hnews.py
│   │   │   ├── hnews.tc
│   │   │   └── hnews_raw.py
│   │   └── reddit
│   │       ├── __init__.py
│   │       ├── reddit.py
│   │       ├── reddit.tc
│   │       └── reddit_raw.py
│   └── tests
│       ├── __init__.py
│       └── test_all.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
*.pyc
chromedriver.log
selenium_crawler.egg-info
--------------------------------------------------------------------------------
/CHANGES.txt:
--------------------------------------------------------------------------------
v0.1.0, April 29 2013 -- Initial release.
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2013 Cory Walker

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include *.txt
recursive-include docs *.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
selenium-crawler 0.1.0
======================

Have you ever needed to crawl a list of URLs that may or may not directly
contain the content you so desperately crave? The web is full of links that do
not behave, for any number of reasons. Here are just a few of them:

1. The page is actually a landing page that links to the page you want:
    * Hacker News
    * Hack-a-day
    * Reddit
2. The page content is only made available after closing an ad:
    * Forbes
3. The content is behind some sort of login or paywall:
    * Boston Globe
4. One must click through some sort of pagination to find the content:
    * Web forums

You might be asking: why use Selenium when you can use a combination of
PhantomJS and BeautifulSoup to extract the needed data? That combination is a
great way to accomplish some of the tasks listed above, but it has a couple of
limitations:

* Business teams would sometimes rather work with a visual tool than write
  lines of code.
* Selenium already has most of the code that would be needed built in; with
  PhantomJS and BeautifulSoup you would be writing it yourself.

**Depending on Selenium DOES NOT mean that your crawling servers will also need
to run a GUI. Selenium can run in a headless environment. See below for more
information.**

Quickstart
==========

```bash
pip install -e git+https://github.com/cmwslw/selenium-crawler.git#egg=selenium-crawler
```

```python
from seleniumcrawler import handle_url
print handle_url('https://news.ycombinator.com/item?id=5626377')
```

This will open up a browser window, 'click' on the main link, and load the
article. It will print the following:

```python
{
    'url': 'http://googleblog.blogspot.com/2013/04/google-now-on-your-iphone-and-ipad-with.html',
    'source': {{HTMLSOURCE}},
    'handler': 'hnews'
}
```

Here `{{HTMLSOURCE}}` stands for the actual HTML source of the article.

Creating test cases
===================

Create test cases as you normally would in Selenium, but take the following
things into account:

### Never make a test case article-specific
By default, when you record a Selenium test case, the very first instruction
tells the browser to load a specific URL: whichever article you happened to be
viewing. Since you are designing scripts that should work for ANY given
article on the site, make sure you always remove this line (a concrete sketch
follows at the end of this section). selenium-crawler might take care of this
step automatically in the future; that is TBD.

### Make your locators as robust as possible
On the one hand, make the locators specific enough to be confident that they
will only match exactly what needs to be matched. On the other hand, make sure
that your locators will continue to match even after a website theme change or
a full redesign. It is impossible to account for every possible change in site
structure, but you get the idea. By default, Selenium tends to record very
specific locators. Make sure to fix these up a bit.

### Save test cases in a standard way
While it does not strictly matter where saved test case files go, for
reference and debugging purposes it is useful to save them in a standard
location. For the time being, I have established the form
`sites/{name}/{name}.tc`. Hopefully this will prove to be a decent convention.
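
As a concrete illustration of the first rule above, here is a minimal sketch of
what a freshly recorded test method might look like, with a hypothetical URL
standing in for whatever article happened to be open while recording. The
`driver.get(...)` line is the one to remove; the parsed `handle_link` function
receives the article URL at crawl time instead:

```python
def test_hnews_raw(self):
    driver = self.driver
    # Recorded line that ties the script to one specific article
    # (hypothetical URL). Delete it so the script works for ANY article.
    driver.get("https://news.ycombinator.com/item?id=5626377")
    # The generic navigation step below is what should remain.
    driver.find_element_by_xpath("//td[@class='title']/a").click()
```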

Exporting test cases
====================

selenium-crawler takes exported Selenium Python scripts as its inputs. Follow
these instructions to obtain the exported scripts, and be sure to use the
WebDriver backend:

![ScreenShot](https://raw.github.com/cmwslw/selenium-crawler/master/docs/exporting_tc.png)

Next, we need to save the script in a place where selenium-crawler can find it.
Find the `sites` directory and create a directory for the site, choosing a
useful, concise name. Save the actual exported script under the name
`{name}_raw.py`:

![ScreenShot](https://raw.github.com/cmwslw/selenium-crawler/master/docs/saving_tc.png)

Parsing selenium cases
======================

The test cases that Selenium exports are not even valid Python code. Here is an
example:

```python
self.accept_next_alert = true
```

Once you fix the syntax errors, they are useful for writing test suites, but
not for writing crawlers. Running `python makeparsed.py` takes all these
Selenium test cases and converts them into Python code usable for crawling.
Here is an example that clicks through an annoying JavaScript ad on Forbes
pages:

```python
from selenium import webdriver

def handle_link(link):
    driver = webdriver.Firefox()
    driver.implicitly_wait(30)
    driver.get(link)

    driver.find_element_by_xpath("//div[@class='header']/div[@class='continue']/a").click()

    results = {
        'url': driver.current_url,
        'source': driver.page_source
    }
    driver.quit()

    return results
```

So go ahead: run `python makeparsed.py`. You should see output similar to the
following:

```
Parsed ./sites/forbes/forbes_raw.py.
Parsed ./sites/hnews/hnews_raw.py.
Parsed ./sites/reddit/reddit_raw.py.
```

Don't worry if the paths are different for your installation. Keep in mind that
`makeparsed.py` only has to be run when site scripts have been changed or
added.

Headless configuration
======================

Running headless means that no actual GUI runs on a monitor during use. Put
simply, no browser window will pop up when a URL is handled. One way to run
headless is through xvfb, a tool that sets up virtual framebuffers. Run this
before using selenium-crawler:

```bash
sh -e /etc/init.d/xvfb start
export DISPLAY=:99.0
```

This is the method that CI systems like Travis-CI and CircleCI recommend (a
Python-level alternative is sketched at the end of this README). There are
other methods of running Selenium in a headless environment; a quick Google
search will turn up more information.

Contributing
============

Contributing is easy. If you write any new site handling scripts, just be sure
to follow the guide above and write a quick test for them in `test_all.py`.
Send in a pull request and I'll get a `CONTRIBUTORS.txt` file going.
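
As an alternative to managing xvfb by hand, the virtual display can also be
started from inside Python with the third-party `pyvirtualdisplay` package. An
assumption for this sketch: pyvirtualdisplay is not a dependency of this
project, and it still needs xvfb installed under the hood.

```python
from pyvirtualdisplay import Display

from seleniumcrawler import handle_url

# Start a virtual framebuffer; Firefox renders into it instead of a monitor.
display = Display(visible=0, size=(1024, 768))
display.start()
try:
    result = handle_url('https://news.ycombinator.com/item?id=5626377')
    print result['url']
finally:
    # Always tear the display down, even if handling fails.
    display.stop()
```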
--------------------------------------------------------------------------------
/docs/exporting_tc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/corywalker/selenium-crawler/e49df4c1f40a330af19d90b7c59d49ac1acee86c/docs/exporting_tc.png
--------------------------------------------------------------------------------
/docs/saving_tc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/corywalker/selenium-crawler/e49df4c1f40a330af19d90b7c59d49ac1acee86c/docs/saving_tc.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
selenium==2.32.0
--------------------------------------------------------------------------------
/seleniumcrawler/__init__.py:
--------------------------------------------------------------------------------
from seleniumcrawler.handle import handle_url
--------------------------------------------------------------------------------
/seleniumcrawler/config.py:
--------------------------------------------------------------------------------
sites_dict = {
    'forbes': '(.*)forbes.com(.*)',
    'hnews': '(.*)news.ycombinator.com(.*)',
    'reddit': '(.*)reddit.com(.*)',
    'hackaday': '(.*)hackaday.com(.*)',
}
--------------------------------------------------------------------------------
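Adding support for a new site starts in this file. A hypothetical entry for the
Boston Globe (the paywall example from the README) is sketched below; the name
and pattern are assumptions, not part of the project. Note that the existing
patterns leave their dots unescaped, so `.` matches any character; `handle.py`
only requires that `re.match` succeeds against the URL.

```python
sites_dict = {
    'forbes': '(.*)forbes.com(.*)',
    'hnews': '(.*)news.ycombinator.com(.*)',
    'reddit': '(.*)reddit.com(.*)',
    'hackaday': '(.*)hackaday.com(.*)',
    # Hypothetical new entry; it would also need a sites/bostonglobe/
    # directory with a bostonglobe_raw.py export, per the README conventions.
    'bostonglobe': '(.*)bostonglobe.com(.*)',
}
```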
/seleniumcrawler/filesystem.py:
--------------------------------------------------------------------------------
# Global modules
import os
# Local modules
from seleniumcrawler.config import sites_dict

THIS_DIR = os.path.dirname(os.path.realpath(__file__))
SITES_DIR = os.path.join(THIS_DIR, 'sites')

def locate_sites():
    # Build one locations dict per configured site.
    location_list = []
    for site, regex in sites_dict.items():
        this_site_dir = os.path.join(SITES_DIR, site)
        # This is only the EXPECTED script name.
        # All scripts should follow this convention.
        script_name = site + '_raw.py'
        script_path = os.path.join(this_site_dir, script_name)
        config_path = os.path.join(this_site_dir, 'config.py')
        location_dict = {
            'name': site,
            'script_path': script_path,
            'config_path': config_path,
            'site_dir': this_site_dir
        }
        location_list.append(location_dict)

    return location_list
--------------------------------------------------------------------------------
/seleniumcrawler/handle.py:
--------------------------------------------------------------------------------
# Global modules
import re
# Local modules
from seleniumcrawler.config import sites_dict

class HandlerError(Exception):
    def __init__(self, value):
        self.value = value
    def __str__(self):
        return repr(self.value)

def import_from(module, name):
    # Import `module` and return its attribute `name` (here, the site
    # module's handle_link function).
    module = __import__(module, fromlist=[name])
    return getattr(module, name)

def handle_url(url):
    for site, regex in sites_dict.items():
        if re.match(regex, url):
            handler = import_from('seleniumcrawler.sites.%s.%s' % (site, site), 'handle_link')
            result = handler(url)
            result['handler'] = site
            return result
    raise HandlerError('Handler for URL not defined')
--------------------------------------------------------------------------------
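A quick sketch of how these modules fit together: dispatch happens purely on
the URL, and unmatched URLs surface as a HandlerError (the URL below is the
Quickstart example; any article link matching a configured regex works):

```python
from seleniumcrawler.handle import HandlerError, handle_url

try:
    result = handle_url('https://news.ycombinator.com/item?id=5626377')
    print result['handler']  # 'hnews'
    print result['url']      # final article URL after the click-through
except HandlerError:
    print 'No handler is configured for this URL.'
```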
/seleniumcrawler/makeparsed.py:
--------------------------------------------------------------------------------
# Global modules
import os
# Third party modules
import pystache
# Local modules
from seleniumcrawler.filesystem import locate_sites

template = '''from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

def get_profile():
    # get the Firefox profile object
    firefoxProfile = FirefoxProfile()
    # Disable CSS
    firefoxProfile.set_preference('permissions.default.stylesheet', 2)
    # Disable images
    firefoxProfile.set_preference('permissions.default.image', 2)
    # Disable Flash
    firefoxProfile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so',
                                  'false')

    return firefoxProfile

def handle_link(link):
    profile = get_profile()

    # Set the modified profile while creating the browser object
    driver = webdriver.Firefox(profile)
    driver.implicitly_wait(30)
    driver.get(link)

{{{code}}}
    results = {
        'url': driver.current_url,
        'source': driver.page_source
    }
    driver.quit()

    return results
'''

def parse_raw_script(name, directory, path):
    f = open(path)

    # Collect the body of the test_* method from the exported script,
    # stopping at the next method definition.
    codelines = []
    at_main_code = False
    for line in f:
        if at_main_code:
            codelines.append(line[4:])
        if line.startswith('    def test_'):
            at_main_code = True
        elif line.startswith('    def') and at_main_code:
            # Drop the leading 'driver = self.driver' line plus the trailing
            # blank line and the next method's 'def' line.
            codelines = codelines[1:-2]
            at_main_code = False
            break

    fout = open(os.path.join(directory, name + '.py'), 'w')
    code = ''.join(codelines)
    data = {'code': code}
    rendered = pystache.render(template, data)
    fout.write(rendered)

for site in locate_sites():
    parse_raw_script(site['name'], site['site_dir'], site['script_path'])
    print "Parsed %s." % site['script_path']
--------------------------------------------------------------------------------
/seleniumcrawler/sites/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/corywalker/selenium-crawler/e49df4c1f40a330af19d90b7c59d49ac1acee86c/seleniumcrawler/sites/__init__.py
--------------------------------------------------------------------------------
/seleniumcrawler/sites/forbes/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/corywalker/selenium-crawler/e49df4c1f40a330af19d90b7c59d49ac1acee86c/seleniumcrawler/sites/forbes/__init__.py
--------------------------------------------------------------------------------
/seleniumcrawler/sites/forbes/forbes.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

def get_profile():
    # get the Firefox profile object
    firefoxProfile = FirefoxProfile()
    # Disable CSS
    firefoxProfile.set_preference('permissions.default.stylesheet', 2)
    # Disable images
    firefoxProfile.set_preference('permissions.default.image', 2)
    # Disable Flash
    firefoxProfile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so',
                                  'false')

    return firefoxProfile

def handle_link(link):
    profile = get_profile()

    # Set the modified profile while creating the browser object
    driver = webdriver.Firefox(profile)
    driver.implicitly_wait(30)
    driver.get(link)

    driver.find_element_by_xpath("//div[@class='header']/div[@class='continue']/a").click()

    results = {
        'url': driver.current_url,
        'source': driver.page_source
    }
    driver.quit()

    return results
--------------------------------------------------------------------------------
/seleniumcrawler/sites/forbes/forbes.tc:
--------------------------------------------------------------------------------
[Selenium IDE test case (HTML); markup not preserved in this dump. Recovered rows:]
Forbes
clickAndWait | //div[@class='header']/div[@class='continue']/a
--------------------------------------------------------------------------------
/seleniumcrawler/sites/forbes/forbes_raw.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
import unittest, time, re

class ForbesRaw(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(30)
        self.base_url = "https://news.ycombinator.com/"
        self.verificationErrors = []
        self.accept_next_alert = true

    def test_forbes_raw(self):
        driver = self.driver
        driver.find_element_by_xpath("//div[@class='header']/div[@class='continue']/a").click()

    def is_element_present(self, how, what):
        try: self.driver.find_element(by=how, value=what)
        except NoSuchElementException, e: return False
        return True

    def close_alert_and_get_its_text(self):
        try:
            alert = self.driver.switch_to_alert()
            if self.accept_next_alert:
                alert.accept()
            else:
                alert.dismiss()
            return alert.text
        finally: self.accept_next_alert = True

    def tearDown(self):
        self.driver.quit()
        self.assertEqual([], self.verificationErrors)

if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/seleniumcrawler/sites/hackaday/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/corywalker/selenium-crawler/e49df4c1f40a330af19d90b7c59d49ac1acee86c/seleniumcrawler/sites/hackaday/__init__.py
--------------------------------------------------------------------------------
/seleniumcrawler/sites/hackaday/hackaday.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

def get_profile():
    # get the Firefox profile object
    firefoxProfile = FirefoxProfile()
    # Disable CSS
    firefoxProfile.set_preference('permissions.default.stylesheet', 2)
    # Disable images
    firefoxProfile.set_preference('permissions.default.image', 2)
    # Disable Flash
    firefoxProfile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so',
                                  'false')

    return firefoxProfile

def handle_link(link):
    profile = get_profile()

    # Set the modified profile while creating the browser object
    driver = webdriver.Firefox(profile)
    driver.implicitly_wait(30)
    driver.get(link)

    driver.find_element_by_xpath("//div[@class='entry-content']/p/a[1]").click()

    results = {
        'url': driver.current_url,
        'source': driver.page_source
    }
    driver.quit()

    return results
--------------------------------------------------------------------------------
/seleniumcrawler/sites/hackaday/hackaday.tc:
--------------------------------------------------------------------------------
[Selenium IDE test case (HTML); markup not preserved in this dump. Recovered rows:]
hackaday
clickAndWait | //div[@class='entry-content']/p/a[1]
--------------------------------------------------------------------------------
/seleniumcrawler/sites/hackaday/hackaday_raw.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
import unittest, time, re

class HackadayRaw(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(30)
        self.base_url = "http://hackaday.com/"
        self.verificationErrors = []
        self.accept_next_alert = true

    def test_hackaday_raw(self):
        driver = self.driver
        driver.find_element_by_xpath("//div[@class='entry-content']/p/a[1]").click()

    def is_element_present(self, how, what):
        try: self.driver.find_element(by=how, value=what)
        except NoSuchElementException, e: return False
        return True

    def close_alert_and_get_its_text(self):
        try:
            alert = self.driver.switch_to_alert()
            if self.accept_next_alert:
                alert.accept()
            else:
                alert.dismiss()
            return alert.text
        finally: self.accept_next_alert = True

    def tearDown(self):
        self.driver.quit()
        self.assertEqual([], self.verificationErrors)

if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/seleniumcrawler/sites/hnews/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/corywalker/selenium-crawler/e49df4c1f40a330af19d90b7c59d49ac1acee86c/seleniumcrawler/sites/hnews/__init__.py
--------------------------------------------------------------------------------
/seleniumcrawler/sites/hnews/hnews.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

def get_profile():
    # get the Firefox profile object
    firefoxProfile = FirefoxProfile()
    # Disable CSS
    firefoxProfile.set_preference('permissions.default.stylesheet', 2)
    # Disable images
    firefoxProfile.set_preference('permissions.default.image', 2)
    # Disable Flash
    firefoxProfile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so',
                                  'false')

    return firefoxProfile

def handle_link(link):
    profile = get_profile()

    # Set the modified profile while creating the browser object
    driver = webdriver.Firefox(profile)
    driver.implicitly_wait(30)
    driver.get(link)

    driver.find_element_by_xpath("//td[@class='title']/a").click()

    results = {
        'url': driver.current_url,
        'source': driver.page_source
    }
    driver.quit()

    return results
--------------------------------------------------------------------------------
/seleniumcrawler/sites/hnews/hnews.tc:
--------------------------------------------------------------------------------
[Selenium IDE test case (HTML); markup not preserved in this dump. Recovered rows:]
Hacker News
clickAndWait | //td[@class='title']/a
--------------------------------------------------------------------------------
/seleniumcrawler/sites/hnews/hnews_raw.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
import unittest, time, re

class HnewsRaw(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(30)
        self.base_url = "https://news.ycombinator.com/"
        self.verificationErrors = []
        self.accept_next_alert = true

    def test_hnews_raw(self):
        driver = self.driver
        driver.find_element_by_xpath("//td[@class='title']/a").click()

    def is_element_present(self, how, what):
        try: self.driver.find_element(by=how, value=what)
        except NoSuchElementException, e: return False
        return True

    def close_alert_and_get_its_text(self):
        try:
            alert = self.driver.switch_to_alert()
            if self.accept_next_alert:
                alert.accept()
            else:
                alert.dismiss()
            return alert.text
        finally: self.accept_next_alert = True

    def tearDown(self):
        self.driver.quit()
        self.assertEqual([], self.verificationErrors)

if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/seleniumcrawler/sites/reddit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/corywalker/selenium-crawler/e49df4c1f40a330af19d90b7c59d49ac1acee86c/seleniumcrawler/sites/reddit/__init__.py
--------------------------------------------------------------------------------
/seleniumcrawler/sites/reddit/reddit.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

def get_profile():
    # get the Firefox profile object
    firefoxProfile = FirefoxProfile()
    # Disable CSS
    firefoxProfile.set_preference('permissions.default.stylesheet', 2)
    # Disable images
    firefoxProfile.set_preference('permissions.default.image', 2)
    # Disable Flash
    firefoxProfile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so',
                                  'false')

    return firefoxProfile

def handle_link(link):
    profile = get_profile()

    # Set the modified profile while creating the browser object
    driver = webdriver.Firefox(profile)
    driver.implicitly_wait(30)
    driver.get(link)

    driver.find_element_by_xpath("//p[@class='title']/a").click()

    results = {
        'url': driver.current_url,
        'source': driver.page_source
    }
    driver.quit()

    return results
--------------------------------------------------------------------------------
/seleniumcrawler/sites/reddit/reddit.tc:
--------------------------------------------------------------------------------
[Selenium IDE test case (HTML); markup not preserved in this dump. Recovered rows:]
reddit2
clickAndWait | //p[@class='title']/a
--------------------------------------------------------------------------------
/seleniumcrawler/sites/reddit/reddit_raw.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
import unittest, time, re

class Reddit2(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(30)
        self.base_url = "https://news.ycombinator.com/"
        self.verificationErrors = []
        self.accept_next_alert = true

    def test_reddit2(self):
        driver = self.driver
        driver.find_element_by_xpath("//p[@class='title']/a").click()

    def is_element_present(self, how, what):
        try: self.driver.find_element(by=how, value=what)
        except NoSuchElementException, e: return False
        return True

    def close_alert_and_get_its_text(self):
        try:
            alert = self.driver.switch_to_alert()
            if self.accept_next_alert:
                alert.accept()
            else:
                alert.dismiss()
            return alert.text
        finally: self.accept_next_alert = True

    def tearDown(self):
        self.driver.quit()
        self.assertEqual([], self.verificationErrors)

if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/seleniumcrawler/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/corywalker/selenium-crawler/e49df4c1f40a330af19d90b7c59d49ac1acee86c/seleniumcrawler/tests/__init__.py
--------------------------------------------------------------------------------
/seleniumcrawler/tests/test_all.py:
--------------------------------------------------------------------------------
# Global modules
import unittest
# Local modules
from seleniumcrawler.handle import handle_url

class TestHandlers(unittest.TestCase):

    def test_forbes(self):
        r = handle_url('http://www.forbes.com/sites/abrambrown/2013/04/22/netflixs-profit-picture-clears-q1s-big-beat-surprises-wall-street/')
        self.assertEqual(r['handler'], 'forbes')
        # Check that a phrase from the article body made it into the source.
        self.assertTrue('You need to go back to 2011, to shortly before the Qwikster'
                        in r['source'])

    def test_hnews(self):
        r = handle_url('https://news.ycombinator.com/item?id=5612912')
        self.assertEqual(r['handler'], 'hnews')
        self.assertTrue('Wolfe, Cockrell and the rest of the team got a couple of Nexus'
                        in r['source'])

    def test_reddit(self):
        r = handle_url('http://www.reddit.com/r/technology/comments/1d5ptg/the_force_of_fiber_google_fiber_is_pressuring/')
        self.assertEqual(r['handler'], 'reddit')
        self.assertTrue('that there is no public data that paints a complete picture'
                        in r['source'])

    def test_hackaday(self):
        r = handle_url('http://hackaday.com/2013/04/26/old-led-marquee-turned-embedded-video-player/')
        self.assertEqual(r['handler'], 'hackaday')
        self.assertTrue('A better look at the industrial PC.' in r['source'])

if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
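Per the Contributing section of the README, each new site handler should ship
with a quick test here. Continuing the hypothetical Boston Globe example from
config.py above, a method along these lines (the URL and expected snippet are
placeholders, not real data) would be added inside TestHandlers:

```python
    def test_bostonglobe(self):
        # Hypothetical: assumes a 'bostonglobe' entry in config.py and a
        # parsed sites/bostonglobe/bostonglobe.py handler exist.
        r = handle_url('http://www.bostonglobe.com/some/article/path')
        self.assertEqual(r['handler'], 'bostonglobe')
        self.assertTrue('a phrase from the article body' in r['source'])
```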
/setup.py:
--------------------------------------------------------------------------------
import os
from setuptools import setup, find_packages

# Utility function to read the README file, used for the long_description.
# It's nice because 1) we keep a top-level README file and 2) it's easier to
# maintain the README file than a raw string below.
def read(fname):
    return open(os.path.join(os.path.dirname(__file__), fname)).read()

with open('requirements.txt') as f:
    required = f.read().splitlines()

setup(
    name = "selenium-crawler",
    version = "0.1.0",
    packages = find_packages(),

    # Install the pinned dependencies listed in requirements.txt.
    install_requires=required,

    package_data = {
        # If any package contains *.txt or *.rst files, include them:
        '': ['*.txt', '*.rst'],
    },

    # metadata for upload to PyPI
    author = "Cory Walker",
    author_email = "cwalker32@gmail.com",
    description = ("Sometimes sites make crawling hard. Selenium-crawler uses "
                   "Selenium automation to fix that."),
    license = "MIT",
    keywords = "selenium crawling crawl automate ads landing",
    url = "https://github.com/cmwslw/selenium-crawler",

    long_description=read('README.md'),
    test_suite = "seleniumcrawler.tests.test_all",
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Topic :: Utilities",
        "License :: OSI Approved :: MIT License",
    ],
)
--------------------------------------------------------------------------------