"""Webscraping-Selenium: Ahrefs Site Explorer scraper.

Webscraping tool that pulls data off an Ahrefs premium account for SEO
keyword research.  It can parse through hundreds of targets and insert the
data into an Excel file.

Notes carried over from the project README:

1. The URLs are for companies in the credit & debt markets but can be
   replaced with any other websites.
2. The scraped data is stored in a spreadsheet file (the README says CSV;
   this script writes the ``Site_Explorer6.xlsx`` workbook).
3. An Ahrefs account and a Google account are needed, and the credentials
   must be entered manually in the browser while the script pauses.

My first real software project for a company.
"""

# Imports
import time

import xlsxwriter
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Browser
# Raw string: the backslashes in a Windows path must not be read as escapes.
PATH = r"C:\Program Files (x86)\chromedriver.exe"

# Google login page opened first so the operator can sign in by hand.
LOGIN_URL = "https://accounts.google.com/o/oauth2/v2/auth/oauthchooseaccount?redirect_uri=https%3A%2F%2Fdevelopers.google.com%2Foauthplayground&prompt=consent&response_type=code&client_id=407408718192.apps.googleusercontent.com&scope=email&access_type=offline&flowName=GeneralOAuthFlow"

OUTPUT_FILE = 'Site_Explorer6.xlsx'

# Element ids used on the page being scraped.
SEARCH_FIELD_ID = 'se_pe_target'
BACKLINK_STATS_ID = 'BacklinksStatsContainer'

# Ids of the single-value metrics, in the order they are written to
# columns 1..7 of each data row.
METRIC_IDS = (
    'topAhrefsRank',
    'UrlRatingContainer',
    'DomainRatingContainer',
    'numberOfRefPages',
    'numberOfRefDomains',
    'numberOfOrganicKeywords',
    'numberOfOrganicTraffic',
)

# The backlink-stats text is split on whitespace and every third token,
# starting at index 1, is written to columns 8..18.
STATS_FIRST_COLUMN = 8
STATS_VALUE_COUNT = 11
STATS_FIRST_TOKEN = 1
STATS_TOKEN_STRIDE = 3

# Target that is expected to be in the search field after the manual login.
DEFAULT_TARGET = 'www.experian.com'

# Extra backspaces sent on top of the typed length when clearing the field.
CLEAR_MARGIN = 50

# Timing (seconds).
MANUAL_LOGIN_SECONDS = 45
AFTER_INITIAL_CLEAR_SECONDS = 2
BEFORE_FIRST_SEARCH_SECONDS = 20
RESULT_LOAD_SECONDS = 12
AFTER_CLEAR_SECONDS = 8
ELEMENT_TIMEOUT_SECONDS = 30

# Targets to look up; replace with any other websites.
TARGET_URLS = [
    "www.liberty1financial.com",
    "www.badcredit.org/",
    "www.discover.com/",
    "www.creditsesame.com/",
    "www.bankofamerica.com/credit-cards/",
]

# Header row, written to row 0 starting at column 1.
# NOTE(review): there are 27 titles but each data row only fills columns
# 1..18, and 'Governmentatl' / 'REdirect' look like typos.  The strings are
# kept exactly as they were so existing spreadsheets stay comparable.
TITLES = [
    'Ahrefs Rank', 'URL Rank', 'Domain Rank', 'Backlinks',
    'Referring Domains', 'Organic Keywords', 'Traffic Value',
    'Referring domains', 'Dofollow', 'Governmentatl', 'Educational',
    '.gov', '.edu', '.com', '.net', '.org', 'Backlinks', 'Dofollow',
    'Nofollow', 'UGC', 'Sponsored', 'Text', 'REdirect', 'Image', 'Form',
    'Governmental', 'Educational',
]


def clear_field(field, presses):
    """Send ``presses`` BACKSPACE keystrokes to ``field``."""
    for _ in range(presses):
        field.send_keys(Keys.BACKSPACE)


def extract_backlink_stats(text):
    """Return the values picked out of the backlink-stats text.

    The text is split on whitespace and tokens 1, 4, 7, ... are taken
    (``STATS_VALUE_COUNT`` of them).  If the text is shorter than expected
    the missing values are returned as empty strings instead of raising
    ``IndexError``, so one odd page cannot abort the whole run.
    """
    tokens = text.split()
    values = []
    for n in range(STATS_VALUE_COUNT):
        index = STATS_FIRST_TOKEN + n * STATS_TOKEN_STRIDE
        values.append(tokens[index] if index < len(tokens) else '')
    return values


def write_header(sheet):
    """Write the column titles into row 0, starting at column 1."""
    for column, title in enumerate(TITLES, start=1):
        sheet.write(0, column, title)


def scrape_target(driver, sheet, row, target):
    """Search for ``target`` and write its metrics into ``row`` of ``sheet``.

    The search field is located again on every call so that a reference
    from an earlier page state is never reused.
    """
    search = driver.find_element(By.ID, SEARCH_FIELD_ID)
    search.send_keys(target)
    search.send_keys(Keys.ENTER)

    # Give the results time to load *before* reading them; reading straight
    # after ENTER can pick up whatever the page showed for the previous
    # target.
    time.sleep(RESULT_LOAD_SECONDS)
    WebDriverWait(driver, ELEMENT_TIMEOUT_SECONDS).until(
        EC.presence_of_element_located((By.ID, METRIC_IDS[0]))
    )

    for column, element_id in enumerate(METRIC_IDS, start=1):
        sheet.write(row, column, driver.find_element(By.ID, element_id).text)

    stats_text = driver.find_element(By.ID, BACKLINK_STATS_ID).text
    for offset, value in enumerate(extract_backlink_stats(stats_text)):
        sheet.write(row, STATS_FIRST_COLUMN + offset, value)

    # Empty the search field for the next target.  The count is based on
    # the length of the text that was typed (not on the number of targets)
    # plus a generous margin; surplus backspaces on an empty field are
    # harmless.
    search = driver.find_element(By.ID, SEARCH_FIELD_ID)
    clear_field(search, len(target) + CLEAR_MARGIN)
    time.sleep(AFTER_CLEAR_SECONDS)


def main():
    """Run the scrape and save the results to ``OUTPUT_FILE``.

    The workbook is closed (which is what writes it to disk) and the
    browser is shut down even if scraping fails part-way, so rows that were
    already collected are not lost.
    """
    # NOTE(review): Selenium 4.10+ no longer accepts the driver path as a
    # positional argument; on those versions pass a Service object instead.
    driver = webdriver.Chrome(PATH)
    book = None
    try:
        # Opening Google login
        driver.get(LOGIN_URL)

        # Manual input time: the operator signs in and opens the page that
        # contains the search field.
        time.sleep(MANUAL_LOGIN_SECONDS)

        # Remove the target that is already in the search bar.
        search = driver.find_element(By.ID, SEARCH_FIELD_ID)
        clear_field(search, len(DEFAULT_TARGET) + 1)
        time.sleep(AFTER_INITIAL_CLEAR_SECONDS)

        # Create Excel file
        book = xlsxwriter.Workbook(OUTPUT_FILE)
        sheet = book.add_worksheet()
        write_header(sheet)

        time.sleep(BEFORE_FIRST_SEARCH_SECONDS)
        for row, target in enumerate(TARGET_URLS, start=1):
            scrape_target(driver, sheet, row, target)
    finally:
        try:
            if book is not None:
                book.close()
        finally:
            # Leaving driver
            driver.quit()


if __name__ == "__main__":
    main()