├── README.md
├── knowledge-panel-scraper.py
└── requirements.txt

/README.md:
--------------------------------------------------------------------------------
# knowledge-panel-scraper

knowledge-panel-scraper is a Python 3 command-line tool that scrapes Google's [Knowledge Panels](https://support.google.com/business/answer/6331288?hl=en) to retrieve the phone number, address, and hours of each business in an input list.

## Installation

Use git to clone the repository, then install the required libraries with the package manager [pip](https://pip.pypa.io/en/stable/):

```bash
git clone https://github.com/jnovak98/knowledge-panel-scraper.git
cd knowledge-panel-scraper
pip install -r requirements.txt
```

## Usage

```bash
python knowledge-panel-scraper.py inputfile.csv
```

`inputfile.csv` should be a plain-text CSV file in which each row holds the data for one business; the fields of a row are joined with spaces to form that business's search query. For example:

```csv
"Bobcat of Monroe,Monroe,NC",1711 MORGAN MILL ROAD,MONROE,NC,28110,(704) 289-2200
"Kelly's Garage,Perry,NY",2868 STATE ROUTE 246,PERRY,NY,14530,(585) 237-2504
"Hoxie Implement Co,Hoxie,KS",933 OAK AVENUE,HOXIE,KS,67740-0587,(785) 675-3201
"Duhon Machinery,St. Rose,LA",10460 WEST AIRLINE HIGHWAY,ST. ROSE,LA,70087,(504) 466-5495
```

The results are saved to `results.csv` in the current working directory.
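## Output

`results.csv` contains one row per input row. Its header matches the field names the script writes:

```csv
query,exists,name,claimed,phone_number,address,website,Sunday_hours,Monday_hours,Tuesday_hours,Wednesday_hours,Thursday_hours,Friday_hours,Saturday_hours
```

`exists` is `True` when a knowledge panel was found, and `claimed` is `True` unless Google shows the "Own this business?" prompt; the remaining columns are left blank when a field could not be extracted.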
## Contributing

Pull requests are welcome.

## License

[MIT](https://choosealicense.com/licenses/mit/)
--------------------------------------------------------------------------------
/knowledge-panel-scraper.py:
--------------------------------------------------------------------------------
import csv
import re
import sys

import requests

# Request headers that make the scraper look like a desktop Firefox browser.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Marker strings that locate each field inside Google's result HTML.
html_tags = {
    'knowledge_panel': 'kp-blk knowledge-panel',
    'claimed': "Own this business?",
    'name': "kno-ecr-pt kno-fb-ctx",
    'phone': 'LrzXr zdqRlf kno-fv',
    'days': "kc:/location/location:hours",
    'address': "kc:/location/location:address",
    'website': "IzNS7c duf-h"
}

# Regexes applied to the text that follows each marker to pull out its value.
html_regexes = {
    'name': '(.*)',
    'phone': '(.*?)',
    'hours': '(.*)',
    'address': '(.*)',
    'website': 'href="(.*?)"'
}

days = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]


def google(q):
    """Fetch the Google search results page for the query string q."""
    s = requests.Session()
    q = '+'.join(q.split())
    url = 'https://www.google.com/search?q=' + q + '&ie=utf-8&oe=utf-8'
    r = s.get(url, headers=HEADERS)
    return r.text


def get_string_after_tag(string, tag, regex, distance):
    """Search the `distance` characters after `tag` for `regex`; return group 1 or None."""
    if tag not in string:
        return None
    index = string.find(tag)
    substr = string[index:index + distance]
    match = re.search(regex, substr)
    return match.group(1) if match else None


def get_details(query):
    """Scrape the knowledge panel (if any) for a single search query."""
    html_results = google(query)
    results = {'query': query}

    if html_tags['knowledge_panel'] not in html_results:
        results['exists'] = False
        return results

    results['exists'] = True
    results['name'] = get_string_after_tag(
        html_results, html_tags['name'], html_regexes['name'], 500)

    # Google shows an "Own this business?" link only on unclaimed panels.
    results['claimed'] = html_tags['claimed'] not in html_results

    phone_number = get_string_after_tag(
        html_results, html_tags['phone'], html_regexes['phone'], 200)
    if phone_number:
        results['phone_number'] = phone_number

    address = get_string_after_tag(
        html_results, html_tags['address'], html_regexes['address'], 1000)
    if address:
        results['address'] = address

    website = get_string_after_tag(
        html_results, html_tags['website'], html_regexes['website'], 200)
    if website:
        results['website'] = website

    if html_tags['days'] in html_results:
        hours_index = html_results.find(html_tags['days'])
        hours_substr = html_results[hours_index:hours_index + 2000]
        for day in days:
            results['{}_hours'.format(day)] = get_string_after_tag(
                hours_substr, day, html_regexes['hours'], 50)

    return results


if __name__ == "__main__":
    with open(sys.argv[1], newline='') as csvfile, \
            open('results.csv', 'w', newline='') as results:
        reader = csv.reader(csvfile)
        # One column per scraped field, plus one hours column per weekday.
        fieldnames = ['query', 'exists', 'name', 'claimed', 'phone_number',
                      'address', 'website'] + ['{}_hours'.format(day) for day in days]
        writer = csv.DictWriter(results, fieldnames=fieldnames)
        writer.writeheader()
        for row in reader:
            print(reader.line_num)  # progress: current input line number
            writer.writerow(get_details(" ".join(row)))
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests==2.22.0
--------------------------------------------------------------------------------
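Since `get_details` does all the work for a single query, the scraper can also be driven from Python code rather than the CLI. A minimal sketch, assuming the script has been copied to `knowledge_panel_scraper.py` (the hyphenated filename is not a valid module name, so this rename is hypothetical):

```python
# Hypothetical programmatic use; assumes the script was saved as
# knowledge_panel_scraper.py so that it can be imported as a module.
from knowledge_panel_scraper import get_details

# The query is the same space-joined text the CLI builds from a CSV row.
details = get_details("Bobcat of Monroe Monroe NC 1711 MORGAN MILL ROAD")
if details['exists']:
    print(details.get('phone_number'), details.get('address'))
```

Each call issues one live Google search, so results depend on Google's current HTML markup.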