├── README.md
├── knowledge-panel-scraper.py
└── requirements.txt
/README.md:
--------------------------------------------------------------------------------
# knowledge-panel-scraper
knowledge-panel-scraper is a Python 3 command-line tool that scrapes Google's [Knowledge Panels](https://support.google.com/business/answer/6331288?hl=en) to retrieve the phone numbers, addresses, hours, and websites of a list of businesses given as input.

## Installation

Use git to clone the repository, then install the required libraries with the package manager [pip](https://pip.pypa.io/en/stable/).

```bash
git clone https://github.com/jnovak98/knowledge-panel-scraper.git
cd knowledge-panel-scraper
pip install -r requirements.txt
```

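If you want to keep the dependencies isolated, you can optionally install inside a virtual environment first (standard Python tooling, nothing project-specific):

```bash
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
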
## Usage
```bash
python knowledge-panel-scraper.py inputfile.csv
```

inputfile.csv should be a plain-text CSV file in which each row holds the data for one business; the script joins a row's fields with spaces to build the Google search query.
For example:
```csv
"Bobcat of Monroe,Monroe,NC",1711 MORGAN MILL ROAD,MONROE,NC,28110,(704) 289-2200
"Kelly's Garage,Perry,NY",2868 STATE ROUTE 246,PERRY,NY,14530,(585) 237-2504
"Hoxie Implement Co,Hoxie,KS",933 OAK AVENUE,HOXIE,KS,67740-0587,(785) 675-3201
"Duhon Machinery,St. Rose,LA",10460 WEST AIRLINE HIGHWAY,ST. ROSE,LA,70087,(504) 466-5495
```

The results are saved to results.csv in the directory the script is run from.
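
The columns of results.csv match the field names used in knowledge-panel-scraper.py:

```csv
query,exists,name,claimed,phone_number,address,website,Friday_hours,Saturday_hours,Sunday_hours,Monday_hours,Tuesday_hours,Wednesday_hours,Thursday_hours
```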

## Contributing
Pull requests are welcome.

## License
[MIT](https://choosealicense.com/licenses/mit/)
--------------------------------------------------------------------------------
/knowledge-panel-scraper.py:
--------------------------------------------------------------------------------
import csv, re, sys
import requests

# Request headers that mimic a desktop Firefox browser, so Google serves
# the normal HTML results page.
headers_Get = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Markers that locate each Knowledge Panel field in Google's result HTML.
# These are tied to Google's markup and need updating whenever it changes.
html_tags = {
    'knowledge_panel': 'kp-blk knowledge-panel',
    'claimed': "Own this business?",
    'name': "kno-ecr-pt kno-fb-ctx",
    'phone': 'LrzXr zdqRlf kno-fv',
    'days': "kc:/location/location:hours",
    'address': "kc:/location/location:address",
    'website': "IzNS7c duf-h"
}

# Regexes applied to the HTML just after each marker to capture its value.
# Like html_tags, these track Google's markup and break when it changes.
html_regexes = {
    'name': '<span>(.*)</span>',
    'phone': '<span>(.*?)</span>',
    'hours': '<td>(.*)</td>',
    'address': '<span>(.*)</span>',
    'website': 'href="(.*?)"'
}

days = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]

def google(q):
    """Fetch the Google search results page for query `q` and return its HTML."""
    s = requests.Session()
    q = '+'.join(q.split())
    url = 'https://www.google.com/search?q=' + q + '&ie=utf-8&oe=utf-8'
    r = s.get(url, headers=headers_Get)
    return r.text

def get_string_after_tag(string, tag, regex, distance):
    """Search the `distance` characters after the first occurrence of `tag`
    for `regex`, returning its first capture group (or None on no match)."""
    if tag not in string:
        return None

    index = string.find(tag)
    substr = string[index:index + distance]
    match = re.search(regex, substr)
    return match.group(1) if match else None

def get_details(query):
    """Scrape the Knowledge Panel for `query` and return a dict of results."""
    html_results = google(query)
    results = {'query': query}
    has_knowledge_panel = html_tags['knowledge_panel'] in html_results

    if has_knowledge_panel:
        results['exists'] = True
        results['name'] = get_string_after_tag(html_results, html_tags['name'], html_regexes['name'], 500)

        # Google only shows an "Own this business?" link on unclaimed panels.
        results['claimed'] = html_tags['claimed'] not in html_results

        phone_number = get_string_after_tag(html_results, html_tags['phone'], html_regexes['phone'], 200)
        if phone_number:
            results['phone_number'] = phone_number

        address = get_string_after_tag(html_results, html_tags['address'], html_regexes['address'], 1000)
        if address:
            results['address'] = address

        website = get_string_after_tag(html_results, html_tags['website'], html_regexes['website'], 200)
        if website:
            results['website'] = website

        # Opening hours appear as a day-by-day table after the hours marker;
        # search for each day's name within that block.
        if html_tags['days'] in html_results:
            hours_index = html_results.find(html_tags['days'])
            hours_substr = html_results[hours_index:hours_index + 2000]
            for day in days:
                results['{}_hours'.format(day)] = get_string_after_tag(hours_substr, day, html_regexes['hours'], 50)
    else:
        results['exists'] = False
    return results


if __name__ == "__main__":
    with open(sys.argv[1], newline='') as csvfile:
        with open('results.csv', 'w', newline='') as results:
            reader = csv.reader(csvfile)
            fieldnames = ['query', 'exists', 'name', 'claimed', 'phone_number', 'address', 'website',
                          'Friday_hours', 'Saturday_hours', 'Sunday_hours', 'Monday_hours',
                          'Tuesday_hours', 'Wednesday_hours', 'Thursday_hours']
            writer = csv.DictWriter(results, fieldnames=fieldnames)
            writer.writeheader()
            for row in reader:
                # Print the input line number as a simple progress indicator.
                print(reader.line_num)
                # Join the row's fields with spaces to form the search query.
                writer.writerow(get_details(" ".join(row)))

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests==2.22.0
--------------------------------------------------------------------------------