├── LICENSE ├── README.md └── SeoPy └── SeoPy.py /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2012 Corey McMahon 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SeoPy for everyone! -------------------------------------------------------------------------------- /SeoPy/SeoPy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | SeoPy 3 | ==== 4 | A set of tools for performing some standard SEO tasks, such as keyword research and competitive analysis. 
'''
SeoPy
=====
A set of tools for performing some standard SEO tasks, such as keyword
research and competitive analysis.
'''

import re
import time
import http.client
import urllib.parse

# complete=0 disables autocomplete, pws=0 disables personalised results.
GOOGLE_SEARCH_URL = "https://encrypted.google.com/search?complete=0&pws=0&q="
# NOTE(review): kept for backward compatibility, but PageRankChecker builds
# its own toolbar URL (see PageRankChecker.prurl) and this constant is unused.
TOOLBAR_URL = 'http://toolbarqueries.google.com/search?client=navclient-auto&ch={0}&features=Rank&q=info:{1}'


class SeoPy:
    ''' This class provides the entry-point to the functionality of the library. '''

    def execute_http_request(self, endpoint, http_method="GET", http_body="", http_headers=""):
        '''Issue an HTTP(S) request against *endpoint* and return the raw
        http.client response object.

        endpoint     -- absolute URL, e.g. "https://example.com/path"
        http_method  -- HTTP verb (default "GET")
        http_body    -- currently unused; kept for interface compatibility
        http_headers -- currently unused; kept for interface compatibility
        '''
        # Trim off everything before // and everything after the first
        # occurrence of / to isolate the host name.
        domain = endpoint[endpoint.find("//") + 2:]
        if domain.find("/") != -1:
            domain = domain[0:domain.find("/")]

        # Everything after the host is the resource (relative URL).
        resource = endpoint[endpoint.find(domain) + len(domain):]

        # Choose plain HTTP or TLS based on the scheme (the original matched
        # "https" anywhere in the URL, which misfires on query strings).
        if endpoint.startswith("https"):
            conn = http.client.HTTPSConnection(domain)
        else:
            conn = http.client.HTTPConnection(domain)

        conn.request(http_method, resource)
        response = conn.getresponse()
        return response

    def get_results_for(self, keyword):
        ''' Get the first 10 results for this search query '''
        url = GOOGLE_SEARCH_URL + urllib.parse.quote(keyword)
        # http.client returns bytes; decode before regex parsing.
        html = self.execute_http_request(url).read().decode("utf-8", "replace")
        return GoogleResults(html)


class GoogleResults:
    ''' Class representing a page of Google search results '''

    def __init__(self, html_document):
        ''' Pass in the raw HTML for the results page '''
        self.html_document = html_document

    def get_average_pagerank(self):
        '''Return the trimmed mean PageRank of the results on this page:
        the single highest and lowest ranks are discarded and the rest
        averaged.  Returns 0.0 when there are no results; with fewer than
        three results no trimming is applied.'''
        checker = PageRankChecker()
        results = self.get_results()
        if not results:
            # Guard: the original divided by zero on an empty result set.
            return 0.0

        lowest = 11   # PageRank is 0..10, so 11 is above any real value
        highest = -1
        total = 0
        count = 0
        for title, url in results:
            # TODO: work out why the PR toolbar server is denying us after
            # a certain # of requests; the sleep is a crude throttle.
            pagerank = int(checker.get_pr(url))
            time.sleep(4)
            total += pagerank
            if pagerank < lowest:
                lowest = pagerank
            if pagerank > highest:
                highest = pagerank
            count += 1
            print(url + "(" + str(pagerank) + ")\n")

        if count > 2:
            # Trimmed mean: drop one extreme at each end and average the
            # remainder (the original divided by the untrimmed count,
            # systematically under-reporting the average).
            return float(total - lowest - highest) / float(count - 2)
        return float(total) / float(count)

    def get_number_of_results(self):
        ''' Get the total number of results for this query (as a digit string;
        "0" when the count cannot be found in the page). '''
        results = re.search('([0-9,]*?) result', self.html_document)
        if results is None:
            return "0"
        return results.group(1).replace(",", "")

    def get_results(self):
        ''' Get an array of arrays containing the top 10 results, the title of the page and the URL '''
        # NOTE(review): the original result-anchor regex was lost in transit;
        # this matches the classic '<h3 class="r">...</h3>' result headings —
        # confirm against a live results page.
        results = re.findall('<h3 class="r">.*?</h3>', self.html_document, re.M)

        rlist = []
        for result in results:
            # Strip HTML tags to recover the plain-text title.
            title = re.sub('<[^<]+?>', '', result)
            url_matches = re.search(r"/url\?q=(.*?)\&", result)
            if url_matches is None:
                # Some result types carry no /url?q= redirect; skip them
                # instead of crashing (this was the file's own TODO).
                continue
            url = urllib.parse.unquote(url_matches.group(1))
            rlist.append([title, url])
        return rlist

    def get_raw_html(self):
        ''' Get the raw HTML for this resultset '''
        return self.html_document


class PageRankChecker:
    ''' PageRank checker based on the implementation here: https://github.com/phurix/pagerank/blob/master/pagerank.py '''

    prhost = 'toolbarqueries.google.com'
    prurl = 'http://%s/tbr?client=navclient-auto&ch=%s&features=Rank&q=info:%s'

    def int_str(self, String, Integer, Factor):
        '''Fold *String* into *Integer*, multiplying by *Factor* per
        character and keeping the result within 32 bits.'''
        for i in range(len(String)):
            Integer *= Factor
            Integer &= 0xFFFFFFFF  # emulate 32-bit unsigned overflow
            Integer += ord(String[i])
        return Integer

    def hash_url(self, Str):
        '''Compute the Google-toolbar hash of *Str* via two polynomial
        rolling hashes whose bit-fields are interleaved.'''
        C1 = self.int_str(Str, 0x1505, 0x21)
        C2 = self.int_str(Str, 0, 0x1003F)

        C1 >>= 2
        C1 = ((C1 >> 4) & 0x3FFFFC0) | (C1 & 0x3F)
        C1 = ((C1 >> 4) & 0x3FFC00) | (C1 & 0x3FF)
        C1 = ((C1 >> 4) & 0x3C000) | (C1 & 0x3FFF)

        T1 = (C1 & 0x3C0) << 4
        T1 |= C1 & 0x3C
        T1 = (T1 << 2) | (C2 & 0xF0F)

        T2 = (C1 & 0xFFFFC000) << 4
        T2 |= C1 & 0x3C00
        T2 = (T2 << 0xA) | (C2 & 0xF0F0000)

        return (T1 | T2)

    def check_hash(self, HashInt):
        '''Prefix *HashInt* with a "7" and a Luhn-style check digit, as the
        toolbar "ch" parameter requires.'''
        HashStr = "%u" % (HashInt)
        Flag = 0
        CheckByte = 0

        # Walk the digits right-to-left, doubling every second one.
        i = len(HashStr) - 1
        while i >= 0:
            Byte = int(HashStr[i])
            if 1 == (Flag % 2):
                Byte *= 2
                # Integer division is required here: under Python 3 the
                # original "Byte / 10" produced a float and a wrong digit.
                Byte = Byte // 10 + Byte % 10
            CheckByte += Byte
            Flag += 1
            i -= 1

        CheckByte %= 10
        if 0 != CheckByte:
            CheckByte = 10 - CheckByte
            if 1 == Flag % 2:
                if 1 == CheckByte % 2:
                    CheckByte += 9
                CheckByte >>= 1

        return '7' + str(CheckByte) + HashStr

    def get_url(self, query):
        '''Build the full toolbar query URL for *query*.'''
        # Renamed from "hash", which shadowed the builtin.
        url_hash = self.check_hash(self.hash_url(query))
        url = self.prurl % (self.prhost, url_hash, query)
        return url

    def get_pr(self, url):
        '''Query the toolbar server and return *url*'s PageRank as a string
        (the text after the last ":" in the server response).'''
        endpoint = self.get_url(url)
        seopy = SeoPy()
        # Decode the byte response before string slicing.
        rank = seopy.execute_http_request(endpoint).read().decode("utf-8", "replace")
        print("[" + rank + "]")
        return rank[rank.rfind(":") + 1:].replace("\n", "")