├── LICENSE ├── README.md └── SeoPy └── SeoPy.py /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2012 Corey McMahon 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SeoPy for everyone! -------------------------------------------------------------------------------- /SeoPy/SeoPy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | SeoPy 3 | ==== 4 | A set of tools for performing some standard SEO tasks, such as keyword research and competitive analysis. 
'''
SeoPy
=====
A set of tools for performing some standard SEO tasks, such as keyword
research and competitive analysis.
'''

import re
import time
import http.client
import urllib.parse

# complete=0 disables autocomplete, pws=0 disables personalised results.
GOOGLE_SEARCH_URL = "https://encrypted.google.com/search?complete=0&pws=0&q="
# NOTE(review): kept for backward compatibility, but PageRankChecker builds
# its own toolbar URL (see PageRankChecker.prurl) and this constant is unused.
TOOLBAR_URL = 'http://toolbarqueries.google.com/search?client=navclient-auto&ch={0}&features=Rank&q=info:{1}'


class SeoPy:
    ''' This class provides the entry-point to the functionality of the library. '''

    def execute_http_request(self, endpoint, http_method="GET", http_body="", http_headers=""):
        '''Issue an HTTP(S) request against *endpoint* and return the raw
        http.client response object.

        endpoint     -- absolute URL, e.g. "https://example.com/path"
        http_method  -- HTTP verb (default "GET")
        http_body    -- currently unused; kept for interface compatibility
        http_headers -- currently unused; kept for interface compatibility
        '''
        # Trim off everything before // and everything after the first
        # occurrence of / to isolate the host name.
        domain = endpoint[endpoint.find("//") + 2:]
        if domain.find("/") != -1:
            domain = domain[0:domain.find("/")]

        # Everything after the host is the resource (relative URL).
        resource = endpoint[endpoint.find(domain) + len(domain):]

        # Choose plain HTTP or TLS based on the scheme (the original matched
        # "https" anywhere in the URL, which misfires on query strings).
        if endpoint.startswith("https"):
            conn = http.client.HTTPSConnection(domain)
        else:
            conn = http.client.HTTPConnection(domain)

        conn.request(http_method, resource)
        response = conn.getresponse()
        return response

    def get_results_for(self, keyword):
        ''' Get the first 10 results for this search query '''
        url = GOOGLE_SEARCH_URL + urllib.parse.quote(keyword)
        # http.client returns bytes; decode before regex parsing.
        html = self.execute_http_request(url).read().decode("utf-8", "replace")
        return GoogleResults(html)


class GoogleResults:
    ''' Class representing a page of Google search results '''

    def __init__(self, html_document):
        ''' Pass in the raw HTML for the results page '''
        self.html_document = html_document

    def get_average_pagerank(self):
        '''Return the trimmed mean PageRank of the results on this page:
        the single highest and lowest ranks are discarded and the rest
        averaged.  Returns 0.0 when there are no results; with fewer than
        three results no trimming is applied.'''
        checker = PageRankChecker()
        results = self.get_results()
        if not results:
            # Guard: the original divided by zero on an empty result set.
            return 0.0

        lowest = 11   # PageRank is 0..10, so 11 is above any real value
        highest = -1
        total = 0
        count = 0
        for title, url in results:
            # TODO: work out why the PR toolbar server is denying us after
            # a certain # of requests; the sleep is a crude throttle.
            pagerank = int(checker.get_pr(url))
            time.sleep(4)
            total += pagerank
            if pagerank < lowest:
                lowest = pagerank
            if pagerank > highest:
                highest = pagerank
            count += 1
            print(url + "(" + str(pagerank) + ")\n")

        if count > 2:
            # Trimmed mean: drop one extreme at each end and average the
            # remainder (the original divided by the untrimmed count,
            # systematically under-reporting the average).
            return float(total - lowest - highest) / float(count - 2)
        return float(total) / float(count)

    def get_number_of_results(self):
        ''' Get the total number of results for this query (as a digit string;
        "0" when the count cannot be found in the page). '''
        results = re.search('([0-9,]*?) result', self.html_document)
        if results is None:
            return "0"
        return results.group(1).replace(",", "")

    def get_results(self):
        ''' Get an array of arrays containing the top 10 results, the title of the page and the URL '''
        # NOTE(review): the original result-anchor regex was lost in transit;
        # this matches the classic '<h3 class="r">...</h3>' result headings —
        # confirm against a live results page.
        results = re.findall('<h3 class="r">.*?</h3>', self.html_document, re.M)

        rlist = []
        for result in results:
            # Strip HTML tags to recover the plain-text title.
            title = re.sub('<[^<]+?>', '', result)
            url_matches = re.search(r"/url\?q=(.*?)\&", result)
            if url_matches is None:
                # Some result types carry no /url?q= redirect; skip them
                # instead of crashing (this was the file's own TODO).
                continue
            url = urllib.parse.unquote(url_matches.group(1))
            rlist.append([title, url])
        return rlist

    def get_raw_html(self):
        ''' Get the raw HTML for this resultset '''
        return self.html_document


class PageRankChecker:
    ''' PageRank checker based on the implementation here: https://github.com/phurix/pagerank/blob/master/pagerank.py '''

    prhost = 'toolbarqueries.google.com'
    prurl = 'http://%s/tbr?client=navclient-auto&ch=%s&features=Rank&q=info:%s'

    def int_str(self, String, Integer, Factor):
        '''Fold *String* into *Integer*, multiplying by *Factor* per
        character and keeping the result within 32 bits.'''
        for i in range(len(String)):
            Integer *= Factor
            Integer &= 0xFFFFFFFF  # emulate 32-bit unsigned overflow
            Integer += ord(String[i])
        return Integer

    def hash_url(self, Str):
        '''Compute the Google-toolbar hash of *Str* via two polynomial
        rolling hashes whose bit-fields are interleaved.'''
        C1 = self.int_str(Str, 0x1505, 0x21)
        C2 = self.int_str(Str, 0, 0x1003F)

        C1 >>= 2
        C1 = ((C1 >> 4) & 0x3FFFFC0) | (C1 & 0x3F)
        C1 = ((C1 >> 4) & 0x3FFC00) | (C1 & 0x3FF)
        C1 = ((C1 >> 4) & 0x3C000) | (C1 & 0x3FFF)

        T1 = (C1 & 0x3C0) << 4
        T1 |= C1 & 0x3C
        T1 = (T1 << 2) | (C2 & 0xF0F)

        T2 = (C1 & 0xFFFFC000) << 4
        T2 |= C1 & 0x3C00
        T2 = (T2 << 0xA) | (C2 & 0xF0F0000)

        return (T1 | T2)

    def check_hash(self, HashInt):
        '''Prefix *HashInt* with a "7" and a Luhn-style check digit, as the
        toolbar "ch" parameter requires.'''
        HashStr = "%u" % (HashInt)
        Flag = 0
        CheckByte = 0

        # Walk the digits right-to-left, doubling every second one.
        i = len(HashStr) - 1
        while i >= 0:
            Byte = int(HashStr[i])
            if 1 == (Flag % 2):
                Byte *= 2
                # Integer division is required here: under Python 3 the
                # original "Byte / 10" produced a float and a wrong digit.
                Byte = Byte // 10 + Byte % 10
            CheckByte += Byte
            Flag += 1
            i -= 1

        CheckByte %= 10
        if 0 != CheckByte:
            CheckByte = 10 - CheckByte
            if 1 == Flag % 2:
                if 1 == CheckByte % 2:
                    CheckByte += 9
                CheckByte >>= 1

        return '7' + str(CheckByte) + HashStr

    def get_url(self, query):
        '''Build the full toolbar query URL for *query*.'''
        # Renamed from "hash", which shadowed the builtin.
        url_hash = self.check_hash(self.hash_url(query))
        url = self.prurl % (self.prhost, url_hash, query)
        return url

    def get_pr(self, url):
        '''Query the toolbar server and return *url*'s PageRank as a string
        (the text after the last ":" in the server response).'''
        endpoint = self.get_url(url)
        seopy = SeoPy()
        # Decode the byte response before string slicing.
        rank = seopy.execute_http_request(endpoint).read().decode("utf-8", "replace")
        print("[" + rank + "]")
        return rank[rank.rfind(":") + 1:].replace("\n", "")