├── .gitignore
├── README.md
├── analyse.py
└── github.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.txt
*.jpg
dump.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### Python keyword usage analysis

An attempt at verifying whether [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law) (or any other power law, for that matter) holds for programming languages as well.
We tried to verify this for Python. The "library" for our "language" is the [Flask project](https://github.com/mitsuhiko/flask).
The entire Flask project has 885 Python files and close to 19,541 keyword occurrences.

### Screenshots

![Bar chart](http://i.imgur.com/Has1m8J.png)

Bar chart

![Log-log graph](http://i.imgur.com/K9mGX58.png)

Log-log graph

It turns out that only the first few keywords follow anything resembling a power-law distribution.

--------------------------------------------------------------------------------
/analyse.py:
--------------------------------------------------------------------------------
from __future__ import print_function, division
import collections
import io
import keyword
import tokenize
from operator import itemgetter

import matplotlib
from matplotlib import pyplot as plt
import numpy as np
import requests

font = {'family': 'sans-serif', 'weight': 'normal', 'size': 12}
matplotlib.rc('font', **font)
fig1 = plt.figure(figsize=(15, 15))
fig2 = plt.figure(figsize=(15, 15))
ax1 = fig1.add_subplot(111)
ax2 = fig2.add_subplot(111)
# Seed the counter with every keyword at zero so keywords that never occur in
# the corpus still show up in the plots.
tokenCounter = collections.Counter({word: 0 for word in keyword.kwlist})
maxLinks = 850

def countKeywords():
    """Tokenize dump.py and count every occurrence of each Python keyword."""
    processedKeywords = 0
    with open('dump.py', 'r') as codeReader:
        tokens = (token for _, token, _, _, _ in tokenize.generate_tokens(codeReader.readline))
        for token in tokens:
            if keyword.iskeyword(token):
                processedKeywords += 1
                tokenCounter[token] += 1
                print("\rProcessed {0} keywords".format(processedKeywords), end='')
    return sorted(tokenCounter.items(), key=itemgetter(1), reverse=True)

def getLotsOfCode():
    """Download every raw file listed in links.txt and append it to dump.py."""
    processedLinks = 0
    with open('links.txt') as links, open('dump.py', 'a') as codeFile:
        for link in links:
            if processedLinks > maxLinks:
                break
            try:
                rawSource = requests.get(link.rstrip()).text
            except requests.RequestException:
                continue  # Skip links that fail to download.
            with io.StringIO(rawSource) as fileReader:
                for line in fileReader:
                    codeFile.write(line)
            processedLinks += 1
            print("\rProcessed {0} links".format(processedLinks), end='')

def getHarmonicSum(N, S):
    """Return the generalized harmonic number H(N, S) = sum of 1/k**S for k = 1..N."""
    ranks = np.arange(1, N + 1, dtype=float)
    return np.sum(1.0 / ranks ** S)

def getIdealFrequency(K, S):
    """Return the ideal Zipf frequencies f(k) = (1/k**S) / H(K, S) for ranks 1..K."""
    harmonicSum = getHarmonicSum(K, S)
    ranks = np.arange(1, K + 1, dtype=float)
    return (1.0 / ranks ** S) / harmonicSum
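# Sanity check (added for illustration; not part of the original script): the
# Zipf frequencies above form a probability distribution over ranks, so they
# should sum to 1 for any number of keywords K and any exponent S.
def checkIdealFrequency(K=31, S=0.2):
    assert abs(getIdealFrequency(K, S).sum() - 1.0) < 1e-9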
def plotBarGraph(tokenCounts):
    tokenCounts = list(zip(*tokenCounts))
    numElements = np.arange(len(tokenCounts[0]))
    thickness = 0.45
    topOffset = max(tokenCounts[1]) * 1.1  # Headroom for the count labels above the bars.
    ax1.set_title('Keywords vs Occurrences')
    ax1.set_xlabel('Keywords')
    ax1.set_ylabel('Occurrences')
    ax1.xaxis.set_label_coords(1.05, 0.015)
    ax1.set_xticks(numElements)
    ax1.set_xticklabels(tokenCounts[0], rotation=55, verticalalignment='top')
    ax1.set_ylim([0, topOffset])
    ax1.set_xlim([-1, len(tokenCounts[0])])
    rects = ax1.bar(numElements, tokenCounts[1], width=thickness, linewidth=1.5,
                    edgecolor='black', color='green', align='center')
    for rect, count in zip(rects, tokenCounts[1]):
        height = rect.get_height()
        ax1.text(rect.get_x() + rect.get_width() / 2., 1.01 * height, '%d' % count,
                 ha='center', va='bottom')

def plotLog(tokenCounts):
    tokenCounts = list(zip(*tokenCounts))
    numKeywords = len(tokenCounts[0])
    ranks = np.arange(1, numKeywords + 1)  # Ranks start at 1; log(0) is undefined.
    idealCurve = getIdealFrequency(numKeywords, 0.2)
    ax2.set_title('Keywords vs Occurrences')
    ax2.set_xlabel('Log(Rank)')
    ax2.set_ylabel('Log(Frequency)')
    ax2.set_xticks(ranks)
    ax2.set_xticklabels(tokenCounts[0], verticalalignment='top')
    ax2.loglog(ranks, idealCurve, basex=10, basey=10)
    # Normalize the observed counts into relative frequencies so they are on
    # the same scale as the ideal Zipf distribution.
    totalCount = sum(tokenCounts[1])
    frequencies = [count / totalCount for count in tokenCounts[1]]
    ax2.loglog(ranks, frequencies, basex=10, basey=10)

def main():
    # getLotsOfCode()  # Uncomment to rebuild dump.py from links.txt first.
    tokenCounts = countKeywords()
    plotBarGraph(tokenCounts)
    plotLog(tokenCounts)
    plt.show()

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/github.py:
--------------------------------------------------------------------------------
from __future__ import print_function
from bs4 import BeautifulSoup
import requests

prefix = "https://github.com"
code_prefix = "https://raw.githubusercontent.com"

def get_pylinks(directory_links):
    """Recursively collect raw-content links to every .py file reachable from
    the given directory links."""
    python_file_links = []
    for directory_link in directory_links:
        link = directory_link.get('href')
        name = directory_link.text.split('.')
        if len(name) == 1:  # No extension: it is generally a directory.
            # Recurse into the directory.
            split_link = link.split('/')
            if split_link[3] == 'tree':
                soup = BeautifulSoup(requests.get(prefix + link, verify=False).text, 'html.parser')
                new_directory_links = soup.find_all(class_='js-directory-link')
                python_file_links += get_pylinks(new_directory_links)
        elif name[-1] == 'py':
            # Rewrite the '/blob/' page URL into its raw-content equivalent.
            python_file_links.append(code_prefix + link.replace('/blob', ''))
    return python_file_links

# URL to fetch the most-starred Python repositories from GitHub search.
URL = "https://github.com/search?l=Python&q=stars%3A%3E1&s=stars&type=Repositories"
soup = BeautifulSoup(requests.get(URL, verify=False).text, 'html.parser')

h3s = soup.find_all(class_='repo-list-name')
repo_links = []
repos = []
for h3 in h3s:
    a = h3.find('a').get('href')
    repo_links.append(a)
    # Keep the repo name for reference ('/owner/repo' -> 'repo').
    repos.append(a.split('/')[2])
    print(repos[-1])  # Just for testing.

for i in range(len(repo_links)):
    repo_links[i] = prefix + repo_links[i]
    print(repo_links[i])

# Fetch the .py file links from each repo and store them in a text file.
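# Note (added commentary, not in the original source): each repository's
# landing page lists its top-level entries as anchors carrying GitHub's
# 'js-directory-link' class (the markup GitHub used at the time this was
# written); get_pylinks() walks those anchors, recurses into sub-directories,
# and rewrites every '/blob/' page URL into its raw.githubusercontent.com
# counterpart so analyse.getLotsOfCode() can later download the plain source.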
python_file_links = []
for repo in repo_links:
    soup = BeautifulSoup(requests.get(repo, verify=False).text, 'html.parser')
    directory_links = soup.find_all(class_='js-directory-link')
    python_file_links += get_pylinks(directory_links)

with open('links.txt', 'w') as py_links:
    for each in python_file_links:
        py_links.write(each + "\n")

print(len(python_file_links))
--------------------------------------------------------------------------------
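/run_pipeline.py (an illustrative sketch added here, not part of the original repo):
--------------------------------------------------------------------------------
"""Minimal driver showing the intended order of the two scripts above.

This wiring is an assumption inferred from the code, not something the
original repo ships: github.py writes links.txt, analyse.getLotsOfCode()
turns those links into dump.py, and analyse.main() counts and plots.
"""
import subprocess

import analyse

subprocess.check_call(['python', 'github.py'])  # Step 1: produce links.txt.
analyse.getLotsOfCode()                         # Step 2: produce dump.py.
analyse.main()                                  # Step 3: count keywords and draw both plots.
--------------------------------------------------------------------------------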