├── .gitignore
├── README.md
├── analyse.py
└── github.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.txt
*.jpg
dump.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### Python keyword usage analysis

An attempt at verifying whether [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law) (or any other power law, for that matter) holds for programming languages as well.
We tried to verify this for Python. The "library" for our "language" is the [Flask project](https://github.com/mitsuhiko/flask).
The entire Flask project has 885 Python files and close to 19,541 keyword occurrences.

### Screenshots

![Bar chart](http://i.imgur.com/Has1m8J.png)

Bar chart

![Log-log graph](http://i.imgur.com/K9mGX58.png)

Log-log graph

It turns out that only the first few keywords follow anything resembling a power-law distribution.

--------------------------------------------------------------------------------
/analyse.py:
--------------------------------------------------------------------------------
from __future__ import print_function, division
import collections
import io
import keyword
import tokenize
from operator import itemgetter

import matplotlib
from matplotlib import pyplot as plt
import numpy as np
import requests

font = {'family': 'sans-serif', 'weight': 'normal', 'size': 12}
matplotlib.rc('font', **font)
fig1 = plt.figure(figsize=(15, 15))
fig2 = plt.figure(figsize=(15, 15))
ax1 = fig1.add_subplot(111)
ax2 = fig2.add_subplot(111)
# Seed the counter with every keyword at zero so keywords that never occur in
# the corpus still show up in the plots.
tokenCounter = collections.Counter({word: 0 for word in keyword.kwlist})
maxLinks = 850

def countKeywords():
    """Tokenize dump.py and count every occurrence of each Python keyword."""
    processedKeywords = 0
    with open('dump.py', 'r') as codeReader:
        tokens = (token for _, token, _, _, _ in tokenize.generate_tokens(codeReader.readline))
        for token in tokens:
            if keyword.iskeyword(token):
                processedKeywords += 1
                tokenCounter[token] += 1
                print("\rProcessed {0} keywords".format(processedKeywords), end='')
    return sorted(tokenCounter.items(), key=itemgetter(1), reverse=True)

def getLotsOfCode():
    """Download every raw file listed in links.txt and append it to dump.py."""
    processedLinks = 0
    with open('links.txt') as links, open('dump.py', 'a') as codeFile:
        for link in links:
            if processedLinks > maxLinks:
                break
            try:
                rawSource = requests.get(link.rstrip()).text
            except requests.RequestException:
                continue  # Skip links that fail to download.
            with io.StringIO(rawSource) as fileReader:
                for line in fileReader:
                    codeFile.write(line)
            processedLinks += 1
            print("\rProcessed {0} links".format(processedLinks), end='')

def getHarmonicSum(N, S):
    """Return the generalized harmonic number H(N, S) = sum of 1/k**S for k = 1..N."""
    ranks = np.arange(1, N + 1, dtype=float)
    return np.sum(1.0 / ranks ** S)

def getIdealFrequency(K, S):
    """Return the ideal Zipf frequencies f(k) = (1/k**S) / H(K, S) for ranks 1..K."""
    harmonicSum = getHarmonicSum(K, S)
    ranks = np.arange(1, K + 1, dtype=float)
    return (1.0 / ranks ** S) / harmonicSum
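# Sanity check (added for illustration; not part of the original script): the
# Zipf frequencies above form a probability distribution over ranks, so they
# should sum to 1 for any number of keywords K and any exponent S.
def checkIdealFrequency(K=31, S=0.2):
    assert abs(getIdealFrequency(K, S).sum() - 1.0) < 1e-9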
def plotBarGraph(tokenCounts):
    tokenCounts = list(zip(*tokenCounts))
    numElements = np.arange(len(tokenCounts[0]))
    thickness = 0.45
    topOffset = max(tokenCounts[1]) * 1.1  # Headroom for the count labels above the bars.
    ax1.set_title('Keywords vs Occurrences')
    ax1.set_xlabel('Keywords')
    ax1.set_ylabel('Occurrences')
    ax1.xaxis.set_label_coords(1.05, 0.015)
    ax1.set_xticks(numElements)
    ax1.set_xticklabels(tokenCounts[0], rotation=55, verticalalignment='top')
    ax1.set_ylim([0, topOffset])
    ax1.set_xlim([-1, len(tokenCounts[0])])
    rects = ax1.bar(numElements, tokenCounts[1], width=thickness, linewidth=1.5,
                    edgecolor='black', color='green', align='center')
    for rect, count in zip(rects, tokenCounts[1]):
        height = rect.get_height()
        ax1.text(rect.get_x() + rect.get_width() / 2., 1.01 * height, '%d' % count,
                 ha='center', va='bottom')

def plotLog(tokenCounts):
    tokenCounts = list(zip(*tokenCounts))
    numKeywords = len(tokenCounts[0])
    ranks = np.arange(1, numKeywords + 1)  # Ranks start at 1; log(0) is undefined.
    idealCurve = getIdealFrequency(numKeywords, 0.2)
    ax2.set_title('Keywords vs Occurrences')
    ax2.set_xlabel('Log(Rank)')
    ax2.set_ylabel('Log(Frequency)')
    ax2.set_xticks(ranks)
    ax2.set_xticklabels(tokenCounts[0], verticalalignment='top')
    ax2.loglog(ranks, idealCurve, basex=10, basey=10)
    # Normalize the observed counts into relative frequencies so they are on
    # the same scale as the ideal Zipf distribution.
    totalCount = sum(tokenCounts[1])
    frequencies = [count / totalCount for count in tokenCounts[1]]
    ax2.loglog(ranks, frequencies, basex=10, basey=10)

def main():
    # getLotsOfCode()  # Uncomment to rebuild dump.py from links.txt first.
    tokenCounts = countKeywords()
    plotBarGraph(tokenCounts)
    plotLog(tokenCounts)
    plt.show()

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/github.py:
--------------------------------------------------------------------------------
from __future__ import print_function
from bs4 import BeautifulSoup
import requests

prefix = "https://github.com"
code_prefix = "https://raw.githubusercontent.com"

def get_pylinks(directory_links):
    """Recursively collect raw-content links to every .py file reachable from
    the given directory links."""
    python_file_links = []
    for directory_link in directory_links:
        link = directory_link.get('href')
        name = directory_link.text.split('.')
        if len(name) == 1:  # No extension: it is generally a directory.
            # Recurse into the directory.
            split_link = link.split('/')
            if split_link[3] == 'tree':
                soup = BeautifulSoup(requests.get(prefix + link, verify=False).text, 'html.parser')
                new_directory_links = soup.find_all(class_='js-directory-link')
                python_file_links += get_pylinks(new_directory_links)
        elif name[-1] == 'py':
            # Rewrite the '/blob/' page URL into its raw-content equivalent.
            python_file_links.append(code_prefix + link.replace('/blob', ''))
    return python_file_links

# URL to fetch the most-starred Python repositories from GitHub search.
URL = "https://github.com/search?l=Python&q=stars%3A%3E1&s=stars&type=Repositories"
soup = BeautifulSoup(requests.get(URL, verify=False).text, 'html.parser')

h3s = soup.find_all(class_='repo-list-name')
repo_links = []
repos = []
for h3 in h3s:
    a = h3.find('a').get('href')
    repo_links.append(a)
    # Keep the repo name for reference ('/owner/repo' -> 'repo').
    repos.append(a.split('/')[2])
    print(repos[-1])  # Just for testing.

for i in range(len(repo_links)):
    repo_links[i] = prefix + repo_links[i]
    print(repo_links[i])

# Fetch the .py file links from each repo and store them in a text file.
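# Note (added commentary, not in the original source): each repository's
# landing page lists its top-level entries as anchors carrying GitHub's
# 'js-directory-link' class (the markup GitHub used at the time this was
# written); get_pylinks() walks those anchors, recurses into sub-directories,
# and rewrites every '/blob/' page URL into its raw.githubusercontent.com
# counterpart so analyse.getLotsOfCode() can later download the plain source.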
python_file_links = []
for repo in repo_links:
    soup = BeautifulSoup(requests.get(repo, verify=False).text, 'html.parser')
    directory_links = soup.find_all(class_='js-directory-link')
    python_file_links += get_pylinks(directory_links)

with open('links.txt', 'w') as py_links:
    for each in python_file_links:
        py_links.write(each + "\n")

print(len(python_file_links))
--------------------------------------------------------------------------------
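/run_pipeline.py (an illustrative sketch added here, not part of the original repo):
--------------------------------------------------------------------------------
"""Minimal driver showing the intended order of the two scripts above.

This wiring is an assumption inferred from the code, not something the
original repo ships: github.py writes links.txt, analyse.getLotsOfCode()
turns those links into dump.py, and analyse.main() counts and plots.
"""
import subprocess

import analyse

subprocess.check_call(['python', 'github.py'])  # Step 1: produce links.txt.
analyse.getLotsOfCode()                         # Step 2: produce dump.py.
analyse.main()                                  # Step 3: count keywords and draw both plots.
--------------------------------------------------------------------------------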