├── Course 1 Getting Started With Python ├── Week 3 │ └── Assignment1.py ├── Week 4 │ ├── Assignment2_2.py │ ├── Assignment2_3.py │ ├── Output2_2.JPG │ └── Output2_3.JPG ├── Week 5 │ ├── Assignment3_1.py │ ├── Assignment3_3.py │ ├── Output3_1.JPG │ └── Output3_3.JPG ├── Week 6 │ ├── Assignment4_6.py │ └── Output4_6.JPG └── Week 7 │ ├── Assignment5_2.py │ └── Output5_2.JPG ├── Course 2 Python Data Structures ├── Week 1 │ ├── Assignment6_5.py │ └── Output6_5.JPG ├── Week 3 │ ├── Assignment7_1.py │ ├── Assignment7_2.py │ ├── Output7_1.JPG │ ├── Output7_2.JPG │ ├── mbox-short.txt │ └── words.txt ├── Week 4 │ ├── Assignment8_4.py │ ├── Assignment8_5.py │ ├── Output8_4.JPG │ ├── Output8_5.JPG │ ├── mbox-short.txt │ └── romeo.txt ├── Week 5 │ ├── Assignment9_4.py │ ├── Output9_4.JPG │ └── mbox-short.txt └── Week 6 │ ├── Assignment10_2.py │ ├── Output10_2.JPG │ └── mbox-short.txt ├── Course 3 Using Python to Access WebData ├── Week 2 │ ├── Assignment.py │ ├── Output_ActualData.JPG │ ├── Output_SampleData.JPG │ ├── regex_sum_22075.txt │ └── regex_sum_42.txt ├── Week 3 │ ├── Assignment.py │ ├── Output.JPG │ └── intro-short.txt ├── Week 4 │ ├── Assignment1.py │ ├── Assignment2.py │ ├── Output_Assignment1.JPG │ ├── Output_Assignment2_Actual_data.JPG │ ├── Output_Assignment2_Sample_data.JPG │ ├── comments_22077.html │ └── comments_42.html ├── Week 5 │ ├── AssignmentXML.py │ └── OutputXML.JPG └── Week 6 │ ├── AssignmentJSON1.py │ ├── AssignmentJSON2.py │ ├── OutputJSON1.JPG │ └── OutputJSON2.JPG ├── Course 4 Using Databases with Python ├── Week 2 │ ├── AssignmentDB1.db │ ├── AssignmentDB1Code_SQL.txt │ ├── AssignmentDB2.py │ ├── Output_Assignment_DB1.JPG │ ├── Output_Assignment_DB2_SQLITE_for_MBOX.JPG │ ├── Output_Assignment_DB2_SQLITE_for_mboxshort.JPG │ ├── Output_Assignment_DB2_mboxshort.JPG │ ├── Output_DB2_Mbox.JPG │ ├── dbcount.sqlite │ ├── mbox-short.txt │ └── mbox.txt ├── Week 3 │ ├── AssignmentMT.py │ ├── AssignmentMT.txt │ ├── Library.xml │ ├── Output_MT.JPG │ ├── Output_MT_SQLITE.JPG │ └── assignmentMT.sqlite ├── Week 4 │ ├── AssignmentRoster.py │ ├── Output_SQLITE.JPG │ ├── roster_data.json │ └── rosterdb.sqlite └── Week 5 │ ├── README.txt │ ├── Visualization of Where Data.JPG │ ├── geodata.sqlite │ ├── geodump.py │ ├── geoload.py │ ├── where.data │ ├── where.html │ └── where.js ├── Course 5 Capstone Retrieving, Processing,Visualizing Data ├── Week 2 │ ├── LICENSE │ ├── Output1.jpg │ ├── Output2.jpg │ ├── Output3.jpg │ ├── Output4.jpg │ ├── README.txt │ ├── bs4 │ │ ├── __init__.py │ │ ├── __init__.py.bak │ │ ├── builder │ │ │ ├── __init__.py │ │ │ ├── __init__.py.bak │ │ │ ├── _html5lib.py │ │ │ ├── _html5lib.py.bak │ │ │ ├── _htmlparser.py │ │ │ ├── _htmlparser.py.bak │ │ │ ├── _lxml.py │ │ │ └── _lxml.py.bak │ │ ├── dammit.py │ │ ├── dammit.py.bak │ │ ├── diagnose.py │ │ ├── diagnose.py.bak │ │ ├── element.py │ │ ├── element.py.bak │ │ ├── testing.py │ │ ├── testing.py.bak │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_builder_registry.py │ │ │ ├── test_docs.py │ │ │ ├── test_html5lib.py │ │ │ ├── test_html5lib.py.bak │ │ │ ├── test_htmlparser.py │ │ │ ├── test_lxml.py │ │ │ ├── test_lxml.py.bak │ │ │ ├── test_soup.py │ │ │ ├── test_soup.py.bak │ │ │ ├── test_tree.py │ │ │ └── test_tree.py.bak │ ├── d3.v2.js │ ├── force.css │ ├── force.html │ ├── force.js │ ├── spdump.py │ ├── spider.js │ ├── spider.py │ ├── spjson.py │ ├── sprank.py │ └── spreset.py ├── Week 4 │ ├── 1.jpg │ ├── 2.jpg │ ├── 3.jpg │ ├── 4.jpg │ ├── README.txt │ ├── d3.layout.cloud.js │ ├── d3.v2.js │ ├── gbasic.py │ ├── 
gline.htm │ ├── gline.py │ ├── gmane.py │ ├── gmodel.py │ ├── gword.htm │ ├── gword.py │ ├── gyear.py │ └── mapping.sqlite └── Week 6 │ ├── Output1.jpg │ ├── Output2.jpg │ ├── Output3.jpg │ ├── README.txt │ ├── d3.layout.cloud.js │ ├── d3.v2.js │ ├── gbasic.py │ ├── gline.htm │ ├── gline.py │ ├── gmane.py │ ├── gmodel.py │ ├── gword.htm │ ├── gword.py │ ├── gyear.py │ └── mapping.sqlite ├── LICENSE └── README.md /Course 1 Getting Started With Python/Week 3/Assignment1.py: -------------------------------------------------------------------------------- 1 | print('hello world') 2 | -------------------------------------------------------------------------------- /Course 1 Getting Started With Python/Week 4/Assignment2_2.py: -------------------------------------------------------------------------------- 1 | name = input('Enter your name') 2 | print('Hello',name) 3 | -------------------------------------------------------------------------------- /Course 1 Getting Started With Python/Week 4/Assignment2_3.py: -------------------------------------------------------------------------------- 1 | hrs = input('Enter Hours:') 2 | rph=input("Enter the rate per hour:") 3 | Pay=float(hrs)*float(rph) 4 | print("Pay:",Pay) 5 | -------------------------------------------------------------------------------- /Course 1 Getting Started With Python/Week 4/Output2_2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 1 Getting Started With Python/Week 4/Output2_2.JPG -------------------------------------------------------------------------------- /Course 1 Getting Started With Python/Week 4/Output2_3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 1 Getting Started With Python/Week 4/Output2_3.JPG -------------------------------------------------------------------------------- /Course 1 Getting Started With Python/Week 5/Assignment3_1.py: -------------------------------------------------------------------------------- 1 | '''3.1 Write a program to prompt the user for hours and rate per hour using input to compute gross pay. 2 | Pay the hourly rate for the hours up to 40 and 1.5 times the hourly rate for all hours worked above 3 | 40 hours. Use 45 hours and a rate of 10.50 per hour to test the program (the pay should be 498.75). 4 | You should use input to read a string and float() to convert the string to a number. 5 | Do not worry about error checking the user input - assume the user types numbers properly.''' 6 | 7 | hrs = input("Enter Hours:") 8 | h = float(hrs) 9 | b=input('Enter the hourly rate:') 10 | pr=float(b) 11 | 12 | if h<=40.0 : 13 | pay=h*pr 14 | print(pay) 15 | elif h>40.0 : 16 | pay=(40.0*pr)+(1.5*pr*(h-40.0)) 17 | print(pay) 18 | -------------------------------------------------------------------------------- /Course 1 Getting Started With Python/Week 5/Assignment3_3.py: -------------------------------------------------------------------------------- 1 | '''3.3 Write a program to prompt for a score between 0.0 and 1.0. 2 | If the score is out of range, print an error. 
3 | If the score is between 0.0 and 1.0, print a grade using the following table: 4 | Score Grade 5 | >= 0.9 A 6 | >= 0.8 B 7 | >= 0.7 C 8 | >= 0.6 D 9 | < 0.6 F 10 | If the user enters a value out of range, print a suitable error message and exit. 11 | For the test, enter a score of 0.85.''' 12 | 13 | score = input("Enter Score: ") 14 | s=float(score) 15 | if s>=0.0 : 16 | if s<=1.0 : 17 | if s>=0.9 : 18 | print('A') 19 | elif s>=0.8 : 20 | print('B') 21 | elif s>=0.7 : 22 | print("C") 23 | elif s>=0.6 : 24 | print("D") 25 | elif s<0.6 : 26 | print('F') 27 | else : 28 | print("Please enter valid score") 29 | 30 | else : 31 | print('Score is out of range :/') 32 | -------------------------------------------------------------------------------- /Course 1 Getting Started With Python/Week 5/Output3_1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 1 Getting Started With Python/Week 5/Output3_1.JPG -------------------------------------------------------------------------------- /Course 1 Getting Started With Python/Week 5/Output3_3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 1 Getting Started With Python/Week 5/Output3_3.JPG -------------------------------------------------------------------------------- /Course 1 Getting Started With Python/Week 6/Assignment4_6.py: -------------------------------------------------------------------------------- 1 | '''4.6 Write a program to prompt the user for hours and rate per hour using input to compute gross pay. 2 | Award time-and-a-half for the hourly rate for all hours worked above 40 hours. 3 | Put the logic to do the computation of time-and-a-half in a function called computepay() 4 | and use the function to do the computation. The function should return a value. 5 | Use 45 hours and a rate of 10.50 per hour to test the program (the pay should be 498.75). 6 | You should use input to read a string and float() to convert the string to a number. 7 | Do not worry about error checking the user input unless you want to - you can assume the user 8 | types numbers properly. Do not name your variable sum or use the sum() function.''' 9 | a=input('Enter your Work Hours:') 10 | b=input('Enter your pay rate per hour:') 11 | wh=float(a) 12 | rph=float(b) 13 | 14 | 15 | def computepay(h,r): 16 | if h>40.0: 17 | mp=1.5*(h-40.0)*r 18 | py=(40*r)+mp 19 | return py 20 | else: 21 | py=h*r 22 | return py 23 | p = computepay(wh,rph) 24 | print(p) 25 | -------------------------------------------------------------------------------- /Course 1 Getting Started With Python/Week 6/Output4_6.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 1 Getting Started With Python/Week 6/Output4_6.JPG -------------------------------------------------------------------------------- /Course 1 Getting Started With Python/Week 7/Assignment5_2.py: -------------------------------------------------------------------------------- 1 | '''5.2 Write a program that repeatedly prompts a user for integer numbers until the user enters 'done'. 2 | Once 'done' is entered, print out the largest and smallest of the numbers. 
3 | If the user enters anything other than a valid number catch it with a try/except and 4 | put out an appropriate message and ignore the number. 5 | Enter 7, 2, bob, 10, and 4 and match the output below.''' 6 | largest = None 7 | smallest = None 8 | while True: 9 | num = input("Enter a number: ") 10 | if num=='done': 11 | break 12 | try: 13 | n=float(num) 14 | if largest==None: 15 | largest=n 16 | elif n>largest: 17 | largest=n 18 | 19 | if smallest==None: 20 | smallest=n 21 | elif nbcount: 31 | bcount=no 32 | bemail=key 33 | print(bemail,bcount) 34 | -------------------------------------------------------------------------------- /Course 2 Python Data Structures/Week 5/Output9_4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 2 Python Data Structures/Week 5/Output9_4.JPG -------------------------------------------------------------------------------- /Course 2 Python Data Structures/Week 6/Assignment10_2.py: -------------------------------------------------------------------------------- 1 | '''10.2 Write a program to read through the mbox-short.txt and figure out the distribution 2 | by hour of the day for each of the messages. You can pull the hour out from the 'From ' 3 | line by finding the time and then splitting the string a second time using a colon. 4 | From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008 5 | Once you have accumulated the counts for each hour, 6 | print out the counts, sorted by hour as shown below.''' 7 | 8 | fname = input("Enter file name: ") 9 | try: 10 | #fh = open(fname) 11 | if len(fname) <= 1 : 12 | fname = "mbox-short.txt" 13 | fh = open(fname) 14 | except: 15 | print('invalid entry!') 16 | quit() 17 | count=dict() 18 | for lin in fh: 19 | lin=lin.rstrip() 20 | if not lin.startswith('From '): 21 | continue 22 | words=lin.split() 23 | time=words[5] 24 | hr=time.split(':') 25 | hour=hr[0] 26 | count[hour]=count.get(hour,0)+1 27 | # to print all the emails and the numbers print(count) 28 | lst=list() 29 | for key,val in count.items(): 30 | tup=(key,val) 31 | lst.append(tup) 32 | lst=sorted(lst) 33 | for key,val in lst: 34 | print(key,val) 35 | -------------------------------------------------------------------------------- /Course 2 Python Data Structures/Week 6/Output10_2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 2 Python Data Structures/Week 6/Output10_2.JPG -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 2/Assignment.py: -------------------------------------------------------------------------------- 1 | '''Finding Numbers in a Haystack 2 | In this assignment you will read through and parse a file with text and numbers. 3 | You will extract all the numbers in the file and compute the sum of the numbers. 4 | 5 | Data Files 6 | We provide two files for this assignment. 7 | One is a sample file where we give you the sum for your testing and the other is the 8 | actual data you need to process for the assignment. 
9 | 10 | Sample data: http://py4e-data.dr-chuck.net/regex_sum_42.txt (There are 90 values with a sum=445833) 11 | Actual data: http://py4e-data.dr-chuck.net/regex_sum_22075.txt 12 | (There are 79 values and the sum ends with 371) 13 | These links open in a new window. Make sure to save the file into the same folder as you will be writing your Python program. Note: Each student will have a distinct data file for the assignment - so only use your own data file for analysis. 14 | Data Format 15 | The file contains much of the text from the introduction of the textbook 16 | except that random numbers are inserted throughout the text. 17 | Here is a sample of the output you might see: 18 | 19 | Why should you learn to write programs? 7746 20 | 12 1929 8827 21 | Writing programs (or programming) is a very creative 22 | 7 and rewarding activity. You can write programs for 23 | many reasons, ranging from making your living to solving 24 | 8837 a difficult data analysis problem to having fun to helping 128 25 | someone else solve a problem. This book assumes that 26 | everyone needs to know how to program ... 27 | The sum for the sample text above is 27486. The numbers can appear anywhere in the line. 28 | There can be any number of numbers in each line (including none). 29 | Handling The Data 30 | The basic outline of this problem is to read the file, look for integers using the re.findall(), 31 | looking for a regular expression of '[0-9]+' and then converting the extracted strings to 32 | integers and summing up the integers''' 33 | 34 | import re 35 | a=input('Enter the file name:') 36 | try: 37 | if a=='s': 38 | fh=open('regex_sum_42.txt') 39 | elif a=='act' or len(a)<1: 40 | fh=open('regex_sum_22075.txt') 41 | except: 42 | Print('please enter a valid name') 43 | quit() 44 | numlist=list() 45 | sum=0 46 | count=0 47 | for line in fh: 48 | line=line.rstrip() 49 | no=re.findall('[0-9]+',line) 50 | numlist=numlist+no 51 | for i in range(len(no)): 52 | lv=float(no[i]) 53 | sum=sum+lv 54 | for num in range(len(numlist)): 55 | count=count+1 56 | print('the sum of',count, 'numbers in text document is',sum) 57 | print('the numbers are',numlist) 58 | -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 2/Output_ActualData.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 3 Using Python to Access WebData/Week 2/Output_ActualData.JPG -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 2/Output_SampleData.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 3 Using Python to Access WebData/Week 2/Output_SampleData.JPG -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 3/Assignment.py: -------------------------------------------------------------------------------- 1 | '''Exploring the HyperText Transport Protocol 2 | 3 | You are to retrieve the following document using the HTTP protocol in a way 4 | that you can examine the HTTP Response headers. 
5 | 6 | http://data.pr4e.org/intro-short.txt 7 | There are three ways that you might retrieve this web page and look at the response headers: 8 | 9 | Preferred: Modify the socket1.py program to retrieve the above URL and print out the headers and data. 10 | Make sure to change the code to retrieve the above URL - the values are different for each URL. 11 | Open the URL in a web browser with a developer console or FireBug and manually examine the headers 12 | that are returned. 13 | Use the telnet program as shown in lecture to retrieve the headers and content.''' 14 | 15 | import socket 16 | import re 17 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 18 | mysock.connect(('data.pr4e.org', 80)) 19 | cmd = 'GET http://data.pr4e.org/intro-short.txt HTTP/1.0\r\n\r\n'.encode() 20 | mysock.send(cmd) 21 | count=0 22 | dataR='' 23 | while True: 24 | count=count+1 25 | #print('inside loop',count) 26 | data = mysock.recv(512) 27 | if (len(data) < 1): 28 | break 29 | lst=data.decode() 30 | dataR=dataR+lst 31 | #print (lst) 32 | print('outside loop') 33 | #print(dataR) 34 | lastMod=re.findall('Last-Modified: (.+)\r',dataR) 35 | ET=re.findall('ETag: (.+)\r',dataR) 36 | CL=re.findall('Content-Length: (.+)\r',dataR) 37 | CC=re.findall('Cache-Control: (.+)\r',dataR) 38 | CT=re.findall('Content-Type: (.+)\r',dataR) 39 | print('last mod:',lastMod[0],'ETag:',ET[0],'Content-Length:',CL[0],'Cache-Control:',CC[0],'Content-Type:',CT[0]) 40 | mysock.close() 41 | -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 3/Output.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 3 Using Python to Access WebData/Week 3/Output.JPG -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 3/intro-short.txt: -------------------------------------------------------------------------------- 1 | Why should you learn to write programs? 2 | 3 | Writing programs (or programming) is a very creative 4 | and rewarding activity. You can write programs for 5 | many reasons, ranging from making your living to solving 6 | a difficult data analysis problem to having fun to helping 7 | someone else solve a problem. This book assumes that 8 | everyone needs to know how to program, and that once 9 | you know how to program you will figure out what you want 10 | to do with your newfound skills. 11 | -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 4/Assignment1.py: -------------------------------------------------------------------------------- 1 | '''Scraping Numbers from HTML using BeautifulSoup In this assignment you will write a 2 | Python program similar to http://www.py4e.com/code3/urllink2.py. 3 | The program will use urllib to read the HTML from the data files below, and parse the data, 4 | extracting numbers and compute the sum of the numbers in the file. 5 | 6 | We provide two files for this assignment. One is a sample file where we give you the sum 7 | for your testing and the other is the actual data you need to process for the assignment. 
8 | 9 | Sample data: http://py4e-data.dr-chuck.net/comments_42.html (Sum=2553) 10 | Actual data: http://py4e-data.dr-chuck.net/comments_22077.html (Sum ends with 91) 11 | You do not need to save these files to your folder since your program will read the data directly 12 | from the URL. Note: Each student will have a distinct data url for the assignment - 13 | so only use your own data url for analysis.''' 14 | 15 | import urllib.request, urllib.parse, urllib.error 16 | from bs4 import BeautifulSoup 17 | import ssl 18 | import re 19 | 20 | # Ignore SSL certificate errors 21 | ctx = ssl.create_default_context() 22 | ctx.check_hostname = False 23 | ctx.verify_mode = ssl.CERT_NONE 24 | print('For sample data, enter http://py4e-data.dr-chuck.net/comments_42.html \n for actual data, enter http://py4e-data.dr-chuck.net/comments_22077.html') 25 | url = input('Enter the link- ') 26 | 27 | html = urllib.request.urlopen(url, context=ctx).read() 28 | soup = BeautifulSoup(html, 'html.parser') 29 | 30 | # Retrieve all of the anchor tags 31 | tags = soup('span') 32 | total=0 33 | for tag in tags: 34 | locN=tag.contents[0] 35 | conv=float(locN) 36 | total=total+conv 37 | print(total) 38 | -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 4/Assignment2.py: -------------------------------------------------------------------------------- 1 | '''Following Links in Python 2 | 3 | In this assignment you will write a Python program that expands 4 | on http://www.py4e.com/code3/urllinks.py. 5 | The program will use urllib to read the HTML from the data files below, extract the href= vaues 6 | from the anchor tags, scan for a tag that is in a particular position relative to the first name 7 | in the list, follow that link and repeat the process a number of times and report the 8 | last name you find. 9 | 10 | We provide two files for this assignment. 11 | One is a sample file where we give you the name for your testing and the other is the 12 | actual data you need to process for the assignment 13 | 14 | Sample problem: Start at http://py4e-data.dr-chuck.net/known_by_Fikret.html 15 | Find the link at position 3 (the first name is 1). Follow that link. 16 | Repeat this process 4 times. The answer is the last name that you retrieve. 17 | Sequence of names: Fikret Montgomery Mhairade Butchi Anayah 18 | Last name in sequence: Anayah 19 | Actual problem: Start at: http://py4e-data.dr-chuck.net/known_by_Clodagh.html 20 | Find the link at position 18 (the first name is 1). Follow that link. Repeat this process 7 times. 21 | The answer is the last name that you retrieve. 
22 | Hint: The first character of the name of the last page that you will load is: A''' 23 | 24 | #in Py4e this progname is loop.py 25 | import urllib.request, urllib.parse, urllib.error 26 | from bs4 import BeautifulSoup 27 | import ssl 28 | position=input('Enter the position you want to go\t') 29 | pos=int(position) 30 | loopNo=input('Enter the number of iterations\t') 31 | lN=int(loopNo) 32 | url = input('Enter the html link- \n') 33 | for itv in range(lN): 34 | # Ignore SSL certificate errors 35 | ctx = ssl.create_default_context() 36 | ctx.check_hostname = False 37 | ctx.verify_mode = ssl.CERT_NONE 38 | html = urllib.request.urlopen(url, context=ctx).read() 39 | soup = BeautifulSoup(html, 'html.parser') 40 | tags = soup('a') 41 | nameList=list() 42 | loc='' 43 | count=0 44 | for tag in tags: 45 | loc=tag.get('href',None) 46 | nameList.append(loc) 47 | print('Retrieving the URL:',url) 48 | print('Retrieved URL:',nameList[pos-1]) 49 | url=nameList[pos-1] 50 | print('this is the end of iteration:',itv+1) 51 | -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 4/Output_Assignment1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 3 Using Python to Access WebData/Week 4/Output_Assignment1.JPG -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 4/Output_Assignment2_Actual_data.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 3 Using Python to Access WebData/Week 4/Output_Assignment2_Actual_data.JPG -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 4/Output_Assignment2_Sample_data.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 3 Using Python to Access WebData/Week 4/Output_Assignment2_Sample_data.JPG -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 4/comments_22077.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Welcome to the comments assignment from www.py4e.com 4 | 5 | 6 |

This file contains the actual data for your assignment - good luck!

Name         Comments
Pasquale     97
Shelby       96
Kaleb        95
Youcef       89
Yusef        87
Jemma        86
Rana         82
Rajan        79
Abbie        78
Tessa        78
Fadile       77
Sephiroth    75
Narvic       73
Aadam        72
Eabha        72
Kimmie       71
Konar        70
Miranne      64
Roshan       63
Scarlet      61
Evan         59
Diana        58
Kaydn        57
Rhiannon     56
Shakira      55
Jiao         52
Mairi        52
Leonardo     52
Blaine       52
Lillian      49
Akira        48
Ami          46
Kerrigan     43
Millie       41
Riley        40
Maryse       38
Evelyne      35
Mischa       31
Ghyll        31
Ragen        26
Violet       22
Kinsey       20
Bryony       17
Aref         12
Keir         11
Valo         9
Daegan       6
Edwyn        6
Dhyia        1
Mercy        1
63 | 64 | 65 | -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 4/comments_42.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Welcome to the comments assignment from www.py4e.com 4 | 5 | 6 |

This file contains the sample data for testing

Name         Comments
Romina       97
Laurie       97
Bayli        90
Siyona       90
Taisha       88
Alanda       87
Ameelia      87
Prasheeta    80
Asif         79
Risa         79
Zi           78
Danyil       76
Ediomi       76
Barry        72
Lance        72
Hattie       66
Mathu        66
Bowie        65
Samara       65
Uchenna      64
Shauni       61
Georgia      61
Rivan        59
Kenan        58
Hassan       57
Isma         57
Samanthalee  54
Alexa        51
Caine        49
Grady        47
Anne         40
Rihan        38
Alexei       37
Indie        36
Rhuairidh    36
Annoushka    32
Kenzi        25
Shahd        24
Irvine       22
Carys        21
Skye         19
Atiya        18
Rohan        18
Nuala        14
Maram        12
Carlo        12
Japleen      9
Breeanna     7
Zaaine       3
Inika        2
63 | 64 | 65 | -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 5/AssignmentXML.py: -------------------------------------------------------------------------------- 1 | '''Extracting Data from XML 2 | 3 | In this assignment you will write a Python program somewhat similar to 4 | http://www.py4e.com/code3/geoxml.py. The program will prompt for a URL, read the XML data from 5 | that URL using urllib and then parse and extract the comment counts from the XML data, 6 | compute the sum of the numbers in the file. 7 | 8 | We provide two files for this assignment. 9 | One is a sample file where we give you the sum for your testing and the other is the 10 | actual data you need to process for the assignment. 11 | 12 | Sample data: http://py4e-data.dr-chuck.net/comments_42.xml (Sum=2553) 13 | Actual data: http://py4e-data.dr-chuck.net/comments_22079.xml (Sum ends with 13) 14 | You do not need to save these files to your folder since your program will 15 | read the data directly from the URL. 16 | Note: Each student will have a distinct data url for the assignment - 17 | so only use your own data url for analysis.''' 18 | 19 | #sample Data :http://py4e-data.dr-chuck.net/comments_42.xml 20 | import urllib.request, urllib.parse, urllib.error 21 | import xml.etree.ElementTree as ET 22 | url = 'http://py4e-data.dr-chuck.net/comments_22079.xml' 23 | print('Retrieving', url) 24 | uh = urllib.request.urlopen(url) 25 | data = uh.read() 26 | tree = ET.fromstring(data) 27 | #results = tree.findall('comments/comment') 28 | x=list();a=list() 29 | y=list(); 30 | for item in tree.findall('comments/comment'): 31 | x.append(int(item.find('count').text)) 32 | a.append(item.find('name').text) 33 | #print('Count value is',item.find('count').text) 34 | print('the total sum by method one is', sum(x)) 35 | 36 | print('============') 37 | print('method 2 \n') 38 | for item in tree.findall('.//count'): 39 | y.append(int(item.text)) 40 | #print('Count value is',item.text) 41 | print('the total sum by method two is \n', sum(y)) 42 | #print(sum(x),sum(y)) 43 | print('--------------------------') 44 | print('The list of names found are', a) 45 | -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 5/OutputXML.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 3 Using Python to Access WebData/Week 5/OutputXML.JPG -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 6/AssignmentJSON1.py: -------------------------------------------------------------------------------- 1 | '''Extracting Data from JSON 2 | 3 | In this assignment you will write a Python program somewhat similar to 4 | http://www.py4e.com/code3/json2.py. The program will prompt for a URL, read the JSON data 5 | from that URL using urllib and then parse and extract the comment counts from the JSON data, 6 | compute the sum of the numbers in the file and enter the sum below: 7 | We provide two files for this assignment. One is a sample file where we give you the sum for 8 | your testing and the other is the actual data you need to process for the assignment. 
9 | 10 | Sample data: http://py4e-data.dr-chuck.net/comments_42.json (Sum=2553) 11 | Actual data: http://py4e-data.dr-chuck.net/comments_22080.json (Sum ends with 11) 12 | You do not need to save these files to your folder since your program will read the data 13 | directly from the URL. Note: Each student will have a distinct data url for the assignment - 14 | so only use your own data url for analysis.''' 15 | 16 | #sample Data :http://py4e-data.dr-chuck.net/comments_42.json 17 | #Data2 : http://py4e-data.dr-chuck.net/comments_22080.json 18 | import urllib.request, urllib.parse, urllib.error 19 | import json 20 | url = 'http://py4e-data.dr-chuck.net/comments_22080.json' 21 | #url='http://py4e-data.dr-chuck.net/comments_42.json' 22 | print('Retrieving', url) 23 | uh = urllib.request.urlopen(url) 24 | data = uh.read().decode() 25 | js=json.loads(data) 26 | djs=json.dumps(js,indent=2) 27 | total=0 28 | allName=list() 29 | ext=js['comments'] 30 | #print(js['comments'][0]) 31 | for item in ext: 32 | number=int(item['count']) 33 | name=item['name'] 34 | #print(name,number) 35 | total=total+number 36 | #print('--------------------------') 37 | print('The sum of numbers in the JSON link you provided is',total) 38 | -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 6/AssignmentJSON2.py: -------------------------------------------------------------------------------- 1 | '''Calling a JSON API 2 | 3 | In this assignment you will write a Python program somewhat similar 4 | to http://www.py4e.com/code3/geojson.py. The program will prompt for a location, 5 | contact a web service and retrieve JSON for the web service and parse that data, 6 | and retrieve the first place_id from the JSON. A place ID is a textual identifier 7 | that uniquely identifies a place as within Google Maps. 8 | API End Points 9 | 10 | To complete this assignment, you should use this API endpoint that has a static 11 | subset of the Google Data: 12 | 13 | http://py4e-data.dr-chuck.net/geojson? 14 | This API uses the same parameters (sensor and address) as the Google API. 15 | This API also has no rate limit so you can test as often as you like. 16 | If you visit the URL with no parameters, you get a list of all of the address values 17 | which can be used with this API.To call the API, you need to provide address that 18 | you are requesting as the address= parameter that is properly URL encoded using the 19 | urllib.urlencode() fuction as shown in http://www.py4e.com/code3/geojson.py''' 20 | 21 | import urllib.request, urllib.parse, urllib.error 22 | import json 23 | 24 | # Note that Google is increasingly requiring keys 25 | # for this API 26 | serviceurl = 'http://py4e-data.dr-chuck.net/geojson?' 
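# Illustrative note on how the query string is built below: urllib.parse.urlencode
# takes a dict of parameters and returns a properly escaped query string, e.g.
#     urllib.parse.urlencode({'address': 'Monash University'})
# evaluates to 'address=Monash+University'; the loop that follows appends this
# string to serviceurl to form the full request URL.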
27 | 28 | while True: 29 | address = input('Enter location: ') 30 | if len(address) < 1: break 31 | 32 | url = serviceurl + urllib.parse.urlencode( 33 | {'address': address}) 34 | 35 | print('Retrieving', url) 36 | uh = urllib.request.urlopen(url) 37 | data = uh.read().decode() 38 | print('Retrieved', len(data), 'characters') 39 | 40 | try: 41 | js = json.loads(data) 42 | except: 43 | js = None 44 | 45 | if not js or 'status' not in js or js['status'] != 'OK': 46 | print('==== Failure To Retrieve ====') 47 | print(data) 48 | continue 49 | 50 | #print(json.dumps(js, indent=4)) 51 | 52 | #lat = js["results"][0]["geometry"]["location"]["lat"] 53 | #lng = js["results"][0]["geometry"]["location"]["lng"] 54 | #print('lat', lat, 'lng', lng) 55 | #location = js['results'][0]['formatted_address'] 56 | placeID=js['results'][0]['place_id'] 57 | print('The place Id of the place',address,'you entered is',placeID) 58 | -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 6/OutputJSON1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 3 Using Python to Access WebData/Week 6/OutputJSON1.JPG -------------------------------------------------------------------------------- /Course 3 Using Python to Access WebData/Week 6/OutputJSON2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 3 Using Python to Access WebData/Week 6/OutputJSON2.JPG -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 2/AssignmentDB1.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 2/AssignmentDB1.db -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 2/AssignmentDB1Code_SQL.txt: -------------------------------------------------------------------------------- 1 | CREATE TABLE Ages ( 2 | name VARCHAR(128), 3 | age INTEGER 4 | ); 5 | 6 | DELETE FROM Ages; 7 | INSERT INTO Ages (name, age) VALUES ('Heidi', 16); 8 | INSERT INTO Ages (name, age) VALUES ('Riccardo', 27); 9 | INSERT INTO Ages (name, age) VALUES ('Lavena', 39); 10 | INSERT INTO Ages (name, age) VALUES ('Jaime', 34); 11 | 12 | SELECT hex(name || age) AS X FROM Ages ORDER BY X 13 | -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 2/AssignmentDB2.py: -------------------------------------------------------------------------------- 1 | '''Counting Organizations 2 | This application will read the mailbox data (mbox.txt) and count the number of email 3 | messages per organization (i.e. domain name of the email address) using a database 4 | with the following schema to maintain the counts. 5 | 6 | CREATE TABLE Counts (org TEXT, count INTEGER) 7 | When you have run the program on mbox.txt upload the resulting database file above for grading. 8 | If you run the program multiple times in testing or with dfferent files, make sure to empty 9 | out the data before each run. 
10 | 11 | You can use this code as a starting point for your application: http://www.py4e.com/code3/emaildb.py. 12 | 13 | The data file for this application is the same as 14 | in previous assignments: http://www.py4e.com/code3/mbox.txt. 15 | 16 | Because the sample code is using an UPDATE statement and committing the results 17 | to the database as each record is read in the loop, it might take as long as 18 | a few minutes to process all the data. The commit insists on completely writing 19 | all the data to disk every time it is called. 20 | 21 | The program can be speeded up greatly by moving the commit operation outside of the 22 | loop. In any database program, there is a balance between the number of operations you 23 | execute between commits and the importance of not losing the results of operations that 24 | have not yet been committed.''' 25 | 26 | 27 | import sqlite3 28 | 29 | conn = sqlite3.connect('dbcount.sqlite') 30 | cur = conn.cursor() 31 | 32 | cur.execute(''' 33 | DROP TABLE IF EXISTS Counts''') 34 | 35 | cur.execute(''' 36 | CREATE TABLE Counts (org TEXT,count INTEGER)''') 37 | 38 | fname = input('Enter file name: ') 39 | if (len(fname) < 1): fname = 'mbox-short.txt' 40 | fh = open(fname) 41 | for line in fh: 42 | if not line.startswith('From: '): continue 43 | pieces = line.split() 44 | email = pieces[1] 45 | #cur.execute('SELECT count FROM Counts WHERE email = ? ', (email,)) 46 | #row = cur.fetchone() 47 | #if row is None: 48 | # cur.execute('''INSERT INTO Counts (email, count) 49 | # VALUES (?, 1)''', (email,)) 50 | #else: 51 | # cur.execute('UPDATE Counts SET count = count + 1 WHERE email = ?', 52 | # (email,)) 53 | 54 | spl2=email.split('@') 55 | domain=spl2[1] 56 | cur.execute('SELECT count FROM Counts WHERE org=?', (domain,)) 57 | roww=cur.fetchone() 58 | if roww is None: 59 | cur.execute('''INSERT INTO Counts (org,count) 60 | VALUES (?,1)''',(domain,)) 61 | else: 62 | cur.execute('UPDATE Counts SET count=count+1 WHERE org =?', (domain,)) 63 | cur.execute('SELECT org,count FROM Counts ORDER BY count DESC') 64 | conn.commit() 65 | 66 | # https://www.sqlite.org/lang_select.html 67 | sqlstr = 'SELECT org,count FROM Counts ORDER BY count DESC' 68 | 69 | for row in cur.execute(sqlstr): 70 | print(str(row[0]), row[1]) 71 | 72 | cur.close() 73 | -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 2/Output_Assignment_DB1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 2/Output_Assignment_DB1.JPG -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 2/Output_Assignment_DB2_SQLITE_for_MBOX.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 2/Output_Assignment_DB2_SQLITE_for_MBOX.JPG -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 2/Output_Assignment_DB2_SQLITE_for_mboxshort.JPG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 2/Output_Assignment_DB2_SQLITE_for_mboxshort.JPG -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 2/Output_Assignment_DB2_mboxshort.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 2/Output_Assignment_DB2_mboxshort.JPG -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 2/Output_DB2_Mbox.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 2/Output_DB2_Mbox.JPG -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 2/dbcount.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 2/dbcount.sqlite -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 3/AssignmentMT.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Musical Track Database 3 | 4 | You can use this code as a starting point for your application: 5 | http://www.py4e.com/code3/tracks.zip. The ZIP file contains the Library.xml file to be 6 | used for this assignment. You can export your own tracks from iTunes and create a database, 7 | but for the database that you turn in for this assignment, 8 | only use the Library.xml data that is provided. 
9 | 10 | ''' 11 | 12 | import xml.etree.ElementTree as ET 13 | import sqlite3 14 | dic=dict() 15 | conn = sqlite3.connect('assignmentMT.sqlite') 16 | cur = conn.cursor() 17 | 18 | # Make some fresh tables using executescript() 19 | cur.executescript(''' 20 | DROP TABLE IF EXISTS Artist; 21 | DROP TABLE IF EXISTS Album; 22 | DROP TABLE IF EXISTS Track; 23 | DROP TABLE IF EXISTS Genre; 24 | 25 | CREATE TABLE Artist ( 26 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 27 | name TEXT UNIQUE 28 | ); 29 | 30 | CREATE TABLE Album ( 31 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 32 | artist_id INTEGER, 33 | title TEXT UNIQUE 34 | ); 35 | 36 | CREATE TABLE Track ( 37 | id INTEGER NOT NULL PRIMARY KEY 38 | AUTOINCREMENT UNIQUE, 39 | title TEXT UNIQUE, 40 | album_id INTEGER, 41 | genre_id INTEGER, 42 | len INTEGER, rating INTEGER, count INTEGER 43 | ); 44 | 45 | CREATE TABLE Genre( 46 | id INTEGER NOT NULL PRIMARY KEY 47 | AUTOINCREMENT UNIQUE, 48 | name TEXT UNIQUE 49 | ); 50 | ''') 51 | 52 | 53 | fname = input('Enter file name: ') 54 | if ( len(fname) < 1 ) : fname = 'Library.xml' 55 | 56 | # Track ID369 57 | # NameAnother One Bites The Dust 58 | # ArtistQueen 59 | def lookup(d, key): 60 | found = False 61 | for child in d: 62 | if found : 63 | return child.text 64 | if child.tag == 'key' and child.text == key : 65 | found = True 66 | return None 67 | 68 | stuff = ET.parse(fname) 69 | all = stuff.findall('dict/dict/dict') 70 | print('Dict count:', len(all)) 71 | genList=list() 72 | for entry in all: 73 | if ( lookup(entry, 'Track ID') is None ) : 74 | continue 75 | 76 | name = lookup(entry, 'Name') 77 | artist = lookup(entry, 'Artist') 78 | album = lookup(entry, 'Album') 79 | count = lookup(entry, 'Play Count') 80 | rating = lookup(entry, 'Rating') 81 | length = lookup(entry, 'Total Time') 82 | genre=lookup(entry,'Genre') 83 | genList.append(genre) 84 | 85 | if name is None or artist is None or album is None : 86 | continue 87 | 88 | print('Genre Name :\t' ,genre) 89 | 90 | cur.execute('''INSERT OR IGNORE INTO Artist (name) 91 | VALUES ( ? )''', ( artist, ) ) 92 | cur.execute('SELECT id FROM Artist WHERE name = ? ', (artist, )) 93 | artist_id = cur.fetchone()[0] 94 | 95 | cur.execute('''INSERT OR IGNORE INTO Album (title, artist_id) 96 | VALUES ( ?, ? )''', ( album, artist_id ) ) 97 | cur.execute('SELECT id FROM Album WHERE title = ? ', (album, )) 98 | album_id = cur.fetchone()[0] 99 | 100 | cur.execute('''INSERT OR IGNORE INTO Genre 101 | (name) VALUES (?)''', (genre,) ) 102 | cur.execute('SELECT id FROM Genre WHERE name=?', (genre,)) 103 | try: 104 | genre_id=cur.fetchone()[0] 105 | except: 106 | genre_id='' 107 | 108 | cur.execute('''INSERT OR REPLACE INTO Track 109 | (title, album_id, len, rating, count,genre_id) 110 | VALUES ( ?, ?, ?, ?, ? 
,?)''',( name, album_id, length, rating, count,genre_id ) ) 111 | 112 | 113 | conn.commit() 114 | 115 | for gn in genList: 116 | dic[gn]=dic.get(gn,0)+1 117 | print(dic,'\nThe different entries in the dictionary is',len(dic), 118 | '\n the total sum of entries is',dic.values() ) 119 | -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 3/AssignmentMT.txt: -------------------------------------------------------------------------------- 1 | CREATE TABLE Artist ( 2 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 3 | name TEXT UNIQUE 4 | ); 5 | 6 | CREATE TABLE Genre ( 7 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 8 | name TEXT UNIQUE 9 | ); 10 | 11 | CREATE TABLE Album ( 12 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 13 | artist_id INTEGER, 14 | title TEXT UNIQUE 15 | ); 16 | 17 | CREATE TABLE Track ( 18 | id INTEGER NOT NULL PRIMARY KEY 19 | AUTOINCREMENT UNIQUE, 20 | title TEXT UNIQUE, 21 | album_id INTEGER, 22 | genre_id INTEGER, 23 | len INTEGER, rating INTEGER, count INTEGER 24 | ); -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 3/Output_MT.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 3/Output_MT.JPG -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 3/Output_MT_SQLITE.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 3/Output_MT_SQLITE.JPG -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 3/assignmentMT.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 3/assignmentMT.sqlite -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 4/AssignmentRoster.py: -------------------------------------------------------------------------------- 1 | '''This application will read roster data in JSON format, parse the file, 2 | and then produce an SQLite database that contains 3 | a User, Course, and Member table and populate the tables from the data file. 
4 | ''' 5 | import json 6 | import sqlite3 7 | 8 | conn = sqlite3.connect('rosterdb.sqlite') 9 | cur = conn.cursor() 10 | 11 | # Do some setup 12 | cur.executescript(''' 13 | DROP TABLE IF EXISTS User; 14 | DROP TABLE IF EXISTS Member; 15 | DROP TABLE IF EXISTS Course; 16 | 17 | CREATE TABLE User ( 18 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 19 | name TEXT UNIQUE 20 | ); 21 | 22 | CREATE TABLE Course ( 23 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 24 | title TEXT UNIQUE 25 | ); 26 | 27 | CREATE TABLE Member ( 28 | user_id INTEGER, 29 | course_id INTEGER, 30 | role INTEGER, 31 | PRIMARY KEY (user_id, course_id) 32 | ) 33 | ''') 34 | 35 | fname = input('Enter file name: ') 36 | if len(fname) < 1: 37 | fname = 'roster_data.json' 38 | 39 | # [ 40 | # [ "Charley", "si110", 1 ], 41 | # [ "Mea", "si110", 0 ], 42 | 43 | str_data = open(fname).read() 44 | json_data = json.loads(str_data) 45 | 46 | for entry in json_data: 47 | 48 | name = entry[0]; 49 | title = entry[1]; 50 | role=entry[2]; 51 | 52 | print((name, title,role)) 53 | 54 | cur.execute('''INSERT OR IGNORE INTO User (name) 55 | VALUES ( ? )''', ( name, ) ) 56 | cur.execute('SELECT id FROM User WHERE name = ? ', (name, )) 57 | user_id = cur.fetchone()[0] 58 | 59 | cur.execute('''INSERT OR IGNORE INTO Course (title) 60 | VALUES ( ? )''', ( title, ) ) 61 | cur.execute('SELECT id FROM Course WHERE title = ? ', (title, )) 62 | course_id = cur.fetchone()[0] 63 | 64 | cur.execute('''INSERT OR REPLACE INTO Member 65 | (user_id, course_id,role) VALUES ( ?, ?, ? )''', 66 | ( user_id, course_id ,role) ) 67 | 68 | conn.commit() 69 | -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 4/Output_SQLITE.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 4/Output_SQLITE.JPG -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 4/rosterdb.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 4/rosterdb.sqlite -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 5/README.txt: -------------------------------------------------------------------------------- 1 | Using the Google Places API with a Database and 2 | Visualizing Data on Google Map 3 | 4 | In this project, we are using the Google geocoding API 5 | to clean up some user-entered geographic locations of 6 | university names and then placing the data on a Google 7 | Map. 8 | 9 | Note: Windows has difficulty in displaying UTF-8 characters 10 | in the console so for each command window you open, you may need 11 | to type the following command before running this code: 12 | 13 | chcp 65001 14 | 15 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 16 | 17 | 18 | You should install the SQLite browser to view and modify 19 | the databases from: 20 | 21 | http://sqlitebrowser.org/ 22 | 23 | The first problem to solve is that the Google geocoding 24 | API is rate limited to a fixed number of requests per day. 
25 | So if you have a lot of data you might need to stop and 26 | restart the lookup process several times. So we break 27 | the problem into two phases. 28 | 29 | In the first phase we take our input data in the file 30 | (where.data) and read it one line at a time, and retrieve the 31 | geocoded response and store it in a database (geodata.sqlite). 32 | Before we use the geocoding API, we simply check to see if 33 | we already have the data for that particular line of input. 34 | 35 | You can re-start the process at any time by removing the file 36 | geodata.sqlite 37 | 38 | Run the geoload.py program. This program will read the input 39 | lines in where.data and for each line check to see if it is already 40 | in the database and if we don't have the data for the location, 41 | call the geocoding API to retrieve the data and store it in 42 | the database. 43 | 44 | As of December 2016, the Google Geocoding APIs changed dramatically. 45 | They moved some functionality that we use from the Geocoding API 46 | into the Places API. Also all the Google Geo-related APIs require an 47 | API key. To complete this assignment without a Google account, 48 | without an API key, or from a country that blocks 49 | access to Google, you can use a subset of that data which is 50 | available at: 51 | 52 | http://py4e-data.dr-chuck.net/geojson 53 | 54 | To use this, simply leave the api_key set to False in 55 | geoload.py. 56 | 57 | This URL only has a subset of the data but it has no rate limit so 58 | it is good for testing. 59 | 60 | If you want to try this with the API key, follow the 61 | instructions at: 62 | 63 | https://developers.google.com/maps/documentation/geocoding/intro 64 | 65 | and put the API key in the code. 66 | 67 | Here is a sample run after there is already some data in the 68 | database: 69 | 70 | Mac: python3 geoload.py 71 | Win: geoload.py 72 | 73 | Found in database Northeastern University 74 | 75 | Found in database University of Hong Kong, Illinois Institute of Technology, Bradley University 76 | 77 | Found in database Technion 78 | 79 | Found in database Viswakarma Institute, Pune, India 80 | 81 | Found in database UMD 82 | 83 | Found in database Tufts University 84 | 85 | Resolving Monash University 86 | Retrieving http://py4e-data.dr-chuck.net/geojson?address=Monash+University 87 | Retrieved 2063 characters { "results" : [ 88 | {u'status': u'OK', u'results': ... } 89 | 90 | Resolving Kokshetau Institute of Economics and Management 91 | Retrieving http://py4e-data.dr-chuck.net/geojson?address=Kokshetau+Institute+of+Economics+and+Management 92 | Retrieved 1749 characters { "results" : [ 93 | {u'status': u'OK', u'results': ... } 94 | 95 | The first five locations are already in the database and so they 96 | are skipped. The program scans to the point where it finds un-retrieved 97 | locations and starts retrieving them. 98 | 99 | The geoload.py can be stopped at any time, and there is a counter 100 | that you can use to limit the number of calls to the geocoding 101 | API for each run. 102 | 103 | Once you have some data loaded into geodata.sqlite, you can 104 | visualize the data using the (geodump.py) program. This 105 | program reads the database and writes tile file (where.js) 106 | with the location, latitude, and longitude in the form of 107 | executable JavaScript code. 
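As an illustrative aside (not part of the graded scripts), you can spot-check the
rows that geodump.py will read with a minimal Python sketch. It assumes only the
Locations table that geoload.py creates (address TEXT, geodata TEXT), where both
columns are stored as encoded bytes:

    import sqlite3
    conn = sqlite3.connect('geodata.sqlite')
    cur = conn.cursor()
    # Print the first few stored addresses and the size of the retrieved JSON
    for address, geodata in cur.execute('SELECT address, geodata FROM Locations LIMIT 5'):
        print(address.decode(), len(geodata), 'bytes of JSON')
    cur.close()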
108 | 109 | A run of the geodump.py program is as follows: 110 | 111 | Mac: python3 geodump.py 112 | Win: geodump.py 113 | 114 | Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA 42.3396998 -71.08975 115 | Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA 40.6963857 -89.6160811 116 | ... 117 | Technion, Viazman 87, Kesalsaba, 32000, Israel 32.7775 35.0216667 118 | Monash University Clayton Campus, Wellington Road, Clayton VIC 3800, Australia -37.9152113 145.134682 119 | Kokshetau, Kazakhstan 53.2833333 69.3833333 120 | ... 121 | 12 records written to where.js 122 | Open where.html to view the data in a browser 123 | 124 | The file (where.html) consists of HTML and JavaScript to visualize 125 | a Google Map. It reads the most recent data in where.js to get 126 | the data to be visualized. Here is the format of the where.js file: 127 | 128 | myData = [ 129 | [42.3396998,-71.08975, 'Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA'], 130 | [40.6963857,-89.6160811, 'Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA'], 131 | [32.7775,35.0216667, 'Technion, Viazman 87, Kesalsaba, 32000, Israel'], 132 | ... 133 | ]; 134 | 135 | This is a JavaScript list of lists. The syntax for JavaScript 136 | list constants is very similar to Python so the syntax should 137 | be familiar to you. 138 | 139 | Simply open where.html in a browser to see the locations. You 140 | can hover over each map pin to find the location that the 141 | gecoding API returned for the user-entered input. If you 142 | cannot see any data when you open the where.html file, you might 143 | want to check the JavaScript or developer console for your browser. 144 | 145 | -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 5/Visualization of Where Data.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 5/Visualization of Where Data.JPG -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 5/geodata.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 4 Using Databases with Python/Week 5/geodata.sqlite -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 5/geodump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import json 3 | import codecs 4 | 5 | conn = sqlite3.connect('geodata.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT * FROM Locations') 9 | fhand = codecs.open('where.js', 'w', "utf-8") 10 | fhand.write("myData = [\n") 11 | count = 0 12 | for row in cur : 13 | data = str(row[1].decode()) 14 | try: js = json.loads(str(data)) 15 | except: continue 16 | 17 | if not('status' in js and js['status'] == 'OK') : continue 18 | 19 | lat = js["results"][0]["geometry"]["location"]["lat"] 20 | lng = js["results"][0]["geometry"]["location"]["lng"] 21 | if lat == 0 or lng == 0 : continue 22 | where = js['results'][0]['formatted_address'] 23 | where = where.replace("'", "") 24 | try : 25 | print(where, lat, lng) 26 | 27 | count = count + 1 28 
| if count > 1 : fhand.write(",\n") 29 | output = "["+str(lat)+","+str(lng)+", '"+where+"']" 30 | fhand.write(output) 31 | except: 32 | continue 33 | 34 | fhand.write("\n];\n") 35 | cur.close() 36 | fhand.close() 37 | print(count, "records written to where.js") 38 | print("Open where.html to view the data in a browser") 39 | 40 | -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 5/geoload.py: -------------------------------------------------------------------------------- 1 | import urllib.request, urllib.parse, urllib.error 2 | import http 3 | import sqlite3 4 | import json 5 | import time 6 | import ssl 7 | import sys 8 | 9 | api_key = False 10 | # If you have a Google Places API key, enter it here 11 | # api_key = 'AIzaSy___IDByT70' 12 | 13 | if api_key is False: 14 | serviceurl = "http://py4e-data.dr-chuck.net/geojson?" 15 | else : 16 | serviceurl = "https://maps.googleapis.com/maps/api/place/textsearch/json?" 17 | 18 | # Additional detail for urllib 19 | # http.client.HTTPConnection.debuglevel = 1 20 | 21 | conn = sqlite3.connect('geodata.sqlite') 22 | cur = conn.cursor() 23 | 24 | cur.execute(''' 25 | CREATE TABLE IF NOT EXISTS Locations (address TEXT, geodata TEXT)''') 26 | 27 | # Ignore SSL certificate errors 28 | ctx = ssl.create_default_context() 29 | ctx.check_hostname = False 30 | ctx.verify_mode = ssl.CERT_NONE 31 | 32 | fh = open("where.data") 33 | count = 0 34 | for line in fh: 35 | if count > 200 : 36 | print('Retrieved 200 locations, restart to retrieve more') 37 | break 38 | 39 | address = line.strip() 40 | print('') 41 | cur.execute("SELECT geodata FROM Locations WHERE address= ?", 42 | (memoryview(address.encode()), )) 43 | 44 | try: 45 | data = cur.fetchone()[0] 46 | print("Found in database ",address) 47 | continue 48 | except: 49 | pass 50 | 51 | parms = dict() 52 | parms["query"] = address 53 | if api_key is not False: parms['key'] = api_key 54 | url = serviceurl + urllib.parse.urlencode(parms) 55 | 56 | print('Retrieving', url) 57 | uh = urllib.request.urlopen(url, context=ctx) 58 | data = uh.read().decode() 59 | print('Retrieved', len(data), 'characters', data[:20].replace('\n', ' ')) 60 | count = count + 1 61 | 62 | try: 63 | js = json.loads(data) 64 | except: 65 | print(data) # We print in case unicode causes an error 66 | continue 67 | 68 | if 'status' not in js or (js['status'] != 'OK' and js['status'] != 'ZERO_RESULTS') : 69 | print('==== Failure To Retrieve ====') 70 | print(data) 71 | break 72 | 73 | cur.execute('''INSERT INTO Locations (address, geodata) 74 | VALUES ( ?, ? 
)''', (memoryview(address.encode()), memoryview(data.encode()) ) ) 75 | conn.commit() 76 | if count % 10 == 0 : 77 | print('Pausing for a bit...') 78 | time.sleep(5) 79 | 80 | print("Run geodump.py to read the data from the database so you can visualize it on a map.") 81 | -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 5/where.data: -------------------------------------------------------------------------------- 1 | Anna University 2 | SASTRA University 3 | Virginia Tech 4 | Nanyang Technological University 5 | University of Texas 6 | Colorado School of Mines 7 | -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 5/where.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | A Map of Information 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 43 | 44 | 45 |
46 |

About this Map

47 |

48 | This is a cool map from 49 | www.py4e.com. 50 |

51 | 52 | 53 | -------------------------------------------------------------------------------- /Course 4 Using Databases with Python/Week 5/where.js: -------------------------------------------------------------------------------- 1 | myData = [ 2 | [10.7295115,79.0196067, 'Sastra University Road, Tirumalaisamudram, Tamil Nadu 613401, India'], 3 | [36.8743583,-76.17454409999999, 'Virginia Tech Trail, Virginia Beach, VA 23455, USA'], 4 | [31.204638,121.5853839, 'Nanyang Technological University Shanghai Office, Pudong, Shanghai, China, 201203'], 5 | [30.2306914,-97.75552049999999, 'University, Austin, TX 78704, USA'] 6 | ]; 7 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Michael Bostock 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * The name Michael Bostock may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/Output1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/Output1.jpg -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/Output2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/Output2.jpg -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/Output3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/Output3.jpg -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/Output4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/Output4.jpg -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/README.txt: -------------------------------------------------------------------------------- 1 | Simple Python Search Spider, Page Ranker, and Visualizer 2 | 3 | This is a set of programs that emulate some of the functions of a 4 | search engine. They store their data in a SQLITE3 database named 5 | 'spider.sqlite'. This file can be removed at any time to restart the 6 | process. 7 | 8 | You should install the SQLite browser to view and modify 9 | the databases from: 10 | 11 | http://sqlitebrowser.org/ 12 | 13 | This program crawls a web site and pulls a series of pages into the 14 | database, recording the links between pages. 15 | 16 | Note: Windows has difficulty in displaying UTF-8 characters 17 | in the console so for each console window you open, you may need 18 | to type the following command before running this code: 19 | 20 | chcp 65001 21 | 22 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 23 | 24 | Mac: rm spider.sqlite 25 | Mac: python3 spider.py 26 | 27 | Win: del spider.sqlite 28 | Win: spider.py 29 | 30 | Enter web url or enter: http://www.dr-chuck.com/ 31 | ['http://www.dr-chuck.com'] 32 | How many pages:2 33 | 1 http://www.dr-chuck.com/ 12 34 | 2 http://www.dr-chuck.com/csev-blog/ 57 35 | How many pages: 36 | 37 | In this sample run, we told it to crawl a website and retrieve two 38 | pages. If you restart the program again and tell it to crawl more 39 | pages, it will not re-crawl any pages already in the database. Upon 40 | restart it goes to a random non-crawled page and starts there. So 41 | each successive run of spider.py is additive. 
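
As a rough illustration of how that restart behaviour works: pages that have been discovered but not yet retrieved are stored with a NULL html column, so resuming is just a matter of asking the database for one of them at random. The sketch below (not part of the course files; it assumes spider.sqlite and its Pages table have already been created by spider.py, which is included later in this repository) shows the lookup:

    import sqlite3

    # Sketch: how a restarted crawl finds a page it has not retrieved yet.
    # Assumes spider.sqlite and its Pages table were already created by spider.py.
    conn = sqlite3.connect('spider.sqlite')
    cur = conn.cursor()
    cur.execute('''SELECT id, url FROM Pages
                   WHERE html IS NULL AND error IS NULL
                   ORDER BY RANDOM() LIMIT 1''')
    row = cur.fetchone()
    if row is None:
        print('No unretrieved pages found - start a new crawl')
    else:
        print('Resuming with page', row[0], row[1])
    cur.close()

spider.py runs essentially this same query each time through its main loop before fetching the next page, which is why successive runs are additive.
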
42 | 43 | Mac: python3 spider.py 44 | Win: spider.py 45 | 46 | Enter web url or enter: http://www.dr-chuck.com/ 47 | ['http://www.dr-chuck.com'] 48 | How many pages:3 49 | 3 http://www.dr-chuck.com/csev-blog 57 50 | 4 http://www.dr-chuck.com/dr-chuck/resume/speaking.htm 1 51 | 5 http://www.dr-chuck.com/dr-chuck/resume/index.htm 13 52 | How many pages: 53 | 54 | You can have multiple starting points in the same database - 55 | within the program these are called "webs". The spider 56 | chooses randomly amongst all non-visited links across all 57 | the webs. 58 | 59 | If you want to dump the contents of the spider.sqlite file, you can 60 | run spdump.py as follows: 61 | 62 | Mac: python3 spdump.py 63 | Win: spdump.py 64 | 65 | (5, None, 1.0, 3, u'http://www.dr-chuck.com/csev-blog') 66 | (3, None, 1.0, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 67 | (1, None, 1.0, 2, u'http://www.dr-chuck.com/csev-blog/') 68 | (1, None, 1.0, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 69 | 4 rows. 70 | 71 | This shows the number of incoming links, the old page rank, the new page 72 | rank, the id of the page, and the url of the page. The spdump.py program 73 | only shows pages that have at least one incoming link to them. 74 | 75 | Once you have a few pages in the database, you can run Page Rank on the 76 | pages using the sprank.py program. You simply tell it how many Page 77 | Rank iterations to run. 78 | 79 | Mac: python3 sprank.py 80 | Win: sprank.py 81 | 82 | How many iterations:2 83 | 1 0.546848992536 84 | 2 0.226714939664 85 | [(1, 0.559), (2, 0.659), (3, 0.985), (4, 2.135), (5, 0.659)] 86 | 87 | You can dump the database again to see that page rank has been updated: 88 | 89 | Mac: python3 spdump.py 90 | Win: spdump.py 91 | 92 | (5, 1.0, 0.985, 3, u'http://www.dr-chuck.com/csev-blog') 93 | (3, 1.0, 2.135, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 94 | (1, 1.0, 0.659, 2, u'http://www.dr-chuck.com/csev-blog/') 95 | (1, 1.0, 0.659, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 96 | 4 rows. 97 | 98 | You can run sprank.py as many times as you like and it will simply refine 99 | the page rank the more times you run it. You can even run sprank.py a few times 100 | and then go spider a few more pages with spider.py and then run sprank.py 101 | to converge the page ranks. 102 | 103 | If you want to restart the Page Rank calculations without re-spidering the 104 | web pages, you can use spreset.py 105 | 106 | Mac: python3 spreset.py 107 | Win: spreset.py 108 | 109 | All pages set to a rank of 1.0 110 | 111 | Mac: python3 sprank.py 112 | Win: sprank.py 113 | 114 | How many iterations:50 115 | 1 0.546848992536 116 | 2 0.226714939664 117 | 3 0.0659516187242 118 | 4 0.0244199333 119 | 5 0.0102096489546 120 | 6 0.00610244329379 121 | ... 122 | 42 0.000109076928206 123 | 43 9.91987599002e-05 124 | 44 9.02151706798e-05 125 | 45 8.20451504471e-05 126 | 46 7.46150183837e-05 127 | 47 6.7857770908e-05 128 | 48 6.17124694224e-05 129 | 49 5.61236959327e-05 130 | 50 5.10410499467e-05 131 | [(512, 0.02963718031139026), (1, 12.790786721866658), (2, 28.939418898678284), (3, 6.808468390725946), (4, 13.469889092397006)] 132 | 133 | For each iteration of the page rank algorithm it prints the average 134 | change per page of the page rank. The network initially is quite 135 | unbalanced and so the individual page ranks are changing wildly. 136 | But in a few short iterations, the page rank converges. You 137 | should run sprank.py long enough that the page ranks converge.
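
To make the idea concrete, here is a simplified, self-contained sketch of one rank update on a made-up three-page graph. This is only an illustration of the general idea; the real sprank.py reads its link data from the Links table in spider.sqlite and differs in detail:

    # Simplified illustration of a single page rank iteration.
    # The graph below is hypothetical; sprank.py works on the Links table instead.
    links = {1: [2, 3], 2: [3], 3: [1]}      # page id -> ids of pages it links to
    ranks = {page: 1.0 for page in links}    # every page starts with rank 1.0

    new_ranks = {page: 0.0 for page in links}
    for page, outbound in links.items():
        share = ranks[page] / len(outbound)  # each page gives its rank away evenly
        for target in outbound:
            new_ranks[target] += share

    # Average change per page, the kind of number printed for each iteration
    avg_change = sum(abs(new_ranks[p] - ranks[p]) for p in ranks) / len(ranks)
    ranks = new_ranks
    print(avg_change)

Repeating this loop makes avg_change shrink toward zero, which is the convergence you can watch in the sprank.py output above.
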
138 | 139 | If you want to visualize the current top pages in terms of page rank, 140 | run spjson.py to write the pages out in JSON format to be viewed in a 141 | web browser. 142 | 143 | Mac: python3 spjson.py 144 | Win: spjson.py 145 | 146 | Creating JSON output on spider.js... 147 | How many nodes? 30 148 | Open force.html in a browser to view the visualization 149 | 150 | You can view this data by opening the file force.html in your web browser. 151 | This shows an automatic layout of the nodes and links. You can click and 152 | drag any node and you can also double click on a node to find the URL 153 | that is represented by the node. 154 | 155 | This visualization is provided using the force layout from: 156 | 157 | http://mbostock.github.com/d3/ 158 | 159 | If you rerun the other utilities and then re-run spjson.py - you merely 160 | have to press refresh in the browser to get the new data from spider.js. 161 | 162 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/bs4/builder/_lxml.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'LXMLTreeBuilderForXML', 3 | 'LXMLTreeBuilder', 4 | ] 5 | 6 | from io import BytesIO 7 | from io import StringIO 8 | import collections 9 | from lxml import etree 10 | from bs4.element import ( 11 | Comment, 12 | Doctype, 13 | NamespacedAttribute, 14 | ProcessingInstruction, 15 | ) 16 | from bs4.builder import ( 17 | FAST, 18 | HTML, 19 | HTMLTreeBuilder, 20 | PERMISSIVE, 21 | ParserRejectedMarkup, 22 | TreeBuilder, 23 | XML) 24 | from bs4.dammit import EncodingDetector 25 | 26 | LXML = 'lxml' 27 | 28 | class LXMLTreeBuilderForXML(TreeBuilder): 29 | DEFAULT_PARSER_CLASS = etree.XMLParser 30 | 31 | is_xml = True 32 | 33 | NAME = "lxml-xml" 34 | ALTERNATE_NAMES = ["xml"] 35 | 36 | # Well, it's permissive by XML parser standards. 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] 38 | 39 | CHUNK_SIZE = 512 40 | 41 | # This namespace mapping is specified in the XML Namespace 42 | # standard. 43 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 44 | 45 | def default_parser(self, encoding): 46 | # This can either return a parser object or a class, which 47 | # will be instantiated with default arguments. 48 | if self._default_parser is not None: 49 | return self._default_parser 50 | return etree.XMLParser( 51 | target=self, strip_cdata=False, recover=True, encoding=encoding) 52 | 53 | def parser_for(self, encoding): 54 | # Use the default parser. 55 | parser = self.default_parser(encoding) 56 | 57 | if isinstance(parser, collections.Callable): 58 | # Instantiate the parser with default arguments 59 | parser = parser(target=self, strip_cdata=False, encoding=encoding) 60 | return parser 61 | 62 | def __init__(self, parser=None, empty_element_tags=None): 63 | # TODO: Issue a warning if parser is present but not a 64 | # callable, since that means there's no way to create new 65 | # parsers for different encodings. 66 | self._default_parser = parser 67 | if empty_element_tags is not None: 68 | self.empty_element_tags = set(empty_element_tags) 69 | self.soup = None 70 | self.nsmaps = [self.DEFAULT_NSMAPS] 71 | 72 | def _getNsTag(self, tag): 73 | # Split the namespace URL out of a fully-qualified lxml tag 74 | # name. Copied from lxml's src/lxml/sax.py. 
75 | if tag[0] == '{': 76 | return tuple(tag[1:].split('}', 1)) 77 | else: 78 | return (None, tag) 79 | 80 | def prepare_markup(self, markup, user_specified_encoding=None, 81 | exclude_encodings=None, 82 | document_declared_encoding=None): 83 | """ 84 | :yield: A series of 4-tuples. 85 | (markup, encoding, declared encoding, 86 | has undergone character replacement) 87 | 88 | Each 4-tuple represents a strategy for parsing the document. 89 | """ 90 | if isinstance(markup, str): 91 | # We were given Unicode. Maybe lxml can parse Unicode on 92 | # this system? 93 | yield markup, None, document_declared_encoding, False 94 | 95 | if isinstance(markup, str): 96 | # No, apparently not. Convert the Unicode to UTF-8 and 97 | # tell lxml to parse it as UTF-8. 98 | yield (markup.encode("utf8"), "utf8", 99 | document_declared_encoding, False) 100 | 101 | # Instead of using UnicodeDammit to convert the bytestring to 102 | # Unicode using different encodings, use EncodingDetector to 103 | # iterate over the encodings, and tell lxml to try to parse 104 | # the document as each one in turn. 105 | is_html = not self.is_xml 106 | try_encodings = [user_specified_encoding, document_declared_encoding] 107 | detector = EncodingDetector( 108 | markup, try_encodings, is_html, exclude_encodings) 109 | for encoding in detector.encodings: 110 | yield (detector.markup, encoding, document_declared_encoding, False) 111 | 112 | def feed(self, markup): 113 | if isinstance(markup, bytes): 114 | markup = BytesIO(markup) 115 | elif isinstance(markup, str): 116 | markup = StringIO(markup) 117 | 118 | # Call feed() at least once, even if the markup is empty, 119 | # or the parser won't be initialized. 120 | data = markup.read(self.CHUNK_SIZE) 121 | try: 122 | self.parser = self.parser_for(self.soup.original_encoding) 123 | self.parser.feed(data) 124 | while len(data) != 0: 125 | # Now call feed() on the rest of the data, chunk by chunk. 126 | data = markup.read(self.CHUNK_SIZE) 127 | if len(data) != 0: 128 | self.parser.feed(data) 129 | self.parser.close() 130 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 131 | raise ParserRejectedMarkup(str(e)) 132 | 133 | def close(self): 134 | self.nsmaps = [self.DEFAULT_NSMAPS] 135 | 136 | def start(self, name, attrs, nsmap={}): 137 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 138 | attrs = dict(attrs) 139 | nsprefix = None 140 | # Invert each namespace map as it comes in. 141 | if len(self.nsmaps) > 1: 142 | # There are no new namespaces for this tag, but 143 | # non-default namespaces are in play, so we need a 144 | # separate tag stack to know when they end. 145 | self.nsmaps.append(None) 146 | elif len(nsmap) > 0: 147 | # A new namespace mapping has come into play. 148 | inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) 149 | self.nsmaps.append(inverted_nsmap) 150 | # Also treat the namespace mapping as a set of attributes on the 151 | # tag, so we can recreate it later. 152 | attrs = attrs.copy() 153 | for prefix, namespace in list(nsmap.items()): 154 | attribute = NamespacedAttribute( 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 156 | attrs[attribute] = namespace 157 | 158 | # Namespaces are in play. Find any attributes that came in 159 | # from lxml with namespaces attached to their names, and 160 | # turn then into NamespacedAttribute objects. 
161 | new_attrs = {} 162 | for attr, value in list(attrs.items()): 163 | namespace, attr = self._getNsTag(attr) 164 | if namespace is None: 165 | new_attrs[attr] = value 166 | else: 167 | nsprefix = self._prefix_for_namespace(namespace) 168 | attr = NamespacedAttribute(nsprefix, attr, namespace) 169 | new_attrs[attr] = value 170 | attrs = new_attrs 171 | 172 | namespace, name = self._getNsTag(name) 173 | nsprefix = self._prefix_for_namespace(namespace) 174 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 175 | 176 | def _prefix_for_namespace(self, namespace): 177 | """Find the currently active prefix for the given namespace.""" 178 | if namespace is None: 179 | return None 180 | for inverted_nsmap in reversed(self.nsmaps): 181 | if inverted_nsmap is not None and namespace in inverted_nsmap: 182 | return inverted_nsmap[namespace] 183 | return None 184 | 185 | def end(self, name): 186 | self.soup.endData() 187 | completed_tag = self.soup.tagStack[-1] 188 | namespace, name = self._getNsTag(name) 189 | nsprefix = None 190 | if namespace is not None: 191 | for inverted_nsmap in reversed(self.nsmaps): 192 | if inverted_nsmap is not None and namespace in inverted_nsmap: 193 | nsprefix = inverted_nsmap[namespace] 194 | break 195 | self.soup.handle_endtag(name, nsprefix) 196 | if len(self.nsmaps) > 1: 197 | # This tag, or one of its parents, introduced a namespace 198 | # mapping, so pop it off the stack. 199 | self.nsmaps.pop() 200 | 201 | def pi(self, target, data): 202 | self.soup.endData() 203 | self.soup.handle_data(target + ' ' + data) 204 | self.soup.endData(ProcessingInstruction) 205 | 206 | def data(self, content): 207 | self.soup.handle_data(content) 208 | 209 | def doctype(self, name, pubid, system): 210 | self.soup.endData() 211 | doctype = Doctype.for_name_and_ids(name, pubid, system) 212 | self.soup.object_was_parsed(doctype) 213 | 214 | def comment(self, content): 215 | "Handle comments as Comment objects." 
216 | self.soup.endData() 217 | self.soup.handle_data(content) 218 | self.soup.endData(Comment) 219 | 220 | def test_fragment_to_document(self, fragment): 221 | """See `TreeBuilder`.""" 222 | return '\n%s' % fragment 223 | 224 | 225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 226 | 227 | NAME = LXML 228 | ALTERNATE_NAMES = ["lxml-html"] 229 | 230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] 231 | is_xml = False 232 | 233 | def default_parser(self, encoding): 234 | return etree.HTMLParser 235 | 236 | def feed(self, markup): 237 | encoding = self.soup.original_encoding 238 | try: 239 | self.parser = self.parser_for(encoding) 240 | self.parser.feed(markup) 241 | self.parser.close() 242 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 243 | raise ParserRejectedMarkup(str(e)) 244 | 245 | 246 | def test_fragment_to_document(self, fragment): 247 | """See `TreeBuilder`.""" 248 | return '%s' % fragment 249 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/bs4/diagnose.py: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | import cProfile 3 | from io import StringIO 4 | from html.parser import HTMLParser 5 | import bs4 6 | from bs4 import BeautifulSoup, __version__ 7 | from bs4.builder import builder_registry 8 | 9 | import os 10 | import pstats 11 | import random 12 | import tempfile 13 | import time 14 | import traceback 15 | import sys 16 | import cProfile 17 | 18 | def diagnose(data): 19 | """Diagnostic suite for isolating common problems.""" 20 | print("Diagnostic running on Beautiful Soup %s" % __version__) 21 | print("Python version %s" % sys.version) 22 | 23 | basic_parsers = ["html.parser", "html5lib", "lxml"] 24 | for name in basic_parsers: 25 | for builder in builder_registry.builders: 26 | if name in builder.features: 27 | break 28 | else: 29 | basic_parsers.remove(name) 30 | print(( 31 | "I noticed that %s is not installed. Installing it may help." % 32 | name)) 33 | 34 | if 'lxml' in basic_parsers: 35 | basic_parsers.append(["lxml", "xml"]) 36 | try: 37 | from lxml import etree 38 | print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) 39 | except ImportError as e: 40 | print ( 41 | "lxml is not installed or couldn't be imported.") 42 | 43 | 44 | if 'html5lib' in basic_parsers: 45 | try: 46 | import html5lib 47 | print("Found html5lib version %s" % html5lib.__version__) 48 | except ImportError as e: 49 | print ( 50 | "html5lib is not installed or couldn't be imported.") 51 | 52 | if hasattr(data, 'read'): 53 | data = data.read() 54 | elif os.path.exists(data): 55 | print('"%s" looks like a filename. Reading data from the file.' % data) 56 | data = open(data).read() 57 | elif data.startswith("http:") or data.startswith("https:"): 58 | print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) 59 | print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") 60 | return 61 | print() 62 | 63 | for parser in basic_parsers: 64 | print("Trying to parse your markup with %s" % parser) 65 | success = False 66 | try: 67 | soup = BeautifulSoup(data, parser) 68 | success = True 69 | except Exception as e: 70 | print("%s could not parse the markup." 
% parser) 71 | traceback.print_exc() 72 | if success: 73 | print("Here's what %s did with the markup:" % parser) 74 | print(soup.prettify()) 75 | 76 | print("-" * 80) 77 | 78 | def lxml_trace(data, html=True, **kwargs): 79 | """Print out the lxml events that occur during parsing. 80 | 81 | This lets you see how lxml parses a document when no Beautiful 82 | Soup code is running. 83 | """ 84 | from lxml import etree 85 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 86 | print(("%s, %4s, %s" % (event, element.tag, element.text))) 87 | 88 | class AnnouncingParser(HTMLParser): 89 | """Announces HTMLParser parse events, without doing anything else.""" 90 | 91 | def _p(self, s): 92 | print(s) 93 | 94 | def handle_starttag(self, name, attrs): 95 | self._p("%s START" % name) 96 | 97 | def handle_endtag(self, name): 98 | self._p("%s END" % name) 99 | 100 | def handle_data(self, data): 101 | self._p("%s DATA" % data) 102 | 103 | def handle_charref(self, name): 104 | self._p("%s CHARREF" % name) 105 | 106 | def handle_entityref(self, name): 107 | self._p("%s ENTITYREF" % name) 108 | 109 | def handle_comment(self, data): 110 | self._p("%s COMMENT" % data) 111 | 112 | def handle_decl(self, data): 113 | self._p("%s DECL" % data) 114 | 115 | def unknown_decl(self, data): 116 | self._p("%s UNKNOWN-DECL" % data) 117 | 118 | def handle_pi(self, data): 119 | self._p("%s PI" % data) 120 | 121 | def htmlparser_trace(data): 122 | """Print out the HTMLParser events that occur during parsing. 123 | 124 | This lets you see how HTMLParser parses a document when no 125 | Beautiful Soup code is running. 126 | """ 127 | parser = AnnouncingParser() 128 | parser.feed(data) 129 | 130 | _vowels = "aeiou" 131 | _consonants = "bcdfghjklmnpqrstvwxyz" 132 | 133 | def rword(length=5): 134 | "Generate a random word-like string." 135 | s = '' 136 | for i in range(length): 137 | if i % 2 == 0: 138 | t = _consonants 139 | else: 140 | t = _vowels 141 | s += random.choice(t) 142 | return s 143 | 144 | def rsentence(length=4): 145 | "Generate a random sentence-like string." 146 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 147 | 148 | def rdoc(num_elements=1000): 149 | """Randomly generate an invalid HTML document.""" 150 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 151 | elements = [] 152 | for i in range(num_elements): 153 | choice = random.randint(0,3) 154 | if choice == 0: 155 | # New tag. 156 | tag_name = random.choice(tag_names) 157 | elements.append("<%s>" % tag_name) 158 | elif choice == 1: 159 | elements.append(rsentence(random.randint(1,4))) 160 | elif choice == 2: 161 | # Close a tag. 162 | tag_name = random.choice(tag_names) 163 | elements.append("" % tag_name) 164 | return "" + "\n".join(elements) + "" 165 | 166 | def benchmark_parsers(num_elements=100000): 167 | """Very basic head-to-head performance benchmark.""" 168 | print("Comparative parser benchmark on Beautiful Soup %s" % __version__) 169 | data = rdoc(num_elements) 170 | print("Generated a large invalid HTML document (%d bytes)." % len(data)) 171 | 172 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 173 | success = False 174 | try: 175 | a = time.time() 176 | soup = BeautifulSoup(data, parser) 177 | b = time.time() 178 | success = True 179 | except Exception as e: 180 | print("%s could not parse the markup." % parser) 181 | traceback.print_exc() 182 | if success: 183 | print("BS4+%s parsed the markup in %.2fs." 
% (parser, b-a)) 184 | 185 | from lxml import etree 186 | a = time.time() 187 | etree.HTML(data) 188 | b = time.time() 189 | print("Raw lxml parsed the markup in %.2fs." % (b-a)) 190 | 191 | import html5lib 192 | parser = html5lib.HTMLParser() 193 | a = time.time() 194 | parser.parse(data) 195 | b = time.time() 196 | print("Raw html5lib parsed the markup in %.2fs." % (b-a)) 197 | 198 | def profile(num_elements=100000, parser="lxml"): 199 | 200 | filehandle = tempfile.NamedTemporaryFile() 201 | filename = filehandle.name 202 | 203 | data = rdoc(num_elements) 204 | vars = dict(bs4=bs4, data=data, parser=parser) 205 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) 206 | 207 | stats = pstats.Stats(filename) 208 | # stats.strip_dirs() 209 | stats.sort_stats("cumulative") 210 | stats.print_stats('_html5lib|bs4', 50) 211 | 212 | if __name__ == '__main__': 213 | diagnose(sys.stdin.read()) 214 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/bs4/diagnose.py.bak: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | import cProfile 3 | from StringIO import StringIO 4 | from HTMLParser import HTMLParser 5 | import bs4 6 | from bs4 import BeautifulSoup, __version__ 7 | from bs4.builder import builder_registry 8 | 9 | import os 10 | import pstats 11 | import random 12 | import tempfile 13 | import time 14 | import traceback 15 | import sys 16 | import cProfile 17 | 18 | def diagnose(data): 19 | """Diagnostic suite for isolating common problems.""" 20 | print "Diagnostic running on Beautiful Soup %s" % __version__ 21 | print "Python version %s" % sys.version 22 | 23 | basic_parsers = ["html.parser", "html5lib", "lxml"] 24 | for name in basic_parsers: 25 | for builder in builder_registry.builders: 26 | if name in builder.features: 27 | break 28 | else: 29 | basic_parsers.remove(name) 30 | print ( 31 | "I noticed that %s is not installed. Installing it may help." % 32 | name) 33 | 34 | if 'lxml' in basic_parsers: 35 | basic_parsers.append(["lxml", "xml"]) 36 | try: 37 | from lxml import etree 38 | print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) 39 | except ImportError, e: 40 | print ( 41 | "lxml is not installed or couldn't be imported.") 42 | 43 | 44 | if 'html5lib' in basic_parsers: 45 | try: 46 | import html5lib 47 | print "Found html5lib version %s" % html5lib.__version__ 48 | except ImportError, e: 49 | print ( 50 | "html5lib is not installed or couldn't be imported.") 51 | 52 | if hasattr(data, 'read'): 53 | data = data.read() 54 | elif os.path.exists(data): 55 | print '"%s" looks like a filename. Reading data from the file.' % data 56 | data = open(data).read() 57 | elif data.startswith("http:") or data.startswith("https:"): 58 | print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data 59 | print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." 60 | return 61 | print 62 | 63 | for parser in basic_parsers: 64 | print "Trying to parse your markup with %s" % parser 65 | success = False 66 | try: 67 | soup = BeautifulSoup(data, parser) 68 | success = True 69 | except Exception, e: 70 | print "%s could not parse the markup." 
% parser 71 | traceback.print_exc() 72 | if success: 73 | print "Here's what %s did with the markup:" % parser 74 | print soup.prettify() 75 | 76 | print "-" * 80 77 | 78 | def lxml_trace(data, html=True, **kwargs): 79 | """Print out the lxml events that occur during parsing. 80 | 81 | This lets you see how lxml parses a document when no Beautiful 82 | Soup code is running. 83 | """ 84 | from lxml import etree 85 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 86 | print("%s, %4s, %s" % (event, element.tag, element.text)) 87 | 88 | class AnnouncingParser(HTMLParser): 89 | """Announces HTMLParser parse events, without doing anything else.""" 90 | 91 | def _p(self, s): 92 | print(s) 93 | 94 | def handle_starttag(self, name, attrs): 95 | self._p("%s START" % name) 96 | 97 | def handle_endtag(self, name): 98 | self._p("%s END" % name) 99 | 100 | def handle_data(self, data): 101 | self._p("%s DATA" % data) 102 | 103 | def handle_charref(self, name): 104 | self._p("%s CHARREF" % name) 105 | 106 | def handle_entityref(self, name): 107 | self._p("%s ENTITYREF" % name) 108 | 109 | def handle_comment(self, data): 110 | self._p("%s COMMENT" % data) 111 | 112 | def handle_decl(self, data): 113 | self._p("%s DECL" % data) 114 | 115 | def unknown_decl(self, data): 116 | self._p("%s UNKNOWN-DECL" % data) 117 | 118 | def handle_pi(self, data): 119 | self._p("%s PI" % data) 120 | 121 | def htmlparser_trace(data): 122 | """Print out the HTMLParser events that occur during parsing. 123 | 124 | This lets you see how HTMLParser parses a document when no 125 | Beautiful Soup code is running. 126 | """ 127 | parser = AnnouncingParser() 128 | parser.feed(data) 129 | 130 | _vowels = "aeiou" 131 | _consonants = "bcdfghjklmnpqrstvwxyz" 132 | 133 | def rword(length=5): 134 | "Generate a random word-like string." 135 | s = '' 136 | for i in range(length): 137 | if i % 2 == 0: 138 | t = _consonants 139 | else: 140 | t = _vowels 141 | s += random.choice(t) 142 | return s 143 | 144 | def rsentence(length=4): 145 | "Generate a random sentence-like string." 146 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 147 | 148 | def rdoc(num_elements=1000): 149 | """Randomly generate an invalid HTML document.""" 150 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 151 | elements = [] 152 | for i in range(num_elements): 153 | choice = random.randint(0,3) 154 | if choice == 0: 155 | # New tag. 156 | tag_name = random.choice(tag_names) 157 | elements.append("<%s>" % tag_name) 158 | elif choice == 1: 159 | elements.append(rsentence(random.randint(1,4))) 160 | elif choice == 2: 161 | # Close a tag. 162 | tag_name = random.choice(tag_names) 163 | elements.append("" % tag_name) 164 | return "" + "\n".join(elements) + "" 165 | 166 | def benchmark_parsers(num_elements=100000): 167 | """Very basic head-to-head performance benchmark.""" 168 | print "Comparative parser benchmark on Beautiful Soup %s" % __version__ 169 | data = rdoc(num_elements) 170 | print "Generated a large invalid HTML document (%d bytes)." % len(data) 171 | 172 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 173 | success = False 174 | try: 175 | a = time.time() 176 | soup = BeautifulSoup(data, parser) 177 | b = time.time() 178 | success = True 179 | except Exception, e: 180 | print "%s could not parse the markup." % parser 181 | traceback.print_exc() 182 | if success: 183 | print "BS4+%s parsed the markup in %.2fs." 
% (parser, b-a) 184 | 185 | from lxml import etree 186 | a = time.time() 187 | etree.HTML(data) 188 | b = time.time() 189 | print "Raw lxml parsed the markup in %.2fs." % (b-a) 190 | 191 | import html5lib 192 | parser = html5lib.HTMLParser() 193 | a = time.time() 194 | parser.parse(data) 195 | b = time.time() 196 | print "Raw html5lib parsed the markup in %.2fs." % (b-a) 197 | 198 | def profile(num_elements=100000, parser="lxml"): 199 | 200 | filehandle = tempfile.NamedTemporaryFile() 201 | filename = filehandle.name 202 | 203 | data = rdoc(num_elements) 204 | vars = dict(bs4=bs4, data=data, parser=parser) 205 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) 206 | 207 | stats = pstats.Stats(filename) 208 | # stats.strip_dirs() 209 | stats.sort_stats("cumulative") 210 | stats.print_stats('_html5lib|bs4', 50) 211 | 212 | if __name__ == '__main__': 213 | diagnose(sys.stdin.read()) 214 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/bs4/tests/__init__.py: -------------------------------------------------------------------------------- 1 | "The beautifulsoup tests." 2 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/bs4/tests/test_builder_registry.py: -------------------------------------------------------------------------------- 1 | """Tests of the builder registry.""" 2 | 3 | import unittest 4 | import warnings 5 | 6 | from bs4 import BeautifulSoup 7 | from bs4.builder import ( 8 | builder_registry as registry, 9 | HTMLParserTreeBuilder, 10 | TreeBuilderRegistry, 11 | ) 12 | 13 | try: 14 | from bs4.builder import HTML5TreeBuilder 15 | HTML5LIB_PRESENT = True 16 | except ImportError: 17 | HTML5LIB_PRESENT = False 18 | 19 | try: 20 | from bs4.builder import ( 21 | LXMLTreeBuilderForXML, 22 | LXMLTreeBuilder, 23 | ) 24 | LXML_PRESENT = True 25 | except ImportError: 26 | LXML_PRESENT = False 27 | 28 | 29 | class BuiltInRegistryTest(unittest.TestCase): 30 | """Test the built-in registry with the default builders registered.""" 31 | 32 | def test_combination(self): 33 | if LXML_PRESENT: 34 | self.assertEqual(registry.lookup('fast', 'html'), 35 | LXMLTreeBuilder) 36 | 37 | if LXML_PRESENT: 38 | self.assertEqual(registry.lookup('permissive', 'xml'), 39 | LXMLTreeBuilderForXML) 40 | self.assertEqual(registry.lookup('strict', 'html'), 41 | HTMLParserTreeBuilder) 42 | if HTML5LIB_PRESENT: 43 | self.assertEqual(registry.lookup('html5lib', 'html'), 44 | HTML5TreeBuilder) 45 | 46 | def test_lookup_by_markup_type(self): 47 | if LXML_PRESENT: 48 | self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) 49 | self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) 50 | else: 51 | self.assertEqual(registry.lookup('xml'), None) 52 | if HTML5LIB_PRESENT: 53 | self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) 54 | else: 55 | self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) 56 | 57 | def test_named_library(self): 58 | if LXML_PRESENT: 59 | self.assertEqual(registry.lookup('lxml', 'xml'), 60 | LXMLTreeBuilderForXML) 61 | self.assertEqual(registry.lookup('lxml', 'html'), 62 | LXMLTreeBuilder) 63 | if HTML5LIB_PRESENT: 64 | self.assertEqual(registry.lookup('html5lib'), 65 | HTML5TreeBuilder) 66 | 67 | self.assertEqual(registry.lookup('html.parser'), 68 | HTMLParserTreeBuilder) 69 | 70 | def test_beautifulsoup_constructor_does_lookup(self): 71 | 72 | with 
warnings.catch_warnings(record=True) as w: 73 | # This will create a warning about not explicitly 74 | # specifying a parser, but we'll ignore it. 75 | 76 | # You can pass in a string. 77 | BeautifulSoup("", features="html") 78 | # Or a list of strings. 79 | BeautifulSoup("", features=["html", "fast"]) 80 | 81 | # You'll get an exception if BS can't find an appropriate 82 | # builder. 83 | self.assertRaises(ValueError, BeautifulSoup, 84 | "", features="no-such-feature") 85 | 86 | class RegistryTest(unittest.TestCase): 87 | """Test the TreeBuilderRegistry class in general.""" 88 | 89 | def setUp(self): 90 | self.registry = TreeBuilderRegistry() 91 | 92 | def builder_for_features(self, *feature_list): 93 | cls = type('Builder_' + '_'.join(feature_list), 94 | (object,), {'features' : feature_list}) 95 | 96 | self.registry.register(cls) 97 | return cls 98 | 99 | def test_register_with_no_features(self): 100 | builder = self.builder_for_features() 101 | 102 | # Since the builder advertises no features, you can't find it 103 | # by looking up features. 104 | self.assertEqual(self.registry.lookup('foo'), None) 105 | 106 | # But you can find it by doing a lookup with no features, if 107 | # this happens to be the only registered builder. 108 | self.assertEqual(self.registry.lookup(), builder) 109 | 110 | def test_register_with_features_makes_lookup_succeed(self): 111 | builder = self.builder_for_features('foo', 'bar') 112 | self.assertEqual(self.registry.lookup('foo'), builder) 113 | self.assertEqual(self.registry.lookup('bar'), builder) 114 | 115 | def test_lookup_fails_when_no_builder_implements_feature(self): 116 | builder = self.builder_for_features('foo', 'bar') 117 | self.assertEqual(self.registry.lookup('baz'), None) 118 | 119 | def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): 120 | builder1 = self.builder_for_features('foo') 121 | builder2 = self.builder_for_features('bar') 122 | self.assertEqual(self.registry.lookup(), builder2) 123 | 124 | def test_lookup_fails_when_no_tree_builders_registered(self): 125 | self.assertEqual(self.registry.lookup(), None) 126 | 127 | def test_lookup_gets_most_recent_builder_supporting_all_features(self): 128 | has_one = self.builder_for_features('foo') 129 | has_the_other = self.builder_for_features('bar') 130 | has_both_early = self.builder_for_features('foo', 'bar', 'baz') 131 | has_both_late = self.builder_for_features('foo', 'bar', 'quux') 132 | lacks_one = self.builder_for_features('bar') 133 | has_the_other = self.builder_for_features('foo') 134 | 135 | # There are two builders featuring 'foo' and 'bar', but 136 | # the one that also features 'quux' was registered later. 137 | self.assertEqual(self.registry.lookup('foo', 'bar'), 138 | has_both_late) 139 | 140 | # There is only one builder featuring 'foo', 'bar', and 'baz'. 141 | self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), 142 | has_both_early) 143 | 144 | def test_lookup_fails_when_cannot_reconcile_requested_features(self): 145 | builder1 = self.builder_for_features('foo', 'bar') 146 | builder2 = self.builder_for_features('foo', 'baz') 147 | self.assertEqual(self.registry.lookup('bar', 'baz'), None) 148 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/bs4/tests/test_docs.py: -------------------------------------------------------------------------------- 1 | "Test harness for doctests." 
2 | 3 | # pylint: disable-msg=E0611,W0142 4 | 5 | __metaclass__ = type 6 | __all__ = [ 7 | 'additional_tests', 8 | ] 9 | 10 | import atexit 11 | import doctest 12 | import os 13 | #from pkg_resources import ( 14 | # resource_filename, resource_exists, resource_listdir, cleanup_resources) 15 | import unittest 16 | 17 | DOCTEST_FLAGS = ( 18 | doctest.ELLIPSIS | 19 | doctest.NORMALIZE_WHITESPACE | 20 | doctest.REPORT_NDIFF) 21 | 22 | 23 | # def additional_tests(): 24 | # "Run the doc tests (README.txt and docs/*, if any exist)" 25 | # doctest_files = [ 26 | # os.path.abspath(resource_filename('bs4', 'README.txt'))] 27 | # if resource_exists('bs4', 'docs'): 28 | # for name in resource_listdir('bs4', 'docs'): 29 | # if name.endswith('.txt'): 30 | # doctest_files.append( 31 | # os.path.abspath( 32 | # resource_filename('bs4', 'docs/%s' % name))) 33 | # kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) 34 | # atexit.register(cleanup_resources) 35 | # return unittest.TestSuite(( 36 | # doctest.DocFileSuite(*doctest_files, **kwargs))) 37 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/bs4/tests/test_html5lib.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError as e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "

A bold statement.

" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib inserts tags where other parsers don't.""" 42 | markup = ('' 43 | '' 44 | "') 48 | 49 | self.assertSoupEquals( 50 | markup, 51 | '
Here's another table:" 45 | '' 46 | '' 47 | '
foo
Here\'s another table:' 52 | '
foo
' 53 | '
') 54 | 55 | self.assertSoupEquals( 56 | "" 57 | "" 58 | "
Foo
Bar
Baz
") 59 | 60 | def test_xml_declaration_followed_by_doctype(self): 61 | markup = ''' 62 | 63 | 64 | 65 | 66 | 67 |

foo

68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach the

tag; this means the tree is connected. 72 | self.assertEqual(b"

foo

", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = '

foo

\n

bar

' 76 | soup = self.soup(markup) 77 | self.assertEqual("

foo

\n

bar

", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = '

foo

\n

bar

\n' 83 | soup = self.soup(markup) 84 | self.assertEqual("

foo

\n

bar

\n", soup.body.decode()) 85 | self.assertEqual(2, len(soup.find_all('p'))) 86 | 87 | def test_processing_instruction(self): 88 | """Processing instructions become comments.""" 89 | markup = b"""""" 90 | soup = self.soup(markup) 91 | assert str(soup).startswith("") 92 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/bs4/tests/test_html5lib.py.bak: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError, e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "

A bold statement.

" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib inserts tags where other parsers don't.""" 42 | markup = ('' 43 | '' 44 | "') 48 | 49 | self.assertSoupEquals( 50 | markup, 51 | '
Here's another table:" 45 | '' 46 | '' 47 | '
foo
Here\'s another table:' 52 | '
foo
' 53 | '
') 54 | 55 | self.assertSoupEquals( 56 | "" 57 | "" 58 | "
Foo
Bar
Baz
") 59 | 60 | def test_xml_declaration_followed_by_doctype(self): 61 | markup = ''' 62 | 63 | 64 | 65 | 66 | 67 |

foo

68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach the

tag; this means the tree is connected. 72 | self.assertEqual(b"

foo

", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = '

foo

\n

bar

' 76 | soup = self.soup(markup) 77 | self.assertEqual(u"

foo

\n

bar

", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = '

foo

\n

bar

\n' 83 | soup = self.soup(markup) 84 | self.assertEqual(u"

foo

\n

bar

\n", soup.body.decode()) 85 | self.assertEqual(2, len(soup.find_all('p'))) 86 | 87 | def test_processing_instruction(self): 88 | """Processing instructions become comments.""" 89 | markup = b"""""" 90 | soup = self.soup(markup) 91 | assert str(soup).startswith("") 92 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/bs4/tests/test_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html.parser tree builder generates good 2 | trees.""" 3 | 4 | from pdb import set_trace 5 | import pickle 6 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest 7 | from bs4.builder import HTMLParserTreeBuilder 8 | 9 | class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 10 | 11 | @property 12 | def default_builder(self): 13 | return HTMLParserTreeBuilder() 14 | 15 | def test_namespaced_system_doctype(self): 16 | # html.parser can't handle namespaced doctypes, so skip this one. 17 | pass 18 | 19 | def test_namespaced_public_doctype(self): 20 | # html.parser can't handle namespaced doctypes, so skip this one. 21 | pass 22 | 23 | def test_builder_is_pickled(self): 24 | """Unlike most tree builders, HTMLParserTreeBuilder and will 25 | be restored after pickling. 26 | """ 27 | tree = self.soup("foo") 28 | dumped = pickle.dumps(tree, 2) 29 | loaded = pickle.loads(dumped) 30 | self.assertTrue(isinstance(loaded.builder, type(tree.builder))) 31 | 32 | 33 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/bs4/tests/test_lxml.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError as e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "

foo�bar

", "

foobar

") 44 | self.assertSoupEquals( 45 | "

foo�bar

", "

foobar

") 46 | self.assertSoupEquals( 47 | "

foo�bar

", "

foobar

") 48 | 49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this 50 | # test if an old version of lxml is installed. 51 | 52 | @skipIf( 53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0), 54 | "Skipping doctype test for old version of lxml to avoid segfault.") 55 | def test_empty_doctype(self): 56 | soup = self.soup("") 57 | doctype = soup.contents[0] 58 | self.assertEqual("", doctype.strip()) 59 | 60 | def test_beautifulstonesoup_is_xml_parser(self): 61 | # Make sure that the deprecated BSS class uses an xml builder 62 | # if one is installed. 63 | with warnings.catch_warnings(record=True) as w: 64 | soup = BeautifulStoneSoup("") 65 | self.assertEqual("", str(soup.b)) 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 67 | 68 | @skipIf( 69 | not LXML_PRESENT, 70 | "lxml seems not to be present, not testing its XML tree builder.") 71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): 72 | """See ``HTMLTreeBuilderSmokeTest``.""" 73 | 74 | @property 75 | def default_builder(self): 76 | return LXMLTreeBuilderForXML() 77 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/bs4/tests/test_lxml.py.bak: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError, e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "

foo�bar

", "

foobar

") 44 | self.assertSoupEquals( 45 | "

foo�bar

", "

foobar

") 46 | self.assertSoupEquals( 47 | "

foo�bar

", "

foobar

") 48 | 49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this 50 | # test if an old version of lxml is installed. 51 | 52 | @skipIf( 53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0), 54 | "Skipping doctype test for old version of lxml to avoid segfault.") 55 | def test_empty_doctype(self): 56 | soup = self.soup("") 57 | doctype = soup.contents[0] 58 | self.assertEqual("", doctype.strip()) 59 | 60 | def test_beautifulstonesoup_is_xml_parser(self): 61 | # Make sure that the deprecated BSS class uses an xml builder 62 | # if one is installed. 63 | with warnings.catch_warnings(record=True) as w: 64 | soup = BeautifulStoneSoup("") 65 | self.assertEqual(u"", unicode(soup.b)) 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 67 | 68 | @skipIf( 69 | not LXML_PRESENT, 70 | "lxml seems not to be present, not testing its XML tree builder.") 71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): 72 | """See ``HTMLTreeBuilderSmokeTest``.""" 73 | 74 | @property 75 | def default_builder(self): 76 | return LXMLTreeBuilderForXML() 77 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/force.css: -------------------------------------------------------------------------------- 1 | circle.node { 2 | stroke: #fff; 3 | stroke-width: 1.5px; 4 | } 5 | 6 | line.link { 7 | stroke: #999; 8 | stroke-opacity: .6; 9 | } 10 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/force.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Force-Directed Layout 5 | 6 | 7 | 8 | 9 | 10 | 13 |
14 | 15 |

If you don't see a chart above, check the JavaScript console. You may 16 | need to use a different browser.

17 | 18 | 19 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/force.js: -------------------------------------------------------------------------------- 1 | var width = 600, 2 | height = 600; 3 | 4 | var color = d3.scale.category20(); 5 | 6 | var dist = (width + height) / 4; 7 | 8 | var force = d3.layout.force() 9 | .charge(-120) 10 | .linkDistance(dist) 11 | .size([width, height]); 12 | 13 | function getrank(rval) { 14 | return (rval/2.0) + 3; 15 | } 16 | 17 | function getcolor(rval) { 18 | return color(rval); 19 | } 20 | 21 | var svg = d3.select("#chart").append("svg") 22 | .attr("width", width) 23 | .attr("height", height); 24 | 25 | function loadData(json) { 26 | force 27 | .nodes(json.nodes) 28 | .links(json.links); 29 | 30 | var k = Math.sqrt(json.nodes.length / (width * height)); 31 | 32 | force 33 | .charge(-10 / k) 34 | .gravity(100 * k) 35 | .start(); 36 | 37 | var link = svg.selectAll("line.link") 38 | .data(json.links) 39 | .enter().append("line") 40 | .attr("class", "link") 41 | .style("stroke-width", function(d) { return Math.sqrt(d.value); }); 42 | 43 | var node = svg.selectAll("circle.node") 44 | .data(json.nodes) 45 | .enter().append("circle") 46 | .attr("class", "node") 47 | .attr("r", function(d) { return getrank(d.rank); } ) 48 | .style("fill", function(d) { return getcolor(d.rank); }) 49 | .on("dblclick",function(d) { 50 | if ( confirm('Do you want to open '+d.url) ) 51 | window.open(d.url,'_new',''); 52 | d3.event.stopPropagation(); 53 | }) 54 | .call(force.drag); 55 | 56 | node.append("title") 57 | .text(function(d) { return d.url; }); 58 | 59 | force.on("tick", function() { 60 | link.attr("x1", function(d) { return d.source.x; }) 61 | .attr("y1", function(d) { return d.source.y; }) 62 | .attr("x2", function(d) { return d.target.x; }) 63 | .attr("y2", function(d) { return d.target.y; }); 64 | 65 | node.attr("cx", function(d) { return d.x; }) 66 | .attr("cy", function(d) { return d.y; }); 67 | }); 68 | 69 | } 70 | loadData(spiderJson); 71 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/spdump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 7 | FROM Pages JOIN Links ON Pages.id = Links.to_id 8 | WHERE html IS NOT NULL 9 | GROUP BY id ORDER BY inbound DESC''') 10 | 11 | count = 0 12 | for row in cur : 13 | if count < 50 : print(row) 14 | count = count + 1 15 | print(count, 'rows.') 16 | cur.close() 17 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/spider.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import urllib.error 3 | import ssl 4 | from urllib.parse import urljoin 5 | from urllib.parse import urlparse 6 | from urllib.request import urlopen 7 | from bs4 import BeautifulSoup 8 | 9 | # Ignore SSL certificate errors 10 | ctx = ssl.create_default_context() 11 | ctx.check_hostname = False 12 | ctx.verify_mode = ssl.CERT_NONE 13 | 14 | conn = sqlite3.connect('spider.sqlite') 15 | cur = conn.cursor() 16 | 17 | cur.execute('''CREATE TABLE IF NOT EXISTS Pages 18 | (id INTEGER PRIMARY KEY, url TEXT UNIQUE, 
html TEXT, 19 | error INTEGER, old_rank REAL, new_rank REAL)''') 20 | 21 | cur.execute('''CREATE TABLE IF NOT EXISTS Links 22 | (from_id INTEGER, to_id INTEGER)''') 23 | 24 | cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''') 25 | 26 | # Check to see if we are already in progress... 27 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 28 | row = cur.fetchone() 29 | if row is not None: 30 | print("Restarting existing crawl. Remove spider.sqlite to start a fresh crawl.") 31 | else : 32 | starturl = input('Enter web url or enter: ') 33 | if ( len(starturl) < 1 ) : starturl = 'http://www.dr-chuck.com/' 34 | if ( starturl.endswith('/') ) : starturl = starturl[:-1] 35 | web = starturl 36 | if ( starturl.endswith('.htm') or starturl.endswith('.html') ) : 37 | pos = starturl.rfind('/') 38 | web = starturl[:pos] 39 | 40 | if ( len(web) > 1 ) : 41 | cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', ( web, ) ) 42 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) ) 43 | conn.commit() 44 | 45 | # Get the current webs 46 | cur.execute('''SELECT url FROM Webs''') 47 | webs = list() 48 | for row in cur: 49 | webs.append(str(row[0])) 50 | 51 | print(webs) 52 | 53 | many = 0 54 | while True: 55 | if ( many < 1 ) : 56 | sval = input('How many pages:') 57 | if ( len(sval) < 1 ) : break 58 | many = int(sval) 59 | many = many - 1 60 | 61 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 62 | try: 63 | row = cur.fetchone() 64 | # print row 65 | fromid = row[0] 66 | url = row[1] 67 | except: 68 | print('No unretrieved HTML pages found') 69 | many = 0 70 | break 71 | 72 | print(fromid, url, end=' ') 73 | 74 | # If we are retrieving this page, there should be no links from it 75 | cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) ) 76 | try: 77 | document = urlopen(url, context=ctx) 78 | 79 | html = document.read() 80 | if document.getcode() != 200 : 81 | print("Error on page: ",document.getcode()) 82 | cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) ) 83 | 84 | if 'text/html' != document.info().get_content_type() : 85 | print("Ignore non text/html page") 86 | cur.execute('DELETE FROM Pages WHERE url=?', ( url, ) ) 87 | cur.execute('UPDATE Pages SET error=0 WHERE url=?', (url, ) ) 88 | conn.commit() 89 | continue 90 | 91 | print('('+str(len(html))+')', end=' ') 92 | 93 | soup = BeautifulSoup(html, "html.parser") 94 | except KeyboardInterrupt: 95 | print('') 96 | print('Program interrupted by user...') 97 | break 98 | except: 99 | print("Unable to retrieve or parse page") 100 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) ) 101 | conn.commit() 102 | continue 103 | 104 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) ) 105 | cur.execute('UPDATE Pages SET html=? 
WHERE url=?', (memoryview(html), url ) ) 106 | conn.commit() 107 | 108 | # Retrieve all of the anchor tags 109 | tags = soup('a') 110 | count = 0 111 | for tag in tags: 112 | href = tag.get('href', None) 113 | if ( href is None ) : continue 114 | # Resolve relative references like href="/contact" 115 | up = urlparse(href) 116 | if ( len(up.scheme) < 1 ) : 117 | href = urljoin(url, href) 118 | ipos = href.find('#') 119 | if ( ipos > 1 ) : href = href[:ipos] 120 | if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue 121 | if ( href.endswith('/') ) : href = href[:-1] 122 | # print href 123 | if ( len(href) < 1 ) : continue 124 | 125 | # Check if the URL is in any of the webs 126 | found = False 127 | for web in webs: 128 | if ( href.startswith(web) ) : 129 | found = True 130 | break 131 | if not found : continue 132 | 133 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) ) 134 | count = count + 1 135 | conn.commit() 136 | 137 | cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', ( href, )) 138 | try: 139 | row = cur.fetchone() 140 | toid = row[0] 141 | except: 142 | print('Could not retrieve id') 143 | continue 144 | # print fromid, toid 145 | cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', ( fromid, toid ) ) 146 | 147 | 148 | print(count) 149 | 150 | cur.close() 151 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/spjson.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | print("Creating JSON output on spider.js...") 7 | howmany = int(input("How many nodes? 
")) 8 | 9 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 10 | FROM Pages JOIN Links ON Pages.id = Links.to_id 11 | WHERE html IS NOT NULL AND ERROR IS NULL 12 | GROUP BY id ORDER BY id,inbound''') 13 | 14 | fhand = open('spider.js','w') 15 | nodes = list() 16 | maxrank = None 17 | minrank = None 18 | for row in cur : 19 | nodes.append(row) 20 | rank = row[2] 21 | if maxrank is None or maxrank < rank: maxrank = rank 22 | if minrank is None or minrank > rank : minrank = rank 23 | if len(nodes) > howmany : break 24 | 25 | if maxrank == minrank or maxrank is None or minrank is None: 26 | print("Error - please run sprank.py to compute page rank") 27 | quit() 28 | 29 | fhand.write('spiderJson = {"nodes":[\n') 30 | count = 0 31 | map = dict() 32 | ranks = dict() 33 | for row in nodes : 34 | if count > 0 : fhand.write(',\n') 35 | # print row 36 | rank = row[2] 37 | rank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 38 | fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',') 39 | fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}') 40 | map[row[3]] = count 41 | ranks[row[3]] = rank 42 | count = count + 1 43 | fhand.write('],\n') 44 | 45 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 46 | fhand.write('"links":[\n') 47 | 48 | count = 0 49 | for row in cur : 50 | # print row 51 | if row[0] not in map or row[1] not in map : continue 52 | if count > 0 : fhand.write(',\n') 53 | rank = ranks[row[0]] 54 | srank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 55 | fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}') 56 | count = count + 1 57 | fhand.write(']};') 58 | fhand.close() 59 | cur.close() 60 | 61 | print("Open force.html in a browser to view the visualization") 62 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/sprank.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | # Find the ids that send out page rank - we only are interested 7 | # in pages in the SCC that have in and out links 8 | cur.execute('''SELECT DISTINCT from_id FROM Links''') 9 | from_ids = list() 10 | for row in cur: 11 | from_ids.append(row[0]) 12 | 13 | # Find the ids that receive page rank 14 | to_ids = list() 15 | links = list() 16 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 17 | for row in cur: 18 | from_id = row[0] 19 | to_id = row[1] 20 | if from_id == to_id : continue 21 | if from_id not in from_ids : continue 22 | if to_id not in from_ids : continue 23 | links.append(row) 24 | if to_id not in to_ids : to_ids.append(to_id) 25 | 26 | # Get latest page ranks for strongly connected component 27 | prev_ranks = dict() 28 | for node in from_ids: 29 | cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node, )) 30 | row = cur.fetchone() 31 | prev_ranks[node] = row[0] 32 | 33 | sval = input('How many iterations:') 34 | many = 1 35 | if ( len(sval) > 0 ) : many = int(sval) 36 | 37 | # Sanity check 38 | if len(prev_ranks) < 1 : 39 | print("Nothing to page rank. 
Check data.") 40 | quit() 41 | 42 | # Lets do Page Rank in memory so it is really fast 43 | for i in range(many): 44 | # print prev_ranks.items()[:5] 45 | next_ranks = dict(); 46 | total = 0.0 47 | for (node, old_rank) in list(prev_ranks.items()): 48 | total = total + old_rank 49 | next_ranks[node] = 0.0 50 | # print total 51 | 52 | # Find the number of outbound links and sent the page rank down each 53 | for (node, old_rank) in list(prev_ranks.items()): 54 | # print node, old_rank 55 | give_ids = list() 56 | for (from_id, to_id) in links: 57 | if from_id != node : continue 58 | # print ' ',from_id,to_id 59 | 60 | if to_id not in to_ids: continue 61 | give_ids.append(to_id) 62 | if ( len(give_ids) < 1 ) : continue 63 | amount = old_rank / len(give_ids) 64 | # print node, old_rank,amount, give_ids 65 | 66 | for id in give_ids: 67 | next_ranks[id] = next_ranks[id] + amount 68 | 69 | newtot = 0 70 | for (node, next_rank) in list(next_ranks.items()): 71 | newtot = newtot + next_rank 72 | evap = (total - newtot) / len(next_ranks) 73 | 74 | # print newtot, evap 75 | for node in next_ranks: 76 | next_ranks[node] = next_ranks[node] + evap 77 | 78 | newtot = 0 79 | for (node, next_rank) in list(next_ranks.items()): 80 | newtot = newtot + next_rank 81 | 82 | # Compute the per-page average change from old rank to new rank 83 | # As indication of convergence of the algorithm 84 | totdiff = 0 85 | for (node, old_rank) in list(prev_ranks.items()): 86 | new_rank = next_ranks[node] 87 | diff = abs(old_rank-new_rank) 88 | totdiff = totdiff + diff 89 | 90 | avediff = totdiff / len(prev_ranks) 91 | print(i+1, avediff) 92 | 93 | # rotate 94 | prev_ranks = next_ranks 95 | 96 | # Put the final ranks back into the database 97 | print(list(next_ranks.items())[:5]) 98 | cur.execute('''UPDATE Pages SET old_rank=new_rank''') 99 | for (id, new_rank) in list(next_ranks.items()) : 100 | cur.execute('''UPDATE Pages SET new_rank=? 
WHERE id=?''', (new_rank, id)) 101 | conn.commit() 102 | cur.close() 103 | 104 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 2/spreset.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''UPDATE Pages SET new_rank=1.0, old_rank=0.0''') 7 | conn.commit() 8 | 9 | cur.close() 10 | 11 | print("All pages set to a rank of 1.0") 12 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/1.jpg -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/2.jpg -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/3.jpg -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/4.jpg -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/README.txt: -------------------------------------------------------------------------------- 1 | Analyzing an EMAIL Archive from gmane and vizualizing the data 2 | using the D3 JavaScript library 3 | 4 | This is a set of tools that allow you to pull down an archive 5 | of a gmane repository using the instructions at: 6 | 7 | http://gmane.org/export.php 8 | 9 | In order not to overwhelm the gmane.org server, I have put up 10 | my own copy of the messages at: 11 | 12 | http://mbox.dr-chuck.net/ 13 | 14 | This server will be faster and take a lot of load off the 15 | gmane.org server. 16 | 17 | You should install the SQLite browser to view and modify the databases from: 18 | 19 | http://sqlitebrowser.org/ 20 | 21 | The first step is to spider the gmane repository. The base URL 22 | is hard-coded in the gmane.py and is hard-coded to the Sakai 23 | developer list. You can spider another repository by changing that 24 | base url. Make sure to delete the content.sqlite file if you 25 | switch the base url. 
The gmane.py file operates as a spider in 26 | that it runs slowly and retrieves one mail message per second so 27 | as to avoid getting throttled by gmane.org. It stores all of 28 | its data in a database and can be interrupted and re-started 29 | as often as needed. It may take many hours to pull all the data 30 | down. So you may need to restart several times. 31 | 32 | To give you a head-start, I have put up 600MB of pre-spidered Sakai 33 | email here: 34 | 35 | https://online.dr-chuck.com/files/sakai/email/content.sqlite 36 | 37 | If you download this, you can "catch up with the latest" by 38 | running gmane.py. 39 | 40 | Navigate to the folder where you extracted the gmane.zip 41 | 42 | Note: Windows has difficulty in displaying UTF-8 characters 43 | in the console so for each console window you open, you may need 44 | to type the following command before running this code: 45 | 46 | chcp 65001 47 | 48 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 49 | 50 | Here is a run of gmane.py getting the last five messages of the 51 | sakai developer list: 52 | 53 | Mac: python3 gmane.py 54 | Win: gmane.py 55 | 56 | How many messages:10 57 | http://mbox.dr-chuck.net/sakai.devel/1/2 2662 58 | ggolden@umich.edu 2005-12-08T23:34:30-06:00 call for participation: developers documentation 59 | http://mbox.dr-chuck.net/sakai.devel/2/3 2434 60 | csev@umich.edu 2005-12-09T00:58:01-05:00 report from the austin conference: sakai developers break into song 61 | http://mbox.dr-chuck.net/sakai.devel/3/4 3055 62 | kevin.carpenter@rsmart.com 2005-12-09T09:01:49-07:00 cas and sakai 1.5 63 | http://mbox.dr-chuck.net/sakai.devel/4/5 11721 64 | michael.feldstein@suny.edu 2005-12-09T09:43:12-05:00 re: lms/vle rants/comments 65 | http://mbox.dr-chuck.net/sakai.devel/5/6 9443 66 | john@caret.cam.ac.uk 2005-12-09T13:32:29+00:00 re: lms/vle rants/comments 67 | Does not start with From 68 | 69 | The program scans content.sqlite from 1 up to the first message number not 70 | already spidered and starts spidering at that message. It continues spidering 71 | until it has spidered the desired number of messages or it reaches a page 72 | that does not appear to be a properly formatted message. 73 | 74 | Sometimes gmane.org is missing a message. Perhaps administrators can delete messages 75 | or perhaps they get lost - I don't know. If your spider stops, and it seems it has hit 76 | a missing message, go into the SQLite Manager and add a row with the missing id - leave 77 | all the other fields blank - and then restart gmane.py. This will unstick the 78 | spidering process and allow it to continue. These empty messages will be ignored in the next 79 | phase of the process. 80 | 81 | One nice thing is that once you have spidered all of the messages and have them in 82 | content.sqlite, you can run gmane.py again to get new messages as they get sent to the 83 | list. gmane.py will quickly scan to the end of the already-spidered pages and check 84 | if there are new messages and then quickly retrieve those messages and add them 85 | to content.sqlite. 86 | 87 | The content.sqlite data is pretty raw, with an innefficient data model, and not compressed. 88 | This is intentional as it allows you to look at content.sqlite to debug the process. 89 | It would be a bad idea to run any queries against this database as they would be 90 | slow. 91 | 92 | The second process is running the program gmodel.py. 
gmodel.py reads the rough/raw 93 | data from content.sqlite and produces a cleaned-up and well-modeled version of the 94 | data in the file index.sqlite. The file index.sqlite will be much smaller (often 10X 95 | smaller) than content.sqlite because it also compresses the header and body text. 96 | 97 | Each time gmodel.py runs - it completely wipes out and re-builds index.sqlite, allowing 98 | you to adjust its parameters and edit the mapping tables in content.sqlite to tweak the 99 | data cleaning process. 100 | 101 | Running gmodel.py works as follows: 102 | 103 | Mac: python3 gmodel.py 104 | Win: gmodel.py 105 | 106 | Loaded allsenders 1588 and mapping 28 dns mapping 1 107 | 1 2005-12-08T23:34:30-06:00 ggolden22@mac.com 108 | 251 2005-12-22T10:03:20-08:00 tpamsler@ucdavis.edu 109 | 501 2006-01-12T11:17:34-05:00 lance@indiana.edu 110 | 751 2006-01-24T11:13:28-08:00 vrajgopalan@ucmerced.edu 111 | ... 112 | 113 | The gmodel.py program does a number of data cleaing steps 114 | 115 | Domain names are truncated to two levels for .com, .org, .edu, and .net 116 | other domain names are truncated to three levels. So si.umich.edu becomes 117 | umich.edu and caret.cam.ac.uk becomes cam.ac.uk. Also mail addresses are 118 | forced to lower case and some of the @gmane.org address like the following 119 | 120 | arwhyte-63aXycvo3TyHXe+LvDLADg@public.gmane.org 121 | 122 | are converted to the real address whenever there is a matching real email 123 | address elsewhere in the message corpus. 124 | 125 | If you look in the content.sqlite database there are two tables that allow 126 | you to map both domain names and individual email addresses that change over 127 | the lifetime of the email list. For example, Steve Githens used the following 128 | email addresses over the life of the Sakai developer list: 129 | 130 | s-githens@northwestern.edu 131 | sgithens@cam.ac.uk 132 | swgithen@mtu.edu 133 | 134 | We can add two entries to the Mapping table 135 | 136 | s-githens@northwestern.edu -> swgithen@mtu.edu 137 | sgithens@cam.ac.uk -> swgithen@mtu.edu 138 | 139 | And so all the mail messages will be collected under one sender even if 140 | they used several email addresses over the lifetime of the mailing list. 141 | 142 | You can also make similar entries in the DNSMapping table if there are multiple 143 | DNS names you want mapped to a single DNS. In the Sakai data I add the following 144 | mapping: 145 | 146 | iupui.edu -> indiana.edu 147 | 148 | So all the folks from the various Indiana University campuses are tracked together 149 | 150 | You can re-run the gmodel.py over and over as you look at the data, and add mappings 151 | to make the data cleaner and cleaner. When you are done, you will have a nicely 152 | indexed version of the email in index.sqlite. This is the file to use to do data 153 | analysis. With this file, data analysis will be really quick. 154 | 155 | The first, simplest data analysis is to do a "who does the most" and "which 156 | organzation does the most"? This is done using gbasic.py: 157 | 158 | Mac: python3 gbasic.py 159 | Win: gbasic.py 160 | 161 | How many to dump? 
5 162 | Loaded messages= 51330 subjects= 25033 senders= 1584 163 | 164 | Top 5 Email list participants 165 | steve.swinsburg@gmail.com 2657 166 | azeckoski@unicon.net 1742 167 | ieb@tfd.co.uk 1591 168 | csev@umich.edu 1304 169 | david.horwitz@uct.ac.za 1184 170 | 171 | Top 5 Email list organizations 172 | gmail.com 7339 173 | umich.edu 6243 174 | uct.ac.za 2451 175 | indiana.edu 2258 176 | unicon.net 2055 177 | 178 | You can look at the data in index.sqlite and if you find a problem, you 179 | can update the Mapping table and DNSMapping table in content.sqlite and 180 | re-run gmodel.py. 181 | 182 | There is a simple vizualization of the word frequence in the subject lines 183 | in the file gword.py: 184 | 185 | Mac: python3 gword.py 186 | Win: gword.py 187 | 188 | Range of counts: 33229 129 189 | Output written to gword.js 190 | 191 | This produces the file gword.js which you can visualize using the file 192 | gword.htm. 193 | 194 | A second visualization is in gline.py. It visualizes email participation by 195 | organizations over time. 196 | 197 | Mac: python3 gline.py 198 | Win: gline.py 199 | 200 | Loaded messages= 51330 subjects= 25033 senders= 1584 201 | Top 10 Oranizations 202 | ['gmail.com', 'umich.edu', 'uct.ac.za', 'indiana.edu', 'unicon.net', 'tfd.co.uk', 'berkeley.edu', 'longsight.com', 'stanford.edu', 'ox.ac.uk'] 203 | Output written to gline.js 204 | 205 | Its output is written to gline.js which is visualized using gline.htm. 206 | 207 | Some URLs for visualization ideas: 208 | 209 | https://developers.google.com/chart/ 210 | 211 | https://developers.google.com/chart/interactive/docs/gallery/motionchart 212 | 213 | https://code.google.com/apis/ajax/playground/?type=visualization#motion_chart_time_formats 214 | 215 | https://developers.google.com/chart/interactive/docs/gallery/annotatedtimeline 216 | 217 | http://bost.ocks.org/mike/uberdata/ 218 | 219 | http://mbostock.github.io/d3/talk/20111018/calendar.html 220 | 221 | http://nltk.org/install.html 222 | 223 | As always - comments welcome. 224 | 225 | -- Dr. Chuck 226 | Sun Sep 29 00:11:01 EDT 2013 227 | 228 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/gbasic.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | howmany = int(input("How many to dump? 
")) 6 | 7 | conn = sqlite3.connect('index.sqlite') 8 | cur = conn.cursor() 9 | 10 | cur.execute('SELECT id, sender FROM Senders') 11 | senders = dict() 12 | for message_row in cur : 13 | senders[message_row[0]] = message_row[1] 14 | 15 | cur.execute('SELECT id, subject FROM Subjects') 16 | subjects = dict() 17 | for message_row in cur : 18 | subjects[message_row[0]] = message_row[1] 19 | 20 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 21 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 22 | messages = dict() 23 | for message_row in cur : 24 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 25 | 26 | print("Loaded messages=",len(messages),"subjects=",len(subjects),"senders=",len(senders)) 27 | 28 | sendcounts = dict() 29 | sendorgs = dict() 30 | for (message_id, message) in list(messages.items()): 31 | sender = message[1] 32 | sendcounts[sender] = sendcounts.get(sender,0) + 1 33 | pieces = senders[sender].split("@") 34 | if len(pieces) != 2 : continue 35 | dns = pieces[1] 36 | sendorgs[dns] = sendorgs.get(dns,0) + 1 37 | 38 | print('') 39 | print('Top',howmany,'Email list participants') 40 | 41 | x = sorted(sendcounts, key=sendcounts.get, reverse=True) 42 | for k in x[:howmany]: 43 | print(senders[k], sendcounts[k]) 44 | if sendcounts[k] < 10 : break 45 | 46 | print('') 47 | print('Top',howmany,'Email list organizations') 48 | 49 | x = sorted(sendorgs, key=sendorgs.get, reverse=True) 50 | for k in x[:howmany]: 51 | print(k, sendorgs[k]) 52 | if sendorgs[k] < 10 : break 53 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/gline.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 19 | 20 | 21 |
22 | 23 | 24 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/gline.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | conn = sqlite3.connect('index.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT id, sender FROM Senders') 9 | senders = dict() 10 | for message_row in cur : 11 | senders[message_row[0]] = message_row[1] 12 | 13 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 14 | messages = dict() 15 | for message_row in cur : 16 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 17 | 18 | print("Loaded messages=",len(messages),"senders=",len(senders)) 19 | 20 | sendorgs = dict() 21 | for (message_id, message) in list(messages.items()): 22 | sender = message[1] 23 | pieces = senders[sender].split("@") 24 | if len(pieces) != 2 : continue 25 | dns = pieces[1] 26 | sendorgs[dns] = sendorgs.get(dns,0) + 1 27 | 28 | # pick the top schools 29 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 30 | orgs = orgs[:10] 31 | print("Top 10 Oranizations") 32 | print(orgs) 33 | 34 | counts = dict() 35 | months = list() 36 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 37 | for (message_id, message) in list(messages.items()): 38 | sender = message[1] 39 | pieces = senders[sender].split("@") 40 | if len(pieces) != 2 : continue 41 | dns = pieces[1] 42 | if dns not in orgs : continue 43 | month = message[3][:7] 44 | if month not in months : months.append(month) 45 | key = (month, dns) 46 | counts[key] = counts.get(key,0) + 1 47 | 48 | months.sort() 49 | # print counts 50 | # print months 51 | 52 | fhand = open('gline.js','w') 53 | fhand.write("gline = [ ['Year'") 54 | for org in orgs: 55 | fhand.write(",'"+org+"'") 56 | fhand.write("]") 57 | 58 | for month in months: 59 | fhand.write(",\n['"+month+"'") 60 | for org in orgs: 61 | key = (month, org) 62 | val = counts.get(key,0) 63 | fhand.write(","+str(val)) 64 | fhand.write("]"); 65 | 66 | fhand.write("\n];\n") 67 | fhand.close() 68 | 69 | print("Output written to gline.js") 70 | print("Open gline.htm to visualize the data") 71 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/gmane.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import ssl 4 | import urllib.request, urllib.parse, urllib.error 5 | from urllib.parse import urljoin 6 | from urllib.parse import urlparse 7 | import re 8 | from datetime import datetime, timedelta 9 | 10 | # Not all systems have this so conditionally define parser 11 | try: 12 | import dateutil.parser as parser 13 | except: 14 | pass 15 | 16 | def parsemaildate(md) : 17 | # See if we have dateutil 18 | try: 19 | pdate = parser.parse(tdate) 20 | test_at = pdate.isoformat() 21 | return test_at 22 | except: 23 | pass 24 | 25 | # Non-dateutil version - we try our best 26 | 27 | pieces = md.split() 28 | notz = " ".join(pieces[:4]).strip() 29 | 30 | # Try a bunch of format variations - strptime() is *lame* 31 | dnotz = None 32 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 33 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 34 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 35 | try: 36 | dnotz = datetime.strptime(notz, form) 
37 | break 38 | except: 39 | continue 40 | 41 | if dnotz is None : 42 | # print 'Bad Date:',md 43 | return None 44 | 45 | iso = dnotz.isoformat() 46 | 47 | tz = "+0000" 48 | try: 49 | tz = pieces[4] 50 | ival = int(tz) # Only want numeric timezone values 51 | if tz == '-0000' : tz = '+0000' 52 | tzh = tz[:3] 53 | tzm = tz[3:] 54 | tz = tzh+":"+tzm 55 | except: 56 | pass 57 | 58 | return iso+tz 59 | 60 | # Ignore SSL certificate errors 61 | ctx = ssl.create_default_context() 62 | ctx.check_hostname = False 63 | ctx.verify_mode = ssl.CERT_NONE 64 | 65 | conn = sqlite3.connect('content.sqlite') 66 | cur = conn.cursor() 67 | 68 | baseurl = "http://mbox.dr-chuck.net/sakai.devel/" 69 | 70 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 71 | (id INTEGER UNIQUE, email TEXT, sent_at TEXT, 72 | subject TEXT, headers TEXT, body TEXT)''') 73 | 74 | # Pick up where we left off 75 | start = None 76 | cur.execute('SELECT max(id) FROM Messages' ) 77 | try: 78 | row = cur.fetchone() 79 | if row is None : 80 | start = 0 81 | else: 82 | start = row[0] 83 | except: 84 | start = 0 85 | 86 | if start is None : start = 0 87 | 88 | many = 0 89 | count = 0 90 | fail = 0 91 | while True: 92 | if ( many < 1 ) : 93 | conn.commit() 94 | sval = input('How many messages:') 95 | if ( len(sval) < 1 ) : break 96 | many = int(sval) 97 | 98 | start = start + 1 99 | cur.execute('SELECT id FROM Messages WHERE id=?', (start,) ) 100 | try: 101 | row = cur.fetchone() 102 | if row is not None : continue 103 | except: 104 | row = None 105 | 106 | many = many - 1 107 | url = baseurl + str(start) + '/' + str(start + 1) 108 | 109 | text = "None" 110 | try: 111 | # Open with a timeout of 30 seconds 112 | document = urllib.request.urlopen(url, None, 30, context=ctx) 113 | text = document.read().decode() 114 | if document.getcode() != 200 : 115 | print("Error code=",document.getcode(), url) 116 | break 117 | except KeyboardInterrupt: 118 | print('') 119 | print('Program interrupted by user...') 120 | break 121 | except Exception as e: 122 | print("Unable to retrieve or parse page",url) 123 | print("Error",e) 124 | fail = fail + 1 125 | if fail > 5 : break 126 | continue 127 | 128 | print(url,len(text)) 129 | count = count + 1 130 | 131 | if not text.startswith("From "): 132 | print(text) 133 | print("Did not find From ") 134 | fail = fail + 1 135 | if fail > 5 : break 136 | continue 137 | 138 | pos = text.find("\n\n") 139 | if pos > 0 : 140 | hdr = text[:pos] 141 | body = text[pos+2:] 142 | else: 143 | print(text) 144 | print("Could not find break between headers and body") 145 | fail = fail + 1 146 | if fail > 5 : break 147 | continue 148 | 149 | email = None 150 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 151 | if len(x) == 1 : 152 | email = x[0]; 153 | email = email.strip().lower() 154 | email = email.replace("<","") 155 | else: 156 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 157 | if len(x) == 1 : 158 | email = x[0]; 159 | email = email.strip().lower() 160 | email = email.replace("<","") 161 | 162 | date = None 163 | y = re.findall('\Date: .*, (.*)\n', hdr) 164 | if len(y) == 1 : 165 | tdate = y[0] 166 | tdate = tdate[:26] 167 | try: 168 | sent_at = parsemaildate(tdate) 169 | except: 170 | print(text) 171 | print("Parse fail",tdate) 172 | fail = fail + 1 173 | if fail > 5 : break 174 | continue 175 | 176 | subject = None 177 | z = re.findall('\Subject: (.*)\n', hdr) 178 | if len(z) == 1 : subject = z[0].strip().lower(); 179 | 180 | # Reset the fail counter 181 | fail = 0 182 | print(" ",email,sent_at,subject) 183 | 
cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body) 184 | VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body)) 185 | if count % 50 == 0 : conn.commit() 186 | if count % 100 == 0 : time.sleep(1) 187 | 188 | conn.commit() 189 | cur.close() 190 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/gmodel.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import re 4 | import zlib 5 | from datetime import datetime, timedelta 6 | 7 | # Not all systems have this 8 | try: 9 | import dateutil.parser as parser 10 | except: 11 | pass 12 | 13 | dnsmapping = dict() 14 | mapping = dict() 15 | 16 | def fixsender(sender,allsenders=None) : 17 | global dnsmapping 18 | global mapping 19 | if sender is None : return None 20 | sender = sender.strip().lower() 21 | sender = sender.replace('<','').replace('>','') 22 | 23 | # Check if we have a hacked gmane.org from address 24 | if allsenders is not None and sender.endswith('gmane.org') : 25 | pieces = sender.split('-') 26 | realsender = None 27 | for s in allsenders: 28 | if s.startswith(pieces[0]) : 29 | realsender = sender 30 | sender = s 31 | # print(realsender, sender) 32 | break 33 | if realsender is None : 34 | for s in mapping: 35 | if s.startswith(pieces[0]) : 36 | realsender = sender 37 | sender = mapping[s] 38 | # print(realsender, sender) 39 | break 40 | if realsender is None : sender = pieces[0] 41 | 42 | mpieces = sender.split("@") 43 | if len(mpieces) != 2 : return sender 44 | dns = mpieces[1] 45 | x = dns 46 | pieces = dns.split(".") 47 | if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") : 48 | dns = ".".join(pieces[-2:]) 49 | else: 50 | dns = ".".join(pieces[-3:]) 51 | # if dns != x : print(x,dns) 52 | # if dns != dnsmapping.get(dns,dns) : print(dns,dnsmapping.get(dns,dns)) 53 | dns = dnsmapping.get(dns,dns) 54 | return mpieces[0] + '@' + dns 55 | 56 | def parsemaildate(md) : 57 | # See if we have dateutil 58 | try: 59 | pdate = parser.parse(tdate) 60 | test_at = pdate.isoformat() 61 | return test_at 62 | except: 63 | pass 64 | 65 | # Non-dateutil version - we try our best 66 | 67 | pieces = md.split() 68 | notz = " ".join(pieces[:4]).strip() 69 | 70 | # Try a bunch of format variations - strptime() is *lame* 71 | dnotz = None 72 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 73 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 74 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 75 | try: 76 | dnotz = datetime.strptime(notz, form) 77 | break 78 | except: 79 | continue 80 | 81 | if dnotz is None : 82 | # print('Bad Date:',md) 83 | return None 84 | 85 | iso = dnotz.isoformat() 86 | 87 | tz = "+0000" 88 | try: 89 | tz = pieces[4] 90 | ival = int(tz) # Only want numeric timezone values 91 | if tz == '-0000' : tz = '+0000' 92 | tzh = tz[:3] 93 | tzm = tz[3:] 94 | tz = tzh+":"+tzm 95 | except: 96 | pass 97 | 98 | return iso+tz 99 | 100 | # Parse out the info... 
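# parseheader() pulls the modeled fields out of one raw header block:
# the From: address (normalized through fixsender()), the Date: value
# (normalized through parsemaildate()), the lower-cased Subject: line,
# and the Message-ID, which is used as the guid.  It returns the tuple
# (guid, sender, subject, sent_at), or None when any of the four pieces
# is missing or cannot be parsed.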
101 | def parseheader(hdr, allsenders=None): 102 | if hdr is None or len(hdr) < 1 : return None 103 | sender = None 104 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 105 | if len(x) >= 1 : 106 | sender = x[0] 107 | else: 108 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 109 | if len(x) >= 1 : 110 | sender = x[0] 111 | 112 | # normalize the domain name of Email addresses 113 | sender = fixsender(sender, allsenders) 114 | 115 | date = None 116 | y = re.findall('\nDate: .*, (.*)\n', hdr) 117 | sent_at = None 118 | if len(y) >= 1 : 119 | tdate = y[0] 120 | tdate = tdate[:26] 121 | try: 122 | sent_at = parsemaildate(tdate) 123 | except Exception as e: 124 | # print('Date ignored ',tdate, e) 125 | return None 126 | 127 | subject = None 128 | z = re.findall('\nSubject: (.*)\n', hdr) 129 | if len(z) >= 1 : subject = z[0].strip().lower() 130 | 131 | guid = None 132 | z = re.findall('\nMessage-ID: (.*)\n', hdr) 133 | if len(z) >= 1 : guid = z[0].strip().lower() 134 | 135 | if sender is None or sent_at is None or subject is None or guid is None : 136 | return None 137 | return (guid, sender, subject, sent_at) 138 | 139 | conn = sqlite3.connect('index.sqlite') 140 | cur = conn.cursor() 141 | 142 | cur.execute('''DROP TABLE IF EXISTS Messages ''') 143 | cur.execute('''DROP TABLE IF EXISTS Senders ''') 144 | cur.execute('''DROP TABLE IF EXISTS Subjects ''') 145 | cur.execute('''DROP TABLE IF EXISTS Replies ''') 146 | 147 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 148 | (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER, 149 | sender_id INTEGER, subject_id INTEGER, 150 | headers BLOB, body BLOB)''') 151 | cur.execute('''CREATE TABLE IF NOT EXISTS Senders 152 | (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''') 153 | cur.execute('''CREATE TABLE IF NOT EXISTS Subjects 154 | (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''') 155 | cur.execute('''CREATE TABLE IF NOT EXISTS Replies 156 | (from_id INTEGER, to_id INTEGER)''') 157 | 158 | conn_1 = sqlite3.connect('mapping.sqlite') 159 | cur_1 = conn_1.cursor() 160 | 161 | cur_1.execute('''SELECT old,new FROM DNSMapping''') 162 | for message_row in cur_1 : 163 | dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower() 164 | 165 | mapping = dict() 166 | cur_1.execute('''SELECT old,new FROM Mapping''') 167 | for message_row in cur_1 : 168 | old = fixsender(message_row[0]) 169 | new = fixsender(message_row[1]) 170 | mapping[old] = fixsender(new) 171 | 172 | # Done with mapping.sqlite 173 | conn_1.close() 174 | 175 | # Open the main content (Read only) 176 | conn_1 = sqlite3.connect('file:content.sqlite?mode=ro', uri=True) 177 | cur_1 = conn_1.cursor() 178 | 179 | allsenders = list() 180 | cur_1.execute('''SELECT email FROM Messages''') 181 | for message_row in cur_1 : 182 | sender = fixsender(message_row[0]) 183 | if sender is None : continue 184 | if 'gmane.org' in sender : continue 185 | if sender in allsenders: continue 186 | allsenders.append(sender) 187 | 188 | print("Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping)) 189 | 190 | cur_1.execute('''SELECT headers, body, sent_at 191 | FROM Messages ORDER BY sent_at''') 192 | 193 | senders = dict() 194 | subjects = dict() 195 | guids = dict() 196 | 197 | count = 0 198 | 199 | for message_row in cur_1 : 200 | hdr = message_row[0] 201 | parsed = parseheader(hdr, allsenders) 202 | if parsed is None: continue 203 | (guid, sender, subject, sent_at) = parsed 204 | 205 | # Apply the sender mapping 206 | sender = mapping.get(sender,sender) 207 | 
208 | count = count + 1 209 | if count % 250 == 1 : print(count,sent_at, sender) 210 | # print(guid, sender, subject, sent_at) 211 | 212 | if 'gmane.org' in sender: 213 | print("Error in sender ===", sender) 214 | 215 | sender_id = senders.get(sender,None) 216 | subject_id = subjects.get(subject,None) 217 | guid_id = guids.get(guid,None) 218 | 219 | if sender_id is None : 220 | cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) ) 221 | conn.commit() 222 | cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, )) 223 | try: 224 | row = cur.fetchone() 225 | sender_id = row[0] 226 | senders[sender] = sender_id 227 | except: 228 | print('Could not retrieve sender id',sender) 229 | break 230 | if subject_id is None : 231 | cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) ) 232 | conn.commit() 233 | cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, )) 234 | try: 235 | row = cur.fetchone() 236 | subject_id = row[0] 237 | subjects[subject] = subject_id 238 | except: 239 | print('Could not retrieve subject id',subject) 240 | break 241 | # print(sender_id, subject_id) 242 | cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )', 243 | ( guid, sender_id, subject_id, sent_at, 244 | zlib.compress(message_row[0].encode()), zlib.compress(message_row[1].encode())) ) 245 | conn.commit() 246 | cur.execute('SELECT id FROM Messages WHERE guid=? LIMIT 1', ( guid, )) 247 | try: 248 | row = cur.fetchone() 249 | message_id = row[0] 250 | guids[guid] = message_id 251 | except: 252 | print('Could not retrieve guid id',guid) 253 | break 254 | 255 | cur.close() 256 | cur_1.close() 257 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/gword.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 37 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/gword.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | import string 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, subject FROM Subjects') 10 | subjects = dict() 11 | for message_row in cur : 12 | subjects[message_row[0]] = message_row[1] 13 | 14 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 15 | cur.execute('SELECT subject_id FROM Messages') 16 | counts = dict() 17 | for message_row in cur : 18 | text = subjects[message_row[0]] 19 | text = text.translate(str.maketrans('','',string.punctuation)) 20 | text = text.translate(str.maketrans('','','1234567890')) 21 | text = text.strip() 22 | text = text.lower() 23 | words = text.split() 24 | for word in words: 25 | if len(word) < 4 : continue 26 | counts[word] = counts.get(word,0) + 1 27 | 28 | x = sorted(counts, key=counts.get, reverse=True) 29 | highest = None 30 | lowest = None 31 | for k in x[:100]: 32 | if highest is None or highest < counts[k] : 33 | highest = counts[k] 34 | if lowest is None or lowest > counts[k] : 35 | lowest = counts[k] 36 | print('Range of counts:',highest,lowest) 37 | 38 | # Spread the font sizes across 20-100 based on the count 39 | bigsize = 80 40 | smallsize = 20 41 | 42 | fhand = 
open('gword.js','w') 43 | fhand.write("gword = [") 44 | first = True 45 | for k in x[:100]: 46 | if not first : fhand.write( ",\n") 47 | first = False 48 | size = counts[k] 49 | size = (size - lowest) / float(highest - lowest) 50 | size = int((size * bigsize) + smallsize) 51 | fhand.write("{text: '"+k+"', size: "+str(size)+"}") 52 | fhand.write( "\n];\n") 53 | fhand.close() 54 | 55 | print("Output written to gword.js") 56 | print("Open gword.htm in a browser to see the vizualization") 57 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/gyear.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib.request, urllib.parse, urllib.error 4 | import zlib 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, sender FROM Senders') 10 | senders = dict() 11 | for message_row in cur : 12 | senders[message_row[0]] = message_row[1] 13 | 14 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 15 | messages = dict() 16 | for message_row in cur : 17 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 18 | 19 | print("Loaded messages=",len(messages),"senders=",len(senders)) 20 | 21 | sendorgs = dict() 22 | for (message_id, message) in list(messages.items()): 23 | sender = message[1] 24 | pieces = senders[sender].split("@") 25 | if len(pieces) != 2 : continue 26 | dns = pieces[1] 27 | sendorgs[dns] = sendorgs.get(dns,0) + 1 28 | 29 | # pick the top schools 30 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 31 | orgs = orgs[:10] 32 | print("Top 10 Oranizations") 33 | print(orgs) 34 | # orgs = ['total'] + orgs 35 | 36 | counts = dict() 37 | months = list() 38 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 39 | for (message_id, message) in list(messages.items()): 40 | sender = message[1] 41 | pieces = senders[sender].split("@") 42 | if len(pieces) != 2 : continue 43 | dns = pieces[1] 44 | if dns not in orgs : continue 45 | month = message[3][:4] 46 | if month not in months : months.append(month) 47 | key = (month, dns) 48 | counts[key] = counts.get(key,0) + 1 49 | tkey = (month, 'total') 50 | counts[tkey] = counts.get(tkey,0) + 1 51 | 52 | months.sort() 53 | # print counts 54 | # print months 55 | 56 | fhand = open('gline.js','w') 57 | fhand.write("gline = [ ['Year'") 58 | for org in orgs: 59 | fhand.write(",'"+org+"'") 60 | fhand.write("]") 61 | 62 | for month in months[1:-1]: 63 | fhand.write(",\n['"+month+"'") 64 | for org in orgs: 65 | key = (month, org) 66 | val = counts.get(key,0) 67 | fhand.write(","+str(val)) 68 | fhand.write("]"); 69 | 70 | fhand.write("\n];\n") 71 | fhand.close() 72 | 73 | print("Output written to gline.js") 74 | print("Open gline.htm to visualize the data") 75 | 76 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/mapping.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 4/mapping.sqlite -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/Output1.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/Output1.jpg -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/Output2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/Output2.jpg -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/Output3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/Output3.jpg -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/README.txt: -------------------------------------------------------------------------------- 1 | Analyzing an EMAIL Archive from gmane and vizualizing the data 2 | using the D3 JavaScript library 3 | 4 | This is a set of tools that allow you to pull down an archive 5 | of a gmane repository using the instructions at: 6 | 7 | http://gmane.org/export.php 8 | 9 | In order not to overwhelm the gmane.org server, I have put up 10 | my own copy of the messages at: 11 | 12 | http://mbox.dr-chuck.net/ 13 | 14 | This server will be faster and take a lot of load off the 15 | gmane.org server. 16 | 17 | You should install the SQLite browser to view and modify the databases from: 18 | 19 | http://sqlitebrowser.org/ 20 | 21 | The first step is to spider the gmane repository. The base URL 22 | is hard-coded in the gmane.py and is hard-coded to the Sakai 23 | developer list. You can spider another repository by changing that 24 | base url. Make sure to delete the content.sqlite file if you 25 | switch the base url. The gmane.py file operates as a spider in 26 | that it runs slowly and retrieves one mail message per second so 27 | as to avoid getting throttled by gmane.org. It stores all of 28 | its data in a database and can be interrupted and re-started 29 | as often as needed. It may take many hours to pull all the data 30 | down. So you may need to restart several times. 31 | 32 | To give you a head-start, I have put up 600MB of pre-spidered Sakai 33 | email here: 34 | 35 | https://online.dr-chuck.com/files/sakai/email/content.sqlite 36 | 37 | If you download this, you can "catch up with the latest" by 38 | running gmane.py. 
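Before catching up, it can help to see how far an existing crawl has
gotten. A minimal sketch (not one of the scripts in this folder; it only
assumes the Messages table that gmane.py creates in content.sqlite):

    import sqlite3

    conn = sqlite3.connect('content.sqlite')
    cur = conn.cursor()
    # gmane.py resumes from max(id), so this is where the next run starts
    cur.execute('SELECT COUNT(id), MAX(id) FROM Messages')
    print('messages stored, highest id:', cur.fetchone())
    cur.close()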
39 | 40 | Navigate to the folder where you extracted the gmane.zip 41 | 42 | Note: Windows has difficulty in displaying UTF-8 characters 43 | in the console so for each console window you open, you may need 44 | to type the following command before running this code: 45 | 46 | chcp 65001 47 | 48 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 49 | 50 | Here is a run of gmane.py getting the last five messages of the 51 | sakai developer list: 52 | 53 | Mac: python3 gmane.py 54 | Win: gmane.py 55 | 56 | How many messages:10 57 | http://mbox.dr-chuck.net/sakai.devel/1/2 2662 58 | ggolden@umich.edu 2005-12-08T23:34:30-06:00 call for participation: developers documentation 59 | http://mbox.dr-chuck.net/sakai.devel/2/3 2434 60 | csev@umich.edu 2005-12-09T00:58:01-05:00 report from the austin conference: sakai developers break into song 61 | http://mbox.dr-chuck.net/sakai.devel/3/4 3055 62 | kevin.carpenter@rsmart.com 2005-12-09T09:01:49-07:00 cas and sakai 1.5 63 | http://mbox.dr-chuck.net/sakai.devel/4/5 11721 64 | michael.feldstein@suny.edu 2005-12-09T09:43:12-05:00 re: lms/vle rants/comments 65 | http://mbox.dr-chuck.net/sakai.devel/5/6 9443 66 | john@caret.cam.ac.uk 2005-12-09T13:32:29+00:00 re: lms/vle rants/comments 67 | Does not start with From 68 | 69 | The program scans content.sqlite from 1 up to the first message number not 70 | already spidered and starts spidering at that message. It continues spidering 71 | until it has spidered the desired number of messages or it reaches a page 72 | that does not appear to be a properly formatted message. 73 | 74 | Sometimes gmane.org is missing a message. Perhaps administrators can delete messages 75 | or perhaps they get lost - I don't know. If your spider stops, and it seems it has hit 76 | a missing message, go into the SQLite Manager and add a row with the missing id - leave 77 | all the other fields blank - and then restart gmane.py. This will unstick the 78 | spidering process and allow it to continue. These empty messages will be ignored in the next 79 | phase of the process. 80 | 81 | One nice thing is that once you have spidered all of the messages and have them in 82 | content.sqlite, you can run gmane.py again to get new messages as they get sent to the 83 | list. gmane.py will quickly scan to the end of the already-spidered pages and check 84 | if there are new messages and then quickly retrieve those messages and add them 85 | to content.sqlite. 86 | 87 | The content.sqlite data is pretty raw, with an innefficient data model, and not compressed. 88 | This is intentional as it allows you to look at content.sqlite to debug the process. 89 | It would be a bad idea to run any queries against this database as they would be 90 | slow. 91 | 92 | The second process is running the program gmodel.py. gmodel.py reads the rough/raw 93 | data from content.sqlite and produces a cleaned-up and well-modeled version of the 94 | data in the file index.sqlite. The file index.sqlite will be much smaller (often 10X 95 | smaller) than content.sqlite because it also compresses the header and body text. 96 | 97 | Each time gmodel.py runs - it completely wipes out and re-builds index.sqlite, allowing 98 | you to adjust its parameters and edit the mapping tables in content.sqlite to tweak the 99 | data cleaning process. 
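If you open index.sqlite by hand, keep in mind that the headers and body
columns are stored as zlib-compressed blobs (that is how gmodel.py writes
them). A minimal sketch for reading one body back, assuming only that
schema:

    import sqlite3
    import zlib

    conn = sqlite3.connect('index.sqlite')
    cur = conn.cursor()
    cur.execute('SELECT body FROM Messages LIMIT 1')
    row = cur.fetchone()
    if row is not None:
        # gmodel.py stores zlib.compress(text.encode()), so reverse both steps
        print(zlib.decompress(row[0]).decode()[:200])
    cur.close()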
100 | 101 | Running gmodel.py works as follows: 102 | 103 | Mac: python3 gmodel.py 104 | Win: gmodel.py 105 | 106 | Loaded allsenders 1588 and mapping 28 dns mapping 1 107 | 1 2005-12-08T23:34:30-06:00 ggolden22@mac.com 108 | 251 2005-12-22T10:03:20-08:00 tpamsler@ucdavis.edu 109 | 501 2006-01-12T11:17:34-05:00 lance@indiana.edu 110 | 751 2006-01-24T11:13:28-08:00 vrajgopalan@ucmerced.edu 111 | ... 112 | 113 | The gmodel.py program does a number of data cleaing steps 114 | 115 | Domain names are truncated to two levels for .com, .org, .edu, and .net 116 | other domain names are truncated to three levels. So si.umich.edu becomes 117 | umich.edu and caret.cam.ac.uk becomes cam.ac.uk. Also mail addresses are 118 | forced to lower case and some of the @gmane.org address like the following 119 | 120 | arwhyte-63aXycvo3TyHXe+LvDLADg@public.gmane.org 121 | 122 | are converted to the real address whenever there is a matching real email 123 | address elsewhere in the message corpus. 124 | 125 | If you look in the content.sqlite database there are two tables that allow 126 | you to map both domain names and individual email addresses that change over 127 | the lifetime of the email list. For example, Steve Githens used the following 128 | email addresses over the life of the Sakai developer list: 129 | 130 | s-githens@northwestern.edu 131 | sgithens@cam.ac.uk 132 | swgithen@mtu.edu 133 | 134 | We can add two entries to the Mapping table 135 | 136 | s-githens@northwestern.edu -> swgithen@mtu.edu 137 | sgithens@cam.ac.uk -> swgithen@mtu.edu 138 | 139 | And so all the mail messages will be collected under one sender even if 140 | they used several email addresses over the lifetime of the mailing list. 141 | 142 | You can also make similar entries in the DNSMapping table if there are multiple 143 | DNS names you want mapped to a single DNS. In the Sakai data I add the following 144 | mapping: 145 | 146 | iupui.edu -> indiana.edu 147 | 148 | So all the folks from the various Indiana University campuses are tracked together 149 | 150 | You can re-run the gmodel.py over and over as you look at the data, and add mappings 151 | to make the data cleaner and cleaner. When you are done, you will have a nicely 152 | indexed version of the email in index.sqlite. This is the file to use to do data 153 | analysis. With this file, data analysis will be really quick. 154 | 155 | The first, simplest data analysis is to do a "who does the most" and "which 156 | organzation does the most"? This is done using gbasic.py: 157 | 158 | Mac: python3 gbasic.py 159 | Win: gbasic.py 160 | 161 | How many to dump? 5 162 | Loaded messages= 51330 subjects= 25033 senders= 1584 163 | 164 | Top 5 Email list participants 165 | steve.swinsburg@gmail.com 2657 166 | azeckoski@unicon.net 1742 167 | ieb@tfd.co.uk 1591 168 | csev@umich.edu 1304 169 | david.horwitz@uct.ac.za 1184 170 | 171 | Top 5 Email list organizations 172 | gmail.com 7339 173 | umich.edu 6243 174 | uct.ac.za 2451 175 | indiana.edu 2258 176 | unicon.net 2055 177 | 178 | You can look at the data in index.sqlite and if you find a problem, you 179 | can update the Mapping table and DNSMapping table in content.sqlite and 180 | re-run gmodel.py. 
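Note that the gmodel.py included in this repository reads the Mapping and
DNSMapping tables from mapping.sqlite rather than content.sqlite, so that
is the file to edit. A minimal sketch for adding the iupui.edu ->
indiana.edu entry mentioned above, assuming the tables have only the old
and new columns that gmodel.py selects:

    import sqlite3

    conn = sqlite3.connect('mapping.sqlite')
    cur = conn.cursor()
    # gmodel.py reads this back with: SELECT old,new FROM DNSMapping
    cur.execute('INSERT INTO DNSMapping (old, new) VALUES (?, ?)',
                ('iupui.edu', 'indiana.edu'))
    conn.commit()
    conn.close()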
181 | 182 | There is a simple vizualization of the word frequence in the subject lines 183 | in the file gword.py: 184 | 185 | Mac: python3 gword.py 186 | Win: gword.py 187 | 188 | Range of counts: 33229 129 189 | Output written to gword.js 190 | 191 | This produces the file gword.js which you can visualize using the file 192 | gword.htm. 193 | 194 | A second visualization is in gline.py. It visualizes email participation by 195 | organizations over time. 196 | 197 | Mac: python3 gline.py 198 | Win: gline.py 199 | 200 | Loaded messages= 51330 subjects= 25033 senders= 1584 201 | Top 10 Oranizations 202 | ['gmail.com', 'umich.edu', 'uct.ac.za', 'indiana.edu', 'unicon.net', 'tfd.co.uk', 'berkeley.edu', 'longsight.com', 'stanford.edu', 'ox.ac.uk'] 203 | Output written to gline.js 204 | 205 | Its output is written to gline.js which is visualized using gline.htm. 206 | 207 | Some URLs for visualization ideas: 208 | 209 | https://developers.google.com/chart/ 210 | 211 | https://developers.google.com/chart/interactive/docs/gallery/motionchart 212 | 213 | https://code.google.com/apis/ajax/playground/?type=visualization#motion_chart_time_formats 214 | 215 | https://developers.google.com/chart/interactive/docs/gallery/annotatedtimeline 216 | 217 | http://bost.ocks.org/mike/uberdata/ 218 | 219 | http://mbostock.github.io/d3/talk/20111018/calendar.html 220 | 221 | http://nltk.org/install.html 222 | 223 | As always - comments welcome. 224 | 225 | -- Dr. Chuck 226 | Sun Sep 29 00:11:01 EDT 2013 227 | 228 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/gbasic.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | howmany = int(input("How many to dump? 
")) 6 | 7 | conn = sqlite3.connect('index.sqlite') 8 | cur = conn.cursor() 9 | 10 | cur.execute('SELECT id, sender FROM Senders') 11 | senders = dict() 12 | for message_row in cur : 13 | senders[message_row[0]] = message_row[1] 14 | 15 | cur.execute('SELECT id, subject FROM Subjects') 16 | subjects = dict() 17 | for message_row in cur : 18 | subjects[message_row[0]] = message_row[1] 19 | 20 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 21 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 22 | messages = dict() 23 | for message_row in cur : 24 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 25 | 26 | print("Loaded messages=",len(messages),"subjects=",len(subjects),"senders=",len(senders)) 27 | 28 | sendcounts = dict() 29 | sendorgs = dict() 30 | for (message_id, message) in list(messages.items()): 31 | sender = message[1] 32 | sendcounts[sender] = sendcounts.get(sender,0) + 1 33 | pieces = senders[sender].split("@") 34 | if len(pieces) != 2 : continue 35 | dns = pieces[1] 36 | sendorgs[dns] = sendorgs.get(dns,0) + 1 37 | 38 | print('') 39 | print('Top',howmany,'Email list participants') 40 | 41 | x = sorted(sendcounts, key=sendcounts.get, reverse=True) 42 | for k in x[:howmany]: 43 | print(senders[k], sendcounts[k]) 44 | if sendcounts[k] < 10 : break 45 | 46 | print('') 47 | print('Top',howmany,'Email list organizations') 48 | 49 | x = sorted(sendorgs, key=sendorgs.get, reverse=True) 50 | for k in x[:howmany]: 51 | print(k, sendorgs[k]) 52 | if sendorgs[k] < 10 : break 53 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/gline.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 19 | 20 | 21 |
22 | 23 | 24 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/gline.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | conn = sqlite3.connect('index.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT id, sender FROM Senders') 9 | senders = dict() 10 | for message_row in cur : 11 | senders[message_row[0]] = message_row[1] 12 | 13 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 14 | messages = dict() 15 | for message_row in cur : 16 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 17 | 18 | print("Loaded messages=",len(messages),"senders=",len(senders)) 19 | 20 | sendorgs = dict() 21 | for (message_id, message) in list(messages.items()): 22 | sender = message[1] 23 | pieces = senders[sender].split("@") 24 | if len(pieces) != 2 : continue 25 | dns = pieces[1] 26 | sendorgs[dns] = sendorgs.get(dns,0) + 1 27 | 28 | # pick the top schools 29 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 30 | orgs = orgs[:10] 31 | print("Top 10 Oranizations") 32 | print(orgs) 33 | 34 | counts = dict() 35 | months = list() 36 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 37 | for (message_id, message) in list(messages.items()): 38 | sender = message[1] 39 | pieces = senders[sender].split("@") 40 | if len(pieces) != 2 : continue 41 | dns = pieces[1] 42 | if dns not in orgs : continue 43 | month = message[3][:7] 44 | if month not in months : months.append(month) 45 | key = (month, dns) 46 | counts[key] = counts.get(key,0) + 1 47 | 48 | months.sort() 49 | # print counts 50 | # print months 51 | 52 | fhand = open('gline.js','w') 53 | fhand.write("gline = [ ['Year'") 54 | for org in orgs: 55 | fhand.write(",'"+org+"'") 56 | fhand.write("]") 57 | 58 | for month in months: 59 | fhand.write(",\n['"+month+"'") 60 | for org in orgs: 61 | key = (month, org) 62 | val = counts.get(key,0) 63 | fhand.write(","+str(val)) 64 | fhand.write("]"); 65 | 66 | fhand.write("\n];\n") 67 | fhand.close() 68 | 69 | print("Output written to gline.js") 70 | print("Open gline.htm to visualize the data") 71 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/gmane.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import ssl 4 | import urllib.request, urllib.parse, urllib.error 5 | from urllib.parse import urljoin 6 | from urllib.parse import urlparse 7 | import re 8 | from datetime import datetime, timedelta 9 | 10 | # Not all systems have this so conditionally define parser 11 | try: 12 | import dateutil.parser as parser 13 | except: 14 | pass 15 | 16 | def parsemaildate(md) : 17 | # See if we have dateutil 18 | try: 19 | pdate = parser.parse(tdate) 20 | test_at = pdate.isoformat() 21 | return test_at 22 | except: 23 | pass 24 | 25 | # Non-dateutil version - we try our best 26 | 27 | pieces = md.split() 28 | notz = " ".join(pieces[:4]).strip() 29 | 30 | # Try a bunch of format variations - strptime() is *lame* 31 | dnotz = None 32 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 33 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 34 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 35 | try: 36 | dnotz = datetime.strptime(notz, form) 
37 | break 38 | except: 39 | continue 40 | 41 | if dnotz is None : 42 | # print 'Bad Date:',md 43 | return None 44 | 45 | iso = dnotz.isoformat() 46 | 47 | tz = "+0000" 48 | try: 49 | tz = pieces[4] 50 | ival = int(tz) # Only want numeric timezone values 51 | if tz == '-0000' : tz = '+0000' 52 | tzh = tz[:3] 53 | tzm = tz[3:] 54 | tz = tzh+":"+tzm 55 | except: 56 | pass 57 | 58 | return iso+tz 59 | 60 | # Ignore SSL certificate errors 61 | ctx = ssl.create_default_context() 62 | ctx.check_hostname = False 63 | ctx.verify_mode = ssl.CERT_NONE 64 | 65 | conn = sqlite3.connect('content.sqlite') 66 | cur = conn.cursor() 67 | 68 | baseurl = "http://mbox.dr-chuck.net/sakai.devel/" 69 | 70 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 71 | (id INTEGER UNIQUE, email TEXT, sent_at TEXT, 72 | subject TEXT, headers TEXT, body TEXT)''') 73 | 74 | # Pick up where we left off 75 | start = None 76 | cur.execute('SELECT max(id) FROM Messages' ) 77 | try: 78 | row = cur.fetchone() 79 | if row is None : 80 | start = 0 81 | else: 82 | start = row[0] 83 | except: 84 | start = 0 85 | 86 | if start is None : start = 0 87 | 88 | many = 0 89 | count = 0 90 | fail = 0 91 | while True: 92 | if ( many < 1 ) : 93 | conn.commit() 94 | sval = input('How many messages:') 95 | if ( len(sval) < 1 ) : break 96 | many = int(sval) 97 | 98 | start = start + 1 99 | cur.execute('SELECT id FROM Messages WHERE id=?', (start,) ) 100 | try: 101 | row = cur.fetchone() 102 | if row is not None : continue 103 | except: 104 | row = None 105 | 106 | many = many - 1 107 | url = baseurl + str(start) + '/' + str(start + 1) 108 | 109 | text = "None" 110 | try: 111 | # Open with a timeout of 30 seconds 112 | document = urllib.request.urlopen(url, None, 30, context=ctx) 113 | text = document.read().decode() 114 | if document.getcode() != 200 : 115 | print("Error code=",document.getcode(), url) 116 | break 117 | except KeyboardInterrupt: 118 | print('') 119 | print('Program interrupted by user...') 120 | break 121 | except Exception as e: 122 | print("Unable to retrieve or parse page",url) 123 | print("Error",e) 124 | fail = fail + 1 125 | if fail > 5 : break 126 | continue 127 | 128 | print(url,len(text)) 129 | count = count + 1 130 | 131 | if not text.startswith("From "): 132 | print(text) 133 | print("Did not find From ") 134 | fail = fail + 1 135 | if fail > 5 : break 136 | continue 137 | 138 | pos = text.find("\n\n") 139 | if pos > 0 : 140 | hdr = text[:pos] 141 | body = text[pos+2:] 142 | else: 143 | print(text) 144 | print("Could not find break between headers and body") 145 | fail = fail + 1 146 | if fail > 5 : break 147 | continue 148 | 149 | email = None 150 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 151 | if len(x) == 1 : 152 | email = x[0]; 153 | email = email.strip().lower() 154 | email = email.replace("<","") 155 | else: 156 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 157 | if len(x) == 1 : 158 | email = x[0]; 159 | email = email.strip().lower() 160 | email = email.replace("<","") 161 | 162 | date = None 163 | y = re.findall('\Date: .*, (.*)\n', hdr) 164 | if len(y) == 1 : 165 | tdate = y[0] 166 | tdate = tdate[:26] 167 | try: 168 | sent_at = parsemaildate(tdate) 169 | except: 170 | print(text) 171 | print("Parse fail",tdate) 172 | fail = fail + 1 173 | if fail > 5 : break 174 | continue 175 | 176 | subject = None 177 | z = re.findall('\Subject: (.*)\n', hdr) 178 | if len(z) == 1 : subject = z[0].strip().lower(); 179 | 180 | # Reset the fail counter 181 | fail = 0 182 | print(" ",email,sent_at,subject) 183 | 
cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body) 184 | VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body)) 185 | if count % 50 == 0 : conn.commit() 186 | if count % 100 == 0 : time.sleep(1) 187 | 188 | conn.commit() 189 | cur.close() 190 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/gmodel.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import re 4 | import zlib 5 | from datetime import datetime, timedelta 6 | 7 | # Not all systems have this 8 | try: 9 | import dateutil.parser as parser 10 | except: 11 | pass 12 | 13 | dnsmapping = dict() 14 | mapping = dict() 15 | 16 | def fixsender(sender,allsenders=None) : 17 | global dnsmapping 18 | global mapping 19 | if sender is None : return None 20 | sender = sender.strip().lower() 21 | sender = sender.replace('<','').replace('>','') 22 | 23 | # Check if we have a hacked gmane.org from address 24 | if allsenders is not None and sender.endswith('gmane.org') : 25 | pieces = sender.split('-') 26 | realsender = None 27 | for s in allsenders: 28 | if s.startswith(pieces[0]) : 29 | realsender = sender 30 | sender = s 31 | # print(realsender, sender) 32 | break 33 | if realsender is None : 34 | for s in mapping: 35 | if s.startswith(pieces[0]) : 36 | realsender = sender 37 | sender = mapping[s] 38 | # print(realsender, sender) 39 | break 40 | if realsender is None : sender = pieces[0] 41 | 42 | mpieces = sender.split("@") 43 | if len(mpieces) != 2 : return sender 44 | dns = mpieces[1] 45 | x = dns 46 | pieces = dns.split(".") 47 | if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") : 48 | dns = ".".join(pieces[-2:]) 49 | else: 50 | dns = ".".join(pieces[-3:]) 51 | # if dns != x : print(x,dns) 52 | # if dns != dnsmapping.get(dns,dns) : print(dns,dnsmapping.get(dns,dns)) 53 | dns = dnsmapping.get(dns,dns) 54 | return mpieces[0] + '@' + dns 55 | 56 | def parsemaildate(md) : 57 | # See if we have dateutil 58 | try: 59 | pdate = parser.parse(tdate) 60 | test_at = pdate.isoformat() 61 | return test_at 62 | except: 63 | pass 64 | 65 | # Non-dateutil version - we try our best 66 | 67 | pieces = md.split() 68 | notz = " ".join(pieces[:4]).strip() 69 | 70 | # Try a bunch of format variations - strptime() is *lame* 71 | dnotz = None 72 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 73 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 74 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 75 | try: 76 | dnotz = datetime.strptime(notz, form) 77 | break 78 | except: 79 | continue 80 | 81 | if dnotz is None : 82 | # print('Bad Date:',md) 83 | return None 84 | 85 | iso = dnotz.isoformat() 86 | 87 | tz = "+0000" 88 | try: 89 | tz = pieces[4] 90 | ival = int(tz) # Only want numeric timezone values 91 | if tz == '-0000' : tz = '+0000' 92 | tzh = tz[:3] 93 | tzm = tz[3:] 94 | tz = tzh+":"+tzm 95 | except: 96 | pass 97 | 98 | return iso+tz 99 | 100 | # Parse out the info... 
101 | def parseheader(hdr, allsenders=None): 102 | if hdr is None or len(hdr) < 1 : return None 103 | sender = None 104 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 105 | if len(x) >= 1 : 106 | sender = x[0] 107 | else: 108 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 109 | if len(x) >= 1 : 110 | sender = x[0] 111 | 112 | # normalize the domain name of Email addresses 113 | sender = fixsender(sender, allsenders) 114 | 115 | date = None 116 | y = re.findall('\nDate: .*, (.*)\n', hdr) 117 | sent_at = None 118 | if len(y) >= 1 : 119 | tdate = y[0] 120 | tdate = tdate[:26] 121 | try: 122 | sent_at = parsemaildate(tdate) 123 | except Exception as e: 124 | # print('Date ignored ',tdate, e) 125 | return None 126 | 127 | subject = None 128 | z = re.findall('\nSubject: (.*)\n', hdr) 129 | if len(z) >= 1 : subject = z[0].strip().lower() 130 | 131 | guid = None 132 | z = re.findall('\nMessage-ID: (.*)\n', hdr) 133 | if len(z) >= 1 : guid = z[0].strip().lower() 134 | 135 | if sender is None or sent_at is None or subject is None or guid is None : 136 | return None 137 | return (guid, sender, subject, sent_at) 138 | 139 | conn = sqlite3.connect('index.sqlite') 140 | cur = conn.cursor() 141 | 142 | cur.execute('''DROP TABLE IF EXISTS Messages ''') 143 | cur.execute('''DROP TABLE IF EXISTS Senders ''') 144 | cur.execute('''DROP TABLE IF EXISTS Subjects ''') 145 | cur.execute('''DROP TABLE IF EXISTS Replies ''') 146 | 147 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 148 | (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER, 149 | sender_id INTEGER, subject_id INTEGER, 150 | headers BLOB, body BLOB)''') 151 | cur.execute('''CREATE TABLE IF NOT EXISTS Senders 152 | (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''') 153 | cur.execute('''CREATE TABLE IF NOT EXISTS Subjects 154 | (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''') 155 | cur.execute('''CREATE TABLE IF NOT EXISTS Replies 156 | (from_id INTEGER, to_id INTEGER)''') 157 | 158 | conn_1 = sqlite3.connect('mapping.sqlite') 159 | cur_1 = conn_1.cursor() 160 | 161 | cur_1.execute('''SELECT old,new FROM DNSMapping''') 162 | for message_row in cur_1 : 163 | dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower() 164 | 165 | mapping = dict() 166 | cur_1.execute('''SELECT old,new FROM Mapping''') 167 | for message_row in cur_1 : 168 | old = fixsender(message_row[0]) 169 | new = fixsender(message_row[1]) 170 | mapping[old] = fixsender(new) 171 | 172 | # Done with mapping.sqlite 173 | conn_1.close() 174 | 175 | # Open the main content (Read only) 176 | conn_1 = sqlite3.connect('file:content.sqlite?mode=ro', uri=True) 177 | cur_1 = conn_1.cursor() 178 | 179 | allsenders = list() 180 | cur_1.execute('''SELECT email FROM Messages''') 181 | for message_row in cur_1 : 182 | sender = fixsender(message_row[0]) 183 | if sender is None : continue 184 | if 'gmane.org' in sender : continue 185 | if sender in allsenders: continue 186 | allsenders.append(sender) 187 | 188 | print("Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping)) 189 | 190 | cur_1.execute('''SELECT headers, body, sent_at 191 | FROM Messages ORDER BY sent_at''') 192 | 193 | senders = dict() 194 | subjects = dict() 195 | guids = dict() 196 | 197 | count = 0 198 | 199 | for message_row in cur_1 : 200 | hdr = message_row[0] 201 | parsed = parseheader(hdr, allsenders) 202 | if parsed is None: continue 203 | (guid, sender, subject, sent_at) = parsed 204 | 205 | # Apply the sender mapping 206 | sender = mapping.get(sender,sender) 207 | 
208 | count = count + 1 209 | if count % 250 == 1 : print(count,sent_at, sender) 210 | # print(guid, sender, subject, sent_at) 211 | 212 | if 'gmane.org' in sender: 213 | print("Error in sender ===", sender) 214 | 215 | sender_id = senders.get(sender,None) 216 | subject_id = subjects.get(subject,None) 217 | guid_id = guids.get(guid,None) 218 | 219 | if sender_id is None : 220 | cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) ) 221 | conn.commit() 222 | cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, )) 223 | try: 224 | row = cur.fetchone() 225 | sender_id = row[0] 226 | senders[sender] = sender_id 227 | except: 228 | print('Could not retrieve sender id',sender) 229 | break 230 | if subject_id is None : 231 | cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) ) 232 | conn.commit() 233 | cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, )) 234 | try: 235 | row = cur.fetchone() 236 | subject_id = row[0] 237 | subjects[subject] = subject_id 238 | except: 239 | print('Could not retrieve subject id',subject) 240 | break 241 | # print(sender_id, subject_id) 242 | cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )', 243 | ( guid, sender_id, subject_id, sent_at, 244 | zlib.compress(message_row[0].encode()), zlib.compress(message_row[1].encode())) ) 245 | conn.commit() 246 | cur.execute('SELECT id FROM Messages WHERE guid=? LIMIT 1', ( guid, )) 247 | try: 248 | row = cur.fetchone() 249 | message_id = row[0] 250 | guids[guid] = message_id 251 | except: 252 | print('Could not retrieve guid id',guid) 253 | break 254 | 255 | cur.close() 256 | cur_1.close() 257 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/gword.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 37 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/gword.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | import string 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, subject FROM Subjects') 10 | subjects = dict() 11 | for message_row in cur : 12 | subjects[message_row[0]] = message_row[1] 13 | 14 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 15 | cur.execute('SELECT subject_id FROM Messages') 16 | counts = dict() 17 | for message_row in cur : 18 | text = subjects[message_row[0]] 19 | text = text.translate(str.maketrans('','',string.punctuation)) 20 | text = text.translate(str.maketrans('','','1234567890')) 21 | text = text.strip() 22 | text = text.lower() 23 | words = text.split() 24 | for word in words: 25 | if len(word) < 4 : continue 26 | counts[word] = counts.get(word,0) + 1 27 | 28 | x = sorted(counts, key=counts.get, reverse=True) 29 | highest = None 30 | lowest = None 31 | for k in x[:100]: 32 | if highest is None or highest < counts[k] : 33 | highest = counts[k] 34 | if lowest is None or lowest > counts[k] : 35 | lowest = counts[k] 36 | print('Range of counts:',highest,lowest) 37 | 38 | # Spread the font sizes across 20-100 based on the count 39 | bigsize = 80 40 | smallsize = 20 41 | 42 | fhand = 
open('gword.js','w') 43 | fhand.write("gword = [") 44 | first = True 45 | for k in x[:100]: 46 | if not first : fhand.write( ",\n") 47 | first = False 48 | size = counts[k] 49 | size = (size - lowest) / float(highest - lowest) 50 | size = int((size * bigsize) + smallsize) 51 | fhand.write("{text: '"+k+"', size: "+str(size)+"}") 52 | fhand.write( "\n];\n") 53 | fhand.close() 54 | 55 | print("Output written to gword.js") 56 | print("Open gword.htm in a browser to see the vizualization") 57 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/gyear.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib.request, urllib.parse, urllib.error 4 | import zlib 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, sender FROM Senders') 10 | senders = dict() 11 | for message_row in cur : 12 | senders[message_row[0]] = message_row[1] 13 | 14 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 15 | messages = dict() 16 | for message_row in cur : 17 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 18 | 19 | print("Loaded messages=",len(messages),"senders=",len(senders)) 20 | 21 | sendorgs = dict() 22 | for (message_id, message) in list(messages.items()): 23 | sender = message[1] 24 | pieces = senders[sender].split("@") 25 | if len(pieces) != 2 : continue 26 | dns = pieces[1] 27 | sendorgs[dns] = sendorgs.get(dns,0) + 1 28 | 29 | # pick the top schools 30 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 31 | orgs = orgs[:10] 32 | print("Top 10 Oranizations") 33 | print(orgs) 34 | # orgs = ['total'] + orgs 35 | 36 | counts = dict() 37 | months = list() 38 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 39 | for (message_id, message) in list(messages.items()): 40 | sender = message[1] 41 | pieces = senders[sender].split("@") 42 | if len(pieces) != 2 : continue 43 | dns = pieces[1] 44 | if dns not in orgs : continue 45 | month = message[3][:4] 46 | if month not in months : months.append(month) 47 | key = (month, dns) 48 | counts[key] = counts.get(key,0) + 1 49 | tkey = (month, 'total') 50 | counts[tkey] = counts.get(tkey,0) + 1 51 | 52 | months.sort() 53 | # print counts 54 | # print months 55 | 56 | fhand = open('gline.js','w') 57 | fhand.write("gline = [ ['Year'") 58 | for org in orgs: 59 | fhand.write(",'"+org+"'") 60 | fhand.write("]") 61 | 62 | for month in months[1:-1]: 63 | fhand.write(",\n['"+month+"'") 64 | for org in orgs: 65 | key = (month, org) 66 | val = counts.get(key,0) 67 | fhand.write(","+str(val)) 68 | fhand.write("]"); 69 | 70 | fhand.write("\n];\n") 71 | fhand.close() 72 | 73 | print("Output written to gline.js") 74 | print("Open gline.htm to visualize the data") 75 | 76 | -------------------------------------------------------------------------------- /Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/mapping.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srinivasgln/Coursera_Python_For_Everybody/f9c55f04432f784cfa05f67229705c793caf8704/Course 5 Capstone Retrieving, Processing,Visualizing Data/Week 6/mapping.sqlite -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Srinivasan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Coursera_Python_For_Everybody 2 | My projects from the awesome Python course, Python for Everybody, taught by the incredible Dr. Charles Severance. 3 | --------------------------------------------------------------------------------