├── .gitattributes ├── .gitignore ├── EXAMPLE CODE ├── .htaccess ├── BeautifulSoup.py ├── aa_readme.txt ├── argfile.py ├── argtest.py ├── avelist.py ├── avenum.py ├── average.py ├── badhtml.html ├── celsius.py ├── cleanup.sh ├── clown.txt ├── copytildone.py ├── count1.py ├── count2.py ├── count3.py ├── curl1.py ├── curl2.py ├── curl3.py ├── db1.py ├── db2.py ├── egg.py ├── emaildb.py ├── fahren.py ├── geodata.zip ├── geodata │ ├── README.txt │ ├── geodump.py │ ├── geoload.py │ ├── where.data │ ├── where.html │ └── where.js ├── geojson.py ├── geoxml.py ├── gmane.zip ├── gmane │ ├── Chart.bundle.js │ ├── README.txt │ ├── d3.layout.cloud.js │ ├── d3.v3.js │ ├── gbasic.py │ ├── gline.htm │ ├── gline.py │ ├── gline2.htm │ ├── gline3.htm │ ├── gmane.py │ ├── gmodel.py │ ├── gword.htm │ ├── gword.py │ ├── gyear.py │ └── mapping.sqlite ├── grade.py ├── graphics │ ├── graphics.py │ └── histogram.py ├── greet.py ├── grep.py ├── hidden.py ├── intro-short.txt ├── intro.txt ├── json1.py ├── json2.py ├── largest.py ├── mailcount.py ├── mailtop.py ├── mbox-short.txt ├── mbox.txt ├── oauth.py ├── open.py ├── pagerank.zip ├── pagerank │ ├── BeautifulSoup.py │ ├── LICENSE │ ├── README.txt │ ├── d3.v2.js │ ├── force.css │ ├── force.html │ ├── force.js │ ├── spdump.py │ ├── spider.js │ ├── spider.py │ ├── spjson.py │ ├── sprank.py │ └── spreset.py ├── pals.py ├── party1.py ├── party2.py ├── party3.py ├── party4.py ├── party5.py ├── pay.py ├── pay2.py ├── pay3.py ├── re01.py ├── re02.py ├── re03.py ├── re04.py ├── re05.py ├── re06.py ├── re07.py ├── re08.py ├── re09.py ├── re10.py ├── re11.py ├── re12.py ├── re13.py ├── re14.py ├── romeo-full.txt ├── romeo.txt ├── roster.py ├── roster.zip ├── roster │ ├── roster.py │ └── roster_data.json ├── search1.py ├── search10.py ├── search2.py ├── search3.py ├── search4.py ├── search5.py ├── search6.py ├── search7.py ├── search8.py ├── search9.py ├── sequence.py ├── socket1.py ├── socket2.py ├── soft.py ├── spamave.py ├── tracks.zip ├── tracks │ ├── Library.xml │ ├── README.txt │ └── tracks.py ├── twdump.py ├── twfriends.py ├── twitter1.py ├── twitter2.py ├── twjoin.py ├── twspider.py ├── twtest.py ├── twurl.py ├── txtcheck.py ├── txtcheck2.py ├── txtcheck3.py ├── txtcount.py ├── txtdelete.py ├── txtmd5.py ├── txtsize.py ├── urljpeg.py ├── urllib1.py ├── urllib2.py ├── urllink2.py ├── urllink3.py ├── urllinks.py ├── urlregex.py ├── urlwords.py ├── whathour.py ├── wikidata.db ├── wikigrade.py ├── wordlist.py ├── words.py ├── words.txt ├── xml1.py └── xml2.py ├── README.md ├── Textbook - Castellano.epub ├── Textbook - English.epub ├── Unit 1 - Introduction ├── A1.1 - Code screenshot.PNG ├── A1.2. - Script execution.PNG └── C1.1 - Firstcode.py ├── Unit 2 - Regular expressions ├── A.2.1 - regex text sample.txt ├── A.2.2 - regex text data.txt └── C2.1 - Programming assignment.py ├── Unit 3 - Networks and sockets └── C3.1. - Programming assignment.py ├── Unit 4 - Programs that surf the web ├── BeautifulSoup.py ├── BeautifulSoup.pyc ├── C4.1. Programming assignment.py └── C4.2. Programming assignment 2.py ├── Unit 5 - Web services and XML ├── BeautifulSoup.py ├── BeautifulSoup.pyc └── C5.1. Programming assignment.py └── Unit 6 - JSON and the REST architecture ├── C.6.1. - Programming assignment 1.py └── C.6.2. 
- Programming assignment 2.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #### CUSTOM PART 2 | # For non-coding files 3 | *.docx 4 | *.pdf 5 | 6 | 7 | # Windows image file caches 8 | Thumbs.db 9 | ehthumbs.db 10 | 11 | # Folder config file 12 | Desktop.ini 13 | 14 | # Recycle Bin used on file shares 15 | $RECYCLE.BIN/ 16 | 17 | # Windows Installer files 18 | *.cab 19 | *.msi 20 | *.msm 21 | *.msp 22 | 23 | # Windows shortcuts 24 | *.lnk 25 | 26 | # ========================= 27 | # Operating System Files 28 | # ========================= 29 | 30 | # OSX 31 | # ========================= 32 | 33 | .DS_Store 34 | .AppleDouble 35 | .LSOverride 36 | 37 | # Thumbnails 38 | ._* 39 | 40 | # Files that might appear in the root of a volume 41 | .DocumentRevisions-V100 42 | .fseventsd 43 | .Spotlight-V100 44 | .TemporaryItems 45 | .Trashes 46 | .VolumeIcon.icns 47 | 48 | # Directories potentially created on remote AFP share 49 | .AppleDB 50 | .AppleDesktop 51 | Network Trash Folder 52 | Temporary Items 53 | .apdisk 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /EXAMPLE CODE/.htaccess: -------------------------------------------------------------------------------- 1 | Options +Indexes 2 | AddType text/plain .py 3 | 4 | 5 | Header set Cache-Control "max-age=604800, public" 6 | 7 | Header add Access-Control-Allow-Origin "*" 8 | Header add Access-Control-Allow-Headers "origin, x-requested-with, content-type" 9 | Header add Access-Control-Allow-Methods "GET" 10 | 11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/aa_readme.txt: -------------------------------------------------------------------------------- 1 | This is the Python 2 version of the sample code 2 | for Python for Informatics.
3 | 4 | The Python 3.0 version of the code is in the folder "code3" 5 | 6 | /Chuck 7 | 8 | -------------------------------------------------------------------------------- /EXAMPLE CODE/argfile.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | name = sys.argv[1] 4 | handle = open(name, 'r') 5 | text = handle.read() 6 | print name, 'is', len(text), 'bytes' 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/argtest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | print 'Count:', len(sys.argv) 4 | print 'Type:', type(sys.argv) 5 | 6 | for arg in sys.argv: 7 | print 'Argument:', arg 8 | 9 | -------------------------------------------------------------------------------- /EXAMPLE CODE/avelist.py: -------------------------------------------------------------------------------- 1 | numlist = list() 2 | while ( True ) : 3 | inp = raw_input('Enter a number: ') 4 | if inp == 'done' : break 5 | value = float(inp) 6 | numlist.append(value) 7 | 8 | average = sum(numlist) / len(numlist) 9 | print 'Average:', average 10 | -------------------------------------------------------------------------------- /EXAMPLE CODE/avenum.py: -------------------------------------------------------------------------------- 1 | total = 0 2 | count = 0 3 | while ( True ) : 4 | inp = raw_input('Enter a number: ') 5 | if inp == 'done' : break 6 | value = float(inp) 7 | total = total + value 8 | count = count + 1 9 | 10 | average = total / count 11 | print 'Average:', average 12 | -------------------------------------------------------------------------------- /EXAMPLE CODE/average.py: -------------------------------------------------------------------------------- 1 | total = 0 2 | count = 0 3 | while ( True ) : 4 | inp = raw_input('Enter a number: ') 5 | if inp == 'done' : 6 | break 7 | try: 8 | value = float(inp) 9 | except: 10 | print 'Invalid input' 11 | continue 12 | total = total + value 13 | count = count + 1 14 | 15 | average = total / count 16 | print 'Average:', average 17 | -------------------------------------------------------------------------------- /EXAMPLE CODE/badhtml.html: -------------------------------------------------------------------------------- 1 |

Hello
2 | First 3 | 4 | Second 7 | 8 | Third 12 | -------------------------------------------------------------------------------- /EXAMPLE CODE/celsius.py: -------------------------------------------------------------------------------- 1 | inp = raw_input('Enter Celsius Temperature:') 2 | cel = float(inp) 3 | fahr = ( cel * 9.0 ) / 5.0 + 32.0 4 | print fahr 5 | -------------------------------------------------------------------------------- /EXAMPLE CODE/cleanup.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | rm *.pyc */*.pyc 4 | rm *.sqlite 5 | rm *.zip 6 | 7 | zip -r geodata.zip geodata 8 | zip -r gmane.zip gmane 9 | zip -r pagerank.zip pagerank 10 | zip -r tracks.zip tracks 11 | zip -r roster.zip roster 12 | 13 | -------------------------------------------------------------------------------- /EXAMPLE CODE/clown.txt: -------------------------------------------------------------------------------- 1 | the clown ran after the car and the car ran into the tent and the tent fell down on the clown and the car 2 | -------------------------------------------------------------------------------- /EXAMPLE CODE/copytildone.py: -------------------------------------------------------------------------------- 1 | while True: 2 | line = raw_input('> ') 3 | if line[0] == '#' : 4 | continue 5 | if line == 'done': 6 | break 7 | print line 8 | 9 | print 'Done!' 10 | 11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/count1.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter the file name: ') 2 | try: 3 | fhand = open(fname) 4 | except: 5 | print 'File cannot be opened:', fname 6 | exit() 7 | 8 | counts = dict() 9 | for line in fhand: 10 | words = line.split() 11 | for word in words: 12 | if word not in counts: 13 | counts[word] = 1 14 | else: 15 | counts[word] += 1 16 | 17 | print counts 18 | -------------------------------------------------------------------------------- /EXAMPLE CODE/count2.py: -------------------------------------------------------------------------------- 1 | import string 2 | 3 | fname = raw_input('Enter the file name: ') 4 | try: 5 | fhand = open(fname) 6 | except: 7 | print 'File cannot be opened:', fname 8 | exit() 9 | 10 | counts = dict() 11 | for line in fhand: 12 | line = line.translate(None, string.punctuation) 13 | line = line.lower() 14 | words = line.split() 15 | for word in words: 16 | if word not in counts: 17 | counts[word] = 1 18 | else: 19 | counts[word] += 1 20 | 21 | print counts 22 | -------------------------------------------------------------------------------- /EXAMPLE CODE/count3.py: -------------------------------------------------------------------------------- 1 | import string 2 | fhand = open('romeo-full.txt') 3 | counts = dict() 4 | for line in fhand: 5 | line = line.translate(None, string.punctuation) 6 | line = line.lower() 7 | words = line.split() 8 | for word in words: 9 | if word not in counts: 10 | counts[word] = 1 11 | else: 12 | counts[word] += 1 13 | 14 | # Sort the dictionary by value 15 | lst = list() 16 | for key, val in counts.items(): 17 | lst.append( (val, key) ) 18 | 19 | lst.sort(reverse=True) 20 | 21 | for key, val in lst[:10] : 22 | print key, val 23 | -------------------------------------------------------------------------------- /EXAMPLE CODE/curl1.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | 3 | img = 
urllib.urlopen('http://www.py4inf.com/cover.jpg').read() 4 | fhand = open('cover.jpg', 'w') 5 | fhand.write(img) 6 | fhand.close() 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/curl2.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | 3 | img = urllib.urlopen('http://www.py4inf.com/cover.jpg') 4 | fhand = open('cover.jpg', 'w') 5 | size = 0 6 | while True: 7 | info = img.read(100000) 8 | if len(info) < 1 : break 9 | size = size + len(info) 10 | fhand.write(info) 11 | 12 | print size,'characters copied.' 13 | fhand.close() 14 | -------------------------------------------------------------------------------- /EXAMPLE CODE/curl3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib 3 | 4 | print 'Please enter a URL like http://www.py4inf.com/cover.jpg' 5 | urlstr = raw_input().strip() 6 | img = urllib.urlopen(urlstr) 7 | 8 | # Get the last "word" 9 | words = urlstr.split('/') 10 | fname = words[-1] 11 | 12 | # Don't overwrite the file 13 | if os.path.exists(fname) : 14 | if raw_input('Replace '+fname+' (Y/n)?') != 'Y' : 15 | print 'Data not copied' 16 | exit() 17 | print 'Replacing',fname 18 | 19 | fhand = open(fname, 'w') 20 | size = 0 21 | while True: 22 | info = img.read(100000) 23 | if len(info) < 1 : break 24 | size = size + len(info) 25 | fhand.write(info) 26 | 27 | print size,'characters copied to',fname 28 | fhand.close() 29 | -------------------------------------------------------------------------------- /EXAMPLE CODE/db1.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('music.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('DROP TABLE IF EXISTS Tracks ') 7 | cur.execute('CREATE TABLE Tracks (title TEXT, plays INTEGER)') 8 | 9 | conn.close() 10 | 11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/db2.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('music.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('INSERT INTO Tracks (title, plays) VALUES ( ?, ? )', 7 | ( 'Thunderstruck', 20 ) ) 8 | cur.execute('INSERT INTO Tracks (title, plays) VALUES ( ?, ? )', 9 | ( 'My Way', 15 ) ) 10 | conn.commit() 11 | 12 | print 'Tracks:' 13 | cur.execute('SELECT title, plays FROM Tracks') 14 | for row in cur : 15 | print row 16 | 17 | cur.execute('DELETE FROM Tracks WHERE plays < 100') 18 | 19 | cur.close() 20 | 21 | -------------------------------------------------------------------------------- /EXAMPLE CODE/egg.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter the file name: ') 2 | if fname == 'na na boo boo' : 3 | print 'NA NA BOO BOO TO YOU - You have been punkd!' 
4 | exit() 5 | 6 | try: 7 | fhand = open(fname) 8 | except: 9 | print 'File cannot be opened:', fname 10 | exit() 11 | count = 0 12 | for line in fhand: 13 | if line.startswith('Subject:') : 14 | count = count + 1 15 | print 'There were', count, 'subject lines in', fname 16 | -------------------------------------------------------------------------------- /EXAMPLE CODE/emaildb.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('emaildb.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute(''' 7 | DROP TABLE IF EXISTS Counts''') 8 | 9 | cur.execute(''' 10 | CREATE TABLE Counts (email TEXT, count INTEGER)''') 11 | 12 | fname = raw_input('Enter file name: ') 13 | if ( len(fname) < 1 ) : fname = 'mbox-short.txt' 14 | fh = open(fname) 15 | for line in fh: 16 | if not line.startswith('From: ') : continue 17 | pieces = line.split() 18 | email = pieces[1] 19 | print email 20 | cur.execute('SELECT count FROM Counts WHERE email = ? ', (email, )) 21 | row = cur.fetchone() 22 | if row is None: 23 | cur.execute('''INSERT INTO Counts (email, count) 24 | VALUES ( ?, 1 )''', ( email, ) ) 25 | else : 26 | cur.execute('UPDATE Counts SET count=count+1 WHERE email = ?', 27 | (email, )) 28 | # This statement commits outstanding changes to disk each 29 | # time through the loop - the program can be made faster 30 | # by moving the commit so it runs only after the loop completes 31 | conn.commit() 32 | 33 | # https://www.sqlite.org/lang_select.html 34 | sqlstr = 'SELECT email, count FROM Counts ORDER BY count DESC LIMIT 10' 35 | 36 | print 37 | print "Counts:" 38 | for row in cur.execute(sqlstr) : 39 | print str(row[0]), row[1] 40 | 41 | cur.close() 42 | 43 | -------------------------------------------------------------------------------- /EXAMPLE CODE/fahren.py: -------------------------------------------------------------------------------- 1 | inp = raw_input('Enter Fahrenheit Temperature:') 2 | fahr = float(inp) 3 | cel = (fahr - 32.0) * 5.0 / 9.0 4 | print cel 5 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/geodata.zip -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata/README.txt: -------------------------------------------------------------------------------- 1 | Using the Google Geocoding API with a Database and 2 | Visualizing data on a Google Map 3 | 4 | In this project, we are using the Google geocoding API 5 | to clean up some user-entered geographic locations of 6 | university names and then placing the data on a Google 7 | Map. 8 | 9 | You should install the SQLite browser to view and modify 10 | the databases from: 11 | 12 | http://sqlitebrowser.org/ 13 | 14 | The first problem to solve is that the Google geocoding 15 | API is rate limited to 2500 requests per day. So if you have 16 | a lot of data you might need to stop and restart the lookup 17 | process several times. So we break the problem into two 18 | phases. 19 | 20 | In the first phase we take our input data in the file 21 | (where.data) and read it one line at a time, and retrieve the 22 | geocoded response and store it in a database (geodata.sqlite).
23 | Before we use the geocoding API, we simply check to see if 24 | we already have the data for that particular line of input. 25 | 26 | You can re-start the process at any time by removing the file 27 | geodata.sqlite 28 | 29 | Run the geoload.py program. This program will read the input 30 | lines in where.data and for each line check to see if it is already 31 | in the database and if we don't have the data for the location, 32 | call the geocoding API to retrieve the data and store it in 33 | the database. 34 | 35 | Here is a sample run after there is already some data in the 36 | database: 37 | 38 | Mac: python geoload.py 39 | Win: geoload.py 40 | 41 | Found in database Northeastern University 42 | 43 | Found in database University of Hong Kong, Illinois Institute of Technology, Bradley University 44 | 45 | Found in database Technion 46 | 47 | Found in database Viswakarma Institute, Pune, India 48 | 49 | Found in database UMD 50 | 51 | Found in database Tufts University 52 | 53 | Resolving Monash University 54 | Retrieving http://maps.googleapis.com/maps/api/geocode/json?sensor=false&address=Monash+University 55 | Retrieved 2063 characters { "results" : [ 56 | {u'status': u'OK', u'results': ... } 57 | 58 | Resolving Kokshetau Institute of Economics and Management 59 | Retrieving http://maps.googleapis.com/maps/api/geocode/json?sensor=false&address=Kokshetau+Institute+of+Economics+and+Management 60 | Retrieved 1749 characters { "results" : [ 61 | {u'status': u'OK', u'results': ... } 62 | 63 | The first five locations are already in the database and so they 64 | are skipped. The program scans to the point where it finds un-retrieved 65 | locations and starts retrieving them. 66 | 67 | The geoload.py program can be stopped at any time, and there is a counter 68 | that you can use to limit the number of calls to the geocoding 69 | API for each run. 70 | 71 | Once you have some data loaded into geodata.sqlite, you can 72 | visualize the data using the (geodump.py) program. This 73 | program reads the database and writes the file (where.js) 74 | with the location, latitude, and longitude in the form of 75 | executable JavaScript code. 76 | 77 | A run of the geodump.py program is as follows: 78 | 79 | Mac: python geodump.py 80 | Win: geodump.py 81 | 82 | Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA 42.3396998 -71.08975 83 | Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA 40.6963857 -89.6160811 84 | ... 85 | Technion, Viazman 87, Kesalsaba, 32000, Israel 32.7775 35.0216667 86 | Monash University Clayton Campus, Wellington Road, Clayton VIC 3800, Australia -37.9152113 145.134682 87 | Kokshetau, Kazakhstan 53.2833333 69.3833333 88 | ... 89 | 12 records written to where.js 90 | Open where.html to view the data in a browser 91 | 92 | The file (where.html) consists of HTML and JavaScript to visualize 93 | a Google Map. It reads the most recent data in where.js to get 94 | the data to be visualized. Here is the format of the where.js file: 95 | 96 | myData = [ 97 | [42.3396998,-71.08975, 'Northeastern University, 360 Huntington Avenue, Boston, MA 02115, USA'], 98 | [40.6963857,-89.6160811, 'Bradley University, 1501 West Bradley Avenue, Peoria, IL 61625, USA'], 99 | [32.7775,35.0216667, 'Technion, Viazman 87, Kesalsaba, 32000, Israel'], 100 | ... 101 | ]; 102 | 103 | This is a JavaScript list of lists. The syntax for JavaScript 104 | list constants is very similar to Python so the syntax should 105 | be familiar to you.
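As an aside, here is a minimal sketch of how that where.js "list of
lists" can be written from Python. The rows below are made-up
placeholders - in this folder the real work is done by geodump.py,
which pulls the rows out of geodata.sqlite:

    # Hypothetical rows; geodump.py reads the real ones from geodata.sqlite
    rows = [ (42.3396998, -71.08975, 'Northeastern University, Boston, MA'),
             (40.6963857, -89.6160811, 'Bradley University, Peoria, IL') ]

    fhand = open('where.js', 'w')
    fhand.write("myData = [\n")
    for num, (lat, lng, where) in enumerate(rows):
        if num > 0 : fhand.write(",\n")
        # A stray single quote in the label would break the JavaScript constant
        where = where.replace("'", "")
        fhand.write("[" + str(lat) + "," + str(lng) + ", '" + where + "']")
    fhand.write("\n];\n")
    fhand.close()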
106 | 107 | Simply open where.html in a browser to see the locations. You 108 | can hover over each map pin to find the location that the 109 | geocoding API returned for the user-entered input. If you 110 | cannot see any data when you open the where.html file, you might 111 | want to check the JavaScript or developer console for your browser. 112 | 113 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata/geodump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import json 3 | import codecs 4 | 5 | conn = sqlite3.connect('geodata.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT * FROM Locations') 9 | fhand = codecs.open('where.js','w', "utf-8") 10 | fhand.write("myData = [\n") 11 | count = 0 12 | for row in cur : 13 | data = str(row[1]) 14 | try: js = json.loads(str(data)) 15 | except: continue 16 | 17 | if not('status' in js and js['status'] == 'OK') : continue 18 | 19 | lat = js["results"][0]["geometry"]["location"]["lat"] 20 | lng = js["results"][0]["geometry"]["location"]["lng"] 21 | if lat == 0 or lng == 0 : continue 22 | where = js['results'][0]['formatted_address'] 23 | where = where.replace("'","") 24 | try : 25 | print where, lat, lng 26 | 27 | count = count + 1 28 | if count > 1 : fhand.write(",\n") 29 | output = "["+str(lat)+","+str(lng)+", '"+where+"']" 30 | fhand.write(output) 31 | except: 32 | continue 33 | 34 | fhand.write("\n];\n") 35 | cur.close() 36 | fhand.close() 37 | print count, "records written to where.js" 38 | print "Open where.html to view the data in a browser" 39 | 40 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata/geoload.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import sqlite3 3 | import json 4 | import time 5 | import ssl 6 | 7 | # If you are in China use this URL: 8 | # serviceurl = "http://maps.google.cn/maps/api/geocode/json?" 9 | serviceurl = "http://maps.googleapis.com/maps/api/geocode/json?" 10 | 11 | # Deal with SSL certificate anomalies Python > 2.7 12 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1) 13 | scontext = None 14 | 15 | conn = sqlite3.connect('geodata.sqlite') 16 | cur = conn.cursor() 17 | 18 | cur.execute(''' 19 | CREATE TABLE IF NOT EXISTS Locations (address TEXT, geodata TEXT)''') 20 | 21 | fh = open("where.data") 22 | count = 0 23 | for line in fh: 24 | if count > 200 : break 25 | address = line.strip() 26 | print '' 27 | cur.execute("SELECT geodata FROM Locations WHERE address= ?", (buffer(address), )) 28 | 29 | try: 30 | data = cur.fetchone()[0] 31 | print "Found in database ",address 32 | continue 33 | except: 34 | pass 35 | 36 | print 'Resolving', address 37 | url = serviceurl + urllib.urlencode({"sensor":"false", "address": address}) 38 | print 'Retrieving', url 39 | uh = urllib.urlopen(url, context=scontext) 40 | data = uh.read() 41 | print 'Retrieved',len(data),'characters',data[:20].replace('\n',' ') 42 | count = count + 1 43 | try: 44 | js = json.loads(str(data)) 45 | # print js # We print in case unicode causes an error 46 | except: 47 | continue 48 | 49 | if 'status' not in js or (js['status'] != 'OK' and js['status'] != 'ZERO_RESULTS') : 50 | print '==== Failure To Retrieve ====' 51 | print data 52 | break 53 | 54 | cur.execute('''INSERT INTO Locations (address, geodata) 55 | VALUES ( ?, ?
)''', ( buffer(address),buffer(data) ) ) 56 | conn.commit() 57 | time.sleep(1) 58 | 59 | print "Run geodump.py to read the data from the database so you can visualize it on a map." 60 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata/where.data: -------------------------------------------------------------------------------- 1 | Northeastern University 2 | University of Hong Kong, Illinois Institute of Technology, Bradley University 3 | Technion 4 | Viswakarma Institute, Pune, India 5 | UMD 6 | Tufts University 7 | Monash University 8 | Kokshetau Institute of Economics and Management 9 | RSU named S.A. Esenin 10 | Tavrida National V.I. Vernadsky University 11 | UOC 12 | Irkutsk State University 13 | Institute of Technology Telkom 14 | Shanghai Jiao Tong University 15 | University of Ilorin, Kwara State. Nigeria 16 | Monash University Churchill Australia 17 | UNISA 18 | Fachhochschule FH Salzburg 19 | Tampere University of Technology (Tampere, Finland) 20 | Saint Petersburg State University 21 | University of São Paulo 22 | Smolensk State University (Russia) 23 | Institute of Business Administration, Karachi 24 | universidad complutense de madrid 25 | Masdar Institute 26 | University of London 27 | University of Oxford 28 | Tallinn University of Technology 29 | University of Tartu 30 | University of Padua 31 | University of Pune, India 32 | National Kyiv Shevchenko University 33 | UC Berkeley 34 | University of Wisconsin - Madison 35 | Lodz University of Technology 36 | NRU IFMO 37 | Dniepropetrovsk National University (Ukraine), Applied Math Faculty 38 | Dokuz Eylul University, Izmir, Turkey 39 | Beijing normal university 40 | University of Piraeus, Athens 41 | Universidad de Buenos Aires (UBA). Argentina. 42 | SASTRA University 43 | Nagpur University 44 | Duke University 45 | San Francisco State University 46 | FATEC-SP - Faculdade de Tecnologia do Estado de São Paulo 47 | University of Texas at Austin 48 | University of applied sciense of Mikkeli (Finland) 49 | Troy University 50 | Universidade do Minho 51 | National University of Sciences and Technology (NUST)-Pakistan 52 | Pontificia universidad catolica de chile 53 | Illinois State University Joliet Junior College 54 | American University in Cairo (AUC) 55 | Obninsk Technical University of Nuclear Power Engineering, Russia 56 | Vyatka State Humanitarian University 57 | Weizmann Institute of Science (Israel) 58 | University of Washington 59 | Kharkiv State Academy of Municipal Economy, Ukraine 60 | Faculty of Electrical Engineering in Sarajevo, University of Sarajevo 61 | Universidad de Los Andes Colombia 62 | University of Colorado at Boulder 63 | Magnitogorsk State Technical University 64 | USC 65 | Simon Fraser University 66 | Columbia University (New York) 67 | University of Southern California 68 | University of Warsaw 69 | Warsaw University of Technology 70 | (Some place in New Zealand you haven't heard of.) 
71 | Massey university part-time Distance learning 72 | University of Oklahoma 73 | University of Pavia, Italy 74 | University of Missouri - Columbia 75 | Czech Technical University in Prague 76 | Illinois Institute of Technology 77 | Penn State University 78 | University of Utah 79 | Faculty of Science, University of Zagreb - Department of Mathematics 80 | Universitat Politecnica de Valencia 81 | University of Vienna 82 | University of Puerto Rico - Mayaguez Campus 83 | University "Hyperion" of Bucharest 84 | University of New Haven 85 | University of Washington -Bothell 86 | Drexel University 87 | University of Texas at Austin 88 | University of Helsinki 89 | University of Michigan 90 | Carnegie Mellon University 91 | Kazan Federal University 92 | Pondicherry University 93 | Far-Eastern State University 94 | Nanyang Technological University 95 | Slovak University of Technology 96 | NYU 97 | UFABC - Universidade Federal do ABC, Sanso André - SP - Brazil 98 | University of Debrecen 99 | California State University, San Bernardino 100 | National University "Kyiv-Mohyla Academy" (Kyiv, Ukraine) 101 | Laurentian University 102 | Humanities Institute of TV and Radio, Moscow, Russia 103 | University of Cambridge, UK 104 | Payame Noor University, Tehran, Iran 105 | Middle East Technical University 106 | EPFL 107 | Faculty of Technical Sciences, Novi Sad, Serbia 108 | University of Gothenburg, Sweden 109 | Polytechnic University of Timisoara 110 | University of Hawaii (Go, Rainbows!) 111 | Belarusian State University 112 | Haaga-Helia university of applied sciences 113 | JADAVPUR UNIVERSITY 114 | Gauhati University, India 115 | Universidad de Buenos Aires 116 | Università degli Studi di Genova, Genova, Italia 117 | King Mongkut's University of Technology Thonburi 118 | Universidad de la Sabana, Chia, Colombia 119 | State University of New York (SUNY) College at Oswego 120 | Kyrgyz Slavic Russian University 121 | De La Salle University http://www.dlsu.edu.ph 122 | Jawaharlal Nehru Technological University, INDIA 123 | UCL (Université Catholique de Louvain) in Belgium 124 | Boston University 125 | The University of Manchester 126 | Fachhochschule Düsseldorf 127 | Pine Manor College (AA), Harvard University (BA), Lesley University (MEd) 128 | Simón Bolívar University 129 | Indiana University at Bloomington 130 | RPI 131 | University of Ottawa, Canada 132 | Ural Federal University 133 | BITS Pilani 134 | Transilvania University 135 | IIT(BHU), Varanasi, India 136 | EM Lyon 137 | Universidad Central de Venezuela 138 | NTUU "KPI" 139 | Universidade Federal da Paraiba, Brazil 140 | Budapest University of Technology and Economics 141 | Moscow Institute of Physics & Technology (State University) 142 | Saint Petersburg State University of Aerospace Instrumentation, Russia 143 | North Central College, Naperville, IL 144 | Tech. Uni. Denmark (DTU) 145 | Stanford 146 | "Politehnica" Timisoara 147 | National University of Engineering 148 | Monash 149 | Federal University of Campina Grande (UFCG) 150 | Universidade Federal do Rio Grande do Sul (UFRGS) 151 | Universidad Nacional Autónoma de México 152 | University of New South Wales Harvard Business School 153 | University of Tehran 154 | Old Dominion University 155 | Kyiv Unisersity of Oriental Language 156 | Babcock University 157 | University of Essex 158 | Kharkiv National University of Radio Electronics (Ukraine) 159 | Kaunas Technology University 160 | University of Buenos Aires 161 | University of Jaffna. 
162 | R V College of Engineering, Bangalore, India for BE in Instrumentation Technology 163 | Beloit College 164 | UCLA 165 | University of Chicago 166 | University of Sciences and Technology of Oran. Mohamed Boudiaf (USTO-MB). 167 | Zagazig University, Egypt 168 | University of Alberta 169 | Belorussian State University 170 | Jones International University (online) Illinois State Univeristy 171 | University of Florida 172 | Too many to mention. 173 | University of Kerala, India 174 | Politecnico di Milano 175 | Vilnius Gediminas Technical University 176 | Madras university/ Bharthidasan University in India . 177 | Universidade Tecnica de Lisboa - Instituto Superior Técnico 178 | Does not apply. 179 | Stellenbosch University 180 | imt ghazIABAD INDIA 181 | University of Pennsylvania 182 | National Institute of Technology, Jalandhar (India) 183 | Universidad ICESI 184 | Virginia Tech 185 | arizona state university 186 | Universidad del Valle de Guatemala 187 | Mykolas Romeris University, Vilnius, Lithuania 188 | BSU 189 | Distance Learning Center at the Technical University of Kaiserslautern in Germany 190 | Ain shams university, Cairo, Egypt 191 | Universidad Nacional de Colombia 192 | Saint-Petersburg Polytechnic Univesity 193 | NAIT (Northern Alberta Institute of Technology) 194 | Wayne State took courses at U of M 195 | Universidad Nacional, Costa Rica 196 | Marietta College (Ohio) Northwestern University 197 | Grandville 198 | Portland State University, Oregon Institute of Technology 199 | Malayer Azad University, Iran 200 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata/where.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | A Map of Information 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 43 | 44 | 45 |
46 |

About this Map

47 |

48 | This is a cool map from 49 | www.pythonlearn.com. 50 |

51 | 52 | 53 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geodata/where.js: -------------------------------------------------------------------------------- 1 | myData = [ 2 | [42.340075,-71.0895367, 'Northeastern, Boston, MA 02115, USA'], 3 | [38.2113643,-85.7470011, 'Bradley Ave, Louisville, KY, USA'], 4 | [32.778949,35.019648, 'Technion/ Sports Building, Haifa'], 5 | [18.4574518,73.8837999, 'Vishwakarma Institutes Play Ground, Yashodhan Society, Kapil Nagar, Kondhwa Budrukh, Vishwakarma, Maharashtra 411048, India'], 6 | [33.1561058,131.826132, 'Japan, 〒875-0002 Ōita-ken, Usuki-shi, Shitanoe, 1232−2 UMD'], 7 | [42.4036847,-71.120482, 'South Hall Tufts University, 30 Lower Campus Rd, Somerville, MA 02144, USA'], 8 | [-37.914517,145.1303881, 'Monash College, Wellington Rd, Clayton VIC 3168, Australia'], 9 | [53.2948229,69.4047872, 'Kokshetau 020000, Kazakhstan'], 10 | [40.7127837,-74.0059413, 'New York, NY, USA'], 11 | [52.2869741,104.3050183, 'Irkutsk, Irkutsk Oblast, Russia'] 12 | ]; 13 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geojson.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | 4 | serviceurl = 'http://maps.googleapis.com/maps/api/geocode/json?' 5 | #serviceurl = 'http://python-data.dr-chuck.net/geojson?' 6 | 7 | while True: 8 | address = raw_input('Enter location: ') 9 | if len(address) < 1 : break 10 | 11 | url = serviceurl + urllib.urlencode({'sensor':'false', 'address': address}) 12 | print 'Retrieving', url 13 | uh = urllib.urlopen(url) 14 | data = uh.read() 15 | print 'Retrieved',len(data),'characters' 16 | 17 | try: js = json.loads(str(data)) 18 | except: js = None 19 | if 'status' not in js or js['status'] != 'OK': 20 | print '==== Failure To Retrieve ====' 21 | print data 22 | continue 23 | 24 | print json.dumps(js, indent=4) 25 | 26 | lat = js["results"][0]["geometry"]["location"]["lat"] 27 | lng = js["results"][0]["geometry"]["location"]["lng"] 28 | print 'lat',lat,'lng',lng 29 | location = js['results'][0]['formatted_address'] 30 | print location 31 | 32 | -------------------------------------------------------------------------------- /EXAMPLE CODE/geoxml.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import xml.etree.ElementTree as ET 3 | 4 | serviceurl = 'http://maps.googleapis.com/maps/api/geocode/xml?' 
5 | 6 | while True: 7 | address = raw_input('Enter location: ') 8 | if len(address) < 1 : break 9 | 10 | url = serviceurl + urllib.urlencode({'sensor':'false', 'address': address}) 11 | print 'Retrieving', url 12 | uh = urllib.urlopen(url) 13 | data = uh.read() 14 | print 'Retrieved',len(data),'characters' 15 | print data 16 | tree = ET.fromstring(data) 17 | 18 | 19 | results = tree.findall('result') 20 | lat = results[0].find('geometry').find('location').find('lat').text 21 | lng = results[0].find('geometry').find('location').find('lng').text 22 | location = results[0].find('formatted_address').text 23 | 24 | print 'lat',lat,'lng',lng 25 | print location 26 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/gmane.zip -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/README.txt: -------------------------------------------------------------------------------- 1 | Analyzing an EMAIL Archive visualizing the data using the 2 | D3 JavaScript library 3 | 4 | Here is a copy of the Sakai Developer Mailing list from 2006-2014. 5 | 6 | http://mbox.dr-chuck.net/ 7 | 8 | You should install the SQLite browser to view and modify the databases from: 9 | 10 | http://sqlitebrowser.org/ 11 | 12 | The base URL is hard-coded in gmane.py. Make sure to delete the 13 | content.sqlite file if you switch the base url. The gmane.py file 14 | operates as a spider in that it runs slowly and retrieves one mail 15 | message per second so as to avoid getting throttled. It stores all of 16 | its data in a database and can be interrupted and re-started 17 | as often as needed. It may take many hours to pull all the data 18 | down. So you may need to restart several times. 19 | 20 | To give you a head-start, I have put up 600MB of pre-spidered Sakai 21 | email here: 22 | 23 | https://online.dr-chuck.com/files/sakai/email/content.sqlite.zip 24 | 25 | If you download and unzip this, you can "catch up with the 26 | latest" by running gmane.py. 27 | 28 | Navigate to the folder where you extracted the gmane.zip 29 | 30 | Here is a run of gmane.py getting the last five messages of the 31 | sakai developer list: 32 | 33 | Mac: python gmane.py 34 | Win: gmane.py 35 | 36 | How many messages:10 37 | http://mbox.dr-chuck.net/sakai.devel/5/6 9443 38 | john@caret.cam.ac.uk 2005-12-09T13:32:29+00:00 re: lms/vle rants/comments 39 | http://mbox.dr-chuck.net/sakai.devel/6/7 3586 40 | s-githens@northwestern.edu 2005-12-09T13:32:31-06:00 re: sakaiportallogin and presense 41 | http://mbox.dr-chuck.net/sakai.devel/7/8 10600 42 | john@caret.cam.ac.uk 2005-12-09T13:42:24+00:00 re: lms/vle rants/comments 43 | 44 | The program scans content.sqlite from 1 up to the first message number not 45 | already spidered and starts spidering at that message. It continues spidering 46 | until it has spidered the desired number of messages or it reaches a page 47 | that does not appear to be a properly formatted message. 48 | 49 | Sometimes a message is missing. Perhaps administrators can delete messages 50 | or perhaps they get lost - I don't know.
If your spider stops, and it seems it has hit 51 | a missing message, go into the SQLite Manager and add a row with the missing id - leave 52 | all the other fields blank - and then restart gmane.py. This will unstick the 53 | spidering process and allow it to continue. These empty messages will be ignored in the next 54 | phase of the process. 55 | 56 | One nice thing is that once you have spidered all of the messages and have them in 57 | content.sqlite, you can run gmane.py again to get new messages as they get sent to the 58 | list. gmane.py will quickly scan to the end of the already-spidered pages and check 59 | if there are new messages and then quickly retrieve those messages and add them 60 | to content.sqlite. 61 | 62 | The content.sqlite data is pretty raw, with an inefficient data model, and not compressed. 63 | This is intentional as it allows you to look at content.sqlite to debug the process. 64 | It would be a bad idea to run any queries against this database as they would be 65 | slow. 66 | 67 | The second process is running the program gmodel.py. gmodel.py reads the rough/raw 68 | data from content.sqlite and produces a cleaned-up and well-modeled version of the 69 | data in the file index.sqlite. The file index.sqlite will be much smaller (often 10X 70 | smaller) than content.sqlite because it also compresses the header and body text. 71 | 72 | Each time gmodel.py runs - it completely wipes out and re-builds index.sqlite, allowing 73 | you to adjust its parameters and edit the mapping tables in content.sqlite to tweak the 74 | data cleaning process. 75 | 76 | Running gmodel.py works as follows: 77 | 78 | Mac: python gmodel.py 79 | Win: gmodel.py 80 | 81 | Loaded allsenders 1588 and mapping 28 dns mapping 1 82 | 1 2005-12-08T23:34:30-06:00 ggolden22@mac.com 83 | 251 2005-12-22T10:03:20-08:00 tpamsler@ucdavis.edu 84 | 501 2006-01-12T11:17:34-05:00 lance@indiana.edu 85 | 751 2006-01-24T11:13:28-08:00 vrajgopalan@ucmerced.edu 86 | ... 87 | 88 | The gmodel.py program does a number of data cleaning steps: 89 | 90 | Domain names are truncated to two levels for .com, .org, .edu, and .net; 91 | other domain names are truncated to three levels. So si.umich.edu becomes 92 | umich.edu and caret.cam.ac.uk becomes cam.ac.uk (a short sketch of this rule appears below). Also mail addresses are 93 | forced to lower case and some of the @gmane.org addresses like the following 94 | 95 | arwhyte-63aXycvo3TyHXe+LvDLADg@public.gmane.org 96 | 97 | are converted to the real address whenever there is a matching real email 98 | address elsewhere in the message corpus. 99 | 100 | If you look in the content.sqlite database there are two tables that allow 101 | you to map both domain names and individual email addresses that change over 102 | the lifetime of the email list. For example, Steve Githens used the following 103 | email addresses over the life of the Sakai developer list: 104 | 105 | s-githens@northwestern.edu 106 | sgithens@cam.ac.uk 107 | swgithen@mtu.edu 108 | 109 | We can add two entries to the Mapping table 110 | 111 | s-githens@northwestern.edu -> swgithen@mtu.edu 112 | sgithens@cam.ac.uk -> swgithen@mtu.edu 113 | 114 | And so all the mail messages will be collected under one sender even if 115 | they used several email addresses over the lifetime of the mailing list. 116 | 117 | You can also make similar entries in the DNSMapping table if there are multiple 118 | DNS names you want mapped to a single DNS.
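As a quick illustration, the domain-truncation rule described above can
be sketched in Python. This is only a paraphrase of the rule as stated
in this README, not the actual gmodel.py code:

    def truncate_dns(dns):
        pieces = dns.split('.')
        # .com/.org/.edu/.net keep two levels; everything else keeps three
        if pieces[-1] in ('com', 'org', 'edu', 'net'):
            return '.'.join(pieces[-2:])
        return '.'.join(pieces[-3:])

    print truncate_dns('si.umich.edu')      # prints umich.edu
    print truncate_dns('caret.cam.ac.uk')   # prints cam.ac.uk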
In the Sakai data I add the following 119 | mapping: 120 | 121 | iupui.edu -> indiana.edu 122 | 123 | So all the folks from the various Indiana University campuses are tracked together. 124 | 125 | You can re-run gmodel.py over and over as you look at the data, and add mappings 126 | to make the data cleaner and cleaner. When you are done, you will have a nicely 127 | indexed version of the email in index.sqlite. This is the file to use to do data 128 | analysis. With this file, data analysis will be really quick. 129 | 130 | The first, simplest data analysis is to do a "who does the most" and "which 131 | organization does the most"? This is done using gbasic.py: 132 | 133 | Mac: python gbasic.py 134 | Win: gbasic.py 135 | 136 | How many to dump? 5 137 | Loaded messages= 51330 subjects= 25033 senders= 1584 138 | 139 | Top 5 Email list participants 140 | steve.swinsburg@gmail.com 2657 141 | azeckoski@unicon.net 1742 142 | ieb@tfd.co.uk 1591 143 | csev@umich.edu 1304 144 | david.horwitz@uct.ac.za 1184 145 | 146 | Top 5 Email list organizations 147 | gmail.com 7339 148 | umich.edu 6243 149 | uct.ac.za 2451 150 | indiana.edu 2258 151 | unicon.net 2055 152 | 153 | You can look at the data in index.sqlite and if you find a problem, you 154 | can update the Mapping table and DNSMapping table in content.sqlite and 155 | re-run gmodel.py. 156 | 157 | There is a simple visualization of the word frequency in the subject lines 158 | in the file gword.py: 159 | 160 | Mac: python gword.py 161 | Win: gword.py 162 | 163 | Range of counts: 33229 129 164 | Output written to gword.js 165 | 166 | This produces the file gword.js which you can visualize using the file 167 | gword.htm. 168 | 169 | A second visualization is in gline.py. It visualizes email participation by 170 | organizations over time. 171 | 172 | Mac: python gline.py 173 | Win: gline.py 174 | 175 | Loaded messages= 51330 subjects= 25033 senders= 1584 176 | Top 10 Organizations 177 | ['gmail.com', 'umich.edu', 'uct.ac.za', 'indiana.edu', 'unicon.net', 'tfd.co.uk', 'berkeley.edu', 'longsight.com', 'stanford.edu', 'ox.ac.uk'] 178 | Output written to gline.js 179 | 180 | Its output is written to gline.js which is visualized using gline.htm. 181 | If you have a problem with gline.htm, you can try gline2.htm or gline3.htm 182 | to visualize your data. 183 | 184 | Some URLs for visualization ideas: 185 | 186 | https://developers.google.com/chart/ 187 | 188 | https://developers.google.com/chart/interactive/docs/gallery/motionchart 189 | 190 | https://code.google.com/apis/ajax/playground/?type=visualization#motion_chart_time_formats 191 | 192 | https://developers.google.com/chart/interactive/docs/gallery/annotatedtimeline 193 | 194 | http://bost.ocks.org/mike/uberdata/ 195 | 196 | http://mbostock.github.io/d3/talk/20111018/calendar.html 197 | 198 | http://nltk.org/install.html 199 | 200 | As always - comments welcome. 201 | 202 | -- Dr.
Chuck 203 | Sun Sep 29 00:11:01 EDT 2013 204 | 205 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/d3.layout.cloud.js: -------------------------------------------------------------------------------- 1 | // Word cloud layout by Jason Davies, http://www.jasondavies.com/word-cloud/ 2 | // Algorithm due to Jonathan Feinberg, http://static.mrfeinberg.com/bv_ch03.pdf 3 | (function(exports) { 4 | function cloud() { 5 | var size = [256, 256], 6 | text = cloudText, 7 | font = cloudFont, 8 | fontSize = cloudFontSize, 9 | fontStyle = cloudFontNormal, 10 | fontWeight = cloudFontNormal, 11 | rotate = cloudRotate, 12 | padding = cloudPadding, 13 | spiral = archimedeanSpiral, 14 | words = [], 15 | timeInterval = Infinity, 16 | event = d3.dispatch("word", "end"), 17 | timer = null, 18 | cloud = {}; 19 | 20 | cloud.start = function() { 21 | var board = zeroArray((size[0] >> 5) * size[1]), 22 | bounds = null, 23 | n = words.length, 24 | i = -1, 25 | tags = [], 26 | data = words.map(function(d, i) { 27 | d.text = text.call(this, d, i); 28 | d.font = font.call(this, d, i); 29 | d.style = fontStyle.call(this, d, i); 30 | d.weight = fontWeight.call(this, d, i); 31 | d.rotate = rotate.call(this, d, i); 32 | d.size = ~~fontSize.call(this, d, i); 33 | d.padding = cloudPadding.call(this, d, i); 34 | return d; 35 | }).sort(function(a, b) { return b.size - a.size; }); 36 | 37 | if (timer) clearInterval(timer); 38 | timer = setInterval(step, 0); 39 | step(); 40 | 41 | return cloud; 42 | 43 | function step() { 44 | var start = +new Date, 45 | d; 46 | while (+new Date - start < timeInterval && ++i < n && timer) { 47 | d = data[i]; 48 | d.x = (size[0] * (Math.random() + .5)) >> 1; 49 | d.y = (size[1] * (Math.random() + .5)) >> 1; 50 | cloudSprite(d, data, i); 51 | if (place(board, d, bounds)) { 52 | tags.push(d); 53 | event.word(d); 54 | if (bounds) cloudBounds(bounds, d); 55 | else bounds = [{x: d.x + d.x0, y: d.y + d.y0}, {x: d.x + d.x1, y: d.y + d.y1}]; 56 | // Temporary hack 57 | d.x -= size[0] >> 1; 58 | d.y -= size[1] >> 1; 59 | } 60 | } 61 | if (i >= n) { 62 | cloud.stop(); 63 | event.end(tags, bounds); 64 | } 65 | } 66 | } 67 | 68 | cloud.stop = function() { 69 | if (timer) { 70 | clearInterval(timer); 71 | timer = null; 72 | } 73 | return cloud; 74 | }; 75 | 76 | cloud.timeInterval = function(x) { 77 | if (!arguments.length) return timeInterval; 78 | timeInterval = x == null ? Infinity : x; 79 | return cloud; 80 | }; 81 | 82 | function place(board, tag, bounds) { 83 | var perimeter = [{x: 0, y: 0}, {x: size[0], y: size[1]}], 84 | startX = tag.x, 85 | startY = tag.y, 86 | maxDelta = Math.sqrt(size[0] * size[0] + size[1] * size[1]), 87 | s = spiral(size), 88 | dt = Math.random() < .5 ? 1 : -1, 89 | t = -dt, 90 | dxdy, 91 | dx, 92 | dy; 93 | 94 | while (dxdy = s(t += dt)) { 95 | dx = ~~dxdy[0]; 96 | dy = ~~dxdy[1]; 97 | 98 | if (Math.min(dx, dy) > maxDelta) break; 99 | 100 | tag.x = startX + dx; 101 | tag.y = startY + dy; 102 | 103 | if (tag.x + tag.x0 < 0 || tag.y + tag.y0 < 0 || 104 | tag.x + tag.x1 > size[0] || tag.y + tag.y1 > size[1]) continue; 105 | // TODO only check for collisions within current bounds. 
106 | if (!bounds || !cloudCollide(tag, board, size[0])) { 107 | if (!bounds || collideRects(tag, bounds)) { 108 | var sprite = tag.sprite, 109 | w = tag.width >> 5, 110 | sw = size[0] >> 5, 111 | lx = tag.x - (w << 4), 112 | sx = lx & 0x7f, 113 | msx = 32 - sx, 114 | h = tag.y1 - tag.y0, 115 | x = (tag.y + tag.y0) * sw + (lx >> 5), 116 | last; 117 | for (var j = 0; j < h; j++) { 118 | last = 0; 119 | for (var i = 0; i <= w; i++) { 120 | board[x + i] |= (last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0); 121 | } 122 | x += sw; 123 | } 124 | delete tag.sprite; 125 | return true; 126 | } 127 | } 128 | } 129 | return false; 130 | } 131 | 132 | cloud.words = function(x) { 133 | if (!arguments.length) return words; 134 | words = x; 135 | return cloud; 136 | }; 137 | 138 | cloud.size = function(x) { 139 | if (!arguments.length) return size; 140 | size = [+x[0], +x[1]]; 141 | return cloud; 142 | }; 143 | 144 | cloud.font = function(x) { 145 | if (!arguments.length) return font; 146 | font = d3.functor(x); 147 | return cloud; 148 | }; 149 | 150 | cloud.fontStyle = function(x) { 151 | if (!arguments.length) return fontStyle; 152 | fontStyle = d3.functor(x); 153 | return cloud; 154 | }; 155 | 156 | cloud.fontWeight = function(x) { 157 | if (!arguments.length) return fontWeight; 158 | fontWeight = d3.functor(x); 159 | return cloud; 160 | }; 161 | 162 | cloud.rotate = function(x) { 163 | if (!arguments.length) return rotate; 164 | rotate = d3.functor(x); 165 | return cloud; 166 | }; 167 | 168 | cloud.text = function(x) { 169 | if (!arguments.length) return text; 170 | text = d3.functor(x); 171 | return cloud; 172 | }; 173 | 174 | cloud.spiral = function(x) { 175 | if (!arguments.length) return spiral; 176 | spiral = spirals[x + ""] || x; 177 | return cloud; 178 | }; 179 | 180 | cloud.fontSize = function(x) { 181 | if (!arguments.length) return fontSize; 182 | fontSize = d3.functor(x); 183 | return cloud; 184 | }; 185 | 186 | cloud.padding = function(x) { 187 | if (!arguments.length) return padding; 188 | padding = d3.functor(x); 189 | return cloud; 190 | }; 191 | 192 | return d3.rebind(cloud, event, "on"); 193 | } 194 | 195 | function cloudText(d) { 196 | return d.text; 197 | } 198 | 199 | function cloudFont() { 200 | return "serif"; 201 | } 202 | 203 | function cloudFontNormal() { 204 | return "normal"; 205 | } 206 | 207 | function cloudFontSize(d) { 208 | return Math.sqrt(d.value); 209 | } 210 | 211 | function cloudRotate() { 212 | return (~~(Math.random() * 6) - 3) * 30; 213 | } 214 | 215 | function cloudPadding() { 216 | return 1; 217 | } 218 | 219 | // Fetches a monochrome sprite bitmap for the specified text. 220 | // Load in batches for speed. 
221 | function cloudSprite(d, data, di) { 222 | if (d.sprite) return; 223 | c.clearRect(0, 0, (cw << 5) / ratio, ch / ratio); 224 | var x = 0, 225 | y = 0, 226 | maxh = 0, 227 | n = data.length; 228 | di--; 229 | while (++di < n) { 230 | d = data[di]; 231 | c.save(); 232 | c.font = d.style + " " + d.weight + " " + ~~((d.size + 1) / ratio) + "px " + d.font; 233 | var w = c.measureText(d.text + "m").width * ratio, 234 | h = d.size << 1; 235 | if (d.rotate) { 236 | var sr = Math.sin(d.rotate * cloudRadians), 237 | cr = Math.cos(d.rotate * cloudRadians), 238 | wcr = w * cr, 239 | wsr = w * sr, 240 | hcr = h * cr, 241 | hsr = h * sr; 242 | w = (Math.max(Math.abs(wcr + hsr), Math.abs(wcr - hsr)) + 0x1f) >> 5 << 5; 243 | h = ~~Math.max(Math.abs(wsr + hcr), Math.abs(wsr - hcr)); 244 | } else { 245 | w = (w + 0x1f) >> 5 << 5; 246 | } 247 | if (h > maxh) maxh = h; 248 | if (x + w >= (cw << 5)) { 249 | x = 0; 250 | y += maxh; 251 | maxh = 0; 252 | } 253 | if (y + h >= ch) break; 254 | c.translate((x + (w >> 1)) / ratio, (y + (h >> 1)) / ratio); 255 | if (d.rotate) c.rotate(d.rotate * cloudRadians); 256 | c.fillText(d.text, 0, 0); 257 | c.restore(); 258 | d.width = w; 259 | d.height = h; 260 | d.xoff = x; 261 | d.yoff = y; 262 | d.x1 = w >> 1; 263 | d.y1 = h >> 1; 264 | d.x0 = -d.x1; 265 | d.y0 = -d.y1; 266 | x += w; 267 | } 268 | var pixels = c.getImageData(0, 0, (cw << 5) / ratio, ch / ratio).data, 269 | sprite = []; 270 | while (--di >= 0) { 271 | d = data[di]; 272 | var w = d.width, 273 | w32 = w >> 5, 274 | h = d.y1 - d.y0, 275 | p = d.padding; 276 | // Zero the buffer 277 | for (var i = 0; i < h * w32; i++) sprite[i] = 0; 278 | x = d.xoff; 279 | if (x == null) return; 280 | y = d.yoff; 281 | var seen = 0, 282 | seenRow = -1; 283 | for (var j = 0; j < h; j++) { 284 | for (var i = 0; i < w; i++) { 285 | var k = w32 * j + (i >> 5), 286 | m = pixels[((y + j) * (cw << 5) + (x + i)) << 2] ? 1 << (31 - (i % 32)) : 0; 287 | if (p) { 288 | if (j) sprite[k - w32] |= m; 289 | if (j < w - 1) sprite[k + w32] |= m; 290 | m |= (m << 1) | (m >> 1); 291 | } 292 | sprite[k] |= m; 293 | seen |= m; 294 | } 295 | if (seen) seenRow = j; 296 | else { 297 | d.y0++; 298 | h--; 299 | j--; 300 | y++; 301 | } 302 | } 303 | d.y1 = d.y0 + seenRow; 304 | d.sprite = sprite.slice(0, (d.y1 - d.y0) * w32); 305 | } 306 | } 307 | 308 | // Use mask-based collision detection. 309 | function cloudCollide(tag, board, sw) { 310 | sw >>= 5; 311 | var sprite = tag.sprite, 312 | w = tag.width >> 5, 313 | lx = tag.x - (w << 4), 314 | sx = lx & 0x7f, 315 | msx = 32 - sx, 316 | h = tag.y1 - tag.y0, 317 | x = (tag.y + tag.y0) * sw + (lx >> 5), 318 | last; 319 | for (var j = 0; j < h; j++) { 320 | last = 0; 321 | for (var i = 0; i <= w; i++) { 322 | if (((last << msx) | (i < w ? 
(last = sprite[j * w + i]) >>> sx : 0)) 323 | & board[x + i]) return true; 324 | } 325 | x += sw; 326 | } 327 | return false; 328 | } 329 | 330 | function cloudBounds(bounds, d) { 331 | var b0 = bounds[0], 332 | b1 = bounds[1]; 333 | if (d.x + d.x0 < b0.x) b0.x = d.x + d.x0; 334 | if (d.y + d.y0 < b0.y) b0.y = d.y + d.y0; 335 | if (d.x + d.x1 > b1.x) b1.x = d.x + d.x1; 336 | if (d.y + d.y1 > b1.y) b1.y = d.y + d.y1; 337 | } 338 | 339 | function collideRects(a, b) { 340 | return a.x + a.x1 > b[0].x && a.x + a.x0 < b[1].x && a.y + a.y1 > b[0].y && a.y + a.y0 < b[1].y; 341 | } 342 | 343 | function archimedeanSpiral(size) { 344 | var e = size[0] / size[1]; 345 | return function(t) { 346 | return [e * (t *= .1) * Math.cos(t), t * Math.sin(t)]; 347 | }; 348 | } 349 | 350 | function rectangularSpiral(size) { 351 | var dy = 4, 352 | dx = dy * size[0] / size[1], 353 | x = 0, 354 | y = 0; 355 | return function(t) { 356 | var sign = t < 0 ? -1 : 1; 357 | // See triangular numbers: T_n = n * (n + 1) / 2. 358 | switch ((Math.sqrt(1 + 4 * sign * t) - sign) & 3) { 359 | case 0: x += dx; break; 360 | case 1: y += dy; break; 361 | case 2: x -= dx; break; 362 | default: y -= dy; break; 363 | } 364 | return [x, y]; 365 | }; 366 | } 367 | 368 | // TODO reuse arrays? 369 | function zeroArray(n) { 370 | var a = [], 371 | i = -1; 372 | while (++i < n) a[i] = 0; 373 | return a; 374 | } 375 | 376 | var cloudRadians = Math.PI / 180, 377 | cw = 1 << 11 >> 5, 378 | ch = 1 << 11, 379 | canvas, 380 | ratio = 1; 381 | 382 | if (typeof document !== "undefined") { 383 | canvas = document.createElement("canvas"); 384 | canvas.width = 1; 385 | canvas.height = 1; 386 | ratio = Math.sqrt(canvas.getContext("2d").getImageData(0, 0, 1, 1).data.length >> 2); 387 | canvas.width = (cw << 5) / ratio; 388 | canvas.height = ch / ratio; 389 | } else { 390 | // node-canvas support 391 | var Canvas = require("canvas"); 392 | canvas = new Canvas(cw << 5, ch); 393 | } 394 | 395 | var c = canvas.getContext("2d"), 396 | spirals = { 397 | archimedean: archimedeanSpiral, 398 | rectangular: rectangularSpiral 399 | }; 400 | c.fillStyle = "red"; 401 | c.textAlign = "center"; 402 | 403 | exports.cloud = cloud; 404 | })(typeof exports === "undefined" ? d3.layout || (d3.layout = {}) : exports); 405 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gbasic.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import zlib 5 | 6 | howmany = int(raw_input("How many to dump? 
")) 7 | 8 | conn = sqlite3.connect('index.sqlite') 9 | conn.text_factory = str 10 | cur = conn.cursor() 11 | 12 | cur.execute('''SELECT Messages.id, sender FROM Messages 13 | JOIN Senders ON Messages.sender_id = Senders.id''') 14 | 15 | sendcounts = dict() 16 | sendorgs = dict() 17 | for message in cur : 18 | sender = message[1] 19 | sendcounts[sender] = sendcounts.get(sender,0) + 1 20 | pieces = sender.split("@") 21 | if len(pieces) != 2 : continue 22 | dns = pieces[1] 23 | sendorgs[dns] = sendorgs.get(dns,0) + 1 24 | 25 | print '' 26 | print 'Top',howmany,'Email list participants' 27 | 28 | x = sorted(sendcounts, key=sendcounts.get, reverse=True) 29 | for k in x[:howmany]: 30 | print k, sendcounts[k] 31 | if sendcounts[k] < 10 : break 32 | 33 | print '' 34 | print 'Top',howmany,'Email list organizations' 35 | 36 | x = sorted(sendorgs, key=sendorgs.get, reverse=True) 37 | for k in x[:howmany]: 38 | print k, sendorgs[k] 39 | if sendorgs[k] < 10 : break 40 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gline.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 19 | 20 | 21 |
22 | 23 | 24 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gline.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import zlib 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | conn.text_factory = str 8 | cur = conn.cursor() 9 | 10 | # Determine the top ten organizations 11 | cur.execute('''SELECT Messages.id, sender FROM Messages 12 | JOIN Senders ON Messages.sender_id = Senders.id''') 13 | 14 | sendorgs = dict() 15 | for message_row in cur : 16 | sender = message_row[1] 17 | pieces = sender.split("@") 18 | if len(pieces) != 2 : continue 19 | dns = pieces[1] 20 | sendorgs[dns] = sendorgs.get(dns,0) + 1 21 | 22 | # pick the top schools 23 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 24 | orgs = orgs[:10] 25 | print "Top 10 Organizations" 26 | print orgs 27 | # orgs = ['total'] + orgs 28 | 29 | # Read through the messages 30 | counts = dict() 31 | months = list() 32 | 33 | cur.execute('''SELECT Messages.id, sender, sent_at FROM Messages 34 | JOIN Senders ON Messages.sender_id = Senders.id''') 35 | 36 | for message_row in cur : 37 | sender = message_row[1] 38 | pieces = sender.split("@") 39 | if len(pieces) != 2 : continue 40 | dns = pieces[1] 41 | if dns not in orgs : continue 42 | month = message_row[2][:7] 43 | if month not in months : months.append(month) 44 | key = (month, dns) 45 | counts[key] = counts.get(key,0) + 1 46 | tkey = (month, 'total') 47 | counts[tkey] = counts.get(tkey,0) + 1 48 | 49 | months.sort() 50 | print counts 51 | print months 52 | 53 | fhand = open('gline.js','w') 54 | fhand.write("gline = [ ['Month'") 55 | for org in orgs: 56 | fhand.write(",'"+org+"'") 57 | fhand.write("]") 58 | 59 | # for month in months[1:-1]: 60 | for month in months: 61 | fhand.write(",\n['"+month+"'") 62 | for org in orgs: 63 | key = (month, org) 64 | val = counts.get(key,0) 65 | fhand.write(","+str(val)) 66 | fhand.write("]"); 67 | 68 | fhand.write("\n];\n") 69 | 70 | print "Data written to gline.js" 71 | print "Open gline.htm in a browser to view" 72 | 73 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gline2.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 21 | 22 | 23 |
24 | 25 | 26 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gline3.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Line Chart 6 | 7 | 8 | 15 | 16 | 17 | 18 |
19 | 20 |
21 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gmane.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import sqlite3 3 | import time 4 | import ssl 5 | import urllib 6 | from urlparse import urljoin 7 | from urlparse import urlparse 8 | import re 9 | from datetime import datetime, timedelta 10 | 11 | # Not all systems have this so conditionally define parser 12 | try: 13 | import dateutil.parser as parser 14 | except: 15 | pass 16 | 17 | def parsemaildate(md) : 18 | # See if we have dateutil 19 | try: 20 | pdate = parser.parse(tdate) 21 | test_at = pdate.isoformat() 22 | return test_at 23 | except: 24 | pass 25 | 26 | # Non-dateutil version - we try our best 27 | 28 | pieces = md.split() 29 | notz = " ".join(pieces[:4]).strip() 30 | 31 | # Try a bunch of format variations - strptime() is *lame* 32 | dnotz = None 33 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 34 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 35 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 36 | try: 37 | dnotz = datetime.strptime(notz, form) 38 | break 39 | except: 40 | continue 41 | 42 | if dnotz is None : 43 | # print 'Bad Date:',md 44 | return None 45 | 46 | iso = dnotz.isoformat() 47 | 48 | tz = "+0000" 49 | try: 50 | tz = pieces[4] 51 | ival = int(tz) # Only want numeric timezone values 52 | if tz == '-0000' : tz = '+0000' 53 | tzh = tz[:3] 54 | tzm = tz[3:] 55 | tz = tzh+":"+tzm 56 | except: 57 | pass 58 | 59 | return iso+tz 60 | 61 | conn = sqlite3.connect('content.sqlite') 62 | cur = conn.cursor() 63 | conn.text_factory = str 64 | 65 | baseurl = "http://mbox.dr-chuck.net/sakai.devel/" 66 | 67 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 68 | (id INTEGER UNIQUE, email TEXT, sent_at TEXT, 69 | subject TEXT, headers TEXT, body TEXT)''') 70 | 71 | start = 0 72 | cur.execute('SELECT max(id) FROM Messages') 73 | try: 74 | row = cur.fetchone() 75 | if row[0] is not None: 76 | start = row[0] 77 | except: 78 | start = 0 79 | row = None 80 | 81 | print start 82 | 83 | many = 0 84 | 85 | # Skip up to five messages 86 | skip = 5 87 | while True: 88 | if ( many < 1 ) : 89 | sval = raw_input('How many messages:') 90 | if ( len(sval) < 1 ) : break 91 | many = int(sval) 92 | 93 | start = start + 1 94 | cur.execute('SELECT id FROM Messages WHERE id=?', (start,) ) 95 | try: 96 | row = cur.fetchone() 97 | if row is not None : continue 98 | except: 99 | row = None 100 | 101 | many = many - 1 102 | url = baseurl + str(start) + '/' + str(start + 1) 103 | 104 | try: 105 | # Deal with SSL certificate anomalies Python > 2.7 106 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1) 107 | # document = urllib.urlopen(url, context=scontext) 108 | 109 | document = urllib.urlopen(url) 110 | 111 | text = document.read() 112 | if document.getcode() != 200 : 113 | print "Error code=",document.getcode(), url 114 | break 115 | except KeyboardInterrupt: 116 | print '' 117 | print 'Program interrupted by user...' 118 | break 119 | except: 120 | print "Unable to retrieve or parse page",url 121 | print sys.exc_info()[0] 122 | break 123 | 124 | print url,len(text) 125 | 126 | if not text.startswith("From "): 127 | if skip < 1 : 128 | print text 129 | print "End of mail stream reached..." 
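        # Several non-mbox responses in a row means we are past the last archived message, so stop the harvest entirely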
130 | quit () 131 | print " Skipping badly formed message" 132 | skip = skip-1 133 | continue 134 | 135 | pos = text.find("\n\n") 136 | if pos > 0 : 137 | hdr = text[:pos] 138 | body = text[pos+2:] 139 | else: 140 | print text 141 | print "Could not find break between headers and body" 142 | break 143 | 144 | skip = 5 # reset skip count 145 | 146 | email = None 147 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 148 | if len(x) == 1 : 149 | email = x[0]; 150 | email = email.strip().lower() 151 | email = email.replace("<","") 152 | else: 153 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 154 | if len(x) == 1 : 155 | email = x[0]; 156 | email = email.strip().lower() 157 | email = email.replace("<","") 158 | 159 | date = None 160 | y = re.findall('\nDate: .*, (.*)\n', hdr) 161 | if len(y) == 1 : 162 | tdate = y[0] 163 | tdate = tdate[:26] 164 | try: 165 | sent_at = parsemaildate(tdate) 166 | except: 167 | print text 168 | print "Parse fail",tdate 169 | break 170 | 171 | subject = None 172 | z = re.findall('\nSubject: (.*)\n', hdr) 173 | if len(z) == 1 : subject = z[0].strip().lower(); 174 | 175 | print " ",email,sent_at,subject 176 | cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body) 177 | VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body)) 178 | 179 | # Only commit every 50th record 180 | # if (many % 50) == 0 : conn.commit() 181 | time.sleep(1) 182 | 183 | conn.commit() 184 | cur.close() 185 | 186 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gmodel.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import re 5 | import zlib 6 | from datetime import datetime, timedelta 7 | # Not all systems have this 8 | try: 9 | import dateutil.parser as parser 10 | except: 11 | pass 12 | 13 | dnsmapping = dict() 14 | mapping = dict() 15 | 16 | def fixsender(sender,allsenders=None) : 17 | global dnsmapping 18 | global mapping 19 | if sender is None : return None 20 | sender = sender.strip().lower() 21 | sender = sender.replace('<','').replace('>','') 22 | 23 | # Check if we have a hacked gmane.org from address 24 | if allsenders is not None and sender.endswith('gmane.org') : 25 | pieces = sender.split('-') 26 | realsender = None 27 | for s in allsenders: 28 | if s.startswith(pieces[0]) : 29 | realsender = sender 30 | sender = s 31 | # print realsender, sender 32 | break 33 | if realsender is None : 34 | for s in mapping: 35 | if s.startswith(pieces[0]) : 36 | realsender = sender 37 | sender = mapping[s] 38 | # print realsender, sender 39 | break 40 | if realsender is None : sender = pieces[0] 41 | 42 | mpieces = sender.split("@") 43 | if len(mpieces) != 2 : return sender 44 | dns = mpieces[1] 45 | x = dns 46 | pieces = dns.split(".") 47 | if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") : 48 | dns = ".".join(pieces[-2:]) 49 | else: 50 | dns = ".".join(pieces[-3:]) 51 | # if dns != x : print x,dns 52 | # if dns != dnsmapping.get(dns,dns) : print dns,dnsmapping.get(dns,dns) 53 | dns = dnsmapping.get(dns,dns) 54 | return mpieces[0] + '@' + dns 55 | 56 | def parsemaildate(md) : 57 | # See if we have dateutil 58 | try: 59 | pdate = parser.parse(md) 60 | test_at = pdate.isoformat() 61 | return test_at 62 | except: 63 | pass 64 | 65 | # Non-dateutil version - we try our best 66 | 67 | pieces = md.split() 68 | notz = " ".join(pieces[:4]).strip() 69 | 70 | # 
Try a bunch of format variations - strptime() is *lame* 71 | dnotz = None 72 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 73 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 74 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 75 | try: 76 | dnotz = datetime.strptime(notz, form) 77 | break 78 | except: 79 | continue 80 | 81 | if dnotz is None : 82 | # print 'Bad Date:',md 83 | return None 84 | 85 | iso = dnotz.isoformat() 86 | 87 | tz = "+0000" 88 | try: 89 | tz = pieces[4] 90 | ival = int(tz) # Only want numeric timezone values 91 | if tz == '-0000' : tz = '+0000' 92 | tzh = tz[:3] 93 | tzm = tz[3:] 94 | tz = tzh+":"+tzm 95 | except: 96 | pass 97 | 98 | return iso+tz 99 | 100 | # Parse out the info... 101 | def parseheader(hdr, allsenders=None): 102 | if hdr is None or len(hdr) < 1 : return None 103 | sender = None 104 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 105 | if len(x) >= 1 : 106 | sender = x[0] 107 | else: 108 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 109 | if len(x) >= 1 : 110 | sender = x[0] 111 | 112 | # normalize the domain name of Email addresses 113 | sender = fixsender(sender, allsenders) 114 | 115 | date = None 116 | y = re.findall('\nDate: .*, (.*)\n', hdr) 117 | sent_at = None 118 | if len(y) >= 1 : 119 | tdate = y[0] 120 | tdate = tdate[:26] 121 | try: 122 | sent_at = parsemaildate(tdate) 123 | except Exception, e: 124 | # print 'Date ignored ',tdate, e 125 | return None 126 | 127 | subject = None 128 | z = re.findall('\nSubject: (.*)\n', hdr) 129 | if len(z) >= 1 : subject = z[0].strip().lower() 130 | 131 | guid = None 132 | z = re.findall('\nMessage-ID: (.*)\n', hdr) 133 | if len(z) >= 1 : guid = z[0].strip().lower() 134 | 135 | if sender is None or sent_at is None or subject is None or guid is None : 136 | return None 137 | return (guid, sender, subject, sent_at) 138 | 139 | # Open the output database and create empty tables 140 | conn = sqlite3.connect('index.sqlite') 141 | conn.text_factory = str 142 | cur = conn.cursor() 143 | 144 | cur.execute('''DROP TABLE IF EXISTS Messages ''') 145 | cur.execute('''DROP TABLE IF EXISTS Senders ''') 146 | cur.execute('''DROP TABLE IF EXISTS Subjects ''') 147 | cur.execute('''DROP TABLE IF EXISTS Replies ''') 148 | 149 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 150 | (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER, 151 | sender_id INTEGER, subject_id INTEGER, 152 | headers BLOB, body BLOB)''') 153 | cur.execute('''CREATE TABLE IF NOT EXISTS Senders 154 | (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''') 155 | cur.execute('''CREATE TABLE IF NOT EXISTS Subjects 156 | (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''') 157 | cur.execute('''CREATE TABLE IF NOT EXISTS Replies 158 | (from_id INTEGER, to_id INTEGER)''') 159 | 160 | # Open the mapping information 161 | conn_1 = sqlite3.connect('mapping.sqlite') 162 | conn_1.text_factory = str 163 | cur_1 = conn_1.cursor() 164 | 165 | # Load up the mapping information into memory structures 166 | cur_1.execute('''SELECT old,new FROM DNSMapping''') 167 | for message_row in cur_1 : 168 | dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower() 169 | 170 | mapping = dict() 171 | cur_1.execute('''SELECT old,new FROM Mapping''') 172 | for message_row in cur_1 : 173 | old = fixsender(message_row[0]) 174 | new = fixsender(message_row[1]) 175 | mapping[old] = fixsender(new) 176 | 177 | cur_1.close() 178 | 179 | # Open the raw data retrieved from the network 180 | conn_2 = sqlite3.connect('content.sqlite') 181 | 
conn_2.text_factory = str 182 | cur_2 = conn_2.cursor() 183 | 184 | allsenders = list() 185 | cur_2.execute('''SELECT email FROM Messages''') 186 | for message_row in cur_2 : 187 | sender = fixsender(message_row[0]) 188 | if sender is None : continue 189 | if 'gmane.org' in sender : continue 190 | if sender in allsenders: continue 191 | allsenders.append(sender) 192 | 193 | print "Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping) 194 | 195 | cur_2.execute('''SELECT headers, body, sent_at 196 | FROM Messages ORDER BY sent_at''') 197 | 198 | senders = dict() 199 | subjects = dict() 200 | guids = dict() 201 | 202 | count = 0 203 | 204 | for message_row in cur_2 : 205 | hdr = message_row[0] 206 | parsed = parseheader(hdr, allsenders) 207 | if parsed is None: continue 208 | (guid, sender, subject, sent_at) = parsed 209 | 210 | # Apply the sender mapping 211 | sender = mapping.get(sender,sender) 212 | 213 | count = count + 1 214 | if count % 250 == 1 : print count,sent_at, sender 215 | # print guid, sender, subject, sent_at 216 | 217 | if 'gmane.org' in sender: 218 | print "Error in sender ===", sender 219 | 220 | sender_id = senders.get(sender,None) 221 | subject_id = subjects.get(subject,None) 222 | guid_id = guids.get(guid,None) 223 | 224 | if sender_id is None : 225 | cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) ) 226 | conn.commit() 227 | cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, )) 228 | try: 229 | row = cur.fetchone() 230 | sender_id = row[0] 231 | senders[sender] = sender_id 232 | except: 233 | print 'Could not retrieve sender id',sender 234 | break 235 | if subject_id is None : 236 | cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) ) 237 | conn.commit() 238 | cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, )) 239 | try: 240 | row = cur.fetchone() 241 | subject_id = row[0] 242 | subjects[subject] = subject_id 243 | except: 244 | print 'Could not retrieve subject id',subject 245 | break 246 | # print sender_id, subject_id 247 | cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )', 248 | ( guid, sender_id, subject_id, sent_at, zlib.compress(message_row[0]), zlib.compress(message_row[1])) ) 249 | conn.commit() 250 | cur.execute('SELECT id FROM Messages WHERE guid=? 
LIMIT 1', ( guid, )) 251 | try: 252 | row = cur.fetchone() 253 | message_id = row[0] 254 | guids[guid] = message_id 255 | except: 256 | print 'Could not retrieve guid id',guid 257 | break 258 | 259 | # Close the connections 260 | cur.close() 261 | cur_2.close() 262 | 263 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gword.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 37 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gword.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import zlib 5 | import string 6 | 7 | conn = sqlite3.connect('index.sqlite') 8 | conn.text_factory = str 9 | cur = conn.cursor() 10 | 11 | cur.execute('''SELECT subject_id,subject FROM Messages 12 | JOIN Subjects ON Messages.subject_id = Subjects.id''') 13 | 14 | counts = dict() 15 | for message_row in cur : 16 | text = message_row[1] 17 | text = text.translate(None, string.punctuation) 18 | text = text.translate(None, '1234567890') 19 | text = text.strip() 20 | text = text.lower() 21 | words = text.split() 22 | for word in words: 23 | if len(word) < 4 : continue 24 | counts[word] = counts.get(word,0) + 1 25 | 26 | # Find the top 100 words 27 | words = sorted(counts, key=counts.get, reverse=True) 28 | highest = None 29 | lowest = None 30 | for w in words[:100]: 31 | if highest is None or highest < counts[w] : 32 | highest = counts[w] 33 | if lowest is None or lowest > counts[w] : 34 | lowest = counts[w] 35 | print 'Range of counts:',highest,lowest 36 | 37 | # Spread the font sizes across 20-100 based on the count 38 | bigsize = 80 39 | smallsize = 20 40 | 41 | fhand = open('gword.js','w') 42 | fhand.write("gword = [") 43 | first = True 44 | for k in words[:100]: 45 | if not first : fhand.write( ",\n") 46 | first = False 47 | size = counts[k] 48 | size = (size - lowest) / float(highest - lowest) 49 | size = int((size * bigsize) + smallsize) 50 | fhand.write("{text: '"+k+"', size: "+str(size)+"}") 51 | fhand.write( "\n];\n") 52 | 53 | print "Output written to gword.js" 54 | print "Open gword.htm in a browser to view" 55 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/gyear.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib 4 | import zlib 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | conn.text_factory = str 8 | cur = conn.cursor() 9 | 10 | # Determine the top ten organizations 11 | cur.execute('''SELECT Messages.id, sender FROM Messages 12 | JOIN Senders ON Messages.sender_id = Senders.id''') 13 | 14 | sendorgs = dict() 15 | for message_row in cur : 16 | sender = message_row[1] 17 | pieces = sender.split("@") 18 | if len(pieces) != 2 : continue 19 | dns = pieces[1] 20 | sendorgs[dns] = sendorgs.get(dns,0) + 1 21 | 22 | # pick the top schools 23 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 24 | orgs = orgs[:10] 25 | print "Top 10 Organizations" 26 | print orgs 27 | # orgs = ['total'] + orgs 28 | 29 | # Read through the messages 30 | counts = dict() 31 | years = list() 32 | 33 | cur.execute('''SELECT Messages.id, sender, sent_at FROM Messages 34 | JOIN Senders ON Messages.sender_id = Senders.id''') 35 | 36 | for message_row in cur : 37 | sender = message_row[1] 38 | pieces = sender.split("@") 39 | if 
len(pieces) != 2 : continue 40 | dns = pieces[1] 41 | if dns not in orgs : continue 42 | year = message_row[2][:4] 43 | if year not in years : years.append(year) 44 | key = (year, dns) 45 | counts[key] = counts.get(key,0) + 1 46 | tkey = (year, 'total') 47 | counts[tkey] = counts.get(tkey,0) + 1 48 | 49 | years.sort() 50 | print counts 51 | print years 52 | 53 | fhand = open('gline.js','w') 54 | fhand.write("gline = [ ['Year'") 55 | for org in orgs: 56 | fhand.write(",'"+org+"'") 57 | fhand.write("]") 58 | 59 | # for year in years[1:-1]: 60 | for year in years: 61 | fhand.write(",\n['"+year+"'") 62 | for org in orgs: 63 | key = (year, org) 64 | val = counts.get(key,0) 65 | fhand.write(","+str(val)) 66 | fhand.write("]"); 67 | 68 | fhand.write("\n];\n") 69 | 70 | print "Data written to gline.js" 71 | print "Open gline.htm in a browser to view" 72 | 73 | -------------------------------------------------------------------------------- /EXAMPLE CODE/gmane/mapping.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/gmane/mapping.sqlite -------------------------------------------------------------------------------- /EXAMPLE CODE/grade.py: -------------------------------------------------------------------------------- 1 | inp = raw_input('Enter score: ') 2 | try: 3 | score = float(inp) 4 | except: 5 | score = -1 6 | 7 | if score > 1.0 or score < 0.0: 8 | print 'Bad score' 9 | elif score > 0.9: 10 | print 'A' 11 | elif score > 0.8: 12 | print 'B' 13 | elif score > 0.7: 14 | print 'C' 15 | elif score > 0.6: 16 | print 'D' 17 | else: 18 | print 'F' 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/graphics/histogram.py: -------------------------------------------------------------------------------- 1 | import string 2 | from graphics import * 3 | 4 | fname = raw_input("Enter file name:") 5 | if len(fname) == 0 : 6 | print "Assuming mbox-short.txt" 7 | fname = "mbox-short.txt" 8 | infile = open(fname, "r") 9 | 10 | # Set up a 24 element list of zeros 11 | totals = [0] * 24; 12 | print totals; 13 | 14 | # Accumulate the times 15 | for line in infile: 16 | if line[0:5] == "From " : 17 | words = line.split() 18 | time = words[5] 19 | print "Time", time 20 | 21 | # Split time 22 | tsplit = time.split(':') 23 | try : 24 | hour = int(tsplit[0]) 25 | print "Hour", hour 26 | except: 27 | print "Hour not found" 28 | continue 29 | 30 | totals[hour] = totals[hour] + 1 31 | print totals 32 | 33 | bmax = max(totals) 34 | print "Maximum value", bmax 35 | 36 | ymax = ( int(bmax / 10) + 1 ) * 10 37 | 38 | print "Y-Axis Maximum", ymax 39 | 40 | win = GraphWin("Distribution of Commits "+fname, 600,400) 41 | win.setCoords(0,0,1,1) 42 | 43 | # Draw the X-Axis 44 | xaxis = Line(Point(0.1,0.1),Point(0.9,0.1)) 45 | xaxis.draw(win) 46 | 47 | # Label the X-Axis - we have 24 hours (0-23) 48 | # so we need to know each slot's width 49 | width = 0.8 * (1.0 / 24.0) 50 | for i in range(24): 51 | center = (i * width) + (width / 2.0) + 0.1; 52 | txt = Text(Point(center, 0.066), str(i)) 53 | txt.draw(win) 54 | 55 | txt = Text(Point(0.5,0.033),"Hour of the Day"); 56 | txt.draw(win) 57 | 58 | # Draw the Y-Axis 59 | yaxis = Line(Point(0.1,0.1),Point(0.1,0.9)) 60 | yaxis.draw(win) 61 | 62 | # Label the Y-Axis 63 | # we will have 10 labels up to ymax 64 | unit = ymax / 10.0; 65 | for i in range(10) : 66 | 
center = 0.1 + (i + 1) * 0.08; 67 | value = int( (i + 1) * unit ) ; 68 | txt = Text(Point(0.066,center), str(value)) 69 | txt.draw(win) 70 | 71 | 72 | # Draw the bars 73 | for i in range(24): 74 | if totals[i] == 0: 75 | continue 76 | left = i * width + 0.1; 77 | right = i * width + width + 0.1; 78 | height = (float(totals[i]) / ymax) * 0.8; 79 | rec = Rectangle(Point(left,0.1), Point(right,0.1+height)) 80 | rec.setFill('blue') 81 | rec.draw(win) 82 | 83 | win.getMouse() 84 | -------------------------------------------------------------------------------- /EXAMPLE CODE/greet.py: -------------------------------------------------------------------------------- 1 | name = raw_input('Enter your name:') 2 | print 'Hello', name 3 | -------------------------------------------------------------------------------- /EXAMPLE CODE/grep.py: -------------------------------------------------------------------------------- 1 | # Search for lines that start with From and have an at sign 2 | import re 3 | hand = open('mbox.txt') 4 | search = raw_input('Enter a regular expression: ') 5 | count = 0 6 | for line in hand: 7 | line = line.rstrip() 8 | if re.search(search,line) : count = count + 1 9 | 10 | print 'mbox.txt had',count,'lines that matched',search 11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/hidden.py: -------------------------------------------------------------------------------- 1 | # Keep this file separate 2 | # https://apps.twitter.com/ 3 | 4 | def oauth() : 5 | return { "consumer_key" : "h7Lu...Ng", 6 | "consumer_secret" : "dNKenAC3New...mmn7Q", 7 | "token_key" : "10185562-eibxCp9n2...P4GEQQOSGI", 8 | "token_secret" : "H0ycCFemmC4wyf1...qoIpBo" } 9 | -------------------------------------------------------------------------------- /EXAMPLE CODE/intro-short.txt: -------------------------------------------------------------------------------- 1 | Why should you learn to write programs? 2 | 3 | Writing programs (or programming) is a very creative 4 | and rewarding activity. You can write programs for 5 | many reasons, ranging from making your living to solving 6 | a difficult data analysis problem to having fun to helping 7 | someone else solve a problem. This book assumes that 8 | everyone needs to know how to program, and that once 9 | you know how to program you will figure out what you want 10 | to do with your newfound skills. 
11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/json1.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | data = """{ 4 | "name" : "Chuck", 5 | "phone" : { 6 | "type" : "intl", 7 | "number" : "+1 734 303 4456" 8 | }, 9 | "email" : { 10 | "hide" : "yes" 11 | } 12 | }""" 13 | 14 | info = json.loads(data) 15 | print 'Name:',info["name"] 16 | print 'Hide:',info["email"]["hide"] 17 | -------------------------------------------------------------------------------- /EXAMPLE CODE/json2.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | input = ''' 4 | [ 5 | { "id" : "001", 6 | "x" : "2", 7 | "name" : "Chuck" 8 | } , 9 | { "id" : "009", 10 | "x" : "7", 11 | "name" : "Chuck" 12 | } 13 | ]''' 14 | 15 | info = json.loads(input) 16 | print 'User count:', len(info) 17 | 18 | for item in info: 19 | print 'Name', item['name'] 20 | print 'Id', item['id'] 21 | print 'Attribute', item['x'] 22 | 23 | -------------------------------------------------------------------------------- /EXAMPLE CODE/largest.py: -------------------------------------------------------------------------------- 1 | largest = None 2 | print 'Before:', largest 3 | for iterval in [3, 41, 12, 9, 74, 15]: 4 | if largest == None or largest < iterval: 5 | largest = iterval 6 | print 'Loop:', iterval, largest 7 | print 'Largest:', largest 8 | 9 | -------------------------------------------------------------------------------- /EXAMPLE CODE/mailcount.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter file name: ') 2 | fhand = open(fname) 3 | c = dict() 4 | for line in fhand: 5 | if not line.startswith('From ') : continue 6 | pieces = line.split() 7 | email = pieces[1] 8 | c[email] = c.get(email,0) + 1 9 | 10 | print c 11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/mailtop.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter file name: ') 2 | fhand = open(fname) 3 | c = dict() 4 | for line in fhand: 5 | if not line.startswith('From ') : continue 6 | pieces = line.split() 7 | email = pieces[1] 8 | c[email] = c.get(email,0) + 1 9 | 10 | bigc = None 11 | bigw = None 12 | for word in c: 13 | value = c[word] 14 | if bigc == None or value > bigc: 15 | bigw = word 16 | bigc = value 17 | 18 | print bigw, bigc 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /EXAMPLE CODE/open.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox.txt') 2 | count = 0 3 | for line in fhand: 4 | count = count + 1 5 | print 'Line Count:', count 6 | 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/pagerank.zip -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Michael Bostock 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * The name Michael Bostock may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/README.txt: -------------------------------------------------------------------------------- 1 | Simple Python Search Spider, Page Ranker, and Visualizer 2 | 3 | This is a set of programs that emulate some of the functions of a 4 | search engine. They store their data in a SQLITE3 database named 5 | 'spider.sqlite'. This file can be removed at any time to restart the 6 | process. 7 | 8 | You should install the SQLite browser to view and modify 9 | the databases from: 10 | 11 | http://sqlitebrowser.org/ 12 | 13 | This program crawls a web site and pulls a series of pages into the 14 | database, recording the links between pages. 15 | 16 | Mac: rm spider.sqlite 17 | Mac: python spider.py 18 | 19 | Win: del spider.sqlite 20 | Win: spider.py 21 | 22 | Enter web url or enter: http://www.dr-chuck.com/ 23 | ['http://www.dr-chuck.com'] 24 | How many pages:2 25 | 1 http://www.dr-chuck.com/ 12 26 | 2 http://www.dr-chuck.com/csev-blog/ 57 27 | How many pages: 28 | 29 | In this sample run, we told it to crawl a website and retrieve two 30 | pages. If you restart the program again and tell it to crawl more 31 | pages, it will not re-crawl any pages already in the database. Upon 32 | restart it goes to a random non-crawled page and starts there. So 33 | each successive run of spider.py is additive. 34 | 35 | Mac: python spider.py 36 | Win: spider.py 37 | 38 | Enter web url or enter: http://www.dr-chuck.com/ 39 | ['http://www.dr-chuck.com'] 40 | How many pages:3 41 | 3 http://www.dr-chuck.com/csev-blog 57 42 | 4 http://www.dr-chuck.com/dr-chuck/resume/speaking.htm 1 43 | 5 http://www.dr-chuck.com/dr-chuck/resume/index.htm 13 44 | How many pages: 45 | 46 | You can have multiple starting points in the same database - 47 | within the program these are called "webs". The spider 48 | chooses randomly amongst all non-visited links across all 49 | the webs. 
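The "restart at a random non-crawled page" behavior is a single query
in spider.py. A minimal sketch of just that selection step (assuming
spider.sqlite already holds the Pages table that spider.py creates):

    import sqlite3

    conn = sqlite3.connect('spider.sqlite')
    cur = conn.cursor()
    # Pick one page at random that has not been retrieved yet
    # (html is NULL) and has not previously failed (error is NULL)
    cur.execute('''SELECT id,url FROM Pages
        WHERE html is NULL and error is NULL
        ORDER BY RANDOM() LIMIT 1''')
    row = cur.fetchone()
    if row is None:
        print 'Nothing left to crawl in this database'
    else:
        print 'Next page to crawl:', row[1]
    cur.close()

Because the next page is chosen with ORDER BY RANDOM(), successive runs
wander the frontier of every web rather than re-crawling what is
already stored.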
50 | 51 | If your code fails complaining about certificate problems, 52 | there is some code (SSL) that can be un-commented to work 53 | around certificate problems. 54 | 55 | If you want to dump the contents of the spider.sqlite file, you can 56 | run spdump.py as follows: 57 | 58 | Mac: python spdump.py 59 | Win: spdump.py 60 | 61 | (5, None, 1.0, 3, u'http://www.dr-chuck.com/csev-blog') 62 | (3, None, 1.0, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 63 | (1, None, 1.0, 2, u'http://www.dr-chuck.com/csev-blog/') 64 | (1, None, 1.0, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 65 | 4 rows. 66 | 67 | This shows the number of incoming links, the old page rank, the new page 68 | rank, the id of the page, and the url of the page. The spdump.py program 69 | only shows pages that have at least one incoming link to them. 70 | 71 | Once you have a few pages in the database, you can run Page Rank on the 72 | pages using the sprank.py program. You simply tell it how many Page 73 | Rank iterations to run. 74 | 75 | Mac: python sprank.py 76 | Win: sprank.py 77 | 78 | How many iterations:2 79 | 1 0.546848992536 80 | 2 0.226714939664 81 | [(1, 0.559), (2, 0.659), (3, 0.985), (4, 2.135), (5, 0.659)] 82 | 83 | You can dump the database again to see that page rank has been updated: 84 | 85 | Mac: python spdump.py 86 | Win: spdump.py 87 | 88 | (5, 1.0, 0.985, 3, u'http://www.dr-chuck.com/csev-blog') 89 | (3, 1.0, 2.135, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 90 | (1, 1.0, 0.659, 2, u'http://www.dr-chuck.com/csev-blog/') 91 | (1, 1.0, 0.659, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 92 | 4 rows. 93 | 94 | You can run sprank.py as many times as you like and it will simply refine 95 | the page rank the more times you run it. You can even run sprank.py a few times 96 | and then go spider a few more pages with spider.py and then run sprank.py 97 | to converge the page ranks. 98 | 99 | If you want to restart the Page Rank calculations without re-spidering the 100 | web pages, you can use spreset.py 101 | 102 | Mac: python spreset.py 103 | Win: spreset.py 104 | 105 | All pages set to a rank of 1.0 106 | 107 | Mac: python sprank.py 108 | Win: sprank.py 109 | 110 | How many iterations:50 111 | 1 0.546848992536 112 | 2 0.226714939664 113 | 3 0.0659516187242 114 | 4 0.0244199333 115 | 5 0.0102096489546 116 | 6 0.00610244329379 117 | ... 118 | 42 0.000109076928206 119 | 43 9.91987599002e-05 120 | 44 9.02151706798e-05 121 | 45 8.20451504471e-05 122 | 46 7.46150183837e-05 123 | 47 6.7857770908e-05 124 | 48 6.17124694224e-05 125 | 49 5.61236959327e-05 126 | 50 5.10410499467e-05 127 | [(512, 0.02963718031139026), (1, 12.790786721866658), (2, 28.939418898678284), (3, 6.808468390725946), (4, 13.469889092397006)] 128 | 129 | For each iteration of the page rank algorithm it prints the average 130 | change per page of the page rank. The network initially is quite 131 | unbalanced and so the individual page ranks are changing wildly. 132 | But in a few short iterations, the page rank converges. You 133 | should run sprank.py long enough that the page ranks converge. 134 | 135 | If you want to visualize the current top pages in terms of page rank, 136 | run spjson.py to write the pages out in JSON format to be viewed in a 137 | web browser. 138 | 139 | Mac: python spjson.py 140 | Win: spjson.py 141 | 142 | Creating JSON output on spider.js... 143 | How many nodes?
30 144 | Open force.html in a browser to view the visualization 145 | 146 | You can view this data by opening the file force.html in your web browser. 147 | This shows an automatic layout of the nodes and links. You can click and 148 | drag any node and you can also double click on a node to find the URL 149 | that is represented by the node. 150 | 151 | This visualization is provided using the force layout from: 152 | 153 | http://mbostock.github.com/d3/ 154 | 155 | If you rerun the other utilities and then re-run spjson.py - you merely 156 | have to press refresh in the browser to get the new data from spider.js. 157 | 158 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/force.css: -------------------------------------------------------------------------------- 1 | circle.node { 2 | stroke: #fff; 3 | stroke-width: 1.5px; 4 | } 5 | 6 | line.link { 7 | stroke: #999; 8 | stroke-opacity: .6; 9 | } 10 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/force.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Force-Directed Layout 5 | 6 | 7 | 8 | 9 | 10 | 13 |
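<!-- force.js renders the graph into the #chart div above; it expects the global spiderJson that spjson.py writes into spider.js -->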
14 | 15 |

If you don't see a chart above, check the JavaScript console. You may 16 | need to use a different browser.

17 | 18 | 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/force.js: -------------------------------------------------------------------------------- 1 | var width = 600, 2 | height = 600; 3 | 4 | var color = d3.scale.category20(); 5 | 6 | var dist = (width + height) / 4; 7 | 8 | var force = d3.layout.force() 9 | .charge(-120) 10 | .linkDistance(dist) 11 | .size([width, height]); 12 | 13 | function getrank(rval) { 14 | return (rval/2.0) + 3; 15 | } 16 | 17 | function getcolor(rval) { 18 | return color(rval); 19 | } 20 | 21 | var svg = d3.select("#chart").append("svg") 22 | .attr("width", width) 23 | .attr("height", height); 24 | 25 | function loadData(json) { 26 | force 27 | .nodes(json.nodes) 28 | .links(json.links); 29 | 30 | var k = Math.sqrt(json.nodes.length / (width * height)); 31 | 32 | force 33 | .charge(-10 / k) 34 | .gravity(100 * k) 35 | .start(); 36 | 37 | var link = svg.selectAll("line.link") 38 | .data(json.links) 39 | .enter().append("line") 40 | .attr("class", "link") 41 | .style("stroke-width", function(d) { return Math.sqrt(d.value); }); 42 | 43 | var node = svg.selectAll("circle.node") 44 | .data(json.nodes) 45 | .enter().append("circle") 46 | .attr("class", "node") 47 | .attr("r", function(d) { return getrank(d.rank); } ) 48 | .style("fill", function(d) { return getcolor(d.rank); }) 49 | .on("dblclick",function(d) { 50 | if ( confirm('Do you want to open '+d.url) ) 51 | window.open(d.url,'_new',''); 52 | d3.event.stopPropagation(); 53 | }) 54 | .call(force.drag); 55 | 56 | node.append("title") 57 | .text(function(d) { return d.url; }); 58 | 59 | force.on("tick", function() { 60 | link.attr("x1", function(d) { return d.source.x; }) 61 | .attr("y1", function(d) { return d.source.y; }) 62 | .attr("x2", function(d) { return d.target.x; }) 63 | .attr("y2", function(d) { return d.target.y; }); 64 | 65 | node.attr("cx", function(d) { return d.x; }) 66 | .attr("cy", function(d) { return d.y; }); 67 | }); 68 | 69 | } 70 | loadData(spiderJson); 71 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/spdump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 7 | FROM Pages JOIN Links ON Pages.id = Links.to_id 8 | WHERE html IS NOT NULL 9 | GROUP BY id ORDER BY inbound DESC''') 10 | 11 | count = 0 12 | for row in cur : 13 | if count < 50 : print row 14 | count = count + 1 15 | print count, 'rows.' 
16 | cur.close() 17 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/spider.js: -------------------------------------------------------------------------------- 1 | spiderJson = {"nodes":[ 2 | {"weight":1,"rank":0.0, "id":1, "url":"http://python-data.dr-chuck.net"}, 3 | {"weight":1,"rank":4.66423227024, "id":4, "url":"http://python-data.dr-chuck.net/comments_42.html"}, 4 | {"weight":1,"rank":1.38142061792, "id":7, "url":"http://python-data.dr-chuck.net/known_by_42.html"}, 5 | {"weight":1,"rank":0.690710255581, "id":9, "url":"http://python-data.dr-chuck.net/known_by_Kaylyn.html"}, 6 | {"weight":2,"rank":2.26669663573, "id":40, "url":"http://python-data.dr-chuck.net/known_by_Takua.html"}, 7 | {"weight":1,"rank":0.690710255581, "id":82, "url":"http://python-data.dr-chuck.net/known_by_Marwan.html"}, 8 | {"weight":2,"rank":7.45553422719, "id":85, "url":"http://python-data.dr-chuck.net/known_by_Samiya.html"}, 9 | {"weight":2,"rank":8.48734569457, "id":145, "url":"http://python-data.dr-chuck.net/known_by_Shihed.html"}, 10 | {"weight":1,"rank":0.518032667194, "id":189, "url":"http://python-data.dr-chuck.net/known_by_Cassidy.html"}, 11 | {"weight":2,"rank":1.56869025396, "id":199, "url":"http://python-data.dr-chuck.net/known_by_Vinnie.html"}, 12 | {"weight":2,"rank":2.54881807574, "id":203, "url":"http://python-data.dr-chuck.net/known_by_Charlee.html"}, 13 | {"weight":1,"rank":8.83695381234, "id":248, "url":"http://python-data.dr-chuck.net/known_by_Atli.html"}, 14 | {"weight":2,"rank":4.16614971195, "id":309, "url":"http://python-data.dr-chuck.net/known_by_Abbiegail.html"}, 15 | {"weight":2,"rank":2.2314317079, "id":326, "url":"http://python-data.dr-chuck.net/known_by_Nisha.html"}, 16 | {"weight":1,"rank":1.21603900362, "id":382, "url":"http://python-data.dr-chuck.net/known_by_Ciar.html"}, 17 | {"weight":1,"rank":1.89945314693, "id":413, "url":"http://python-data.dr-chuck.net/known_by_Brodie.html"}, 18 | {"weight":2,"rank":19.0, "id":501, "url":"http://python-data.dr-chuck.net/known_by_Kylar.html"}, 19 | {"weight":2,"rank":5.3834045047, "id":642, "url":"http://python-data.dr-chuck.net/known_by_Mohamed.html"}, 20 | {"weight":1,"rank":3.93023811326, "id":676, "url":"http://python-data.dr-chuck.net/known_by_Oluwaferanmi.html"}, 21 | {"weight":1,"rank":2.59745947896, "id":813, "url":"http://python-data.dr-chuck.net/known_by_Maree.html"}, 22 | {"weight":1,"rank":1.77055254257, "id":873, "url":"http://python-data.dr-chuck.net/known_by_Shaw.html"}], 23 | "links":[ 24 | {"source":0,"target":1,"value":3}, 25 | {"source":0,"target":2,"value":3}, 26 | {"source":0,"target":0,"value":3}, 27 | {"source":2,"target":3,"value":3}, 28 | {"source":2,"target":4,"value":3}, 29 | {"source":2,"target":5,"value":3}, 30 | {"source":2,"target":6,"value":3}, 31 | {"source":5,"target":7,"value":3}, 32 | {"source":5,"target":8,"value":3}, 33 | {"source":5,"target":9,"value":3}, 34 | {"source":5,"target":10,"value":3}, 35 | {"source":6,"target":11,"value":3}, 36 | {"source":4,"target":12,"value":3}, 37 | {"source":4,"target":13,"value":3}, 38 | {"source":4,"target":14,"value":3}, 39 | {"source":8,"target":15,"value":3}, 40 | {"source":7,"target":16,"value":3}, 41 | {"source":13,"target":17,"value":3}, 42 | {"source":10,"target":18,"value":3}, 43 | {"source":14,"target":19,"value":3}, 44 | {"source":18,"target":20,"value":3}, 45 | {"source":18,"target":17,"value":3}, 46 | {"source":20,"target":9,"value":3}, 47 | {"source":17,"target":6,"value":3}, 48 | 
{"source":9,"target":12,"value":3}]}; -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/spider.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import urllib 3 | import ssl 4 | from urlparse import urljoin 5 | from urlparse import urlparse 6 | from BeautifulSoup import * 7 | 8 | # Deal with SSL certificate anomalies Python > 2.7 9 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1) 10 | scontext = None 11 | 12 | conn = sqlite3.connect('spider.sqlite') 13 | cur = conn.cursor() 14 | 15 | cur.execute('''CREATE TABLE IF NOT EXISTS Pages 16 | (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT, 17 | error INTEGER, old_rank REAL, new_rank REAL)''') 18 | 19 | cur.execute('''CREATE TABLE IF NOT EXISTS Links 20 | (from_id INTEGER, to_id INTEGER)''') 21 | 22 | cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''') 23 | 24 | # Check to see if we are already in progress... 25 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 26 | row = cur.fetchone() 27 | if row is not None: 28 | print "Restarting existing crawl. Remove spider.sqlite to start a fresh crawl." 29 | else : 30 | starturl = raw_input('Enter web url or enter: ') 31 | if ( len(starturl) < 1 ) : starturl = 'http://python-data.dr-chuck.net/' 32 | if ( starturl.endswith('/') ) : starturl = starturl[:-1] 33 | web = starturl 34 | if ( starturl.endswith('.htm') or starturl.endswith('.html') ) : 35 | pos = starturl.rfind('/') 36 | web = starturl[:pos] 37 | 38 | if ( len(web) > 1 ) : 39 | cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', ( web, ) ) 40 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) ) 41 | conn.commit() 42 | 43 | # Get the current webs 44 | cur.execute('''SELECT url FROM Webs''') 45 | webs = list() 46 | for row in cur: 47 | webs.append(str(row[0])) 48 | 49 | print webs 50 | 51 | many = 0 52 | while True: 53 | if ( many < 1 ) : 54 | sval = raw_input('How many pages:') 55 | if ( len(sval) < 1 ) : break 56 | many = int(sval) 57 | many = many - 1 58 | 59 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 60 | try: 61 | row = cur.fetchone() 62 | # print row 63 | fromid = row[0] 64 | url = row[1] 65 | except: 66 | print 'No unretrieved HTML pages found' 67 | many = 0 68 | break 69 | 70 | print fromid, url, 71 | 72 | # If we are retrieving this page, there should be no links from it 73 | cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) ) 74 | try: 75 | # Deal with SSL certificate anomalies Python > 2.7 76 | # scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1) 77 | # document = urllib.urlopen(url, context=scontext) 78 | 79 | # Normal Unless you encounter certificate problems 80 | document = urllib.urlopen(url) 81 | 82 | html = document.read() 83 | if document.getcode() != 200 : 84 | print "Error on page: ",document.getcode() 85 | cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) ) 86 | 87 | if 'text/html' != document.info().gettype() : 88 | print "Ignore non text/html page" 89 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) ) 90 | conn.commit() 91 | continue 92 | 93 | print '('+str(len(html))+')', 94 | 95 | soup = BeautifulSoup(html) 96 | except KeyboardInterrupt: 97 | print '' 98 | print 'Program interrupted by user...' 
99 | break 100 | except: 101 | print "Unable to retrieve or parse page" 102 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) ) 103 | conn.commit() 104 | continue 105 | 106 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) ) 107 | cur.execute('UPDATE Pages SET html=? WHERE url=?', (buffer(html), url ) ) 108 | conn.commit() 109 | 110 | # Retrieve all of the anchor tags 111 | tags = soup('a') 112 | count = 0 113 | for tag in tags: 114 | href = tag.get('href', None) 115 | if ( href is None ) : continue 116 | # Resolve relative references like href="/contact" 117 | up = urlparse(href) 118 | if ( len(up.scheme) < 1 ) : 119 | href = urljoin(url, href) 120 | ipos = href.find('#') 121 | if ( ipos > 1 ) : href = href[:ipos] 122 | if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue 123 | if ( href.endswith('/') ) : href = href[:-1] 124 | # print href 125 | if ( len(href) < 1 ) : continue 126 | 127 | # Check if the URL is in any of the webs 128 | found = False 129 | for web in webs: 130 | if ( href.startswith(web) ) : 131 | found = True 132 | break 133 | if not found : continue 134 | 135 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) ) 136 | count = count + 1 137 | conn.commit() 138 | 139 | cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', ( href, )) 140 | try: 141 | row = cur.fetchone() 142 | toid = row[0] 143 | except: 144 | print 'Could not retrieve id' 145 | continue 146 | # print fromid, toid 147 | cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', ( fromid, toid ) ) 148 | 149 | 150 | print count 151 | 152 | cur.close() 153 | 154 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/spjson.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | print "Creating JSON output on spider.js..." 7 | howmany = int(raw_input("How many nodes? 
")) 8 | 9 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 10 | FROM Pages JOIN Links ON Pages.id = Links.to_id 11 | WHERE html IS NOT NULL AND ERROR IS NULL 12 | GROUP BY id ORDER BY id,inbound''') 13 | 14 | fhand = open('spider.js','w') 15 | nodes = list() 16 | maxrank = None 17 | minrank = None 18 | for row in cur : 19 | nodes.append(row) 20 | rank = row[2] 21 | if maxrank < rank or maxrank is None : maxrank = rank 22 | if minrank > rank or minrank is None : minrank = rank 23 | if len(nodes) > howmany : break 24 | 25 | if maxrank == minrank or maxrank is None or minrank is None: 26 | print "Error - please run sprank.py to compute page rank" 27 | quit() 28 | 29 | fhand.write('spiderJson = {"nodes":[\n') 30 | count = 0 31 | map = dict() 32 | ranks = dict() 33 | for row in nodes : 34 | if count > 0 : fhand.write(',\n') 35 | # print row 36 | rank = row[2] 37 | rank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 38 | fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',') 39 | fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}') 40 | map[row[3]] = count 41 | ranks[row[3]] = rank 42 | count = count + 1 43 | fhand.write('],\n') 44 | 45 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 46 | fhand.write('"links":[\n') 47 | 48 | count = 0 49 | for row in cur : 50 | # print row 51 | if row[0] not in map or row[1] not in map : continue 52 | if count > 0 : fhand.write(',\n') 53 | rank = ranks[row[0]] 54 | srank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 55 | fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}') 56 | count = count + 1 57 | fhand.write(']};') 58 | fhand.close() 59 | cur.close() 60 | 61 | print "Open force.html in a browser to view the visualization" 62 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/sprank.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | # Find the ids that send out page rank - we only are interested 7 | # in pages in the SCC that have in and out links 8 | cur.execute('''SELECT DISTINCT from_id FROM Links''') 9 | from_ids = list() 10 | for row in cur: 11 | from_ids.append(row[0]) 12 | 13 | # Find the ids that receive page rank 14 | to_ids = list() 15 | links = list() 16 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 17 | for row in cur: 18 | from_id = row[0] 19 | to_id = row[1] 20 | if from_id == to_id : continue 21 | if from_id not in from_ids : continue 22 | if to_id not in from_ids : continue 23 | links.append(row) 24 | if to_id not in to_ids : to_ids.append(to_id) 25 | 26 | # Get latest page ranks for strongly connected component 27 | prev_ranks = dict() 28 | for node in from_ids: 29 | cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node, )) 30 | row = cur.fetchone() 31 | prev_ranks[node] = row[0] 32 | 33 | sval = raw_input('How many iterations:') 34 | many = 1 35 | if ( len(sval) > 0 ) : many = int(sval) 36 | 37 | # Sanity check 38 | if len(prev_ranks) < 1 : 39 | print "Nothing to page rank. Check data." 
40 | quit() 41 | 42 | # Let's do Page Rank in memory so it is really fast 43 | for i in range(many): 44 | # print prev_ranks.items()[:5] 45 | next_ranks = dict(); 46 | total = 0.0 47 | for (node, old_rank) in prev_ranks.items(): 48 | total = total + old_rank 49 | next_ranks[node] = 0.0 50 | # print total 51 | 52 | # Find the number of outbound links and send the page rank down each 53 | for (node, old_rank) in prev_ranks.items(): 54 | # print node, old_rank 55 | give_ids = list() 56 | for (from_id, to_id) in links: 57 | if from_id != node : continue 58 | # print ' ',from_id,to_id 59 | 60 | if to_id not in to_ids: continue 61 | give_ids.append(to_id) 62 | if ( len(give_ids) < 1 ) : continue 63 | amount = old_rank / len(give_ids) 64 | # print node, old_rank,amount, give_ids 65 | 66 | for id in give_ids: 67 | next_ranks[id] = next_ranks[id] + amount 68 | 69 | newtot = 0 70 | for (node, next_rank) in next_ranks.items(): 71 | newtot = newtot + next_rank 72 | evap = (total - newtot) / len(next_ranks) 73 | 74 | # print newtot, evap 75 | for node in next_ranks: 76 | next_ranks[node] = next_ranks[node] + evap 77 | 78 | newtot = 0 79 | for (node, next_rank) in next_ranks.items(): 80 | newtot = newtot + next_rank 81 | 82 | # Compute the per-page average change from old rank to new rank 83 | # as an indication of convergence of the algorithm 84 | totdiff = 0 85 | for (node, old_rank) in prev_ranks.items(): 86 | new_rank = next_ranks[node] 87 | diff = abs(old_rank-new_rank) 88 | totdiff = totdiff + diff 89 | 90 | avediff = totdiff / len(prev_ranks) 91 | print i+1, avediff 92 | 93 | # rotate 94 | prev_ranks = next_ranks 95 | 96 | # Put the final ranks back into the database 97 | print next_ranks.items()[:5] 98 | cur.execute('''UPDATE Pages SET old_rank=new_rank''') 99 | for (id, new_rank) in next_ranks.items() : 100 | cur.execute('''UPDATE Pages SET new_rank=? WHERE id=?''', (new_rank, id)) 101 | conn.commit() 102 | cur.close() 103 | 104 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pagerank/spreset.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''UPDATE Pages SET new_rank=1.0, old_rank=0.0''') 7 | conn.commit() 8 | 9 | cur.close() 10 | 11 | print "All pages set to a rank of 1.0" 12 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pals.py: -------------------------------------------------------------------------------- 1 | friends = ['Joseph', 'Glenn', 'Sally'] 2 | for friend in friends: 3 | print 'Happy New Year:', friend 4 | print 'Done!' 
5 | 6 | -------------------------------------------------------------------------------- /EXAMPLE CODE/party1.py: -------------------------------------------------------------------------------- 1 | class PartyAnimal: 2 | x = 0 3 | 4 | def party(self) : 5 | self.x = self.x + 1 6 | print "So far",self.x 7 | 8 | an = PartyAnimal() 9 | 10 | an.party() 11 | an.party() 12 | an.party() 13 | 14 | -------------------------------------------------------------------------------- /EXAMPLE CODE/party2.py: -------------------------------------------------------------------------------- 1 | class PartyAnimal: 2 | x = 0 3 | 4 | def party(self) : 5 | self.x = self.x + 1 6 | print "So far",self.x 7 | 8 | an = PartyAnimal() 9 | 10 | print "Type", type(an) 11 | print "Dir ", dir(an) 12 | 13 | -------------------------------------------------------------------------------- /EXAMPLE CODE/party3.py: -------------------------------------------------------------------------------- 1 | class PartyAnimal: 2 | x = 0 3 | 4 | def __init__(self): 5 | print "I am constructed" 6 | 7 | def party(self) : 8 | self.x = self.x + 1 9 | print "So far",self.x 10 | 11 | def __del__(self): 12 | print "I am destructed", self.x 13 | 14 | an = PartyAnimal() 15 | an.party() 16 | an.party() 17 | an.party() 18 | 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/party4.py: -------------------------------------------------------------------------------- 1 | class PartyAnimal: 2 | x = 0 3 | name = "" 4 | def __init__(self, nam): 5 | self.name = nam 6 | print self.name,"constructed" 7 | 8 | def party(self) : 9 | self.x = self.x + 1 10 | print self.name,"party count",self.x 11 | 12 | s = PartyAnimal("Sally") 13 | s.party() 14 | 15 | j = PartyAnimal("Jim") 16 | j.party() 17 | s.party() 18 | 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/party5.py: -------------------------------------------------------------------------------- 1 | class PartyAnimal: 2 | x = 0 3 | name = "" 4 | def __init__(self, nam): 5 | self.name = nam 6 | print self.name,"constructed" 7 | 8 | def party(self) : 9 | self.x = self.x + 1 10 | print self.name,"party count",self.x 11 | 12 | class FootballFan(PartyAnimal): 13 | points = 0 14 | def touchdown(self): 15 | self.points = self.points + 7 16 | self.party() 17 | print self.name,"points",self.points 18 | 19 | s = PartyAnimal("Sally") 20 | s.party() 21 | 22 | j = FootballFan("Jim") 23 | j.party() 24 | j.touchdown() 25 | 26 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pay.py: -------------------------------------------------------------------------------- 1 | inp = raw_input('Enter Hours: ') 2 | hours = float(inp) 3 | inp = raw_input('Enter Rate: ') 4 | rate = float(inp) 5 | pay = hours * rate 6 | print 'Pay:', pay 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pay2.py: -------------------------------------------------------------------------------- 1 | inp = raw_input('Enter Hours: ') 2 | hours = float(inp) 3 | inp = raw_input('Enter Rate: ') 4 | rate = float(inp) 5 | if hours > 40: 6 | pay = hours * rate + (hours - 40) * rate * 0.5 7 | else: 8 | pay = hours * rate 9 | print 'Pay:', pay 10 | -------------------------------------------------------------------------------- /EXAMPLE CODE/pay3.py: -------------------------------------------------------------------------------- 1 | try: 2 | inp = raw_input('Enter Hours: ') 3 
| hours = float(inp) 4 | inp = raw_input('Enter Rate: ') 5 | rate = float(inp) 6 | if hours > 40: 7 | pay = hours * rate + (hours - 40) * rate * 0.5 8 | else: 9 | pay = hours * rate 10 | print 'Pay:', pay 11 | except: 12 | print 'Error, please enter numeric input' 13 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re01.py: -------------------------------------------------------------------------------- 1 | # Search for lines that contain 'From:' anywhere in the line 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | if re.search('From:', line) : 7 | print line 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re02.py: -------------------------------------------------------------------------------- 1 | # Search for lines that start with 'From:' 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | if re.search('^From:', line) : 7 | print line 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re03.py: -------------------------------------------------------------------------------- 1 | # Search for lines that start with F, then any two characters, then 'm:' 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | if re.search('^F..m:', line) : 7 | print line 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re04.py: -------------------------------------------------------------------------------- 1 | # Search for lines that start with From and have an at sign 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | if re.search('^From:.+@', line) : 7 | print line 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re05.py: -------------------------------------------------------------------------------- 1 | import re 2 | s = 'This message from csev@umich.edu to cwen@iupui.edu is about a meeting @2PM' 3 | lst = re.findall('\S+@\S+', s) 4 | print lst 5 | 6 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re06.py: -------------------------------------------------------------------------------- 1 | # Extract anything that looks like an email address from each line 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | x = re.findall('\S+@\S+', line) 7 | if len(x) > 0 : 8 | print x 9 | 10 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re07.py: -------------------------------------------------------------------------------- 1 | # Extract email addresses that start with a letter or digit and end with a letter 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | x = re.findall('[a-zA-Z0-9]\S+@\S+[a-zA-Z]', line) 7 | if len(x) > 0 : 8 | print x 9 | 10 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re08.py: -------------------------------------------------------------------------------- 1 | # Extract the value from header lines that start with X 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | x = re.findall('^X\S*: (\S+)', line) 7 | if not x : continue 8 | print x 9 | 10 |
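A note on the capture-group pattern in re08.py just above: re.findall returns the entire match unless the regular expression contains parentheses, in which case it returns only the parenthesized part. A minimal stand-alone sketch of the difference (the sample line mimics an mbox header; Python 2 print syntax to match the rest of this folder):

import re

line = 'X-DSPAM-Confidence: 0.8475'
# Without a group, findall returns the whole match
print re.findall('^X\S*: \S+', line)    # ['X-DSPAM-Confidence: 0.8475']
# With a group, findall returns only the text inside the parentheses
print re.findall('^X\S*: (\S+)', line)  # ['0.8475']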
-------------------------------------------------------------------------------- /EXAMPLE CODE/re09.py: -------------------------------------------------------------------------------- 1 | # Search for header lines that start with X and have a numeric value 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | if re.search('^X\S*: [0-9.]+', line) : 7 | print line 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re10.py: -------------------------------------------------------------------------------- 1 | import re 2 | hand = open('mbox-short.txt') 3 | for line in hand: 4 | line = line.rstrip() 5 | x = re.findall('^X\S*: ([0-9.]+)', line) 6 | if len(x) > 0 : 7 | print x 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re11.py: -------------------------------------------------------------------------------- 1 | # Extract the revision number from lines that start with Details: 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | x = re.findall('^Details:.*rev=([0-9.]+)', line) 7 | if len(x) > 0: 8 | print x 9 | 10 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re12.py: -------------------------------------------------------------------------------- 1 | # Extract the two-digit hour from lines that start with From 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | x = re.findall('^From .* ([0-9][0-9]):', line) 7 | if len(x) > 0 : print x 8 | 9 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re13.py: -------------------------------------------------------------------------------- 1 | # Extract the domain from Author: lines that have an at sign 2 | import re 3 | hand = open('mbox-short.txt') 4 | for line in hand: 5 | line = line.rstrip() 6 | x = re.findall('Author:.*@(\S+)', line) 7 | if not x : continue 8 | print x 9 | 10 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/re14.py: -------------------------------------------------------------------------------- 1 | # Extract the numbers from 'New Revision:' lines and average them 2 | import re 3 | fname = raw_input('Enter file:') 4 | hand = open(fname) 5 | nums = list() 6 | for line in hand: 7 | line = line.rstrip() 8 | x = re.findall('New Revision: ([0-9]+)', line) 9 | if len(x) == 1 : 10 | val = float(x[0]) 11 | nums.append(val) 12 | print len(nums) 13 | print sum(nums)/len(nums) 14 | 15 |
-------------------------------------------------------------------------------- /EXAMPLE CODE/romeo-full.txt: -------------------------------------------------------------------------------- 1 | Romeo and Juliet 2 | Act 2, Scene 2 3 | 4 | SCENE II. Capulet's orchard. 5 | 6 | Enter ROMEO 7 | 8 | ROMEO 9 | 10 | He jests at scars that never felt a wound. 11 | JULIET appears above at a window 12 | 13 | But, soft! what light through yonder window breaks? 14 | It is the east, and Juliet is the sun. 15 | Arise, fair sun, and kill the envious moon, 16 | Who is already sick and pale with grief, 17 | That thou her maid art far more fair than she: 18 | Be not her maid, since she is envious; 19 | Her vestal livery is but sick and green 20 | And none but fools do wear it; cast it off. 21 | It is my lady, O, it is my love! 22 | O, that she knew she were! 23 | She speaks yet she says nothing: what of that?
24 | Her eye discourses; I will answer it. 25 | I am too bold, 'tis not to me she speaks: 26 | Two of the fairest stars in all the heaven, 27 | Having some business, do entreat her eyes 28 | To twinkle in their spheres till they return. 29 | What if her eyes were there, they in her head? 30 | The brightness of her cheek would shame those stars, 31 | As daylight doth a lamp; her eyes in heaven 32 | Would through the airy region stream so bright 33 | That birds would sing and think it were not night. 34 | See, how she leans her cheek upon her hand! 35 | O, that I were a glove upon that hand, 36 | That I might touch that cheek! 37 | 38 | JULIET 39 | 40 | Ay me! 41 | 42 | ROMEO 43 | 44 | She speaks: 45 | O, speak again, bright angel! for thou art 46 | As glorious to this night, being o'er my head 47 | As is a winged messenger of heaven 48 | Unto the white-upturned wondering eyes 49 | Of mortals that fall back to gaze on him 50 | When he bestrides the lazy-pacing clouds 51 | And sails upon the bosom of the air. 52 | 53 | JULIET 54 | 55 | O Romeo, Romeo! wherefore art thou Romeo? 56 | Deny thy father and refuse thy name; 57 | Or, if thou wilt not, be but sworn my love, 58 | And I'll no longer be a Capulet. 59 | 60 | ROMEO 61 | 62 | [Aside] Shall I hear more, or shall I speak at this? 63 | 64 | JULIET 65 | 66 | 'Tis but thy name that is my enemy; 67 | Thou art thyself, though not a Montague. 68 | What's Montague? it is nor hand, nor foot, 69 | Nor arm, nor face, nor any other part 70 | Belonging to a man. O, be some other name! 71 | What's in a name? that which we call a rose 72 | By any other name would smell as sweet; 73 | So Romeo would, were he not Romeo call'd, 74 | Retain that dear perfection which he owes 75 | Without that title. Romeo, doff thy name, 76 | And for that name which is no part of thee 77 | Take all myself. 78 | 79 | ROMEO 80 | 81 | I take thee at thy word: 82 | Call me but love, and I'll be new baptized; 83 | Henceforth I never will be Romeo. 84 | 85 | JULIET 86 | 87 | What man art thou that thus bescreen'd in night 88 | So stumblest on my counsel? 89 | 90 | ROMEO 91 | 92 | By a name 93 | I know not how to tell thee who I am: 94 | My name, dear saint, is hateful to myself, 95 | Because it is an enemy to thee; 96 | Had I it written, I would tear the word. 97 | 98 | JULIET 99 | 100 | My ears have not yet drunk a hundred words 101 | Of that tongue's utterance, yet I know the sound: 102 | Art thou not Romeo and a Montague? 103 | 104 | ROMEO 105 | 106 | Neither, fair saint, if either thee dislike. 107 | 108 | JULIET 109 | 110 | How camest thou hither, tell me, and wherefore? 111 | The orchard walls are high and hard to climb, 112 | And the place death, considering who thou art, 113 | If any of my kinsmen find thee here. 114 | 115 | ROMEO 116 | 117 | With love's light wings did I o'er-perch these walls; 118 | For stony limits cannot hold love out, 119 | And what love can do that dares love attempt; 120 | Therefore thy kinsmen are no let to me. 121 | 122 | JULIET 123 | 124 | If they do see thee, they will murder thee. 125 | 126 | ROMEO 127 | 128 | Alack, there lies more peril in thine eye 129 | Than twenty of their swords: look thou but sweet, 130 | And I am proof against their enmity. 131 | 132 | JULIET 133 | 134 | I would not for the world they saw thee here. 
135 | 136 | ROMEO 137 | 138 | I have night's cloak to hide me from their sight; 139 | And but thou love me, let them find me here: 140 | My life were better ended by their hate, 141 | Than death prorogued, wanting of thy love. 142 | 143 | JULIET 144 | 145 | By whose direction found'st thou out this place? 146 | 147 | ROMEO 148 | 149 | By love, who first did prompt me to inquire; 150 | He lent me counsel and I lent him eyes. 151 | I am no pilot; yet, wert thou as far 152 | As that vast shore wash'd with the farthest sea, 153 | I would adventure for such merchandise. 154 | 155 | JULIET 156 | 157 | Thou know'st the mask of night is on my face, 158 | Else would a maiden blush bepaint my cheek 159 | For that which thou hast heard me speak to-night 160 | Fain would I dwell on form, fain, fain deny 161 | What I have spoke: but farewell compliment! 162 | Dost thou love me? I know thou wilt say 'Ay,' 163 | And I will take thy word: yet if thou swear'st, 164 | Thou mayst prove false; at lovers' perjuries 165 | Then say, Jove laughs. O gentle Romeo, 166 | If thou dost love, pronounce it faithfully: 167 | Or if thou think'st I am too quickly won, 168 | I'll frown and be perverse an say thee nay, 169 | So thou wilt woo; but else, not for the world. 170 | In truth, fair Montague, I am too fond, 171 | And therefore thou mayst think my 'havior light: 172 | But trust me, gentleman, I'll prove more true 173 | Than those that have more cunning to be strange. 174 | I should have been more strange, I must confess, 175 | But that thou overheard'st, ere I was ware, 176 | My true love's passion: therefore pardon me, 177 | And not impute this yielding to light love, 178 | Which the dark night hath so discovered. 179 | 180 | ROMEO 181 | 182 | Lady, by yonder blessed moon I swear 183 | That tips with silver all these fruit-tree tops-- 184 | 185 | JULIET 186 | 187 | O, swear not by the moon, the inconstant moon, 188 | That monthly changes in her circled orb, 189 | Lest that thy love prove likewise variable. 190 | 191 | ROMEO 192 | 193 | What shall I swear by? 194 | 195 | JULIET 196 | 197 | Do not swear at all; 198 | Or, if thou wilt, swear by thy gracious self, 199 | Which is the god of my idolatry, 200 | And I'll believe thee. 201 | 202 | ROMEO 203 | 204 | If my heart's dear love-- 205 | 206 | JULIET 207 | 208 | Well, do not swear: although I joy in thee, 209 | I have no joy of this contract to-night: 210 | It is too rash, too unadvised, too sudden; 211 | Too like the lightning, which doth cease to be 212 | Ere one can say 'It lightens.' Sweet, good night! 213 | This bud of love, by summer's ripening breath, 214 | May prove a beauteous flower when next we meet. 215 | Good night, good night! as sweet repose and rest 216 | Come to thy heart as that within my breast! 217 | 218 | ROMEO 219 | 220 | O, wilt thou leave me so unsatisfied? 221 | 222 | JULIET 223 | 224 | What satisfaction canst thou have to-night? 225 | 226 | ROMEO 227 | 228 | The exchange of thy love's faithful vow for mine. 229 | 230 | JULIET 231 | 232 | I gave thee mine before thou didst request it: 233 | And yet I would it were to give again. 234 | 235 | ROMEO 236 | 237 | Wouldst thou withdraw it? for what purpose, love? 238 | 239 | JULIET 240 | 241 | But to be frank, and give it thee again. 242 | And yet I wish but for the thing I have: 243 | My bounty is as boundless as the sea, 244 | My love as deep; the more I give to thee, 245 | The more I have, for both are infinite. 
246 | 247 | Nurse calls within 248 | 249 | I hear some noise within; dear love, adieu! 250 | Anon, good nurse! Sweet Montague, be true. 251 | Stay but a little, I will come again. 252 | Exit, above 253 | 254 | ROMEO 255 | 256 | O blessed, blessed night! I am afeard. 257 | Being in night, all this is but a dream, 258 | Too flattering-sweet to be substantial. 259 | 260 | Re-enter JULIET, above 261 | 262 | JULIET 263 | 264 | Three words, dear Romeo, and good night indeed. 265 | If that thy bent of love be honourable, 266 | Thy purpose marriage, send me word to-morrow, 267 | By one that I'll procure to come to thee, 268 | Where and what time thou wilt perform the rite; 269 | And all my fortunes at thy foot I'll lay 270 | And follow thee my lord throughout the world. 271 | 272 | Nurse 273 | 274 | [Within] Madam! 275 | 276 | JULIET 277 | 278 | I come, anon.--But if thou mean'st not well, 279 | I do beseech thee-- 280 | 281 | Nurse 282 | [Within] Madam! 283 | 284 | JULIET 285 | 286 | By and by, I come:-- 287 | To cease thy suit, and leave me to my grief: 288 | To-morrow will I send. 289 | 290 | ROMEO 291 | 292 | So thrive my soul-- 293 | 294 | JULIET 295 | 296 | A thousand times good night! 297 | Exit, above 298 | 299 | ROMEO 300 | 301 | A thousand times the worse, to want thy light. 302 | Love goes toward love, as schoolboys from 303 | their books, 304 | But love from love, toward school with heavy looks. 305 | Retiring 306 | 307 | Re-enter JULIET, above 308 | 309 | JULIET 310 | 311 | Hist! Romeo, hist! O, for a falconer's voice, 312 | To lure this tassel-gentle back again! 313 | Bondage is hoarse, and may not speak aloud; 314 | Else would I tear the cave where Echo lies, 315 | And make her airy tongue more hoarse than mine, 316 | With repetition of my Romeo's name. 317 | 318 | ROMEO 319 | 320 | It is my soul that calls upon my name: 321 | How silver-sweet sound lovers' tongues by night, 322 | Like softest music to attending ears! 323 | 324 | JULIET 325 | 326 | Romeo! 327 | 328 | ROMEO 329 | 330 | My dear? 331 | 332 | JULIET 333 | 334 | At what o'clock to-morrow 335 | Shall I send to thee? 336 | 337 | ROMEO 338 | 339 | At the hour of nine. 340 | 341 | JULIET 342 | 343 | I will not fail: 'tis twenty years till then. 344 | I have forgot why I did call thee back. 345 | 346 | ROMEO 347 | 348 | Let me stand here till thou remember it. 349 | 350 | JULIET 351 | 352 | I shall forget, to have thee still stand there, 353 | Remembering how I love thy company. 354 | 355 | ROMEO 356 | 357 | And I'll still stay, to have thee still forget, 358 | Forgetting any other home but this. 359 | 360 | JULIET 361 | 362 | 'Tis almost morning; I would have thee gone: 363 | And yet no further than a wanton's bird; 364 | Who lets it hop a little from her hand, 365 | Like a poor prisoner in his twisted gyves, 366 | And with a silk thread plucks it back again, 367 | So loving-jealous of his liberty. 368 | 369 | ROMEO 370 | 371 | I would I were thy bird. 372 | 373 | JULIET 374 | 375 | Sweet, so would I: 376 | Yet I should kill thee with much cherishing. 377 | Good night, good night! parting is such 378 | sweet sorrow, 379 | That I shall say good night till it be morrow. 380 | 381 | Exit above 382 | 383 | ROMEO 384 | 385 | Sleep dwell upon thine eyes, peace in thy breast! 386 | Would I were sleep and peace, so sweet to rest! 387 | Hence will I to my ghostly father's cell, 388 | His help to crave, and my dear hap to tell. 
389 | 390 | Exit 391 | -------------------------------------------------------------------------------- /EXAMPLE CODE/romeo.txt: -------------------------------------------------------------------------------- 1 | But soft what light through yonder window breaks 2 | It is the east and Juliet is the sun 3 | Arise fair sun and kill the envious moon 4 | Who is already sick and pale with grief 5 | -------------------------------------------------------------------------------- /EXAMPLE CODE/roster.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sqlite3 3 | 4 | conn = sqlite3.connect('rosterdb.sqlite') 5 | cur = conn.cursor() 6 | 7 | # Do some setup 8 | cur.executescript(''' 9 | DROP TABLE IF EXISTS User; 10 | DROP TABLE IF EXISTS Member; 11 | DROP TABLE IF EXISTS Course; 12 | 13 | CREATE TABLE User ( 14 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 15 | name TEXT UNIQUE 16 | ); 17 | 18 | CREATE TABLE Course ( 19 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 20 | title TEXT UNIQUE 21 | ); 22 | 23 | CREATE TABLE Member ( 24 | user_id INTEGER, 25 | course_id INTEGER, 26 | role INTEGER, 27 | PRIMARY KEY (user_id, course_id) 28 | ) 29 | ''') 30 | 31 | fname = raw_input('Enter file name: ') 32 | if ( len(fname) < 1 ) : fname = 'roster_data.json' 33 | 34 | # [ 35 | # [ "Charley", "si110", 1 ], 36 | # [ "Mea", "si110", 0 ], 37 | 38 | str_data = open(fname).read() 39 | json_data = json.loads(str_data) 40 | 41 | for entry in json_data: 42 | 43 | name = entry[0]; 44 | title = entry[1]; 45 | 46 | print name, title 47 | 48 | cur.execute('''INSERT OR IGNORE INTO User (name) 49 | VALUES ( ? )''', ( name, ) ) 50 | cur.execute('SELECT id FROM User WHERE name = ? ', (name, )) 51 | user_id = cur.fetchone()[0] 52 | 53 | cur.execute('''INSERT OR IGNORE INTO Course (title) 54 | VALUES ( ? )''', ( title, ) ) 55 | cur.execute('SELECT id FROM Course WHERE title = ? ', (title, )) 56 | course_id = cur.fetchone()[0] 57 | 58 | cur.execute('''INSERT OR REPLACE INTO Member 59 | (user_id, course_id) VALUES ( ?, ? 
)''', 60 | ( user_id, course_id ) ) 61 | 62 | conn.commit() 63 | -------------------------------------------------------------------------------- /EXAMPLE CODE/roster.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/roster.zip -------------------------------------------------------------------------------- /EXAMPLE CODE/roster/roster.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sqlite3 3 | 4 | conn = sqlite3.connect('rosterdb.sqlite') 5 | cur = conn.cursor() 6 | 7 | # Do some setup 8 | cur.executescript(''' 9 | DROP TABLE IF EXISTS User; 10 | DROP TABLE IF EXISTS Member; 11 | DROP TABLE IF EXISTS Course; 12 | 13 | CREATE TABLE User ( 14 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 15 | name TEXT UNIQUE 16 | ); 17 | 18 | CREATE TABLE Course ( 19 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 20 | title TEXT UNIQUE 21 | ); 22 | 23 | CREATE TABLE Member ( 24 | user_id INTEGER, 25 | course_id INTEGER, 26 | role INTEGER, 27 | PRIMARY KEY (user_id, course_id) 28 | ) 29 | ''') 30 | 31 | fname = raw_input('Enter file name: ') 32 | if ( len(fname) < 1 ) : fname = 'roster_data.json' 33 | 34 | # [ 35 | # [ "Charley", "si110", 1 ], 36 | # [ "Mea", "si110", 0 ], 37 | 38 | str_data = open(fname).read() 39 | json_data = json.loads(str_data) 40 | 41 | for entry in json_data: 42 | 43 | name = entry[0]; 44 | title = entry[1]; 45 | 46 | print name, title 47 | 48 | cur.execute('''INSERT OR IGNORE INTO User (name) 49 | VALUES ( ? )''', ( name, ) ) 50 | cur.execute('SELECT id FROM User WHERE name = ? ', (name, )) 51 | user_id = cur.fetchone()[0] 52 | 53 | cur.execute('''INSERT OR IGNORE INTO Course (title) 54 | VALUES ( ? )''', ( title, ) ) 55 | cur.execute('SELECT id FROM Course WHERE title = ? ', (title, )) 56 | course_id = cur.fetchone()[0] 57 | 58 | cur.execute('''INSERT OR REPLACE INTO Member 59 | (user_id, course_id) VALUES ( ?, ? 
)''', 60 | ( user_id, course_id ) ) 61 | 62 | conn.commit() 63 | -------------------------------------------------------------------------------- /EXAMPLE CODE/roster/roster_data.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "Charley", 4 | "si110", 5 | 1 6 | ], 7 | [ 8 | "Mea", 9 | "si110", 10 | 0 11 | ], 12 | [ 13 | "Hattie", 14 | "si110", 15 | 0 16 | ], 17 | [ 18 | "Lyena", 19 | "si110", 20 | 0 21 | ], 22 | [ 23 | "Keziah", 24 | "si110", 25 | 0 26 | ], 27 | [ 28 | "Ellyce", 29 | "si110", 30 | 0 31 | ], 32 | [ 33 | "Thalia", 34 | "si110", 35 | 0 36 | ], 37 | [ 38 | "Meabh", 39 | "si110", 40 | 0 41 | ], 42 | [ 43 | "Aria", 44 | "si110", 45 | 0 46 | ], 47 | [ 48 | "Reena", 49 | "si110", 50 | 0 51 | ], 52 | [ 53 | "Ioannis", 54 | "si110", 55 | 0 56 | ], 57 | [ 58 | "Reily", 59 | "si110", 60 | 0 61 | ], 62 | [ 63 | "Sidharth", 64 | "si110", 65 | 0 66 | ], 67 | [ 68 | "Keiara", 69 | "si110", 70 | 0 71 | ], 72 | [ 73 | "Yann", 74 | "si110", 75 | 0 76 | ], 77 | [ 78 | "Marykate", 79 | "si110", 80 | 0 81 | ], 82 | [ 83 | "Dylan", 84 | "si110", 85 | 0 86 | ], 87 | [ 88 | "Kiran", 89 | "si110", 90 | 0 91 | ], 92 | [ 93 | "Faizaan", 94 | "si110", 95 | 0 96 | ], 97 | [ 98 | "Aneshia", 99 | "si110", 100 | 0 101 | ], 102 | [ 103 | "Kamron", 104 | "si110", 105 | 0 106 | ], 107 | [ 108 | "Allen", 109 | "si110", 110 | 0 111 | ], 112 | [ 113 | "Marshall", 114 | "si110", 115 | 0 116 | ], 117 | [ 118 | "Rosa", 119 | "si106", 120 | 1 121 | ], 122 | [ 123 | "Nora", 124 | "si106", 125 | 0 126 | ], 127 | [ 128 | "Mairin", 129 | "si106", 130 | 0 131 | ], 132 | [ 133 | "Zendel", 134 | "si106", 135 | 0 136 | ], 137 | [ 138 | "Honie", 139 | "si106", 140 | 0 141 | ], 142 | [ 143 | "Betsy", 144 | "si106", 145 | 0 146 | ], 147 | [ 148 | "Davie", 149 | "si106", 150 | 0 151 | ], 152 | [ 153 | "Larissa", 154 | "si106", 155 | 0 156 | ], 157 | [ 158 | "Shaurya", 159 | "si106", 160 | 0 161 | ], 162 | [ 163 | "Shania", 164 | "si106", 165 | 0 166 | ], 167 | [ 168 | "Sorcha", 169 | "si106", 170 | 0 171 | ], 172 | [ 173 | "Jeanna", 174 | "si106", 175 | 0 176 | ], 177 | [ 178 | "Temba", 179 | "si106", 180 | 0 181 | ], 182 | [ 183 | "Buse", 184 | "si106", 185 | 0 186 | ], 187 | [ 188 | "Mohammed", 189 | "si106", 190 | 0 191 | ], 192 | [ 193 | "Kayah", 194 | "si106", 195 | 0 196 | ], 197 | [ 198 | "Kareena", 199 | "si106", 200 | 0 201 | ], 202 | [ 203 | "Dineo", 204 | "si106", 205 | 0 206 | ], 207 | [ 208 | "Philippa", 209 | "si106", 210 | 0 211 | ], 212 | [ 213 | "Lia", 214 | "si206", 215 | 1 216 | ], 217 | [ 218 | "Sharlyn", 219 | "si206", 220 | 0 221 | ], 222 | [ 223 | "Linton", 224 | "si206", 225 | 0 226 | ], 227 | [ 228 | "Temilade", 229 | "si206", 230 | 0 231 | ], 232 | [ 233 | "Areez", 234 | "si206", 235 | 0 236 | ], 237 | [ 238 | "MacCartney", 239 | "si206", 240 | 0 241 | ], 242 | [ 243 | "Abubakar", 244 | "si206", 245 | 0 246 | ], 247 | [ 248 | "Derryn", 249 | "si206", 250 | 0 251 | ], 252 | [ 253 | "Elan", 254 | "si206", 255 | 0 256 | ], 257 | [ 258 | "Vikki", 259 | "si206", 260 | 0 261 | ], 262 | [ 263 | "Anisa", 264 | "si206", 265 | 0 266 | ], 267 | [ 268 | "Klevis", 269 | "si206", 270 | 0 271 | ], 272 | [ 273 | "Tait", 274 | "si206", 275 | 0 276 | ], 277 | [ 278 | "Rhea", 279 | "si206", 280 | 0 281 | ], 282 | [ 283 | "Pearsen", 284 | "si206", 285 | 0 286 | ], 287 | [ 288 | "Willow", 289 | "si206", 290 | 0 291 | ], 292 | [ 293 | "Skye", 294 | "si206", 295 | 0 296 | ], 297 | [ 298 | "Caralee", 299 | "si206", 300 | 0 301 | ], 302 | [ 303 | "Charlee", 304 | "si206", 
305 | 0 306 | ], 307 | [ 308 | "Karyn", 309 | "si206", 310 | 0 311 | ], 312 | [ 313 | "Elana", 314 | "si206", 315 | 0 316 | ], 317 | [ 318 | "Maggie", 319 | "si206", 320 | 0 321 | ], 322 | [ 323 | "Eryk", 324 | "si206", 325 | 0 326 | ], 327 | [ 328 | "Zulaikha", 329 | "si301", 330 | 1 331 | ], 332 | [ 333 | "Elshan", 334 | "si301", 335 | 0 336 | ], 337 | [ 338 | "Anastasia", 339 | "si301", 340 | 0 341 | ], 342 | [ 343 | "Connar", 344 | "si301", 345 | 0 346 | ], 347 | [ 348 | "Anay", 349 | "si301", 350 | 0 351 | ], 352 | [ 353 | "Jayla", 354 | "si301", 355 | 0 356 | ], 357 | [ 358 | "Cai", 359 | "si301", 360 | 0 361 | ], 362 | [ 363 | "Zijie", 364 | "si301", 365 | 0 366 | ], 367 | [ 368 | "Riana", 369 | "si301", 370 | 0 371 | ], 372 | [ 373 | "Codie", 374 | "si301", 375 | 0 376 | ], 377 | [ 378 | "Colette", 379 | "si301", 380 | 0 381 | ], 382 | [ 383 | "Lucee", 384 | "si301", 385 | 0 386 | ], 387 | [ 388 | "Tatiana", 389 | "si301", 390 | 0 391 | ], 392 | [ 393 | "Zhong", 394 | "si301", 395 | 0 396 | ], 397 | [ 398 | "Lowri", 399 | "si301", 400 | 0 401 | ], 402 | [ 403 | "Maggy", 404 | "si301", 405 | 0 406 | ], 407 | [ 408 | "Basher", 409 | "si301", 410 | 0 411 | ], 412 | [ 413 | "Tanika", 414 | "si301", 415 | 0 416 | ], 417 | [ 418 | "Aria", 419 | "si301", 420 | 0 421 | ], 422 | [ 423 | "Belle", 424 | "si301", 425 | 0 426 | ], 427 | [ 428 | "Laranya", 429 | "si301", 430 | 0 431 | ], 432 | [ 433 | "Dayna", 434 | "si301", 435 | 0 436 | ], 437 | [ 438 | "Elleanne", 439 | "si301", 440 | 0 441 | ], 442 | [ 443 | "Maanav", 444 | "si310", 445 | 1 446 | ], 447 | [ 448 | "Tamta", 449 | "si310", 450 | 0 451 | ], 452 | [ 453 | "Frazer", 454 | "si310", 455 | 0 456 | ], 457 | [ 458 | "Sacha", 459 | "si310", 460 | 0 461 | ], 462 | [ 463 | "Aidan", 464 | "si310", 465 | 0 466 | ], 467 | [ 468 | "Abel", 469 | "si310", 470 | 0 471 | ], 472 | [ 473 | "Ahtasham", 474 | "si310", 475 | 0 476 | ], 477 | [ 478 | "Avinash", 479 | "si310", 480 | 0 481 | ], 482 | [ 483 | "Colette", 484 | "si310", 485 | 0 486 | ], 487 | [ 488 | "Cohen", 489 | "si310", 490 | 0 491 | ], 492 | [ 493 | "Rori", 494 | "si310", 495 | 0 496 | ], 497 | [ 498 | "Youer", 499 | "si310", 500 | 0 501 | ], 502 | [ 503 | "Jamey", 504 | "si310", 505 | 0 506 | ], 507 | [ 508 | "Makenzie", 509 | "si310", 510 | 0 511 | ], 512 | [ 513 | "Ida", 514 | "si310", 515 | 0 516 | ], 517 | [ 518 | "Alexzander", 519 | "si310", 520 | 0 521 | ], 522 | [ 523 | "Kavita", 524 | "si310", 525 | 0 526 | ], 527 | [ 528 | "Talia", 529 | "si310", 530 | 0 531 | ], 532 | [ 533 | "Anthony", 534 | "si310", 535 | 0 536 | ], 537 | [ 538 | "Elona", 539 | "si334", 540 | 1 541 | ], 542 | [ 543 | "Inan", 544 | "si334", 545 | 0 546 | ], 547 | [ 548 | "Caoilainn", 549 | "si334", 550 | 0 551 | ], 552 | [ 553 | "Ainsley", 554 | "si334", 555 | 0 556 | ], 557 | [ 558 | "Franciszek", 559 | "si334", 560 | 0 561 | ], 562 | [ 563 | "Corrie", 564 | "si334", 565 | 0 566 | ], 567 | [ 568 | "Nolan", 569 | "si334", 570 | 0 571 | ], 572 | [ 573 | "Makala", 574 | "si334", 575 | 0 576 | ], 577 | [ 578 | "Obieluem", 579 | "si334", 580 | 0 581 | ], 582 | [ 583 | "Camryn", 584 | "si334", 585 | 0 586 | ], 587 | [ 588 | "Honie", 589 | "si334", 590 | 0 591 | ], 592 | [ 593 | "Ole", 594 | "si334", 595 | 0 596 | ], 597 | [ 598 | "Raine", 599 | "si334", 600 | 0 601 | ], 602 | [ 603 | "Tyllor", 604 | "si334", 605 | 0 606 | ], 607 | [ 608 | "Diane", 609 | "si334", 610 | 0 611 | ], 612 | [ 613 | "Cullen", 614 | "si334", 615 | 0 616 | ], 617 | [ 618 | "Taylor", 619 | "si334", 620 | 0 621 | ], 622 | [ 623 | 
"Schekina", 624 | "si334", 625 | 0 626 | ], 627 | [ 628 | "Kensey", 629 | "si334", 630 | 0 631 | ], 632 | [ 633 | "Zhi", 634 | "si334", 635 | 0 636 | ], 637 | [ 638 | "Kiran", 639 | "si334", 640 | 0 641 | ], 642 | [ 643 | "Tymoteusz", 644 | "si334", 645 | 0 646 | ], 647 | [ 648 | "Windsor", 649 | "si363", 650 | 1 651 | ], 652 | [ 653 | "Kashish", 654 | "si363", 655 | 0 656 | ], 657 | [ 658 | "Diarmid", 659 | "si363", 660 | 0 661 | ], 662 | [ 663 | "Laura", 664 | "si363", 665 | 0 666 | ], 667 | [ 668 | "Jaskaran", 669 | "si363", 670 | 0 671 | ], 672 | [ 673 | "Presley", 674 | "si363", 675 | 0 676 | ], 677 | [ 678 | "Brooklynn", 679 | "si363", 680 | 0 681 | ], 682 | [ 683 | "Heddle", 684 | "si363", 685 | 0 686 | ], 687 | [ 688 | "Travis", 689 | "si363", 690 | 0 691 | ], 692 | [ 693 | "Alx", 694 | "si363", 695 | 0 696 | ], 697 | [ 698 | "Airen", 699 | "si363", 700 | 0 701 | ], 702 | [ 703 | "Erika", 704 | "si363", 705 | 0 706 | ], 707 | [ 708 | "Mackie", 709 | "si363", 710 | 0 711 | ], 712 | [ 713 | "Wen", 714 | "si363", 715 | 0 716 | ], 717 | [ 718 | "Seaan", 719 | "si363", 720 | 0 721 | ], 722 | [ 723 | "Meghan", 724 | "si363", 725 | 0 726 | ], 727 | [ 728 | "Ryaan", 729 | "si363", 730 | 0 731 | ], 732 | [ 733 | "Imogem", 734 | "si364", 735 | 1 736 | ], 737 | [ 738 | "Harlie", 739 | "si364", 740 | 0 741 | ], 742 | [ 743 | "Ronnie", 744 | "si364", 745 | 0 746 | ], 747 | [ 748 | "Lucca", 749 | "si364", 750 | 0 751 | ], 752 | [ 753 | "Shanelle", 754 | "si364", 755 | 0 756 | ], 757 | [ 758 | "Ieuan", 759 | "si364", 760 | 0 761 | ], 762 | [ 763 | "Anneliese", 764 | "si364", 765 | 0 766 | ], 767 | [ 768 | "Simon", 769 | "si364", 770 | 0 771 | ], 772 | [ 773 | "Sorche", 774 | "si364", 775 | 0 776 | ], 777 | [ 778 | "Nawal", 779 | "si364", 780 | 0 781 | ], 782 | [ 783 | "Adelaide", 784 | "si364", 785 | 0 786 | ], 787 | [ 788 | "Rhia", 789 | "si364", 790 | 0 791 | ], 792 | [ 793 | "Katarzyna", 794 | "si364", 795 | 0 796 | ], 797 | [ 798 | "LLeyton", 799 | "si364", 800 | 0 801 | ], 802 | [ 803 | "Enzo", 804 | "si364", 805 | 0 806 | ], 807 | [ 808 | "Declan", 809 | "si364", 810 | 0 811 | ], 812 | [ 813 | "Emelie", 814 | "si364", 815 | 0 816 | ], 817 | [ 818 | "Baillie", 819 | "si364", 820 | 0 821 | ], 822 | [ 823 | "Shola", 824 | "si364", 825 | 0 826 | ], 827 | [ 828 | "Jenna", 829 | "si422", 830 | 1 831 | ], 832 | [ 833 | "Miles", 834 | "si422", 835 | 0 836 | ], 837 | [ 838 | "Sakina", 839 | "si422", 840 | 0 841 | ], 842 | [ 843 | "Melanie", 844 | "si422", 845 | 0 846 | ], 847 | [ 848 | "Bailie", 849 | "si422", 850 | 0 851 | ], 852 | [ 853 | "Cassy", 854 | "si422", 855 | 0 856 | ], 857 | [ 858 | "Nikash", 859 | "si422", 860 | 0 861 | ], 862 | [ 863 | "Hebe", 864 | "si422", 865 | 0 866 | ], 867 | [ 868 | "Sia", 869 | "si422", 870 | 0 871 | ], 872 | [ 873 | "Skyla", 874 | "si422", 875 | 0 876 | ], 877 | [ 878 | "Jamaal", 879 | "si422", 880 | 0 881 | ], 882 | [ 883 | "Keanna", 884 | "si422", 885 | 0 886 | ], 887 | [ 888 | "Vanya", 889 | "si422", 890 | 0 891 | ], 892 | [ 893 | "Temperance", 894 | "si422", 895 | 0 896 | ], 897 | [ 898 | "Hafiza", 899 | "si422", 900 | 0 901 | ], 902 | [ 903 | "Alx", 904 | "si422", 905 | 0 906 | ], 907 | [ 908 | "Brigitte", 909 | "si422", 910 | 0 911 | ], 912 | [ 913 | "Eliana", 914 | "si422", 915 | 0 916 | ], 917 | [ 918 | "Kayden", 919 | "si422", 920 | 0 921 | ], 922 | [ 923 | "Man", 924 | "si422", 925 | 0 926 | ], 927 | [ 928 | "Jaydyn", 929 | "si422", 930 | 0 931 | ], 932 | [ 933 | "Soukina", 934 | "si430", 935 | 1 936 | ], 937 | [ 938 | "Stephenjunior", 939 | "si430", 
940 | 0 941 | ], 942 | [ 943 | "Buddy", 944 | "si430", 945 | 0 946 | ], 947 | [ 948 | "Holly", 949 | "si430", 950 | 0 951 | ], 952 | [ 953 | "Kamilia", 954 | "si430", 955 | 0 956 | ], 957 | [ 958 | "Cassie", 959 | "si430", 960 | 0 961 | ], 962 | [ 963 | "Kris", 964 | "si430", 965 | 0 966 | ], 967 | [ 968 | "Maia", 969 | "si430", 970 | 0 971 | ], 972 | [ 973 | "Abel", 974 | "si430", 975 | 0 976 | ], 977 | [ 978 | "Tamika", 979 | "si430", 980 | 0 981 | ], 982 | [ 983 | "Deano", 984 | "si430", 985 | 0 986 | ], 987 | [ 988 | "Rosa", 989 | "si430", 990 | 0 991 | ], 992 | [ 993 | "Georgia", 994 | "si430", 995 | 0 996 | ], 997 | [ 998 | "Louie", 999 | "si430", 1000 | 0 1001 | ], 1002 | [ 1003 | "Kassie", 1004 | "si430", 1005 | 0 1006 | ], 1007 | [ 1008 | "Mutinta", 1009 | "si430", 1010 | 0 1011 | ], 1012 | [ 1013 | "Manwen", 1014 | "si430", 1015 | 0 1016 | ] 1017 | ] -------------------------------------------------------------------------------- /EXAMPLE CODE/search1.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | count = 0 3 | for line in fhand: 4 | if line.startswith('From:') : 5 | print line 6 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search10.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | for line in fhand: 3 | words = line.split() 4 | # print 'Debug:', words 5 | if len(words) == 0 : continue 6 | if words[0] != 'From' : continue 7 | print words[2] 8 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search2.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | for line in fhand: 3 | line = line.rstrip() 4 | if line.startswith('From:') : 5 | print line 6 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search3.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | for line in fhand: 3 | line = line.rstrip() 4 | # Skip 'uninteresting lines' 5 | if not line.startswith('From:') : 6 | continue 7 | # Process our 'interesting' line 8 | print line 9 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search4.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | for line in fhand: 3 | line = line.rstrip() 4 | if line.find('@uct.ac.za') == -1 : continue 5 | print line 6 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search5.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | for line in fhand: 3 | line = line.rstrip() 4 | if not line.startswith('From ') : continue 5 | words = line.split() 6 | print words[2] 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search6.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter the file name: ') 2 | fhand = open(fname) 3 | count = 0 4 | for line in fhand: 5 | if line.startswith('Subject:') : 6 | count = count + 1 7 | print 'There were', count, 'subject lines in', fname 8 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search7.py: 
-------------------------------------------------------------------------------- 1 | fname = raw_input('Enter the file name: ') 2 | try: 3 | fhand = open(fname) 4 | except: 5 | print 'File cannot be opened:', fname 6 | exit() 7 | count = 0 8 | for line in fhand: 9 | if line.startswith('Subject:') : 10 | count = count + 1 11 | print 'There were', count, 'subject lines in', fname 12 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search8.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | count = 0 3 | for line in fhand: 4 | words = line.split() 5 | if words[0] != 'From' : continue 6 | print words[2] 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/search9.py: -------------------------------------------------------------------------------- 1 | fhand = open('mbox-short.txt') 2 | count = 0 3 | for line in fhand: 4 | words = line.split() 5 | print 'Debug:', words 6 | if words[0] != 'From' : continue 7 | print words[2] 8 | -------------------------------------------------------------------------------- /EXAMPLE CODE/sequence.py: -------------------------------------------------------------------------------- 1 | inp = raw_input('Enter a Number:') 2 | n = int(inp) 3 | while n != 1: 4 | print n, # Use comma to suppress newline 5 | if n%2 == 0: # n is even 6 | n = n/2 7 | else: # n is odd 8 | n = n*3+1 9 | -------------------------------------------------------------------------------- /EXAMPLE CODE/socket1.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 4 | mysock.connect(('www.py4inf.com', 80)) 5 | mysock.send('GET http://www.py4inf.com/code/romeo.txt HTTP/1.0\n\n') 6 | 7 | while True: 8 | data = mysock.recv(512) 9 | if ( len(data) < 1 ) : 10 | break 11 | print data; 12 | 13 | mysock.close() 14 | -------------------------------------------------------------------------------- /EXAMPLE CODE/socket2.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | url = raw_input('Enter: ') 4 | words = url.split('/') 5 | host = words[2] 6 | 7 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 8 | mysock.connect((host, 80)) 9 | mysock.send('GET '+url+' HTTP/1.0\n\n') 10 | 11 | while True: 12 | data = mysock.recv(512) 13 | if ( len(data) < 1 ) : 14 | break 15 | print data, 16 | 17 | mysock.close() 18 | 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/soft.py: -------------------------------------------------------------------------------- 1 | txt = 'but soft what light in yonder window breaks' 2 | words = txt.split() 3 | t = list() 4 | for word in words: 5 | t.append((len(word), word)) 6 | 7 | t.sort(reverse=True) 8 | 9 | res = list() 10 | for length, word in t: 11 | res.append(word) 12 | 13 | print res 14 | -------------------------------------------------------------------------------- /EXAMPLE CODE/spamave.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter the file name: ') 2 | try: 3 | fhand = open(fname) 4 | except: 5 | print 'File cannot be opened:', fname 6 | exit() 7 | count = 0 8 | total = 0 9 | for line in fhand: 10 | words = line.split() 11 | if len(words) != 2 : continue 12 | if words[0] != 'X-DSPAM-Confidence:' : continue 13 | try: 14 
| conf = float(words[1]) 15 | except: 16 | continue 17 | count = count + 1 18 | total = total + conf 19 | average = total / count 20 | print 'Average spam confidence:', average 21 | -------------------------------------------------------------------------------- /EXAMPLE CODE/tracks.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/tracks.zip -------------------------------------------------------------------------------- /EXAMPLE CODE/tracks/README.txt: -------------------------------------------------------------------------------- 1 | TBD 2 | 3 | -------------------------------------------------------------------------------- /EXAMPLE CODE/tracks/tracks.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import sqlite3 3 | 4 | conn = sqlite3.connect('trackdb.sqlite') 5 | cur = conn.cursor() 6 | 7 | # Make some fresh tables using executescript() 8 | cur.executescript(''' 9 | DROP TABLE IF EXISTS Artist; 10 | DROP TABLE IF EXISTS Album; 11 | DROP TABLE IF EXISTS Track; 12 | 13 | CREATE TABLE Artist ( 14 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 15 | name TEXT UNIQUE 16 | ); 17 | 18 | CREATE TABLE Album ( 19 | id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, 20 | artist_id INTEGER, 21 | title TEXT UNIQUE 22 | ); 23 | 24 | CREATE TABLE Track ( 25 | id INTEGER NOT NULL PRIMARY KEY 26 | AUTOINCREMENT UNIQUE, 27 | title TEXT UNIQUE, 28 | album_id INTEGER, 29 | len INTEGER, rating INTEGER, count INTEGER 30 | ); 31 | ''') 32 | 33 | 34 | fname = raw_input('Enter file name: ') 35 | if ( len(fname) < 1 ) : fname = 'Library.xml' 36 | 37 | # Track ID369 38 | # NameAnother One Bites The Dust 39 | # ArtistQueen 40 | def lookup(d, key): 41 | found = False 42 | for child in d: 43 | if found : return child.text 44 | if child.tag == 'key' and child.text == key : 45 | found = True 46 | return None 47 | 48 | stuff = ET.parse(fname) 49 | all = stuff.findall('dict/dict/dict') 50 | print 'Dict count:', len(all) 51 | for entry in all: 52 | if ( lookup(entry, 'Track ID') is None ) : continue 53 | 54 | name = lookup(entry, 'Name') 55 | artist = lookup(entry, 'Artist') 56 | album = lookup(entry, 'Album') 57 | count = lookup(entry, 'Play Count') 58 | rating = lookup(entry, 'Rating') 59 | length = lookup(entry, 'Total Time') 60 | 61 | if name is None or artist is None or album is None : 62 | continue 63 | 64 | print name, artist, album, count, rating, length 65 | 66 | cur.execute('''INSERT OR IGNORE INTO Artist (name) 67 | VALUES ( ? )''', ( artist, ) ) 68 | cur.execute('SELECT id FROM Artist WHERE name = ? ', (artist, )) 69 | artist_id = cur.fetchone()[0] 70 | 71 | cur.execute('''INSERT OR IGNORE INTO Album (title, artist_id) 72 | VALUES ( ?, ? )''', ( album, artist_id ) ) 73 | cur.execute('SELECT id FROM Album WHERE title = ? ', (album, )) 74 | album_id = cur.fetchone()[0] 75 | 76 | cur.execute('''INSERT OR REPLACE INTO Track 77 | (title, album_id, len, rating, count) 78 | VALUES ( ?, ?, ?, ?, ? 
)''', 79 | ( name, album_id, length, rating, count ) ) 80 | 81 | conn.commit() 82 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twdump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | cur.execute('SELECT * FROM Twitter') 6 | count = 0 7 | for row in cur : 8 | print row 9 | count = count + 1 10 | print count, 'rows.' 11 | cur.close() 12 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twfriends.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import twurl 3 | import json 4 | import sqlite3 5 | 6 | TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json' 7 | 8 | conn = sqlite3.connect('friends.sqlite') 9 | cur = conn.cursor() 10 | 11 | cur.execute('''CREATE TABLE IF NOT EXISTS People 12 | (id INTEGER PRIMARY KEY, name TEXT UNIQUE, retrieved INTEGER)''') 13 | cur.execute('''CREATE TABLE IF NOT EXISTS Follows 14 | (from_id INTEGER, to_id INTEGER, UNIQUE(from_id, to_id))''') 15 | 16 | while True: 17 | acct = raw_input('Enter a Twitter account, or quit: ') 18 | if ( acct == 'quit' ) : break 19 | if ( len(acct) < 1 ) : 20 | cur.execute('SELECT id, name FROM People WHERE retrieved = 0 LIMIT 1') 21 | try: 22 | (id, acct) = cur.fetchone() 23 | except: 24 | print 'No unretrieved Twitter accounts found' 25 | continue 26 | else: 27 | cur.execute('SELECT id FROM People WHERE name = ? LIMIT 1', 28 | (acct, ) ) 29 | try: 30 | id = cur.fetchone()[0] 31 | except: 32 | cur.execute('INSERT OR IGNORE INTO People (name, retrieved) VALUES ( ?, 0)', 33 | ( acct, ) ) 34 | conn.commit() 35 | if cur.rowcount != 1 : 36 | print 'Error inserting account:',acct 37 | continue 38 | id = cur.lastrowid 39 | 40 | url = twurl.augment(TWITTER_URL, {'screen_name': acct, 'count': '5'} ) 41 | print 'Retrieving account', acct 42 | connection = urllib.urlopen(url) 43 | data = connection.read() 44 | headers = connection.info().dict 45 | print 'Remaining', headers['x-rate-limit-remaining'] 46 | 47 | js = json.loads(data) 48 | # print json.dumps(js, indent=4) 49 | 50 | cur.execute('UPDATE People SET retrieved=1 WHERE name = ?', (acct, ) ) 51 | 52 | countnew = 0 53 | countold = 0 54 | for u in js['users'] : 55 | friend = u['screen_name'] 56 | print friend 57 | cur.execute('SELECT id FROM People WHERE name = ? 
LIMIT 1', 58 | (friend, ) ) 59 | try: 60 | friend_id = cur.fetchone()[0] 61 | countold = countold + 1 62 | except: 63 | cur.execute('''INSERT OR IGNORE INTO People (name, retrieved) 64 | VALUES ( ?, 0)''', ( friend, ) ) 65 | conn.commit() 66 | if cur.rowcount != 1 : 67 | print 'Error inserting account:',friend 68 | continue 69 | friend_id = cur.lastrowid 70 | countnew = countnew + 1 71 | cur.execute('INSERT OR IGNORE INTO Follows (from_id, to_id) VALUES (?, ?)', 72 | (id, friend_id) ) 73 | print 'New accounts=',countnew,' revisited=',countold 74 | conn.commit() 75 | 76 | cur.close() 77 | 78 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twitter1.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import twurl 3 | 4 | TWITTER_URL = 'https://api.twitter.com/1.1/statuses/user_timeline.json' 5 | 6 | while True: 7 | print '' 8 | acct = raw_input('Enter Twitter Account:') 9 | if ( len(acct) < 1 ) : break 10 | url = twurl.augment(TWITTER_URL, 11 | {'screen_name': acct, 'count': '2'} ) 12 | print 'Retrieving', url 13 | connection = urllib.urlopen(url) 14 | data = connection.read() 15 | print data[:250] 16 | headers = connection.info().dict 17 | # print headers 18 | print 'Remaining', headers['x-rate-limit-remaining'] 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twitter2.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import twurl 3 | import json 4 | 5 | TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json' 6 | 7 | while True: 8 | print '' 9 | acct = raw_input('Enter Twitter Account:') 10 | if ( len(acct) < 1 ) : break 11 | url = twurl.augment(TWITTER_URL, 12 | {'screen_name': acct, 'count': '5'} ) 13 | print 'Retrieving', url 14 | connection = urllib.urlopen(url) 15 | data = connection.read() 16 | headers = connection.info().dict 17 | print 'Remaining', headers['x-rate-limit-remaining'] 18 | js = json.loads(data) 19 | print json.dumps(js, indent=4) 20 | 21 | for u in js['users'] : 22 | print u['screen_name'] 23 | s = u['status']['text'] 24 | print ' ',s[:50] 25 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twjoin.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('friends.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('SELECT * FROM People') 7 | count = 0 8 | print 'People:' 9 | for row in cur : 10 | if count < 5: print row 11 | count = count + 1 12 | print count, 'rows.' 13 | 14 | cur.execute('SELECT * FROM Follows') 15 | count = 0 16 | print 'Follows:' 17 | for row in cur : 18 | if count < 5: print row 19 | count = count + 1 20 | print count, 'rows.' 21 | 22 | cur.execute('''SELECT * FROM Follows JOIN People 23 | ON Follows.to_id = People.id WHERE Follows.from_id = 2''') 24 | count = 0 25 | print 'Connections for id=2:' 26 | for row in cur : 27 | if count < 5: print row 28 | count = count + 1 29 | print count, 'rows.' 
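# A minimal aside on the JOIN just above (a sketch, not a query from the
# course files): each Follows row is paired with the People row whose id
# equals Follows.to_id, so every printed row mixes columns from both
# tables. The same join can feed an aggregate instead of a listing, e.g.:
#
#   cur.execute('''SELECT COUNT(*) FROM Follows JOIN People
#       ON Follows.to_id = People.id WHERE Follows.from_id = 2''')
#   print 'Joined rows for id=2:', cur.fetchone()[0]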
30 | 31 | cur.close() 32 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twspider.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import twurl 3 | import json 4 | import sqlite3 5 | 6 | TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json' 7 | 8 | conn = sqlite3.connect('spider.sqlite') 9 | cur = conn.cursor() 10 | 11 | cur.execute(''' 12 | CREATE TABLE IF NOT EXISTS Twitter (name TEXT, retrieved INTEGER, friends INTEGER)''') 13 | 14 | while True: 15 | acct = raw_input('Enter a Twitter account, or quit: ') 16 | if ( acct == 'quit' ) : break 17 | if ( len(acct) < 1 ) : 18 | cur.execute('SELECT name FROM Twitter WHERE retrieved = 0 LIMIT 1') 19 | try: 20 | acct = cur.fetchone()[0] 21 | except: 22 | print 'No unretrieved Twitter accounts found' 23 | continue 24 | 25 | url = twurl.augment(TWITTER_URL, {'screen_name': acct, 'count': '5'} ) 26 | print 'Retrieving', url 27 | connection = urllib.urlopen(url) 28 | data = connection.read() 29 | headers = connection.info().dict 30 | print 'Remaining', headers['x-rate-limit-remaining'] 31 | js = json.loads(data) 32 | # print json.dumps(js, indent=4) 33 | 34 | cur.execute('UPDATE Twitter SET retrieved=1 WHERE name = ?', (acct, ) ) 35 | 36 | countnew = 0 37 | countold = 0 38 | for u in js['users'] : 39 | friend = u['screen_name'] 40 | print friend 41 | cur.execute('SELECT friends FROM Twitter WHERE name = ? LIMIT 1', 42 | (friend, ) ) 43 | try: 44 | count = cur.fetchone()[0] 45 | cur.execute('UPDATE Twitter SET friends = ? WHERE name = ?', 46 | (count+1, friend) ) 47 | countold = countold + 1 48 | except: 49 | cur.execute('''INSERT INTO Twitter (name, retrieved, friends) 50 | VALUES ( ?, 0, 1 )''', ( friend, ) ) 51 | countnew = countnew + 1 52 | print 'New accounts=',countnew,' revisited=',countold 53 | conn.commit() 54 | 55 | cur.close() 56 | 57 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twtest.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | from twurl import augment 3 | 4 | print '* Calling Twitter...' 5 | url = augment('https://api.twitter.com/1.1/statuses/user_timeline.json', 6 | {'screen_name': 'drchuck', 'count': '2'} ) 7 | print url 8 | connection = urllib.urlopen(url) 9 | data = connection.read() 10 | print data 11 | headers = connection.info().dict 12 | print headers 13 | -------------------------------------------------------------------------------- /EXAMPLE CODE/twurl.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import oauth 3 | import hidden 4 | 5 | def augment(url, parameters) : 6 | secrets = hidden.oauth() 7 | consumer = oauth.OAuthConsumer(secrets['consumer_key'], secrets['consumer_secret']) 8 | token = oauth.OAuthToken(secrets['token_key'],secrets['token_secret']) 9 | 10 | oauth_request = oauth.OAuthRequest.from_consumer_and_token(consumer, 11 | token=token, http_method='GET', http_url=url, parameters=parameters) 12 | oauth_request.sign_request(oauth.OAuthSignatureMethod_HMAC_SHA1(), consumer, token) 13 | return oauth_request.to_url() 14 | 15 | 16 | def test_me() : 17 | print '* Calling Twitter...' 
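    # What augment() does for the call below (see the function above): it
    # wraps the bare URL and query parameters in an OAuth 1.0a request,
    # signs it with HMAC-SHA1 using the consumer key/secret and access
    # token loaded from hidden.py, and returns the URL with the oauth_*
    # parameters (nonce, timestamp, signature, ...) appended. Twitter
    # rejects the request unless that signature checks out.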
18 | url = augment('https://api.twitter.com/1.1/statuses/user_timeline.json', 19 | {'screen_name': 'drchuck', 'count': '2'} ) 20 | print url 21 | connection = urllib.urlopen(url) 22 | data = connection.read() 23 | print data 24 | headers = connection.info().dict 25 | print headers 26 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtcheck.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | for (dirname, dirs, files) in os.walk('.'): 4 | for filename in files: 5 | if filename.endswith('.txt') : 6 | thefile = os.path.join(dirname,filename) 7 | size = os.path.getsize(thefile) 8 | if size == 2578 or size == 2565: 9 | continue 10 | fhand = open(thefile,'r') 11 | lines = list() 12 | for line in fhand: 13 | lines.append(line) 14 | fhand.close() 15 | if len(lines) > 1: 16 | print len(lines), thefile 17 | print lines[:4] 18 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtcheck2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | for (dirname, dirs, files) in os.walk('.'): 4 | for filename in files: 5 | if filename.endswith('.txt') : 6 | thefile = os.path.join(dirname,filename) 7 | size = os.path.getsize(thefile) 8 | if size == 2578 or size == 2565: 9 | continue 10 | fhand = open(thefile,'r') 11 | lines = list() 12 | for line in fhand: 13 | lines.append(line) 14 | fhand.close() 15 | if len(lines) == 3 and lines[2].startswith('Sent from my iPhone') : 16 | continue 17 | if len(lines) > 1: 18 | print len(lines), thefile 19 | print lines[:4] 20 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtcheck3.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | for (dirname, dirs, files) in os.walk('.'): 4 | for filename in files: 5 | if filename.endswith('.txt') : 6 | thefile = os.path.join(dirname,filename) 7 | size = os.path.getsize(thefile) 8 | if size == 2578 or size == 2565: 9 | print 'T-Mobile:',thefile 10 | continue 11 | fhand = open(thefile,'r') 12 | lines = list() 13 | for line in fhand: 14 | lines.append(line) 15 | fhand.close() 16 | if len(lines) == 3 and lines[2].startswith('Sent from my iPhone') : 17 | print 'iPhone:', thefile 18 | continue 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtcount.py: -------------------------------------------------------------------------------- 1 | import os 2 | count = 0 3 | for dirname, dirs, files in os.walk('.'): 4 | for filename in files: 5 | if filename.endswith('.txt') : 6 | count = count + 1 7 | 8 | print 'Files:', count 9 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtdelete.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | for (dirname, dirs, files) in os.walk('.'): 4 | for filename in files: 5 | if filename.endswith('.txt') : 6 | thefile = os.path.join(dirname,filename) 7 | size = os.path.getsize(thefile) 8 | if size == 2578 or size == 2565: 9 | print 'T-Mobile:',thefile 10 | os.remove(thefile) 11 | continue 12 | fhand = open(thefile,'r') 13 | lines = list() 14 | for line in fhand: 15 | lines.append(line) 16 | fhand.close() 17 | if len(lines) == 3 and lines[2].startswith('Sent from my 
iPhone') : 18 | print 'iPhone:', thefile 19 | os.remove(thefile) 20 | continue 21 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtmd5.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hashlib 3 | from os.path import join 4 | 5 | hashes = dict() 6 | for (dirname, dirs, files) in os.walk('.'): 7 | for filename in files: 8 | if filename.endswith('.txt') : 9 | thefile = os.path.join(dirname,filename) 10 | fhand = open(thefile,'r') 11 | data = fhand.read() 12 | fhand.close() 13 | hash = hashlib.md5(data).hexdigest() 14 | # print thefile, hash 15 | if hash in hashes: 16 | print hashes[hash], thefile 17 | else: 18 | hashes[hash] = thefile 19 | -------------------------------------------------------------------------------- /EXAMPLE CODE/txtsize.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | for (dirname, dirs, files) in os.walk('.'): 4 | for filename in files: 5 | if filename.endswith('.txt') : 6 | thefile = os.path.join(dirname,filename) 7 | print os.path.getsize(thefile), thefile 8 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urljpeg.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import time 3 | 4 | mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 5 | mysock.connect(('www.py4inf.com', 80)) 6 | mysock.send('GET http://www.py4inf.com/cover.jpg HTTP/1.0\n\n') 7 | 8 | 9 | count = 0 10 | picture = ""; 11 | while True: 12 | data = mysock.recv(5120) 13 | if ( len(data) < 1 ) : break 14 | time.sleep(0.25) 15 | count = count + len(data) 16 | print len(data),count 17 | picture = picture + data 18 | 19 | mysock.close() 20 | 21 | # Look for the end of the header (2 CRLF) 22 | pos = picture.find("\r\n\r\n"); 23 | print 'Header length',pos 24 | print picture[:pos] 25 | 26 | # Skip past the header and save the picture data 27 | picture = picture[pos+4:] 28 | fhand = open("stuff.jpg","wb") 29 | fhand.write(picture); 30 | fhand.close() 31 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urllib1.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | 3 | fhand = urllib.urlopen('http://www.py4inf.com/code/romeo.txt') 4 | for line in fhand: 5 | print line.strip() 6 | 7 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urllib2.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | 3 | fhand = urllib.urlopen('http://www.dr-chuck.com/page1.htm') 4 | for line in fhand: 5 | print line.strip() -------------------------------------------------------------------------------- /EXAMPLE CODE/urllink2.py: -------------------------------------------------------------------------------- 1 | # Note - this code must run in Python 2.x and you must download 2 | # http://www.pythonlearn.com/code/BeautifulSoup.py 3 | # Into the same folder as this program 4 | 5 | import urllib 6 | from BeautifulSoup import * 7 | 8 | url = raw_input('Enter - ') 9 | html = urllib.urlopen(url).read() 10 | 11 | soup = BeautifulSoup(html) 12 | 13 | # Retrieve all of the anchor tags 14 | tags = soup('a') 15 | for tag in tags: 16 | # Look at the parts of a tag 17 | print 'TAG:',tag 18 | print 'URL:',tag.get('href', None) 19 | print 
/EXAMPLE CODE/urllink2.py: -------------------------------------------------------------------------------- 1 | # Note - this code must run in Python 2.x and you must download 2 | # http://www.pythonlearn.com/code/BeautifulSoup.py 3 | # Into the same folder as this program 4 | 5 | import urllib 6 | from BeautifulSoup import * 7 | 8 | url = raw_input('Enter - ') 9 | html = urllib.urlopen(url).read() 10 | 11 | soup = BeautifulSoup(html) 12 | 13 | # Retrieve all of the anchor tags 14 | tags = soup('a') 15 | for tag in tags: 16 | # Look at the parts of a tag 17 | print 'TAG:',tag 18 | print 'URL:',tag.get('href', None) 19 | print 'Contents:',tag.contents[0] 20 | print 'Attrs:',tag.attrs 21 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urllink3.py: -------------------------------------------------------------------------------- 1 | # Note - this code must run in Python 2.x and you must download 2 | # http://www.pythonlearn.com/code/BeautifulSoup.py 3 | # Into the same folder as this program 4 | 5 | import urllib 6 | from BeautifulSoup import * 7 | 8 | todo = list() 9 | visited = list() 10 | url = raw_input('Enter - ') 11 | todo.append(url) 12 | 13 | while len(todo) > 0 : 14 | print "====== Todo list count is ",len(todo) 15 | url = todo.pop() 16 | 17 | if ( not url.startswith('http') ) : 18 | print "Skipping", url 19 | continue 20 | 21 | if ( url.find('facebook') > 0 ) : 22 | continue 23 | 24 | if ( url in visited ) : 25 | print "Visited", url 26 | continue 27 | 28 | print "===== Retrieving ", url 29 | 30 | html = urllib.urlopen(url).read() 31 | soup = BeautifulSoup(html) 32 | visited.append(url) 33 | 34 | # Retrieve all of the anchor tags 35 | tags = soup('a') 36 | for tag in tags: 37 | newurl = tag.get('href', None) 38 | if ( newurl != None ) : 39 | todo.append(newurl) 40 | 41 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urllinks.py: -------------------------------------------------------------------------------- 1 | # Note - this code must run in Python 2.x and you must download 2 | # http://www.pythonlearn.com/code/BeautifulSoup.py 3 | # Into the same folder as this program 4 | 5 | import urllib 6 | from BeautifulSoup import * 7 | 8 | url = raw_input('Enter - ') 9 | html = urllib.urlopen(url).read() 10 | soup = BeautifulSoup(html) 11 | 12 | # Retrieve all of the anchor tags 13 | tags = soup('a') 14 | for tag in tags: 15 | print tag.get('href', None) 16 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urlregex.py: -------------------------------------------------------------------------------- 1 | # Extract the http:// links from a web page using a regular expression 2 | import urllib 3 | import re 4 | 5 | url = raw_input('Enter - ') 6 | html = urllib.urlopen(url).read() 7 | links = re.findall('href="(http://.*?)"', html) 8 | for link in links: 9 | print link 10 | 11 | -------------------------------------------------------------------------------- /EXAMPLE CODE/urlwords.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | 3 | counts = dict() 4 | fhand = urllib.urlopen('http://www.py4inf.com/code/romeo.txt') 5 | for line in fhand: 6 | words = line.split() 7 | for word in words: 8 | counts[word] = counts.get(word,0) + 1 9 | print counts 10 | -------------------------------------------------------------------------------- /EXAMPLE CODE/whathour.py: -------------------------------------------------------------------------------- 1 | fname = raw_input('Enter file name: ') 2 | fhand = open(fname) 3 | c = dict() 4 | for line in fhand: 5 | if not line.startswith('From ') : continue 6 | pieces = line.split() 7 | time = pieces[5] 8 | parts = time.split(':') 9 | hour = parts[0] 10 | c[hour] = c.get(hour,0) + 1 11 | 12 | lst = list() 13 | for key in c: 14 | value = c[key] 15 | lst.append( (value, key) ) 16 | 17 | lst.sort() 18 | 19 | for value, key in lst: 20 | print key, value 21 | 22 | 23 | --------------------------------------------------------------------------------
/EXAMPLE CODE/wikidata.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/EXAMPLE CODE/wikidata.db -------------------------------------------------------------------------------- /EXAMPLE CODE/wikigrade.py: -------------------------------------------------------------------------------- 1 | # Note - this code must run in Python 2.x and you must download 2 | # http://www.pythonlearn.com/code/BeautifulSoup.py 3 | # Into the same folder as this program 4 | 5 | import string 6 | import sqlite3 7 | import urllib 8 | import xml.etree.ElementTree as ET 9 | from BeautifulSoup import * 10 | 11 | conn = sqlite3.connect('wikidata.db') 12 | cur = conn.cursor() 13 | 14 | cur.execute(''' 15 | CREATE TABLE IF NOT EXISTS TinyTable (id INTEGER PRIMARY KEY, 16 | url TEXT, page BLOB, retrieved_at timestamp)''') 17 | 18 | # A slightly extended dictionary 19 | class sash(dict): 20 | def sortvalues(self,reverse=True): 21 | return sorted(self.items(),key=lambda x: (x[1], x[0]), reverse=reverse) 22 | 23 | def tinyTable(url): 24 | global cur,conn 25 | cur.execute('SELECT id,page,retrieved_at FROM TinyTable WHERE URL = ?', (url, )) 26 | try: 27 | row = cur.fetchone() 28 | print 'DATE',row[2] 29 | return row[1] 30 | except: 31 | row = None 32 | print 'Retrieving', url 33 | 34 | data = urllib.urlopen(url).read() 35 | if row != None: 36 | cur.execute("UPDATE TinyTable SET page=?,retrieved_at=datetime('now') WHERE id=?", (unicode(data, 'utf-8'), row[0])) 37 | else: 38 | cur.execute("INSERT INTO TinyTable (url, page, retrieved_at) VALUES (?, ?, datetime('now'))",(url, unicode(data, 'utf-8'))) 39 | conn.commit() 40 | return data 41 | 42 | cururl = 'https://ctools.umich.edu/portal/tool/27500dea-c105-4f7b-a195-3c89536a64b7?pageName=%2Fsite%2Ff57681b8-6db9-46cf-aad1-3a0bdd621138%2Fhome&action=view&panel=Main&realm=%2Fsite%2Ff57681b8-6db9-46cf-aad1-3a0bdd621138' 43 | prefix = 'https://ctools.umich.edu/portal/tool/27500dea-c105-4f7b-a195-3c89536a64b7' 44 | 45 | urls = list() 46 | urls.append(cururl) 47 | visited = list() 48 | editcounts = sash() 49 | postcounts = sash() 50 | 51 | while len(urls) > 0 : 52 | print '=== URLS Yet To Retrieve:',len(urls) 53 | cururl = urls.pop() 54 | if cururl in visited: continue 55 | print 'RETRIEVING',cururl 56 | data = tinyTable(cururl) 57 | visited.append(cururl) 58 | soup = BeautifulSoup(data) 59 | tags = soup('a') 60 | # print 'Tags' 61 | for tag in tags: 62 | print tag 63 | url = tag.get('href',None) 64 | if url == None : continue 65 | # Only follow urls that stay within our prefix 66 | if not url.startswith(prefix) : continue 67 | newurl = urllib.basejoin(cururl,url) 68 | if newurl in visited : continue 69 | # print 'APPENDING',newurl 70 | if newurl.find('action=view') > 0 or newurl.find('action=history') > 0 : 71 | urls.append(newurl) 72 | 73 | print 'EDITS:' 74 | for (key,val) in editcounts.sortvalues(): 75 | print key, val 76 | 77 | for (key,val) in sorted(postcounts.items()): 78 | print key, val 79 | 80 | conn.close() 81 | --------------------------------------------------------------------------------
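Aside: wikigrade.py declares editcounts and postcounts but never fills them, so the sortvalues() method of its sash class is easy to overlook. A minimal, self-contained sketch of what it does, fed hypothetical counts (the data below is made up, not from the course):

# The sash class as defined in wikigrade.py above, exercised with hypothetical data
class sash(dict):
    def sortvalues(self, reverse=True):
        return sorted(self.items(), key=lambda x: (x[1], x[0]), reverse=reverse)

counts = sash()
counts['chuck'] = 3
counts['sally'] = 5
# Items come back ordered by (value, key), highest value first
print counts.sortvalues()    # [('sally', 5), ('chuck', 3)]
--------------------------------------------------------------------------------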
/EXAMPLE CODE/wordlist.py: -------------------------------------------------------------------------------- 1 | name = raw_input('Enter file: ') 2 | handle = open(name, 'r') 3 | wordlist = list() 4 | for line in handle: 5 | words = line.split() 6 | for word in words: 7 | if word in wordlist: continue 8 | wordlist.append(word) 9 | 10 | wordlist.sort() 11 | print wordlist 12 | -------------------------------------------------------------------------------- /EXAMPLE CODE/words.py: -------------------------------------------------------------------------------- 1 | name = raw_input('Enter file:') 2 | handle = open(name, 'r') 3 | text = handle.read() 4 | words = text.split() 5 | counts = dict() 6 | for word in words: 7 | counts[word] = counts.get(word,0) + 1 8 | 9 | bigcount = None 10 | bigword = None 11 | for word,count in counts.items(): 12 | if bigcount == None or count > bigcount: 13 | bigword = word 14 | bigcount = count 15 | 16 | print bigword, bigcount 17 | -------------------------------------------------------------------------------- /EXAMPLE CODE/words.txt: -------------------------------------------------------------------------------- 1 | Writing programs or programming is a very creative 2 | and rewarding activity You can write programs for 3 | many reasons ranging from making your living to solving 4 | a difficult data analysis problem to having fun to helping 5 | someone else solve a problem This book assumes that 6 | {\em everyone} needs to know how to program and that once 7 | you know how to program, you will figure out what you want 8 | to do with your newfound skills 9 | 10 | We are surrounded in our daily lives with computers ranging 11 | from laptops to cell phones We can think of these computers 12 | as our personal assistants who can take care of many things 13 | on our behalf The hardware in our current-day computers 14 | is essentially built to continuously ask us the question 15 | What would you like me to do next 16 | 17 | Our computers are fast and have vast amounts of memory and 18 | could be very helpful to us if we only knew the language to 19 | speak to explain to the computer what we would like it to 20 | do next If we knew this language we could tell the 21 | computer to do tasks on our behalf that were repetitive 22 | Interestingly, the kinds of things computers can do best 23 | are often the kinds of things that we humans find boring 24 | and mind-numbing 25 | -------------------------------------------------------------------------------- /EXAMPLE CODE/xml1.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | data = ''' 4 | <person> 5 | <name>Chuck</name> 6 | <phone type="intl"> 7 | +1 734 303 4456 8 | </phone> 9 | <email hide="yes"/> 10 | </person>''' 11 | 12 | tree = ET.fromstring(data) 13 | print 'Name:',tree.find('name').text 14 | print 'Attr:',tree.find('email').get('hide') 15 | -------------------------------------------------------------------------------- /EXAMPLE CODE/xml2.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | input = ''' 4 | <stuff> 5 | <users> 6 | <user x="2"> 7 | <id>001</id> 8 | <name>Chuck</name> 9 | </user> 10 | <user x="7"> 11 | <id>009</id> 12 | <name>Brent</name> 13 | </user> 14 | </users> 15 | </stuff>''' 16 | 17 | stuff = ET.fromstring(input) 18 | lst = stuff.findall('users/user') 19 | print 'User count:', len(lst) 20 | 21 | for item in lst: 22 | print 'Name', item.find('name').text 23 | print 'Id', item.find('id').text 24 | print 'Attribute', item.get("x") 25 | print "" 26 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This repository contains the resources and materials I've generated myself during the course ["Using Python to Access Web Data"](https://www.coursera.org/learn/python-network-data/), from the University of Michigan, offered on Coursera 3 | 4 | # Content available 5 | Currently, the only content available is the Python files for the programming assignments I've coded for the course. I also plan to make available documents with my notes from the course, but I'm still finishing them 6 | 7 | # Folder structure 8 | The content follows the same structure as the course: there is a folder for each of the weeks the course is divided into, and each folder contains all the material from that week. There are two types of files: those starting with "C" are the code files containing the exercises, while those starting with "A" are other resources used in, or produced by, the assignments. The two numbers that follow indicate the unit and the index of the document within its type and week, respectively. 9 | 10 | *Example: the file starting with C4.2. is the 2nd code file needed for the programming assignment in Unit 4* 11 | -------------------------------------------------------------------------------- /Textbook - Castellano.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Textbook - Castellano.epub -------------------------------------------------------------------------------- /Textbook - English.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Textbook - English.epub -------------------------------------------------------------------------------- /Unit 1 - Introduction/A1.1 - Code screenshot.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Unit 1 - Introduction/A1.1 - Code screenshot.PNG -------------------------------------------------------------------------------- /Unit 1 - Introduction/A1.2. - Script execution.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Unit 1 - Introduction/A1.2. - Script execution.PNG --------------------------------------------------------------------------------
/Unit 1 - Introduction/C1.1 - Firstcode.py: -------------------------------------------------------------------------------- 1 | print("Hello pythonistas!") -------------------------------------------------------------------------------- /Unit 2 - Regular expressions/C2.1 - Programming assignment.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | #Opening the file in which we'll need to find the numbers 4 | sample_file = open('A.2.2 - regex text data.txt') 5 | 6 | #Obtaining strings representing the numbers in that file 7 | text = sample_file.read() #read() returns the entire text at once, not line by line 8 | number_regex = '[0-9]+' 9 | numbers = re.findall(number_regex, text) #Match any run of one or more digits 10 | 11 | #Casting them to integers and getting the total sum 12 | total = sum(int(num) for num in numbers) 13 | 14 | print(total) 15 | 16 | #Closing the file to release its resources 17 | sample_file.close() -------------------------------------------------------------------------------- /Unit 3 - Networks and sockets/C3.1. - Programming assignment.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | #Setting up the socket 4 | mysocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 5 | mysocket.connect( ('www.pythonlearn.com', 80) ) 6 | 7 | #Making the HTTP request that will get us the desired document 8 | mysocket.send("GET http://www.pythonlearn.com/code/intro-short.txt HTTP/1.0 \n\n") 9 | 10 | while True: 11 | #Obtaining the web data 12 | webdata = mysocket.recv(512) 13 | 14 | #When there's no more data left, we'll stop the loop 15 | if len(webdata) < 1: 16 | break 17 | 18 | #Printing the obtained data 19 | print webdata 20 | 21 | mysocket.close() -------------------------------------------------------------------------------- /Unit 4 - Programs that surf the web/BeautifulSoup.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Unit 4 - Programs that surf the web/BeautifulSoup.pyc -------------------------------------------------------------------------------- /Unit 4 - Programs that surf the web/C4.1. Programming assignment.py: -------------------------------------------------------------------------------- 1 | """ 2 | This assignment consists of using urllib to read the HTML from the data files 3 | indicated, parse the data, extract the numbers and compute the sum of the 4 | numbers in the file 5 | 6 | DATA FORMAT: 7 | The file is a table of names and comment counts. You can ignore most of the data 8 | in the file except for lines like the following: 9 | 10 | <tr><td>Modu</td><td><span class="comments">90</span></td></tr> 11 | <tr><td>Kenzie</td><td><span class="comments">88</span></td></tr> 12 | <tr><td>Hubert</td><td><span class="comments">87</span></td></tr> 13 | 14 | You are to find all the <span> tags in the file and pull out the numbers from the 15 | tag and sum the numbers. 16 | 17 | Look at the sample code (http://www.pythonlearn.com/code/urllink2.py) provided. It 18 | shows how to find all of a certain kind of tag, loop through the tags and extract 19 | the various aspects of the tags. 20 | 21 | # Retrieve all of the anchor tags 22 | tags = soup('a') 23 | for tag in tags: 24 | # Look at the parts of a tag 25 | print 'TAG:',tag 26 | print 'URL:',tag.get('href', None) 27 | print 'Contents:',tag.contents[0] 28 | print 'Attrs:',tag.attrs 29 | 30 | You need to adjust this code to look for span tags, pull out the text content of 31 | each span tag, convert it to an integer, and add them all up to complete the assignment. """ 32 | 33 | 34 | import urllib 35 | from BeautifulSoup import * 36 | 37 | sample_url = "http://python-data.dr-chuck.net/comments_42.html" 38 | data_url = "http://python-data.dr-chuck.net/comments_277464.html" 39 | 40 | #Getting the html information and parsing it with BeautifulSoup 41 | html = urllib.urlopen(data_url).read() 42 | soup = BeautifulSoup(html) 43 | 44 | #Getting a list with the "span" tags 45 | tags = soup('span') 46 | 47 | #Summing all the values within the span tags 48 | count = 0 49 | for tag in tags: 50 | 51 | #We need to cast them to int, as they're parsed as text strings 52 | count += int(tag.contents[0]) 53 | 54 | print(count) 55 | 56 | --------------------------------------------------------------------------------
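Aside: once the span tags are in hand, the extract-and-sum pattern from C4.1 above collapses to a single expression. A minimal sketch under the same assumptions as the assignment (Python 2, the BeautifulSoup 3 module bundled with this repo, and the sample URL from the docstring):

import urllib
from BeautifulSoup import *

html = urllib.urlopen('http://python-data.dr-chuck.net/comments_42.html').read()
soup = BeautifulSoup(html)
# Each <span> holds one number as its only text child; cast and sum in one pass
print sum(int(tag.contents[0]) for tag in soup('span'))
--------------------------------------------------------------------------------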
/Unit 4 - Programs that surf the web/C4.2. Programming assignment 2.py: -------------------------------------------------------------------------------- 1 | """ 2 | In this assignment you will write a Python program that expands on http://www.pythonlearn.com/code/urllinks.py 3 | The program will use urllib to read the HTML from the data files below, extract 4 | the href= values from the anchor tags, scan for a tag that is in a particular 5 | position from the top, follow that link, repeat the process a number of times, 6 | and report the last name you find. 7 | 8 | 9 | SAMPLE: 10 | Find the link at position 3 (the first name is 1). Follow that link. Repeat this 11 | process 4 times. The answer is the last name that you retrieve. 12 | The result should be: Anayah 13 | 14 | PROBLEM: 15 | Find the link at position 18 (the first name is 1). Follow that link. Repeat this 16 | process 7 times. The answer is the last name that you retrieve. 17 | Hint: the name starts with S 18 | """ 19 | 20 | import urllib 21 | from BeautifulSoup import * 22 | 23 | #SAMPLE DATA 24 | sample_url = "http://python-data.dr-chuck.net/known_by_Fikret.html" 25 | sample_repetitions = 4 26 | sample_resultPosition = 3 27 | 28 | #ACTUAL PROBLEM DATA 29 | problem_url = "http://python-data.dr-chuck.net/known_by_Max.html" 30 | problem_repetitions = 7 31 | problem_resultPosition = 18 32 | 33 | 34 | #Choosing the type of execution we're trying 35 | type_of_execution = 'problem' 36 | if type_of_execution == 'sample': 37 | (link, repetitions, resultPosition) = (sample_url, sample_repetitions, sample_resultPosition) 38 | 39 | elif type_of_execution == 'problem': 40 | (link, repetitions, resultPosition) = (problem_url, problem_repetitions, problem_resultPosition) 41 | 42 | 43 | #Amount of iterations needed 44 | for times in range(repetitions): 45 | 46 | #Getting the information of the corresponding url 47 | html = urllib.urlopen(link).read() 48 | soup = BeautifulSoup(html) 49 | tags = soup('a') 50 | 51 | #The positions are 1-based, but Python lists are 0-indexed, 52 | #so we subtract 1 from the position 53 | link = tags[resultPosition - 1].get('href') 54 | 55 | #Getting the content of the tag in the specified position. It should correspond to 56 | #the answer we're looking for 57 | result_name = tags[resultPosition - 1].contents[0] 58 | print(result_name) 59 | 60 | --------------------------------------------------------------------------------
/Unit 5 - Web services and XML/BeautifulSoup.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexGascon/Using-Python-to-Access-Web-Data---Coursera/30306dde188406926956eb62bbdad63158c59251/Unit 5 - Web services and XML/BeautifulSoup.pyc -------------------------------------------------------------------------------- /Unit 5 - Web services and XML/C5.1. Programming assignment.py: -------------------------------------------------------------------------------- 1 | """ 2 | EXTRACTING DATA FROM XML 3 | In this assignment you will write a Python program somewhat similar to 4 | http://www.pythonlearn.com/code/geoxml.py. The program will prompt for a URL, 5 | read the XML data from that URL using urllib, then parse it, extract the 6 | comment counts from the XML data, and compute the sum of the numbers in the file. 7 | 8 | We provide two files for this assignment. One is a sample file where we give you 9 | the sum for your testing and the other is the actual data you need to process for 10 | the assignment. 11 | 12 | Sample data: http://python-data.dr-chuck.net/comments_42.xml (Sum=2553) 13 | Actual data: http://python-data.dr-chuck.net/comments_277461.xml 14 | 15 | You do not need to save these files to your folder since your program will read 16 | the data directly from the URL. Note: Each student will have a distinct data url 17 | for the assignment - so only use your own data url for analysis. 18 | 19 | 20 | DATA FORMAT AND APPROACH 21 | The data consists of a number of names and comment counts in XML as follows: 22 | 23 | <comment> 24 | <name>Matthias</name> 25 | <count>97</count> 26 | </comment> 27 | 28 | You are to look through all the <comment> tags, find the <count> values and sum 29 | the numbers. The closest sample code that shows how to parse XML is geoxml.py. 30 | But since the nesting of the elements in our data is different than the data we 31 | are parsing in that sample code you will have to make real changes to the code. 32 | 33 | To make the code a little simpler, you can use an XPath selector string to look 34 | through the entire tree of XML for any tag named 'count' with the following line 35 | of code: 36 | 37 | counts = tree.findall('.//count') 38 | 39 | Take a look at the Python ElementTree documentation and look for the supported 40 | XPath syntax for details. You could also work from the top of the XML down to 41 | the comments node and then loop through the child nodes of the comments node. 42 | """
43 | #We'll leave XPath for another time, as it requires further investigation. For 44 | #now we'll look for the count tags by knowing their structure: 45 | #commentinfo -> comments -> comment -> count 46 | 47 | 48 | import urllib 49 | from BeautifulSoup import * 50 | import xml.etree.ElementTree as ET 51 | 52 | sample_data = "http://python-data.dr-chuck.net/comments_42.xml" 53 | actual_data = "http://python-data.dr-chuck.net/comments_277461.xml" 54 | 55 | #We'll work with this generic variable, so we only need to change its source 56 | #and not every one of its appearances in the code 57 | #NOTE: I'm using Sublime Text and it doesn't accept raw_input, so I'll set the URL 58 | #from here instead of from a user prompt 59 | data_url = actual_data 60 | data = urllib.urlopen(data_url).read() 61 | 62 | #xml_data contains the commentinfo object, as it is the main structure, so we 63 | #have to look for the comments element and then for all its comment elements 64 | xml_data = ET.fromstring(data) 65 | search_str = "comments/comment" 66 | count_tags = xml_data.findall(search_str) 67 | 68 | #Computing the sum 69 | total_count = 0 70 | for tag in count_tags: 71 | #We'll find the "count" element inside each "comment" element and add it 72 | count = tag.find('count') 73 | total_count += int(count.text) 74 | 75 | print(total_count) 76 | 77 | 78 | 79 | 80 | --------------------------------------------------------------------------------
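Aside: the XPath selector that the C5.1 docstring quotes (and that the solution above deliberately leaves aside) does the whole search in one call. A minimal sketch of that variant, using the sample URL from the docstring (Sum=2553):

import urllib
import xml.etree.ElementTree as ET

data = urllib.urlopen('http://python-data.dr-chuck.net/comments_42.xml').read()
tree = ET.fromstring(data)
# './/count' matches every <count> element at any depth in the tree
counts = tree.findall('.//count')
print sum(int(count.text) for count in counts)
--------------------------------------------------------------------------------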
/Unit 6 - JSON and the REST architecture/C.6.1. - Programming assignment 1.py: -------------------------------------------------------------------------------- 1 | """ EXTRACTING DATA FROM JSON 2 | In this assignment you will write a Python program somewhat similar to 3 | http://www.pythonlearn.com/code/json2.py. The program will prompt for a URL, read 4 | the JSON data from that URL using urllib, then parse it and extract the comment 5 | counts from the JSON data, compute the sum of the numbers in the file and enter 6 | the sum below: 7 | 8 | We provide two files for this assignment. One is a sample file where we give you 9 | the sum for your testing and the other is the actual data you need to process for 10 | the assignment. 11 | 12 | - Sample data: http://python-data.dr-chuck.net/comments_42.json (Sum=2553) 13 | - Actual data: http://python-data.dr-chuck.net/comments_277465.json 14 | 15 | You do not need to save these files to your folder since your program will read 16 | the data directly from the URL. Note: Each student will have a distinct data url 17 | for the assignment - so only use your own data url for analysis. 18 | 19 | 20 | DATA FORMAT 21 | The data consists of a number of names and comment counts in JSON as follows: 22 | 23 | { 24 | comments: [ 25 | { 26 | name: "Matthias" 27 | count: 97 28 | }, 29 | { 30 | name: "Geomer" 31 | count: 97 32 | } 33 | ... 34 | ] 35 | } 36 | The closest sample code that shows how to parse JSON and extract a list is 37 | json2.py. You might also want to look at geoxml.py to see how to prompt for a URL 38 | and retrieve data from a URL. 39 | """ 40 | 41 | 42 | import urllib 43 | import json 44 | 45 | sample_url = "http://python-data.dr-chuck.net/comments_42.json" 46 | data_url = "http://python-data.dr-chuck.net/comments_277465.json" 47 | 48 | #Reading the URL and parsing its data 49 | urldata = urllib.urlopen(data_url).read() 50 | data = json.loads(urldata) 51 | 52 | #Finding each "count" field and adding its value to the total sum. 53 | total = 0 54 | for comment in data["comments"]: 55 | total += comment["count"] 56 | 57 | print("TOTAL SUM: " + str(total)) -------------------------------------------------------------------------------- /Unit 6 - JSON and the REST architecture/C.6.2. - Programming assignment 2.py: -------------------------------------------------------------------------------- 1 | """ 2 | CALLING A JSON API 3 | In this assignment you will write a Python program somewhat similar to 4 | http://www.pythonlearn.com/code/geojson.py. The program will prompt for a location, 5 | contact a web service, retrieve JSON from that service, parse the data, 6 | and retrieve the first place_id from the JSON. A place ID is a textual identifier 7 | that uniquely identifies a place within Google Maps. 8 | 9 | 10 | API ENDPOINTS 11 | To complete this assignment, you should use this API endpoint that has a static 12 | subset of the Google Data: 13 | 14 | http://python-data.dr-chuck.net/geojson 15 | 16 | This API uses the same parameters (sensor and address) as the Google API. This 17 | API also has no rate limit so you can test as often as you like. If you visit 18 | the URL with no parameters, you get a list of all of the address values which 19 | can be used with this API. 20 | 21 | To call the API, you need to provide a sensor=false parameter and the address 22 | that you are requesting as the address= parameter that is properly URL encoded 23 | using the urllib.urlencode() function as shown in 24 | http://www.pythonlearn.com/code/geojson.py 25 | 26 | 27 | TEST DATA / SAMPLE EXECUTION 28 | You can test to see if your program is working with a location of "South Federal 29 | University" which will have a place_id of "ChIJJ8oO7_B_bIcR2AlhC8nKlok". 30 | 31 | 32 | TURN IN 33 | Please run your program to find the place_id for this location: Columbia 34 | University 35 | 36 | Make sure to enter the name and case exactly as above and enter the place_id and 37 | your Python code below. Hint: The first seven characters of the place_id are 38 | "ChIJdeM ...". Make sure to retrieve the data from the URL specified above and 39 | not the normal Google API. Your program should work with the Google API - but the 40 | place_id may not match for this assignment. 41 | """ 42 | 43 | import json 44 | import urllib 45 | 46 | #Storing the given parameters 47 | serviceurl = "http://python-data.dr-chuck.net/geojson?" 48 | sample_address = "South Federal University" 49 | data_address = "Columbia University" 50 | address_wanted = data_address 51 | 52 | #Setting the GET parameters on the URL 53 | parameters = {"sensor": "false", "address": address_wanted} 54 | paramsurl = urllib.urlencode(parameters) 55 | 56 | #Generating the complete URL. Printing it in order to check that it's correct. 57 | queryurl = serviceurl + paramsurl 58 | print("DATA URL: " + queryurl) 59 | 60 | #Obtaining and reading the data 61 | data = urllib.urlopen(queryurl).read() 62 | 63 | #Parsing the data and looking for the field we want. 64 | #That field is inside the "results" array, in its first item (if our address is 65 | #correct we can assume that the result will be the correct one) and on its 66 | #"place_id" field 67 | jsondata = json.loads(str(data)) 68 | place_id = jsondata["results"][0]["place_id"] 69 | print("PLACE ID: " + place_id) 70 | 71 | --------------------------------------------------------------------------------
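Aside: the urllib.urlencode() call that C6.2 relies on is worth seeing in isolation. A minimal sketch of what it produces (the exact pair order may vary, since the input is a dict):

import urllib

parameters = {"sensor": "false", "address": "Columbia University"}
print urllib.urlencode(parameters)
# e.g. sensor=false&address=Columbia+University
# Spaces become '+', and the result is ready to append after the '?' in the URL
--------------------------------------------------------------------------------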